16.4 Data Pre-processing
We will conduct data pre-processing in this stage
Some steps (depending on the type of problem being analysed):
- Create a corpus
- Change encoding
- Convert to lower case
- Remove hashtags
- Remove URLs
- Remove @ mentions
- Remove punctuations
- Remove stop words
- Stemming can also be conducted (avoided in this example)
# Text-mining utilities (Corpus, tm_map, transformers, stopwords)
library(tm)

# Read in the previously collected tweets
rt <- readRDS("tweets_auspol.rds")

# Fix encoding: convert latin1 to ASCII, dropping characters that cannot be
# represented (emoji etc. become "") so downstream regexes behave predictably
rt$text <- sapply(rt$text, function(row) iconv(row, "latin1", "ASCII", sub = ""))

# Build a corpus, and specify the source to be character vectors
myCorpus <- Corpus(VectorSource(rt$text))
# Convert to lower case (tolower is a plain character function, so it must be
# wrapped in content_transformer to keep the corpus structure intact)
myCorpus <- tm_map(myCorpus, content_transformer(tolower))

# Remove punctuation
myCorpus <- tm_map(myCorpus, removePunctuation)

# Remove numbers
myCorpus <- tm_map(myCorpus, removeNumbers)
# --- Cleaning helpers: each takes a character vector and returns it cleaned ---

# Remove URLs (everything from "http"/"https" up to the next whitespace)
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
removeURLs <- function(x) gsub("https[^[:space:]]*", "", x)

# Remove hashtags (the '#' and the tag text that follows)
removehash <- function(x) gsub("#\\S+", "", x)

# Remove @ mentions (the '@' and the username that follows)
removeats <- function(x) gsub("@\\w+", "", x)

# Remove numbers and punctuation (anything that is not a letter or space);
# used as a catch-all in case earlier steps left something behind
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)

# Whitespace cleanup
wspace1 <- function(x) gsub("^[[:space:]]*", "", x)  # remove leading whitespace
wspace2 <- function(x) gsub("[[:space:]]*$", "", x)  # remove trailing whitespace
wspace3 <- function(x) gsub(" +", " ", x)            # collapse repeated spaces

# Remove "im" left over from "I'm" after punctuation stripping.
# NOTE(review): this removes the substring "im" ANYWHERE, including inside
# words ("time" -> "te") — consider gsub("\\bim\\b", "", x) instead.
removeIms <- function(x) gsub("im", "", x)
# Apply the cleaning helpers to the corpus, in order. Each helper is a plain
# character function, so each is wrapped in content_transformer.
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))   # urls
myCorpus <- tm_map(myCorpus, content_transformer(removeURLs))  # urls (https)
myCorpus <- tm_map(myCorpus, content_transformer(removehash))  # hashtags
myCorpus <- tm_map(myCorpus, content_transformer(removeats))   # mentions

# Numbers and punctuation (just in case some are left over)
myCorpus <- tm_map(myCorpus, content_transformer(removeNumPunct))

# "im" remnants from "I'm"
myCorpus <- tm_map(myCorpus, content_transformer(removeIms))

# Whitespace: leading, trailing, then repeated internal spaces
myCorpus <- tm_map(myCorpus, content_transformer(wspace1))
myCorpus <- tm_map(myCorpus, content_transformer(wspace2))
myCorpus <- tm_map(myCorpus, content_transformer(wspace3))

# Remove any extra whitespace tm can still find
myCorpus <- tm_map(myCorpus, stripWhitespace)
# Remove stopwords: the standard English and SMART lists, plus corpus-specific
# terms (retweet markers, "amp" from &amp;, and the dominant topic words).
# NOTE: tm::stopwords expects the kind "english" (lower case), not "English".
myStopwords <- c(stopwords("english"), stopwords("SMART"), "rt", "ht", "via",
                 "amp", "the", "australia", "australians", "australian",
                 "auspol")
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)

# Generally a good idea to save the processed corpus now
save(myCorpus, file = "auspol_sep.RData")
# Assemble a data frame: cleaned text, tweet ID, and tweet date
data_tw2 <- data.frame(text = get("content", myCorpus), row.names = NULL)
data_tw2 <- cbind(data_tw2, ID = rt$status_id)
data_tw2 <- cbind(Date = as.Date(rt$created_at), data_tw2)

# Look at the data frame; still some whitespace left so let's get rid of it
data_tw2$text <- gsub("\r?\n|\r", "", data_tw2$text)  # stray newlines
data_tw2$text <- gsub(" +", " ", data_tw2$text)       # repeated spaces
head(data_tw2)

# Save data
saveRDS(data_tw2, file = "processed_data.rds")