16.4 Data Pre-processing

  • We will conduct data pre-processing in this stage

  • Some steps (depending on the type of problem analysed)

    • Create a corpus
    • Change encoding
    • Convert to lower case
    • Remove hashtags
    • Remove URLs
    • Remove @ mentions
    • Remove punctuations
    • Remove stop words
    • Stemming can also be conducted (avoided in this example)
library(tm)
rt <- readRDS("tweets_auspol.rds")

# Re-encode tweet text: convert latin1 -> ASCII, deleting any byte that
# cannot be translated (sub = ""). iconv() is vectorised, so the
# sapply() loop is unnecessary -- calling it directly is faster and,
# unlike sapply(), does not attach a names attribute to the result.
rt$text <- iconv(rt$text, "latin1", "ASCII", sub = "")

# Build a corpus, specifying the source to be a character vector.
myCorpus <- Corpus(VectorSource(rt$text))


# Convert to lower case. Punctuation and number removal is deliberately
# deferred: stripping punctuation at this stage would destroy the "#",
# "@" and "://" markers that the hashtag / mention / URL removers below
# rely on (e.g. "#auspol" would become "auspol" before removehash ever
# ran). Digits and punctuation are removed later by removeNumPunct(),
# after URLs, hashtags and mentions have been taken out.
myCorpus <- tm_map(myCorpus, content_transformer(tolower))


# Strip URLs: "http" followed by any run of non-space characters.
# Note the "http" prefix also matches "https...", so removeURL alone
# already covers secure URLs; removeURLs is kept so existing callers
# keep working.
removeURL <- function(x) {
  gsub("http[^[:space:]]*", "", x)
}
removeURLs <- function(x) {
  gsub("https[^[:space:]]*", "", x)
}

# Strip hashtags: a "#" followed by any run of non-whitespace
# characters is deleted outright.
removehash <- function(x) {
  gsub("#\\S+", "", x)
}

# Strip @mentions: the "@" and the whole handle after it (word
# characters, i.e. letters, digits and underscores) are removed --
# not just the "@" sign.
removeats <- function(x) {
  gsub("@\\w+", "", x)
}

# Strip everything that is neither a letter nor whitespace -- a
# catch-all for digits and punctuation left over by earlier passes.
removeNumPunct <- function(x) {
  gsub("[^[:alpha:][:space:]]*", "", x)
}

# Whitespace normalisation helpers.
wspace1 <- function(x) {
  # Drop whitespace at the start of the string.
  gsub("^[[:space:]]*", "", x)
}
wspace2 <- function(x) {
  # Drop whitespace at the end of the string.
  gsub("[[:space:]]*$", "", x)
}
wspace3 <- function(x) {
  # Collapse runs of spaces into a single space.
  gsub(" +", " ", x)
}

# Remove the standalone token "im" (the residue of "I'm" once
# punctuation is stripped and text lower-cased). Word boundaries (\b)
# restrict the match to the whole word: the original pattern "im"
# mangled any word containing that substring ("time" -> "te",
# "important" -> "portant").
removeIms <- function(x) gsub("\\bim\\b", "", x)

# Apply the cleaning transformations in sequence. Order matters: URLs,
# hashtags and mentions are stripped first, then any remaining
# non-alphabetic characters, then leftover "im" fragments, and finally
# the whitespace that the deletions leave behind.
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))   # http URLs
myCorpus <- tm_map(myCorpus, content_transformer(removeURLs))  # https URLs

myCorpus <- tm_map(myCorpus, content_transformer(removehash))  # hashtags
myCorpus <- tm_map(myCorpus, content_transformer(removeats))   # @mentions

myCorpus <- tm_map(myCorpus, content_transformer(removeNumPunct))  # digits/punctuation (just in case some are left over)

myCorpus <- tm_map(myCorpus, content_transformer(removeIms))   # "im" fragments

myCorpus <- tm_map(myCorpus, content_transformer(wspace1))     # leading spaces
myCorpus <- tm_map(myCorpus, content_transformer(wspace2))     # trailing spaces
myCorpus <- tm_map(myCorpus, content_transformer(wspace3))     # repeated spaces

# tm's own whitespace squeeze as a final pass.
myCorpus <- tm_map(myCorpus, stripWhitespace)

# Extra stop words on top of tm's built-in lists. NOTE: tm::stopwords()
# takes the lowercase kind "english" -- the capitalised "English" fails
# on case-sensitive installations. "SMART" is the SMART
# information-retrieval stop list. Domain-specific tokens are appended:
# retweet markers ("rt", "ht", "via"), "&amp;" residue ("amp"), and the
# topic words that dominate this corpus. unique() drops the overlap
# between the two built-in lists and the manual additions.
myStopwords <- unique(c(stopwords("english"), stopwords("SMART"), "rt", "ht",
    "via", "amp", "the", "australia", "australians", "australian", "auspol"))

myCorpus <- tm_map(myCorpus, removeWords, myStopwords)

# Generally a good idea to save the processed corpus at this point.
save(myCorpus, file = "auspol_sep.RData")

# Flatten the corpus back into a data frame with columns
# Date | text | ID, aligned row-by-row with the original tweets.
data_tw2 <- data.frame(text = get("content", myCorpus), row.names = NULL)
data_tw2 <- cbind(data_tw2, ID = rt$status_id)
data_tw2 <- cbind(Date = as.Date(rt$created_at), data_tw2)

# Some line breaks and repeated spaces survive the corpus cleaning;
# squash them here.
data_tw2$text <- gsub("\r?\n|\r", "", data_tw2$text)
data_tw2$text <- gsub(" +", " ", data_tw2$text)
head(data_tw2)

# Persist the processed data frame.
saveRDS(data_tw2, file = "processed_data.rds")