7.2 Data preprocessing

  • Download stock data using R’s quantmod package

  • Convert data to returns

  • Generate some descriptive statistics

  • Some plots

  • Data

# Run the following to download and save the data. This only needs to be
# done once, and again whenever the time period is updated
library(quantmod)
library(pander)
library(xts)
library(TTR)
# Download the stock and index price histories and cache them on disk so
# that later chunks can read the saved files instead of downloading again.

# The sample window is defined once so that both series are always
# requested for the same period; edit these two values to change it.
sample_from <- "2019-01-01"
sample_to <- "2021-07-31"

# download stock
BHP <- getSymbols("BHP.AX", from = sample_from, to = sample_to,
                  auto.assign = FALSE)
# download index
ASX <- getSymbols("^AXJO", from = sample_from, to = sample_to,
                  auto.assign = FALSE)

# saveRDS() does not create missing directories, so make sure the output
# folder exists before writing to it
if (!dir.exists("data")) {
  dir.create("data", recursive = TRUE)
}

# save both in rds (to be used in the TA chapter)
saveRDS(BHP, file = "data/bhp_prices.rds")
saveRDS(ASX, file = "data/asx200.rds")
  • Convert to returns
library(quantmod)
library(pander)
library(xts)
library(TTR)
# Read the cached price series back in (this step can be skipped when the
# download chunk above has just been run in the same session)
BHP <- readRDS("data/bhp_prices.rds")
ASX <- readRDS("data/asx200.rds")

# keep only the closing-price column of each series
bhp2 <- BHP[, "BHP.AX.Close"]
asx2 <- ASX[, "AXJO.Close"]

# convert the closing prices to daily log returns
bhp_ret <- dailyReturn(bhp2, type = "log")
asx_ret <- dailyReturn(asx2, type = "log")

# inner join: keep only the dates that appear in both return series
data_lm1 <- merge(bhp_ret, asx_ret, join = "inner")

# build a data frame with an explicit Date column; after the merge the
# first column holds the stock returns and the second the index returns
data_lm2 <- setNames(
  data.frame(index(data_lm1), data_lm1[, 1], data_lm1[, 2]),
  c("Date", "bhp", "asx")
)

head(data_lm2)  # the dates also appear as row names; drop them if required
                 Date          bhp          asx
2019-01-02 2019-01-02  0.000000000  0.000000000
2019-01-03 2019-01-03  0.000000000  0.013510839
2019-01-04 2019-01-04 -0.008947241 -0.002488271
2019-01-07 2019-01-07  0.029808847  0.011289609
2019-01-08 2019-01-08  0.001162482  0.006873792
2019-01-09 2019-01-09 -0.003782952  0.009721207
library(pastecs)
# Descriptive statistics for the two return columns; norm = TRUE requests
# the normality-related rows as well (skewness, kurtosis, normtest.*)
return_cols <- c("bhp", "asx")
desc_stat1 <- stat.desc(data_lm2[, return_cols], norm = TRUE)
# render as a single unsplit table
pander(
  desc_stat1,
  caption = "Descriptive Statistics",
  split.table = Inf
)
Descriptive Statistics
  bhp asx
nbr.val 653 653
nbr.null 9 1
nbr.na 0 0
min -0.1557 -0.102
max 0.1128 0.06766
range 0.2685 0.1697
sum 0.4948 0.2853
median 0 0.001163
mean 0.0007577 0.0004369
SE.mean 0.0007444 0.0005075
CI.mean.0.95 0.001462 0.0009966
var 0.0003618 0.0001682
std.dev 0.01902 0.01297
coef.var 25.1 29.69
skewness -0.4883 -1.443
skew.2SE -2.553 -7.546
kurtosis 9.991 13.32
kurt.2SE 26.16 34.86
normtest.W 0.9198 0.8266
normtest.p 3.885e-18 5.497e-26