library(ggplot2) library(dplyr) big_font <- theme_grey(base_size = 24) # install.packages("TSA") data(oil.price, package = "TSA") source(url("http://stat565.cwick.co.nz/code/fortify-ts.r")) oil <- fortify(oil.price) oil <- rename(oil, price = x) qplot(time, price, data = oil, geom = "line") + big_font acf(log(oil$price)) qplot(time, price, data = oil, geom = "line", log = "y") + big_font # differencing can remove trends oil$diff1 <- c(NA, diff(oil$price)) qplot(time, diff1, data = oil, geom = "line") + big_font oil$diff2 <- c(NA, diff(oil$diff1)) qplot(time, diff2, data = oil, geom = "line") + big_font # the trend is gone, but the variation looks like it increases, try log transform oil$diff1_log <- c(NA, diff(log(oil$price))) qplot(time, diff1_log, data = oil, geom = "line") + big_font # better # orignal plot with log transform qplot(time, log(price), data = oil, geom = "line") + big_font oil$diff2_log <- c(NA, diff(oil$diff1_log)) qplot(time, diff2_log, data = oil, geom = "line") + big_font # so the first difference of log price looks stationary in mean, but what about # variance qplot(time, diff1_log^2, data = oil) + geom_smooth() + big_font # a few outliers at the start but otherwise looks okish qplot(time %% 1, diff1_log^2, data = oil, geom = c("boxplot", "point"), group = time %% 1) + big_font # looks ok, %% is the modolu operator, time %% 1 will give the remainder after # dividing by 1, in this case the decimal part of the year, which is equivalent # to the month # looks stationary, let's choose an ARMA(p, q) model acf(oil$diff1_log, lag.max = 24, na.action = na.pass) # significant at lag 1 pacf(oil$diff1_log, lag.max = 24, na.action = na.pass) # significant at lag 1, maybe something happening at lag 2 and 15 # models to try MA(1), AR(2), ARMA(1, 1) # now fit an ARIMA(0, 1, 1) model to difference of log price, xrg term # is to include a constant in the differenced series. n <- nrow(oil) (fit_ma1 <- arima(log(oil$price), order = c(0, 1, 1), xreg = 1:n)) (fit_ar2 <- arima(log(oil$price), order = c(2, 1, 0), xreg = 1:n)) (fit_arma1 <- arima(log(oil$price), order = c(1, 1, 1), xreg = 1:n)) (fit_ma2 <- arima(log(oil$price), order = c(0, 1, 2), xreg = 1:n)) # MA(1) seems best, check residuals (a.k.a innovations) # diagnostics # is there any correlation left in the residuals acf(residuals(fit_ma1)) pacf(residuals(fit_ma1)) # look good # check normality qqnorm(residuals(fit_ma1)) qqline(residuals(fit_ma1)) # a few outliers but mostly good # a time plot of residuals oil$residuals <- residuals(fit_ma1) qplot(time, residuals, data = oil, geom = "line") # outliers subset(oil, abs(residuals) > 0.3) # our model looks good let's forecast log oil price pred.df <- as.data.frame(predict(fit_ma1, n.ahead = 48, newxreg = (n + 1):(n+48))) pred.df$time <- max(oil$time) + (1:48)/12 qplot(time, log(price), data = oil, geom = "line") + geom_ribbon(aes(ymin = pred- 2*se, ymax = pred + 2*se, y = NULL), data = pred.df, alpha = 0.2) + geom_line(aes(y = pred), data = pred.df) + big_font qplot(time, price, data = oil, geom = "line") + geom_ribbon(aes(ymin = exp(pred- 2*se), ymax = exp(pred + 2*se), y = NULL), data = pred.df, alpha = 0.2) + geom_line(aes(y = exp(pred)), data = pred.df) + big_font # check against truth oil_new <- read.csv("../data/oil_update.csv") oil_new$time <- seq(2006 + 1/12, 2012 + 4/12, 1/12) qplot(time, price, data = oil, geom = "line") + geom_ribbon(aes(ymin = exp(pred- 2*se), ymax = exp(pred + 2*se), y = NULL), data = pred.df, alpha = 0.2) + geom_line(aes(y = exp(pred)), data = pred.df) + geom_line(data = oil_new, linetype = "dashed") + big_font