This RMarkdown file contains the report of the data analysis done for the project on forecasting daily bike rental demand using time series models in R. It contains analysis such as data exploration, summary statistics and building the time series models. The final report was completed on Sat Aug 9 22:13:17 2025.
Data Description:
This dataset contains the daily count of rental bike transactions between years 2011 and 2012 in Capital bikeshare system with the corresponding weather and seasonal information.
Data Source: https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset
Relevant Paper:
Fanaee-T, Hadi, and Gama, Joao, ‘Event labeling combining ensemble detectors and background knowledge’, Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg
## Import required packages
## Load data and install packages
if(!require(tidyverse)) install.packages("tidyverse")
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
if(!require(lubridate)) install.packages("lubridate")
if(!require(timetk)) install.packages("timetk")
## Loading required package: timetk
if(!require(forecast)) install.packages("forecast")
## Loading required package: forecast
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
if(!require(tseries)) install.packages("tseries")
## Loading required package: tseries
if(!require(zoo)) install.packages("zoo")
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
if(!require(ggplot2)) install.packages("ggplot2")
library(tidyverse)
library(lubridate)
library(timetk)
library(forecast)
library(tseries)
library(zoo)
library(ggplot2)
url <- "C:/Users/samaw/projects/bikeshare/day.csv"
bike_data <- read.csv(url)
head(bike_data)
## instant dteday season yr mnth holiday weekday workingday weathersit
## 1 1 2011-01-01 1 0 1 0 6 0 2
## 2 2 2011-01-02 1 0 1 0 0 0 2
## 3 3 2011-01-03 1 0 1 0 1 1 1
## 4 4 2011-01-04 1 0 1 0 2 1 1
## 5 5 2011-01-05 1 0 1 0 3 1 1
## 6 6 2011-01-06 1 0 1 0 4 1 1
## temp atemp hum windspeed casual registered cnt
## 1 0.344167 0.363625 0.805833 0.1604460 331 654 985
## 2 0.363478 0.353739 0.696087 0.2485390 131 670 801
## 3 0.196364 0.189405 0.437273 0.2483090 120 1229 1349
## 4 0.200000 0.212122 0.590435 0.1602960 108 1454 1562
## 5 0.226957 0.229270 0.436957 0.1869000 82 1518 1600
## 6 0.204348 0.233209 0.518261 0.0895652 88 1518 1606
glimpse(bike_data)
## Rows: 731
## Columns: 16
## $ instant <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ dteday <chr> "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04", "20…
## $ season <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ yr <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ mnth <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ holiday <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ weekday <int> 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4,…
## $ workingday <int> 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,…
## $ weathersit <int> 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2,…
## $ temp <dbl> 0.3441670, 0.3634780, 0.1963640, 0.2000000, 0.2269570, 0.20…
## $ atemp <dbl> 0.3636250, 0.3537390, 0.1894050, 0.2121220, 0.2292700, 0.23…
## $ hum <dbl> 0.805833, 0.696087, 0.437273, 0.590435, 0.436957, 0.518261,…
## $ windspeed <dbl> 0.1604460, 0.2485390, 0.2483090, 0.1602960, 0.1869000, 0.08…
## $ casual <int> 331, 131, 120, 108, 82, 88, 148, 68, 54, 41, 43, 25, 38, 54…
## $ registered <int> 654, 670, 1229, 1454, 1518, 1518, 1362, 891, 768, 1280, 122…
## $ cnt <int> 985, 801, 1349, 1562, 1600, 1606, 1510, 959, 822, 1321, 126…
summary(bike_data)
## instant dteday season yr
## Min. : 1.0 Length:731 Min. :1.000 Min. :0.0000
## 1st Qu.:183.5 Class :character 1st Qu.:2.000 1st Qu.:0.0000
## Median :366.0 Mode :character Median :3.000 Median :1.0000
## Mean :366.0 Mean :2.497 Mean :0.5007
## 3rd Qu.:548.5 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :731.0 Max. :4.000 Max. :1.0000
## mnth holiday weekday workingday
## Min. : 1.00 Min. :0.00000 Min. :0.000 Min. :0.000
## 1st Qu.: 4.00 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.000
## Median : 7.00 Median :0.00000 Median :3.000 Median :1.000
## Mean : 6.52 Mean :0.02873 Mean :2.997 Mean :0.684
## 3rd Qu.:10.00 3rd Qu.:0.00000 3rd Qu.:5.000 3rd Qu.:1.000
## Max. :12.00 Max. :1.00000 Max. :6.000 Max. :1.000
## weathersit temp atemp hum
## Min. :1.000 Min. :0.05913 Min. :0.07907 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:0.33708 1st Qu.:0.33784 1st Qu.:0.5200
## Median :1.000 Median :0.49833 Median :0.48673 Median :0.6267
## Mean :1.395 Mean :0.49538 Mean :0.47435 Mean :0.6279
## 3rd Qu.:2.000 3rd Qu.:0.65542 3rd Qu.:0.60860 3rd Qu.:0.7302
## Max. :3.000 Max. :0.86167 Max. :0.84090 Max. :0.9725
## windspeed casual registered cnt
## Min. :0.02239 Min. : 2.0 Min. : 20 Min. : 22
## 1st Qu.:0.13495 1st Qu.: 315.5 1st Qu.:2497 1st Qu.:3152
## Median :0.18097 Median : 713.0 Median :3662 Median :4548
## Mean :0.19049 Mean : 848.2 Mean :3656 Mean :4504
## 3rd Qu.:0.23321 3rd Qu.:1096.0 3rd Qu.:4776 3rd Qu.:5956
## Max. :0.50746 Max. :3410.0 Max. :6946 Max. :8714
bike_data$dteday <- ymd(bike_data$dteday)
# Plot of daily rentals over time
ggplot(bike_data, aes(x = dteday, y=cnt)) +
geom_line(color="blue") +
labs(title="Daily bike rentals over time", x="Date", y="Total Rentals (cnt)") +
theme_minimal()
#Summary statistics for rentals
mean_rentals <- mean(bike_data$cnt)
median_rentals <- median(bike_data$cnt)
max_rentals <- max(bike_data$cnt)
min_rentals <- min(bike_data$cnt)
cat("Mean rentals:", mean_rentals, "\n")
## Mean rentals: 4504.349
cat("Median rentals", median_rentals, "\n")
## Median rentals 4548
cat("Max rentals", max_rentals, "\n")
## Max rentals 8714
cat("Min rentals", min_rentals, "\n")
## Min rentals 22
#Bike rentals by weekday to show weekly seasonality
bike_data %>%
mutate(weekday = wday(dteday, label=TRUE)) %>%
group_by(weekday) %>%
summarise(avg_rentals = mean(cnt)) %>%
ggplot(aes(x=weekday, y=avg_rentals)) +
geom_bar(stat="identity",fill="skyblue") +
labs(title="Average rentals by weekday",
x="Weekday", y="Average Rentals") +
theme_minimal()
# Rentals vs Temperature
ggplot(bike_data, aes(x=temp, y=cnt)) +
geom_point(alpha=0.5, color="darkred") +
geom_smooth(method="loess") +
labs(title="Bike Rentals vs Temperature", x="Temperatue (normalized)", y="Rentals") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Interactive time series plots
## Read about the timetk package
ts_data <- bike_data %>%
select(dteday, cnt) %>%
tk_ts(start = c(2011, 1), frequency = 365)
## Warning: Non-numeric columns being dropped: dteday
plot(ts_data, main = "Daily Bike Rental Demand in Time Series")
## Smoothing of time series data
# 7 Day Moving Average to smooth data
bike_data <- bike_data %>%
arrange(dteday) %>%
mutate(ma_7day = zoo::rollmean(cnt, k=7, fill=NA, align="right"))
#Plot of original and smoothed rentals
ggplot(bike_data, aes(x=dteday)) +
geom_line(aes(y=cnt), color="blue", alpha=0.6) +
geom_line(aes(y=ma_7day), color="red", size=1) +
labs(title="Daily Rentals with 7-day Moving Average", x="Date", y="Rentals") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Decompsing and accessing the stationarity of time series data
rentals_ts <- ts(bike_data$cnt, start=c(2011, 1), frequency=365)
ts_decomp <- stl(rentals_ts, s.window="periodic")
plot(ts_decomp)
#Dicky-Fuller test for stationarity
adf_test <- adf.test(rentals_ts)
adf_test
##
## Augmented Dickey-Fuller Test
##
## data: rentals_ts
## Dickey-Fuller = -1.6351, Lag order = 9, p-value = 0.7327
## alternative hypothesis: stationary
if (adf_test$p.value > 0.05) {
rentals_ts <- diff(rentals_ts)
}
rentals_ts <- na.omit(rentals_ts)
#Auto ARIMA model fitting
fit_auto <- auto.arima(rentals_ts)
summary(fit_auto)
## Series: rentals_ts
## ARIMA(1,0,1) with zero mean
##
## Coefficients:
## ar1 ma1
## 0.3584 -0.8896
## s.e. 0.0423 0.0192
##
## sigma^2 = 857769: log likelihood = -6021.96
## AIC=12049.91 AICc=12049.94 BIC=12063.69
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 12.89471 924.8887 648.5267 164.5701 245.294 0.5930705 0.01116795
#Checking residuals diagnostics(fit_auto)
checkresiduals(fit_auto)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,1) with zero mean
## Q* = 176.93, df = 144, p-value = 0.03227
##
## Model df: 2. Total lags used: 146
# Inspect ACF/PACF for manual choice
acf(rentals_ts)
pacf(rentals_ts)
fit_manual <- tryCatch({
Arima(
rentals_ts,
order = c(1,0,1),
seasonal = list(order = c(1,1,1), period = 7) # safer for convergence
)
}, error = function(e) {
message("Manual ARIMA failed: ", e$message)
NULL
})
if (!is.null(fit_manual)) {
summary(fit_manual)
checkresiduals(fit_manual)
}
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,1)(1,1,1)[7]
## Q* = 157.25, df = 142, p-value = 0.1804
##
## Model df: 4. Total lags used: 146
#Next 30 days forecast
forecast_auto <- forecast(fit_auto, h=30)
forecast_manual <- forecast(fit_manual, h=30)
accuracy_auto <- accuracy(forecast_auto)
accuracy_manual = accuracy(forecast_manual)
list(Auto_ARIMA = accuracy_auto, Manual_ARIMA = accuracy_manual)
## $Auto_ARIMA
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 12.89471 924.8887 648.5267 164.5701 245.294 0.5930705 0.01116795
##
## $Manual_ARIMA
## ME RMSE MAE MPE MAPE MASE
## Training set -39.00165 918.2499 631.1259 82.34632 189.7771 0.5771577
## ACF1
## Training set 0.005056909
autoplot(forecast_auto) +
labs(title="30-Day Forecast(Auto ARIMA) of Daily Bike Rentals", x="Time", y="Rentals") +
theme_minimal()
autoplot(forecast_manual) +
labs(title="30-Day Forecast(Manual ARIMA) of Daily Bike Rentals", x="Time", y="Rentals") +
theme_minimal()
(1,0,1)
non-seasonal
and (1,1,1)
seasonal terms initially failed with a
365-day seasonal period due to convergence issues;
switching to a 7-day period improved stability.