This analysis is based on the Divvy case study “‘Sophisticated, Clear, and Polished’: Divvy and Data Visualization” written by Kevin Hartman (found here: https://artscience.blog/home/divvy-dataviz-case-study). The purpose of this script is to consolidate downloaded Divvy data into a single dataframe and then conduct simple analysis to help answer the key question: “In what ways do members and casual riders use Divvy bikes differently?”

In this case study, I have performed data analysis for a fictional bike-share company in order to help them attract more riders. Along the way, I performed numerous real-world tasks of a junior data analyst by following the steps of the data analysis process: Ask, Prepare, Process, Analyze, Share, and Act.

# # # # # # # # # # # # # # # # # # # # #

Installed required packages

tidyverse for data import and wrangling

lubridate for date functions

ggplot for visualization

# # # # # # # # # # # # # # # # # # # # #

install.packages("tidyverse", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/3j/tlpxgbnn03s1zb0rcz4xx9j80000gn/T//RtmpvwAzt4/downloaded_packages
install.packages("lubridate", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/3j/tlpxgbnn03s1zb0rcz4xx9j80000gn/T//RtmpvwAzt4/downloaded_packages
install.packages("ggplot2", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/3j/tlpxgbnn03s1zb0rcz4xx9j80000gn/T//RtmpvwAzt4/downloaded_packages
install.packages("dplyr", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/3j/tlpxgbnn03s1zb0rcz4xx9j80000gn/T//RtmpvwAzt4/downloaded_packages
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(lubridate)
## Loading required package: timechange
## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)
library(dplyr)
getwd()
## [1] "/Users/rising.volkan007/Desktop/q1_2020"
setwd("/Users/rising.volkan007/Desktop/q1_2020")

STEP 1: COLLECTED DATA

Uploaded Divvy datasets for the year of 2022 (csv files) here

q1_2022 <- read_csv("202210-divvy-tripdata.csv", show_col_types = FALSE)
q2_2022 <- read_csv("202209-divvy-tripdata.csv", show_col_types = FALSE)
q3_2022 <- read_csv("202208-divvy-tripdata.csv", show_col_types = FALSE)
q4_2022 <- read_csv("202207-divvy-tripdata.csv", show_col_types = FALSE)
q5_2022 <- read_csv("202206-divvy-tripdata.csv", show_col_types = FALSE)
q6_2022 <- read_csv("202205-divvy-tripdata.csv", show_col_types = FALSE)
q7_2022 <- read_csv("202204-divvy-tripdata.csv", show_col_types = FALSE)
q8_2022 <- read_csv("202203-divvy-tripdata.csv", show_col_types = FALSE)
q9_2022 <- read_csv("202201-divvy-tripdata.csv", show_col_types = FALSE)

STEP 2: WRANGLE DATA AND COMBINE INTO A SINGLE FILE

Compared column names each of the files

While the names don’t have to be in the same order, they DO need to match perfectly before we can use a command to join them into one file

colnames(q1_2022)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(q2_2022)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(q3_2022)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(q4_2022)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(q5_2022)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(q6_2022)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

Inspected the dataframes and look for incongruencies

Converted ride_id and rideable_type to character so that they can stack correctly

q1_2022 <- mutate(q1_2022, ride_id= as.character(ride_id), rideable_type=as.character(rideable_type))

q2_2022 <- mutate(q2_2022, ride_id= as.character(ride_id), rideable_type=as.character(rideable_type))

q3_2022 <- mutate(q3_2022, ride_id= as.character(ride_id), rideable_type=as.character(rideable_type))

q4_2022 <- mutate(q4_2022, ride_id= as.character(ride_id), rideable_type=as.character(rideable_type))

q5_2022 <- mutate(q5_2022, ride_id= as.character(ride_id), rideable_type=as.character(rideable_type))

q6_2022 <- mutate(q6_2022, ride_id= as.character(ride_id), rideable_type=as.character(rideable_type))

q7_2022 <- mutate(q7_2022, ride_id= as.character(ride_id), rideable_type=as.character(rideable_type))

q8_2022 <- mutate(q8_2022, ride_id= as.character(ride_id), rideable_type=as.character(rideable_type))

q9_2022 <- mutate(q9_2022, ride_id= as.character(ride_id), rideable_type=as.character(rideable_type))

Stack individual quarter’s data frames into one big data frame

all_trips <- bind_rows(q1_2022, q2_2022, q3_2022, q4_2022,q5_2022,q6_2022,q7_2022,q8_2022,q9_2022)

Removed lat, long data was dropped beginning in 2022

all_trips <- all_trips %>% 
  select(-c(start_lat,start_lng,end_lat,end_lng))

STEP 3: CLEAN UP AND ADD DATA TO PREPARE FOR ANALYSIS

Inspect the new table that has been created

colnames(all_trips)  #List of column names
## [1] "ride_id"            "rideable_type"      "started_at"        
## [4] "ended_at"           "start_station_name" "start_station_id"  
## [7] "end_station_name"   "end_station_id"     "member_casual"
nrow(all_trips)  #How many rows are in data frame?
## [1] 5032567
dim(all_trips)  #Dimensions of the data frame?
## [1] 5032567       9
head(all_trips)  #See the first 6 rows of data frame.  Also tail(all_trips)
## # A tibble: 6 × 9
##   ride_id        ridea…¹ started_at          ended_at            start…² start…³
##   <chr>          <chr>   <dttm>              <dttm>              <chr>   <chr>  
## 1 A50255C1E1794… classi… 2022-10-14 17:13:30 2022-10-14 17:19:39 Noble … 13290  
## 2 DB692A70BD2DD… electr… 2022-10-01 16:29:26 2022-10-01 16:49:06 Damen … 13288  
## 3 3C02727AAF60F… electr… 2022-10-19 18:55:40 2022-10-19 19:03:30 Hoyne … 655    
## 4 47E653FDC2D99… electr… 2022-10-31 07:52:36 2022-10-31 07:58:49 Rush S… KA1504…
## 5 8B5407BE53515… classi… 2022-10-13 18:41:03 2022-10-13 19:26:18 900 W … 13028  
## 6 A177C92E9F021… electr… 2022-10-13 15:53:27 2022-10-13 15:59:17 900 W … 13028  
## # … with 3 more variables: end_station_name <chr>, end_station_id <chr>,
## #   member_casual <chr>, and abbreviated variable names ¹​rideable_type,
## #   ²​start_station_name, ³​start_station_id
tail(all_trips)
## # A tibble: 6 × 9
##   ride_id        ridea…¹ started_at          ended_at            start…² start…³
##   <chr>          <chr>   <dttm>              <dttm>              <chr>   <chr>  
## 1 9C80CD03B685B… electr… 2022-01-09 18:56:50 2022-01-09 19:02:50 Broadw… 13325  
## 2 8788DA3EDE8FD… electr… 2022-01-18 12:36:48 2022-01-18 12:46:19 Clinto… WL-012 
## 3 C6C3B64FDC827… electr… 2022-01-27 11:00:06 2022-01-27 11:02:40 Racine… 13155  
## 4 CA281AE7D8B06… electr… 2022-01-10 16:14:51 2022-01-10 16:20:58 Broadw… 13325  
## 5 44E3489918623… electr… 2022-01-19 13:22:11 2022-01-19 13:24:27 Racine… 13155  
## 6 E477C594A182A… electr… 2022-01-13 17:24:43 2022-01-13 17:28:14 Clinto… WL-012 
## # … with 3 more variables: end_station_name <chr>, end_station_id <chr>,
## #   member_casual <chr>, and abbreviated variable names ¹​rideable_type,
## #   ²​start_station_name, ³​start_station_id
str(all_trips)  #See list of columns and data types (numeric, character, etc)
## tibble [5,032,567 × 9] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:5032567] "A50255C1E17942AB" "DB692A70BD2DD4E3" "3C02727AAF60F873" "47E653FDC2D99236" ...
##  $ rideable_type     : chr [1:5032567] "classic_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : POSIXct[1:5032567], format: "2022-10-14 17:13:30" "2022-10-01 16:29:26" ...
##  $ ended_at          : POSIXct[1:5032567], format: "2022-10-14 17:19:39" "2022-10-01 16:49:06" ...
##  $ start_station_name: chr [1:5032567] "Noble St & Milwaukee Ave" "Damen Ave & Charleston St" "Hoyne Ave & Balmoral Ave" "Rush St & Cedar St" ...
##  $ start_station_id  : chr [1:5032567] "13290" "13288" "655" "KA1504000133" ...
##  $ end_station_name  : chr [1:5032567] "Larrabee St & Division St" "Damen Ave & Cullerton St" "Western Ave & Leland Ave" "Orleans St & Chestnut St (NEXT Apts)" ...
##  $ end_station_id    : chr [1:5032567] "KA1504000079" "13089" "TA1307000140" "620" ...
##  $ member_casual     : chr [1:5032567] "member" "casual" "member" "member" ...
summary(all_trips)  #Statistical summary of data. Mainly for numerics
##    ride_id          rideable_type        started_at                   
##  Length:5032567     Length:5032567     Min.   :2022-01-01 00:00:05.0  
##  Class :character   Class :character   1st Qu.:2022-05-26 22:46:23.0  
##  Mode  :character   Mode  :character   Median :2022-07-14 15:02:45.0  
##                                        Mean   :2022-07-10 18:41:37.5  
##                                        3rd Qu.:2022-09-01 07:25:58.5  
##                                        Max.   :2022-10-31 23:59:33.0  
##     ended_at                      start_station_name start_station_id  
##  Min.   :2022-01-01 00:01:48.00   Length:5032567     Length:5032567    
##  1st Qu.:2022-05-26 23:01:21.00   Class :character   Class :character  
##  Median :2022-07-14 15:23:31.00   Mode  :character   Mode  :character  
##  Mean   :2022-07-10 19:01:45.52                                        
##  3rd Qu.:2022-09-01 07:39:32.00                                        
##  Max.   :2022-11-07 04:53:58.00                                        
##  end_station_name   end_station_id     member_casual     
##  Length:5032567     Length:5032567     Length:5032567    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
## 

Checked to make sure the proper number of observations were reassigned

table(all_trips$member_casual)
## 
##  casual  member 
## 2154950 2877617

Added columns that list the date, month, day, and year of each ride

This will allow us to aggregate ride data for each month, day, or year … before completing these operations we could only aggregate at the ride level

Added a “ride_length” calculation to all_trips (in seconds)

https://stat.ethz.ch/R-manual/R-devel/library/base/html/difftime.html

all_trips$ride_length <- difftime(all_trips$ended_at,all_trips$started_at)

Inspect the structure of the columns

str(all_trips)
## tibble [5,032,567 × 15] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:5032567] "A50255C1E17942AB" "DB692A70BD2DD4E3" "3C02727AAF60F873" "47E653FDC2D99236" ...
##  $ rideable_type     : chr [1:5032567] "classic_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : POSIXct[1:5032567], format: "2022-10-14 17:13:30" "2022-10-01 16:29:26" ...
##  $ ended_at          : POSIXct[1:5032567], format: "2022-10-14 17:19:39" "2022-10-01 16:49:06" ...
##  $ start_station_name: chr [1:5032567] "Noble St & Milwaukee Ave" "Damen Ave & Charleston St" "Hoyne Ave & Balmoral Ave" "Rush St & Cedar St" ...
##  $ start_station_id  : chr [1:5032567] "13290" "13288" "655" "KA1504000133" ...
##  $ end_station_name  : chr [1:5032567] "Larrabee St & Division St" "Damen Ave & Cullerton St" "Western Ave & Leland Ave" "Orleans St & Chestnut St (NEXT Apts)" ...
##  $ end_station_id    : chr [1:5032567] "KA1504000079" "13089" "TA1307000140" "620" ...
##  $ member_casual     : chr [1:5032567] "member" "casual" "member" "member" ...
##  $ date              : Date[1:5032567], format: "2022-10-14" "2022-10-01" ...
##  $ month             : chr [1:5032567] "10" "10" "10" "10" ...
##  $ day               : chr [1:5032567] "14" "01" "19" "31" ...
##  $ year              : chr [1:5032567] "2022" "2022" "2022" "2022" ...
##  $ day_of_week       : chr [1:5032567] "Friday" "Saturday" "Wednesday" "Monday" ...
##  $ ride_length       : 'difftime' num [1:5032567] 369 1180 470 373 ...
##   ..- attr(*, "units")= chr "secs"

Convert “ride_length” from Factor to numeric so we can run calculations on the data is.factor(all_trips$ride_length)

all_trips$ride_length <- as.numeric(as.character(all_trips$ride_length))
is.numeric(all_trips$ride_length)
## [1] TRUE

Removed “bad” data

The dataframe includes a few hundred entries when bikes were taken out of docks and checked for quality by Divvy or ride_length was negative

We will create a new version of the dataframe (v2) since data is being removed

https://www.datasciencemadesimple.com/delete-or-drop-rows-in-r-with-conditions-2/

all_trips_v2 <- all_trips[!(all_trips$start_station_name == "HQ QR" | all_trips$ride_length<0),]

STEP 4: CONDUCT DESCRIPTIVE ANALYSIS

Descriptive analysis on ride_length (all figures in seconds)

mean(all_trips_v2$ride_length) #straight average (total ride length / rides)
## [1] NA
median(all_trips_v2$ride_length) #midpoint number in the ascending array of ride lengths
## [1] NA
max(all_trips_v2$ride_length) #longest ride
## [1] NA
min(all_trips_v2$ride_length) #shortest ride
## [1] NA

I had condensed the four lines above to one line using summary() on the specific attribute

summary(all_trips_v2$ride_length)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       0     371     653    1277    1176 2483235  733225

Compared members and casual users

aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = mean)
##   all_trips_v2$member_casual all_trips_v2$ride_length
## 1                     casual                 1932.152
## 2                     member                  787.723
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = median)
##   all_trips_v2$member_casual all_trips_v2$ride_length
## 1                     casual                      837
## 2                     member                      551
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = max)
##   all_trips_v2$member_casual all_trips_v2$ride_length
## 1                     casual                  2483235
## 2                     member                    93594
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = min)
##   all_trips_v2$member_casual all_trips_v2$ride_length
## 1                     casual                        0
## 2                     member                        0

See the average ride time by each day for members vs casual users

aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)
##    all_trips_v2$member_casual all_trips_v2$day_of_week all_trips_v2$ride_length
## 1                      casual                   Friday                1863.6220
## 2                      member                   Friday                 770.7869
## 3                      casual                   Monday                1948.2110
## 4                      member                   Monday                 759.6822
## 5                      casual                 Saturday                2142.9944
## 6                      member                 Saturday                 883.3955
## 7                      casual                   Sunday                2219.5627
## 8                      member                   Sunday                 874.1955
## 9                      casual                 Thursday                1678.0602
## 10                     member                 Thursday                 756.6890
## 11                     casual                  Tuesday                1737.7837
## 12                     member                  Tuesday                 750.2949
## 13                     casual                Wednesday                1652.1514
## 14                     member                Wednesday                 747.5031

Notice that the days of the week are out of order. Let’s fix that

all_trips_v2$day_of_week <- ordered(all_trips_v2$day_of_week, levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))

Now, let’s run the average ride time by each day for members vs casual users

aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)
##    all_trips_v2$member_casual all_trips_v2$day_of_week all_trips_v2$ride_length
## 1                      casual                   Sunday                2219.5627
## 2                      member                   Sunday                 874.1955
## 3                      casual                   Monday                1948.2110
## 4                      member                   Monday                 759.6822
## 5                      casual                  Tuesday                1737.7837
## 6                      member                  Tuesday                 750.2949
## 7                      casual                Wednesday                1652.1514
## 8                      member                Wednesday                 747.5031
## 9                      casual                 Thursday                1678.0602
## 10                     member                 Thursday                 756.6890
## 11                     casual                   Friday                1863.6220
## 12                     member                   Friday                 770.7869
## 13                     casual                 Saturday                2142.9944
## 14                     member                 Saturday                 883.3955

analyze ridership data by type and weekday

all_trips_v2 %>% 
  mutate(weekday = wday(started_at, label = TRUE)) %>%  #creates weekday field using wday()
  group_by(member_casual, weekday) %>%  #groups by usertype and weekday
  summarise(number_of_rides = n()                           #calculates the number of rides and average duration 
  ,average_duration = mean(ride_length)) %>%        # calculates the average duration
  arrange(member_casual, weekday)   
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 15 × 4
## # Groups:   member_casual [3]
##    member_casual weekday number_of_rides average_duration
##    <chr>         <ord>             <int>            <dbl>
##  1 casual        Sun              316093            2220.
##  2 casual        Mon              220388            1948.
##  3 casual        Tue              202664            1738.
##  4 casual        Wed              209858            1652.
##  5 casual        Thu              237255            1678.
##  6 casual        Fri              261776            1864.
##  7 casual        Sat              388860            2143.
##  8 member        Sun              289515             874.
##  9 member        Mon              348050             760.
## 10 member        Tue              375587             750.
## 11 member        Wed              381003             748.
## 12 member        Thu              389414             757.
## 13 member        Fri              344143             771.
## 14 member        Sat              334677             883.
## 15 <NA>          <NA>             733225              NA

Let’s visualize the number of rides by rider type

all_trips_v2 %>% 
  mutate(weekday = wday(started_at, label = TRUE)) %>% 
  group_by(member_casual, weekday) %>% 
  summarise(number_of_rides = n()
            ,average_duration = mean(ride_length)) %>% 
  arrange(member_casual, weekday)  %>% 
  ggplot(aes(x = weekday, y = number_of_rides, fill = member_casual)) +
  geom_col(position = "dodge")
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

Let’s create a visualization for average duration

all_trips_v2 %>% 
  mutate(weekday = wday(started_at, label = TRUE)) %>% 
  group_by(member_casual, weekday) %>% 
  summarise(number_of_rides = n()
            ,average_duration = mean(ride_length)) %>% 
  arrange(member_casual, weekday)  %>% 
  ggplot(aes(x = weekday, y = average_duration, fill = member_casual)) +
  geom_col(position = "dodge")
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## Warning: Removed 1 rows containing missing values (`geom_col()`).

STEP 5: EXPORT SUMMARY FILE FOR FURTHER ANALYSIS

Created a csv file that we will visualize in Excel, Tableau, or my presentation software

N.B.: This file location is for a Mac. If you are working on a PC, change the file location accordingly (most likely “C:_USERNAME...”) to export the data. You can read more here: https://datatofish.com/export-dataframe-to-csv-in-r/
counts <- aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)
write.csv(counts, file = '~/Desktop/q1_2020/avg_ride_length.csv')