STEP 2: WRANGLE DATA AND COMBINE INTO A SINGLE FILE
Compared column names each of the files
While the names don’t have to be in the same order, they DO need to
match perfectly before we can use a command to join them into one
file
colnames(q1_2022)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(q2_2022)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(q3_2022)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(q4_2022)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(q5_2022)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(q6_2022)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
Inspected the dataframes and look for incongruencies
Converted ride_id and rideable_type to character so that they can
stack correctly
q1_2022 <- mutate(q1_2022, ride_id= as.character(ride_id), rideable_type=as.character(rideable_type))
q2_2022 <- mutate(q2_2022, ride_id= as.character(ride_id), rideable_type=as.character(rideable_type))
q3_2022 <- mutate(q3_2022, ride_id= as.character(ride_id), rideable_type=as.character(rideable_type))
q4_2022 <- mutate(q4_2022, ride_id= as.character(ride_id), rideable_type=as.character(rideable_type))
q5_2022 <- mutate(q5_2022, ride_id= as.character(ride_id), rideable_type=as.character(rideable_type))
q6_2022 <- mutate(q6_2022, ride_id= as.character(ride_id), rideable_type=as.character(rideable_type))
q7_2022 <- mutate(q7_2022, ride_id= as.character(ride_id), rideable_type=as.character(rideable_type))
q8_2022 <- mutate(q8_2022, ride_id= as.character(ride_id), rideable_type=as.character(rideable_type))
q9_2022 <- mutate(q9_2022, ride_id= as.character(ride_id), rideable_type=as.character(rideable_type))
Stack individual quarter’s data frames into one big data frame
all_trips <- bind_rows(q1_2022, q2_2022, q3_2022, q4_2022,q5_2022,q6_2022,q7_2022,q8_2022,q9_2022)
Removed lat, long data was dropped beginning in 2022
all_trips <- all_trips %>%
select(-c(start_lat,start_lng,end_lat,end_lng))
STEP 3: CLEAN UP AND ADD DATA TO PREPARE FOR ANALYSIS
Inspect the new table that has been created
colnames(all_trips) #List of column names
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "member_casual"
nrow(all_trips) #How many rows are in data frame?
## [1] 5032567
dim(all_trips) #Dimensions of the data frame?
## [1] 5032567 9
head(all_trips) #See the first 6 rows of data frame. Also tail(all_trips)
## # A tibble: 6 × 9
## ride_id ridea…¹ started_at ended_at start…² start…³
## <chr> <chr> <dttm> <dttm> <chr> <chr>
## 1 A50255C1E1794… classi… 2022-10-14 17:13:30 2022-10-14 17:19:39 Noble … 13290
## 2 DB692A70BD2DD… electr… 2022-10-01 16:29:26 2022-10-01 16:49:06 Damen … 13288
## 3 3C02727AAF60F… electr… 2022-10-19 18:55:40 2022-10-19 19:03:30 Hoyne … 655
## 4 47E653FDC2D99… electr… 2022-10-31 07:52:36 2022-10-31 07:58:49 Rush S… KA1504…
## 5 8B5407BE53515… classi… 2022-10-13 18:41:03 2022-10-13 19:26:18 900 W … 13028
## 6 A177C92E9F021… electr… 2022-10-13 15:53:27 2022-10-13 15:59:17 900 W … 13028
## # … with 3 more variables: end_station_name <chr>, end_station_id <chr>,
## # member_casual <chr>, and abbreviated variable names ¹rideable_type,
## # ²start_station_name, ³start_station_id
tail(all_trips)
## # A tibble: 6 × 9
## ride_id ridea…¹ started_at ended_at start…² start…³
## <chr> <chr> <dttm> <dttm> <chr> <chr>
## 1 9C80CD03B685B… electr… 2022-01-09 18:56:50 2022-01-09 19:02:50 Broadw… 13325
## 2 8788DA3EDE8FD… electr… 2022-01-18 12:36:48 2022-01-18 12:46:19 Clinto… WL-012
## 3 C6C3B64FDC827… electr… 2022-01-27 11:00:06 2022-01-27 11:02:40 Racine… 13155
## 4 CA281AE7D8B06… electr… 2022-01-10 16:14:51 2022-01-10 16:20:58 Broadw… 13325
## 5 44E3489918623… electr… 2022-01-19 13:22:11 2022-01-19 13:24:27 Racine… 13155
## 6 E477C594A182A… electr… 2022-01-13 17:24:43 2022-01-13 17:28:14 Clinto… WL-012
## # … with 3 more variables: end_station_name <chr>, end_station_id <chr>,
## # member_casual <chr>, and abbreviated variable names ¹rideable_type,
## # ²start_station_name, ³start_station_id
str(all_trips) #See list of columns and data types (numeric, character, etc)
## tibble [5,032,567 × 9] (S3: tbl_df/tbl/data.frame)
## $ ride_id : chr [1:5032567] "A50255C1E17942AB" "DB692A70BD2DD4E3" "3C02727AAF60F873" "47E653FDC2D99236" ...
## $ rideable_type : chr [1:5032567] "classic_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : POSIXct[1:5032567], format: "2022-10-14 17:13:30" "2022-10-01 16:29:26" ...
## $ ended_at : POSIXct[1:5032567], format: "2022-10-14 17:19:39" "2022-10-01 16:49:06" ...
## $ start_station_name: chr [1:5032567] "Noble St & Milwaukee Ave" "Damen Ave & Charleston St" "Hoyne Ave & Balmoral Ave" "Rush St & Cedar St" ...
## $ start_station_id : chr [1:5032567] "13290" "13288" "655" "KA1504000133" ...
## $ end_station_name : chr [1:5032567] "Larrabee St & Division St" "Damen Ave & Cullerton St" "Western Ave & Leland Ave" "Orleans St & Chestnut St (NEXT Apts)" ...
## $ end_station_id : chr [1:5032567] "KA1504000079" "13089" "TA1307000140" "620" ...
## $ member_casual : chr [1:5032567] "member" "casual" "member" "member" ...
summary(all_trips) #Statistical summary of data. Mainly for numerics
## ride_id rideable_type started_at
## Length:5032567 Length:5032567 Min. :2022-01-01 00:00:05.0
## Class :character Class :character 1st Qu.:2022-05-26 22:46:23.0
## Mode :character Mode :character Median :2022-07-14 15:02:45.0
## Mean :2022-07-10 18:41:37.5
## 3rd Qu.:2022-09-01 07:25:58.5
## Max. :2022-10-31 23:59:33.0
## ended_at start_station_name start_station_id
## Min. :2022-01-01 00:01:48.00 Length:5032567 Length:5032567
## 1st Qu.:2022-05-26 23:01:21.00 Class :character Class :character
## Median :2022-07-14 15:23:31.00 Mode :character Mode :character
## Mean :2022-07-10 19:01:45.52
## 3rd Qu.:2022-09-01 07:39:32.00
## Max. :2022-11-07 04:53:58.00
## end_station_name end_station_id member_casual
## Length:5032567 Length:5032567 Length:5032567
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
Checked to make sure the proper number of observations were
reassigned
table(all_trips$member_casual)
##
## casual member
## 2154950 2877617
Added columns that list the date, month, day, and year of each
ride
This will allow us to aggregate ride data for each month, day, or
year … before completing these operations we could only aggregate at the
ride level
Added a “ride_length” calculation to all_trips (in seconds)
all_trips$ride_length <- difftime(all_trips$ended_at,all_trips$started_at)
Inspect the structure of the columns
str(all_trips)
## tibble [5,032,567 × 15] (S3: tbl_df/tbl/data.frame)
## $ ride_id : chr [1:5032567] "A50255C1E17942AB" "DB692A70BD2DD4E3" "3C02727AAF60F873" "47E653FDC2D99236" ...
## $ rideable_type : chr [1:5032567] "classic_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : POSIXct[1:5032567], format: "2022-10-14 17:13:30" "2022-10-01 16:29:26" ...
## $ ended_at : POSIXct[1:5032567], format: "2022-10-14 17:19:39" "2022-10-01 16:49:06" ...
## $ start_station_name: chr [1:5032567] "Noble St & Milwaukee Ave" "Damen Ave & Charleston St" "Hoyne Ave & Balmoral Ave" "Rush St & Cedar St" ...
## $ start_station_id : chr [1:5032567] "13290" "13288" "655" "KA1504000133" ...
## $ end_station_name : chr [1:5032567] "Larrabee St & Division St" "Damen Ave & Cullerton St" "Western Ave & Leland Ave" "Orleans St & Chestnut St (NEXT Apts)" ...
## $ end_station_id : chr [1:5032567] "KA1504000079" "13089" "TA1307000140" "620" ...
## $ member_casual : chr [1:5032567] "member" "casual" "member" "member" ...
## $ date : Date[1:5032567], format: "2022-10-14" "2022-10-01" ...
## $ month : chr [1:5032567] "10" "10" "10" "10" ...
## $ day : chr [1:5032567] "14" "01" "19" "31" ...
## $ year : chr [1:5032567] "2022" "2022" "2022" "2022" ...
## $ day_of_week : chr [1:5032567] "Friday" "Saturday" "Wednesday" "Monday" ...
## $ ride_length : 'difftime' num [1:5032567] 369 1180 470 373 ...
## ..- attr(*, "units")= chr "secs"
Convert “ride_length” from Factor to numeric so we can run
calculations on the data is.factor(all_trips$ride_length)
all_trips$ride_length <- as.numeric(as.character(all_trips$ride_length))
is.numeric(all_trips$ride_length)
## [1] TRUE
Removed “bad” data
The dataframe includes a few hundred entries when bikes were taken
out of docks and checked for quality by Divvy or ride_length was
negative
We will create a new version of the dataframe (v2) since data is
being removed
all_trips_v2 <- all_trips[!(all_trips$start_station_name == "HQ QR" | all_trips$ride_length<0),]
STEP 4: CONDUCT DESCRIPTIVE ANALYSIS
I had condensed the four lines above to one line using summary() on
the specific attribute
summary(all_trips_v2$ride_length)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 371 653 1277 1176 2483235 733225
Compared members and casual users
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = mean)
## all_trips_v2$member_casual all_trips_v2$ride_length
## 1 casual 1932.152
## 2 member 787.723
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = median)
## all_trips_v2$member_casual all_trips_v2$ride_length
## 1 casual 837
## 2 member 551
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = max)
## all_trips_v2$member_casual all_trips_v2$ride_length
## 1 casual 2483235
## 2 member 93594
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = min)
## all_trips_v2$member_casual all_trips_v2$ride_length
## 1 casual 0
## 2 member 0
See the average ride time by each day for members vs casual
users
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)
## all_trips_v2$member_casual all_trips_v2$day_of_week all_trips_v2$ride_length
## 1 casual Friday 1863.6220
## 2 member Friday 770.7869
## 3 casual Monday 1948.2110
## 4 member Monday 759.6822
## 5 casual Saturday 2142.9944
## 6 member Saturday 883.3955
## 7 casual Sunday 2219.5627
## 8 member Sunday 874.1955
## 9 casual Thursday 1678.0602
## 10 member Thursday 756.6890
## 11 casual Tuesday 1737.7837
## 12 member Tuesday 750.2949
## 13 casual Wednesday 1652.1514
## 14 member Wednesday 747.5031
Notice that the days of the week are out of order. Let’s fix
that
all_trips_v2$day_of_week <- ordered(all_trips_v2$day_of_week, levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
Now, let’s run the average ride time by each day for members vs
casual users
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)
## all_trips_v2$member_casual all_trips_v2$day_of_week all_trips_v2$ride_length
## 1 casual Sunday 2219.5627
## 2 member Sunday 874.1955
## 3 casual Monday 1948.2110
## 4 member Monday 759.6822
## 5 casual Tuesday 1737.7837
## 6 member Tuesday 750.2949
## 7 casual Wednesday 1652.1514
## 8 member Wednesday 747.5031
## 9 casual Thursday 1678.0602
## 10 member Thursday 756.6890
## 11 casual Friday 1863.6220
## 12 member Friday 770.7869
## 13 casual Saturday 2142.9944
## 14 member Saturday 883.3955
analyze ridership data by type and weekday
all_trips_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>% #creates weekday field using wday()
group_by(member_casual, weekday) %>% #groups by usertype and weekday
summarise(number_of_rides = n() #calculates the number of rides and average duration
,average_duration = mean(ride_length)) %>% # calculates the average duration
arrange(member_casual, weekday)
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 15 × 4
## # Groups: member_casual [3]
## member_casual weekday number_of_rides average_duration
## <chr> <ord> <int> <dbl>
## 1 casual Sun 316093 2220.
## 2 casual Mon 220388 1948.
## 3 casual Tue 202664 1738.
## 4 casual Wed 209858 1652.
## 5 casual Thu 237255 1678.
## 6 casual Fri 261776 1864.
## 7 casual Sat 388860 2143.
## 8 member Sun 289515 874.
## 9 member Mon 348050 760.
## 10 member Tue 375587 750.
## 11 member Wed 381003 748.
## 12 member Thu 389414 757.
## 13 member Fri 344143 771.
## 14 member Sat 334677 883.
## 15 <NA> <NA> 733225 NA
Let’s visualize the number of rides by rider type
all_trips_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(number_of_rides = n()
,average_duration = mean(ride_length)) %>%
arrange(member_casual, weekday) %>%
ggplot(aes(x = weekday, y = number_of_rides, fill = member_casual)) +
geom_col(position = "dodge")
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

Let’s create a visualization for average duration
all_trips_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(number_of_rides = n()
,average_duration = mean(ride_length)) %>%
arrange(member_casual, weekday) %>%
ggplot(aes(x = weekday, y = average_duration, fill = member_casual)) +
geom_col(position = "dodge")
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## Warning: Removed 1 rows containing missing values (`geom_col()`).
