DATA PROJECT 2: NYC NIGHTLIFE
CODE
---
title: "DP Final"
output: html_notebook
---
```{r}
library(english)
library(readr)
library(dplyr)
library(tidyverse)
library(magrittr)
library(leaflet)
# Fix the RNG seed so the randomly initialised kmeans() runs further down give
# the same clusters each time the chunks are run in order from a fresh session.
set.seed(1998)
```
```{r}
# Load one Halloween trip extract per year into a variable named
# "<year-in-words>_halloween" (e.g. nine_halloween for 2009, eighteen_halloween
# for 2018). The data directory is joined onto each file name instead of
# calling setwd(), so the working directory is left untouched.
data_dir <- path.expand("~/Desktop/College/Senior/OIDD 245/DP 2/Data")
halloween <- NULL  # kept from the original; not read anywhere in the visible code
for (year in 2009:2018) {
  # 2010 is skipped (presumably no extract exists for that year -- TODO confirm).
  if (year == 2010) next
  trips_name <- paste0(as.character(as.english(year - 2000)), "_halloween")
  trips_path <- file.path(data_dir, paste0(year, "_halloween.csv"))
  # The tibble returned by read_csv() is what gets stored. The original wrapped
  # assign() in as.data.frame(), but that converted copy was thrown away, so
  # the wrapper is dropped rather than silently changing the stored type.
  assign(trips_name, read_csv(trips_path))
}
```
```{r}
# Campus bounding boxes.
# Each campus is described by four street-corner coordinates; the pickup
# filters only use the axis-aligned box (min/max latitude and longitude)
# spanned by those corners.

# NYU corners (lat, lon)
#   TL  6th Ave & 8th St          40.733852, -73.999583
#   BL  6th Ave & W Houston St    40.727922, -74.003160
#   BR  Broadway & W Houston St   40.725564, -73.996783
#   TR  Broadway & 8th St         40.725306, -73.996712
# NOTE(review): the TR corner is almost the same point as BR although it is
# labelled as a different intersection -- looks like a copy/paste slip; verify
# the Broadway & 8th St coordinate, since it narrows the NYU box.
nyu_corner_lat <- c(TL = 40.733852, BL = 40.727922, BR = 40.725564, TR = 40.725306)
nyu_corner_lon <- c(TL = -73.999583, BL = -74.003160, BR = -73.996783, TR = -73.996712)

NYU_TL_lat <- nyu_corner_lat[["TL"]]
NYU_TL_lon <- nyu_corner_lon[["TL"]]
NYU_BL_lat <- nyu_corner_lat[["BL"]]
NYU_BL_lon <- nyu_corner_lon[["BL"]]
NYU_BR_lat <- nyu_corner_lat[["BR"]]
NYU_BR_lon <- nyu_corner_lon[["BR"]]
NYU_TR_lat <- nyu_corner_lat[["TR"]]
NYU_TR_lon <- nyu_corner_lon[["TR"]]

range_NYU_min_lat <- min(nyu_corner_lat)
range_NYU_max_lat <- max(nyu_corner_lat)
range_NYU_min_lon <- min(nyu_corner_lon)
range_NYU_max_lon <- max(nyu_corner_lon)

# Columbia corners (lat, lon)
#   TL  Riverside Dr & 122nd St     40.812765, -73.963072
#   TR  Morningside Dr & 122nd St   40.810018, -73.956806
#   BR  Morningside Dr & 110th St   40.801566, -73.961005
#   BL  Riverside Dr & 110th St     40.804859, -73.968842
col_corner_lat <- c(TL = 40.812765, BL = 40.804859, BR = 40.801566, TR = 40.810018)
col_corner_lon <- c(TL = -73.963072, BL = -73.968842, BR = -73.961005, TR = -73.956806)

Col_TL_lat <- col_corner_lat[["TL"]]
Col_TL_lon <- col_corner_lon[["TL"]]
Col_BL_lat <- col_corner_lat[["BL"]]
Col_BL_lon <- col_corner_lon[["BL"]]
Col_BR_lat <- col_corner_lat[["BR"]]
Col_BR_lon <- col_corner_lon[["BR"]]
Col_TR_lat <- col_corner_lat[["TR"]]
Col_TR_lon <- col_corner_lon[["TR"]]

range_Col_min_lat <- min(col_corner_lat)
range_Col_max_lat <- max(col_corner_lat)
range_Col_min_lon <- min(col_corner_lon)
range_Col_max_lon <- max(col_corner_lon)
```
```{r}
# Per-year, per-campus pickup subsets.
# 2009-2015 extracts carry pickup coordinates and are cut down with a lat/lon
# bounding box; the 2018 extract is filtered on PULocationID instead.
# Every subset gets a constant `reference` column ("<year> <campus>").
# An optional `passenger_count > 1` filter was left disabled in the original
# and is still not applied.

# Keep the trips whose pickup point lies inside the given bounding box, drop
# trips whose pickup and dropoff coordinates coincide, and label the rest.
#   trips                       trip table with pickup_/dropoff_ latitude/longitude
#   min_lat, max_lat            inclusive latitude bounds
#   min_lon, max_lon            inclusive longitude bounds
#   label                       value written to the `reference` column
# Returns the filtered rows with `reference` appended.
pickups_in_box <- function(trips, min_lat, max_lat, min_lon, max_lon, label) {
  kept <- trips %>%
    filter(between(pickup_latitude, min_lat, max_lat)) %>%
    filter(between(pickup_longitude, min_lon, max_lon)) %>%
    # NOTE(review): `&` also drops trips that share only ONE coordinate with
    # the pickup point; if the intent is "not the same point" this should be
    # `|`. Left as in the original -- confirm.
    filter(pickup_latitude != dropoff_latitude & pickup_longitude != dropoff_longitude)
  # Repeat the label once per row so an empty subset yields an empty frame
  # instead of a "differing number of rows" error from cbind().
  cbind(kept, reference = rep(label, nrow(kept)))
}

# Keep the trips picked up in one taxi zone (PULocationID) and label them.
pickups_in_zone <- function(trips, zone_id, label) {
  kept <- filter(trips, PULocationID == zone_id)
  cbind(kept, reference = rep(label, nrow(kept)))
}

nine_NYU <- pickups_in_box(
  nine_halloween,
  range_NYU_min_lat, range_NYU_max_lat,
  range_NYU_min_lon, range_NYU_max_lon,
  "2009 NYU"
)
nine_NYU
nine_Col <- pickups_in_box(
  nine_halloween,
  range_Col_min_lat, range_Col_max_lat,
  range_Col_min_lon, range_Col_max_lon,
  "2009 Col"
)
nine_Col
twelve_NYU <- pickups_in_box(
  twelve_halloween,
  range_NYU_min_lat, range_NYU_max_lat,
  range_NYU_min_lon, range_NYU_max_lon,
  "2012 NYU"
)
twelve_NYU
twelve_Col <- pickups_in_box(
  twelve_halloween,
  range_Col_min_lat, range_Col_max_lat,
  range_Col_min_lon, range_Col_max_lon,
  "2012 Col"
)
twelve_Col
fifteen_NYU <- pickups_in_box(
  fifteen_halloween,
  range_NYU_min_lat, range_NYU_max_lat,
  range_NYU_min_lon, range_NYU_max_lon,
  "2015 NYU"
)
fifteen_NYU
fifteen_Col <- pickups_in_box(
  fifteen_halloween,
  range_Col_min_lat, range_Col_max_lat,
  range_Col_min_lon, range_Col_max_lon,
  "2015 Col"
)
fifteen_Col
# Zone ids 114 (NYU) and 166 (Columbia) are taken as-is from the original.
eighteen_NYU <- pickups_in_zone(eighteen_halloween, 114, "2018 NYU")
eighteen_NYU
eighteen_Col <- pickups_in_zone(eighteen_halloween, 166, "2018 Col")
eighteen_Col
```
```{r}
# Elbow plot for choosing k: total within-cluster sum of squares of k-means
# fits on two positional columns of the 2009 NYU subset, for k = 1..k_max.
k_max <- 25
# NOTE(review): columns are picked by position (6 and 7); confirm they are the
# coordinate pair the clustering is meant to use.
check_df <- nine_NYU[, c(6, 7)]
# vapply() pins the result to one numeric per k, where sapply() would silently
# change shape on unexpected output.
wss <- vapply(
  seq_len(k_max),
  function(k) kmeans(check_df, k, nstart = 50, iter.max = 15)$tot.withinss,
  numeric(1)
)
wss
plot(
  seq_len(k_max), wss,
  type = "b", pch = 19, frame = FALSE,
  xlab = "Number of Clusters K",
  ylab = "Total Within-Clusters Sum of Squares"
)
```
```{r}
# Create Clusters
# Each 2009-2015 subset is split into `number_clusters` k-means clusters on two
# positional columns, the cluster id is attached to every trip, and clusters
# are ranked by trip count. The kmeans() calls run in the same order as before
# so the random stream is consumed identically. The 2018 subsets are ranked by
# DOLocationID instead of being clustered.
# NOTE(review): the clustering columns are chosen by position (6:7 for
# 2009/2012, 3:4 for 2015); confirm they are the intended coordinate pair.
number_clusters <- 100

# Count the rows in each group of an already-grouped frame, largest first.
rank_groups_by_size <- function(grouped) {
  arrange(summarise(grouped, Freq = n()), desc(Freq))
}

# 2009
nine_NYU_cluster_values <- kmeans(nine_NYU[, c(6, 7)], number_clusters)
nine_NYU_cluster <- cbind(nine_NYU, cluster = nine_NYU_cluster_values$cluster)
nine_NYU_cluster_group <- rank_groups_by_size(group_by(nine_NYU_cluster, cluster))
nine_NYU_cluster_group

nine_Col_cluster_values <- kmeans(nine_Col[, c(6, 7)], number_clusters)
nine_Col_cluster <- cbind(nine_Col, cluster = nine_Col_cluster_values$cluster)
nine_Col_cluster_group <- rank_groups_by_size(group_by(nine_Col_cluster, cluster))
nine_Col_cluster_group

# 2012
twelve_NYU_cluster_values <- kmeans(twelve_NYU[, c(6, 7)], number_clusters)
twelve_NYU_cluster <- cbind(twelve_NYU, cluster = twelve_NYU_cluster_values$cluster)
twelve_NYU_cluster_group <- rank_groups_by_size(group_by(twelve_NYU_cluster, cluster))
twelve_NYU_cluster_group

twelve_Col_cluster_values <- kmeans(twelve_Col[, c(6, 7)], number_clusters)
twelve_Col_cluster <- cbind(twelve_Col, cluster = twelve_Col_cluster_values$cluster)
twelve_Col_cluster_group <- rank_groups_by_size(group_by(twelve_Col_cluster, cluster))
twelve_Col_cluster_group

# 2015 (different column layout)
fifteen_NYU_cluster_values <- kmeans(fifteen_NYU[, c(3, 4)], number_clusters)
fifteen_NYU_cluster <- cbind(fifteen_NYU, cluster = fifteen_NYU_cluster_values$cluster)
fifteen_NYU_cluster_group <- rank_groups_by_size(group_by(fifteen_NYU_cluster, cluster))
fifteen_NYU_cluster_group

fifteen_Col_cluster_values <- kmeans(fifteen_Col[, c(3, 4)], number_clusters)
fifteen_Col_cluster <- cbind(fifteen_Col, cluster = fifteen_Col_cluster_values$cluster)
fifteen_Col_cluster_group <- rank_groups_by_size(group_by(fifteen_Col_cluster, cluster))
fifteen_Col_cluster_group

# 2018: rank dropoff zones directly (variable names keep the original spelling)
eighten_NYU_cluster_group <- rank_groups_by_size(group_by(eighteen_NYU, DOLocationID))
eighten_NYU_cluster_group
eighten_Col_cluster_group <- rank_groups_by_size(group_by(eighteen_Col, DOLocationID))
eighten_Col_cluster_group
```
```{r}
# Trips belonging to the three most frequent clusters of each subset, reduced
# to four positional columns. Later chunks read dropoff_longitude,
# dropoff_latitude and reference from these frames, so the positions are
# presumably those columns plus the cluster id -- verify against the extracts.

# Rows of `clustered` whose cluster id is among the first three entries of
# `ranking$cluster` (ranking is expected to be sorted most-frequent first),
# restricted to the columns at positions `keep_cols`.
trips_in_top_clusters <- function(clustered, ranking, keep_cols) {
  leading_ids <- ranking$cluster[1:3]
  hits <- filter(clustered, cluster %in% leading_ids)
  hits[, keep_cols]
}

nine_NYU_top <- trips_in_top_clusters(nine_NYU_cluster, nine_NYU_cluster_group, c(10, 11, 21, 22))
nine_NYU_top
nine_Col_top <- trips_in_top_clusters(nine_Col_cluster, nine_Col_cluster_group, c(10, 11, 21, 22))
nine_Col_top
twelve_NYU_top <- trips_in_top_clusters(twelve_NYU_cluster, twelve_NYU_cluster_group, c(10, 11, 21, 22))
twelve_NYU_top
twelve_Col_top <- trips_in_top_clusters(twelve_Col_cluster, twelve_Col_cluster_group, c(10, 11, 21, 22))
twelve_Col_top
# The 2015 extract has its first two kept columns at positions 7 and 8.
fifteen_NYU_top <- trips_in_top_clusters(fifteen_NYU_cluster, fifteen_NYU_cluster_group, c(7, 8, 21, 22))
fifteen_NYU_top
fifteen_Col_top <- trips_in_top_clusters(fifteen_Col_cluster, fifteen_Col_cluster_group, c(7, 8, 21, 22))
fifteen_Col_top
```
```{r}
# Stack the six top-cluster frames (2009/2012/2015 x NYU/Columbia) into one
# table, in the same order as before.
top_frames <- list(
  nine_NYU_top, nine_Col_top,
  twelve_NYU_top, twelve_Col_top,
  fifteen_NYU_top, fifteen_Col_top
)
consolidated_top <- do.call(rbind, top_frames)
consolidated_top
```
```{r}
# my_pal = colorRampPalette(colors = c("red", "blue", "orange", "purple", "yellow", "green"), space = "Lab")(6)
#
# pal_map = colorFactor(palette = my_pal, domain = consolidated_top$reference)
# pal_map
```
```{r}
# Map of every top-cluster dropoff, coloured by its "<year> <campus>" label.
# The palette is defined here because its only other definition in the
# notebook is commented out, which left the pal_map() call below without a
# function to call.
my_pal <- colorRampPalette(
  colors = c("red", "blue", "orange", "purple", "yellow", "green"),
  space = "Lab"
)(6)
pal_map <- colorFactor(palette = my_pal, domain = consolidated_top$reference)

consolidated_top_map <- leaflet() %>%
  addTiles() %>%
  addCircleMarkers(
    lng = consolidated_top$dropoff_longitude,
    lat = consolidated_top$dropoff_latitude,
    color = pal_map(consolidated_top$reference),
    popup = as.character(consolidated_top$reference)
  ) %>%
  addProviderTiles(providers$CartoDB.Positron)
consolidated_top_map
```
```{r}
# Map with one toggleable circle-marker layer per year/campus subset.
# Each entry pairs a top-cluster frame with its layer name and marker colour;
# layers are added in list order, which matches the original call order.
top_layers <- list(
  list(trips = nine_NYU_top, group = "2009 NYU", color = "red"),
  list(trips = nine_Col_top, group = "2009 Col", color = "blue"),
  list(trips = twelve_NYU_top, group = "2012 NYU", color = "green"),
  list(trips = twelve_Col_top, group = "2012 Col", color = "orange"),
  list(trips = fifteen_NYU_top, group = "2015 NYU", color = "purple"),
  list(trips = fifteen_Col_top, group = "2015 Col", color = "yellow")
)

top_map <- leaflet() %>%
  addTiles()
for (layer in top_layers) {
  top_map <- addCircleMarkers(
    top_map,
    lng = layer$trips$dropoff_longitude,
    lat = layer$trips$dropoff_latitude,
    popup = as.character(layer$trips$reference),
    group = layer$group,
    color = layer$color
  )
}
top_map <- top_map %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addLayersControl(
    overlayGroups = vapply(top_layers, function(layer) layer$group, character(1)),
    # FALSE spelled out: `F` is an ordinary variable that can be reassigned.
    options = layersControlOptions(collapsed = FALSE)
  )
top_map
```
```{r}
# Same dropoff points as the circle-marker map, drawn as standard markers that
# leaflet groups into clusters; this overwrites consolidated_top_map.
dropoff_popups <- paste(as.character(consolidated_top$reference))
base_map <- addTiles(leaflet())
clustered_markers <- addMarkers(
  base_map,
  lng = consolidated_top$dropoff_longitude,
  lat = consolidated_top$dropoff_latitude,
  clusterOptions = markerClusterOptions(),
  popup = dropoff_popups
)
consolidated_top_map <- addProviderTiles(clustered_markers, providers$CartoDB.Positron)
consolidated_top_map
```
```{r}
# Read the geocoded restaurant list. The file is read with col_names = FALSE
# (no header row is used), so the four columns are named here.
# The directory is joined onto the file name instead of calling setwd(), so
# the working directory is left untouched.
data_dir <- path.expand("~/Desktop/College/Senior/OIDD 245/DP 2/Data")
NYC_restaurants <- read_csv(
  file.path(data_dir, "NYC Restaurants Geocoded.csv"),
  col_names = FALSE
)
colnames(NYC_restaurants) <- c("name", "latitude", "longitude", "raw")
```
```{r}
# Clustered marker map of the geocoded restaurants; each popup shows the
# restaurant name.
restaurant_popups <- paste(as.character(NYC_restaurants$name))
restaurant_base <- addTiles(leaflet())
restaurant_markers <- addMarkers(
  restaurant_base,
  lng = NYC_restaurants$longitude,
  lat = NYC_restaurants$latitude,
  popup = restaurant_popups,
  clusterOptions = markerClusterOptions()
)
NYC_restaurants_map <- addProviderTiles(restaurant_markers, providers$CartoDB.Positron)
NYC_restaurants_map
```
```{r}
# Build the comparison tables from the eight campus subsets
# (nine/twelve/fifteen/eighteen x NYU/Col).
# Four columns are taken from each subset by position; the positions differ
# per extract layout. The summary chunk reads passenger_count, trip_distance,
# fare_amount and tip_amount from the result, so these positions are presumably
# those four columns -- verify against the extracts.
cols_2009_2012 <- c(4, 5, 13, 16)
cols_2015 <- c(1, 2, 9, 12)
cols_2018 <- c(4, 5, 11, 14)

stats_nine_NYU <- nine_NYU[, cols_2009_2012]
stats_nine_Col <- nine_Col[, cols_2009_2012]
stats_nine_NYU

stats_twelve_NYU <- twelve_NYU[, cols_2009_2012]
stats_twelve_Col <- twelve_Col[, cols_2009_2012]
stats_twelve_NYU

stats_fifteen_NYU <- fifteen_NYU[, cols_2015]
stats_fifteen_Col <- fifteen_Col[, cols_2015]
stats_fifteen_NYU

stats_eighteen_NYU <- eighteen_NYU[, cols_2018]
stats_eighteen_Col <- eighteen_Col[, cols_2018]
stats_eighteen_NYU

# Stack the years per campus; `indicator` marks the campus (1 = NYU,
# 0 = Columbia) and is the grouping variable for the t-tests.
stats_NYU <- rbind(stats_nine_NYU, stats_twelve_NYU, stats_fifteen_NYU, stats_eighteen_NYU)
stats_NYU[["indicator"]] <- 1
stats_Col <- rbind(stats_nine_Col, stats_twelve_Col, stats_fifteen_Col, stats_eighteen_Col)
stats_Col[["indicator"]] <- 0
stats_NYU
stats_Col

stats_consolidated <- rbind(stats_NYU, stats_Col)
stats_consolidated
```
```{r}
# Campus comparison on four trip metrics.
# For each metric, print mean (NYU, Columbia) then max (NYU, Columbia); after
# all metrics, print a two-sample t-test of each metric split by `indicator`.
# Output order is the same as writing every call out by hand.
trip_metrics <- c("tip_amount", "fare_amount", "trip_distance", "passenger_count")

for (metric in trip_metrics) {
  for (summarise_metric in list(mean, max)) {
    print(summarise_metric(stats_NYU[[metric]]))
    print(summarise_metric(stats_Col[[metric]]))
  }
}

for (metric in trip_metrics) {
  # reformulate() builds `<metric> ~ indicator`.
  print(t.test(reformulate("indicator", response = metric), data = stats_consolidated))
}
```
```{r}
# list_data2 = c('nine_hal', 'eleven_hal', 'twelve_hal','thirteen_hal', 'fourteen_hal', 'fifteen_hal', 'sixteen_hal', 'seventeen_hal', 'eighteen_hal')
#
# datasets = list(nine_hal, eleven_hal, twelve_hal, thirteen_hal, fourteen_hal, fifteen_hal, sixteen_hal, seventeen_hal, eighteen_hal)
#
# lapply(datasets, function(x) {
# filter(x, between(pickup_latitude, range_NYU_min_lat, range_NYU_max_lat)) %>%
# filter(between(pickup_longitude, range_NYU_min_lon, range_NYU_max_lon))
#
# })
```