In this lab, I practiced data wrangling and visualization skills using real-time COVID-19 data maintained by the New York Times. If you scroll down this page, you will see different tables demonstrating COVID trends across the state of California.

#LOAD DATA from NY Times
covid_url=("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv")
covid= read_csv(covid_url)  #covid= data from nytimes
#STEP2: subset Covid Data -> filters data to CA Counties
CA_covid= covid%>%
  filter(state=="California")%>%
  group_by(county) %>%
  mutate(newCases = c(cases[1], diff(cases))) %>%
  ungroup() %>%
  filter(date >=max(date)-13)

#STEP 3
#Most cumulative cases
CA_cumulative= CA_covid %>%
  filter(date==max(date)) %>%
  slice_max(cases, n=5) %>%
  select(county,cases)
#Most new cases
CA_new= CA_covid %>%
  filter(date==max(date)) %>%
  slice_max(newCases, n=5) %>%
  select(county, newCases)

#make table using Knitr::Kable
knitr::kable(CA_cumulative,
             caption = "Counties with Most Cumulative Cases",
             col.names = c("County", "Total Cases"))
Counties with Most Cumulative Cases
County Total Cases
Los Angeles 253985
Riverside 55073
Orange 52121
San Bernardino 50699
San Diego 42742
knitr::kable(CA_new,
             caption = "Counties with Most New Cases",
             col.names = c("County", "Total New Cases"))
Counties with Most New Cases
County Total New Cases
Los Angeles 809
San Diego 265
Orange 185
Fresno 159
San Bernardino 156
#STEP4/5 : Load population data
library(readxl)
pop = read_excel('../data/PopulationEstimates.xls', skip = 2) %>%
  select(pop2019 = POP_ESTIMATE_2019, fips = FIPStxt) #change column names

#STEP 6 #examine data
names(pop)
## [1] "pop2019" "fips"
dim(pop)
## [1] 3273    2
names(CA_covid)
## [1] "date"     "county"   "state"    "fips"     "cases"    "deaths"   "newCases"
dim(CA_covid)
## [1] 812   7
#STEP 7
#join COVID & Population data

all_data= left_join(covid, pop, by='fips')

These are the 13 counties deemed safe in terms of the number of covid cases among their residents. Over the most recent 14 days, if more than 100 people (per 100,000 residents) are infected with Covid, they are placed on the State Watch List.

last_14_days= all_data %>%
  filter(state==c("California")) %>%
  group_by(county, date) %>%
  summarize(totalcases=sum(cases, na.rm = TRUE),
            pop2019= sum(pop2019, na.rm = TRUE)) %>%
  ungroup() %>%
  filter(date>=max(date)-13)%>%
  group_by(county,pop2019)%>%
  mutate(newCases=totalcases-lag(totalcases))%>%
  summarize(totalcases= sum(newCases,na.rm = TRUE)) %>%
  mutate(per100= totalcases/ (pop2019/100000)) %>%
  filter(per100 <=100)%>%
  pull(county)
last_14_days
##  [1] "Alpine"          "Amador"          "Del Norte"       "El Dorado"      
##  [5] "Humboldt"        "Inyo"            "Lake"            "Lassen"         
##  [9] "Mariposa"        "Mendocino"       "Mono"            "Napa"           
## [13] "Nevada"          "Placer"          "Plumas"          "Riverside"      
## [17] "San Francisco"   "San Luis Obispo" "Shasta"          "Sierra"         
## [21] "Siskiyou"        "Solano"          "Tehama"          "Trinity"        
## [25] "Tuolumne"
knitr::kable(last_14_days, caption = paste('Safe counties across State'),
             col.names = c('County'))
Safe counties across State
County
Alpine
Amador
Del Norte
El Dorado
Humboldt
Inyo
Lake
Lassen
Mariposa
Mendocino
Mono
Napa
Nevada
Placer
Plumas
Riverside
San Francisco
San Luis Obispo
Shasta
Sierra
Siskiyou
Solano
Tehama
Trinity
Tuolumne

This graph shows the number of daily, new cases at the state level for NY, CA, LA, and FL.

#QUESTION 2
##In this question, we are going to look at the story of 4 states and the impact scale can have on data interprtation. 
#The states include: New York, California, Louisiana, and Florida.
##Your task is to make a faceted bar plot showing the number of daily, new cases at the state level.

#Step 1: group data to state/county level, then filter it to the 4 states of interest

four_states=covid %>%
  group_by(state, date)%>%
  summarise(cases=sum(cases))%>%
  ungroup()%>%
  filter(state %in% c("New York", "California", "Louisiana", "Florida")) %>%
  group_by(state) %>%
  mutate(newCases=cases-lag(cases))%>%
  mutate(roll7=zoo::rollmean(newCases,7,fill=NA, align='right'))
  
ggplot(data= four_states, aes(x=date))+
  geom_col(aes(y=newCases), col='red')+
  geom_line(aes(y=roll7), col='black')+
  facet_wrap(~state)+
  labs(title = "Daily New Cases in NY, CA, LA, FL",
       x="Date",
       y="New Cases",
       caption = 'From NY Times COVID Data',
       theme_bw())

*Code below did not run as expected

The following graphs demonstrate the cases per capita of each state. This data is a better representation of the COVID situation in each state, as the graph above is strictly raw data and can be misleading.

percap_state_data= four_states%>% left_join(pop, by= c(“state”=‘fips’)) %>% mutate(new_percap= newCases/pop2019, perCap7=rollmean(new_percap, 7, fill=NA, align=‘center’))

#Graph new cases per capita ggplot(data = percap_state_data, aes(x=date)) + geom_col(aes(y=new_percap), col=‘green’) + geom_line(aes(y=perCap7), col= “black”)+ facet_wrap(~state) + labs(title = “Daily New Cases Per Capita”, x=“Date”, y=“New Cases”, caption = ‘From NY Times COVID Data’, theme_bw())