# Attach the packages used by the analysis. Each call is on its own line:
# several statements on one line without a separator do not parse in R.
library(tidyverse)
library(here)
library(dplyr)

This assignment focuses on working with the dataset from Tidy Tuesday (09/28/2021).

Load the data - Week 40 09/28/21

# All five tables are published in the same Tidy Tuesday folder, so the URL
# prefix is defined once instead of being repeated in every call.
tidytuesday_url <- paste0(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/",
  "master/data/2021/2021-09-28/"
)

# Read one CSV file from the Tidy Tuesday folder.
# `file` is the file name inside that folder, e.g. "papers.csv".
# Returns whatever readr::read_csv() returns for that URL.
read_tidytuesday <- function(file) {
  readr::read_csv(paste0(tidytuesday_url, file))
}

papers <- read_tidytuesday("papers.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   paper = col_character(),
##   year = col_double(),
##   month = col_double(),
##   title = col_character()
## )
authors <- read_tidytuesday("authors.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   author = col_character(),
##   name = col_character(),
##   user_nber = col_character(),
##   user_repec = col_character()
## )
programs <- read_tidytuesday("programs.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   program = col_character(),
##   program_desc = col_character(),
##   program_category = col_character()
## )
paper_authors <- read_tidytuesday("paper_authors.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   paper = col_character(),
##   author = col_character()
## )
paper_programs <- read_tidytuesday("paper_programs.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   paper = col_character(),
##   program = col_character()
## )

Understanding the data

# Column names, types and leading values of the papers table.
papers %>% glimpse()
## Rows: 29,434
## Columns: 4
## $ paper <chr> "w0001", "w0002", "w0003", "w0004", "w0005", "w0006", "w0007", "…
## $ year  <dbl> 1973, 1973, 1973, 1973, 1973, 1973, 1973, 1973, 1973, 1973, 1973…
## $ month <dbl> 6, 6, 6, 7, 7, 7, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 12,…
## $ title <chr> "Education, Information, and Efficiency", "Hospital Utilization:…
# Column names, types and leading values of the authors table.
authors %>% glimpse()
## Rows: 15,437
## Columns: 4
## $ author     <chr> "w0001.1", "w0002.1", "w0003.1", "w0004.1", "w0005.1", "w00…
## $ name       <chr> "Finis Welch", "Barry R Chiswick", "Swarnjit S Arora", "Lee…
## $ user_nber  <chr> "finis_welch", "barry_chiswick", "swarnjit_arora", NA, "jam…
## $ user_repec <chr> NA, "pch425", NA, "pli669", "psm28", NA, NA, NA, "pli259", …
# Column names, types and leading values of the programs table.
programs %>% glimpse()
## Rows: 21
## Columns: 3
## $ program          <chr> "AG", "AP", "CF", "CH", "DAE", "DEV", "ED", "EEE", "E…
## $ program_desc     <chr> "Economics of Aging", "Asset Pricing", "Corporate Fin…
## $ program_category <chr> "Micro", "Finance", "Finance", "Micro", "Micro", "Mic…
# Column names, types and leading values of the paper-to-author link table.
paper_authors %>% glimpse()
## Rows: 67,090
## Columns: 2
## $ paper  <chr> "w0001", "w0002", "w0003", "w0004", "w0005", "w0006", "w0007", …
## $ author <chr> "w0001.1", "w0002.1", "w0003.1", "w0004.1", "w0005.1", "w0006.1…
# Column names, types and leading values of the paper-to-program link table.
paper_programs %>% glimpse()
## Rows: 53,996
## Columns: 2
## $ paper   <chr> "w0074", "w0087", "w0087", "w0107", "w0116", "w0117", "w0129",…
## $ program <chr> "EFG", "IFM", "ITI", "PE", "PE", "LS", "HE", "IFM", "ITI", "HE…
# First rows of the programs table (code, description, category).
programs %>% head()
## # A tibble: 6 × 3
##   program program_desc                        program_category
##   <chr>   <chr>                               <chr>           
## 1 AG      Economics of Aging                  Micro           
## 2 AP      Asset Pricing                       Finance         
## 3 CF      Corporate Finance                   Finance         
## 4 CH      Children                            Micro           
## 5 DAE     Development of the American Economy Micro           
## 6 DEV     Development Economics               Micro
# First rows of the papers table (paper id, year, month, title).
papers %>% head()
## # A tibble: 6 × 4
##   paper  year month title                                                       
##   <chr> <dbl> <dbl> <chr>                                                       
## 1 w0001  1973     6 Education, Information, and Efficiency                      
## 2 w0002  1973     6 Hospital Utilization: An Analysis of SMSA Differences in Ho…
## 3 w0003  1973     6 Error Components Regression Models and Their Applications   
## 4 w0004  1973     7 Human Capital Life Cycle of Earnings Models: A Specific Sol…
## 5 w0005  1973     7 A Life Cycle Family Model                                   
## 6 w0006  1973     7 A Review of Cyclical Indicators for the United States: Prel…
# First rows of the paper-to-program link table.
paper_programs %>% head()
## # A tibble: 6 × 2
##   paper program
##   <chr> <chr>  
## 1 w0074 EFG    
## 2 w0087 IFM    
## 3 w0087 ITI    
## 4 w0107 PE     
## 5 w0116 PE     
## 6 w0117 LS
# Number of paper-program rows for each program code.
count(paper_programs, program)
## # A tibble: 21 × 2
##    program     n
##    <chr>   <int>
##  1 AG       1646
##  2 AP       2722
##  3 CF       2361
##  4 CH       1689
##  5 DAE      1671
##  6 DEV      1277
##  7 ED       1601
##  8 EEE      1249
##  9 EFG      5803
## 10 HC       1708
## # … with 11 more rows

The main goal is to analyze the categories of National Bureau of Economic Research working papers over time.

# Attach program metadata and paper metadata to every paper-program pair.
# The join keys are named explicitly so the result does not depend on which
# columns the tables happen to share; as a consequence dplyr no longer prints
# its "Joining, by = ..." messages.
papers_joined <- paper_programs %>%
  left_join(programs, by = "program") %>%
  left_join(papers, by = "paper") %>%
  # Drop rows whose program has no category after the join.
  filter(!is.na(program_category)) %>%
  # Keep only these five columns, one row per distinct combination.
  distinct(paper, program, program_category, year, title)
# Number of paper-program rows in each program category.
var1 <- count(papers_joined, program_category)
var1
## # A tibble: 3 × 2
##   program_category        n
##   <chr>               <int>
## 1 Finance              5083
## 2 Macro/International 16735
## 3 Micro               31791

Plot the results

# Bar chart of the number of rows per program category. The legend is hidden
# because the fill colour repeats the information on the category axis.
P1 <- ggplot(
  papers_joined,
  aes(x = program_category, fill = program_category)
) +
  geom_bar(width = 1, show.legend = FALSE) +
  labs(x = NULL, y = NULL) +
  theme(aspect.ratio = 1)

# Same bars drawn horizontally.
P1 + coord_flip()

# Same bars drawn in polar coordinates.
P1 + coord_polar()

Now that we have seen how the papers are distributed across program categories, we can check the number of papers over the years.

# Number of paper-program rows per year and program category.
# count() is given both keys directly, which replaces the base subset() call
# and the repeated group_by(year); the result is a plain (ungrouped) tibble,
# so later verbs are not silently applied per year.
var2 <- papers_joined %>%
  count(year, program_category) %>%
  arrange(year)

# Plot the yearly counts per program category over time:
# one point per year and category, plus a smoothed trend line per category.
P2 <- ggplot(var2, aes(x = year, y = n, color = program_category)) +
  geom_point() +
  geom_smooth() +
  labs(x = "Year", y = "Number of Articles")
P2
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

From the graph we can see that the Micro category has had a huge increase in the last 20 years, so it could be interesting to analyze which programs have the most influence within the Micro category.

# First rows of the joined table, as a reminder of its columns.
papers_joined %>% head()
## # A tibble: 6 × 5
##   paper program program_category     year title                                 
##   <chr> <chr>   <chr>               <dbl> <chr>                                 
## 1 w0074 EFG     Macro/International  1975 Variation Across Household in the Rat…
## 2 w0087 IFM     Macro/International  1975 Exports and Foreign Investment in the…
## 3 w0087 ITI     Macro/International  1975 Exports and Foreign Investment in the…
## 4 w0107 PE      Micro                1975 Social Security and Retirement Decisi…
## 5 w0116 PE      Micro                1975 Notes on the Tax Treatment of Human C…
## 6 w0117 LS      Micro                1980 Job Mobility and Earnings Growth
# Keep only the rows that belong to the Micro program category.
var3 <- papers_joined %>%
  filter(program_category == "Micro")


# Bar chart of the number of rows per program within the Micro category.
# The legend is hidden because the fill colour repeats the program axis.
P3 <- ggplot(var3, aes(x = program, fill = program)) +
  geom_bar(width = 1, show.legend = FALSE) +
  labs(x = "Program", y = "Number of Papers") +
  theme(aspect.ratio = 1)
# Draw the bars horizontally.
P3 + coord_flip()

The graph shows that Public Economics and Labor Studies are the programs with the largest number of papers within the Micro category.