Part 1

Question 1

Kaggle Data

setwd("scripts")
scripts <- vapply(dir(), FUN.VALUE = rep(list(1), 3), FUN = function(x) {
  temp <- read.csv(x, as.is = )
  Episode <- rep(gsub(".csv", "", x), nrow(temp))
  cbind(Episode, temp)
})
scripts <- data.frame(apply(scripts, 1, unlist, use.names = FALSE), stringsAsFactors = FALSE)
variable.names(scripts)

## [1] "Episode"   "Character" "Line"

dim(scripts)

## [1] 65942     3

IMDB Data

load("imdb.RData")
variable.names(imdb)

## [1] "rating" "votes"

dim(imdb)

## [1] 125   2

Wikipedia Data

library(rvest)

## Loading required package: xml2

parks_and_rec <- local({
  url <- "https://en.wikipedia.org/wiki/List_of_Parks_and_Recreation_episodes"
  episode_data <- html_table(html_nodes(read_html(url), xpath = "//*/table"), fill = TRUE)
  Reduce("rbind", episode_data[2:8])
})
parks_and_rec$Title <- gsub(pattern = "[^0-9a-zA-Z.,' ]", replacement = "", parks_and_rec$Title)
variable.names(parks_and_rec)

## [1] "No.overall"             "No. inseason"           "Title"                 
## [4] "Directed by"            "Written by"             "Original air date"     
## [7] "U.S. viewers(millions)"

dim(parks_and_rec)

## [1] 122   7

Question 2

Conform the Data

imdb <- local({
  for (x in c(91, 111, 124)) { # these are the rows of 2 part episodes
    mat <- imdb[c(x, x + 1), ]
    imdb[x + 1, ] <- c((mat[1] * mat[3] + mat[2] * mat[4]) / (mat[3] + mat[4]), (mat[3] + mat[4]) / 2)
  }
  imdb[-c(91, 111, 124), ]
})

Clean Up the Wikipedia Data

parks_and_rec <- with(parks_and_rec, {
  mm_of_viewers <- as.numeric(substr(`U.S. viewers(millions)`, 1, 4))
  cbind(parks_and_rec[, -7], mm_of_viewers)
})
parks_and_rec$No.overall <- as.numeric(rownames(parks_and_rec))

Question 3

source("reading_ease.R")

Combined episode data

parks_and_rec <- with(scripts, {
  ep_flesch <- tapply(Line, Episode, reading_ease)
  cbind(parks_and_rec, imdb, ep_flesch)
})

Per episode Flesch reading ease score by character

main_ch <- local({
  raw_ch <- c("Andy Dwyer, Ann Perkins, April Ludgate, Ben Wyatt, Chris Traeger, Donna Meagle, Jerry Gergich, Leslie Knope, Ron Swanson, Tom Haverford")
  unlist(strsplit(raw_ch, ", "))
})

s_no <- paste("s", 1:7, sep = "")

season <- local({
  list_of_seasons <- lapply(s_no, function(x) rep(x, sum(grepl(x, rownames(parks_and_rec)))))
  factor(unlist(list_of_seasons, use.names = FALSE))
})

setwd("scripts")
scriptz <- lapply(dir(), read.csv, stringsAsFactors = FALSE)

ch_flesch_per_ep <- local({
  ch_flesch <- lapply(scriptz, function(x) {
    with(x, {
      main_ind <- Character %in% main_ch
      tapply(Line[main_ind], Character[main_ind], reading_ease)
    })
  })
  with(parks_and_rec, {
    overall_ep <- rep(No.overall, sapply(ch_flesch, length))
    data.frame(
      "season" = season[overall_ep],
      "episode" = `No. inseason`[overall_ep],
      "title" = Title[overall_ep],
      "writer" = `Written by`[overall_ep],
      "character" = unlist(lapply(ch_flesch, names)),
      "reading_ease" = unlist(ch_flesch, use.names = FALSE)
    )
  })
})

Augment Scripts

scripts <- local({
  list_of_seasons <- lapply(s_no, function(x) rep(x, sum(grepl(x, scripts$Episode))))
  Season <- factor(unlist(list_of_seasons, use.names = FALSE))
  cbind(Season, scripts)
})

Part 2

Question 5

Line Plots

pdf("line_plots.pdf")
with(parks_and_rec, {
  plot(mm_of_viewers ~ No.overall,
       type = "l",
       main = "Line Plot of Viewers by Episode")
  plot(rating ~ No.overall,
       type = "l",
       main = "Line Plot of IMBd Rating by Episode")
})
with(scripts, {
  most_lines <- names(tail(sort(tapply(Line, Character, length)), 8))
  flesch_mat <- tapply(Line, list(Episode, Character), reading_ease)
  matplot(flesch_mat[, most_lines],
          type = "l",
          main = "Line Plot of Character Reading Ease Score by Episode",
          xlab = "Episode",
          ylab = "Flesch Reading Ease Score",
          lty = seq_along(most_lines),
          col = seq_along(most_lines))
  legend(x = "top",
         legend = main_ch,
         ncol = 3,
         lty = seq_along(most_lines),
         bty = "n",
         col = seq_along(most_lines))
  abline(h = 90)
})
dev.off()

line_plots.pdf

The first line plot tells us that the pilot episode had the most viewers for the entire show, and also shows a downward trend as the episodes pass. The downward trend indicates a decrease in interest of the viewers in the show as it progressed. The second line plot tells us that, on the contrary, the IMBd rating of the show went up as the show progressed, reaching a peak at the show’s finale. The last line plot is just a mess. It tells us that the 8 main characters of the show with the most lines have reading ease scores that fluctuate around a score of 90.

Scatter Plots

pdf("scatter_plots.pdf")
scatter_hist <- function(x, y, f = NULL) {
  xname <- deparse(substitute(x), backtick = FALSE)
  yname <- deparse(substitute(y), backtick = FALSE)
  breakz <- function(a) {
    increment <- sqrt(length(a)) * abs(mean(diff(a)))
    seq(min(a), max(a) + increment, increment)
  }
  layout(rbind(c(2, 4), c(1, 3)),
         widths = c(2, 1),
         heights = c(1, 2),
         respect = TRUE)
  par(mar = c(6, 6, 0, 0))
  # Branch here. Everything above is the same.
  if (is.null(f)) {
    xcounts <- hist(x,
                    breaks = breakz(x),
                    plot = FALSE)$counts
    ycounts <- hist(y,
                    breaks = breakz(y),
                    plot = FALSE)$counts
    top <- max(xcounts, ycounts)
    plot(x, y,
         xlim = range(x),
         ylim = range(y),
         xlab = xname,
         ylab = yname)
    par(mar = c(0, 6, 4, 0))
    barplot(xcounts,
            axes = FALSE,
            ylim = c(0, top),
            space = 0, # Space btwn bars.
            main = paste("Scatterplot of", xname, "against", yname))
    par(mar = c(6, 0, 0, 2))
    barplot(ycounts,
            axes = FALSE,
            xlim = c(0, top),
            space = 0,
            horiz = TRUE)
  } else {
    fname <- deparse(substitute(f), backtick = FALSE)
    histo <- function(b, c) tapply(b, c, hist,
                                   breaks = breakz(b),
                                   plot = FALSE)
    counts <- function(d, e) lapply(histo(d, e), `[[`, "counts")
    xcounts <- do.call(rbind, counts(x, f))
    ycounts <- do.call(rbind, counts(y, f))
    top <- max(colSums(xcounts), colSums(ycounts))
    plot(x, y,
         xlim = range(x),
         ylim = range(y),
         xlab = xname,
         ylab = yname,
         col = seq_along(unique(f)))
    par(mar = c(0, 6, 4, 0))
    barplot(xcounts,
            axes = FALSE,
            ylim = c(0, top),
            space = 0, # Space btwn bars.
            col = seq_along(unique(f)),
            main = paste("Scatterplot of\n", xname, "against", yname, "\nby", fname))
    par(mar = c(6, 0, 0, 2))
    barplot(ycounts,
            axes = FALSE,
            xlim = c(0, top),
            space = 0,
            col = seq_along(unique(f)),
            horiz = TRUE)
    par(mar = c(0, 0, 0, 2))
    plot.new()
    legend(x = "center",
           legend = unique(f),
           fill = seq_along(unique(f)),
           title = deparse(substitute(f), backtick = FALSE))
  }
}
with(parks_and_rec, {
  scatter_hist(rating, votes)
  scatter_hist(rating, votes, season)
  scatter_hist(mm_of_viewers, rating)
  scatter_hist(mm_of_viewers, rating, season)
})
dev.off()

scatter_plots.pdf

Please note that do.call() was mentioned in the order function section and deparse(substitute(), backtick = FALSE) was mentioned in the attach function in the Stats 20 Reference Manual. The two scatter plots show abolutely no correlation between rating and votes or mm_of_viewers and rating. The marginal histogram for rating on both scatterplots show a normal-ish, unimodal distribution, which may be due to either a random voting by the public, or the fact that the show randomly does better or worse based on unseen factors. The marginal histogram for votes shows a heavy right skew. This is probably due to the fact that particularly bad or good episodes will invite heavy voting. The marginal histogram for mm_of_viewers also show a right skew. This makes sense for a similar reason.

Box Plots

with(parks_and_rec, {
  writers <<- unlist(strsplit(`Written by`, " & "))
  writers_5 <- names(which(table(writers) >= 5))
  writer_list <<- sapply(writers_5, grep, `Written by`)
  directors_5 <- names(which(table(`Directed by`) >= 5))
  director_ind <<- sapply(directors_5, grep, `Directed by`)
})
pdf("box_plots.pdf")
with(parks_and_rec, {
  writer_rating <- lapply(writer_list, function(x) rating[x])
  director_views <- lapply(director_ind, function(y) mm_of_viewers[y])
  par(mar = c(5, 8, 4, 2))
  boxplot(writer_rating,
          las = 1, # This makes all tick labels horizontal.
          horizontal = TRUE,
          xlab = "IMDb Rating",
          main = "IMDb Ratings for Writers w/ 5+ Writing Credits")
  abline(v = 8.3)
  boxplot(director_views,
          las = 1,
          horizontal = TRUE,
          xlab = "Viewers (in millions)",
          main = "Millions of Viewers for Directors w/ 5+ Directing Credits")
})
dev.off()

box_plots.pdf

The IMDb Ratings for Writers w/ 5+ Writing Credits shows that these writers consistently get similar IMDb ratings, with Amy Poehler being especially popular. Most writers cluster around an IMDb rating of 8.3. There are 3 groups of directors with similar viewership. Morgan Sackett and Craig Zisk have similar boxplots, indicating that they may have directed episodes that a particular group of viewers enjoy. Michael Schur, Ken Whittingham, and Dean Holland have similar boxplots, indicating that they may have directed episodes that a particular group of viewers enjoy. Finally, Troy Miller and Randall Einhorn have similar boxplots, indicating that they may have directed episodes that a particular group of viewers enjoy.

Histograms

pdf("histograms.pdf")
hist(sapply(scriptz, nrow), ylim = c(0, 80), xlab = "Amount of Lines", main = "Histogram of Amount of Lines Spoken in each Episode")
with(ch_flesch_per_ep, {
  par(mfrow = c(1, 3))
  hist. <- function(x, y) {
    x_first <- substr(x, 1, regexpr(" ", x) - 1)
    y_first <- substr(y, 1, regexpr(" ", y) - 1)
    v <- function(z) reading_ease[character %in% z]
    breakz <- pretty(unique(v(x), v(y)))
    hist(v(x),
         breaks = breakz,
         xlab = "",
         ylab = "",
         ylim = c(-115, 115),
         yaxt = "n",
         main = paste("Mirrored Histogram of\nReading Ease Scores of\n", x_first, "&", y_first))
    text(x = mean(v(x)),
         y = 117,
         labels = x)
    par(new = TRUE)
    hist(v(y),
         breaks = breakz,
         xlab = "Reading Ease Score",
         ylim = c(115, -115),
         axes = FALSE,
         main = "")
    axis(side = 2,
         at = pretty(par("usr")[3:4]),
         labels = abs(pretty(par("usr")[3:4])))
    text(x = mean(v(y)),
         y = 117,
         labels = y)
  }
  hist.("April Ludgate", "Andy Dwyer")
  hist.("Donna Meagle", "Tom Haverford")
  hist.("Ron Swanson", "Leslie Knope")
})
dev.off()

histograms.pdf

The first histogram has a heavy right skew, but this makes sense since people usually speak more shorter sentances than longer ones. The second, third, and fourth mirrored histograms show similar reading ease score distributions between the compared characters. Ron and Leslie have more variability Than the others seen on the page in reading ease score distribution. Donna and Tom have very centralized histograms, indicating an integrity of the two in sticking to a particularly uniform way of speaking.

Question 6

Part A

Leslie_flesch <- with(scripts, sapply(Line[Character %in% "Leslie Knope"], reading_ease))
summary(Leslie_flesch)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -217.18   75.88   95.69   90.39  118.17  205.82

Part B

viewers_per_season <- tapply(parks_and_rec$mm_of_viewers, season, sum)
summary(viewers_per_season)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   32.11   48.91   72.51   67.45   79.09  111.52

Part C

rate_vote_df <- by(imdb, season, function(x) {
    sum(x["rating"] * x["votes"]) / sum(x["votes"])
})
summary(rate_vote_df)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   7.282   8.152   8.299   8.213   8.468   8.674

Part D

line_per_episode <- with(scripts, tapply(Line, Episode, length))
summary(line_per_episode)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   402.0   495.2   528.0   540.5   552.8   993.0

Question 7

pdf("free_exploration.pdf")
with(scripts, {
  seas_flesch <- tapply(Line, Season, reading_ease)
  plot(1:7, seas_flesch,
       type = "l",
       xlab = "Season",
       ylab = "Reading Ease Score",
       main = "Line Plot of Reading Ease Score by Season")
  ch_per_ep <- tapply(Character, Episode, function(x) {
    length(unique(x))
  })
  plot(1:122, ch_per_ep,
       type = "l",
       xlab = "Overall Episode",
       ylab = "No. of Characters",
       main = "Line Plot of Amount of Characters by Episode")
  ch_per_seas <- tapply(Character, Season, function(x) {
    length(unique(x))
  })
  plot(1:7, ch_per_seas,
       type = "l",
       xlab = "Season",
       ylab = "No. of Characters",
       main = "Line Plot of Amount of Characters by Season")
})
with(parks_and_rec, {
  plot(ep_flesch ~ No.overall,
       type = "l",
       xlab = "Overall Episode",
       ylab = "Reading Ease Score",
       main = "Line Plot of Reading Ease Score by Episode")
  writer_flesch <- lapply(writer_list, function(x) ep_flesch[x])
  par(mar = c(5, 8, 4, 2))
  boxplot(writer_flesch,
          las = 1,
          horizontal = TRUE,
          xlab = "Reading Ease Score",
          main = "Boxplots of Reading Ease Scores for Writers w/ 5+ Writing Credits")
  abline(v = 90)
  writer_ind <- sort(unlist(writer_list))
  writers_w_5 <- gsub("[0-9]", "", names(writer_ind))
  overall_ep <- No.overall[writer_ind]
  ep_reading_ease <- ep_flesch[writer_ind]
  scatter_hist(overall_ep, ep_reading_ease, writers_w_5)
})
dev.off()

free_exploration.pdf

There’s a steep drop in reading ease score through the seasons of the show as indicated by the Line Plot of Reading Ease Score by Season. The number of characters per episode fluctuates wildly with a slight upward trend as shown by the Line Plot of Amount of Characters by Episode, with peaks at episode 20, 107 or 108, and the finale of the show. In comparison, the amount of characters by season also fluctuates pretty wildly by season, with a moderate upward trend and a peak character count in season 2. The Line Plot of Reading Ease Score by Episode shows a wildly-fluctuating, slight-downward trend in reading ease score through the episodes. The downward trend could indicate the attempt of the show to tackle more complex issues, thus sacrificing simplicity in understanding for coverage of deep topics. As we can see for the boxplots, most writers have a median reading ease score above 90, which indicates that they are very good writers. This good writing quality probably contributed to the success of the show. The top marginal histogram on the scatter plot shows a near-uniform distribution of writers per episode, indicating an equal distribution of writing responsibility across the writers over time. The side marginal histogram on the scatter plot shows a normal-ish, unimodal distribution of episodic reading scores. This might be due to the random variability in writing ability from episode to episode of the writers.

Part 3

Question 8

Part A

set.seed(605328259)
Leslie_lines <- with(scripts, Line[Character %in% "Leslie Knope"])
sample1 <- sample(Leslie_lines, 100)
head(sample1)

## [1] "Oh, cry me a river."                               
## [2] "Call me."                                          
## [3] "It depicts the very famous battle at Conega Creek."
## [4] "Really?"                                           
## [5] "Can you read it, please?"                          
## [6] "No, no."

Part B

interval1 <- t.test(sapply(sample1, reading_ease))$conf.int
mu <- mean(sapply(Leslie_lines, reading_ease))
interval1

## [1] 83.26549 95.38827
## attr(,"conf.level")
## [1] 0.95

mu

## [1] 90.39229

Leslie’s true mean reading ease score, 90.3922916, is indeed captured by the interval, 83.2654887, 95.388266.

Part C

intervals <- t(replicate(1000, {
  samp <- sample(Leslie_lines, 100)
  t.test(sapply(samp, reading_ease))$conf.int
}))
sum(intervals[, 1] < mu & intervals[, 2] > mu) / nrow(intervals)

## [1] 0.936

Question 9

Do the directors have writers that they prefer to work with? This is a chi-squared association test. The null hypothesis is that there’s no association between directors and writers of episodes. The alternative hypothesis is that there is an association between directors and writers of episodes.

with(parks_and_rec, {
  insert <- function(x, new, ind) {
    x[(ind + 1):(length(x) + 1)] <- x[ind:length(x)]
    x[ind] <- new
    x
  }
  index <- which(grepl("&", `Written by`))
  for (i in index) {
    `Directed by` <- insert(`Directed by`, `Directed by`[i], i)
  }
  chisq.test(table(writers, `Directed by`))
})

## Warning in chisq.test(table(writers, `Directed by`)): Chi-squared approximation
## may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  table(writers, `Directed by`)
## X-squared = 1179.5, df = 1073, p-value = 0.01252

Since our P-value, .013, is less than a significance level of .05, we reject the null hypothesis. There’s convincing evidence that there’s an association between directors and writers of episodes. The directors and writers of episodes are not independenct. Specifically, Dean Holland really likes to work with Alan Yang and Michael Schur.

Thank you for reading my report!

Final Project

James Chen

12/8/2019

Part 1

Question 1

Kaggle Data

IMDB Data

Wikipedia Data

Question 2

Conform the Data

Clean Up the Wikipedia Data

Question 3

Combined episode data

Per episode Flesch reading ease score by character

Augment Scripts

Part 2

Question 5

Line Plots

Scatter Plots

Box Plots

Histograms

Question 6

Part A

Part B

Part C

Part D

Question 7

Part 3

Question 8

Part A

Part B

Part C

Question 9

Thank you for reading my report!