# Load (installing first if necessary) every package used in this analysis
pacman::p_load(
  tidyverse,
  gutenbergr,
  tidytext,
  text2vec,
  MASS,
  ggplot2,
  stringi,
  textclean,
  ggpubr,
  ggrepel,
  text2map,
  install = TRUE
)

# Optional: apply the same ggplot theme used for the published figures
text2map.theme::set_theme()

Death and Body Counts in Shakespeare’s Plays

We are interested in the relationship between how much – i.e., the magnitude – one of Shakespeare’s plays engages with the concept of death and the number of actual people who die in a play (i.e. body counts). First, we need to get the text of the plays.

Go ahead and download all of Shakespeare’s plays from Project Gutenberg using the gutenbergr package. The output will be a data.frame with each row being one line, but we want to collapse all of the lines for each play. We will first use group_by to group each play together, and then summarize the text column within each group.


# Play-level metadata (includes the Project Gutenberg IDs)
meta_plays <- read_csv("data/shakespeare_meta.csv")

# Inspect the IDs we are about to download
meta_plays$gutenberg_id

# Grab the full text of each play from Project Gutenberg.
# dplyr:: prefix is deliberate: MASS also defines select().
raw_lines <- meta_plays |>
  dplyr::select(gutenberg_id) |>
  gutenberg_download()

# The download is one row per line of text; collapse to one row per play
df_plays <- raw_lines |>
  group_by(gutenberg_id) |>
  summarize(text = paste(text, collapse = ", "))

Next we will take care of a bit of cleaning. Making sure there are no special characters, removing punctuation, taking care of curly quotes (which can be a pain), and then smooshing contractions. Finally, we will remove any digits that might be floating around (this is downloaded from the Internet after all), and then make sure there is only a single space between words. You’ll notice that instead of replacing the text column entirely, we have added a column called clean_text.


# Text cleaning pipeline. Order matters here: curly quotes must be
# straightened before transliteration, and in-word apostrophes must
# survive punctuation removal so replace_contraction() can still
# recognize contractions. The raw `text` column is kept; results go
# into a new `clean_text` column.
df_plays <- df_plays |> mutate(
    ## transliterate and lowercase
    # straighten curly quotes so later steps see plain ASCII apostrophes
    clean_text = replace_curly_quote(text),
    # transliterate any non-Latin characters to their ASCII equivalents
    clean_text = stri_trans_general(clean_text,id="Any-Latin; Latin-ASCII"),
    clean_text = tolower(clean_text),
    ## punctuation
    # remove punctuation EXCEPT when it sits inside a word
    # (e.g. "don't", "well-met"): the capture group keeps such words intact
    clean_text = gsub("(\\w+[_'-]+\\w+)|[[:punct:]]+", "\\1", clean_text),
    # expand contractions ("don't" -> "do not"); relies on apostrophes
    # having been preserved by the step above
    clean_text = replace_contraction(clean_text),
    ## numbers and spaces
    # strip stray digits (Gutenberg files carry line/page numbers)
    clean_text = gsub('[[:digit:]]+', " ", clean_text),
    # collapse runs of whitespace to a single space, then trim the ends
    clean_text = gsub("[[:space:]]+", " ", clean_text),
    clean_text = trimws(clean_text)
)

Finally, we will need to convert the cleaned text of the plays into a Document-Term Matrix. The text2map package has a very efficient DTM builder called dtm_builder() – it has very few options, but the straightforward unigram DTM is typically all we need.


  # Build a unigram document-term matrix: one row per play, one column per word
  dtm_plays <- dtm_builder(df_plays, clean_text, gutenberg_id)

  # Summarize the DTM (vocabulary size, sparsity, etc.) and render as a table
  tbl <- dtm_stats(dtm_plays)
  knitr::kable(tbl, row.names = TRUE)

Word Embeddings and Semantic Directions

You will need to load pre-trained word embeddings. We use the fastText English embeddings. We have also hosted an Rds file of these embeddings on Google Drive that you can download directly into your R session (it will take a couple minutes).


    library(googledrive)
    # Download the hosted fastText embeddings (an .Rds file) into a
    # temporary file, read them into memory, then delete the temp copy —
    # the original left the (large) download sitting in the temp directory.
    temp <- tempfile(fileext = ".rds")
    drive_download(as_id("17H4GOGedeGo0urQdDC-4e5qWQMeWLpGG"), path = temp, overwrite = TRUE)
    ft.wv <- readRDS(temp)
    unlink(temp)  # clean up: the embeddings now live in ft.wv

Next we will want to extract a “semantic direction” from the embedding space using the get_direction() function.


# Anchor terms defining the semantic direction: it runs from the
# "life" pole (subs, subtracted) toward the "death" pole (adds, added)
adds <- c("death", "casualty", "demise", "dying", "fatality")
subs <- c("life", "survivor", "birth", "living", "endure")

# get_direction() takes a two-column matrix of paired anchor words
terms <- cbind(adds, subs)
sem_dir <- get_direction(terms, ft.wv)

Concept Mover’s Distance

The next step is to use CMDist to measure how “closely” each of the plays (defined by the DTM) engages with the location of the semantic direction. We also want to compare engagement with the semantic direction that points away from life toward death to engagement with the individual terms, “life” and “death.”


# Concept Mover's Distance: how closely each play engages the single
# words "life" and "death" (cw) and the life-to-death direction (cv)
cmd_plays <- CMDist(
  dtm_plays,
  cw = c("life", "death"),
  cv = sem_dir,
  wv = ft.wv,
  scale = TRUE
)

# Join the play metadata: CMDist doc IDs match the Gutenberg IDs
cmd_plays$doc_id <- as.numeric(cmd_plays$doc_id)
cmd_plays <- left_join(cmd_plays, meta_plays,
                       by = c("doc_id" = "gutenberg_id"))

Illustrating the Binary Concept Problem

Below we show that when we only measure a document’s distance to life or death (as opposed to the semantic direction), there is a positive correlation between the two measures. And, with other juxtaposing concepts, the correlation is considerably higher.

  # Scatterplot of engagement with "life" vs. "death", with a fitted
  # regression line, confidence band, and Pearson correlation coefficient
  ggscatter(
    cmd_plays,
    x = "life", y = "death",
    add = "reg.line", conf.int = TRUE,
    cor.coef = TRUE, cor.method = "pearson",
    xlab = 'Engagement with "Life"',
    ylab = 'Engagement with "Death"',
    color = "#1696d2"
  ) +
    xlim(-2.5, 2.5) +
    ylim(-2.5, 2.5)

Predicting Body Counts in Shakespeare

Below we compare how well engagement with life, death, or the life-to-death direction predicts the actual number of people who die in a given play. To decide on an estimator, we first look at the distribution of our outcome variable, body_count.

# Inspect the outcome's distribution to choose an estimator:
# body_count is a skewed count variable, so a count model is appropriate
ggplot(cmd_plays, aes(x = body_count)) +
  geom_histogram(binwidth = 1)

This tells us we should use a count model.


# For each predictor, fit both a negative binomial (glm.nb) and a
# Poisson model so we can test for overdispersion
m1 <- glm.nb(body_count ~ death_pole,
          data = cmd_plays)
m2 <- glm(body_count ~ death_pole,
          data = cmd_plays, family = "poisson")
m3 <- glm.nb(body_count ~ life,
          data = cmd_plays)
m4 <- glm(body_count ~ life,
          data = cmd_plays, family = "poisson")
m5 <- glm.nb(body_count ~ death,
          data = cmd_plays)
m6 <- glm(body_count ~ death,
          data = cmd_plays, family = "poisson")

# Likelihood-ratio tests of negative binomial vs. Poisson: a small
# p-value favors the negative binomial (evidence of overdispersion).
# Note: spell out FALSE — `F` is an ordinary, reassignable binding in R.
pchisq(2 * (logLik(m1) - logLik(m2)), df = 1, lower.tail = FALSE) # Specifically an nbreg
pchisq(2 * (logLik(m3) - logLik(m4)), df = 1, lower.tail = FALSE)
pchisq(2 * (logLik(m5) - logLik(m6)), df = 1, lower.tail = FALSE)

# Negative binomial regressions: which measure best predicts body counts?
summary(glm.nb(body_count ~ death_pole, data = cmd_plays))
summary(glm.nb(body_count ~ poly(death_pole, 2), data = cmd_plays))  # curvature check
summary(glm.nb(body_count ~ life, data = cmd_plays))
summary(glm.nb(body_count ~ death, data = cmd_plays))

# Predicted body counts at scores of -1, 0, and 1 on each measure

# ...along the life-to-death direction (model m1)
new.data <- data.frame(death_pole = seq(-1, 1))
new.data$phat <- predict(m1, newdata = new.data, type = "response")
new.data

# ...for engagement with the single word "death" (model m5)
new.data <- data.frame(death = seq(-1, 1))
new.data$phat <- predict(m5, newdata = new.data, type = "response")
new.data