day 15: correlation

Author

Jen Richmond

Published

April 15, 2026

Code

library(tidyverse)
library(tidytuesdayR)
library(tidyplots)
library(tidytext)
library(janitor)
library(ggeasy)
library(ggpubr)
library(here)


# Choose a TidyTuesday dataset at random ----

# Fixed seed so the same year/week is drawn on every run.
set.seed(1515)
ttyears <- 2018:2025
ttweeks <- 1:52

# choose a year at random
chosen_year <- sample(ttyears, size = 1)

# choose a week at random
chosen_week <- sample(ttweeks, size = 1)

# read the data from that year/week

df <- tidytuesdayR::tt_load(chosen_year, chosen_week)

# print dataset
print(df)


# Extract the user reviews table by name instead of by position, so the
# script does not silently pick up a different table if the order of the
# files in the download ever changes.
user <- df[["user_reviews"]]
if (is.null(user)) {
  stop(
    "The TidyTuesday data for ", chosen_year, " week ", chosen_week,
    " has no `user_reviews` table.",
    call. = FALSE
  )
}

# Print numbers in fixed rather than scientific notation.
options(scipen = 999)
# Re-seed before the later steps that use random numbers (presumably so the
# jittered points are reproducible).
set.seed(15)

The data for this plot comes from a 2020 TidyTuesday challenge. The Week 19 data that year was about Animal Crossing reviews.

ggplot

Code

# Add the calendar year of each review, taken from its date column.
user <- mutate(user, year = year(date))

# Jittered scatter of review grade against review year.
ggplot(user, aes(x = year, y = grade)) +
  geom_jitter()

Code

# Split the reviews into two tables that share the user_name column:
# ug holds each user's grade, ut holds each user's review text.
ug <- select(user, user_name, grade)

ut <- select(user, user_name, text)

Code

# turn reviews into tokens (one row per word per user), remove stop words
# The join key is spelled out so the join does not depend on whichever
# columns the two tables happen to share, and runs without the
# "Joining with `by = ...`" message.

words <- ut %>%
  group_by(user_name) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

# join with afinn sentiment scores, group and summarise mean by user
# inner_join keeps only words that have an AFINN score, so users whose
# reviews contain no scored words drop out of sentbyuser.

sentbyuser <- words %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(user_name) %>%
  summarise(sentiment = mean(value))

# join sentiment scores with review scores
join <- left_join(sentbyuser, ug, by = "user_name")

Code

# Tabulate the reviews per score; the plot below uses the n and percent
# columns of this table.
count <- tabyl(join, grade)

# Bar chart of review counts with a percentage label on every bar.
# Bars under 2% get a black label above the bar (vjust = -0.5); all other
# bars get a white label inside the bar (vjust = 1.5).
ggplot(count, aes(x = grade, y = n)) +
  geom_col() +
  geom_text(
    aes(
      label = scales::percent(percent, accuracy = 1),
      vjust = ifelse(percent < 0.02, -0.5, 1.5)
    ),
    colour = ifelse(count$percent < 0.02, "black", "white"),
    size = 3
  ) +
  scale_x_continuous(breaks = 0:10) +
  labs(
    title = "Most people rating Animal Crossing either love it or hate it",
    x = "Review score",
    y = "Review count",
    caption = "TidyTuesday 2020 Week 19 | Data from Metacritic"
  ) +
  theme_minimal(base_family = "Lato")

Code

# Scatter of review score against mean review sentiment, with a smoothed
# trend line (no confidence band) and the Pearson correlation printed on
# the plot at (-3.5, 7).
ggplot(join, aes(x = sentiment, y = grade)) +
  geom_jitter() +
  geom_smooth(se = FALSE) +
  stat_cor(
    method = "pearson",
    p.accuracy = 0.001,
    r.accuracy = 0.01,
    label.x = -3.5,
    label.y = 7
  ) +
  labs(
    title = "Animal crossing review scores are correlated with review \ntext sentiment",
    x = "Review sentiment (AFINN)",
    y = "Review score",
    caption = "TidyTuesday 2020 Week 19 | Data from Metacritic"
  ) +
  theme_minimal(base_family = "Lato")

tidyplot

Things I learned about tidyplots…

You create count, sum and mean bar plots directly from the raw data, rather than summarising first and then plotting the summary table.
It is hard to work out how to spread the jitter points; the width and height arguments don’t seem to do anything.

Code

# tidyplots bar chart of review counts per score, saved to a PNG.
# add_count_bar() is given the joined table itself, not a summary table.
# The adjust_* calls stay in their original order.
tidyplot(join, x = grade) %>%
  add_count_bar(fill = "darkgrey") %>%
  adjust_size(width = 100, height = 80) %>%
  adjust_x_axis(breaks = seq(0, 10, by = 1)) %>%
  adjust_font(family = "Lato", fontsize = 10) %>%
  adjust_x_axis_title("Review score") %>%
  adjust_y_axis_title("Review count") %>%
  adjust_title(
    "Most people rating Animal Crossing either love it or hate it",
    fontsize = 12
  ) %>%
  adjust_caption("TidyTuesday 2020 Week 19 | Data from Metacritic") %>%
  save_plot(
    here::here("charts26", "2026-04-15_correlation", "tidyfeatured1.png")
  )

Code

# tidyplots scatter of review score against review sentiment, with
# jittered points and a fitted curve (no confidence band), saved to a PNG.
# The adjust_* calls stay in their original order.
tidyplot(join, x = sentiment, y = grade) %>%
  add_data_points_jitter(colour = "black") %>%
  add_curve_fit(se = FALSE, linewidth = 1) %>%
  adjust_size(width = 100, height = 80) %>%
  adjust_font(family = "Lato", fontsize = 10) %>%
  adjust_x_axis_title("Review sentiment (AFINN)") %>%
  adjust_y_axis_title("Review score") %>%
  adjust_title(
    "Animal crossing review scores are correlated with review \ntext sentiment",
    fontsize = 12
  ) %>%
  adjust_caption("TidyTuesday 2020 Week 19 | Data from Metacritic") %>%
  save_plot(
    here::here("charts26", "2026-04-15_correlation", "tidyfeatured2.png")
  )