Analys av rubrikerna på 1 976 337 nyhetsartiklar från 7 februari 2014 till 18 juni 2017.

Koden kommer från David Robinson, vars blogg jag varmt rekommenderar andra att följa, och jag har endast modifierat koden för mitt ändamål.

Data

Datan innehåller inte bara nyhetsartiklar utan även debattartiklar och kulturartiklar. Urvalskriteriet var de 12 största nyhetssajterna vid 2014. Det är delar av Aftonbladet, Expressen, DN, SvD, GP, SVT, Metro, DI, SR, Sydsvenskan, Nyheter24, Helsingborgs dagblad. En mängd andra nyhetssajter har också tillkommit under 2017 som är med i denna analys.

Datan är fritt tillgänglig via SND, men den datan sträcker sig dock bara till och med hösten 2015 och innehåller bara de 12 nyhetssajterna.

Setup

knitr::opts_chunk$set(echo=TRUE)

library(ggplot2)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:base':
## 
##     date

library(scales)
library(stringr)
library(tidyverse)

## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr

## Conflicts with tidy packages ----------------------------------------------

## as.difftime(): lubridate, base
## col_factor():  readr, scales
## date():        lubridate, base
## discard():     purrr, scales
## filter():      dplyr, stats
## intersect():   lubridate, base
## lag():         dplyr, stats
## setdiff():     lubridate, base
## union():       lubridate, base

library(tidytext)
library(RMySQL)

## Loading required package: DBI

theme_set(theme_minimal())

getquery <- function(SqlQuery) {
  drv <- dbDriver("MySQL")
  con <- dbConnect(drv, host="localhost", user="root", pass="root", dbname="headlines")
  df <- dbGetQuery(con, statement=SqlQuery)
  dbDisconnect(con)
  return(df)
}

df <- getquery("SELECT id, retrieved, title FROM articles ORDER BY id DESC")

Encoding(df$title) <- "UTF-8"

titles <- df %>% mutate(time = as.POSIXct(retrieved, origin = "1970-01-01"),
         month = round_date(time, "month"))

Stoppord

Plockar bort stoppord (“och”, “att”, “så” etc) på svenska.

# Swedish.
swe_stopwords <- function() {
  words <- read.csv("https://gist.githubusercontent.com/peterdalle/8865eb918a824a475b7ac5561f2f88e9/raw/ba2de0a5d2bddf12e3c51bc0c6d3f78759bcc973/swedish-stopwords.txt", encoding="UTF-8", header=FALSE)
  return(data.frame(word=words$V1, lexicon="peterdalle"))
}

# Custom.
custom_stopwords <- data.frame(word=c("quot", "visar", "person"), lexicon="peterdalle")

# Combine.
stopwords_combined <- rbind(swe_stopwords(), custom_stopwords)

Tokenize

Bryter ned rubrikerna till ord.

title_words <- titles %>%
  distinct(title, .keep_all = TRUE) %>%
  unnest_tokens(word, title, drop = FALSE) %>%
  distinct(id, word, .keep_all = TRUE) %>%
  anti_join(stopwords_combined, by = "word") %>%
  filter(str_detect(word, "[^\\d]")) %>%
  group_by(word) %>%
  mutate(word_total = n()) %>%
  ungroup()

word_counts <- title_words %>%
  count(word, sort = TRUE)

Vanligaste orden

word_counts %>%
  head(25) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
    geom_col(fill = "lightblue") +
    scale_y_continuous(labels = comma_format()) +
    coord_flip() +
    labs(title = "Vanligaste orden",
         subtitle = paste("Baserat på ", NROW(titles), " nyhetsrubriker med stoppord borttagna", sep=""),
         x = NULL,
         y = "Frekvens")

Förändringar över tid

stories_per_month <- titles %>%
  group_by(month) %>%
  summarize(month_total = n())

word_month_counts <- title_words %>%
  filter(word_total >= 1000) %>%
  count(word, month) %>%
  complete(word, month, fill = list(n = 0)) %>%
  inner_join(stories_per_month, by = "month") %>%
  mutate(percent = n / month_total) %>%
  mutate(year = year(month) + yday(month) / 365)

word_month_counts

## Source: local data frame [28,268 x 6]
## Groups: word [676]
## 
##     word      month     n month_total      percent     year
##    <chr>     <dttm> <dbl>       <int>        <dbl>    <dbl>
## 1      a 2014-02-01    16       12400 0.0012903226 2014.088
## 2      a 2014-03-01    36       47252 0.0007618725 2014.164
## 3      a 2014-04-01    33       49517 0.0006664378 2014.249
## 4      a 2014-05-01    39       47310 0.0008243500 2014.332
## 5      a 2014-06-01    26       45274 0.0005742810 2014.416
## 6      a 2014-07-01    40       43943 0.0009102701 2014.499
## 7      a 2014-08-01    34       41760 0.0008141762 2014.584
## 8      a 2014-09-01    62       42955 0.0014433710 2014.668
## 9      a 2014-10-01    40       49805 0.0008031322 2014.751
## 10     a 2014-11-01    24       49211 0.0004876958 2014.836
## # ... with 28,258 more rows

Growth model

Uppskatta vilket ord som ökat mest med en growth curve model.

library(broom)

mod <- ~ glm(cbind(n, month_total - n) ~ year, ., family = "binomial")

slopes <- word_month_counts %>%
  nest(-word) %>%
  mutate(model = map(data, mod)) %>%
  unnest(map(model, tidy)) %>%
  filter(term == "year") %>%
  arrange(desc(estimate))

# Remove more stop words ex post facto.
slopes <- slopes %>% filter(!(word %in% c("quot", "visar", "person", "ville", "hittade", "söker", "the", "just", "donald", "trumps", "förd", "höjer", "sänker")))

slopes

## # A tibble: 666 × 6
##         word  term  estimate  std.error statistic       p.value
##        <chr> <chr>     <dbl>      <dbl>     <dbl>         <dbl>
## 1      trump  year 1.5350289 0.02052210  74.79883  0.000000e+00
## 2     brexit  year 0.8659545 0.03921822  22.08041 4.876221e-108
## 3    clinton  year 0.7131490 0.02820690  25.28278 4.940780e-141
## 4     london  year 0.6294950 0.03363362  18.71624  3.650083e-78
## 5  nyanlända  year 0.6168283 0.03238740  19.04532  7.185236e-81
## 6   attacken  year 0.6095297 0.03303718  18.44981  5.233047e-76
## 7         e6  year 0.5581252 0.02439697  22.87682 7.905283e-116
## 8  personbil  year 0.5533030 0.02890289  19.14352  1.096088e-81
## 9       larm  year 0.5183420 0.02014182  25.73462 4.792174e-146
## 10  avstängd  year 0.4843942 0.02698537  17.95025  4.777162e-72
## # ... with 656 more rows

Ord som ökar i frekvens

slopes %>%
  head(16) %>%
  inner_join(word_month_counts, by = "word") %>%
  mutate(word = reorder(word, -estimate)) %>%
  ggplot(aes(month, n / month_total, color = word)) +
    geom_line(show.legend = FALSE) +
    scale_y_continuous(labels = percent_format()) +
    facet_wrap(~ word, scales = "free_y") +
    expand_limits(y = 0) +
    labs(title = "16 snabbast växande orden i nyhetsrubriker",
         subtitle = paste("Baserat på ", NROW(titles), " nyhetsrubriker med stoppord borttagna", sep=""),
         x = "År",
         y = "Procent av rubriker som innehåller ord")

Ord som minskar i frekvens

slopes %>%
  tail(16) %>%
  inner_join(word_month_counts, by = "word") %>%
  mutate(word = reorder(word, estimate)) %>%
  ggplot(aes(month, n / month_total, color = word)) +
    geom_line(show.legend = FALSE) +
    scale_y_continuous(labels = percent_format()) +
    facet_wrap(~ word, scales = "free_y") +
    expand_limits(y = 0) +
    labs(title = "16 snabbast sjunkande orden i nyhetsrubriker",
         subtitle = paste("Baserat på ", NROW(titles), " nyhetsrubriker med stoppord borttagna", sep=""),
         x = "År",
         y = "Procent av rubriker som innehåller ord")

Jämförelse mellan ord

word_month_counts %>%
  filter(word %in% c("ukraina", "ryssland", "usa", "sverige")) %>%
  ggplot(aes(month, n / month_total, color = word)) +
    geom_line(size = 1, alpha = .8) +
    scale_y_continuous(labels = percent_format()) +
    expand_limits(y = 0) +
    labs(title = "Förekomst av länder",
         x = "År",
         y = "Procent av rubriker som innehåller ordet")

word_month_counts %>%
  filter(word %in% c("flyktingar", "migranter", "nyanlända")) %>%
  ggplot(aes(month, n / month_total, color = word)) +
    geom_line(size = 1, alpha = .8) +
    scale_y_continuous(labels = percent_format()) +
    expand_limits(y = 0) +
    labs(title = "Förekomst av migration m.m.",
         x = "År",
         y = "Procent av rubriker som innehåller ordet")

Föredettingar

Ord som hade en kraftig topp men sedan försvann.

library(splines)

mod2 <- ~ glm(cbind(n, month_total - n) ~ ns(year, 4), ., family = "binomial")

# Fit a cubic spline to each shape
spline_predictions <- word_month_counts %>%
  mutate(year = as.integer(as.Date(month)) / 365) %>%
  nest(-word) %>%
  mutate(model = map(data, mod2)) %>%
  unnest(map2(model, data, augment, type.predict = "response"))

# Find the terms with the highest peak / average ratio
peak_per_month <- spline_predictions %>%
  group_by(word) %>%
  mutate(average = mean(.fitted)) %>%
  top_n(1, .fitted) %>%
  ungroup() %>%
  mutate(ratio = .fitted / average) %>%
  filter(month != min(month), month != max(month)) %>%
  top_n(16, ratio)

# Peaks
peak_per_month %>%
  select(word, peak = month) %>%
  inner_join(spline_predictions, by = "word") %>%
  mutate(word = reorder(word, peak)) %>%
  ggplot(aes(month, percent)) +
    geom_line(aes(color = word), show.legend = FALSE) +
    geom_line(aes(y = .fitted), lty = 2) +
    facet_wrap(~ word, scales = "free_y") +
    scale_y_continuous(labels = percent_format()) +
    expand_limits(y = 0) +
    labs(title="16 ord som toppade och sedan försvann från nyhetsrubrikerna",
         subtitle="Spline fit (df = 4) shown.\nSelected based on the peak of the spline divided by the overall average; ordered by peak month.",
         x = "År",
         y = "Procent av rubriker med ord")

Analys av två miljoner nyhetsrubriker under tre år

Peter M. Dahlgren

2017-06-18