Now I use data for lower level. Counties. Caveat: this is my playground, do not draw any serious conclusions.

Poland Presidential Elections 2020

Void votes per county

Jacek Kotowski

Loading libs

pacman::p_load(tidyverse, readxl, janitor, sf, scales, mgcv)

Loading data

from https://wybory.gov.pl/prezydent20200628/pl/dane_w_arkuszach

df_raw <-
  read_xlsx(path = "dane/wyniki_gl_na_kand_po_gminach_proc_utf8.xlsx") %>%
  clean_names() %>%
  mutate(across(
    .cols = c(
      "frekwencja"                                                                           ,
      "percent_glosow_niewaznych"                                                            ,
      "w_tym_z_powodu_postawienia_znaku_x_obok_nazwiska_dwoch_lub_wiekszej_liczby_kandydatow",
      "w_tym_z_powodu_niepostawienia_znaku_x_obok_nazwiska_zadnego_kandydata"                ,
      "w_tym_z_powodu_postawienia_znaku_x_wylacznie_obok_skreslonego_nazwiska_kandydata"     ,
      "percent_glosow_waznych"                                                               ,
      "andrzej_sebastian_duda"                                                               ,
      "rafal_kazimierz_trzaskowski"                                                          ,
      "liczba_obwodow"
    ),
    ~ str_replace(.,pattern = ",", replacement = ".") %>% as.numeric
  ))

Cleaning data

Counties codes are six digits precision in map but 7 digits precision in data. We need to aggregate them.

Also, Warsaw is several “counties” which need to be collapsed.

# df_raw <-
#   df_raw %>%
#   mutate(
#     gmina =
#       if_else(str_detect(rodzaj_jednostki, "Warszawa"),
#               "Warszawa",
#               gmina),
#     kod_teryt =
#       if_else(str_detect(rodzaj_jednostki, "Warszawa"),
#               "146501",
#               kod_teryt),
#   )

# df_raw <- 
#   df_raw %>% 
#   group_by(kod_teryt) %>% 
#   summarise(gmina = first(gmina), 
#             across(where(is.numeric), sum))

Loading shp files

from https://gis-support.pl/granice-administracyjne/

Read also about warning related to st_simplify here: https://stackoverflow.com/questions/60008135/st-simplify-dtolerence-with-decimal-degree

map_gminy <-
  #  st_read("dane/Gminy.shp") %>%
  st_read("dane/Jednostki_ewidencyjne.shp") %>%
  st_make_valid() %>% 
  select(kod_teryt = JPT_KOD_JE, gmina = JPT_NAZWA_) %>%
  mutate(kod_teryt = str_sub(kod_teryt, end = 6)) %>%
  mutate(
    gmina =
      case_when(
        str_detect(kod_teryt, "10610")  ~ "Łódz",
        str_detect(kod_teryt, "12610")  ~ "Kraków",
        TRUE ~ as.character(gmina)
      ),
    kod_teryt =
      case_when(
        # Lod
        str_detect(kod_teryt, "10610")  ~ "106101",
        # Kra
        str_detect(kod_teryt, "12610")  ~ "126101",
        # ZGr
        str_detect(kod_teryt, "080910") ~ "086201",
        TRUE ~ as.character(kod_teryt)
      )
  )  %>%
  group_by(kod_teryt) %>%
  mutate(n = n()) %>% 
  summarize(gmina = first(gmina), 
            geometry = st_combine(geometry) ) %>% 
  ungroup()

## Reading layer `Jednostki_ewidencyjne' from data source `C:\Users\jkotows2\Desktop\_r_wybory\dane\Jednostki_ewidencyjne.shp' using driver `ESRI Shapefile'
## Simple feature collection with 3138 features and 29 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 14.12288 ymin: 49.00205 xmax: 24.14578 ymax: 54.83642
## geographic CRS: ETRS89

## `summarise()` ungrouping output (override with `.groups` argument)

Joining shp with data

df <- merge(map_gminy, df_raw, by="kod_teryt")

Which ones did not merge:

df_raw %>% anti_join(df)

## Joining, by = c("nr_okw", "kod_teryt", "rodzaj_jednostki", "powiat", "wojewodztwo", "frekwencja", "percent_glosow_niewaznych", "w_tym_z_powodu_postawienia_znaku_x_obok_nazwiska_dwoch_lub_wiekszej_liczby_kandydatow", "w_tym_z_powodu_niepostawienia_znaku_x_obok_nazwiska_zadnego_kandydata", "w_tym_z_powodu_postawienia_znaku_x_wylacznie_obok_skreslonego_nazwiska_kandydata", "percent_glosow_waznych", "andrzej_sebastian_duda", "rafal_kazimierz_trzaskowski", "liczba_obwodow")

nr_okw <dbl>	kod_teryt <chr>	rodzaj_jednostki <chr>	gmina <chr>	powiat <chr>	wojewodztwo <chr>	frekwencja <dbl>
21	149801	statki	statki	statki	mazowieckie	99.74
21	149901	zagranica	zagranica	zagranica	mazowieckie	79.57

as.data.frame(map_gminy) %>% anti_join(df)

## Joining, by = c("kod_teryt", "geometry")

## Warning in `[<-.data.frame`(`*tmp*`, is_list, value = list(`3` = "<S3:
## sfc_GEOMETRY>")): replacement element 1 has 1 row to replace 0 rows

df %>% filter(str_detect(gmina.x, "Zielona"))

	kod_teryt <chr>	gmina.x <chr>	nr_okw <dbl>	rodzaj_jednostki <chr>	gmina.y <chr>	powiat <chr>	wojewodztwo <chr>
1	086201	m. Zielona Góra	12	gmina miejska	m. Zielona Góra	Zielona Góra	lubuskie
2	240402	Dąbrowa Zielona	39	gmina wiejska	gm. Dąbrowa Zielona	częstochowski	śląskie

Only statki (ships) and zagranica (foreign votes) left. Good.

Neighbourhood of counties

Generate id’s for each row

df <- 
  df %>% 
  mutate(idx = row_number() %>% as.factor())

Generate matrix with neighboring counties

Output will look like:

Sparse geometry binary predicate list of length 2477, where the predicate was `touches’ first 10 elements: 1: 2 2: 1, 3, 4, 5, 6, 78, 591 3: 2, 6, 58, 96, 100, 591 4: 2, 5, 68, 76, 78, 156, 158

# counties_nb <- df %>%
#   st_touches(sparse = T) %>%
#   set_names(df$idx)
# 
# saveRDS(counties_nb, "counties_nb.Rds")

counties_nb <- readRDS("counties_nb.Rds")

Simplify geometries to make a dataframe lighter

I could not do that earlier because calculation of neighborhood matrix would not work.

df <- 
  df %>%  
  st_simplify(preserveTopology = T, dTolerance = 0.005)

## Warning in st_simplify.sfc(st_geometry(x), preserveTopology, dTolerance):
## st_simplify does not correctly simplify longitude/latitude data, dTolerance
## needs to be in decimal degrees

Ranking void votes

df %>% 
  slice_max(percent_glosow_niewaznych,n=20) %>% 
  ggplot(
  aes(
    fct_reorder(gmina.x, percent_glosow_niewaznych), 
    percent_glosow_niewaznych)
  ) +
  geom_col(fill = "blue", 
           color = "gray", 
           alpha = 0.1, 
           width = 0.7) + 
  geom_text(
    aes(
      label = paste0(
        scales::number(
          percent_glosow_niewaznych, 
          accuracy = 0.01)
        )
      )
    ) +
  coord_flip() +
  theme_light() +
  labs(
    title="Poland Presidential Elections 2020",
    subtitle = "Void votes percent top 20",
    caption = "Source: PKW, Author: Jacek Kotowski",
    x = "",
    y = ""
      )

Smooth with a gam regression

see: https://fromthebottomoftheheap.net/2017/10/19/first-steps-with-mrf-smooths/ For count data nb (negative binomial) distribution works well. For % data later on think using ? betar()

# Model
df_for_model <- df %>% select(-geometry) %>% as_tibble()

hist(df_for_model$percent_glosow_niewaznych)

hist(df_for_model$percent_glosow_niewaznych %>% scale(.))

What distribution to use? Poisson, negative-binomial, zero-inflated ? Trial and error elected negative-binomial.

m1 <-
  gam(
    percent_glosow_niewaznych ~
      s(
        idx,
        bs = 'mrf',
        k  = 500,
        xt = list(nb = counties_nb),
        fx = TRUE
      ),
    data   = df_for_model,
    method = 'REML',
   control = gam.control(nthreads = 8),
    family = Gamma() #tried Gamma.
  )

summary(m1)

## 
## Family: Gamma 
## Link function: inverse 
## 
## Formula:
## percent_glosow_niewaznych ~ s(idx, bs = "mrf", k = 500, xt = list(nb = counties_nb), 
##     fx = TRUE)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.166806   0.006176   188.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##        edf Ref.df     F p-value    
## s(idx) 499    499 1.971  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.159   Deviance explained = 30.1%
## -REML = 1625.5  Scale est. = 0.068381  n = 2494

gam.check(m1)

## 
## Method: REML   Optimizer: outer newton
## full convergence after 6 iterations.
## Gradient range [-3.354595e-07,-3.354595e-07]
## (score 1625.498 & scale 0.06838101).
## Hessian positive definite, eigenvalue range [1025.09,1025.09].
## Model rank =  500 / 500 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##         k' edf k-index p-value
## s(idx) 499 499      NA      NA

Add predictions to database.

df_m1<- data.frame(.predicted = predict(m1, type = "response"))

df <- df %>% 
 bind_cols(df_m1 )

Plot original data.

p <- ggplot(df) +
  geom_sf(aes(fill=percent_glosow_niewaznych)) +
  theme_void()+
  labs(title = "Presidential elections Poland 2020",
       subtitle = "Void votes",
       caption = "Source: PKW, Author: Jacek Kotowski", 
       fill = "Void votes prc"
       )+
  scale_fill_viridis_c(
    guide = guide_colourbar(barwidth = 0.5, barheight = 10),
    labels = scales::number ,
    option = "viridis", 
    breaks = c(0.01,  0.1, 0.2, 0.5, 1, 2, 3.8),
    trans = "log10", 
    direction = -1)

p

Plot smoothed data.

p2 <- ggplot(df) +
  geom_sf(aes( fill=.predicted)) +
  theme_void()+
  labs(title = "Presidential elections Poland 2020",
       subtitle = "Void votes smoothed with GAM",
       caption = "Source: PKW, Author: Jacek Kotowski", 
       fill = "Void votes"
       )+
  scale_fill_viridis_c(
    guide = guide_colourbar(barwidth = 0.5, barheight = 10),
    labels = scales::number ,
    option = "viridis", 
    breaks = c(0.01,  0.1, 0.2, 0.5, 1, 2, 3),
    trans = "log10", 
    direction = -1)

p2

Plot smoothed election map

Inspection of histograms of real, fitted and residual values.

library(patchwork)

hist1 <- ggplot(df, aes(x=percent_glosow_niewaznych)) +
  geom_freqpoly() +
  geom_histogram(alpha=0.5)+
  scale_x_log10()+
  labs(title = "Real")

hist2 <- ggplot(df, aes(x=.predicted)) +
  geom_freqpoly() +
  geom_histogram(alpha=0.5)+
  scale_x_log10()+
  labs(title = "Fitted")


hist1|hist2

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

How to transform skewed values? f(x) log10(x + 1 - min(x))?

p3 <- ggplot(df) +
  geom_sf(aes( fill=percent_glosow_niewaznych-.predicted  ), color="lightgrey") +
  theme_void()+
  labs(title = "Presidential elections Poland 2020",
       subtitle = "Residuals",
       caption = "Source: PKW, Author: Jacek Kotowski", 
       fill = "Void votes residuals"
       )+
  scale_fill_gradientn(
    # low = "lightblue",
    # mid = "white",
    # high = "red",
    space = "Lab",
    breaks = c( 0, 0.5, 1, 1.5, 1.8,2),
    colours = c("white", "lightyellow",  "orange", "red"),
    # midpoint = 0,
    na.value = "gray",
    guide = "colourbar",
    aesthetics = "fill"
#    trans= "pseudo_log"
  ) +
  guides(fill =
  guide_legend(
    reverse = TRUE,
    title.theme = element_text(
      size = 10,
      face = "italic",
      colour = "black",
      angle = 0
    )))

p3

Which counties seem anomalous to me?

df %>% st_drop_geometry() %>% 
  mutate(reszty = number(percent_glosow_niewaznych - .predicted, accuracy =0.01) ) %>%
  arrange(desc(reszty)) %>% 
  head(10) %>% 
  select(gmina = gmina.x,  wojewodztwo, prc_niewaznych = percent_glosow_niewaznych, reszty)

	gmina <chr>	wojewodztwo <chr>	prc_niewaznych <dbl>	reszty <chr>
1	Platerówka	dolnośląskie	3.02	1.82
2	SPYTKOWICE	małopolskie	2.62	1.63
3	Jordanów Śląski	dolnośląskie	2.90	1.60
4	Ślemień	śląskie	2.57	1.26
5	SŁUPIA	łódzkie	2.97	1.24
6	BONIEWO	kujawsko-pomorskie	2.36	1.16
7	ŻUKOWICE	dolnośląskie	2.08	1.07
8	Janowiec Kościelny	warmińsko-mazurskie	1.98	1.04
9	Pakosław	wielkopolskie	2.17	1.01
10	KLUKOWO	podlaskie	2.13	0.93

RStats Daimyo Shimada Blog

Saturday, July 25, 2020

Playground with election data v.2