For BIGSSS - Segregation and Polarization.

In this section we will calculate various segregation measures on the raster data of a Dutch city. This requires us to import the raster file we have previously prepared; to calculate a proximity matrix of the raster cells; and to calculate their local environment.


1 Setting things up

1.1 Cleaning up the local environment

rm(list = ls())
gc()

1.2 General custom functions

  • fpackage.check: Check if packages are installed (and install if not) in R (source).
  • fsave: Function to save data with a time stamp in the correct directory.
fsave <- function(x, file, location = "./data/processed/", ...) {
    if (!dir.exists(location))
        dir.create(location)
    datename <- substr(gsub("[:-]", "", Sys.time()), 1, 8)
    totalname <- paste(location, datename, file, sep = "")
    print(paste("SAVED: ", totalname, sep = ""))
    save(x, file = totalname)
}

fpackage.check <- function(packages) {
    lapply(packages, FUN = function(x) {
        if (!require(x, character.only = TRUE)) {
            install.packages(x, dependencies = TRUE)
            library(x, character.only = TRUE)
        }
    })
}

colorize <- function(x, color) {
    sprintf("<span style='color: %s;'>%s</span>", color, x)
}

# Moran's I (univariate, or bivariate when 'y' is supplied), with optional
# row-standardization of the weight matrix and a normal-approximation p-value.
fMoranI <- function(x, y = NULL, weight, scaled = FALSE, na.rm = FALSE, alternative = "two.sided", rowstandardize = TRUE) {
    if (is.null(y)) {
        y <- x
    }

    if (dim(weight)[1] != dim(weight)[2])
        stop("'weight' must be a square matrix")
    nx <- length(x)
    ny <- length(y)
    if (dim(weight)[1] != nx | dim(weight)[1] != ny)
        stop("'weight' must have as many rows as observations in 'x' (and 'y', for the bivariate case) ")
    ei <- -1/(nx - 1)
    nas <- is.na(x) | is.na(y)
    if (any(nas)) {
        if (na.rm) {
            x <- x[!nas]
            y <- y[!nas]
            nx <- length(x)
            weight <- weight[!nas, !nas]
        } else {
            warning("'x' and/or 'y' have missing values: maybe you wanted to set na.rm = TRUE?")
            return(list(observed = NA, expected = ei, sd = NA, p.value = NA))
        }
    }
    if (rowstandardize) {
        ROWSUM <- rowSums(weight)
        ROWSUM[ROWSUM == 0] <- 1
        weight <- weight/ROWSUM
    }
    s <- sum(weight)
    mx <- mean(x)
    sx <- x - mx
    my <- mean(y)
    sy <- y - my
    v <- sum(sx^2)
    cv <- sum(weight * sx %o% sy)
    obs <- (nx/s) * (cv/v)
    cv_loc <- rowSums(weight * sx %o% sy)
    obs_loc <- (nx/s) * (cv_loc/v)
    if (scaled) {
        i.max <- (nx/s) * (sd(rowSums(weight) * sx)/sqrt(v/(nx - 1)))
        obs <- obs/i.max
        obs_loc <- obs_loc/i.max
    }
    S1 <- 0.5 * sum((weight + t(weight))^2)
    S2 <- sum((apply(weight, 1, sum) + apply(weight, 2, sum))^2)
    s.sq <- s^2
    k <- (sum(sx^4)/nx)/(v/nx)^2
    sdi <- sqrt((nx * ((nx^2 - 3 * nx + 3) * S1 - nx * S2 + 3 * s.sq) - k * (nx * (nx - 1) * S1 - 2 *
        nx * S2 + 6 * s.sq))/((nx - 1) * (nx - 2) * (nx - 3) * s.sq) - 1/((nx - 1)^2))
    alternative <- match.arg(alternative, c("two.sided", "less", "greater"))
    pv <- pnorm(obs, mean = ei, sd = sdi)
    if (alternative == "two.sided")
        pv <- if (obs <= ei)
            2 * pv else 2 * (1 - pv)
    if (alternative == "greater")
        pv <- 1 - pv
    list(observed = obs, expected = ei, sd = sdi, p.value = pv, observed_locals = obs_loc)
}
fMoranI <- compiler::cmpfun(fMoranI)

# Moran's I for aggregated data ----------------------------------------------
fMoranIdens <- function(x, y = NULL, weight, dens = NULL, N = length(x)) {
    # Adapted from Anselin (1995, eq. 7, 10, 11):
    # https://onlinelibrary.wiley.com/doi/epdf/10.1111/j.1538-4632.1995.tb00338.x
    # 'dens' is the proportion of individuals in each cell over the district
    # population. With individual-level data, dens may stay NULL and N is simply
    # the number of observations; with aggregate data, N should be the total
    # population size (or, in practice, just a large number).
    if (is.null(y)) {
        y <- x
    }
    # N <- length(x)
    if (is.null(dens)) {
        dens <- rep(1/N, times = N)
    }

    # Scale the attributes, weighting by density (note: this is inefficient; a
    # weighted variance, e.g. from the Hmisc package, would be faster).
    v1dens_ind <- rep(x, times = (dens * N))
    v1dens <- (x - mean(v1dens_ind))/sd(v1dens_ind)
    v2dens_ind <- rep(y, times = (dens * N))
    v2dens <- (y - mean(v2dens_ind))/sd(v2dens_ind)

    # (density) weighted proximity matrix
    w <- weight
    wdens <- t(dens * t(w))
    wdens <- wdens/rowSums(wdens)

    # density and proximity weighted locals
    localI <- (v1dens * wdens %*% v2dens)  #formula 7

    # correct the normalization constants
    m2 <- sum(v1dens^2 * dens)
    S0 <- N  #we know the weight matrix for the individual level should add up to N
    ydens <- S0 * m2
    globalI <- sum(localI * dens * N)/ydens  # formula 10/11

    return(list(globalI = globalI, localI = as.numeric(localI)))
}
fMoranIdens <- compiler::cmpfun(fMoranIdens)

1.3 Loading packages

fpackage.check(c("sf", "seg", "leaflet", "ggplot2", "ggmap", "compiler", "sp", "proxy"))

1.4 Loading data files

  • We are going to use the raster data we have constructed in a previous session (see here);
  • the associated shapefiles,
  • and the polling station data we have constructed in a previous session (see here).

To load these files, adjust the filenames in the following code so that they match the most recent versions of the RData file you have in your ./data/processed/ folder:

load("./data/processed/20220711raster_vor.RData")
rast <- x
rm(x)

load("./data/processed/20220711shapes.RData")
shapes <- x
rm(x)
voronoi <- shapes[[6]]

load("./data/processed/20220702polling_df")
pollstations <- x
rm(x)

# Ensuring that the class of 'pollstations' is 'sf' and the CRS is correct:
pollstations <- sf::st_as_sf(x = as.data.frame(pollstations), crs = sf::st_crs("+proj=longlat +datum=WGS84"),
    coords = c("long", "lat"))

1.5 Matching voronoi tiles, raster cells and Polling stations

We are going to use the Voronoi neighborhoods - therefore, we associate each polling station with its corresponding Voronoi tile. In principle we already know which raster cells correspond to which Voronoi tile: the columns voronoi$voronoi and rast$voronoi are ID variables that should indicate which row in the pollstations data each voronoi$geometry and each raster cell of the rast data belongs to.
However, this is not guaranteed to work, for two reasons. First, trivially, we might have created the voronoi geometry before making some changes to the pollstations data.frame that was used to generate the Voronoi diagram. Secondly, and more insidiously, it appears that the output of sf::st_voronoi (which we used to generate the Voronoi diagram) is a spatial dataframe in which the order of the rows/tiles is not always the same as the order of the polling stations used as seeds. Pretty inconvenient.
Therefore, here I use st_intersection to ensure that pollstations$voronoi correctly matches the corresponding IDs in rast$voronoi and voronoi$geometry.

sf::sf_use_s2(FALSE)  # use planar geometry for this overlay operation
pollstations <- sf::st_intersection(x = pollstations, y = voronoi)
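
A quick programmatic sanity check can complement the visual one below (this is just an optional sketch, not part of the original workflow): st_intersection could in principle drop a station that falls outside every tile, or duplicate one sitting exactly on a tile border, so we check the row count and that every station received a voronoi ID.

nrow(pollstations)
sum(is.na(pollstations$voronoi))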

Let’s see if our data checks out. Let’s choose a Voronoi tile at random and plot its corresponding raster cells and polling station.

tile = 3000

tilerast <- subset(rast, rast$voronoi == tile)
station <- subset(pollstations, pollstations$voronoi == tile)
voronoi_tile <- voronoi[tile, ]

leaflet::leaflet(tilerast) |>
    leaflet::addTiles() |>
    leaflet::addProviderTiles(providers$Stamen.Toner) |>
    leaflet::addPolygons(data = voronoi) |>
    leaflet::addPolygons(data = voronoi_tile, color = "red") |>
    leaflet::addCircles(color = "green") |>
    leaflet::addCircles(data = station, color = "red", opacity = 1) |>
    leaflet::setView(lng = sf::st_coordinates(station)[, 1], lat = sf::st_coordinates(station)[, 2],
        zoom = 14)

Seems okay :)

2 Segregation within Voronoi neighbourhoods

Then, we select the city or area we are going to study. For illustration purposes I select here a relatively small city (i.e. a city with ‘only’ about a thousand raster cells). Keep in mind that, for larger cities, the code presented here might take considerably longer to run.

city <- "Delft"
cityrast_id <- which(rast$GM_NAAM == city)  # will come in handy later ;)
cityrast <- rast[cityrast_id, ]

2.1 The distance matrix

Taking raster cells as our local units, our goal here is to calculate the matrix of distances between all local units. There are different ways to tackle the problem. We have previously seen geosphere::distVincentySphere and geosphere::distMeeus, which is a slightly less precise but faster method. We also used distance functions from the package sp. Here we instead call sf::st_distance. The function st_distance gives us distances in meters, which we convert to kilometers by dividing by 1000.

Note that the calculation of the distance matrix is a computationally intensive task - and it may require a lot of memory. As mentioned above, the size of the raster will have a big impact on the CPU time and memory needed in this step.
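
Before computing the full matrix, a throwaway sketch like the following gives a rough sense of its memory footprint (assuming the usual 8 bytes per numeric entry):

n <- nrow(cityrast)
n  # number of raster cells
round(n^2 * 8 / 1024^2, 1)  # approximate size of the n-by-n distance matrix in MB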

distmat <- matrix(sf::st_distance(cityrast), ncol = nrow(cityrast))
distmat <- distmat/1000

A bone of contention is how to treat the diagonal of this matrix, i.e. the distance between someone in a raster cell and someone else from the same cell. If our units of analysis are individuals, it makes sense to take this to be the average distance between two random points in a 100-by-100-meter square. That number is about 52.1 meters, or ~0.0521 km. Therefore:

diag(distmat) <- 0.052140543316
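
If you want to convince yourself of that 52.14-meter figure, a minimal Monte Carlo sketch (drawing many pairs of uniformly distributed points in a 100-by-100-meter square and averaging their distances) should land very close to it:

set.seed(1)
npairs <- 1e+05
mean(sqrt((runif(npairs, 0, 100) - runif(npairs, 0, 100))^2 + (runif(npairs, 0, 100) - runif(npairs, 0, 100))^2))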

2.1.1 plotting distances

Let’s plot the distance from a raster cell (I’ll arbitrarily pick one) to the rest of the raster cells in the city.

egocell <- 100 # we pick the 100th cell in the raster.

palette <- leaflet::colorNumeric(
  palette = "viridis",
  domain = distmat[egocell,]
)
leaflet::leaflet(cityrast) |>
  leaflet::addTiles() |>
  leaflet::addProviderTiles(providers$Stamen.Toner) |>
  leaflet::addCircles( # Plotting the raster with color to represent distances
    data = cityrast,
    label = ~round(distmat[egocell,], digits = 3), # Hovering shows the distance
    color = ~palette(distmat[egocell,]),
    opacity = 0.7
  ) |>
  leaflet::addCircles( # Plotting our "egocell" (in red)
    data = cityrast[egocell,],
    label = "Ego",
    color = "red",
    opacity = 1
  ) |>
  leaflet::addLegend(
    "topleft",
    pal = palette, 
    values = ~distmat[egocell,],
    title = "Distance to ego (km)",
    opacity = 0.8
  )

2.2 From distance to proximity

2.2.1 distance decay function

We use the distance matrix to calculate the proximity matrix. We need a distance decay function for this - in this example we’ll use an exponential function with a parameter s that determines the slope.

s <- 2
proxmat <- exp(-distmat * s)
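
To get a feel for how quickly this decays, here is a small throwaway sketch that evaluates the decay function at a few distances (in km) for two different slopes; with s = 2, proximity is roughly 0.9 at 52 meters but only about 0.14 at 1 km:

d <- c(0.052, 0.1, 0.5, 1, 2, 5)  # distances in km
rbind(`s = 2` = round(exp(-d * 2), 3), `s = 5` = round(exp(-d * 5), 3))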

2.2.2 plotting proximities

Let’s plot proximity like we did for distance:

egocell <- 100 # we pick the 100th cell in the raster.

palette <- leaflet::colorNumeric(
  palette = "viridis",
  domain = proxmat[egocell,]
)
leaflet::leaflet(cityrast) |>
  leaflet::addTiles() |>
  leaflet::addProviderTiles(providers$Stamen.Toner) |>
  leaflet::addCircles(
    data = cityrast,
    label = ~round(proxmat[egocell,], digits = 3),
    color = ~palette(proxmat[egocell,]),
    opacity = 0.7
  ) |>
  leaflet::addCircles( # Plotting our "egocell" (in red)
    data = cityrast[egocell,],
    label = "Ego",
    color = "red",
    opacity = 1
  ) |>
  leaflet::addLegend(
    "topleft",
    pal = palette, 
    values = ~proxmat[egocell,],
    title = "Proximity to Ego",
    opacity = 0.8
  )

With this function, proximity decays quite fast over increasing distances: see how proximity effectively drops to zero just a few hundred meters (i.e. a few raster cells, shown in yellow) away from our egocell. By setting the slope parameter s to higher or lower values we can make this decay faster or slower.


2.3 social dimension of segregation

Now we need to choose which feature(s) of the raster cells we want to measure segregation on. In this example, the focus will be on ethnic residential segregation. To simplify the problem, we can dichotomise ethnic membership to distinguish between those who belong to a visible ethnic minority and those who don’t. We take residents with a non-western migration background as our minority group (for a definition of “non-western background” see here).

2.4 Preparing variables

Our raster data gives us the density (read: number of inhabitants) of each cell (cityrast$aantal_inwoners) and the percentage of them with a non-western migration background (percentage_niet_westerse_migr_achtergr).
Let’s first clean these variables. Statistics Netherlands reserves the integer -99997 to denote censored/missing data – that is, data not shared publicly on privacy grounds for raster cells with fewer than 5 residents. There are a few ways we can handle this - and we need to make an arbitrary decision. We could replace -99997 with some percentage or we could replace it with NA.

cityrast$pnw <- cityrast$percentage_niet_westerse_migr_achtergr
cityrast$pnw[cityrast$pnw == -99997] <- 0  # or some other arbitrary value
cityrast$pnw <- cityrast$pnw/100
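
If you prefer the NA route mentioned above, a sketch of one possible alternative (not used in the rest of this script; pnw_alt is just an illustrative name, and mean imputation is only one of many arbitrary choices) would be:

pnw_alt <- cityrast$percentage_niet_westerse_migr_achtergr
pnw_alt[pnw_alt == -99997] <- NA  # treat censored cells as missing
pnw_alt[is.na(pnw_alt)] <- mean(pnw_alt, na.rm = TRUE)  # impute the city-wide mean
pnw_alt <- pnw_alt/100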

Now cityrast$pnw gives us the proportion of residents with a non-western migration background.

Next, we calculate the number of residents with and without a non-western migration background. These are going to be named cityrast$n_nw and cityrast$n_w, respectively.

cityrast$n_nw <- cityrast$aantal_inwoners * cityrast$pnw
cityrast$n_w <- cityrast$aantal_inwoners - cityrast$n_nw

2.4.1 let’s have a quick look

cor.test(cityrast$n_nw, cityrast$n_w)
#> 
#>  Pearson's product-moment correlation
#> 
#> data:  cityrast$n_nw and cityrast$n_w
#> t = 14.441, df = 1034, p-value < 2.2e-16
#> alternative hypothesis: true correlation is not equal to 0
#> 95 percent confidence interval:
#>  0.3576994 0.4591299
#> sample estimates:
#>       cor 
#> 0.4096801
plot(cityrast$n_nw, cityrast$n_w)

2.4.2 plot ethnic/population density

We can plot these figures. For example, we could map a raster cell’s color to its proportion of residents with a non-western migration background; and we could map the transparency of the cell (alternatively, its size - see commented code) to its total population.

palette <- leaflet::colorNumeric(
  palette = "viridis",
  domain = cityrast$pnw
)
leaflet::leaflet(cityrast) |>
  leaflet::addTiles() |>
  leaflet::addProviderTiles(providers$Stamen.Toner) |>
  leaflet::addCircles(
    data = cityrast,
    color = ~palette(cityrast$pnw),
    #radius = ~cityrast$aantal_inwoners,
    opacity = ~(cityrast$aantal_inwoners / max(cityrast$aantal_inwoners))#0.7
  ) |>
  leaflet::addLegend(
    "topleft",
    pal = palette, 
    values = ~cityrast$pnw,
    title = "Prop. NW migr. background",
    opacity = 0.8
  )

2.5 Calculating the local environment

The package seg comes with a built-in function to calculate the local environment of each spatial unit (in our case, of each raster cell). This can be done using the following code.
There’s a catch, however. This function, localenv, assumes that the diagonal of the distance matrix is zero - that is, that the distance between two individuals from the same raster cell is zero. This is because localenv expects the distance matrix to be an object of class “nb” or “dist”, which makes setting the diagonal to something other than zero quite difficult (or downright impossible).

# Do not run:
# This method of calculating the local environment is flawed. See the next chunk of code for a better solution!

distmat2 <- proxy::as.dist(distmat)
myenv <- seg::localenv(
  x = sf::st_coordinates(cityrast),
  data = as.matrix(cbind(cityrast$n_w, cityrast$n_nw)), # selecting relevant columns 
  power = s,
  useExp = TRUE, 
  scale = FALSE, 
  maxdist = max(distmat2), 
  sprel = distmat2, ###
  tol = .Machine$double.eps
)

Therefore, the easiest workaround is to calculate the local environment ourselves. This is how we do it.
For our convenience, the function recalculates the proximity matrix from the distance matrix - this allows us to easily recompute the local environment with a different value of s.

fcalcLocalEnv <- function( # "data" is a 2-column matrix with the group counts
  data, coords, distmat, s, proj4string = sp::CRS("+proj=longlat +datum=WGS84")
  ) {
  
  # Recalculating proximities:
  proxmat <- exp(- distmat * s)
  
  # Calculating the local environment from scratch:
  #if(is.null(data)) data <- as.matrix(cbind(cityrast$n_w, cityrast$n_nw))
  env <- matrix(NA, nrow = nrow(data), ncol = 2)
  for (i in 1:nrow(data)) {
    env[i,1] <- stats::weighted.mean(x = data[,1], w = proxmat[i,])
    env[i,2] <- stats::weighted.mean(x = data[,2], w = proxmat[i,])
  }
  
  # And now we bundle this all together in an object of class
  # "SegLocal", which allows us to use the functions from the package
  # "seg" to calculate the various measures of segregation.
  return(seg::SegLocal(
    coords = coords,
    data = data,
    env = env,
    proj4string = proj4string
  ))
}

fcalcLocalEnv <- compiler::cmpfun(fcalcLocalEnv)

And then we can run it:

myenv <- fcalcLocalEnv(
    data = as.matrix(cbind(cityrast$n_w, cityrast$n_nw)),
    coords = sf::st_coordinates(cityrast),
    distmat = distmat,
    s = s  # already defined above
)
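
It can be worth taking a quick look at the resulting SegLocal object before moving on; the slots below are the ones we filled in fcalcLocalEnv:

head(myenv@data)  # raw counts per cell: n_w and n_nw
head(myenv@env)   # proximity-weighted local environments of those counts
summary(myenv@env[, 2]/rowSums(myenv@env))  # local share of non-western residents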

2.6 Spatial measures

2.6.1 Spatial segregation via seg package

We now have all we need to calculate the spatial measures.

seg::spatseg(env = myenv)
#> 
#>  Reardon and O'Sullivan's spatial segregation measures
#> 
#> Dissimilarity (D)     : 0.1496 
#> Relative diversity (R): 0.0107 
#> Information theory (H): 0.0121 
#> Exposure/Isolation (P): 
#>           Group 1   Group 2
#> Group 1 0.7886155 0.2113845
#> Group 2 0.7530019 0.2469981
#> --
#> The exposure/isolation matrix should be read horizontally.
#> Read 'help(spseg)' for more details.
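
If you want to store these numbers (e.g. to compare cities), you can keep the returned object and extract the measures from it. The slot names used below (d, r, h, p) are what the SegSpatial class appears to use; double-check with str(spseg) on your own machine:

spseg <- seg::spatseg(env = myenv)
data.frame(D = spseg@d, R = spseg@r, H = spseg@h)
spseg@p  # the exposure/isolation matrix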

2.6.2 Moran’s I

2.6.2.1 bivariate on counts

We can also calculate Moran’s I:

Let us use the counts and the bivariate version.

I <- fMoranI (
  x = myenv@data[,1],
  y = myenv@data[,2],
  weight = proxmat, ## The diagonal in distmat is ~52 meters
  scaled = FALSE,
  rowstandardize = TRUE
)
print(I[1:4])
#> $observed
#> [1] 0.01607163
#> 
#> $expected
#> [1] -0.0009661836
#> 
#> $sd
#> [1] 0.002554511
#> 
#> $p.value
#> [1] 2.563327e-11

Interpretation: areas where relatively high numbers of non-western minorities live are surrounded by areas where relatively high numbers of natives live. This is also what our quick-and-dirty correlation showed us above.

2.6.2.2 univariate on proportions

We could compare this with the spatial correlation of the proportions, without a density correction.

I <- fMoranI (
  x = myenv@data[,1] / rowSums(myenv@data),
  weight = proxmat, ## The diagonal in distmat is ~52 meters
  scaled = FALSE,
  rowstandardize = TRUE
)
print(I[1:4])
#> $observed
#> [1] 0.1271462
#> 
#> $expected
#> [1] -0.0009661836
#> 
#> $sd
#> [1] 0.002561822
#> 
#> $p.value
#> [1] 0

Interpretation: areas with a relatively high proportion of minorities are surrounded by areas with a relatively high proportion of minorities. (We passed the proportion of natives to fMoranI, but the univariate spatial autocorrelation of a proportion is identical to that of its complement.)

2.6.2.3 univariate on proportions density corrected

Finally, apply a density correction.

I <- fMoranIdens (
  x = myenv@data[,1] / rowSums(myenv@data), #proportion of majority in each cell
  weight = proxmat, ## The diagonal in distmat is ~52 meters
  dens = rowSums(myenv@data) / sum(myenv@data), #the proportion of the population in this cell compared to total environment
  N = sum(myenv@data) #total population size
)
I$globalI
#> [1] 0.1849895
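
fMoranIdens also returns the local contributions (localI). As an optional extra check, we could map them with the same leaflet recipe used above; palette_loc is just a throwaway name:

palette_loc <- leaflet::colorNumeric(palette = "viridis", domain = I$localI)
leaflet::leaflet(cityrast) |>
  leaflet::addTiles() |>
  leaflet::addCircles(
    data = cityrast,
    label = ~round(I$localI, digits = 3),
    color = ~palette_loc(I$localI),
    opacity = 0.7
  ) |>
  leaflet::addLegend("topleft", pal = palette_loc, values = ~I$localI,
    title = "Local Moran's I (density corrected)", opacity = 0.8)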

3 Looping over all Voronoi neighbourhoods

Let’s calculate the Moran’s I of each Voronoi tile. We could do the same for any other type of spatial unit that constitutes the city.
We start by selecting the Voronoi tiles in our city.

cityvor <- voronoi[voronoi$voronoi %in% rast$voronoi[cityrast_id], ]

Then we follow the same steps as above, but for each of the Voronoi tiles we have chosen. Here’s how it can be done in a simple (and largely improvable) for loop:

# slope of the distance decay function:
s <- 2

# initialise the column that will store the Moran's I estimates (tiles with a
# single raster cell will keep NA):
cityvor$moranI <- NA

# For each Voronoi tile "u" in the city...
for (u in 1:nrow(cityvor)) {
  
  #... we find which raster cells belong to tile "u".
  #tilerast <- subset(cityrast, cityrast$voronoi == cityvor$voronoi[u])
  tilerast <- cityrast[cityrast$voronoi == cityvor$voronoi[u],]
  
  # And if the tile contains more than one raster cell...
  if (nrow(tilerast) > 1) {
    # ... then calculate distances among its raster cells...
    distmat <- matrix(sf::st_distance(tilerast), ncol = nrow(tilerast))
    distmat <- distmat / 1000
    
    #... set the diagonal of the distance matrix...
    diag(distmat) <- 0.052140543316
    
    #... calculate the local environment of each cell...
    myenv <- fcalcLocalEnv(
      data = as.matrix(cbind(tilerast$n_w, tilerast$n_nw)),
      distmat = distmat,
      coords = sf::st_coordinates(tilerast),
      s = s
    )
    
    # Use the seg package to calculate segregation measures, or plug in your
    # own segregation functions or those from the OasisR package.
    proxmat <- exp(-distmat*s)
    #... calculate the I...
    #density corrected based on proportions
    I <- fMoranIdens (
      x = myenv@data[,1] / rowSums(myenv@data),
      weight = proxmat, ## The diagonal in distmat is ~52 meters
      dens = rowSums(myenv@data) / sum(myenv@data), 
      N = sum(myenv@data)
      )
    # 
    # I <- fMoranI (
    #   x = myenv@data[,1],
    #   y = myenv@data[,2],
    #   weight = proxmat, ## The diagonal in distmat is ~51 meters
    #   scaled = FALSE,
    #   rowstandardize = TRUE
    # )
    # 
    
    #... and, finally, save the I estimate to our data.frame "cityvor":
    cityvor$moranI[u] <- I$globalI
  }
  
}
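
Before moving on, a quick look at what the loop produced can’t hurt (tiles consisting of a single raster cell keep the NA we initialised them with):

summary(cityvor$moranI)
sum(is.na(cityvor$moranI))  # number of tiles without an estimate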

3.1 Assignment

Add more measures of segregation to cityvor.

3.2 visual inspection

Let’s plot what we got:

palette <- leaflet::colorNumeric(
  palette = "viridis", # or other RColorBrewer palettes e.g "Greens", "magma"
  domain = cityvor$moranI
)
leaflet::leaflet(cityvor) |>
  leaflet::addTiles() |>
  leaflet::addProviderTiles(providers$Stamen.Toner) |>
  leaflet::addPolygons(
    label = ~moranI,
    color = ~palette(moranI),
    opacity = 0.7
  ) |>
  leaflet::addLegend(
    "topleft",
    pal = palette, 
    values = ~moranI,
    title = "Moran I",
    opacity = 0.7
  )

3.3 Assignment

A. Add the polarization data to this dataset.

B. Calculate the correlation between the polarization measures and your segregation measures.
C. Repeat this exercise for your chosen region.




Copyright © 2022 Jochem Tolsma / Thomas Feliciani / Rob Franken