I have been experimenting with spatial data and looking at ways to combine data objects to get different views and perspectives. This post has the following objectives:
- Import and load data direct to R without manually saving files to a local directory, so that the end to end process is reproducible.
- Use vectors to efficiently clean the Excel sheet, reducing manual steps where possible so that the process could be leveraged to clean other Excel sheets in the Census data.
- Understand the format and structure of spatial objects and required transformations to select appropriate plotting functions.
- Join datasets to gain further insights to the original dataset.
The data used is the New Zealand 2013 age and sex Census data and a New Zealand spatial object, with regions as the common identifiers.
library(tidyverse)
# Packages for loading data
library(XLConnect)
library(readxl)
library(httr)
# Packages for data manipulation
library(stringi)
library(purrr)
# Packages for spatial files
library(rgdal)
library(rgeos)
# Packages for geoviz
library(ggiraph)
library(ggthemes)
Load New Zealand 2006 and 2013 Census data
Firstly, get census data from the Stats NZ website using httr and readxl R packages.
The 2011 Census was cancelled due to the Christchurch earthquake.
The census eventually took place on 5 March 2013, two years late. The five-yearly pattern will resume when the next census is held on 6 March 2018.
# Set the url and use GET function from httr to import the Excel file to a temporary local directory
url <- "http://archive.stats.govt.nz/~/media/Statistics/Census/2013%20Census/data-tables/regional-summary-part-1/regional-summary-tables-RC-final.xls"
GET(url, write_disk(tf <- tempfile(fileext = ".xls")))
## Response [http://archive.stats.govt.nz/~/media/Statistics/Census/2013%20Census/data-tables/regional-summary-part-1/regional-summary-tables-RC-final.xls]
## Date: 2018-08-15 05:06
## Status: 200
## Content-Type: application/vnd.ms-excel
## Size: 850 kB
## <ON DISK> C:\Users\Home\AppData\Local\Temp\RtmpK0km27\filec4845ae21e5a.xls
# Read list of tables from 1st worksheet that are available in subsequent worksheets
censustables <- read_excel(tf, 1L)
# Take a look at the census tables available
head(censustables)
## # A tibble: 6 x 2
## `2013 Census regional summary tables – part 1` X__1
## <chr> <chr>
## 1 <NA> <NA>
## 2 List of regional council area tables <NA>
## 3 1 Age group and sex, by re~
## 4 1b Age group and sex, by re~
## 5 2 Birthplace (broad geogra~
## 6 3 Ethnic group (grouped to~
# Read in table 1, which is the second sheet in the Excel file
sex <- read_excel(tf,2L)
# Inspect the file layout
head(sex,20)
## # A tibble: 20 x 7
## `Table 1` X__1 X__2 X__3 X__4 X__5 X__6
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 2 Age group and sex <NA> <NA> <NA> <NA> <NA> <NA>
## 3 By regional council area <NA> <NA> <NA> <NA> <NA> <NA>
## 4 For the census usually r~ <NA> <NA> <NA> <NA> <NA> <NA>
## 5 2006 and 2013 Censuses <NA> <NA> <NA> <NA> <NA> <NA>
## 6 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 7 Age group and regional c~ 2006 Ce~ <NA> <NA> 2013 C~ <NA> <NA>
## 8 <NA> Male Female Total ~ Male Fema~ Total ~
## 9 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 10 Northland Region <NA> <NA> <NA> <NA> <NA> <NA>
## 11 0–4 Years 5169 5100 10269 5457 5202 10659
## 12 5–9 Years 5889 5670 11562 5565 5454 11019
## 13 10–14 Years 6684 6264 12951 5670 5400 11073
## 14 Total people, 0–14 Years 17742 17037 34779 16689 16059 32748
## 15 15–19 Years 5466 5091 10560 5001 4626 9627
## 16 20–24 Years 3357 3246 6600 3435 3621 7056
## 17 25–29 Years 2940 3384 6327 3156 3498 6657
## 18 30–34 Years 3882 4395 8274 3042 3546 6591
## 19 35–39 Years 4668 5295 9963 3579 4320 7899
## 20 40–44 Years 5247 5880 11127 4563 5271 9834
Clean Excel file
Next we will clean the Census file using logical vectors and the stringi R package.
# Remove the two "Total People" columns
sex <- sex[,c(-4,-7)]
# Rename columns
names(sex) <- c("agegroup", "male2006","female2006","male2013","female2013")
# Replace the hypens with to in the agegroup column using regex and stringi
sex$agegroup <- str_replace_all(sex$agegroup, pattern = "[[:punct:]]", "to")
# Based on inspection of the file, remove header and footer file with empty space, Total regional council areas, Area Outside Region and Total New Zealand
sex <- sex[c(10:377),]
# Create a vector with the region names. From observing the file, the regions are populated in the first column in steps of 23 and the len with is the number of rows, which should be divisible by the number of steps
steps <- 23
len <- dim(sex)[1]
region <- sex[seq(1, len, by=steps),1]
names(region) <-"region"
# Add a column with the region names filled in groups of 23 using dplyr mutate
sex <- sex %>%
mutate(region =rep(region$region,each=steps) )
# Create a logical vector which will be used to remove rows containing Total
t <- sex %>% select(agegroup) %>%
unlist() %>%
stri_detect_fixed('Total')
# Remove rows with Total
sex <- sex %>%
filter(!t)
# Use a for loop to iterate over the region$region vector and remove each region name 1 by 1.
for (i in seq_along(region$region)) {
# Create a logical vector r to remove rows containing region name i
r <- sex %>% select(agegroup) %>%
unlist() %>%
stri_detect_fixed(region$region[i])
# Remove rows with region name i
sex <- sex %>%
filter(!r)
}
# Change the numerical variables from characters
sex[,c(2,3,4,5)] <- as.numeric(as.character(unlist(sex[,c(2,3,4,5)])))
Load Spatial Data
Next download the spatial file with New Zealand regions REGC2017_GV_Clipped.shp to a temporary directory from the Stats NZ website.
# Download the ESRI shapefile to the working directory, first capture the url
URL <- "http://www3.stats.govt.nz/digitalboundaries/annual/ESRI_Shapefile_2017_Digital_Boundaries_Generalised_Clipped.zip"
# Create a temporary directory
td <- tempdir()
# Create a placeholder file
temp <- tempfile(tmpdir=td, fileext=".zip")
# Download into the placeholder file
download.file(URL, temp)
# Check the name of the REGC2017_GV_Clipped.shp file in the zip archive
unzip(temp, list=TRUE)$Name[52]
## [1] "2017 Digital Boundaries Generalised Clipped/REGC2017_GV_Clipped.shp"
# Unzip the file to the temporary directory
unzip(temp, exdir=td, overwrite=TRUE)
# Create a new directory path for thelocation of the unzipped files as they have a subfolder
unziptd <- paste0(td,"\\",list.files(td)[1])
# Read in the REGC2017_GV_Clipped.shp files
nz_region <- readOGR(dsn = unziptd, layer="REGC2017_GV_Clipped", stringsAsFactors = F)
## OGR data source with driver: ESRI Shapefile
## Source: "C:\Users\Home\AppData\Local\Temp\RtmpK0km27\2017 Digital Boundaries Generalised Clipped", layer: "REGC2017_GV_Clipped"
## with 17 features
## It has 4 fields
# unlink the temporary files and folders
unlink(temp)
unlink(td)
unlink(unziptd)
Explore Spatial Data
Let’s explore the plotting options first given this spatial object following the introduction to spatial in R tutorial. This nz_region file is a SpatialPolygonsDataFrame. These spatial files have data and polygons “slots” which are each accessed with @. Our plotting options are initially R base plot.
The generic plot() function can use Spatial* objects directly; plot is one of the most useful functions in R, as it changes its behaviour depending on the input data (this is called polymorphism by computer scientists). Inputting another object such as plot(lnd@data) will generate an entirely different type of plot.
Also note that the R package ggmap plots objects of class ggmap and since we are using our sourced spatial object, we will not use this package in this exercise.
# Let's see a summary of the spatial object, the names refer the variables in the data slot
summary(nz_region,level=1)
## Object of class SpatialPolygonsDataFrame
## Coordinates:
## min max
## x 1089970 2470042
## y 4747987 6194308
## Is projected: TRUE
## proj4string :
## [+proj=tmerc +lat_0=0 +lon_0=173 +k=0.9996 +x_0=1600000
## +y_0=10000000 +ellps=GRS80 +units=m +no_defs]
## Data attributes:
## REGC2017 REGC2017_N AREA_SQ_KM LAND_SQ_KM
## Length:17 Length:17 Min. : 424.4 Min. : 424.4
## Class :character Class :character 1st Qu.: 8119.6 1st Qu.: 8048.7
## Mode :character Mode :character Median :12280.4 Median :12070.5
## Mean :15771.5 Mean :15583.0
## 3rd Qu.:23319.7 3rd Qu.:23243.8
## Max. :45207.2 Max. :44507.6
# Take a look at the slot names
slotNames(nz_region)
## [1] "data" "polygons" "plotOrder" "bbox" "proj4string"
# First plot the spatial object using base plot
plot(nz_region, col = 'light blue', border = 'blue')
Transform coordinate reference system (CRS)
In order to use the the other plotting packages available we need to convert the coordinate reference system (CRS) using the spTransform function from the rgdal R package as described in this blog from
+proj=tmerc +lat_0=0 +lon_0=173 +k=0.9996 +x_0=1600000 +y_0=10000000 +ellps=GRS80 +units=m +no_defs
to
‘WGS84’ (epsg:4326) which is a very commonly used CRS worldwide.
# Transform the coordinate reference system (CRS)
wgs84 = '+proj=longlat +datum=WGS84'
nz_region_wgs <- spTransform(nz_region, CRS(wgs84))
# Confirm the coord ref is now longlat
summary(nz_region_wgs)
## Object of class SpatialPolygonsDataFrame
## Coordinates:
## min max
## x -176.89314 178.57724
## y -47.28999 -34.39263
## Is projected: FALSE
## proj4string :
## [+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0]
## Data attributes:
## REGC2017 REGC2017_N AREA_SQ_KM LAND_SQ_KM
## Length:17 Length:17 Min. : 424.4 Min. : 424.4
## Class :character Class :character 1st Qu.: 8119.6 1st Qu.: 8048.7
## Mode :character Mode :character Median :12280.4 Median :12070.5
## Mean :15771.5 Mean :15583.0
## 3rd Qu.:23319.7 3rd Qu.:23243.8
## Max. :45207.2 Max. :44507.6
# Remove the nz_regions spatial object to save memory
rm(nz_region)
# Plot the transformed dataframe
plot(nz_region_wgs, col = 'light blue', border = 'blue')
# Remove the Area outside region which is -176 longitude to focus in this exercise on the north and south islands.
nz_region_wgs<- subset(nz_region_wgs, REGC2017_N != "Area Outside Region")
# Plot the transformed dataframe
plot(nz_region_wgs, col = 'blue', border = 'white')
Fortify Data
Next we will use ggplot2 which requires a dataframe to plot spatial objects, according to the tutorial. We will use the fortify function from the rgeos R package.
# Update the region id 'REGC2017'
nz_region_wgs@data$id <- row.names(nz_region_wgs@data)
# Fortify the nz_region_wgs to add long and lat variables and convert to a dataframe
nz_region_f <- fortify(nz_region_wgs,id='id')
## Regions defined for each Polygons
# Check the new id variable in the fortified dataframe. This refers to the number of elements in the SpatialPolygonsData
table(nz_region_f$id)
##
## 0 1 10 11 12 13 14 15 2 3
## 155702 147730 47849 34626 120469 28807 4185 96450 73019 37105
## 4 5 6 7 8 9
## 14618 17498 16068 16881 20780 39287
# Join the new fortified and the @data
nz_region_f <- merge(nz_region_f, nz_region_wgs@data, by="id")
# Let's check the longitude and latitude ranges to fit the map onto one page
range(nz_region_f$long)
## [1] 166.4261 178.5772
# Now order the dataframe
nz_region_f <- nz_region_f[order(nz_region_f$order), ]
Join non-spatial Census and spatial data
Now let’s join the non-spatial Census data with the data in the spatial object and create a population density variable.
# Compare the region names for the Census data and the spatial data to note differences
setdiff(unique(sex$region),unique(nz_region_wgs$REGC2017_N))
## [1] "Hawketos Bay Region" "ManawatutoWanganui Region"
# Replace the regions in the sex dataframe to match the spatial object
sex <- sex %>%
mutate(region = replace(region, region == "Hawketos Bay Region", "Hawke's Bay Region"), region = replace(region, region == "ManawatutoWanganui Region", "Manawatu-Wanganui Region"))
# Sum the male and female numbers grouping by region
sex_ages <- sex %>%
group_by(region) %>%
summarize_at( .vars = colnames(.)[2:5] , sum)
# Merge the nz_region and sex datasets by the region
nz_region_f <- merge(nz_region_f,sex_ages, by.x='REGC2017_N', by.y='region')
# Add new variables for the population density which is a relative measure than only population counts for the regions
nz_region_f <- nz_region_f %>%
mutate(maledens2006 = male2006/AREA_SQ_KM,
femaledens2006 = female2006/AREA_SQ_KM,
maledens2013 = male2013/AREA_SQ_KM,
femaledens2013 = female2013/AREA_SQ_KM)
Interactive Map
Finally let’s create an interactive map using ggiraph.
# Now order the dataframe
nz_region_f <- nz_region_f[order(nz_region_f$order), ]
# Compare the region names for the Census data and the transformed and fortified spatial dataframe to note differences
setdiff(unique(sex$region),unique(nz_region_f$REGC2017_N))
## character(0)
# Remove the nz_region_wgs spatial object to save memory
rm(nz_region_wgs)
# Create an object g with the merged dataframe with the ggplot and geom_polygon functions
nz_region_f$tooltip <- paste0(nz_region_f$REGC2017_N, " : ", round(nz_region_f$femaledens2013), " per sq km")
g <- ggplot(nz_region_f[,c(1:11,19,20)], aes(long, lat,group = group)) +
geom_polygon_interactive(aes(fill=femaledens2013,
tooltip = htmltools::htmlEscape(tooltip, TRUE))) +
geom_path(color = "white") +
coord_map() +
scale_fill_distiller(type="seq", trans="reverse", palette = "BuPu", name="Population per sq km") +
theme_void() +
labs(color="Pop per sq km)") +
ggtitle("New Zealand Female Population Density in 2013")
# Now plot the g object using ggiraph
fe2013 <- ggiraph(code = print(g))
fe2013