Setup

library(raster)
library(tidyverse)
library(sf)
library(rpart)
library(rpart.plot)
library(rasterVis)
library(mapedit)
library(mapview)
library(caret)
library(viridis)
library(forcats)
library(ggplot2)
library(psych)
library(plotROC)
library(pROC)
library(FNN)
mapTheme <- function(base_size = 12) {
  theme(
    text = element_text( color = "black"),
    plot.title = element_text(size = 14,colour = "black"),
    plot.subtitle=element_text(face="italic"),
    plot.caption=element_text(hjust=0),
    axis.ticks = element_blank(),
    panel.background = element_blank(),axis.title = element_blank(),
    axis.text = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=2)
  )
}
load(".RData")

0 Motivation

The ultimate goal for developing this project is keeping people and property safe from inundation. With the algorithm we developed and the prediciton results, planners and developers could better understand the spatial distribution of the risk of inundation and make informed decision to avoid the loss of life and property. This algortithm can be applied to cities with similar conditions to Calgary, like Edmonton that we showed in this project. Water department of these cities can be in charge of collecting data and making prediction. The results should be publicized.

1 Features Building

Dependent Variable: Inundation
inundation <- aggregate(raster("inundation_re.tif"),fact = 3)
plot(inundation)
title(main = "Dependent Variable: Inundation - Calgary")

plot

1.1 Topography

slope <- aggregate(raster("slope_c.tif"), fact = 20) %>% 
         projectRaster(inundation,method = 'ngb')
elevation <- aggregate(raster("elevation_c.tif"), fact = 20) %>% 
         projectRaster(inundation,method = 'ngb')

1.2 Hydrology Analysis

dist_main <- aggregate(raster("dist_main_c.tif"), fact = 20) %>% 
             projectRaster(inundation,method = 'ngb')
dist_stream <- aggregate(raster("dist_stream_c.tif"), fact = 20) %>% 
               projectRaster(inundation,method = 'ngb')
fowdist_down <- aggregate(raster("fowdist_down.tif"), fact = 20) %>% 
                projectRaster(inundation,method = 'ngb')

Distance from Mainstream

plot(dist_main)
title(main="Distance from Mainstream")

plot

Flow length to Downstream

We calculated the flow length to the downstream since downstream area is likely more vulnerable to the flooding. This festure is a very strong predictor according to the regression. Flow Length to Downstream

1.3 Landcover

landcover <- raster("landcover.tif") %>% 
             projectRaster(inundation,method = 'ngb')
lc_label <- ratify(landcover)
levels(lc_label) <- levels(lc_label)[[1]] %>%
  mutate(legend = c("Impermable","Permeable","NaturalCover"))
Landcover
levelplot(lc_label, maxpixels = 1e6,
          col.regions = c('burlywood', 'cyan', 'darkgreen'),
          scales=list(draw=FALSE),
          main = "Landcover")

plot

1.4 Classifying Satellite Imagery

band1 <- raster("band1.tif")
band2 <- raster("band2.tif")
band3 <- raster("band3.tif")
band4 <- raster("band4.tif")
band5 <- raster("band5.tif")
band6 <- raster("band6.tif")
band7 <- raster("band7.tif")
band8 <- raster("band8.tif")
band9 <- raster("band9.tif")
band10 <- raster("band10.tif")
band11 <- raster("band11.tif")
band8 <- aggregate(band8, fact = 2)

image <- stack(band1, band2, band3, band4, band5, band6, band7, 
               band8, band9, band10, band11)
points <- viewRGB(image, r = 4, g = 3, b = 2) %>% editMap()
# save as clouds after first iteration
clouds <- points$finished$geometry %>% st_sf() %>% mutate(class = "clouds", id = 1)
# save as developed land second time
developed <- points$finished$geometry %>% st_sf() %>% mutate(class = "developed", id = 2)
# then save as undeveloped land after third iteration
undeveloped <- points$finished$geometry %>% st_sf() %>% mutate(class = "undeveloped", id = 3)
# finally save as water
water <- points$finished$geometry %>% st_sf() %>% mutate(class = "water", id = 4)
training_points <-  st_read("calgary_trainingPoints.shp")
training_points <- as(training_points, 'Spatial')
df <- raster::extract(image, training_points) %>%
  round()
df <- data.frame(training_points$class, df)
model.class <- rpart(as.factor(training_points.class)~., data = df, method = 'class')
pr <- predict(image, model.class, type ='class', progress = 'text') %>% 
  ratify()
levels(pr) <- levels(pr)[[1]] %>%
  mutate(legend = c("cloud","developed","undeveloped","water"))
Supervised Classification of Imagery
levelplot(pr, maxpixels = 1e6,
          col.regions = c('cyan', 'burlywood', 'darkgreen', 'blue'),
          scales=list(draw=FALSE),
          main = "Supervised Classification of Imagery")

plot

test <- raster::extract(pr, training_points) %>% 
  as.data.frame() %>% 
  rename(id = ".")

testProbs <- data.frame(
  obs = as.factor(training_points$id),
  pred = as.factor(test$id)
) %>% 
  mutate(correct = ifelse(obs == pred, 1, 0))

confMatrix <- confusionMatrix(testProbs$obs, testProbs$pred)

2 Create Fishnet & Summarise features by grid cell

Boundary <- 
  st_read("boundary.shp") %>%
  st_transform(crs=4326)
fishnet <- 
  st_make_grid(Boundary, cellsize = 0.005) %>%
  st_sf()[Boundary,] %>%
  mutate(uniqueID = rownames(.)) %>%
  dplyr::select(uniqueID)
ggplot() +
  geom_sf(data=fishnet) +
  labs(title = "Fishnet in Calgary") +
  mapTheme()
nums <- stack(dist_main, dist_stream, fowdist_down, slope, elevation) %>% 
           raster::extract(fishnet, fun=mean) %>% 
           as.data.frame()

For category variables, we used majority as aggregate function.

majority <- function(vec,...) {
  if (is.null(vec)){
    return(NA)
  } else {
    return(as.numeric(names(which.max(table(vec))))[1])
  }
}
cats <- stack(landcover,inundation) %>% 
           raster::extract(fishnet, fun=majority) %>% 
           as.data.frame() 

Since we want to predict inundation, water is the most important type in landcover and we will build another feature based on it. However, water is so rare and scattered that it is hard to capture water landcover by directly using majority. Here, we modified the function so that as long as water area is larger than 1/12 of one cell, this cell would be identified as water.

majority_landpre <- function(vec,...) {
  if (is.null(vec)){
    return(NA)
  } else if (is.na(table(vec)[3])==F){
    if (as.numeric(table(vec)[3])>length(vec)/12){
      return(3)
    }
    else {
    return(as.numeric(names(which.max(table(vec))))[1])
    }
  }
    else {
    return(as.numeric(names(which.max(table(vec))))[1])
  }
}
landpre <- projectRaster(pr,inundation,method = 'ngb')
landpre_net <- stack(landpre) %>% 
           raster::extract(fishnet, fun=majority_landpre) %>% 
           as.data.frame()
vars <- cbind(nums,landpre_net) %>%
        cbind(cats) %>% 
        cbind(as.data.frame(fishnet)) %>% 
        na.omit()
names(vars) <- c("dist_main", "dist_stream", "fowdist_down", "slope","elevation", "landpre", "landcover",  "inundation","id","geometry" ) 
vars$inundation <- as.factor(vars$inundation)
vars$landcover <- as.factor(vars$landcover)
vars$landpre <- as.factor(vars$landpre)
ggplot() +
  geom_sf(data=st_sf(vars), aes(fill=landpre)) +
  labs(title="Supervised Classification of Satellite Imagery - Calgary") +
  mapTheme() 

1-cloud, 2-developed, 3-undeveloped, 4-water

3 Exploratory analysis

vars_cat <- vars %>%
    dplyr::select(inundation, landcover, landpre) %>%
    gather(Variable, value, -inundation) %>%
    count(Variable, value, inundation) %>% 
    as.data.frame
vars_cat[vars_cat$inundation==1,]$n <- vars_cat[vars_cat$inundation==1,]$n*200/sum(vars_cat[vars_cat$inundation==1,]$n) 
vars_cat[vars_cat$inundation==0,]$n <- vars_cat[vars_cat$inundation==0,]$n*200/sum(vars_cat[vars_cat$inundation==0,]$n)
Feature associations with the Likelihood of Inundation
ggplot(vars_cat, aes(value, n, fill = inundation)) +   
  geom_bar(position = "dodge", stat="identity") +
  facet_wrap(~Variable, scales="free") +
  scale_fill_manual(values = c("darkgreen","chocolate1")) +
  labs(x="", y="Percentile Value",
       title = "Feature associations with the Likelihood of Inundation",
       subtitle = "Two category features") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

plot

As charts showed above, the likelihood of inundation distributes quite unevenly across different categories of land cover and predicted land cover.

Variable Distributions across Inundation and Non-inundation
ggplot(data = vars %>% 
         select(-id,-geometry) %>% 
         gather(variable,value,dist_main:elevation), aes(inundation, value)) +
  geom_boxplot(aes(fill=inundation)) +  
  facet_wrap(~variable,scales="free",ncol=3) +
  scale_fill_manual(values = c("darkgreen","chocolate1"),
                      name = "Inundation",
                      labels=c("N","Y")) +
  labs(title="Variable Distributions across Inundation and Non-inundation",
       x="Inundation",
       y="Value") +
  theme(plot.title = element_text(size = 18,colour = "black"))

plot

4 Feature Engineering

4.1 Distance from Water

nn_function <- function(measureFrom,measureTo,k) {
  measureFrom_Matrix <-
    as.matrix(measureFrom)
  measureTo_Matrix <-
    as.matrix(measureTo)
  nn <-   
    get.knnx(measureTo_Matrix, measureFrom_Matrix, k)$nn.dist
    output <-
      as.data.frame(nn) %>%
      rownames_to_column(var = "thisPoint") %>%
      gather(points, point_distance, V1:ncol(.)) %>%
      arrange(as.numeric(thisPoint)) %>%
      group_by(thisPoint) %>%
      summarize(pointDistance = mean(point_distance)) %>%
      arrange(as.numeric(thisPoint)) %>% 
      dplyr::select(-thisPoint) %>%
      pull()
  
  return(output)  
}
vars$dist_water <- nn_function(st_coordinates(st_centroid(st_sf(vars))),
                                              st_coordinates(st_centroid(st_sf(vars[vars$landpre==4,]))),8)

Here we assume that those places near existing water have higher likelihood of inundation. So, for each cell, we calculated the mean of distance between this cell and 8 nearest cell classified as water. In this way, we took the area size of water into consideration and limited the distance that could be too long.

ggplot() +
  geom_sf(data=st_sf(vars), aes(fill=dist_water)) +
  scale_fill_viridis() +
  labs(title="KNN(8) Distance from Water") +
  mapTheme() 

ggplot(data = vars %>% 
         select(-id,-geometry) %>% 
         gather(variable,value,dist_water), aes(inundation, value)) +
  geom_boxplot(aes(fill=inundation)) +
  scale_fill_manual(values = c("darkgreen","chocolate1"),
                      name = "Inundation",
                      labels=c("N","Y")) +
  labs(title="Distance from Water distributions across Inundation and Not",
       x="Inundation",
       y="Value") +
  theme(plot.title = element_text(size = 18,colour = "black"))

This plot shows that generally speaking, inundation area is much closer to water than non-inundation area.

4.2 Relative Elevation & Slope

mean_ele <- function(d){
  return(mean(vars$elevation[as.numeric(as.vector(d[1,]))]))
}
mean_slope <- function(d){
  return(mean(vars$slope[as.numeric(as.vector(d[1,]))]))
}
knn_mean <- function(df,n,f){
  Matrix <-  as.matrix(st_coordinates(st_centroid(st_sf(df))))
  nn <- get.knnx(Matrix, Matrix, n)$nn.index
  df$id <- seq(1:length(df[,1]))
  df <- cbind(df$id,as.data.frame(nn)) %>% 
        group_by(df$id) %>% 
        nest() %>% 
        mutate(mean= map_dbl(data, f))
  return(df$mean)}  
vars$local_ele <- knn_mean(vars[,c("elevation","geometry")],24,mean_ele)
vars$relative_ele <- vars$elevation - vars$local_ele
vars$local_ele <- NULL
vars$local_slope <- knn_mean(vars[,c("slope","geometry")],24,mean_slope)
vars$relative_slope <- vars$slope - vars$local_slope
vars$local_slope <- NULL

Relative Slope was calculated by comparing the slope of each cell with the average slope of 24 nearest cell. Though it is still not a good predictor as the later regression shows, it is much better than raw slope data as proved during our experiment.

ggplot() +
  geom_sf(data=st_sf(vars), aes(fill=relative_slope)) +
  scale_fill_viridis() +
  labs(title="Relative Slope") +
  mapTheme() 

ggplot(data = vars %>% 
         select(inundation,elevation,relative_ele,slope,relative_slope) %>% 
         gather(variable,value,-inundation), aes(inundation, value)) +
  geom_boxplot(aes(fill=inundation)) +  
  facet_wrap(~variable,scales="free",ncol=2) +
  scale_fill_manual(values = c("darkgreen","chocolate1"),
                      name = "Inundation",
                      labels=c("N","Y")) +
  labs(title="Relative Slope distributions across inundation and not",
       x="Inundation",
       y="Value") +
  theme(plot.title = element_text(size = 18,colour = "black"))

plot

5 Regression

5.1 Normalization

Since the model trained by Calgary data will be applied to other city, we firstly normalized all features by unifying their range between 0 to 1.

normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
vars_norm <- dplyr::select(vars,landpre,landcover,inundation,geometry)
vars_norm$dist_main <- normalize(vars$dist_main)
vars_norm$fowdist_down <- normalize(vars$fowdist_down)
vars_norm$relative_slope <- normalize(vars$relative_slope)
vars_norm$elevation <- normalize(vars$elevation)
vars_norm$dist_water <- normalize(vars$dist_water)
vars_norm$relative_ele <- normalize(vars$relative_ele)
vars_norm$slope <- normalize(vars$slope)
vars_norm$dist_stream <- normalize(vars$dist_stream)

5.2 Split test/traning

We randomly select 70% cells as training set and other 30% as test set.

set.seed(3456)
trainIndex <- createDataPartition(vars_norm$inundation, p = .70,
                                  list = FALSE,
                                  times = 1)
varsTrain <- vars_norm[ trainIndex,]
varsTest  <- vars_norm[-trainIndex,]
reg <-  glm(inundation ~ ., family = "binomial"(link="logit"), 
        data = varsTrain  %>%
          as.data.frame() %>% 
          dplyr::select(-relative_ele,-slope,-dist_stream,-geometry))
summary(reg)
## 
## Call:
## glm(formula = inundation ~ ., family = binomial(link = "logit"), 
##     data = varsTrain %>% as.data.frame() %>% dplyr::select(-relative_ele, 
##         -slope, -dist_stream, -geometry))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1720  -0.0708  -0.0218  -0.0005   6.6129  
## 
## Coefficients:
##                Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)      6.3491     1.2866   4.935  0.00000080219506469 ***
## landpre2        -2.4677     0.9842  -2.507               0.0122 *  
## landpre3        -1.4308     0.9969  -1.435               0.1512    
## landpre4        -2.0411     1.0569  -1.931               0.0534 .  
## landcover2       0.4533     0.3948   1.148               0.2508    
## landcover3       1.0902     0.4784   2.279               0.0227 *  
## dist_main       -3.3347     1.6679  -1.999               0.0456 *  
## fowdist_down    20.1896     2.5456   7.931  0.00000000000000217 ***
## relative_slope  -1.5163     1.6993  -0.892               0.3722    
## elevation      -58.4036     6.5553  -8.909 < 0.0000000000000002 ***
## dist_water      -6.2417     1.5104  -4.132  0.00003588850668639 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 849.52  on 2622  degrees of freedom
## Residual deviance: 348.07  on 2612  degrees of freedom
## AIC: 370.07
## 
## Number of Fisher Scoring iterations: 10

The summary of regression on training data shows that elevation, flow length to downstream, and distance from water are the most strong features.

classProbs <- predict(reg, varsTest, type="response")
testProbs <- data.frame(obs = as.numeric(as.character(varsTest$inundation)),
                        pred = classProbs)

5.2.1 Distribution of predicted probabilities by observed outcome

ggplot(testProbs, aes(x = pred, fill=as.factor(obs))) + geom_density() +
  facet_grid(obs ~ .,scales = "free") + xlab("Probability") + geom_vline(xintercept = .5) +
  scale_fill_manual(values = c("dodgerblue4", "darkgreen"),
                      labels = c("Non-inundation","Inundation"),
                      name = "")

plot

5.2.2 ConfusionMatrix

The threshhold we chose here is 0.2.

testProbs$predClass  <-  ifelse(testProbs$pred > .2 ,1,0)
caret::confusionMatrix(reference = as.factor(testProbs$obs), 
                       data = as.factor(testProbs$predClass), 
                       positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1048    4
##          1   33   38
##                                           
##                Accuracy : 0.9671          
##                  95% CI : (0.9549, 0.9767)
##     No Information Rate : 0.9626          
##     P-Value [Acc > NIR] : 0.2433          
##                                           
##                   Kappa : 0.6564          
##                                           
##  Mcnemar's Test P-Value : 0.000004161     
##                                           
##             Sensitivity : 0.90476         
##             Specificity : 0.96947         
##          Pos Pred Value : 0.53521         
##          Neg Pred Value : 0.99620         
##              Prevalence : 0.03740         
##          Detection Rate : 0.03384         
##    Detection Prevalence : 0.06322         
##       Balanced Accuracy : 0.93712         
##                                           
##        'Positive' Class : 1               
## 

5.2.3 ROC Curve & AUC

The AUC is near 1, which suggests that this model might be a perfect fit or an overfit model.

ggplot(testProbs, aes(d = obs, m = pred)) + 
  geom_roc(n.cuts = 50, labels = FALSE) + 
  style_roc(theme = theme_grey) +
  geom_abline(slope = 1, intercept = 0, size = 1.5, color = 'grey') +
  labs(title = "ROC Curve - Calgary Inundation Model")

auc(testProbs$obs, testProbs$pred)
## Area under the curve: 0.9713

5.3 Cross Validation

We then did 100-fold cross validation

levels(vars_norm$inundation) <- c("N","Y")
ctrl <- trainControl(method = "cv", 
                     number = 100, 
                     classProbs=TRUE, 
                     summaryFunction=twoClassSummary)

cvFit <- train(inundation ~ .,  
              data = vars_norm  %>%
                     as.data.frame() %>% 
                     dplyr::select(-relative_ele,-slope,-dist_stream,-geometry),
               method="glm", family="binomial",metric="ROC",
               trControl = ctrl)
cvFit

output

## Generalized Linear Model 
## 
## 3746 samples
##    7 predictor
##    2 classes: 'N', 'Y' 
## 
## No pre-processing
## Resampling: Cross-Validated (100 fold) 
## Summary of sample sizes: 3709, 3708, 3709, 3709, 3708, 3708, ... 
## Resampling results:
## 
##   ROC        Sens       Spec 
##   0.9660098  0.9941742  0.495
dplyr::select(cvFit$resample, -Resample) %>%
  gather(metric, value) %>%
  left_join(gather(cvFit$results[2:4], metric, mean)) %>%
  ggplot(aes(value)) + 
    geom_histogram(binwidth =0.01, fill = "#FF006A") +
    facet_wrap(~metric) +
    geom_vline(aes(xintercept = mean), colour = "#981FAC", linetype = 3, size = 1.5) +
    scale_x_continuous(limits = c(0, 1)) +
    labs(x="Goodness of Fit", y="Count", title="CV Goodness of Fit Metrics",
         subtitle = "Across-fold mean reprented as dotted lines")

Because we can’t select threshold during the cross validation(we don’t know how to customize the function) and the ratio of inundation to non-inundation is extremely small, sensitivity and Specificity are not good indicators. The distribution of AUC(ROC) suggests that generally speaking, our model generalizes not bad except there are two outliers. The reason for these extremly low AUC could be that in this fold, there are only a few inundation cells which happened to be the type that are difficult to predict.

6 Map Predictions

trainPredictions <- 
  predict(reg, varsTrain, type="response")
  
varsTrain <- 
  cbind(varsTrain,trainPredictions) %>%
  mutate(trainPredictions = round(trainPredictions * 100,5)) 
varsTrain$predClass <- ifelse(varsTrain$trainPredictions > 20 ,1,0)
varsTrain <- varsTrain %>% 
                 mutate(confusion = case_when(
                   inundation == 1 & predClass ==1 ~ "TruePositive",
                   inundation == 1 & predClass ==0 ~ "FalseNegative",
                   inundation == 0 & predClass ==1 ~ "FalsePositive",
                   inundation == 0 & predClass ==0 ~ "TrueNegative"
                 ))
ggplot() + 
  geom_sf(data=st_sf(varsTrain), aes(fill=confusion),colour=NA) +
  scale_fill_manual(values = c("darkred", "lightcoral","lightcyan","lightgreen"),name ="") +
  labs(title="Map Confusion Matrix of Calgary Inundation Prediction (Training Set) ") +
  mapTheme()

allPredictions <- 
  predict(cvFit, vars_norm, type="prob")[,2]
  
vars_norm <- 
  cbind(vars_norm,allPredictions) %>%
  mutate(allPredictions = round(allPredictions * 100,5)) 
ggplot() + 
    geom_sf(data=st_sf(vars_norm), aes(fill=factor(ntile(allPredictions,9))), colour=NA) +
    scale_fill_manual(values = rev(topo.colors(9)),
                      labels=as.character(quantile(vars_norm$allPredictions,
                                                 c(0.1,.2,.3,.4,.5,.6,.7,.8,.9),na.rm=T)),
                      name="Predicted\nProbabilities(%)\n(Quintile\nBreaks)") +
  mapTheme() +
  labs(title="Calgary Inundation Prediction")

Finally, we used the model to predict Edmonton, another Canada city that just 173 miles away from Calgary, which also has a river across the city. Below is the prediction result.

ggplot() + 
    geom_sf(data=vars_Edmonton, aes(fill=factor(ntile(Prediction,9))), colour=NA) +
    scale_fill_manual(values = rev(topo.colors(9)),
                      labels=as.character(quantile(vars_Edmonton$Prediction*100,
                                                 c(0.1,.2,.3,.4,.5,.6,.7,.8,.9),na.rm=T)),
                      name="Predicted\nProbabilities(%)\n(Quintile\nBreaks)") +
  mapTheme() +
  labs(title="Edmonton Inundation Prediction")