In-class exercise week 10

Published

March 25, 2023

Imports

Import packages

pacman::p_load(sf, sfdep, GWmodel, SpatialML, tidyverse, tmap, ggpubr, olsrr, devtools, rsample)

Data

mdata <- read_rds("data/aspatial/mdata.rds")
set.seed(1234) #To do data sampling

resale_split <- initial_split(mdata, prop = 6.5/10,)
train_data <- training(resale_split)
test_data <- testing(resale_split)
write_rds(train_data, "data/model/train_data.rds")
write_rds(test_data, "data/model/test_data.rds")
train_data <- read_rds("data/model/train_data.rds")
test_data <- read_rds("data/model/test_data.rds")

Building non-spatial multiple linear regression

price_mlr <- lm(resale_price ~ floor_area_sqm + storey_order +remaining_lease_mths + PROX_CBD +PROX_ELDERLYCARE + PROX_HAWKER + PROX_MRT + PROX_PARK + PROX_MALL + PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN + WITHIN_350M_CHILDCARE + WITHIN_350M_BUS +WITHIN_1KM_PRISCH,
                data=train_data)
summary(price_mlr)

Call:
lm(formula = resale_price ~ floor_area_sqm + storey_order + remaining_lease_mths + 
    PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER + PROX_MRT + PROX_PARK + 
    PROX_MALL + PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN + 
    WITHIN_350M_CHILDCARE + WITHIN_350M_BUS + WITHIN_1KM_PRISCH, 
    data = train_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-205193  -39120   -1930   36545  472355 

Coefficients:
                           Estimate Std. Error t value Pr(>|t|)    
(Intercept)              107601.073  10601.261  10.150  < 2e-16 ***
floor_area_sqm             2780.698     90.579  30.699  < 2e-16 ***
storey_order              14299.298    339.115  42.167  < 2e-16 ***
remaining_lease_mths        344.490      4.592  75.027  < 2e-16 ***
PROX_CBD                 -16930.196    201.254 -84.124  < 2e-16 ***
PROX_ELDERLYCARE         -14441.025    994.867 -14.516  < 2e-16 ***
PROX_HAWKER              -19265.648   1273.597 -15.127  < 2e-16 ***
PROX_MRT                 -32564.272   1744.232 -18.670  < 2e-16 ***
PROX_PARK                 -5712.625   1483.885  -3.850 0.000119 ***
PROX_MALL                -14717.388   2007.818  -7.330 2.47e-13 ***
PROX_SUPERMARKET         -26881.938   4189.624  -6.416 1.46e-10 ***
WITHIN_350M_KINDERGARTEN   8520.472    632.812  13.464  < 2e-16 ***
WITHIN_350M_CHILDCARE     -4510.650    354.015 -12.741  < 2e-16 ***
WITHIN_350M_BUS             813.493    222.574   3.655 0.000259 ***
WITHIN_1KM_PRISCH         -8010.834    491.512 -16.298  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 61650 on 10320 degrees of freedom
Multiple R-squared:  0.7373,    Adjusted R-squared:  0.737 
F-statistic:  2069 on 14 and 10320 DF,  p-value: < 2.2e-16
write_rds(price_mlr, "data/model/price_mlr.rds")

GWR methods

GWR predictive model

train_data_sp <- as_Spatial(train_data)
train_data_sp
class       : SpatialPointsDataFrame 
features    : 10335 
extent      : 11597.31, 42623.63, 28217.39, 48741.06  (xmin, xmax, ymin, ymax)
crs         : +proj=tmerc +lat_0=1.36666666666667 +lon_0=103.833333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +ellps=WGS84 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs 
variables   : 17
names       : resale_price, floor_area_sqm, storey_order, remaining_lease_mths,          PROX_CBD,     PROX_ELDERLYCARE,        PROX_HAWKER,           PROX_MRT,          PROX_PARK,   PROX_GOOD_PRISCH,        PROX_MALL,            PROX_CHAS,     PROX_SUPERMARKET, WITHIN_350M_KINDERGARTEN, WITHIN_350M_CHILDCARE, ... 
min values  :       218000,             74,            1,                  555, 0.999393538715878, 1.98943787433087e-08, 0.0333358643817954, 0.0220407324774434, 0.0441643212802781, 0.0652540365486641,                0, 6.20621206270077e-09, 1.21715176356525e-07,                        0,                     0, ... 
max values  :      1186888,            133,           17,                 1164,  19.6500691667807,     3.30163731686804,   2.86763031236184,   2.13060636038504,   2.41313695915468,   10.6223726149914, 2.27100643784442,    0.808332738794272,     1.57131703651196,                        7,                    20, ... 

Preparing coordinate data

Extracting coordinate data

coords <- st_coordinates(mdata)
coords_train <- st_coordinates(train_data)
coords_test <- st_coordinates(test_data)
write_rds(coords_train, "data/model/coords_train.rds")
write_rds(coords_test, "data/model/coords_test.rds")
coords_train <- read_rds("data/model/coords_train.rds")
coords_test <- read_rds("data/model/coords_test.rds")

Dropping geometry fields

train_data <- train_data %>%
  st_drop_geometry()

Callibrating random forest

set.seed(1234)
rf <- ranger(resale_price ~ floor_area_sqm + storey_order +remaining_lease_mths + PROX_CBD +PROX_ELDERLYCARE + PROX_HAWKER + PROX_MRT + PROX_PARK + PROX_MALL + PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN + WITHIN_350M_CHILDCARE + WITHIN_350M_BUS +WITHIN_1KM_PRISCH,
                data=train_data)
summary(rf)
                          Length Class         Mode     
predictions               10335  -none-        numeric  
num.trees                     1  -none-        numeric  
num.independent.variables     1  -none-        numeric  
mtry                          1  -none-        numeric  
min.node.size                 1  -none-        numeric  
prediction.error              1  -none-        numeric  
forest                        7  ranger.forest list     
splitrule                     1  -none-        character
treetype                      1  -none-        character
r.squared                     1  -none-        numeric  
call                          3  -none-        call     
importance.mode               1  -none-        character
num.samples                   1  -none-        numeric  
replace                       1  -none-        logical  
print(rf)
Ranger result

Call:
 ranger(resale_price ~ floor_area_sqm + storey_order + remaining_lease_mths +      PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER + PROX_MRT + PROX_PARK +      PROX_MALL + PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +      WITHIN_350M_CHILDCARE + WITHIN_350M_BUS + WITHIN_1KM_PRISCH,      data = train_data) 

Type:                             Regression 
Number of trees:                  500 
Sample size:                      10335 
Number of independent variables:  14 
Mtry:                             3 
Target node size:                 5 
Variable importance mode:         none 
Splitrule:                        variance 
OOB prediction error (MSE):       728602496 
R squared (OOB):                  0.9495728 

minimum node size is 5 can be increased

Higher R square value in comparison shows random forest performs better.

Geographically weighted random forest

#set.seed(1234)
#gwRF_adaptive <- grf(formula = resale_price ~ floor_area_sqm + storey_order +remaining_lease_mths + PROX_CBD +PROX_ELDERLYCARE + PROX_HAWKER + PROX_MRT + PROX_PARK + PROX_MALL + PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN + WITHIN_350M_CHILDCARE + WITHIN_350M_BUS +WITHIN_1KM_PRISCH,
               # dframe=train_data,
                #bw =55,
                #kernel = "adaptive",
                #coords = coords_train)
#summary(gwRF_adaptive)
#write_rds(gwRF_adaptive,"data/model/gwRF.rds")
#vi_df <- as.data.frame(gwRF_adaptive$Global.Model$variable.importance)
#gwRF_adaptive <- read_rds("data/model/gwRF.rds")
test_data <- cbind(test_data, coords_test) %>%
  st_drop_geometry()
#TAKES A LOT OF TIME TO RUN
#gwRD_pred <- predict.grf(gwRF_adaptive,
#                         test_data,
#                         x.var.name = "X",
#                         y.var.name = "Y",
#                         local.w = 1,
 #                        global.w = 0)