::p_load('sf', 'tidyverse', 'tmap', 'spdep',
pacman'onemapsgapi', 'units', 'matrixStats', 'readxl', 'jsonlite',
'olsrr', 'corrplot', 'ggpubr', 'GWmodel',
'devtools', 'kableExtra', 'plotly', 'ggthemes', 'SpatialML', 'Metrics','rsample')
Take home assignment 3
Import
Packages
Aspatial data
We import asptial data and filter to obtain the data only for the period between 1st Jan 2021 to 31st Dec 2022, and that too we will only be observing the flats with 5 rooms
<- read_csv("data/aspatial/resale-flat-prices-based-on-registration-date-from-jan-2017-onwards.csv") %>%
resale filter(month >= ('2021-01'), month <= ('2022-12'), flat_type == "5 ROOM")
<- read_csv("data/aspatial/hdb_onemap_coords.csv") omhdb
<- read_csv("data/aspatial/mall_coordinates_updated.csv") omMall
Lets turn the mall dataset into an sf dataframe
<- omMall%>%
mall_sf st_as_sf(coords=c("longitude", "latitude"), crs=4326) %>%
st_transform(3414)
This dataset will contain the top 20 primary schools in Singapore
<- read_csv("data/aspatial/primaryschoolsg.csv")%>%
prisch filter(Name %in% c("Nanyang Primary School","Catholic High School (Primary)","Tao Nan School","Nan Hua Primary School","St. Hilda's Primary School","Henry Park Primary School","Anglo-Chinese School (Primary)","Raffles Girls' Primary School", "Pei Hwa Presbyterian Primary School","CHIJ St. Nicholas Girls' School","Rosyth School","Kong Hwa School","Poi Ching School","Holy Innocents' Primary School","Ai Tong School","Red Swastika School","Maris Stella High School","Rulang Primary School","Pei Chun Public School","Singapore Chinese Girls' Primary School") )
This dataset will just be the collection of primary schools in singapore
<- read_csv("data/aspatial/primaryschoolsg.csv") prsc
Converting them both to geospatial datasets
<- st_as_sf(prsc,
prsc_sf coords = c("Longitude", "Latitude"),
crs=4326) %>%
st_transform(crs = 3414)
<- st_as_sf(prisch,
prisch_sf coords = c("Longitude", "Latitude"),
crs=4326) %>%
st_transform(crs = 3414)
Geospatial data
We shall take MRT exits since for larger MRT stations, these are more accurate than taking MRT centroids.
<- st_read(dsn="data/geospatial/TrainStationExit",
mrt_sf layer="Train_Station_Exit_Layer")
Reading layer `Train_Station_Exit_Layer' from data source
`C:\Study\Y3\S2\IS415\ELAbishek\IS415-GAA\Takehome_exercise\TakeHome3\data\geospatial\TrainStationExit'
using driver `ESRI Shapefile'
Simple feature collection with 562 features and 2 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 6134.086 ymin: 27499.7 xmax: 45356.36 ymax: 47865.92
Projected CRS: SVY21
<- st_read(dsn = "data/geospatial/mpsz", layer="MPSZ-2019") mpsz_sf
Reading layer `MPSZ-2019' from data source
`C:\Study\Y3\S2\IS415\ELAbishek\IS415-GAA\Takehome_exercise\TakeHome3\data\geospatial\mpsz'
using driver `ESRI Shapefile'
Simple feature collection with 332 features and 6 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: 103.6057 ymin: 1.158699 xmax: 104.0885 ymax: 1.470775
Geodetic CRS: WGS 84
<- st_read(dsn = "data/geospatial/BusStop_Feb2023", layer="BusStop") bustop_sf
Reading layer `BusStop' from data source
`C:\Study\Y3\S2\IS415\ELAbishek\IS415-GAA\Takehome_exercise\TakeHome3\data\geospatial\BusStop_Feb2023'
using driver `ESRI Shapefile'
Simple feature collection with 5159 features and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 3970.122 ymin: 26482.1 xmax: 48284.56 ymax: 52983.82
Projected CRS: SVY21
<- st_read(dsn = "data/geospatial/Eldercare", layer="ELDERCARE") elder_sf
Reading layer `ELDERCARE' from data source
`C:\Study\Y3\S2\IS415\ELAbishek\IS415-GAA\Takehome_exercise\TakeHome3\data\geospatial\Eldercare'
using driver `ESRI Shapefile'
Simple feature collection with 133 features and 18 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 14481.92 ymin: 28218.43 xmax: 41665.14 ymax: 46804.9
Projected CRS: SVY21
<- st_read("data/geospatial/supermarkets.kml") super_sf
Reading layer `SUPERMARKETS' from data source
`C:\Study\Y3\S2\IS415\ELAbishek\IS415-GAA\Takehome_exercise\TakeHome3\data\geospatial\supermarkets.kml'
using driver `KML'
Simple feature collection with 526 features and 2 fields
Geometry type: POINT
Dimension: XYZ
Bounding box: xmin: 103.6258 ymin: 1.24715 xmax: 104.0036 ymax: 1.461526
z_range: zmin: 0 zmax: 0
Geodetic CRS: WGS 84
<- st_read("data/geospatial/preschools-location.kml") pre_sf
Reading layer `PRESCHOOLS_LOCATION' from data source
`C:\Study\Y3\S2\IS415\ELAbishek\IS415-GAA\Takehome_exercise\TakeHome3\data\geospatial\preschools-location.kml'
using driver `KML'
Simple feature collection with 1925 features and 2 fields
Geometry type: POINT
Dimension: XYZ
Bounding box: xmin: 103.6824 ymin: 1.247759 xmax: 103.9897 ymax: 1.462134
z_range: zmin: 0 zmax: 0
Geodetic CRS: WGS 84
<- st_read("data/geospatial/kindergartens.kml") kinder_sf
Reading layer `KINDERGARTENS' from data source
`C:\Study\Y3\S2\IS415\ELAbishek\IS415-GAA\Takehome_exercise\TakeHome3\data\geospatial\kindergartens.kml'
using driver `KML'
Simple feature collection with 448 features and 2 fields
Geometry type: POINT
Dimension: XYZ
Bounding box: xmin: 103.6887 ymin: 1.247759 xmax: 103.9752 ymax: 1.455452
z_range: zmin: 0 zmax: 0
Geodetic CRS: WGS 84
<- st_read("data/geospatial/HawkerCentres/hawker-centres-kml.kml") hawker_sf
Reading layer `HAWKERCENTRE' from data source
`C:\Study\Y3\S2\IS415\ELAbishek\IS415-GAA\Takehome_exercise\TakeHome3\data\geospatial\HawkerCentres\hawker-centres-kml.kml'
using driver `KML'
Simple feature collection with 125 features and 2 fields
Geometry type: POINT
Dimension: XYZ
Bounding box: xmin: 103.6974 ymin: 1.272716 xmax: 103.9882 ymax: 1.449217
z_range: zmin: 0 zmax: 0
Geodetic CRS: WGS 84
<- st_read("data/geospatial/Parks/parks.kml") parks_sf
Reading layer `NATIONALPARKS_New' from data source
`C:\Study\Y3\S2\IS415\ELAbishek\IS415-GAA\Takehome_exercise\TakeHome3\data\geospatial\Parks\parks.kml'
using driver `KML'
Simple feature collection with 421 features and 2 fields
Geometry type: POINT
Dimension: XYZ
Bounding box: xmin: 103.6929 ymin: 1.214491 xmax: 104.0538 ymax: 1.462094
z_range: zmin: 0 zmax: 0
Geodetic CRS: WGS 84
<- st_read("data/geospatial/childcare.geojson") child_sf
Reading layer `childcare' from data source
`C:\Study\Y3\S2\IS415\ELAbishek\IS415-GAA\Takehome_exercise\TakeHome3\data\geospatial\childcare.geojson'
using driver `GeoJSON'
Simple feature collection with 1545 features and 2 fields
Geometry type: POINT
Dimension: XYZ
Bounding box: xmin: 103.6824 ymin: 1.248403 xmax: 103.9897 ymax: 1.462134
z_range: zmin: 0 zmax: 0
Geodetic CRS: WGS 84
Choosing columns
<- child_sf %>%
child_sf select(c(1))
<- elder_sf %>%
elder_sf select(c(1))
<- bustop_sf %>%
bustop_sf select(c(1))
<- hawker_sf %>%
hawker_sf select(c(1))
<- kinder_sf %>%
kinder_sf select(c(1))
<- mrt_sf %>%
mrt_sf select(c(1))
<- parks_sf %>%
parks_sf select(c(1))
<- pre_sf %>%
pre_sf select(c(1))
<- super_sf %>%
super_sf select(c(1))
<- prisch_sf %>%
prisch_sf select(c(1))
#omMall <- omMall %>%
# select(c(2))
Checking for invalid geometries
length(which(st_is_valid(mpsz_sf) == FALSE))
[1] 6
length(which(st_is_valid(child_sf) == FALSE))
[1] 0
length(which(st_is_valid(elder_sf) == FALSE))
[1] 0
length(which(st_is_valid(bustop_sf) == FALSE))
[1] 0
length(which(st_is_valid(hawker_sf) == FALSE))
[1] 0
length(which(st_is_valid(kinder_sf) == FALSE))
[1] 0
length(which(st_is_valid(mrt_sf) == FALSE))
[1] 0
length(which(st_is_valid(parks_sf) == FALSE))
[1] 0
length(which(st_is_valid(pre_sf) == FALSE))
[1] 0
length(which(st_is_valid(super_sf) == FALSE))
[1] 0
length(which(st_is_valid(prisch_sf) == FALSE))
[1] 0
length(which(st_is_valid(mall_sf) == FALSE))
[1] 0
mpsz has 6 rows with invalid geometries so we will be making them valid
<- st_make_valid(mpsz_sf)
mpsz_sf length(which(st_is_valid(mpsz_sf) == FALSE))
[1] 0
Checking for missing values
rowSums(is.na(mpsz_sf))!=0,] mpsz_sf[
Simple feature collection with 0 features and 6 fields
Bounding box: xmin: NA ymin: NA xmax: NA ymax: NA
Geodetic CRS: WGS 84
[1] SUBZONE_N SUBZONE_C PLN_AREA_N PLN_AREA_C REGION_N REGION_C geometry
<0 rows> (or 0-length row.names)
rowSums(is.na(child_sf))!=0,] child_sf[
Simple feature collection with 0 features and 1 field
Bounding box: xmin: NA ymin: NA xmax: NA ymax: NA
Geodetic CRS: WGS 84
[1] Name geometry
<0 rows> (or 0-length row.names)
rowSums(is.na(elder_sf))!=0,] elder_sf[
Simple feature collection with 0 features and 1 field
Bounding box: xmin: NA ymin: NA xmax: NA ymax: NA
Projected CRS: SVY21
[1] OBJECTID geometry
<0 rows> (or 0-length row.names)
rowSums(is.na(bustop_sf))!=0,] bustop_sf[
Simple feature collection with 0 features and 1 field
Bounding box: xmin: NA ymin: NA xmax: NA ymax: NA
Projected CRS: SVY21
[1] BUS_STOP_N geometry
<0 rows> (or 0-length row.names)
rowSums(is.na(hawker_sf))!=0,] hawker_sf[
Simple feature collection with 0 features and 1 field
Bounding box: xmin: NA ymin: NA xmax: NA ymax: NA
Geodetic CRS: WGS 84
[1] Name geometry
<0 rows> (or 0-length row.names)
rowSums(is.na(kinder_sf))!=0,] kinder_sf[
Simple feature collection with 0 features and 1 field
Bounding box: xmin: NA ymin: NA xmax: NA ymax: NA
Geodetic CRS: WGS 84
[1] Name geometry
<0 rows> (or 0-length row.names)
rowSums(is.na(mrt_sf))!=0,] mrt_sf[
Simple feature collection with 0 features and 1 field
Bounding box: xmin: NA ymin: NA xmax: NA ymax: NA
Projected CRS: SVY21
[1] stn_name geometry
<0 rows> (or 0-length row.names)
rowSums(is.na(parks_sf))!=0,] parks_sf[
Simple feature collection with 0 features and 1 field
Bounding box: xmin: NA ymin: NA xmax: NA ymax: NA
Geodetic CRS: WGS 84
[1] Name geometry
<0 rows> (or 0-length row.names)
rowSums(is.na(pre_sf))!=0,] pre_sf[
Simple feature collection with 0 features and 1 field
Bounding box: xmin: NA ymin: NA xmax: NA ymax: NA
Geodetic CRS: WGS 84
[1] Name geometry
<0 rows> (or 0-length row.names)
rowSums(is.na(super_sf))!=0,] super_sf[
Simple feature collection with 0 features and 1 field
Bounding box: xmin: NA ymin: NA xmax: NA ymax: NA
Geodetic CRS: WGS 84
[1] Name geometry
<0 rows> (or 0-length row.names)
rowSums(is.na(prisch_sf))!=0,] prisch_sf[
Simple feature collection with 0 features and 1 field
Bounding box: xmin: NA ymin: NA xmax: NA ymax: NA
Projected CRS: SVY21 / Singapore TM
# A tibble: 0 × 2
# … with 2 variables: Name <chr>, geometry <GEOMETRY [m]>
No rows have missing values, we can proceed with the analysis
Checking CRS
st_crs(mpsz_sf)
Coordinate Reference System:
User input: WGS 84
wkt:
GEOGCRS["WGS 84",
DATUM["World Geodetic System 1984",
ELLIPSOID["WGS 84",6378137,298.257223563,
LENGTHUNIT["metre",1]]],
PRIMEM["Greenwich",0,
ANGLEUNIT["degree",0.0174532925199433]],
CS[ellipsoidal,2],
AXIS["latitude",north,
ORDER[1],
ANGLEUNIT["degree",0.0174532925199433]],
AXIS["longitude",east,
ORDER[2],
ANGLEUNIT["degree",0.0174532925199433]],
ID["EPSG",4326]]
st_crs(bustop_sf)
Coordinate Reference System:
User input: SVY21
wkt:
PROJCRS["SVY21",
BASEGEOGCRS["WGS 84",
DATUM["World Geodetic System 1984",
ELLIPSOID["WGS 84",6378137,298.257223563,
LENGTHUNIT["metre",1]],
ID["EPSG",6326]],
PRIMEM["Greenwich",0,
ANGLEUNIT["Degree",0.0174532925199433]]],
CONVERSION["unnamed",
METHOD["Transverse Mercator",
ID["EPSG",9807]],
PARAMETER["Latitude of natural origin",1.36666666666667,
ANGLEUNIT["Degree",0.0174532925199433],
ID["EPSG",8801]],
PARAMETER["Longitude of natural origin",103.833333333333,
ANGLEUNIT["Degree",0.0174532925199433],
ID["EPSG",8802]],
PARAMETER["Scale factor at natural origin",1,
SCALEUNIT["unity",1],
ID["EPSG",8805]],
PARAMETER["False easting",28001.642,
LENGTHUNIT["metre",1],
ID["EPSG",8806]],
PARAMETER["False northing",38744.572,
LENGTHUNIT["metre",1],
ID["EPSG",8807]]],
CS[Cartesian,2],
AXIS["(E)",east,
ORDER[1],
LENGTHUNIT["metre",1,
ID["EPSG",9001]]],
AXIS["(N)",north,
ORDER[2],
LENGTHUNIT["metre",1,
ID["EPSG",9001]]]]
st_crs(child_sf)
Coordinate Reference System:
User input: WGS 84
wkt:
GEOGCRS["WGS 84",
DATUM["World Geodetic System 1984",
ELLIPSOID["WGS 84",6378137,298.257223563,
LENGTHUNIT["metre",1]]],
PRIMEM["Greenwich",0,
ANGLEUNIT["degree",0.0174532925199433]],
CS[ellipsoidal,2],
AXIS["geodetic latitude (Lat)",north,
ORDER[1],
ANGLEUNIT["degree",0.0174532925199433]],
AXIS["geodetic longitude (Lon)",east,
ORDER[2],
ANGLEUNIT["degree",0.0174532925199433]],
ID["EPSG",4326]]
st_crs(elder_sf)
Coordinate Reference System:
User input: SVY21
wkt:
PROJCRS["SVY21",
BASEGEOGCRS["SVY21[WGS84]",
DATUM["World Geodetic System 1984",
ELLIPSOID["WGS 84",6378137,298.257223563,
LENGTHUNIT["metre",1]],
ID["EPSG",6326]],
PRIMEM["Greenwich",0,
ANGLEUNIT["Degree",0.0174532925199433]]],
CONVERSION["unnamed",
METHOD["Transverse Mercator",
ID["EPSG",9807]],
PARAMETER["Latitude of natural origin",1.36666666666667,
ANGLEUNIT["Degree",0.0174532925199433],
ID["EPSG",8801]],
PARAMETER["Longitude of natural origin",103.833333333333,
ANGLEUNIT["Degree",0.0174532925199433],
ID["EPSG",8802]],
PARAMETER["Scale factor at natural origin",1,
SCALEUNIT["unity",1],
ID["EPSG",8805]],
PARAMETER["False easting",28001.642,
LENGTHUNIT["metre",1],
ID["EPSG",8806]],
PARAMETER["False northing",38744.572,
LENGTHUNIT["metre",1],
ID["EPSG",8807]]],
CS[Cartesian,2],
AXIS["(E)",east,
ORDER[1],
LENGTHUNIT["metre",1,
ID["EPSG",9001]]],
AXIS["(N)",north,
ORDER[2],
LENGTHUNIT["metre",1,
ID["EPSG",9001]]]]
st_crs(hawker_sf)
Coordinate Reference System:
User input: WGS 84
wkt:
GEOGCRS["WGS 84",
DATUM["World Geodetic System 1984",
ELLIPSOID["WGS 84",6378137,298.257223563,
LENGTHUNIT["metre",1]]],
PRIMEM["Greenwich",0,
ANGLEUNIT["degree",0.0174532925199433]],
CS[ellipsoidal,2],
AXIS["geodetic latitude (Lat)",north,
ORDER[1],
ANGLEUNIT["degree",0.0174532925199433]],
AXIS["geodetic longitude (Lon)",east,
ORDER[2],
ANGLEUNIT["degree",0.0174532925199433]],
ID["EPSG",4326]]
st_crs(kinder_sf)
Coordinate Reference System:
User input: WGS 84
wkt:
GEOGCRS["WGS 84",
DATUM["World Geodetic System 1984",
ELLIPSOID["WGS 84",6378137,298.257223563,
LENGTHUNIT["metre",1]]],
PRIMEM["Greenwich",0,
ANGLEUNIT["degree",0.0174532925199433]],
CS[ellipsoidal,2],
AXIS["geodetic latitude (Lat)",north,
ORDER[1],
ANGLEUNIT["degree",0.0174532925199433]],
AXIS["geodetic longitude (Lon)",east,
ORDER[2],
ANGLEUNIT["degree",0.0174532925199433]],
ID["EPSG",4326]]
st_crs(mall_sf)
Coordinate Reference System:
User input: EPSG:3414
wkt:
PROJCRS["SVY21 / Singapore TM",
BASEGEOGCRS["SVY21",
DATUM["SVY21",
ELLIPSOID["WGS 84",6378137,298.257223563,
LENGTHUNIT["metre",1]]],
PRIMEM["Greenwich",0,
ANGLEUNIT["degree",0.0174532925199433]],
ID["EPSG",4757]],
CONVERSION["Singapore Transverse Mercator",
METHOD["Transverse Mercator",
ID["EPSG",9807]],
PARAMETER["Latitude of natural origin",1.36666666666667,
ANGLEUNIT["degree",0.0174532925199433],
ID["EPSG",8801]],
PARAMETER["Longitude of natural origin",103.833333333333,
ANGLEUNIT["degree",0.0174532925199433],
ID["EPSG",8802]],
PARAMETER["Scale factor at natural origin",1,
SCALEUNIT["unity",1],
ID["EPSG",8805]],
PARAMETER["False easting",28001.642,
LENGTHUNIT["metre",1],
ID["EPSG",8806]],
PARAMETER["False northing",38744.572,
LENGTHUNIT["metre",1],
ID["EPSG",8807]]],
CS[Cartesian,2],
AXIS["northing (N)",north,
ORDER[1],
LENGTHUNIT["metre",1]],
AXIS["easting (E)",east,
ORDER[2],
LENGTHUNIT["metre",1]],
USAGE[
SCOPE["Cadastre, engineering survey, topographic mapping."],
AREA["Singapore - onshore and offshore."],
BBOX[1.13,103.59,1.47,104.07]],
ID["EPSG",3414]]
st_crs(mrt_sf)
Coordinate Reference System:
User input: SVY21
wkt:
PROJCRS["SVY21",
BASEGEOGCRS["WGS 84",
DATUM["World Geodetic System 1984",
ELLIPSOID["WGS 84",6378137,298.257223563,
LENGTHUNIT["metre",1]],
ID["EPSG",6326]],
PRIMEM["Greenwich",0,
ANGLEUNIT["Degree",0.0174532925199433]]],
CONVERSION["unnamed",
METHOD["Transverse Mercator",
ID["EPSG",9807]],
PARAMETER["Latitude of natural origin",1.36666666666667,
ANGLEUNIT["Degree",0.0174532925199433],
ID["EPSG",8801]],
PARAMETER["Longitude of natural origin",103.833333333333,
ANGLEUNIT["Degree",0.0174532925199433],
ID["EPSG",8802]],
PARAMETER["Scale factor at natural origin",1,
SCALEUNIT["unity",1],
ID["EPSG",8805]],
PARAMETER["False easting",28001.642,
LENGTHUNIT["metre",1],
ID["EPSG",8806]],
PARAMETER["False northing",38744.572,
LENGTHUNIT["metre",1],
ID["EPSG",8807]]],
CS[Cartesian,2],
AXIS["(E)",east,
ORDER[1],
LENGTHUNIT["metre",1,
ID["EPSG",9001]]],
AXIS["(N)",north,
ORDER[2],
LENGTHUNIT["metre",1,
ID["EPSG",9001]]]]
st_crs(pre_sf)
Coordinate Reference System:
User input: WGS 84
wkt:
GEOGCRS["WGS 84",
DATUM["World Geodetic System 1984",
ELLIPSOID["WGS 84",6378137,298.257223563,
LENGTHUNIT["metre",1]]],
PRIMEM["Greenwich",0,
ANGLEUNIT["degree",0.0174532925199433]],
CS[ellipsoidal,2],
AXIS["geodetic latitude (Lat)",north,
ORDER[1],
ANGLEUNIT["degree",0.0174532925199433]],
AXIS["geodetic longitude (Lon)",east,
ORDER[2],
ANGLEUNIT["degree",0.0174532925199433]],
ID["EPSG",4326]]
st_crs(super_sf)
Coordinate Reference System:
User input: WGS 84
wkt:
GEOGCRS["WGS 84",
DATUM["World Geodetic System 1984",
ELLIPSOID["WGS 84",6378137,298.257223563,
LENGTHUNIT["metre",1]]],
PRIMEM["Greenwich",0,
ANGLEUNIT["degree",0.0174532925199433]],
CS[ellipsoidal,2],
AXIS["geodetic latitude (Lat)",north,
ORDER[1],
ANGLEUNIT["degree",0.0174532925199433]],
AXIS["geodetic longitude (Lon)",east,
ORDER[2],
ANGLEUNIT["degree",0.0174532925199433]],
ID["EPSG",4326]]
st_crs(prisch_sf)
Coordinate Reference System:
User input: EPSG:3414
wkt:
PROJCRS["SVY21 / Singapore TM",
BASEGEOGCRS["SVY21",
DATUM["SVY21",
ELLIPSOID["WGS 84",6378137,298.257223563,
LENGTHUNIT["metre",1]]],
PRIMEM["Greenwich",0,
ANGLEUNIT["degree",0.0174532925199433]],
ID["EPSG",4757]],
CONVERSION["Singapore Transverse Mercator",
METHOD["Transverse Mercator",
ID["EPSG",9807]],
PARAMETER["Latitude of natural origin",1.36666666666667,
ANGLEUNIT["degree",0.0174532925199433],
ID["EPSG",8801]],
PARAMETER["Longitude of natural origin",103.833333333333,
ANGLEUNIT["degree",0.0174532925199433],
ID["EPSG",8802]],
PARAMETER["Scale factor at natural origin",1,
SCALEUNIT["unity",1],
ID["EPSG",8805]],
PARAMETER["False easting",28001.642,
LENGTHUNIT["metre",1],
ID["EPSG",8806]],
PARAMETER["False northing",38744.572,
LENGTHUNIT["metre",1],
ID["EPSG",8807]]],
CS[Cartesian,2],
AXIS["northing (N)",north,
ORDER[1],
LENGTHUNIT["metre",1]],
AXIS["easting (E)",east,
ORDER[2],
LENGTHUNIT["metre",1]],
USAGE[
SCOPE["Cadastre, engineering survey, topographic mapping."],
AREA["Singapore - onshore and offshore."],
BBOX[1.13,103.59,1.47,104.07]],
ID["EPSG",3414]]
We will now change all the crs to 3414
<- st_set_crs(mpsz_sf, 3414)
mpsz_sf <- st_set_crs(mrt_sf, 3414)
mrt_sf <- st_set_crs(bustop_sf, 3414)
bustop_sf
<- st_transform(child_sf, crs=3414)
child_sf <- st_transform(elder_sf, crs=3414)
elder_sf <- st_transform(hawker_sf, crs=3414)
hawker_sf <- st_transform(kinder_sf, crs=3414)
kinder_sf <- st_transform(parks_sf, crs=3414)
parks_sf <- st_transform(super_sf, crs=3414)
super_sf <- st_transform(mall_sf, crs=3414) mall_sf
Visualization
plot(st_geometry(mpsz_sf))
tmap_mode("view")
tm_shape(mrt_sf) +
tm_dots(col="red", size=0.05)
tmap_mode("view")
tm_shape(hawker_sf) +
tm_dots(alpha=0.5, #affects transparency of points
col="#d62828",
size=0.05) +
tm_shape(parks_sf) +
tm_dots(alpha=0.5,
col="#f77f00",
size=0.05) +
tm_shape(super_sf) +
tm_dots(alpha=0.5,
col="#fcbf49",
size=0.05) +
tm_shape(mall_sf) +
tm_dots(alpha=0.5,
col="#eae2b7",
size=0.05)
Geospatial data wrangling
rowSums(is.na(resale))!=0,] resale[
# A tibble: 0 × 11
# … with 11 variables: month <chr>, town <chr>, flat_type <chr>, block <chr>,
# street_name <chr>, storey_range <chr>, floor_area_sqm <dbl>,
# flat_model <chr>, lease_commence_date <dbl>, remaining_lease <chr>,
# resale_price <dbl>
length(which(st_is_valid(mpsz_sf) == FALSE))
[1] 0
Geocoding resale dataframe
This function will geocode the resale dataset by using one map sg API.
library(httr)
<- function(block, streetname) {
geocode <- "https://developers.onemap.sg/commonapi/search"
base_url <- paste(block, streetname, sep = " ")
address <- list("searchVal" = address,
query "returnGeom" = "Y",
"getAddrDetails" = "N",
"pageNum" = "1")
<- GET(base_url, query = query)
res <-content(res, as="text")
restext
<- fromJSON(restext) %>%
output %>%
as.data.frame select(results.LATITUDE, results.LONGITUDE)
return(output)
}
This takes a really long time to run
# resale$LATITUDE <- 0
#resale$LONGITUDE <- 0
# for (i in 1:nrow(resale)){
# temp_output <- geocode(resale[i, 4], resale[i, 5])
# resale$LATITUDE[i] <- temp_output$results.LATITUDE
# resale$LONGITUDE[i] <- temp_output$results.LONGITUDE
# }
#write_rds(resale, "data/model/resale.rds")
<- read_rds("data/model/resale.rds") resale
unique(resale$storey_range)
[1] "01 TO 03" "13 TO 15" "16 TO 18" "07 TO 09" "22 TO 24" "04 TO 06"
[7] "19 TO 21" "10 TO 12" "25 TO 27" "28 TO 30" "34 TO 36" "31 TO 33"
[13] "37 TO 39" "40 TO 42" "43 TO 45" "46 TO 48" "49 TO 51"
<- str_split(resale$remaining_lease, " ")
str_list
for (i in 1:length(str_list)) {
if (length(unlist(str_list[i])) > 2) {
<- as.numeric(unlist(str_list[i])[1])
year <- as.numeric(unlist(str_list[i])[3])
month $remaining_lease[i] <- year + round(month/12, 2)
resale
}else {
<- as.numeric(unlist(str_list[i])[1])
year $remaining_lease[i] <- year
resale
} }
Determining CBD area
<- 1.287953
lat <- 103.851784
lng
<- data.frame(lat, lng) %>%
cbd_sf st_as_sf(coords = c("lng", "lat"), crs=4326) %>%
st_transform(crs=3414)
sum(is.na(resale$LATITUDE))
[1] 0
sum(is.na(resale$LONGITUDE))
[1] 0
<- st_as_sf(resale,
resale_sf coords = c("LONGITUDE",
"LATITUDE"),
# the geographical features are in longitude & latitude, in decimals
# as such, WGS84 is the most appropriate coordinates system
crs=4326) %>%
#afterwards, we transform it to SVY21, our desired CRS
st_transform(crs = 3414)
Converting remaining lease from character to numeric
#turning remaining lease from character to numeric
$remaining_lease <- as.numeric(as.character(resale_sf$remaining_lease)) resale_sf
Turning all unique storey ranges to a numeric value
<- sort(unique(resale_sf$storey_range)) storeys
<- 1:length(storeys)
storey_order <- data.frame(storeys, storey_order) storey_range_order
<- left_join(resale_sf, storey_range_order, by= c("storey_range" = "storeys")) resale_sf
Add in the age of unit into the dataset.
age of unit = current year (as of the data) - lease commence date
<- resale_sf %>%
resale_sf add_column(age_of_unit = NA) %>%
mutate(age_of_unit = (
as.numeric(substr(resale_sf$month, 1, 4)) -
as.numeric(resale_sf$lease_commence_date)))
Adding in proximity and frequency
<- function(df1, df2, varname) {
proximity <- st_distance(df1, df2) %>%
dist_matrix drop_units()
<- rowMins(dist_matrix)
df1[,varname] return(df1)
}
<-
resale_sf proximity(resale_sf, cbd_sf, "PROX_CBD") %>%
proximity(., child_sf, "PROX_CHILDCARE") %>%
proximity(., elder_sf, "PROX_ELDERCARE") %>%
proximity(., hawker_sf, "PROX_HAWKER") %>%
proximity(., mrt_sf, "PROX_MRT") %>%
proximity(., parks_sf, "PROX_PARK") %>%
proximity(., mall_sf, "PROX_MALL") %>%
proximity(., super_sf, "PROX_SPRMKT") %>%
proximity(., prisch_sf, "PROX_TOPPRISCH")
<- function(df1, df2, varname, radius) {
num_radius <- st_distance(df1, df2) %>%
dist_matrix drop_units() %>%
as.data.frame()
<- rowSums(dist_matrix <= radius)
df1[,varname] return(df1)
}
<-
resale_sf num_radius(resale_sf, kinder_sf, "NUM_KNDRGTN", 350) %>%
num_radius(., child_sf, "NUM_CHILDCARE", 350) %>%
num_radius(., bustop_sf, "NUM_BUS_STOP", 350) %>%
num_radius(., prsc_sf, "NUM_PRI_SCH", 1000)
write_rds(resale_sf, "data/model/resale_sf.rds")
Exploratory Data Analysis
read_rds("data/model/resale_sf.rds")
Simple feature collection with 14519 features and 26 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 6958.193 ymin: 28157.26 xmax: 42645.18 ymax: 48741.06
Projected CRS: SVY21 / Singapore TM
# A tibble: 14,519 × 27
month town flat_…¹ block stree…² store…³ floor…⁴ flat_…⁵ lease…⁶ remai…⁷
<chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <dbl> <dbl>
1 2021-01 ANG MO… 5 ROOM 551 ANG MO… 01 TO … 118 Improv… 1981 59.1
2 2021-01 ANG MO… 5 ROOM 305 ANG MO… 13 TO … 123 Standa… 1977 55.6
3 2021-01 ANG MO… 5 ROOM 520 ANG MO… 16 TO … 118 Improv… 1980 58.7
4 2021-01 ANG MO… 5 ROOM 253 ANG MO… 07 TO … 128 Improv… 1996 74.2
5 2021-01 ANG MO… 5 ROOM 423 ANG MO… 01 TO … 133 Improv… 1993 71.2
6 2021-01 ANG MO… 5 ROOM 617 ANG MO… 13 TO … 133 Model A 1996 74.5
7 2021-01 ANG MO… 5 ROOM 315A ANG MO… 22 TO … 110 Improv… 2006 84.3
8 2021-01 ANG MO… 5 ROOM 596C ANG MO… 13 TO … 110 Improv… 2002 80.9
9 2021-01 ANG MO… 5 ROOM 309A ANG MO… 16 TO … 110 Improv… 2006 84.3
10 2021-01 ANG MO… 5 ROOM 588C ANG MO… 16 TO … 112 DBSS 2011 89.6
# … with 14,509 more rows, 17 more variables: resale_price <dbl>,
# geometry <POINT [m]>, storey_order <int>, age_of_unit <dbl>,
# PROX_CBD <dbl>, PROX_CHILDCARE <dbl>, PROX_ELDERCARE <dbl>,
# PROX_HAWKER <dbl>, PROX_MRT <dbl>, PROX_PARK <dbl>, PROX_MALL <dbl>,
# PROX_SPRMKT <dbl>, PROX_TOPPRISCH <dbl>, NUM_KNDRGTN <dbl>,
# NUM_CHILDCARE <dbl>, NUM_BUS_STOP <dbl>, NUM_PRI_SCH <dbl>, and abbreviated
# variable names ¹flat_type, ²street_name, ³storey_range, ⁴floor_area_sqm, …
glimpse(resale_sf)
Rows: 14,519
Columns: 27
$ month <chr> "2021-01", "2021-01", "2021-01", "2021-01", "2021-…
$ town <chr> "ANG MO KIO", "ANG MO KIO", "ANG MO KIO", "ANG MO …
$ flat_type <chr> "5 ROOM", "5 ROOM", "5 ROOM", "5 ROOM", "5 ROOM", …
$ block <chr> "551", "305", "520", "253", "423", "617", "315A", …
$ street_name <chr> "ANG MO KIO AVE 10", "ANG MO KIO AVE 1", "ANG MO K…
$ storey_range <chr> "01 TO 03", "13 TO 15", "16 TO 18", "07 TO 09", "0…
$ floor_area_sqm <dbl> 118, 123, 118, 128, 133, 133, 110, 110, 110, 112, …
$ flat_model <chr> "Improved", "Standard", "Improved", "Improved", "I…
$ lease_commence_date <dbl> 1981, 1977, 1980, 1996, 1993, 1996, 2006, 2002, 20…
$ remaining_lease <dbl> 59.08, 55.58, 58.67, 74.25, 71.25, 74.50, 84.33, 8…
$ resale_price <dbl> 483000, 590000, 629000, 670000, 680000, 760000, 76…
$ geometry <POINT [m]> POINT (30820.82 39547.58), POINT (29412.84 3…
$ storey_order <int> 1, 5, 6, 3, 1, 5, 8, 5, 6, 6, 5, 5, 8, 2, 2, 3, 1,…
$ age_of_unit <dbl> 40, 44, 41, 25, 28, 25, 15, 19, 15, 10, 10, 10, 10…
$ PROX_CBD <dbl> 9537.543, 8663.223, 9449.166, 9211.988, 8935.289, …
$ PROX_CHILDCARE <dbl> 2.395193e+02, 1.113894e+02, 1.275906e+02, 1.028995…
$ PROX_ELDERCARE <dbl> 1064.6617, 190.8834, 789.1907, 147.6040, 441.8627,…
$ PROX_HAWKER <dbl> 482.8156, 331.7637, 379.2242, 588.4497, 512.9672, …
$ PROX_MRT <dbl> 1080.8607, 524.3923, 415.9309, 242.7019, 218.2532,…
$ PROX_PARK <dbl> 735.9373, 580.8933, 308.7999, 283.8337, 257.6041, …
$ PROX_MALL <dbl> 1213.2871, 441.6229, 549.4572, 1536.9839, 371.1503…
$ PROX_SPRMKT <dbl> 419.91387, 245.54343, 318.05791, 313.57577, 379.11…
$ PROX_TOPPRISCH <dbl> 274.8829, 1163.7853, 743.7382, 635.2942, 1009.6461…
$ NUM_KNDRGTN <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,…
$ NUM_CHILDCARE <dbl> 2, 5, 3, 3, 2, 3, 5, 2, 5, 4, 5, 4, 4, 3, 2, 4, 3,…
$ NUM_BUS_STOP <dbl> 2, 8, 6, 11, 6, 8, 4, 9, 5, 9, 7, 9, 9, 11, 5, 6, …
$ NUM_PRI_SCH <dbl> 3, 3, 5, 2, 4, 4, 3, 5, 2, 4, 4, 4, 4, 4, 3, 3, 3,…
summary(resale_sf$resale_price)
Min. 1st Qu. Median Mean 3rd Qu. Max.
350000 528000 595000 627297 690000 1418000
graph
ggplot(data=resale_sf, aes(x=resale_price)) +
geom_histogram(bins=20, color="black", fill="light blue") +
labs(title = "Distribution of Resale Prices",
x = "Resale Prices",
y = 'Frequency')
Box plot
ggplot(data = resale_sf, aes(x = '', y = resale_price)) +
geom_boxplot() +
labs(x='', y='Resale Prices')
point map
tmap_mode("view")
tmap_options(check.and.fix = TRUE)
tm_shape(resale_sf) +
tm_dots(col = "resale_price",
alpha = 0.6,
style="quantile") +
# sets minimum zoom level to 11, sets maximum zoom level to 14
tm_view(set.zoom.limits = c(11,14))
From the plotting of the reslae prices on the map of Singapore, we see that the hdb units in and around central areas of singapore are darker red indicating that they are of higher price. The units in the Western and Northern areas of Singapore are cheaper than those in the Eastern part.
tmap_mode("plot")
Top 10 records in terms of price
<- aggregate(resale_sf[,"resale_price"], list(resale_sf$town), mean)
town_mean = top_n(town_mean, 10, `resale_price`) %>%
top10_town arrange(desc(`resale_price`))
top10_town
Simple feature collection with 10 features and 2 fields
Geometry type: MULTIPOINT
Dimension: XY
Bounding box: xmin: 6958.193 ymin: 28157.26 xmax: 37492.71 ymax: 38377.45
Projected CRS: SVY21 / Singapore TM
Group.1 resale_price geometry
1 CENTRAL AREA 1054103.5 MULTIPOINT ((28851.09 28922...
2 QUEENSTOWN 915809.2 MULTIPOINT ((22213.08 31972...
3 BUKIT TIMAH 875349.2 MULTIPOINT ((21224.71 35657...
4 BISHAN 840447.2 MULTIPOINT ((27619.38 37901...
5 BUKIT MERAH 833586.2 MULTIPOINT ((25055.59 29894...
6 TOA PAYOH 832569.3 MULTIPOINT ((28957.02 34764...
7 MARINE PARADE 825244.3 MULTIPOINT ((6958.193 32934...
8 CLEMENTI 800793.5 MULTIPOINT ((19953.58 32606...
9 KALLANG/WHAMPOA 799784.5 MULTIPOINT ((22917.56 35326...
10 GEYLANG 748587.1 MULTIPOINT ((32760.87 33216...
<- ggplot(data = resale_sf, aes(x = `floor_area_sqm`)) +
AREA_SQM geom_histogram(bins=20, color="black", fill = 'lightblue')
<- ggplot(data = resale_sf, aes(x = `age_of_unit`)) +
AGE_UNIT geom_histogram(bins=20, color="black", fill = 'lightblue')
<- ggplot(data = resale_sf, aes(x = `remaining_lease`)) +
LEASE_YRS geom_histogram(bins=20, color="black", fill = 'lightblue')
<- ggplot(data = resale_sf, aes(x = `PROX_CBD`)) +
PROX_CBD geom_histogram(bins=20, color="black", fill = 'lightblue')
<- ggplot(data = resale_sf, aes(x = `PROX_CHILDCARE`)) +
PROX_CHILDCARE geom_histogram(bins=20, color="black", fill = 'lightblue')
<- ggplot(data = resale_sf, aes(x = `PROX_ELDERCARE`)) +
PROX_ELDERCARE geom_histogram(bins=20, color="black", fill = 'lightblue')
<- ggplot(data = resale_sf, aes(x = `PROX_HAWKER`)) +
PROX_HAWKER geom_histogram(bins=20, color="black", fill = 'lightblue')
<- ggplot(data = resale_sf, aes(x = `PROX_MRT`)) +
PROX_MRT geom_histogram(bins=20, color="black", fill = 'lightblue')
<- ggplot(data = resale_sf, aes(x = `PROX_PARK`)) +
PROX_PARK geom_histogram(bins=20, color="black", fill = 'lightblue')
<- ggplot(data = resale_sf, aes(x = `PROX_TOPPRISCH`)) +
PROX_TOPPRISCH geom_histogram(bins=20, color="black", fill = 'lightblue')
<- ggplot(data = resale_sf, aes(x = `PROX_MALL`)) +
PROX_MALL geom_histogram(bins=20, color="black", fill = 'lightblue')
<- ggplot(data = resale_sf, aes(x = `PROX_SPRMKT`)) +
PROX_SPRMKT geom_histogram(bins=20, color="black", fill = 'lightblue')
ggarrange(AREA_SQM, AGE_UNIT, LEASE_YRS, PROX_CBD, PROX_CHILDCARE, PROX_ELDERCARE, PROX_HAWKER, PROX_MRT, PROX_PARK, PROX_TOPPRISCH, PROX_MALL, PROX_SPRMKT, ncol = 3, nrow = 4)
<- ggplot(data = resale_sf, aes(x = `NUM_CHILDCARE`)) +
NUM_CHILDCARE geom_histogram(bins=20, color="black", fill = 'lightblue')
<- ggplot(data = resale_sf, aes(x = `NUM_KNDRGTN`)) +
NUM_KNDRGTN geom_histogram(bins=20, color="black", fill = 'lightblue')
<- ggplot(data = resale_sf, aes(x = `NUM_BUS_STOP`)) +
NUM_BUS_STOP geom_histogram(bins=20, color="black", fill = 'lightblue')
<- ggplot(data = resale_sf, aes(x = `NUM_PRI_SCH`)) +
NUM_PRI_SCH geom_histogram(bins=20, color="black", fill = 'lightblue')
ggarrange(NUM_KNDRGTN, NUM_CHILDCARE, NUM_BUS_STOP, NUM_PRI_SCH, ncol = 2, nrow = 3)
Linear regression
Simple linear regression
<- lm(formula=resale_price ~ floor_area_sqm, data = resale_sf) resale_slr
summary(resale_slr)
Call:
lm(formula = resale_price ~ floor_area_sqm, data = resale_sf)
Residuals:
Min 1Q Median 3Q Max
-277295 -99351 -32417 62672 790677
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.266e+05 1.953e+04 32.092 <2e-16 ***
floor_area_sqm 5.565e+00 1.660e+02 0.034 0.973
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 146100 on 14517 degrees of freedom
Multiple R-squared: 7.741e-08, Adjusted R-squared: -6.881e-05
F-statistic: 0.001124 on 1 and 14517 DF, p-value: 0.9733
ggplot(data=resale_sf,
aes(x=`floor_area_sqm`, y=`resale_price`)) +
geom_point() +
geom_smooth(method = lm)
Multiple linear regression
set.seed(1234)
<- initial_split(resale_sf,
resale_split prop = 6.5/10,)
<- training(resale_split)
train_data <- testing(resale_split) test_data
#write_rds(train_data, "data/model/train_data.rds")
#write_rds(test_data, "data/model/test_data.rds")
Compute correlation matrix
<- resale_sf %>%
mdata_nogeo st_drop_geometry()
::corrplot(cor(mdata_nogeo[c(7:7, 10:10, 12:26)]),
corrplotdiag = FALSE,
order = "AOE",
tl.pos = "td",
tl.cex = 0.5,
method = "number",
type = "upper")
Retrieving stored data
<- read_rds("data/model/train_data.rds")
train_data <- read_rds("data/model/test_data.rds") test_data
Building non spatial multiple linear regression
# price_mlr <- lm(resale_price ~ floor_area_sqm + remaining_lease + age_of_unit + storey_order +
# PROX_CBD + PROX_ELDERCARE + PROX_HAWKER +
# PROX_MRT + PROX_PARK + PROX_MALL +
# PROX_SPRMKT + PROX_TOPPRISCH + NUM_KNDRGTN +
# NUM_CHILDCARE + NUM_BUS_STOP +
# NUM_PRI_SCH,
# data=train_data,
# eval = FALSE)
# summary(price_mlr)
#write_rds(price_mlr, "data/model/price_mlr.rds" )
<-read_rds("data/model/price_mlr.rds") price_mlr
GWR predictive model
<- as_Spatial(train_data)
train_data_sp train_data_sp
class : SpatialPointsDataFrame
features : 9437
extent : 6958.193, 42645.18, 28157.26, 48741.06 (xmin, xmax, ymin, ymax)
crs : +proj=tmerc +lat_0=1.36666666666667 +lon_0=103.833333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +ellps=WGS84 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs
variables : 26
names : month, town, flat_type, block, street_name, storey_range, floor_area_sqm, flat_model, lease_commence_date, remaining_lease, resale_price, storey_order, age_of_unit, PROX_CBD, PROX_CHILDCARE, ...
min values : 2021-01, ANG MO KIO, 5 ROOM, 1, ADMIRALTY DR, 01 TO 03, 99, 3Gen, 1972, 49.08, 350000, 1, 2, 1610.9563636452, 0.000104604717071282, ...
max values : 2022-12, YISHUN, 5 ROOM, 9B, ZION RD, 49 TO 51, 167, Type S2, 2019, 96.75, 1418000, 17, 50, 23277.3731998479, 4567.62297445672, ...
# bw_adaptive <- bw.gwr(resale_price ~ floor_area_sqm + remaining_lease + age_of_unit+ storey_order +
# PROX_CBD + PROX_ELDERCARE + PROX_HAWKER +
# PROX_MRT + PROX_PARK + PROX_MALL +
# PROX_SPRMKT + PROX_TOPPRISCH + NUM_KNDRGTN +
# NUM_CHILDCARE + NUM_BUS_STOP +
# NUM_PRI_SCH,
# data=train_data_sp,
# approach="CV",
# kernel="gaussian",
# adaptive=TRUE,
# longlat=FALSE,
# eval = FALSE)
#write_rds(bw_adaptive, "data/model/bw_adaptive.rds")
Constructing adaptive bandwidth gwr model
<- read_rds("data/model/bw_adaptive.rds") bw_adaptive
# gwr_adaptive <- gwr.basic(formula = resale_price ~ floor_area_sqm + remaining_lease+ storey_order +
# PROX_CBD + PROX_ELDERCARE + PROX_HAWKER +
# PROX_MRT + PROX_PARK + PROX_MALL +
# PROX_SPRMKT + PROX_TOPPRISCH + NUM_KNDRGTN +
# NUM_CHILDCARE + NUM_BUS_STOP +
# NUM_PRI_SCH,
# data=train_data_sp,
# bw=bw_adaptive,
# kernel = 'gaussian',
# adaptive=TRUE,
# longlat = FALSE,
# eval = FALSE)
#write_rds(gwr_adaptive, "data/model/gwr_adaptive.rds")
<- read_rds("data/model/gwr_adaptive.rds")
gwr_adaptive gwr_adaptive
***********************************************************************
* Package GWmodel *
***********************************************************************
Program starts at: 2023-03-18 00:03:21
Call:
gwr.basic(formula = resale_price ~ floor_area_sqm + remaining_lease +
storey_order + PROX_CBD + PROX_ELDERCARE + PROX_HAWKER +
PROX_MRT + PROX_PARK + PROX_MALL + PROX_SPRMKT + PROX_TOPPRISCH +
NUM_KNDRGTN + NUM_CHILDCARE + NUM_BUS_STOP + NUM_PRI_SCH,
data = train_data_sp, bw = bw_adaptive, kernel = "gaussian",
adaptive = TRUE, longlat = FALSE)
Dependent (y) variable: resale_price
Independent variables: floor_area_sqm remaining_lease storey_order PROX_CBD PROX_ELDERCARE PROX_HAWKER PROX_MRT PROX_PARK PROX_MALL PROX_SPRMKT PROX_TOPPRISCH NUM_KNDRGTN NUM_CHILDCARE NUM_BUS_STOP NUM_PRI_SCH
Number of data points: 9437
***********************************************************************
* Results of Global Regression *
***********************************************************************
Call:
lm(formula = formula, data = data)
Residuals:
Min 1Q Median 3Q Max
-328797 -53744 -2343 48503 692491
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.986e+05 2.446e+04 -8.121 5.21e-16 ***
floor_area_sqm 5.792e+03 1.559e+02 37.155 < 2e-16 ***
remaining_lease 5.582e+03 1.012e+02 55.155 < 2e-16 ***
storey_order 2.019e+04 4.389e+02 46.004 < 2e-16 ***
PROX_CBD -2.001e+01 3.178e-01 -62.966 < 2e-16 ***
PROX_ELDERCARE 2.916e+00 1.483e+00 1.967 0.049264 *
PROX_HAWKER -3.258e+01 1.713e+00 -19.016 < 2e-16 ***
PROX_MRT -2.266e+01 2.572e+00 -8.807 < 2e-16 ***
PROX_PARK 8.416e+00 2.220e+00 3.791 0.000151 ***
PROX_MALL -2.414e+01 2.604e+00 -9.268 < 2e-16 ***
PROX_SPRMKT -5.933e+00 5.775e+00 -1.027 0.304256
PROX_TOPPRISCH -1.417e+00 4.976e-01 -2.848 0.004416 **
NUM_KNDRGTN 4.285e+03 9.738e+02 4.400 1.10e-05 ***
NUM_CHILDCARE -3.200e+03 4.833e+02 -6.622 3.73e-11 ***
NUM_BUS_STOP 2.890e+02 3.079e+02 0.939 0.348011
NUM_PRI_SCH -1.303e+04 6.385e+02 -20.406 < 2e-16 ***
---Significance stars
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 83990 on 9421 degrees of freedom
Multiple R-squared: 0.6722
Adjusted R-squared: 0.6717
F-statistic: 1288 on 15 and 9421 DF, p-value: < 2.2e-16
***Extra Diagnostic information
Residual sum of squares: 6.645688e+13
Sigma(hat): 83926.48
AIC: 240800.7
AICc: 240800.8
BIC: 231640.9
***********************************************************************
* Results of Geographically Weighted Regression *
***********************************************************************
*********************Model calibration information*********************
Kernel function: gaussian
Adaptive bandwidth: 46 (number of nearest neighbours)
Regression points: the same locations as observations are used.
Distance metric: Euclidean distance metric is used.
****************Summary of GWR coefficient estimates:******************
Min. 1st Qu. Median 3rd Qu. Max.
Intercept -5.7818e+08 -9.0215e+05 2.7616e+04 2.5420e+06 3.2336e+08
floor_area_sqm -4.0891e+05 1.2117e+03 3.4038e+03 5.6121e+03 6.5430e+04
remaining_lease -7.1227e+04 -1.0675e+03 4.2567e+03 6.4094e+03 1.2678e+04
storey_order 1.0768e+03 1.1865e+04 1.4526e+04 1.7808e+04 3.0065e+04
PROX_CBD -3.2747e+04 -1.6686e+02 -2.3985e+01 8.6541e+01 6.6518e+04
PROX_ELDERCARE -2.3479e+04 -4.6628e+01 3.3732e+01 1.3046e+02 5.1293e+03
PROX_HAWKER -1.6979e+04 -7.4513e+01 2.9321e+00 1.1548e+02 5.4425e+04
PROX_MRT -5.7199e+03 -1.1019e+02 -3.7220e+01 5.8608e+01 5.5159e+04
PROX_PARK -4.8577e+04 -1.1170e+02 -1.5924e+01 5.4928e+01 1.0037e+04
PROX_MALL -9.3580e+04 -1.0764e+02 -2.8123e+01 4.7222e+01 1.3162e+04
PROX_SPRMKT -3.2787e+03 -9.7985e+01 -1.8585e+01 6.2931e+01 2.3654e+03
PROX_TOPPRISCH -6.9634e+04 -1.8091e+02 -1.0695e+01 1.9041e+02 3.2801e+04
NUM_KNDRGTN -2.3913e+06 -9.9718e+03 -7.8011e+02 7.7026e+03 9.1610e+04
NUM_CHILDCARE -6.8852e+04 -4.4461e+03 -4.8502e+01 4.7066e+03 1.0285e+05
NUM_BUS_STOP -6.5309e+04 -3.0411e+03 2.4357e+02 3.3927e+03 3.0656e+04
NUM_PRI_SCH -1.1518e+07 -8.6535e+03 -4.2602e+02 8.3385e+03 2.1769e+05
************************Diagnostic information*************************
Number of data points: 9437
Effective number of parameters (2trace(S) - trace(S'S)): 1419.539
Effective degrees of freedom (n-2trace(S) + trace(S'S)): 8017.461
AICc (GWR book, Fotheringham, et al. 2002, p. 61, eq 2.33): 230488.3
AIC (GWR book, Fotheringham, et al. 2002,GWR p. 96, eq. 4.22): 229025.8
BIC (GWR book, Fotheringham, et al. 2002,GWR p. 61, eq. 2.34): 228914.7
Residual sum of squares: 1.696576e+13
R-square value: 0.9163106
Adjusted R-square value: 0.9014911
***********************************************************************
Program stops at: 2023-03-18 00:05:48
<- st_coordinates(resale_sf)
coords <- st_coordinates(train_data)
coords_train <- st_coordinates(test_data) coords_test
<- train_data %>%
train_data st_drop_geometry()
Calibrating random forest model
# set.seed(1234)
# rf <- ranger(resale_price ~ floor_area_sqm + remaining_lease+ storey_order +
# PROX_CBD + PROX_ELDERCARE + PROX_HAWKER +
# PROX_MRT + PROX_PARK + PROX_MALL +
# PROX_SPRMKT + PROX_TOPPRISCH + NUM_KNDRGTN +
# NUM_CHILDCARE + NUM_BUS_STOP +
# NUM_PRI_SCH,
# data=train_data)
#write_rds(rf, "data/model/rf.rds")
<- read_rds("data/model/rf.rds") rf
print(rf)
Ranger result
Call:
ranger(resale_price ~ floor_area_sqm + remaining_lease + storey_order + PROX_CBD + PROX_ELDERCARE + PROX_HAWKER + PROX_MRT + PROX_PARK + PROX_MALL + PROX_SPRMKT + PROX_TOPPRISCH + NUM_KNDRGTN + NUM_CHILDCARE + NUM_BUS_STOP + NUM_PRI_SCH, data = train_data)
Type: Regression
Number of trees: 500
Sample size: 9437
Number of independent variables: 15
Mtry: 3
Target node size: 5
Variable importance mode: none
Splitrule: variance
OOB prediction error (MSE): 2060441912
R squared (OOB): 0.9040941
Calibrating geographically random forest model
Calibrating using training data
# set.seed(1234)
# gwRF_adaptive <- grf(formula = resale_price ~ floor_area_sqm + remaining_lease+ storey_order +
# PROX_CBD + PROX_ELDERCARE + PROX_HAWKER +
# PROX_MRT + PROX_PARK + PROX_MALL +
# PROX_SPRMKT + PROX_TOPPRISCH + NUM_KNDRGTN +
# NUM_CHILDCARE + NUM_BUS_STOP +
# NUM_PRI_SCH,
# dframe=train_data,
# bw=500,
# ntree = 20,
# kernel="adaptive",
# coords=coords_train,
# verbose = T)
#write_rds(gwRF_adaptive, "data/model/gwRF_adaptive.rds")
<- read_rds("data/model/gwRF_adaptive.rds") gwRF_adaptive
Predicting using test data
Predicting GWRF
<- cbind(test_data, coords_test) %>%
test_data st_drop_geometry()
# gwRF_pred <- predict.grf(gwRF_adaptive,
# test_data,
# x.var.name="X",
# y.var.name="Y",
# local.w=1,
# global.w=0)
# GRF_pred <- write_rds(gwRF_pred, "data/model/GRF_pred.rds")
Predicting OLS
<- predict.lm(price_mlr,
mlr_pred
test_data,x.var.name="X",
y.var.name="Y",
local.w=1,
global.w=0)
<- write_rds(mlr_pred, "data/model/mlr_pred.rds") mlr_pred
Predicting RF
<- predict(rf,test_data) rf_pred
<- write_rds(rf_pred, "data/model/rf_pred.rds") rf_pred
Converting the predicting output into data frame
Creating GWRF prediction dataframe
<- read_rds("data/model/GRF_pred.rds")
GRF_pred <- as.data.frame(GRF_pred) GRF_pred_df
Creating mlr prediction dataframe
<- read_rds("data/model/mlr_pred.rds")
mlr_pred <- as.data.frame(mlr_pred) mlr_pred_df
Creating RF prediction dataframe
<- read_rds("data/model/rf_pred.rds")
rf_pred <- as.data.frame(rf_pred) rf_pred_df
<- cbind(test_data, GRF_pred_df) test_data_p
<- cbind(test_data, mlr_pred) test_data_mlr
<- cbind(test_data, rf_pred) test_data_rf
#write_rds(test_data_p, "data/model/test_data_p.rds")
Calculating RMSE
<- read_rds("data/model/mlr_pred.rds") %>%
mlr_pred_df as.data.frame()
colnames(mlr_pred_df) <- c("mlr_pred")
<- read_rds("data/model/rf_pred.rds")$predictions %>%
rf_pred_df as.data.frame()
colnames(rf_pred_df) <- c("rf_pred")
<- read_rds("data/model/grf_pred.rds") %>%
gwRF_pred_df as.data.frame()
colnames(gwRF_pred_df) <- c("gwrf_pred")
<- cbind(test_data$resale_price, mlr_pred_df, rf_pred_df,
prices_pred_df %>%
gwRF_pred_df) rename("actual_price" = "test_data$resale_price")
head(prices_pred_df)
actual_price mlr_pred rf_pred gwrf_pred
1 590000 650874.6 643090.1 628070.2
2 629000 619627.5 664420.1 625059.2
3 670000 739813.9 744091.3 781945.0
4 760000 764903.8 753052.8 797629.5
5 768000 808958.1 843679.0 812444.1
6 816800 679545.5 805450.4 796965.5
GWRF
sqrt(mean((prices_pred_df$actual_price - prices_pred_df$gwrf_pred)^2))
[1] 43643.26
OLS
sqrt(mean((prices_pred_df$actual_price - prices_pred_df$mlr_pred)^2))
[1] 83904.36
RF
sqrt(mean((prices_pred_df$actual_price - prices_pred_df$rf_pred)^2))
[1] 44908.32
Through root mean square error calculation we see that RMSE for GWRF prediction has the lowest value, thus showing that this method gives the best predictions of prices. While OLS method has a significantly higher RMSE, showing that it is the most unreliable
Visualizing predicted values
Just to visualize the results of RMSE calculation, lets plot a scatter plot graph
GRF pred
ggplot(data = test_data_p,
aes(x = GRF_pred,
y = resale_price)) +
geom_point()
MLR pred
ggplot(data = test_data_mlr,
aes(x = mlr_pred,
y = resale_price)) +
geom_point()
A good predictive model will have the plotted values close to the diagonal line. From our plotting of mlr_pred for OLS, and GRF pred for GWRF we can see that GWRF plotting is closer to the diagonal line. This plot is consistent with our RMSE calculation. hence we can conclude that the geographically weighted random forest method is the best in terms of predicting housing resale prices.