# Load the data as a data frame
url <- "https://www.dropbox.com/s/57b32h025egu1yw/kc_only_data.csv?dl=1"
df <- read.csv(url)
Drop rows that don’t contain data for “median listing price per square foot”, since that is the response variable of interest. This reduces the number of postal codes included in the analysis from 55 to 50.
Create a new column called ‘month’ that converts the “month_date_yyyymm” field into a sequential integer. This lets the linear model handle changes across year boundaries correctly (for example, if the raw values were treated as numbers, the step from ‘202112’ (December 2021) to ‘202201’ (January 2022) would count as 89 months when it should count as only 1).
Convert the postal code from an integer to the ‘character’ class so that the linear model treats it as a categorical variable.
# Drop rows that don't contain data for the median_listing_price_per_square_foot column
# since it's the response variable I'm looking at, to simplify analysis
# This reduces the data set from 55 postal codes to 50
library(tidyr)
df <- df %>% drop_na(median_listing_price_per_square_foot)
# Update the "yyyymm" format to a sequence of integers from 1 to 68 for each unique month
# so that it can be factored into a linear model as a monotonically increasing parameter, increasing by 1
dates <- sort(c(unique(df$month_date_yyyymm)))
date_dict <- data.frame(row.names=c(dates) , val=seq(1,length(dates)))
update_date <- function(date_orig){
return (date_dict[paste(date_orig),])
}
df$month <- sapply(df$month_date_yyyymm,update_date)
#sort(c(unique(df$month))) # to double check that month conversion worked, this should be a list of integers from 1 to 68
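An equivalent, more compact way to build the same 1-to-68 month index, shown only for reference (a sketch using base R's match(), assuming the same column names as above):
df$month <- match(df$month_date_yyyymm, sort(unique(df$month_date_yyyymm))) # position of each yyyymm value within the sorted unique months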
# Turn postal_code into character format to use as a "class" in model
df$postal_code <- as.character(df$postal_code)
# Get unique zip codes as a sorted list
zip_codes <- sort(c(unique(df$postal_code)))
print(paste('Total number of postal codes included in analysis:', length(zip_codes)))
## [1] "Total number of postal codes included in analysis: 50"
# Generate linear model using 'lm' function
# Creates a model (without an intercept) that estimates the rate of price increase per month for each zip code.
# Each zip code (50 total after filtering) is included as an indicator (0/1) variable, and the model
# takes into account the interaction effect between time and zip code
m1 <- lm(median_listing_price_per_square_foot~ month*postal_code-1, data=df)
summary(m1) # print model parameters
##
## Call:
## lm(formula = median_listing_price_per_square_foot ~ month * postal_code -
## 1, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -100.10 -8.34 -0.98 6.89 789.15
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## month 0.592122 0.629389 0.941 0.34689
## postal_code64102 92.538755 29.804755 3.105 0.00192 **
## postal_code64105 186.042581 6.123576 30.381 < 2e-16 ***
## postal_code64106 218.227831 6.123576 35.637 < 2e-16 ***
## postal_code64108 200.581212 6.123576 32.756 < 2e-16 ***
## postal_code64109 68.906936 6.123576 11.253 < 2e-16 ***
## postal_code64110 104.801141 6.123576 17.114 < 2e-16 ***
## postal_code64111 150.068920 6.123576 24.507 < 2e-16 ***
## postal_code64112 266.773486 6.123576 43.565 < 2e-16 ***
## postal_code64113 194.194030 6.123576 31.713 < 2e-16 ***
## postal_code64114 116.213784 6.123576 18.978 < 2e-16 ***
## postal_code64116 105.338455 6.123576 17.202 < 2e-16 ***
## postal_code64117 62.666374 6.123576 10.234 < 2e-16 ***
## postal_code64118 92.137840 6.123576 15.046 < 2e-16 ***
## postal_code64119 94.748025 6.123576 15.473 < 2e-16 ***
## postal_code64120 27.935892 11.474862 2.435 0.01497 *
## postal_code64123 37.855575 6.123576 6.182 7.24e-10 ***
## postal_code64124 26.221686 6.123576 4.282 1.91e-05 ***
## postal_code64125 20.328068 7.817638 2.600 0.00936 **
## postal_code64126 22.872088 7.073245 3.234 0.00124 **
## postal_code64127 26.091308 6.123576 4.261 2.10e-05 ***
## postal_code64128 13.668569 6.123576 2.232 0.02568 *
## postal_code64129 47.178665 6.123576 7.704 1.80e-14 ***
## postal_code64130 19.585162 6.123576 3.198 0.00140 **
## postal_code64131 75.823968 6.123576 12.382 < 2e-16 ***
## postal_code64132 24.774802 6.123576 4.046 5.35e-05 ***
## postal_code64133 71.113696 6.123576 11.613 < 2e-16 ***
## postal_code64134 54.835382 6.123576 8.955 < 2e-16 ***
## postal_code64136 119.764025 6.383949 18.760 < 2e-16 ***
## postal_code64137 79.374451 6.123576 12.962 < 2e-16 ***
## postal_code64138 78.205004 6.123576 12.771 < 2e-16 ***
## postal_code64139 117.924056 6.123576 19.257 < 2e-16 ***
## postal_code64141 143.157560 27.963588 5.119 3.27e-07 ***
## postal_code64144 85.973316 28.552360 3.011 0.00263 **
## postal_code64145 123.069359 6.123576 20.098 < 2e-16 ***
## postal_code64146 87.804653 6.123576 14.339 < 2e-16 ***
## postal_code64149 124.296847 8.447776 14.714 < 2e-16 ***
## postal_code64151 96.497805 6.123576 15.758 < 2e-16 ***
## postal_code64152 115.763828 6.123576 18.905 < 2e-16 ***
## postal_code64153 136.724759 6.123576 22.328 < 2e-16 ***
## postal_code64154 121.485075 6.123576 19.839 < 2e-16 ***
## postal_code64155 119.387182 6.123576 19.496 < 2e-16 ***
## postal_code64156 120.830992 6.123576 19.732 < 2e-16 ***
## postal_code64157 111.826602 6.123576 18.262 < 2e-16 ***
## postal_code64158 105.385426 6.123576 17.210 < 2e-16 ***
## postal_code64161 90.699739 13.746223 6.598 4.95e-11 ***
## postal_code64163 87.012510 6.626581 13.131 < 2e-16 ***
## postal_code64164 150.262443 16.837901 8.924 < 2e-16 ***
## postal_code64165 93.254748 8.008488 11.644 < 2e-16 ***
## postal_code64166 97.376798 22.975231 4.238 2.32e-05 ***
## postal_code64167 143.404987 9.701940 14.781 < 2e-16 ***
## month:postal_code64105 -0.114243 0.648021 -0.176 0.86007
## month:postal_code64106 -0.675452 0.648021 -1.042 0.29735
## month:postal_code64108 -0.001978 0.648021 -0.003 0.99756
## month:postal_code64109 0.458743 0.648021 0.708 0.47906
## month:postal_code64110 0.051749 0.648021 0.080 0.93636
## month:postal_code64111 0.221311 0.648021 0.342 0.73274
## month:postal_code64112 -0.616247 0.648021 -0.951 0.34170
## month:postal_code64113 -0.178735 0.648021 -0.276 0.78271
## month:postal_code64114 0.223506 0.648021 0.345 0.73019
## month:postal_code64116 0.158937 0.648021 0.245 0.80627
## month:postal_code64117 0.456338 0.648021 0.704 0.48136
## month:postal_code64118 -0.086740 0.648021 -0.134 0.89353
## month:postal_code64119 -0.102295 0.648021 -0.158 0.87458
## month:postal_code64120 -0.034630 0.677456 -0.051 0.95924
## month:postal_code64123 0.355372 0.648021 0.548 0.58346
## month:postal_code64124 0.638878 0.648021 0.986 0.32427
## month:postal_code64125 0.237275 0.655415 0.362 0.71736
## month:postal_code64126 0.085099 0.652076 0.131 0.89618
## month:postal_code64127 0.187414 0.648021 0.289 0.77244
## month:postal_code64128 0.240588 0.648021 0.371 0.71047
## month:postal_code64129 0.253509 0.648021 0.391 0.69568
## month:postal_code64130 0.245989 0.648021 0.380 0.70427
## month:postal_code64131 0.409997 0.648021 0.633 0.52699
## month:postal_code64132 0.383715 0.648021 0.592 0.55381
## month:postal_code64133 0.052494 0.648021 0.081 0.93544
## month:postal_code64134 0.168404 0.648021 0.260 0.79498
## month:postal_code64136 -0.123434 0.649055 -0.190 0.84919
## month:postal_code64137 -0.172456 0.648021 -0.266 0.79016
## month:postal_code64138 -0.072062 0.648021 -0.111 0.91146
## month:postal_code64139 -0.144055 0.648021 -0.222 0.82410
## month:postal_code64141 NA NA NA NA
## month:postal_code64144 NA NA NA NA
## month:postal_code64145 -0.545113 0.648021 -0.841 0.40031
## month:postal_code64146 0.183787 0.648021 0.284 0.77673
## month:postal_code64149 1.178843 0.672885 1.752 0.07990 .
## month:postal_code64151 0.208027 0.648021 0.321 0.74822
## month:postal_code64152 0.257861 0.648021 0.398 0.69072
## month:postal_code64153 -0.414067 0.648021 -0.639 0.52289
## month:postal_code64154 0.408737 0.648021 0.631 0.52826
## month:postal_code64155 0.128113 0.648021 0.198 0.84329
## month:postal_code64156 0.236732 0.648021 0.365 0.71490
## month:postal_code64157 0.338309 0.648021 0.522 0.60167
## month:postal_code64158 0.712989 0.648021 1.100 0.27131
## month:postal_code64161 -0.502090 0.704456 -0.713 0.47607
## month:postal_code64163 1.028456 0.650226 1.582 0.11383
## month:postal_code64164 0.729031 0.707537 1.030 0.30292
## month:postal_code64165 1.335663 0.655779 2.037 0.04177 *
## month:postal_code64166 0.462496 0.784427 0.590 0.55551
## month:postal_code64167 -0.248645 0.679423 -0.366 0.71442
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.97 on 2836 degrees of freedom
## Multiple R-squared: 0.9685, Adjusted R-squared: 0.9674
## F-statistic: 890.3 on 98 and 2836 DF, p-value: < 2.2e-16
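In this parameterization, each postal code's overall monthly slope is the base 'month' coefficient plus that postal code's interaction term. For example, from the estimates above, postal code 64165 appreciates by roughly \({0.592 + 1.336 \approx 1.93}\) dollars per square foot per month, while the base level (postal code 64102) appreciates by the 'month' coefficient alone, about \({0.59}\).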
# Get error residuals from M1 and plot them
e.hat <- residuals(m1)
hist(e.hat,col="grey",breaks=50,main="Histogram of Error Residuals:\n Model 1 (LM)",xlab=expression(hat(epsilon)))
## Get just the slope estimates for each zip code
## (so they can be sorted in descending order to find the largest)
matrix_coef <- summary(m1)$coefficients # Extract coefficients from model in a matrix
m1_estimates <- matrix_coef[,1] # Get just the first column (coefficient estimate)
m1_estimates <- as.data.frame(m1_estimates) # Convert the coefficients into a dataframe for easier manipulation
m1_estimates$name <- rownames(m1_estimates) # Add a column "name" of the label for each coefficient
names(m1_estimates)[1] <- "coef"
rownames(m1_estimates) <- seq(1,length(m1_estimates[,1])) # Reset row names to an integer index
m1_estimates <- m1_estimates[startsWith(m1_estimates$name, "month"),] # Get just the slope coefficients (labeled starting with "month")
base_coef <- m1_estimates[m1_estimates$name == "month",][,1] # get the base slope coefficient from the linear model
m1_estimates[m1_estimates$name == "month",][,2] <- "64102" # update the label for the "base coefficient" row to the actual zip code for comparison
# function to add the base slope estimate to each interaction coefficient, so each row holds that zip code's actual slope
update_slopes_with_base <- function(df_row){
return(df_row[1] + base_coef)
}
# function to update the dataframe label column (to get just the zip code listed)
update_zip_name <- function(df_row){
return(sub("month:postal_code","",df_row[1]))
}
# Update the interaction coefficients by adding the base coefficient value (the base row itself, relabeled "64102" above, is excluded because its name no longer starts with "month")
m1_estimates[startsWith(m1_estimates$name, "month"),]$coef <- lapply(m1_estimates[startsWith(m1_estimates$name, "month"),]$coef,update_slopes_with_base)
# Update the dataframe row labels to be just the zip code
m1_estimates[startsWith(m1_estimates$name, "month"),]$name <- lapply(m1_estimates[startsWith(m1_estimates$name, "month:postal_code"),]$name,update_zip_name)
m1_estimates <- as.data.frame(lapply(m1_estimates, unlist)) # unlist the list-formatted columns back into ordinary vectors
# Sort the coefficient estimates
m1_estimates <- m1_estimates[order(m1_estimates$coef,decreasing=TRUE), ]
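For reference, the same per-postal-code slopes can be pulled more compactly straight from coef(m1). This is a sketch under the same assumptions; note that coef() keeps the two NA interaction terms (postal codes 64141 and 64144) that summary()$coefficients drops, and sort() removes them again:
all_coef <- coef(m1)
interactions <- all_coef[grepl("^month:postal_code", names(all_coef))] # one interaction term per non-base zip code
per_zip_slope <- all_coef["month"] + interactions # zip slope = base month slope + interaction term
names(per_zip_slope) <- sub("month:postal_code", "", names(per_zip_slope)) # keep just the zip code as the label
per_zip_slope <- c("64102" = unname(all_coef["month"]), per_zip_slope) # the base level (64102) keeps the base slope
top_slopes <- head(sort(per_zip_slope, decreasing = TRUE), 10) # top 10 slopes, matching m1_estimates above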
# Plot the top 10 estimated coefficients
library(dplyr)
library(ggplot2)
top_10 <- m1_estimates %>% slice(1:10)
ggplot(top_10, aes(reorder(name, -coef, sum), coef)) + # ggplot2 plot with modified x-axis labels
geom_bar(stat = "identity") +
labs(title="Postal Codes with Highest Home Sales Price Growth",
x="Postal Code", y= "Linear Model Slope Coefficient") +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5), axis.text.x = element_text(angle = 45, vjust = 0.5)) +
geom_text(aes(label = signif(coef, digits = 4)), nudge_y = 0.1, size=3.5)
# Plot M1 with predictions for specific zip code and 95% confidence interval
plot_zip_func <- function(zip, mo_range=(1:68)){
plot_zip <- zip
plot_df <- subset(df, subset=postal_code==plot_zip)
plot(x = plot_df$month, y = plot_df$median_listing_price_per_square_foot, xlab = "Month",
ylab = expression("Median Listing Price Per Sq. Ft."),
main=paste("Linear model fitted line and confidence\n interval for postal code:",plot_zip, sep=" "),col="cornflowerblue",pch="*",cex=1.25,
ylim = c(0,350),xlim=c(0, max(mo_range)))
F <- predict(m1,newdata=data.frame(month=mo_range, postal_code=plot_zip),interval="confidence",level=0.95)
points(mo_range,F[,1], col="red",typ="l",lwd=1.5)
points(mo_range,F[,2], col="orange",typ="l",lwd=0.5)
points(mo_range,F[,3], col="orange",typ="l",lwd=0.5)
}
plot_zip_func('64165')
plot_zip_func('64149')
plot_zip_func('64163')
plot_zip_func('64164')
For each of the top postal codes, perform a hypothesis test, with the null hypothesis:
\({H_0 : \beta_2 = 0}\)
and alternative hypothesis:
\({H_a : \beta_2 > 0}\)
where \({\beta_2}\) is the slope parameter for each postal code
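The test statistic computed in the code below is the usual one-sided t-test on the fitted slope:
\[ t = \frac{\hat{\beta}_2 - 0}{\widehat{\mathrm{se}}(\hat{\beta}_2)}, \qquad \text{p-value} = P\left(T_{n-p} > t\right), \]
where \({n}\) is the number of monthly observations for the postal code, \({p = 2}\) is the number of estimated parameters (intercept and slope), and \({T_{n-p}}\) follows a t-distribution with \({n-p}\) degrees of freedom. The null hypothesis is rejected at 95% confidence when the p-value is below 0.05, and at 99% confidence when it is below 0.01.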
Plot the testing results (for both 95% and 99% confidence) as a formatted table
## Hypothesis test a list of postal codes
## and output a formatted table of hypothesis test results
## at 95% and 99% confidence
hypothesis_test_multiple <- function(post_code_list){
hypothesis_results_df <- data.frame(matrix(ncol = 3, nrow = 0)) # create a dataframe to store hypothesis test results for export to a table
colnames(hypothesis_results_df) <- c('Postal Code', 'β > 0 at 95% Confidence', 'β > 0 at 99% Confidence')
for (post_code in post_code_list){
test_df <- subset(df,subset=postal_code==post_code)
test_model <- lm(median_listing_price_per_square_foot~ month, data=test_df) # no interaction term needed because this subset contains only a single postal code
beta1.null <- 0
beta1.hat <- coef(test_model)[2]
sigma2.beta1.hat <- vcov(test_model)[2,2]
t <- (beta1.hat - beta1.null)/sqrt(sigma2.beta1.hat)
n <- nrow(test_df)
p <- 2 # number of estimated parameters (intercept and slope)
p_val <- pt(t,n-p,lower.tail=FALSE) # one-sided p-value: upper-tail area of the t-distribution with n-p degrees of freedom
print(paste(post_code,'p-value:', p_val))
flag_5 <- FALSE
flag_1 <- FALSE
if (p_val < 0.05){
flag_5 <- TRUE
}
if (p_val < 0.01){
flag_1 <- TRUE
}
hypothesis_results_df[nrow(hypothesis_results_df)+1,] <- c(post_code, flag_5, flag_1)
}
library(formattable)
formattable(hypothesis_results_df,
align=c("c","c","c"),
list(
'β > 0 at 95% Confidence' = formatter("span",
style = x ~ style(color = ifelse(x, "green", "red")),
x ~ icontext(ifelse(x, "ok", "remove"), ifelse(x, "Yes", "No"))),
'β > 0 at 99% Confidence' = formatter("span",
style = x ~ style(color = ifelse(x, "green", "red")),
x ~ icontext(ifelse(x, "ok", "remove"), ifelse(x, "Yes", "No")))
))
}
hypothesis_test_multiple(c('64165', '64149', '64163', '64164'))
## [1] "64165 test statistic: 0.0152174309475825"
## [1] "64149 test statistic: 0.0144677504425707"
## [1] "64163 test statistic: 9.79917211497219e-25"
## [1] "64164 test statistic: 0.132380308664403"
Postal Code | β > 0 at 95% Confidence | β > 0 at 99% Confidence
--- | --- | ---
64165 | Yes | No
64149 | Yes | No
64163 | Yes | Yes
64164 | No | No
# Fit GAM Model (Model 2)
library(mgcv)
m2 <- gam(median_listing_price_per_square_foot~ s(month)+postal_code, data=df)
summary(m2)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## median_listing_price_per_square_foot ~ s(month) + postal_code
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 108.950 12.734 8.556 < 2e-16 ***
## postal_code64105 93.978 13.103 7.172 9.35e-13 ***
## postal_code64106 106.802 13.103 8.151 5.34e-16 ***
## postal_code64108 112.390 13.103 8.577 < 2e-16 ***
## postal_code64109 -3.389 13.103 -0.259 0.795918
## postal_code64110 18.464 13.103 1.409 0.158920
## postal_code64111 69.581 13.103 5.310 1.18e-07 ***
## postal_code64112 157.390 13.103 12.011 < 2e-16 ***
## postal_code64113 99.905 13.103 7.624 3.31e-14 ***
## postal_code64114 35.802 13.103 2.732 0.006328 **
## postal_code64116 22.699 13.103 1.732 0.083328 .
## postal_code64117 -9.713 13.103 -0.741 0.458605
## postal_code64118 1.022 13.103 0.078 0.937808
## postal_code64119 3.096 13.103 0.236 0.813236
## postal_code64120 -62.725 13.421 -4.674 3.09e-06 ***
## postal_code64123 -38.007 13.103 -2.901 0.003753 **
## postal_code64124 -39.860 13.103 -3.042 0.002372 **
## postal_code64125 -58.733 13.165 -4.461 8.47e-06 ***
## postal_code64126 -62.209 13.138 -4.735 2.30e-06 ***
## postal_code64127 -55.566 13.103 -4.241 2.30e-05 ***
## postal_code64128 -66.154 13.103 -5.049 4.73e-07 ***
## postal_code64129 -32.198 13.103 -2.457 0.014060 *
## postal_code64130 -60.051 13.103 -4.583 4.78e-06 ***
## postal_code64131 1.846 13.103 0.141 0.887974
## postal_code64132 -50.110 13.103 -3.824 0.000134 ***
## postal_code64133 -15.198 13.103 -1.160 0.246201
## postal_code64134 -27.478 13.103 -2.097 0.036082 *
## postal_code64136 27.259 13.114 2.079 0.037745 *
## postal_code64137 -14.698 13.103 -1.122 0.262082
## postal_code64138 -12.404 13.103 -0.947 0.343909
## postal_code64139 24.831 13.103 1.895 0.058188 .
## postal_code64141 55.379 28.475 1.945 0.051893 .
## postal_code64144 -1.115 28.477 -0.039 0.968766
## postal_code64145 16.140 13.103 1.232 0.218142
## postal_code64146 6.022 13.103 0.460 0.645829
## postal_code64149 73.627 13.424 5.485 4.50e-08 ***
## postal_code64151 15.552 13.103 1.187 0.235380
## postal_code64152 36.537 13.103 2.788 0.005332 **
## postal_code64153 34.317 13.103 2.619 0.008867 **
## postal_code64154 47.464 13.103 3.622 0.000297 ***
## postal_code64155 35.684 13.103 2.723 0.006503 **
## postal_code64156 40.875 13.103 3.119 0.001830 **
## postal_code64157 35.375 13.103 2.700 0.006980 **
## postal_code64158 41.861 13.103 3.195 0.001415 **
## postal_code64161 -17.401 13.407 -1.298 0.194423
## postal_code64163 35.672 13.119 2.719 0.006587 **
## postal_code64164 95.556 13.643 7.004 3.09e-12 ***
## postal_code64165 56.781 13.214 4.297 1.79e-05 ***
## postal_code64166 27.894 14.865 1.877 0.060685 .
## postal_code64167 47.618 13.556 3.513 0.000450 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(month) 5.717 6.871 150.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.817 Deviance explained = 82%
## GCV = 660.18 Scale est. = 647.64 n = 2934
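For a quick look at the estimated smooth of month itself, mgcv's built-in plot method can also be used (a minimal sketch, output not reproduced here):
plot(m2, shade = TRUE, seWithMean = TRUE) # plot the fitted smooth s(month) with a shaded confidence band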
# Print histogram of Model 2 error residuals
e.hat2 <- residuals(m2)
hist(e.hat2,col="grey",breaks=50,main="Histogram of Error Residuals:\n Model 2 (GAM)",xlab=expression(hat(epsilon)))
### PLOTTING GAM FITS FOR ALL ZIP CODES TOGETHER
# Plot all data points and the GAM fitted line for each zip code
plot(df$month, df$median_listing_price_per_square_foot,pch=20,xlim=c(0,68), ylim=c(0,300),
xlab="Month \n(July 2016 - February 2022, represented as integers)",
ylab="Median Listing Price Per Square Foot",
main="GAM fitted lines for each postal code")
for (i in zip_codes){
E.zip.gam.y <- predict(m2,newdata=data.frame(month=1:68, postal_code=i)) # predict over the 68 observed months
points(1:68,E.zip.gam.y,typ="l",lwd=.5,col="red")
}
### ITERATIVELY EVALUATE LM VS GAM OVER A RANGE OF PREDICTION HORIZONS
evaluate_models <- function(eval_range=(12:36)){
model_count_results_df <- data.frame(matrix(ncol = 5, nrow = 0)) # create a dataframe to store the model comparison results for plotting
colnames(model_count_results_df) <- c('Prediction Size', 'LM count', 'LM error', 'GAM count', 'GAM error')
for (p_size in eval_range){
pred_size <- p_size
train_size <- 68-pred_size
train_df <- df[df$month <= train_size,]
test_df <- df[df$month > train_size,]
train_m1 <- lm(median_listing_price_per_square_foot~month*postal_code, train_df)
train_m2 <- gam(median_listing_price_per_square_foot~ s(month)+postal_code, data=train_df)
counter = 0
m1_count = 0
m2_count = 0
sum_error_m1 = 0
sum_error_m2 = 0
for (i in zip_codes){
if (nrow(train_df[train_df$postal_code==i,]) > 1 && nrow(test_df[test_df$postal_code==i,]) > 0){ # skip zip codes without enough training rows to fit a slope, or with no held-out rows to score
zip.E.m1.y <- predict(train_m1,newdata=data.frame(month=seq(train_size+1,68), postal_code=i)) # predict the held-out months using the models fit on training data only
zip.E.m2.y <- predict(train_m2,newdata=data.frame(month=seq(train_size+1,68), postal_code=i))
sum_zip_m1 = 0
sum_zip_m2 = 0
zip_counter = 0
for (j in 1:pred_size){
y <- df[df$postal_code==i & df$month==j+train_size,]$median_listing_price_per_square_foot
if (length(y) > 0 && !is.na(y)){
diff_m1 <- abs(y - zip.E.m1.y[j])
sum_error_m1 <- sum_error_m1 + diff_m1
sum_zip_m1 <- sum_zip_m1 + diff_m1
diff_m2 <- abs(y - zip.E.m2.y[j])
sum_error_m2 <- sum_error_m2 + diff_m2
sum_zip_m2 <- sum_zip_m2 + diff_m2
zip_counter <- zip_counter + 1
counter <- counter + 1
}
}
sum_zip_m1 <- sum_zip_m1/zip_counter
sum_zip_m2 <- sum_zip_m2/zip_counter
if (sum_zip_m1 < sum_zip_m2){
m1_count <- m1_count + 1
}
else{
m2_count <- m2_count + 1
}
}
}
m1_avg_error <- sum_error_m1/counter
m2_avg_error <- sum_error_m2/counter
model_count_results_df[nrow(model_count_results_df)+1,] <- c(p_size, m1_count, m1_avg_error, m2_count, m2_avg_error)
}
## Plot the number predicted better for each model
par(mar=c(5.1,5.1,4.1,2.1)) # update left margin (2nd value - from 4.1 to 5.1)
plot(model_count_results_df$`Prediction Size`, model_count_results_df$`LM count`, 'l', lwd=2.0, col='red', xaxt='n',
xlab = "Prediction Size \n (number of months)", ylab = "Number of Postal\n Codes Predicted Better",
main = "LM vs GAM: Number of \n Postal Codes Predicted Better", ylim=c(19,32))
lines(model_count_results_df$`Prediction Size`, model_count_results_df$`GAM count`, lwd=2.0, col='blue')
axis(1,at=c(12,18,24,30,36))
legend(27.71, 32.125, legend=c("Model 1 (LM)", "Model 2 (GAM)"), fill = c("red","blue"))
## Plot the mean abs error for each model
par(mar=c(5.1,4.1,4.1,2.1)) # set the plot margins back to default
plot(model_count_results_df$`Prediction Size`, model_count_results_df$`LM error`, 'l', lwd=2.0, col='red', xaxt='n',
xlab = "Prediction Size \n (number of months)", ylab = "Mean Absolute Error",
main = "LM vs GAM: \n Mean Absolute Error", ylim=c(12.5,17))
lines(model_count_results_df$`Prediction Size`, model_count_results_df$`GAM error`, lwd=2.0, col='blue')
axis(1,at=c(12,18,24,30,36))
legend(27.72, 17.05, legend=c("Model 1 (LM)", "Model 2 (GAM)"), fill = c("red","blue"))
}
evaluate_models()
# Plot M1 with predictions for specific zip code and 95% prediction interval
plot_zip_func <- function(zip, mo_range=(1:68)){
plot_zip <- zip
plot_df <- subset(df, subset=postal_code==plot_zip)
F <- predict(m1,newdata=data.frame(month=mo_range, postal_code=plot_zip),interval="prediction",level=0.95)
plot(x = plot_df$month, y = plot_df$median_listing_price_per_square_foot, xlab = "Month",
ylab = expression("Median Listing Price Per Sq. Ft."),
main=paste("Linear model ", max(mo_range)-68, "-month\n prediction for postal code: ",plot_zip, sep=""),col="cornflowerblue",pch="*",cex=1.25,
ylim = c(0,max(F[,3])+25), xlim=c(0, max(mo_range)))
points(mo_range,F[,1], col="red",typ="l",lwd=1.5)
points(mo_range,F[,2], col="orange",typ="l",lwd=0.5)
points(mo_range,F[,3], col="orange",typ="l",lwd=0.5)
# Plot the predicted values at the max range
text(max(mo_range)-1, F[,1][max(mo_range)], signif(F[,1][max(mo_range)],4), pos=1, cex=0.75)
points(max(mo_range), F[,1][max(mo_range)], pch='*')
text(max(mo_range)-1, F[,2][max(mo_range)], signif(F[,2][max(mo_range)],4), pos=1, cex=0.75)
points(max(mo_range), F[,2][max(mo_range)], pch='*')
text(max(mo_range)-1, F[,3][max(mo_range)], signif(F[,3][max(mo_range)],4), pos=1, cex=0.75)
points(max(mo_range), F[,3][max(mo_range)], pch='*')
}
plot_zip_func('64163', (1:86))
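The end-of-horizon numbers annotated on the plot can also be read directly from the model; for example, for postal code 64163 at month 86 (a sketch using the already-fitted m1):
predict(m1, newdata = data.frame(month = 86, postal_code = '64163'), interval = "prediction", level = 0.95) # fitted value plus lower/upper 95% prediction bounds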