1: Import data into R

# Load the data as a data frame
url <- "https://www.dropbox.com/s/57b32h025egu1yw/kc_only_data.csv?dl=1"
df <- read.csv(url)

2: Pre-Process Data

# Drop rows that don't contain data for the median_listing_price_per_square_foot column
# since it's the response variable I'm looking at, to simplify analysis
# This reduces the data set from 55 postal codes to 50
library(tidyr)
df <- df %>% drop_na(median_listing_price_per_square_foot)

# Update the "yyyymm" format to a sequence of integers from 1 to 68 for each unique month
# so that it can be factored into a linear model as a monotonically increasing parameter, increasing by 1 
dates <- sort(c(unique(df$month_date_yyyymm)))
date_dict <- data.frame(row.names=c(dates) , val=seq(1,length(dates)))
update_date <- function(date_orig){
  return (date_dict[paste(date_orig),])
}
df$month <- sapply(df$month_date_yyyymm,update_date)
#sort(c(unique(df$month))) # to double check that month conversion worked, this should be a list of integers from 1 to 68

# Turn postal_code into character format to use as a "class" in model
df$postal_code <- as.character(df$postal_code)

# Get unique zip codes as a sorted list
zip_codes <- sort(c(unique(df$postal_code)))
print(paste('Total number of postal codes included in analysis:', length(zip_codes)))
## [1] "Total number of postal codes included in analysis: 50"

3: Create the linear model

# Generate linear model using 'lm' function
# Creates a model (without an intercept) for the coefficient of price increase for all zip codes
# per each month. Each zip code (55 total) is included as a binary parameter (0 or 1), and the model
# takes into account the interaction effect between time and zip code
m1 <- lm(median_listing_price_per_square_foot~ month*postal_code-1, data=df)
summary(m1) # print model parameters
## 
## Call:
## lm(formula = median_listing_price_per_square_foot ~ month * postal_code - 
##     1, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -100.10   -8.34   -0.98    6.89  789.15 
## 
## Coefficients: (2 not defined because of singularities)
##                          Estimate Std. Error t value Pr(>|t|)    
## month                    0.592122   0.629389   0.941  0.34689    
## postal_code64102        92.538755  29.804755   3.105  0.00192 ** 
## postal_code64105       186.042581   6.123576  30.381  < 2e-16 ***
## postal_code64106       218.227831   6.123576  35.637  < 2e-16 ***
## postal_code64108       200.581212   6.123576  32.756  < 2e-16 ***
## postal_code64109        68.906936   6.123576  11.253  < 2e-16 ***
## postal_code64110       104.801141   6.123576  17.114  < 2e-16 ***
## postal_code64111       150.068920   6.123576  24.507  < 2e-16 ***
## postal_code64112       266.773486   6.123576  43.565  < 2e-16 ***
## postal_code64113       194.194030   6.123576  31.713  < 2e-16 ***
## postal_code64114       116.213784   6.123576  18.978  < 2e-16 ***
## postal_code64116       105.338455   6.123576  17.202  < 2e-16 ***
## postal_code64117        62.666374   6.123576  10.234  < 2e-16 ***
## postal_code64118        92.137840   6.123576  15.046  < 2e-16 ***
## postal_code64119        94.748025   6.123576  15.473  < 2e-16 ***
## postal_code64120        27.935892  11.474862   2.435  0.01497 *  
## postal_code64123        37.855575   6.123576   6.182 7.24e-10 ***
## postal_code64124        26.221686   6.123576   4.282 1.91e-05 ***
## postal_code64125        20.328068   7.817638   2.600  0.00936 ** 
## postal_code64126        22.872088   7.073245   3.234  0.00124 ** 
## postal_code64127        26.091308   6.123576   4.261 2.10e-05 ***
## postal_code64128        13.668569   6.123576   2.232  0.02568 *  
## postal_code64129        47.178665   6.123576   7.704 1.80e-14 ***
## postal_code64130        19.585162   6.123576   3.198  0.00140 ** 
## postal_code64131        75.823968   6.123576  12.382  < 2e-16 ***
## postal_code64132        24.774802   6.123576   4.046 5.35e-05 ***
## postal_code64133        71.113696   6.123576  11.613  < 2e-16 ***
## postal_code64134        54.835382   6.123576   8.955  < 2e-16 ***
## postal_code64136       119.764025   6.383949  18.760  < 2e-16 ***
## postal_code64137        79.374451   6.123576  12.962  < 2e-16 ***
## postal_code64138        78.205004   6.123576  12.771  < 2e-16 ***
## postal_code64139       117.924056   6.123576  19.257  < 2e-16 ***
## postal_code64141       143.157560  27.963588   5.119 3.27e-07 ***
## postal_code64144        85.973316  28.552360   3.011  0.00263 ** 
## postal_code64145       123.069359   6.123576  20.098  < 2e-16 ***
## postal_code64146        87.804653   6.123576  14.339  < 2e-16 ***
## postal_code64149       124.296847   8.447776  14.714  < 2e-16 ***
## postal_code64151        96.497805   6.123576  15.758  < 2e-16 ***
## postal_code64152       115.763828   6.123576  18.905  < 2e-16 ***
## postal_code64153       136.724759   6.123576  22.328  < 2e-16 ***
## postal_code64154       121.485075   6.123576  19.839  < 2e-16 ***
## postal_code64155       119.387182   6.123576  19.496  < 2e-16 ***
## postal_code64156       120.830992   6.123576  19.732  < 2e-16 ***
## postal_code64157       111.826602   6.123576  18.262  < 2e-16 ***
## postal_code64158       105.385426   6.123576  17.210  < 2e-16 ***
## postal_code64161        90.699739  13.746223   6.598 4.95e-11 ***
## postal_code64163        87.012510   6.626581  13.131  < 2e-16 ***
## postal_code64164       150.262443  16.837901   8.924  < 2e-16 ***
## postal_code64165        93.254748   8.008488  11.644  < 2e-16 ***
## postal_code64166        97.376798  22.975231   4.238 2.32e-05 ***
## postal_code64167       143.404987   9.701940  14.781  < 2e-16 ***
## month:postal_code64105  -0.114243   0.648021  -0.176  0.86007    
## month:postal_code64106  -0.675452   0.648021  -1.042  0.29735    
## month:postal_code64108  -0.001978   0.648021  -0.003  0.99756    
## month:postal_code64109   0.458743   0.648021   0.708  0.47906    
## month:postal_code64110   0.051749   0.648021   0.080  0.93636    
## month:postal_code64111   0.221311   0.648021   0.342  0.73274    
## month:postal_code64112  -0.616247   0.648021  -0.951  0.34170    
## month:postal_code64113  -0.178735   0.648021  -0.276  0.78271    
## month:postal_code64114   0.223506   0.648021   0.345  0.73019    
## month:postal_code64116   0.158937   0.648021   0.245  0.80627    
## month:postal_code64117   0.456338   0.648021   0.704  0.48136    
## month:postal_code64118  -0.086740   0.648021  -0.134  0.89353    
## month:postal_code64119  -0.102295   0.648021  -0.158  0.87458    
## month:postal_code64120  -0.034630   0.677456  -0.051  0.95924    
## month:postal_code64123   0.355372   0.648021   0.548  0.58346    
## month:postal_code64124   0.638878   0.648021   0.986  0.32427    
## month:postal_code64125   0.237275   0.655415   0.362  0.71736    
## month:postal_code64126   0.085099   0.652076   0.131  0.89618    
## month:postal_code64127   0.187414   0.648021   0.289  0.77244    
## month:postal_code64128   0.240588   0.648021   0.371  0.71047    
## month:postal_code64129   0.253509   0.648021   0.391  0.69568    
## month:postal_code64130   0.245989   0.648021   0.380  0.70427    
## month:postal_code64131   0.409997   0.648021   0.633  0.52699    
## month:postal_code64132   0.383715   0.648021   0.592  0.55381    
## month:postal_code64133   0.052494   0.648021   0.081  0.93544    
## month:postal_code64134   0.168404   0.648021   0.260  0.79498    
## month:postal_code64136  -0.123434   0.649055  -0.190  0.84919    
## month:postal_code64137  -0.172456   0.648021  -0.266  0.79016    
## month:postal_code64138  -0.072062   0.648021  -0.111  0.91146    
## month:postal_code64139  -0.144055   0.648021  -0.222  0.82410    
## month:postal_code64141         NA         NA      NA       NA    
## month:postal_code64144         NA         NA      NA       NA    
## month:postal_code64145  -0.545113   0.648021  -0.841  0.40031    
## month:postal_code64146   0.183787   0.648021   0.284  0.77673    
## month:postal_code64149   1.178843   0.672885   1.752  0.07990 .  
## month:postal_code64151   0.208027   0.648021   0.321  0.74822    
## month:postal_code64152   0.257861   0.648021   0.398  0.69072    
## month:postal_code64153  -0.414067   0.648021  -0.639  0.52289    
## month:postal_code64154   0.408737   0.648021   0.631  0.52826    
## month:postal_code64155   0.128113   0.648021   0.198  0.84329    
## month:postal_code64156   0.236732   0.648021   0.365  0.71490    
## month:postal_code64157   0.338309   0.648021   0.522  0.60167    
## month:postal_code64158   0.712989   0.648021   1.100  0.27131    
## month:postal_code64161  -0.502090   0.704456  -0.713  0.47607    
## month:postal_code64163   1.028456   0.650226   1.582  0.11383    
## month:postal_code64164   0.729031   0.707537   1.030  0.30292    
## month:postal_code64165   1.335663   0.655779   2.037  0.04177 *  
## month:postal_code64166   0.462496   0.784427   0.590  0.55551    
## month:postal_code64167  -0.248645   0.679423  -0.366  0.71442    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.97 on 2836 degrees of freedom
## Multiple R-squared:  0.9685, Adjusted R-squared:  0.9674 
## F-statistic: 890.3 on 98 and 2836 DF,  p-value: < 2.2e-16

4: Plot the linear model error residuals as a histogram

# Get error residuals from M1 and plot them
e.hat <- residuals(m1)
hist(e.hat,col="grey",breaks=50,main="Histogram of Error Residuals:\n Model 1 (LM)",xlab=expression(hat(epsilon)))

5: Extract the slope coefficients from the linear model and plot, to compare postal codes

## Get just the slope estimates for each zip code 
## (so they can be sorted in descending order to find the largest)
matrix_coef <- summary(m1)$coefficients  # Extract coefficients from model in a matrix
m1_estimates <- matrix_coef[,1] # Get just the first column (coefficient estimate) 
m1_estimates <- as.data.frame(m1_estimates) # Convert the coefficients into a dataframe for easier manipulation
m1_estimates$name <- rownames(m1_estimates) # Add a column "name" of the label for each coefficient
names(m1_estimates)[1] <- "coef"
rownames(m1_estimates) <- seq(1,length(m1_estimates[,1]))   # Reset row names to an integer index
m1_estimates <- m1_estimates[startsWith(m1_estimates$name, "month"),] # Get just the slope coefficients (labeled starting with "month")
base_coef <- m1_estimates[m1_estimates$name == "month",][,1] # get the base slope coefficient from the linear model 
m1_estimates[m1_estimates$name == "month",][,2] <- "64102" # update the label for the "base coefficient" row to the actual zip code for comparison

# function to add the base slope estimate to all others for comparison of real values
update_slopes_with_base <- function(df_row){
  return(df_row[1] + base_coef)
}

# function to update the dataframe label column (to get just the zip code listed)
update_zip_name <- function(df_row){
  return(sub("month:postal_code","",df_row[1]))
}

# Update the coefficients (add the base coefficient value (except for the base value itself))
m1_estimates[startsWith(m1_estimates$name, "month"),]$coef <- lapply(m1_estimates[startsWith(m1_estimates$name, "month"),]$coef,update_slopes_with_base)
# Update the dataframe row labels to be just the zip code
m1_estimates[startsWith(m1_estimates$name, "month"),]$name <- lapply(m1_estimates[startsWith(m1_estimates$name, "month:postal_code"),]$name,update_zip_name)
m1_estimates <- as.data.frame(lapply(m1_estimates, unlist)) # convert the df columns formatted as alist

# Sort the coefficient estimates
m1_estimates <- m1_estimates[order(m1_estimates$coef,decreasing=TRUE), ]

# Plot the top 10 estimated coefficients
library(dplyr)
library(ggplot2)
top_10 <- m1_estimates %>% slice(1:10)
ggplot(top_10, aes(reorder(name, -coef, sum), coef)) +    # ggplot2 plot with modified x-axis labels
  geom_bar(stat = "identity") +
  labs(title="Postal Codes with Highest Home Sales Price Growth",
       x="Postal Code", y= "Linear Model Slope Coefficient") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5), axis.text.x = element_text(angle = 45, vjust = 0.5)) +
  geom_text(aes(label = signif(coef, digits = 4)), nudge_y = 0.1, size=3.5)

6: Plot the data and fitted lines and 95% confidence interval for the top 4 postal codes

# Plot M1 with predictions for specific zip code and 95% confidence interval
plot_zip_func <- function(zip, mo_range=(1:68)){
  plot_zip <- zip
  plot_df <- subset(df, subset=postal_code==plot_zip)
  plot(x = plot_df$month, y = plot_df$median_listing_price_per_square_foot, xlab = "Month", 
       ylab = expression("Median Listing Price Per Sq. Ft."),  
       main=paste("Linear model fitted line and confidence\n interval for postal code:",plot_zip, sep=" "),col="cornflowerblue",pch="*",cex=1.25,
       ylim = c(0,350),xlim=c(0, max(mo_range)))
  F <- predict(m1,newdata=data.frame(month=mo_range, postal_code=plot_zip),interval="confidence",level=0.95)
  points(mo_range,F[,1], col="red",typ="l",lwd=1.5)
  points(mo_range,F[,2], col="orange",typ="l",lwd=0.5)
  points(mo_range,F[,3], col="orange",typ="l",lwd=0.5)
}
plot_zip_func('64165')

plot_zip_func('64149')

plot_zip_func('64163')

plot_zip_func('64164')

7: Hypothesis testing of top 4 postal codes

## Hypothesis test a list of postal codes
## and output a formatted table of hypothesis test results
## at 95% and 99% confidence 
hypothesis_test_multiple <- function(post_code_list){
  hypothesis_results_df <- data.frame(matrix(ncol = 3, nrow = 0)) # create a dataframe to store hypothesis test results for export to a table
  colnames(hypothesis_results_df) <- c('Postal Code', 'β > 0 at 95% Confidence', 'β > 0 at 99% Confidence')
  for (post_code in post_code_list){
    test_df <- subset(df,subset=postal_code==post_code)
    test_model <- lm(median_listing_price_per_square_foot~ month, data=test_df) # removed interactive effect between month and postal code because this is just a subset of a single month
    beta1.null <- 0
    beta1.hat <- coef(test_model)[2]
    sigma2.beta1.hat <- vcov(test_model)[2,2]
    t <- (beta1.hat - beta1.null)/sqrt(sigma2.beta1.hat)
    n <- nrow(test_df)
    p <- 2
    f_stat <- pt(t,n-p,lower.tail=FALSE)  # Calculate integral of t-distribution
    print(paste(post_code,'test statistic:', f_stat))
    flag_5 <- FALSE
    flag_1 <- FALSE
    if (f_stat < 0.05){
      flag_5 <- TRUE  
    }
    if (f_stat < 0.01){
      flag_1 <- TRUE
    }
    hypothesis_results_df[nrow(hypothesis_results_df)+1,] <- c(post_code, flag_5, flag_1)
  }
  library(formattable)
  formattable(hypothesis_results_df,
              align=c("c","c","c"),
              list(
    'β > 0 at 95% Confidence' = formatter("span",
                        style = x ~ style(color = ifelse(x, "green", "red")),
                        x ~ icontext(ifelse(x, "ok", "remove"), ifelse(x, "Yes", "No"))),
    'β > 0 at 99% Confidence' = formatter("span",
                              style = x ~ style(color = ifelse(x, "green", "red")),
                              x ~ icontext(ifelse(x, "ok", "remove"), ifelse(x, "Yes", "No")))
  ))
}
hypothesis_test_multiple(c('64165', '64149', '64163', '64164'))
## [1] "64165 test statistic: 0.0152174309475825"
## [1] "64149 test statistic: 0.0144677504425707"
## [1] "64163 test statistic: 9.79917211497219e-25"
## [1] "64164 test statistic: 0.132380308664403"
Postal Code ß > 0 at 95% Confidence ß > 0 at 99% Confidence
64165 Yes No
64149 Yes No
64163 Yes Yes
64164 No No

8: Fit Generalized Additive Model (GAM)

# Fit GAM Model (Model 2)
library(mgcv)
m2 <- gam(median_listing_price_per_square_foot~ s(month)+postal_code, data=df)
summary(m2)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## median_listing_price_per_square_foot ~ s(month) + postal_code
## 
## Parametric coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       108.950     12.734   8.556  < 2e-16 ***
## postal_code64105   93.978     13.103   7.172 9.35e-13 ***
## postal_code64106  106.802     13.103   8.151 5.34e-16 ***
## postal_code64108  112.390     13.103   8.577  < 2e-16 ***
## postal_code64109   -3.389     13.103  -0.259 0.795918    
## postal_code64110   18.464     13.103   1.409 0.158920    
## postal_code64111   69.581     13.103   5.310 1.18e-07 ***
## postal_code64112  157.390     13.103  12.011  < 2e-16 ***
## postal_code64113   99.905     13.103   7.624 3.31e-14 ***
## postal_code64114   35.802     13.103   2.732 0.006328 ** 
## postal_code64116   22.699     13.103   1.732 0.083328 .  
## postal_code64117   -9.713     13.103  -0.741 0.458605    
## postal_code64118    1.022     13.103   0.078 0.937808    
## postal_code64119    3.096     13.103   0.236 0.813236    
## postal_code64120  -62.725     13.421  -4.674 3.09e-06 ***
## postal_code64123  -38.007     13.103  -2.901 0.003753 ** 
## postal_code64124  -39.860     13.103  -3.042 0.002372 ** 
## postal_code64125  -58.733     13.165  -4.461 8.47e-06 ***
## postal_code64126  -62.209     13.138  -4.735 2.30e-06 ***
## postal_code64127  -55.566     13.103  -4.241 2.30e-05 ***
## postal_code64128  -66.154     13.103  -5.049 4.73e-07 ***
## postal_code64129  -32.198     13.103  -2.457 0.014060 *  
## postal_code64130  -60.051     13.103  -4.583 4.78e-06 ***
## postal_code64131    1.846     13.103   0.141 0.887974    
## postal_code64132  -50.110     13.103  -3.824 0.000134 ***
## postal_code64133  -15.198     13.103  -1.160 0.246201    
## postal_code64134  -27.478     13.103  -2.097 0.036082 *  
## postal_code64136   27.259     13.114   2.079 0.037745 *  
## postal_code64137  -14.698     13.103  -1.122 0.262082    
## postal_code64138  -12.404     13.103  -0.947 0.343909    
## postal_code64139   24.831     13.103   1.895 0.058188 .  
## postal_code64141   55.379     28.475   1.945 0.051893 .  
## postal_code64144   -1.115     28.477  -0.039 0.968766    
## postal_code64145   16.140     13.103   1.232 0.218142    
## postal_code64146    6.022     13.103   0.460 0.645829    
## postal_code64149   73.627     13.424   5.485 4.50e-08 ***
## postal_code64151   15.552     13.103   1.187 0.235380    
## postal_code64152   36.537     13.103   2.788 0.005332 ** 
## postal_code64153   34.317     13.103   2.619 0.008867 ** 
## postal_code64154   47.464     13.103   3.622 0.000297 ***
## postal_code64155   35.684     13.103   2.723 0.006503 ** 
## postal_code64156   40.875     13.103   3.119 0.001830 ** 
## postal_code64157   35.375     13.103   2.700 0.006980 ** 
## postal_code64158   41.861     13.103   3.195 0.001415 ** 
## postal_code64161  -17.401     13.407  -1.298 0.194423    
## postal_code64163   35.672     13.119   2.719 0.006587 ** 
## postal_code64164   95.556     13.643   7.004 3.09e-12 ***
## postal_code64165   56.781     13.214   4.297 1.79e-05 ***
## postal_code64166   27.894     14.865   1.877 0.060685 .  
## postal_code64167   47.618     13.556   3.513 0.000450 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##            edf Ref.df     F p-value    
## s(month) 5.717  6.871 150.3  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.817   Deviance explained =   82%
## GCV = 660.18  Scale est. = 647.64    n = 2934

9: Plot error residuals for GAM as a histogram

# Print histogram of Model 2 error residuals
e.hat2 <- residuals(m2)
hist(e.hat2,col="grey",breaks=50,main="Histogram of Error Residuals:\n Model 2 (GAM)",xlab=expression(hat(epsilon)))

10: Plot the GAM fitted lines for each postal code together

### PLOTTING GAM ZIPS TOGETHER
# Print all data and fitted line of gam model for each zip code
plot(df$month, df$median_listing_price_per_square_foot,pch=20,xlim=c(0,68), ylim=c(0,300),
     xlab="Month \n(July 2016 - February 2022, represented as integers)",
     ylab="Median Listing Price Per Square Foot",
     main="GAM fitted lines for each postal code")
for (i in zip_codes){
  E.zip.gam.y <- predict(m2,newdata=data.frame(month=1:128, postal_code=i))
  points(1:128,E.zip.gam.y,typ="l",lwd=.5,col="red")
}

11: Evaluate the Predictive Ability of LM vs GAM

### ITERATIVELY EVALUATE LM VS GAN FOR RANGE OF PREDICTION INTERVALS
evaluate_models <- function(eval_range=(12:36)){
  model_count_results_df <- data.frame(matrix(ncol = 5, nrow = 0)) # create a dataframe to store hypothesis test results for export to a table
  colnames(model_count_results_df) <- c('Prediction Size', 'LM count', 'LM error', 'GAM count', 'GAM error')
  for (p_size in eval_range){
    pred_size <- p_size
    train_size <- 68-pred_size
    train_df <- df[df$month <= train_size,]
    test_df <- df[df$month > train_size,]
    train_m1 <- lm(median_listing_price_per_square_foot~month*postal_code, train_df)
    train_m2 <- gam(median_listing_price_per_square_foot~ s(month)+postal_code, data=train_df)
    counter = 0
    m1_count = 0
    m2_count = 0
    sum_error_m1 = 0
    sum_error_m2 = 0
    for (i in zip_codes){
      if (nrow(df[df$postal_code==i,]) > 1){ # filtering zip codes with <= 1 row of data since they can't be used for prediction
        zip.E.m1.y <- predict(m1,newdata=data.frame(month=seq(train_size+1,68), postal_code=i))
        zip.E.m2.y <- predict(m2,newdata=data.frame(month=seq(train_size+1,68), postal_code=i))
        sum_zip_m1 = 0
        sum_zip_m2 = 0
        zip_counter = 0
        for (j in 1:pred_size){
          y <- df[df$postal_code==i & df$month==j+train_size,]$median_listing_price_per_square_foot
          if (length(y) > 0 && is.na(y)==FALSE){
            diff_m1 <- abs(y - zip.E.m1.y[j])
            sum_error_m1 <- sum_error_m1 + diff_m1
            sum_zip_m1 <- sum_zip_m1 + diff_m1
            diff_m2 <- abs(y - zip.E.m2.y[j])
            sum_error_m2 <- sum_error_m2 + diff_m2
            sum_zip_m2 <- sum_zip_m2 + diff_m2
            zip_counter <- zip_counter + 1
            counter <- counter + 1
          }
        }
        sum_zip_m1 <- sum_zip_m1/zip_counter
        sum_zip_m2 <- sum_zip_m2/zip_counter
        if (sum_zip_m1 < sum_zip_m2){
          m1_count <- m1_count + 1
        }
        else{
          m2_count <- m2_count + 1
        }
    }
    }
    m1_avg_error <- sum_error_m1/counter
    m2_avg_error <- sum_error_m2/counter
    model_count_results_df[nrow(model_count_results_df)+1,] <- c(p_size, m1_count, m1_avg_error, m2_count, m2_avg_error)  
  }
  ## Plot the number predicted better for each model
  par(mar=c(5.1,5.1,4.1,2.1)) # update left margin (2nd value - from 4.1 to 5.1)
  plot(model_count_results_df$`Prediction Size`, model_count_results_df$`LM count`, 'l', lwd=2.0, col='red', xaxt='n', 
       xlab = "Prediction Size \n (number of months)", ylab = "Number of Postal\n Codes Predicted Better",
       main = "LM vs GAM: Number of \n Postal Codes Predicted Better", ylim=c(19,32))
  lines(model_count_results_df$`Prediction Size`, model_count_results_df$`GAM count`, lwd=2.0, col='blue')
  axis(1,at=c(12,18,24,30,36))
  legend(27.71, 32.125, legend=c("Model 1 (LM)", "Model 2 (GAM)"), fill = c("red","blue"))
  
  ## Plot the mean abs error for each model
  par(mar=c(5.1,4.1,4.1,2.1)) # set the plot margins back to default
  plot(model_count_results_df$`Prediction Size`, model_count_results_df$`LM error`, 'l', lwd=2.0, col='red', xaxt='n', 
       xlab = "Prediction Size \n (number of months)", ylab = "Mean Absolute Error",
       main = "LM vs GAM: \n Mean Absolute Error", ylim=c(12.5,17))
  lines(model_count_results_df$`Prediction Size`, model_count_results_df$`GAM error`, lwd=2.0, col='blue')
  axis(1,at=c(12,18,24,30,36))
  legend(27.72, 17.05, legend=c("Model 1 (LM)", "Model 2 (GAM)"), fill = c("red","blue"))
}
evaluate_models()

12: Plot the fit and prediction interval for postal code 64163

# Plot M1 with predictions for specific zip code and 95% prediction interval
plot_zip_func <- function(zip, mo_range=(1:68)){
  plot_zip <- zip
  plot_df <- subset(df, subset=postal_code==plot_zip)
  F <- predict(m1,newdata=data.frame(month=mo_range, postal_code=plot_zip),interval="prediction",level=0.95)
  plot(x = plot_df$month, y = plot_df$median_listing_price_per_square_foot, xlab = "Month", 
       ylab = expression("Median Listing Price Per Sq. Ft."),  
       main=paste("Linear model ", max(mo_range)-68, "-month\n prediction for postal code: ",plot_zip, sep=""),col="cornflowerblue",pch="*",cex=1.25,
       ylim = c(0,max(F[,3])+25), xlim=c(0, max(mo_range)))
  points(mo_range,F[,1], col="red",typ="l",lwd=1.5)
  points(mo_range,F[,2], col="orange",typ="l",lwd=0.5)
  points(mo_range,F[,3], col="orange",typ="l",lwd=0.5)
  
  # Plot the predicted values at the max range
  text(max(mo_range)-1, F[,1][max(mo_range)], signif(F[,1][max(mo_range)],4), pos=1, cex=0.75)
  points(max(mo_range), F[,1][max(mo_range)], pch='*')
  text(max(mo_range)-1, F[,2][max(mo_range)], signif(F[,2][max(mo_range)],4), pos=1, cex=0.75)
  points(max(mo_range), F[,2][max(mo_range)], pch='*')
  text(max(mo_range)-1, F[,3][max(mo_range)], signif(F[,3][max(mo_range)],4), pos=1, cex=0.75)
  points(max(mo_range), F[,3][max(mo_range)], pch='*')
}
plot_zip_func('64163', (1:86))