# Load the data as a data frame
url <- "https://www.dropbox.com/s/57b32h025egu1yw/kc_only_data.csv?dl=1"
df <- read.csv(url)
Drop rows that don’t contain data for “median listing price per square foot”, since that is the response variable of interest. This reduces the number of postal codes included in the analysis from 55 to 50.
Create a new column called ‘month’ that converts the “month_date_yyyymm” field into a sequential integer. This lets the linear model handle changes across year boundaries correctly (for example, if the raw values were treated as numbers, the step from ‘202112’ (December 2021) to ‘202201’ (January 2022) would count as 89 months when it should count as only 1).
Convert the postal code from an integer to the ‘character’ class so that the linear model treats it as a categorical variable.
# Drop rows that don't contain data for the median_listing_price_per_square_foot column
# since it's the response variable I'm looking at, to simplify analysis
# This reduces the data set from 55 postal codes to 50
library(tidyr)
df <- df %>% drop_na(median_listing_price_per_square_foot)
# Update the "yyyymm" format to a sequence of integers from 1 to 68 for each unique month
# so that it can be factored into a linear model as a monotonically increasing parameter, increasing by 1
dates <- sort(c(unique(df$month_date_yyyymm)))
date_dict <- data.frame(row.names=c(dates) , val=seq(1,length(dates)))
update_date <- function(date_orig){
return (date_dict[paste(date_orig),])
}
df$month <- sapply(df$month_date_yyyymm,update_date)
#sort(c(unique(df$month))) # to double check that month conversion worked, this should be a list of integers from 1 to 68
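An equivalent, more compact way to build the same 1-to-68 month index, shown only for reference (a sketch using base R's match(), assuming the same column names as above):
df$month <- match(df$month_date_yyyymm, sort(unique(df$month_date_yyyymm))) # position of each yyyymm value within the sorted unique months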
# Turn postal_code into character format to use as a "class" in model
df$postal_code <- as.character(df$postal_code)
# Get unique zip codes as a sorted list
zip_codes <- sort(c(unique(df$postal_code)))
print(paste('Total number of postal codes included in analysis:', length(zip_codes)))
## [1] "Total number of postal codes included in analysis: 50"
# Generate linear model using 'lm' function
# Creates a model (without an intercept) that estimates the rate of price increase per month for each zip code.
# Each zip code (50 total after filtering) is included as an indicator (0/1) variable, and the model
# takes into account the interaction effect between time and zip code
m1 <- lm(median_listing_price_per_square_foot~ month*postal_code-1, data=df)
summary(m1) # print model parameters
##
## Call:
## lm(formula = median_listing_price_per_square_foot ~ month * postal_code -
## 1, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -100.10 -8.34 -0.98 6.89 789.15
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## month 0.592122 0.629389 0.941 0.34689
## postal_code64102 92.538755 29.804755 3.105 0.00192 **
## postal_code64105 186.042581 6.123576 30.381 < 2e-16 ***
## postal_code64106 218.227831 6.123576 35.637 < 2e-16 ***
## postal_code64108 200.581212 6.123576 32.756 < 2e-16 ***
## postal_code64109 68.906936 6.123576 11.253 < 2e-16 ***
## postal_code64110 104.801141 6.123576 17.114 < 2e-16 ***
## postal_code64111 150.068920 6.123576 24.507 < 2e-16 ***
## postal_code64112 266.773486 6.123576 43.565 < 2e-16 ***
## postal_code64113 194.194030 6.123576 31.713 < 2e-16 ***
## postal_code64114 116.213784 6.123576 18.978 < 2e-16 ***
## postal_code64116 105.338455 6.123576 17.202 < 2e-16 ***
## postal_code64117 62.666374 6.123576 10.234 < 2e-16 ***
## postal_code64118 92.137840 6.123576 15.046 < 2e-16 ***
## postal_code64119 94.748025 6.123576 15.473 < 2e-16 ***
## postal_code64120 27.935892 11.474862 2.435 0.01497 *
## postal_code64123 37.855575 6.123576 6.182 7.24e-10 ***
## postal_code64124 26.221686 6.123576 4.282 1.91e-05 ***
## postal_code64125 20.328068 7.817638 2.600 0.00936 **
## postal_code64126 22.872088 7.073245 3.234 0.00124 **
## postal_code64127 26.091308 6.123576 4.261 2.10e-05 ***
## postal_code64128 13.668569 6.123576 2.232 0.02568 *
## postal_code64129 47.178665 6.123576 7.704 1.80e-14 ***
## postal_code64130 19.585162 6.123576 3.198 0.00140 **
## postal_code64131 75.823968 6.123576 12.382 < 2e-16 ***
## postal_code64132 24.774802 6.123576 4.046 5.35e-05 ***
## postal_code64133 71.113696 6.123576 11.613 < 2e-16 ***
## postal_code64134 54.835382 6.123576 8.955 < 2e-16 ***
## postal_code64136 119.764025 6.383949 18.760 < 2e-16 ***
## postal_code64137 79.374451 6.123576 12.962 < 2e-16 ***
## postal_code64138 78.205004 6.123576 12.771 < 2e-16 ***
## postal_code64139 117.924056 6.123576 19.257 < 2e-16 ***
## postal_code64141 143.157560 27.963588 5.119 3.27e-07 ***
## postal_code64144 85.973316 28.552360 3.011 0.00263 **
## postal_code64145 123.069359 6.123576 20.098 < 2e-16 ***
## postal_code64146 87.804653 6.123576 14.339 < 2e-16 ***
## postal_code64149 124.296847 8.447776 14.714 < 2e-16 ***
## postal_code64151 96.497805 6.123576 15.758 < 2e-16 ***
## postal_code64152 115.763828 6.123576 18.905 < 2e-16 ***
## postal_code64153 136.724759 6.123576 22.328 < 2e-16 ***
## postal_code64154 121.485075 6.123576 19.839 < 2e-16 ***
## postal_code64155 119.387182 6.123576 19.496 < 2e-16 ***
## postal_code64156 120.830992 6.123576 19.732 < 2e-16 ***
## postal_code64157 111.826602 6.123576 18.262 < 2e-16 ***
## postal_code64158 105.385426 6.123576 17.210 < 2e-16 ***
## postal_code64161 90.699739 13.746223 6.598 4.95e-11 ***
## postal_code64163 87.012510 6.626581 13.131 < 2e-16 ***
## postal_code64164 150.262443 16.837901 8.924 < 2e-16 ***
## postal_code64165 93.254748 8.008488 11.644 < 2e-16 ***
## postal_code64166 97.376798 22.975231 4.238 2.32e-05 ***
## postal_code64167 143.404987 9.701940 14.781 < 2e-16 ***
## month:postal_code64105 -0.114243 0.648021 -0.176 0.86007
## month:postal_code64106 -0.675452 0.648021 -1.042 0.29735
## month:postal_code64108 -0.001978 0.648021 -0.003 0.99756
## month:postal_code64109 0.458743 0.648021 0.708 0.47906
## month:postal_code64110 0.051749 0.648021 0.080 0.93636
## month:postal_code64111 0.221311 0.648021 0.342 0.73274
## month:postal_code64112 -0.616247 0.648021 -0.951 0.34170
## month:postal_code64113 -0.178735 0.648021 -0.276 0.78271
## month:postal_code64114 0.223506 0.648021 0.345 0.73019
## month:postal_code64116 0.158937 0.648021 0.245 0.80627
## month:postal_code64117 0.456338 0.648021 0.704 0.48136
## month:postal_code64118 -0.086740 0.648021 -0.134 0.89353
## month:postal_code64119 -0.102295 0.648021 -0.158 0.87458
## month:postal_code64120 -0.034630 0.677456 -0.051 0.95924
## month:postal_code64123 0.355372 0.648021 0.548 0.58346
## month:postal_code64124 0.638878 0.648021 0.986 0.32427
## month:postal_code64125 0.237275 0.655415 0.362 0.71736
## month:postal_code64126 0.085099 0.652076 0.131 0.89618
## month:postal_code64127 0.187414 0.648021 0.289 0.77244
## month:postal_code64128 0.240588 0.648021 0.371 0.71047
## month:postal_code64129 0.253509 0.648021 0.391 0.69568
## month:postal_code64130 0.245989 0.648021 0.380 0.70427
## month:postal_code64131 0.409997 0.648021 0.633 0.52699
## month:postal_code64132 0.383715 0.648021 0.592 0.55381
## month:postal_code64133 0.052494 0.648021 0.081 0.93544
## month:postal_code64134 0.168404 0.648021 0.260 0.79498
## month:postal_code64136 -0.123434 0.649055 -0.190 0.84919
## month:postal_code64137 -0.172456 0.648021 -0.266 0.79016
## month:postal_code64138 -0.072062 0.648021 -0.111 0.91146
## month:postal_code64139 -0.144055 0.648021 -0.222 0.82410
## month:postal_code64141 NA NA NA NA
## month:postal_code64144 NA NA NA NA
## month:postal_code64145 -0.545113 0.648021 -0.841 0.40031
## month:postal_code64146 0.183787 0.648021 0.284 0.77673
## month:postal_code64149 1.178843 0.672885 1.752 0.07990 .
## month:postal_code64151 0.208027 0.648021 0.321 0.74822
## month:postal_code64152 0.257861 0.648021 0.398 0.69072
## month:postal_code64153 -0.414067 0.648021 -0.639 0.52289
## month:postal_code64154 0.408737 0.648021 0.631 0.52826
## month:postal_code64155 0.128113 0.648021 0.198 0.84329
## month:postal_code64156 0.236732 0.648021 0.365 0.71490
## month:postal_code64157 0.338309 0.648021 0.522 0.60167
## month:postal_code64158 0.712989 0.648021 1.100 0.27131
## month:postal_code64161 -0.502090 0.704456 -0.713 0.47607
## month:postal_code64163 1.028456 0.650226 1.582 0.11383
## month:postal_code64164 0.729031 0.707537 1.030 0.30292
## month:postal_code64165 1.335663 0.655779 2.037 0.04177 *
## month:postal_code64166 0.462496 0.784427 0.590 0.55551
## month:postal_code64167 -0.248645 0.679423 -0.366 0.71442
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.97 on 2836 degrees of freedom
## Multiple R-squared: 0.9685, Adjusted R-squared: 0.9674
## F-statistic: 890.3 on 98 and 2836 DF, p-value: < 2.2e-16
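In this parameterization, each postal code's overall monthly slope is the base 'month' coefficient plus that postal code's interaction term. For example, from the estimates above, postal code 64165 appreciates by roughly \({0.592 + 1.336 \approx 1.93}\) dollars per square foot per month, while the base level (postal code 64102) appreciates by the 'month' coefficient alone, about \({0.59}\).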
# Get error residuals from M1 and plot them
e.hat <- residuals(m1)
hist(e.hat,col="grey",breaks=50,main="Histogram of Error Residuals:\n Model 1 (LM)",xlab=expression(hat(epsilon)))
## Get just the slope estimates for each zip code
## (so they can be sorted in descending order to find the largest)
matrix_coef <- summary(m1)$coefficients # Extract coefficients from model in a matrix
m1_estimates <- matrix_coef[,1] # Get just the first column (coefficient estimate)
m1_estimates <- as.data.frame(m1_estimates) # Convert the coefficients into a dataframe for easier manipulation
m1_estimates$name <- rownames(m1_estimates) # Add a column "name" of the label for each coefficient
names(m1_estimates)[1] <- "coef"
rownames(m1_estimates) <- seq(1,length(m1_estimates[,1])) # Reset row names to an integer index
m1_estimates <- m1_estimates[startsWith(m1_estimates$name, "month"),] # Get just the slope coefficients (labeled starting with "month")
base_coef <- m1_estimates[m1_estimates$name == "month",][,1] # get the base slope coefficient from the linear model
m1_estimates[m1_estimates$name == "month",][,2] <- "64102" # update the label for the "base coefficient" row to the actual zip code for comparison
# function to add the base slope estimate to each interaction coefficient, so each row holds that zip code's actual slope
update_slopes_with_base <- function(df_row){
return(df_row[1] + base_coef)
}
# function to update the dataframe label column (to get just the zip code listed)
update_zip_name <- function(df_row){
return(sub("month:postal_code","",df_row[1]))
}
# Update the interaction coefficients by adding the base coefficient value (the base row itself, relabeled "64102" above, is excluded because its name no longer starts with "month")
m1_estimates[startsWith(m1_estimates$name, "month"),]$coef <- lapply(m1_estimates[startsWith(m1_estimates$name, "month"),]$coef,update_slopes_with_base)
# Update the dataframe row labels to be just the zip code
m1_estimates[startsWith(m1_estimates$name, "month"),]$name <- lapply(m1_estimates[startsWith(m1_estimates$name, "month:postal_code"),]$name,update_zip_name)
m1_estimates <- as.data.frame(lapply(m1_estimates, unlist)) # unlist the list-formatted columns back into ordinary vectors
# Sort the coefficient estimates
m1_estimates <- m1_estimates[order(m1_estimates$coef,decreasing=TRUE), ]
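For reference, the same per-postal-code slopes can be pulled more compactly straight from coef(m1). This is a sketch under the same assumptions; note that coef() keeps the two NA interaction terms (postal codes 64141 and 64144) that summary()$coefficients drops, and sort() removes them again:
all_coef <- coef(m1)
interactions <- all_coef[grepl("^month:postal_code", names(all_coef))] # one interaction term per non-base zip code
per_zip_slope <- all_coef["month"] + interactions # zip slope = base month slope + interaction term
names(per_zip_slope) <- sub("month:postal_code", "", names(per_zip_slope)) # keep just the zip code as the label
per_zip_slope <- c("64102" = unname(all_coef["month"]), per_zip_slope) # the base level (64102) keeps the base slope
top_slopes <- head(sort(per_zip_slope, decreasing = TRUE), 10) # top 10 slopes, matching m1_estimates above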
# Plot the top 10 estimated coefficients
library(dplyr)
library(ggplot2)
top_10 <- m1_estimates %>% slice(1:10)
ggplot(top_10, aes(reorder(name, -coef, sum), coef)) + # ggplot2 plot with modified x-axis labels
geom_bar(stat = "identity") +
labs(title="Postal Codes with Highest Home Sales Price Growth",
x="Postal Code", y= "Linear Model Slope Coefficient") +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5), axis.text.x = element_text(angle = 45, vjust = 0.5)) +
geom_text(aes(label = signif(coef, digits = 4)), nudge_y = 0.1, size=3.5)
# Plot M1 with predictions for specific zip code and 95% confidence interval
plot_zip_func <- function(zip, mo_range=(1:68)){
plot_zip <- zip
plot_df <- subset(df, subset=postal_code==plot_zip)
plot(x = plot_df$month, y = plot_df$median_listing_price_per_square_foot, xlab = "Month",
ylab = expression("Median Listing Price Per Sq. Ft."),
main=paste("Linear model fitted line and confidence\n interval for postal code:",plot_zip, sep=" "),col="cornflowerblue",pch="*",cex=1.25,
ylim = c(0,350),xlim=c(0, max(mo_range)))
F <- predict(m1,newdata=data.frame(month=mo_range, postal_code=plot_zip),interval="confidence",level=0.95)
points(mo_range,F[,1], col="red",typ="l",lwd=1.5)
points(mo_range,F[,2], col="orange",typ="l",lwd=0.5)
points(mo_range,F[,3], col="orange",typ="l",lwd=0.5)
}
plot_zip_func('64165')
plot_zip_func('64149')
plot_zip_func('64163')
plot_zip_func('64164')
For each of the top postal codes, perform a hypothesis test, with the null hypothesis:
\({H_0 : \beta_2 = 0}\)
and alternative hypothesis:
\({H_a : \beta_2 > 0}\)
where \({\beta_2}\) is the slope parameter for each postal code
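The test statistic computed in the code below is the usual one-sided t-test on the fitted slope:
\[ t = \frac{\hat{\beta}_2 - 0}{\widehat{\mathrm{se}}(\hat{\beta}_2)}, \qquad \text{p-value} = P\left(T_{n-p} > t\right), \]
where \({n}\) is the number of monthly observations for the postal code, \({p = 2}\) is the number of estimated parameters (intercept and slope), and \({T_{n-p}}\) follows a t-distribution with \({n-p}\) degrees of freedom. The null hypothesis is rejected at 95% confidence when the p-value is below 0.05, and at 99% confidence when it is below 0.01.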
Plot the testing results (for both 95% and 99% confidence) as a formatted table
## Hypothesis test a list of postal codes
## and output a formatted table of hypothesis test results
## at 95% and 99% confidence
hypothesis_test_multiple <- function(post_code_list){
hypothesis_results_df <- data.frame(matrix(ncol = 3, nrow = 0)) # create a dataframe to store hypothesis test results for export to a table
colnames(hypothesis_results_df) <- c('Postal Code', 'β > 0 at 95% Confidence', 'β > 0 at 99% Confidence')
for (post_code in post_code_list){
test_df <- subset(df,subset=postal_code==post_code)
test_model <- lm(median_listing_price_per_square_foot~ month, data=test_df) # no interaction term needed because this subset contains only a single postal code
beta1.null <- 0
beta1.hat <- coef(test_model)[2]
sigma2.beta1.hat <- vcov(test_model)[2,2]
t <- (beta1.hat - beta1.null)/sqrt(sigma2.beta1.hat)
n <- nrow(test_df)
p <- 2 # number of estimated parameters (intercept and slope)
p_val <- pt(t,n-p,lower.tail=FALSE) # one-sided p-value: upper-tail area of the t-distribution with n-p degrees of freedom
print(paste(post_code,'p-value:', p_val))
flag_5 <- FALSE
flag_1 <- FALSE
if (p_val < 0.05){
flag_5 <- TRUE
}
if (p_val < 0.01){
flag_1 <- TRUE
}
hypothesis_results_df[nrow(hypothesis_results_df)+1,] <- c(post_code, flag_5, flag_1)
}
library(formattable)
formattable(hypothesis_results_df,
align=c("c","c","c"),
list(
'β > 0 at 95% Confidence' = formatter("span",
style = x ~ style(color = ifelse(x, "green", "red")),
x ~ icontext(ifelse(x, "ok", "remove"), ifelse(x, "Yes", "No"))),
'β > 0 at 99% Confidence' = formatter("span",
style = x ~ style(color = ifelse(x, "green", "red")),
x ~ icontext(ifelse(x, "ok", "remove"), ifelse(x, "Yes", "No")))
))
}
hypothesis_test_multiple(c('64165', '64149', '64163', '64164'))
## [1] "64165 test statistic: 0.0152174309475825"
## [1] "64149 test statistic: 0.0144677504425707"
## [1] "64163 test statistic: 9.79917211497219e-25"
## [1] "64164 test statistic: 0.132380308664403"
Postal Code | β > 0 at 95% Confidence | β > 0 at 99% Confidence
--- | --- | ---
64165 | Yes | No
64149 | Yes | No
64163 | Yes | Yes
64164 | No | No
# Fit GAM Model (Model 2)
library(mgcv)
m2 <- gam(median_listing_price_per_square_foot~ s(month)+postal_code, data=df)
summary(m2)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## median_listing_price_per_square_foot ~ s(month) + postal_code
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 108.950 12.734 8.556 < 2e-16 ***
## postal_code64105 93.978 13.103 7.172 9.35e-13 ***
## postal_code64106 106.802 13.103 8.151 5.34e-16 ***
## postal_code64108 112.390 13.103 8.577 < 2e-16 ***
## postal_code64109 -3.389 13.103 -0.259 0.795918
## postal_code64110 18.464 13.103 1.409 0.158920
## postal_code64111 69.581 13.103 5.310 1.18e-07 ***
## postal_code64112 157.390 13.103 12.011 < 2e-16 ***
## postal_code64113 99.905 13.103 7.624 3.31e-14 ***
## postal_code64114 35.802 13.103 2.732 0.006328 **
## postal_code64116 22.699 13.103 1.732 0.083328 .
## postal_code64117 -9.713 13.103 -0.741 0.458605
## postal_code64118 1.022 13.103 0.078 0.937808
## postal_code64119 3.096 13.103 0.236 0.813236
## postal_code64120 -62.725 13.421 -4.674 3.09e-06 ***
## postal_code64123 -38.007 13.103 -2.901 0.003753 **
## postal_code64124 -39.860 13.103 -3.042 0.002372 **
## postal_code64125 -58.733 13.165 -4.461 8.47e-06 ***
## postal_code64126 -62.209 13.138 -4.735 2.30e-06 ***
## postal_code64127 -55.566 13.103 -4.241 2.30e-05 ***
## postal_code64128 -66.154 13.103 -5.049 4.73e-07 ***
## postal_code64129 -32.198 13.103 -2.457 0.014060 *
## postal_code64130 -60.051 13.103 -4.583 4.78e-06 ***
## postal_code64131 1.846 13.103 0.141 0.887974
## postal_code64132 -50.110 13.103 -3.824 0.000134 ***
## postal_code64133 -15.198 13.103 -1.160 0.246201
## postal_code64134 -27.478 13.103 -2.097 0.036082 *
## postal_code64136 27.259 13.114 2.079 0.037745 *
## postal_code64137 -14.698 13.103 -1.122 0.262082
## postal_code64138 -12.404 13.103 -0.947 0.343909
## postal_code64139 24.831 13.103 1.895 0.058188 .
## postal_code64141 55.379 28.475 1.945 0.051893 .
## postal_code64144 -1.115 28.477 -0.039 0.968766
## postal_code64145 16.140 13.103 1.232 0.218142
## postal_code64146 6.022 13.103 0.460 0.645829
## postal_code64149 73.627 13.424 5.485 4.50e-08 ***
## postal_code64151 15.552 13.103 1.187 0.235380
## postal_code64152 36.537 13.103 2.788 0.005332 **
## postal_code64153 34.317 13.103 2.619 0.008867 **
## postal_code64154 47.464 13.103 3.622 0.000297 ***
## postal_code64155 35.684 13.103 2.723 0.006503 **
## postal_code64156 40.875 13.103 3.119 0.001830 **
## postal_code64157 35.375 13.103 2.700 0.006980 **
## postal_code64158 41.861 13.103 3.195 0.001415 **
## postal_code64161 -17.401 13.407 -1.298 0.194423
## postal_code64163 35.672 13.119 2.719 0.006587 **
## postal_code64164 95.556 13.643 7.004 3.09e-12 ***
## postal_code64165 56.781 13.214 4.297 1.79e-05 ***
## postal_code64166 27.894 14.865 1.877 0.060685 .
## postal_code64167 47.618 13.556 3.513 0.000450 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(month) 5.717 6.871 150.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.817 Deviance explained = 82%
## GCV = 660.18 Scale est. = 647.64 n = 2934
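For a quick look at the estimated smooth of month itself, mgcv's built-in plot method can also be used (a minimal sketch, output not reproduced here):
plot(m2, shade = TRUE, seWithMean = TRUE) # plot the fitted smooth s(month) with a shaded confidence band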
# Print histogram of Model 2 error residuals
e.hat2 <- residuals(m2)
hist(e.hat2,col="grey",breaks=50,main="Histogram of Error Residuals:\n Model 2 (GAM)",xlab=expression(hat(epsilon)))
### PLOTTING GAM FITS FOR ALL ZIP CODES TOGETHER
# Plot all data points and the GAM fitted line for each zip code
plot(df$month, df$median_listing_price_per_square_foot,pch=20,xlim=c(0,68), ylim=c(0,300),
xlab="Month \n(July 2016 - February 2022, represented as integers)",
ylab="Median Listing Price Per Square Foot",
main="GAM fitted lines for each postal code")
for (i in zip_codes){
E.zip.gam.y <- predict(m2,newdata=data.frame(month=1:68, postal_code=i)) # predict over the 68 observed months
points(1:68,E.zip.gam.y,typ="l",lwd=.5,col="red")
}
### ITERATIVELY EVALUATE LM VS GAM OVER A RANGE OF PREDICTION HORIZONS
evaluate_models <- function(eval_range=(12:36)){
model_count_results_df <- data.frame(matrix(ncol = 5, nrow = 0)) # create a dataframe to store the model comparison results for plotting
colnames(model_count_results_df) <- c('Prediction Size', 'LM count', 'LM error', 'GAM count', 'GAM error')
for (p_size in eval_range){
pred_size <- p_size
train_size <- 68-pred_size
train_df <- df[df$month <= train_size,]
test_df <- df[df$month > train_size,]
train_m1 <- lm(median_listing_price_per_square_foot~month*postal_code, train_df)
train_m2 <- gam(median_listing_price_per_square_foot~ s(month)+postal_code, data=train_df)
counter = 0
m1_count = 0
m2_count = 0
sum_error_m1 = 0
sum_error_m2 = 0
for (i in zip_codes){
if (nrow(train_df[train_df$postal_code==i,]) > 1 && nrow(test_df[test_df$postal_code==i,]) > 0){ # skip zip codes without enough training rows to fit a slope, or with no held-out rows to score
zip.E.m1.y <- predict(train_m1,newdata=data.frame(month=seq(train_size+1,68), postal_code=i)) # predict the held-out months using the models fit on training data only
zip.E.m2.y <- predict(train_m2,newdata=data.frame(month=seq(train_size+1,68), postal_code=i))
sum_zip_m1 = 0
sum_zip_m2 = 0
zip_counter = 0
for (j in 1:pred_size){
y <- df[df$postal_code==i & df$month==j+train_size,]$median_listing_price_per_square_foot
if (length(y) > 0 && !is.na(y)){
diff_m1 <- abs(y - zip.E.m1.y[j])
sum_error_m1 <- sum_error_m1 + diff_m1
sum_zip_m1 <- sum_zip_m1 + diff_m1
diff_m2 <- abs(y - zip.E.m2.y[j])
sum_error_m2 <- sum_error_m2 + diff_m2
sum_zip_m2 <- sum_zip_m2 + diff_m2
zip_counter <- zip_counter + 1
counter <- counter + 1
}
}
sum_zip_m1 <- sum_zip_m1/zip_counter
sum_zip_m2 <- sum_zip_m2/zip_counter
if (sum_zip_m1 < sum_zip_m2){
m1_count <- m1_count + 1
}
else{
m2_count <- m2_count + 1
}
}
}
m1_avg_error <- sum_error_m1/counter
m2_avg_error <- sum_error_m2/counter
model_count_results_df[nrow(model_count_results_df)+1,] <- c(p_size, m1_count, m1_avg_error, m2_count, m2_avg_error)
}
## Plot the number predicted better for each model
par(mar=c(5.1,5.1,4.1,2.1)) # update left margin (2nd value - from 4.1 to 5.1)
plot(model_count_results_df$`Prediction Size`, model_count_results_df$`LM count`, 'l', lwd=2.0, col='red', xaxt='n',
xlab = "Prediction Size \n (number of months)", ylab = "Number of Postal\n Codes Predicted Better",
main = "LM vs GAM: Number of \n Postal Codes Predicted Better", ylim=c(19,32))
lines(model_count_results_df$`Prediction Size`, model_count_results_df$`GAM count`, lwd=2.0, col='blue')
axis(1,at=c(12,18,24,30,36))
legend(27.71, 32.125, legend=c("Model 1 (LM)", "Model 2 (GAM)"), fill = c("red","blue"))
## Plot the mean abs error for each model
par(mar=c(5.1,4.1,4.1,2.1)) # set the plot margins back to default
plot(model_count_results_df$`Prediction Size`, model_count_results_df$`LM error`, 'l', lwd=2.0, col='red', xaxt='n',
xlab = "Prediction Size \n (number of months)", ylab = "Mean Absolute Error",
main = "LM vs GAM: \n Mean Absolute Error", ylim=c(12.5,17))
lines(model_count_results_df$`Prediction Size`, model_count_results_df$`GAM error`, lwd=2.0, col='blue')
axis(1,at=c(12,18,24,30,36))
legend(27.72, 17.05, legend=c("Model 1 (LM)", "Model 2 (GAM)"), fill = c("red","blue"))
}
evaluate_models()
# Plot M1 with predictions for specific zip code and 95% prediction interval
plot_zip_func <- function(zip, mo_range=(1:68)){
plot_zip <- zip
plot_df <- subset(df, subset=postal_code==plot_zip)
F <- predict(m1,newdata=data.frame(month=mo_range, postal_code=plot_zip),interval="prediction",level=0.95)
plot(x = plot_df$month, y = plot_df$median_listing_price_per_square_foot, xlab = "Month",
ylab = expression("Median Listing Price Per Sq. Ft."),
main=paste("Linear model ", max(mo_range)-68, "-month\n prediction for postal code: ",plot_zip, sep=""),col="cornflowerblue",pch="*",cex=1.25,
ylim = c(0,max(F[,3])+25), xlim=c(0, max(mo_range)))
points(mo_range,F[,1], col="red",typ="l",lwd=1.5)
points(mo_range,F[,2], col="orange",typ="l",lwd=0.5)
points(mo_range,F[,3], col="orange",typ="l",lwd=0.5)
# Plot the predicted values at the max range
text(max(mo_range)-1, F[,1][max(mo_range)], signif(F[,1][max(mo_range)],4), pos=1, cex=0.75)
points(max(mo_range), F[,1][max(mo_range)], pch='*')
text(max(mo_range)-1, F[,2][max(mo_range)], signif(F[,2][max(mo_range)],4), pos=1, cex=0.75)
points(max(mo_range), F[,2][max(mo_range)], pch='*')
text(max(mo_range)-1, F[,3][max(mo_range)], signif(F[,3][max(mo_range)],4), pos=1, cex=0.75)
points(max(mo_range), F[,3][max(mo_range)], pch='*')
}
plot_zip_func('64163', (1:86))
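The end-of-horizon numbers annotated on the plot can also be read directly from the model; for example, for postal code 64163 at month 86 (a sketch using the already-fitted m1):
predict(m1, newdata = data.frame(month = 86, postal_code = '64163'), interval = "prediction", level = 0.95) # fitted value plus lower/upper 95% prediction bounds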