install.packages("gbm")
install.packages("tidyverse")
install.packages("xtable")
install.packages("estimatr")
install.packages("gbm")
install.packages("texreg")
install.packages("caret")
install.packages("doParallel")
install.packages("foreach")
library(gbm)
library(foreach)
#load packages#
library(xtable)
library(estimatr)
library(texreg)
library(tidyverse)
library(readxl)
library(gbm)
#load packages#
library(doParallel)
library(estimatr)
library(texreg)
library(tidyverse)
library(readxl)
library(caret)
library(foreach)
#load packages#
library(xtable)
library(estimatr)
library(texreg)
library(tidyverse)
library(readxl)
plm.localpoly.pred=function(y,X,W,X.new,W.new,h,p=1,kernel.fun="Epanechnikov",cl=1) {
library(foreach)
n=length(y)
if(kernel.fun=="Epanechnikov") {
K=function(u) {3/4*(1-u^2)*(abs(u)<=1)}
}
localpoly.mean=function(w,G) {
D=W-matrix(rep(w,each=n),nrow=n)
omega=apply(Matrix::Matrix(K(t(D)/h),sparse=T),MARGIN=2,FUN=prod)
local.ind=which(omega!=0)
Z=rep(1,length(local.ind))
if(p>0) {
Z=cbind(Z,poly(D[local.ind,,drop=F],degree=p,raw=T,simple=T))
}
ZW=t(Z*omega[local.ind])
Proj=(solve(ZW%*%Z)%*%ZW)[1,]
m=colSums(G[local.ind,,drop=F]*Proj)
return(m)
}
if(cl>1) {
doParallel::registerDoParallel(cl)
M=foreach(i=1:n,.combine=rbind) %dopar% {localpoly.mean(w=W[i,],G=cbind(y,X))}
doParallel::stopImplicitCluster()
} else {
M=foreach(i=1:n,.combine=rbind) %do% {localpoly.mean(w=W[i,],G=cbind(y,X))}
}
y_tilde=y-M[,1];X_tilde=X-M[,-1]
beta_hat=as.vector(solve(t(X_tilde)%*%X_tilde)%*%t(X_tilde)%*%y_tilde)
gamma_in=y-X%*%beta_hat
if(cl>1) {
doParallel::registerDoParallel(cl)
gamma_out=foreach(i=1:nrow(W.new),.combine=c) %dopar% {localpoly.mean(w=W.new[i,],G=gamma_in)}
doParallel::stopImplicitCluster()
} else {
gamma_out=foreach(i=1:nrow(W.new),.combine=c) %do% {localpoly.mean(w=W.new[i,],G=gamma_in)}
}
y_out=as.vector(X.new%*%beta_hat)+gamma_out
return(y_out)
}
plm.localpoly.tune.cv=function(y,X,W,h.grid,p.grid,fold=4,cl=1) {
n=length(y)
tuneGrid=data.frame(p=rep(p.grid,each=nrow(h.grid)),h=h.grid)
group=caret::createFolds(1:n,k=fold)
plm.localpoly.cv=function(h,p) {
y_hat=rep(NA,n)
for (k in 1:fold) {
y_hat[group[[k]]]=plm.localpoly.pred(y=y[-group[[k]]],X=X[-group[[k]],],W=W[-group[[k]],],
X.new=X[group[[k]],],W.new=W[group[[k]],],h=h,p=p,cl=cl)
}
e2=(y-y_hat)^2
R2=1-sum(e2)/sum((y-mean(y))^2)
RMSE=sqrt(mean(e2))
return(c(R2=R2,RMSE=RMSE))
}
result=NULL
for (i in 1:nrow(tuneGrid)) {
cat(paste("Group",i,"starting tuning at",Sys.time()),"\n")
result=rbind(result,plm.localpoly.cv(h=unlist(tuneGrid[i,-1]),p=tuneGrid$p[i]))
}
result=cbind(tuneGrid,result)
bestTune=which.max(result$R2)
h.best=unlist(tuneGrid[bestTune,-1]);names(h.best)=NULL
p.best=tuneGrid$p[bestTune]
return(list(h.best=h.best,p.best=p.best,Performance=result))
}
plm.localpoly.tune.oos=function(y,X,W,h.grid,p.grid,valid.ind,cl=1) {
tuneGrid=data.frame(p=rep(p.grid,each=nrow(h.grid)),h=h.grid)
plm.localpoly.oos=function(h,p) {
y_hat=plm.localpoly.pred(y=y[-valid.ind],X=X[-valid.ind,],W=W[-valid.ind,],
X.new=X[valid.ind,],W.new=W[valid.ind,],h=h,p=p,cl=cl)
e2=(y[valid.ind]-y_hat)^2
R2=1-sum(e2)/sum((y[valid.ind]-mean(y[valid.ind]))^2)
RMSE=sqrt(mean(e2))
return(c(R2=R2,RMSE=RMSE))
}
result=NULL
for (i in 1:nrow(tuneGrid)) {
cat(paste("Group",i,"starting tuning at",Sys.time()),"\n")
result=rbind(result,plm.localpoly.oos(h=unlist(tuneGrid[i,-1]),p=tuneGrid$p[i]))
}
result=cbind(tuneGrid,result)
bestTune=which.max(result$R2)
h.best=unlist(tuneGrid[bestTune,-1]);names(h.best)=NULL
p.best=tuneGrid$p[bestTune]
return(list(h.best=h.best,p.best=p.best,Performance=result))
}
dat <- read_csv("C:/Users/jizha/OneDrive/Desktop/london_house_price1108data-2011-202240%-18m.csv")
dat <- read_csv("C:/Users/jizha/Desktop/london_house_price1108data-2011-202240%-18m.csv")
View(dat)
df<-dat
df <- df %>%
rename(
average_price =price,
price = price1
)
lianjia <-df
cl=24
library(caret)
# Training sample
set.seed(2021)
train_ind=createDataPartition(1:nrow(lianjia),p=0.7)$Resample1
lianjia.SNP=list()
lianjia.SNP$X=lianjia[,c("square","numberRoom")]
lianjia.SNP$X=cbind(lianjia.SNP$X,predict(dummyVars(~buildingType,data=lianjia),
newdata=lianjia)[,-1])
lianjia.SNP$X=cbind(lianjia.SNP$X,
predict(dummyVars(~current_energy_rating,data=lianjia),newdata=lianjia)[,-1])
lianjia.SNP$X=cbind(lianjia.SNP$X,lianjia[,c("age","energy_consumption_current","heating_cost_potential","co2_emissions_current")])
lianjia.SNP$X=cbind(lianjia.SNP$X,predict(dummyVars(~post_code,data=lianjia),newdata=lianjia)[,-1])
lianjia.SNP$X=cbind(lianjia.SNP$X,communityAverage=lianjia$communityAverage)
lianjia.SNP$X=cbind(lianjia.SNP$X,employmentAverage=lianjia$employment)
lianjia.SNP$X=cbind(lianjia.SNP$X,communityAverage=lianjia$communityAverage)
lianjia.SNP$X=cbind(lianjia.SNP$X,median_incomeavg=lianjia$median_income)
lianjia.SNP$X=cbind(lianjia.SNP$X,dist=lianjia$dist.center)
lianjia.SNP$X=as.matrix(lianjia.SNP$X)
lianjia.SNP$W=as.matrix(lianjia[,c("Lng","Lat")])
cl=24
library(caret)
# Training sample
set.seed(123)
train_ind=createDataPartition(1:nrow(lianjia),p=0.7)$Resample1
# OLS
lianjia <-df
cl=24
formula.tree.noc=price~
square+numberRoom+
+buildingType+
age+energy_consumption_current+heating_cost_potential +co2_emissions_current+
post_code+current_energy_rating+tradeYear+
communityAverage+employment+median_income
# Tree formula with coordinates
formula.tree=price~
square+livingRoom+drawingRoom+kitchen+bathRoom+
floor_type+floor_total+elevator+ladderRatio+
renovationCondition+buildingType+buildingStructure+
age+DOM+followers+fiveYearsProperty+
subway+district+Lng+Lat+tradeYear+
communityAverage
#############
formula.tree=price~
square+numberRoom+
+buildingType+
age+energy_consumption_current+heating_cost_potential +co2_emissions_current+
post_code+current_energy_rating+tradeYear+
communityAverage+employment+median_income+Lat+Lng+dist.center
formula.tree.noc=price~
square+numberRoom+
+buildingType+
age+energy_consumption_current+heating_cost_potential +co2_emissions_current+
post_code+current_energy_rating+tradeYear+
communityAverage+employment+median_income+dist.center
cat(paste("Starting GBM with coordinates at",Sys.time()),"\n")
doParallel::registerDoParallel(cl)
model.GBM=train(formula.tree,data=lianjia[train_ind,],
method="gbm",distribution="gaussian",
tuneGrid=data.frame(n.trees=10000,
interaction.depth=20,
shrinkage=0.005,
n.minobsinnode=20),
verbose=F)
pred.GBM=predict(model.GBM,newdata=lianjia[-train_ind,])
doParallel::stopImplicitCluster()
