#----------------------------------SUPPLEMENTARY DATA------------------------------------#
# Name of Authors: CHINEDU GEORGE AGOKEI; BOMONYO JESSICA AFA
#----------------------------------------------------------------------------------------#
# Script outline:
#   1. Load the car price data and draw a sample of six 2010 Honda Accords.
#   2. Plot and regress price on mileage/year for the full data set.
#   3. Exact permutation tests of the price ~ mileage correlation (sample).
#   4. Bootstrap confidence intervals for the mean and sd of price (sample).
#   5. Bootstrap confidence interval for the predicted price at a given mileage.

# Loading libraries
library(dplyr)
library(combinat)

#----------------------------------------------------------------------------------------
# Helpers

# Exact permutation test for the correlation between `x` and `y`.
# Every rearrangement of `x` is enumerated with combinat::permn(), so this is
# only practical for very small samples (n! correlations are computed).
# Returns a list with:
#   observed - correlation of the data as observed
#   stats    - correlation for each rearrangement of `x`
#   p_value  - share of rearrangements whose correlation is strictly greater
#              than the observed one
# NOTE(review): the comparison is strict (`>`) and upper-tailed, as in the
# original analysis; confirm that this is the intended alternative, since the
# observed arrangement itself is therefore not counted.
perm_cor_test <- function(x, y, method = "pearson") {
  observed <- cor(y, x, method = method)
  stats <- vapply(
    permn(x),
    function(px) cor(px, y, method = method),
    numeric(1)
  )
  list(observed = observed, stats = stats, p_value = mean(stats > observed))
}

# Nonparametric bootstrap of a scalar statistic: resample `x` with replacement
# `nsim` times and evaluate `stat_fn` on each resample.
boot_stat <- function(x, stat_fn, nsim = 1000) {
  n <- length(x)
  vapply(
    seq_len(nsim),
    function(i) stat_fn(sample(x, n, replace = TRUE)),
    numeric(1)
  )
}

#----------------------------------------------------------------------------------------
# Importing dataset
car1 <- read.csv("car_price.csv", header = TRUE)
car2 <- filter(car1, year == 2010, brand == "honda", model == "accord")
# No seed is set, so the six sampled cars (and every result that depends on
# them or on the bootstrap draws below) change from run to run.
car <- sample_n(car2, size = 6)

#----------------------------------------------------------------------------------------
# Visualization and Regression model of Original data
car_price <- car1$price
mileage <- car1$odometres
year <- car1$year

plot(x = mileage, y = car_price) # Original plot of relationship between price and mileage
plot(x = year, y = car_price)    # Original plot of relationship between price and year
hist(car1$price)

lin_model <- lm(car_price ~ mileage + year, data = car1)
summary(lin_model)

#----------------------------------------------------------------------------------------
# Visualization of sampled data
price <- car$price
km <- car$odometres
year <- car$year

plot(x = km, y = price) # plot to see the relation between km and price, non-linear

#----------------------------------------------------------------------------------------
# Statistical Analysis

#--------Using Pearson method-----------#
pearson_test <- perm_cor_test(km, price, method = "pearson") # pearson = default
sttrue1 <- pearson_test$observed
p1 <- pearson_test$p_value # pvalue
p1

#-------Using Spearman method----------#
spearman_test <- perm_cor_test(km, price, method = "spearman")
sttrue2 <- spearman_test$observed
p2 <- spearman_test$p_value # pvalue
p2

# Histogram of the Spearman permutation distribution, with both observed
# correlations marked on it.
st <- spearman_test$stats
hist(st)
abline(v = sttrue1, col = "blue")
abline(v = sttrue2, col = "red")
legend(0.62, 17,
  legend = c("Pearson", "Spearman"),
  col = c("blue", "red"), lty = 1, cex = 0.8, box.lty = 0
)
# p1 equals to p2

#------------------------------------------------------------------------------------------
# Using Linear Regression - relationship with price and km (mileage)
# For sampled data
model <- lm(price ~ km, data = car)
summary(model)

#------------------------------------------------------------------------------------------
# Permutation Test
# Exact Test--Mileage (Kilometer-KM)
km_test <- perm_cor_test(km, price)
sttrue3 <- km_test$observed
st <- km_test$stats
pkm <- km_test$p_value # pvalue KM
pkm

#------------------------------------------------------------------------------------------
# Calculating Confidence Interval for Mean and Standard Deviation
nsim <- 1000 # number of bootstrap samples

################mean#############
hist(car$price)
stat <- boot_stat(car$price, mean, nsim)

#--------percentile method-----------#
quantile(stat, c(0.025, 0.975)) # not symmetric, percentile method is not good in this case

#---------simple method--------------#
2 * mean(car$price) - quantile(stat, 0.975)
2 * mean(car$price) - quantile(stat, 0.025)

################sd#############
hist(car$price)
stat <- boot_stat(car$price, sd, nsim)

#--------percentile method---------#
quantile(stat, c(0.025, 0.975)) # not symmetric, percentile method is not good in this case

#--------simple method-------------#
2 * sd(car$price) - quantile(stat, 0.975)
2 * sd(car$price) - quantile(stat, 0.025)

#------------------------------------------------------------------------------------------
# Price Prediction based on given mileage (in Km) using CI
# Methods used - Percentile and Simple methods
e <- car$odometres
f <- car$price

plot(e, f)             # plot to see the relation between km and price, non-linear
plot(x = 1 / e, y = f) # consider use function: price = b1 / km

# Straight-line fit of price on mileage, reported for reference.
ajust <- lm(f ~ e)
summary(ajust)

# Non-linear regression model price = b1 / km, starting value b1 = 1.
# theta and sdtheta are the estimate of b1 and its standard error from this
# one-parameter fit (the lm fit above has two coefficients, so indexing its
# coefficient table with [1] and [2] gave the intercept and the slope).
ajust_inv <- nls(f ~ b1 * (1 / e), start = list(b1 = 1))
coef_inv <- summary(ajust_inv)$coefficients
theta <- coef_inv["b1", "Estimate"]
sdtheta <- coef_inv["b1", "Std. Error"]

target_km <- 20000          # mileage at which the price is predicted
pred <- theta / target_km   # point prediction from the full sample

#--------percentile method------------#
n <- length(e)
data <- cbind(e, f)
nb <- 1000
z <- seq_len(n)
thetab <- numeric(nb)   # bootstrap estimates of b1
sdthetab <- numeric(nb) # their standard errors
predb <- numeric(nb)    # bootstrap predictions at target_km
tb <- numeric(nb)       # studentized bootstrap statistics

# NOTE(review): a resample made of one repeated row gives a zero-residual fit,
# which the nls() documentation says it is not designed for - confirm whether
# such draws need to be handled.
for (i in seq_len(nb)) {
  zb <- sample(z, n, replace = TRUE) # resample rows (case bootstrap)
  ajustb <- nls(data[zb, 2] ~ b1 * (1 / data[zb, 1]), start = list(b1 = 1))
  coefb <- summary(ajustb)$coefficients
  thetab[i] <- coefb["b1", "Estimate"]
  sdthetab[i] <- coefb["b1", "Std. Error"]
  predb[i] <- thetab[i] / target_km
  tb[i] <- (thetab[i] - theta) / sdthetab[i]
}
quantile(predb, c(0.025, 0.975))

#--------simple method----------------#
# Centred on twice the full-sample prediction; price cannot be negative, so the
# bounds are truncated at zero.
pmax(2 * pred - quantile(predb, 0.975), 0)
pmax(2 * pred - quantile(predb, 0.025), 0)
#----------------------------------------------------------------------------------------