#----------------------------------SUPPLEMENTARY DATA------------------------------------#
# Name of Authors: CHINEDU GEORGE AGOKEI; BOMONYO JESSICA AFA
#----------------------------------------------------------------------------------------#

# Loading libraries

library(dplyr)
library(combinat)

# Importing dataset 

# Read the full listing, narrow it to 2010 Honda Accords, then draw six of
# those rows at random; `car` is the small sample used by the later sections.
# NOTE(review): no set.seed() is visible before sample_n(), so the six rows
# (and every result derived from `car`) differ between runs - confirm that
# this is intended.
car1 <- read.csv("car_price.csv", header = TRUE)
car2 <- filter(car1, year == 2010, brand == "honda", model == "accord")
car <- sample_n(car2, size = 6)

#----------------------------------------------------------------------------------------
#Visualization and Regression model of Original data

# Copy the columns of the full data set into standalone vectors.
car_price <- c(car1$price)
mileage <- c(car1$odometres)
year <- c(car1$year)

# Scatter plots for the full data: price against mileage, then price
# against year.
plot(x = mileage, y = car_price)
plot(x = year, y = car_price)

# Distribution of price in the full data.
hist(car1$price)

# Linear model of price on mileage and year for the full data.
# NOTE(review): `car_price` and `mileage` are the vectors created above,
# presumably not columns of `car1`; `year` exists both as a vector and as a
# column of `car1` with the same values.
lin_model <- lm(formula = car_price ~ mileage + year, data = car1)
summary(lin_model)

#---------------------------------------------------------------------------------------
# Visualization of sampled data

# Standalone vectors for the sampled cars; the sections that follow use
# `price` and `km` directly. `year` replaces the full-data vector of the
# same name created earlier.
price <- c(car$price)
km <- c(car$odometres)
year <- c(car$year)

# Price against mileage for the sample; the relation looks non-linear.
plot(x = km, y = price)

#----------------------------------------------------------------------------------------
# Statistical Analysis

#--------Using Pearson method-----------# 

# Permutation test of the Pearson correlation between price and km: the
# observed correlation is compared with the correlation obtained for each
# rearrangement of km returned by permn().
sttrue1 <- cor(price, km, method = "pearson") # pearson is cor()'s default
n <- length(km)
nr <- fact(n) # number of rearrangements to be examined
d <- permn(km)

# Correlation of price with every rearrangement of km.
st <- vapply(
  d,
  function(km_perm) cor(km_perm, price, method = "pearson"),
  numeric(1)
)

# Count the rearrangements whose correlation strictly exceeds the observed
# one.
# NOTE(review): this is an upper-tail test with a strict `>`, so the observed
# arrangement itself is not counted and the p-value can be 0 - confirm that
# this is the intended definition.
cnt <- sum(st > sttrue1)
p1 <- cnt / nr # pvalue
p1

#-------Using Spearman method----------#

# Permutation test of the Spearman correlation between price and km: the
# observed correlation is compared with the correlation obtained for each
# rearrangement of km returned by permn().
sttrue2 <- cor(price, km, method = "spearman")
n <- length(km)
nr <- fact(n) # number of rearrangements to be examined
d <- permn(km)

# Correlation of price with every rearrangement of km. `st` is kept because
# the histogram drawn after this test reads it.
st <- vapply(
  d,
  function(km_perm) cor(km_perm, price, method = "spearman"),
  numeric(1)
)

# Count the rearrangements whose correlation strictly exceeds the observed
# one.
# NOTE(review): this is an upper-tail test with a strict `>`, so the observed
# arrangement itself is not counted and the p-value can be 0 - confirm that
# this is the intended definition.
cnt <- sum(st > sttrue2)
p2 <- cnt / nr # pvalue
p2

# Histogram of the permutation correlations currently stored in `st`, with
# the observed Pearson (blue) and Spearman (red) correlations marked as
# vertical lines.
hist(st)
abline(v = c(sttrue1, sttrue2), col = c("blue", "red"))
legend(
  x = 0.62, y = 17,
  legend = c("Pearson", "Spearman"),
  col = c("blue", "red"),
  lty = 1, cex = 0.8, box.lty = 0
)
#p1 equals to p2


#------------------------------------------------------------------------------------------
# Using Linear Regression - relationship between price and km (mileage)
# For sampled data

# Simple linear regression of price on mileage for the sampled cars, followed
# by its coefficient table and fit statistics.
model <- lm(formula = price ~ km, data = car)
summary(model)

#------------------------------------------------------------------------------------------
# Permutation Test
# Exact Test--Mileage (Kilometer-KM)

# Exact permutation test for mileage: the observed correlation between price
# and km (cor()'s default method) is compared with the correlation obtained
# for each rearrangement of km returned by permn().
sttrue3 <- cor(price, km)
n <- length(km)
nr0 <- fact(n) # number of arrangements
d <- permn(km)

# Correlation of price with every rearrangement of km.
st <- vapply(d, function(km_perm) cor(km_perm, price), numeric(1))

# Count the rearrangements whose correlation strictly exceeds the observed
# one.
# NOTE(review): this is an upper-tail test with a strict `>`, so the observed
# arrangement itself is not counted and the p-value can be 0 - confirm that
# this is the intended definition.
cnt <- sum(st > sttrue3)
pkm <- cnt / nr0 # pvalue KM
pkm

#------------------------------------------------------------------------------------------
# Calculating Confidence Interval for Mean and Standard Deviation

################mean#############

# Bootstrap confidence intervals for the mean price of the sampled cars.
hist(car$price)
n <- length(car$price)
# Number of bootstrap samples.
nsim <- 1000
# Each replicate resamples n prices with replacement and stores their mean.
stat <- vapply(
  seq_len(nsim),
  function(i) mean(sample(car$price, n, replace = TRUE)),
  numeric(1)
)

#--------percentile method-----------#

quantile(stat, c(0.025, 0.975))
#not symmetric, percentile method is not good in this case

#---------simple method--------------#

# Reflect the bootstrap quantiles around the sample mean. The upper quantile
# gives the lower bound and vice versa, so the bounds are labelled explicitly
# instead of carrying the quantile names.
mean_basic_ci <- 2 * mean(car$price) -
  quantile(stat, c(0.975, 0.025), names = FALSE)
names(mean_basic_ci) <- c("lower", "upper")
mean_basic_ci

################sd#############

# Bootstrap confidence intervals for the standard deviation of the sampled
# prices.
hist(car$price)
n <- length(car$price)
# Number of bootstrap samples.
nsim <- 1000
# Each replicate resamples n prices with replacement and stores their
# standard deviation.
stat <- vapply(
  seq_len(nsim),
  function(i) sd(sample(car$price, n, replace = TRUE)),
  numeric(1)
)

#--------percentile method---------#

quantile(stat, c(0.025, 0.975))
#not symmetric, percentile method is not good in this case

#--------simple method-------------#

# Reflect the bootstrap quantiles around the sample standard deviation. The
# upper quantile gives the lower bound and vice versa, so the bounds are
# labelled explicitly instead of carrying the quantile names.
sd_basic_ci <- 2 * sd(car$price) -
  quantile(stat, c(0.975, 0.025), names = FALSE)
names(sd_basic_ci) <- c("lower", "upper")
sd_basic_ci

#------------------------------------------------------------------------------------------
# Price Prediction based on given mileage (in Km) using CI
# Methods used - Percentile and Simple methods

# Bootstrap of the predicted price at a given mileage, using the reciprocal
# model price = b1 / km fitted to resampled (km, price) pairs.
e <- c(car$odometres)
f <- c(car$price)
plot(e, f) # plot to see the relation between km and price, non-linear
plot(x = 1 / e, y = f) # price against 1/km, motivating price = b1 / km
# Straight-line fit of price on km, shown for comparison with the reciprocal
# model used in the bootstrap.
ajust <- lm(f ~ e)
summary(ajust)

#--------percentile method------------#

n <- length(e)
data <- cbind(e, f)
# Mileage (km) at which the price is predicted.
pred_km <- 20000

# Reference fit of the reciprocal model on the original sample. `theta` and
# `sdtheta` are its b1 estimate and standard error, so they describe the same
# parameter that the bootstrap replicates estimate.
ajust_nl <- nls(f ~ b1 * (1 / e), start = list(b1 = 1))
coef_nl <- summary(ajust_nl)$coefficients
theta <- coef_nl["b1", "Estimate"]
sdtheta <- coef_nl["b1", "Std. Error"]

nb <- 1000
z <- seq_len(n)
tb <- numeric(nb)
predb <- numeric(nb)
thetab <- numeric(nb)
sdthetab <- numeric(nb)
# NOTE(review): nls() may fail to converge on a resample that the model fits
# exactly (e.g. the same row drawn n times) - confirm whether such replicates
# need handling.
for (i in seq_len(nb)) {
  # Resample row indices with replacement and refit the reciprocal model.
  zb <- sample(z, n, replace = TRUE)
  ajustb <- nls(data[zb, 2] ~ b1 * (1 / data[zb, 1]), start = list(b1 = 1))
  coef_b <- summary(ajustb)$coefficients
  thetab[i] <- coef_b["b1", "Estimate"]
  sdthetab[i] <- coef_b["b1", "Std. Error"]
  # Predicted price at pred_km for this replicate.
  predb[i] <- thetab[i] / pred_km
  # Studentized deviation of the replicate estimate from the reference fit.
  tb[i] <- (thetab[i] - theta) / sdthetab[i]
}
quantile(predb, c(0.025, 0.975))


#--------simple method----------------#

# Simple (reflected) interval for the predicted price at 20000 km: twice the
# point prediction from the reciprocal model price = b1 / km on the original
# sample, minus the bootstrap quantiles of the replicate predictions in
# `predb`. The upper quantile gives the lower bound and vice versa, so the
# bounds are labelled explicitly.
pred_hat <- coef(nls(f ~ b1 * (1 / e), start = list(b1 = 1)))[["b1"]] / 20000
pred_basic_ci <- 2 * pred_hat -
  quantile(predb, c(0.975, 0.025), names = FALSE)
names(pred_basic_ci) <- c("lower", "upper")
# Price cannot be negative, so a bound below zero is clipped to zero.
pmax(pred_basic_ci, 0)

#----------------------------------------------------------------------------------------