# This file includes some R codes that are used when conducting the SOA research
# project "Extreme Events for Insurers - Correlation, Models, and Mitigation."
# The code is for eductional purpose only. It is provided 'as is' without warranty of any kind, 
# either express or implied, warranties of fitness for a purpose, or the warranty of non-infringement. 
# Although the authors try their best to test the tool, they make no warranty that
# (1) it will meet your requirements;
# (2) it will be secure or error-free;
# (3) the results that may be obtained from the use of the code will be effective, accurate or reliable;
# (4) any errors in the tool will be corrected.

# The SOA, the project oversight group and the authors assume no responsibility for errors or omissions 
# in the code or related documentation. In no event shall the sponsor and the authors be liable to you 
# or any third parties for any damages of any kind arising out of or in connection with the use of the code. 

# Please contact Kailan Shang at klshang81@gmail.com for any questions about the code.

# R is a open-source statistical software and can be downloaded at www.r-project.org

############################################################################
# This is the Q-Q plot example in Section 3.1
############################################################################
# Loading S&P 500 daily index returns
df<-read.csv("C:/Data/spdaily.csv")
experience<-df$DailyReturn

# number of data
n <- length(experience)
# average daily return
avg <- mean(experience)
# volatility of daily return
std <- sd(experience)
# simulated normal distribution
NormalDistribution<-rnorm(n,avg,std)
# Q-Q plot
qqplot(experience,NormalDistribution,main="Q-Q plot",xlab = "Historical Experience Quantiles", ylab = "Normal Distribution Quantiles",xlim=range(-0.25,0.1),ylim=range(-0.25,0.1))
abline(0,1)


############################################################################
# This is the degree of tail fatness linear regression example in Section 3.1
############################################################################

# S&P 500 monthly index returns are ordered and the empirical distribution is constructed
# Index returns that are lower than mu - 5 * sigma are used for the regression
# mu is the historical average return and sigma is the historical volatility
# x is the negation of a sequence of index returns that are lower than mu - 8 * sigma
# y is the empirical probablity that the index return is lower than mu - 8 * sigma
# The logrithm of x and y are used in the linear regression
# In the report, daily, weekly, and monthly data are used for this analysis
# Both 5-sigma events and 8-sigma events are used for the analysis as well

# import lny and lnx
df<-read.csv("C:/Data/sp1m.csv")
xr = df$x
yr = df$y

# linear regression (ordinary least square)
lm1<-lm(y~x,data=df)

# predicted y
ypredict = predict(lm1)
ypredict = as.vector(ypredict)

# Take the exponential of x, y and ypredict
xtrans = exp(xr)
ytrans = exp(yr)
yptrans = exp(ypredict)
lm1output = cbind(xtrans, ytrans, yptrans)

# Export x, y and ypredict to a csv file
write.csv(lm1output, file = "C:/Data/splm1m.csv")

# linear regression (weighted least square)
lm2 = lm(y~x,data=df,weights=(df$y)^(-2))

# predicted y
xn = data.frame(xr)
ypredict = predict(lm2)
ypredict = as.vector(ypredict)

# Take the exponential of x, y and ypredict
xtrans = exp(xr)
ytrans = exp(yr)
yptrans = exp(ypredict)
lm2output = cbind(xtrans, ytrans, yptrans)

# Export x, y and ypredict to a csv file
write.csv(lm2output, file = "C:/Data/splm2m.csv")


############################################################################
# This is the extrem value theory example in Section 3.2.3
# Annual block minima and point over threshold
############################################################################

# R package "fExtremes" needs to be installed first
# loading R package "fExtremes"
library(fExtremes)

# loading daily S&P index return based on close price
SPDaily <- read.csv("C:/Data/SPDaily.csv")

# Negating the daily returns
SPDaily <- -SPDaily

# Loading the negated minimum daily return for each year
YearM <- read.csv("C:/Data/YearM.csv")

# MLE of xi (shape parameter)
fitmle = gevFit(YearM, type = "mle")
fitmle

# Hill estimator of xi (Using daily returns that are greater
# than the maximum of the annual block minima of the experience data
t <- min(YearM$YearMax)
x0 <- SPDaily[SPDaily$DailyReturn > t, ]
hP <- hillPlot(x0, plottype = "xi", start = 5)
# the xi is hP$y[1]
hP

# Set the seed of random number generation
set.seed(123)
n <- 10000
# average daily return
avg <- mean(SPDaily$DailyReturn)
# volatility of daily return
std <- sd(SPDaily$DailyReturn)
# Simulating negated daily equity returns
r <- rnorm(n, mean = avg, sd = std)

# Function to find the maximum for each column
colMax <- function(data) sapply(data, max, na.rm = TRUE)

# Split the simulated daily returns to have 250 columns 
# (250 business days for each year)
rs <- split(r, ceiling(seq_along(r)/250))
rmax <- colMax(rs)
rmax <- as.vector(rmax)

# MLE of xi (shape parameter)
fitRmle = gevFit(rmax, type = "mle")
fitRmle

# Hill estimator of xi (Using daily returns that are greater
# than the maximum of the annual block maxima of the simulated negated returns
r0 <- r[r>min(rmax)]
hPR <- hillPlot(r0, plottype = "xi", start = 5)
# the xi is hPR$y[1]
hPR

# Plot Figure 3.5
x0 <- YearM$YearMax
# Empirical data
plot(sort(x0), (1:length(x0)/length(x0)),
xlim = c(0, 0.3), ylim = c(0, 1.1), pch = 19,
cex = 0.5, ylab = "Cumulative Probability", xlab = "- Return", main = "Distribution of Annual Minima")
grid()
# 0.0126 is the negated maximum of the annual block minima
q = seq(0.0126, 0.3, by = 0.0001)

# GEV with MLE parameter
lines(q, pgev(q, xi = 0.51, mu = 0.022927611, beta = 0.009585151), col = "steelblue")

# GEV with Hill estimator
lines(q, pgev(q, xi = 0.40, mu = 0.022927611, beta = 0.009585151), col = "green")

# Normal distribution assuming daily returns are i.i.d.
lines(q, (pnorm(q, mean = -0.0029394, sd = 0.009726209)^250-pnorm(0.0126, mean = -0.0029394, sd = 0.009726209)^250)/(1-pnorm(0.0126, mean = -0.074, sd = 0.154)^250), col = "red")
legend("bottomright", c("MLE","Hill","Normal"), lty=c(1,1,1), col=c("steelblue","green","red")) # gives the legend lines the correct color and width

###############
# POT example
###############

# MLE esitmation for the shape parameter for experience data
fitYgpd = gpdFit(SPDaily$DailyReturn, u = quantile(SPDaily$DailyReturn, 0.95), type = "mle")
fitYgpd
fitYgpd = gpdFit(SPDaily$DailyReturn, u = quantile(SPDaily$DailyReturn, 0.99), type = "mle")
fitYgpd
fitYgpd = gpdFit(SPDaily$DailyReturn, u = quantile(SPDaily$DailyReturn, 0.995), type = "mle")
fitYgpd
fitYgpd = gpdFit(SPDaily$DailyReturn, u = quantile(SPDaily$DailyReturn, 0.999), type = "mle")
fitYgpd

# Hill estimator for the shape parameter for experience data
# The percentile can be changed for different threshold
u = quantile(SPDaily$DailyReturn, 0.95)
u
x0 <- SPDaily[SPDaily$DailyReturn > u, ]
hP <- hillPlot(x0, plottype = "xi", start = 5)
# the xi is hP$y[1]
hP


# MLE esitmation of the shape parameter for simulated data based on the normal distribution
fitYgpd = gpdFit(r, u = quantile(r, 0.95), type = "mle")
fitYgpd
fitYgpd = gpdFit(r, u = quantile(r, 0.99), type = "mle")
fitYgpd
fitYgpd = gpdFit(r, u = quantile(r, 0.995), type = "mle")
fitYgpd
fitYgpd = gpdFit(r, u = quantile(r, 0.999), type = "mle")
fitYgpd

# Hill estimator for the shape parameter for simulated data
# The percentile can be changed for different threshold
u = quantile(r, 0.999)
u
x0 <- r[r > u]
hP <- hillPlot(x0, plottype = "xi", start = 5)
# the xi is hP$y[1]
hP

# plot Figure 3.6
fitYgpd = gpdFit(SPDaily$DailyReturn, u = quantile(SPDaily$SPDaily, 0.999), type = "mle")
fitYgpd
u = quantile(SPDaily$DailyReturn, 0.999)
u
x0 <- SPDaily$DailyReturn - u
x0 <- x0[x0>0]
plot(sort(x0), (1:length(x0)/length(x0)),
xlim = c(0, 0.3), ylim = c(0, 1.1), pch = 19,
cex = 0.5, ylab = "Cumulative Probability", xlab = "-Exceedance", main = "Distribution of Exceedance")
grid()
q = seq(0, 0.3, by = 0.0001)
lines(q, pgpd(q, xi = 0.37093266, beta = 0.01444581), col = "steelblue")
lines(q, pgpd(q, xi = 0.2643599, beta = 0.01444581), col = "green")
lines(q, pgpd(q, xi = -0.04272785, beta = 0.00197075), col = "red")
legend("bottomright", c("MLE","Hill","Normal"), lty=c(1,1,1), col=c("steelblue","green","red"))

############################################################################
# This is the ACF example in Section 3.3
############################################################################

# Importing the daily returns
dfd<-read.csv("C:/Data/SPDaily.csv")

# Building a time series
dfd<-ts(dfd)

# Caculate the autocorrelation function
myacf = acf(dfd, xlim=c(1,20))
myacf

# Building a time series using the absolute value of the returns
dfd<-ts(abs(dfd))

# Caculate the autocorrelation function
myacf = acf(dfd, xlim=c(1,20))
myacf

# Importing the daily returns of the extreme period
dfd<-read.csv("C:/March 2014/Research/Contingent Liquidity/Tail Events/S&P Regression/dr.csv")

# Building a time series
dfd<-ts(dfd)

# Caculate the autocorrelation function
myacf = acf(dfd, xlim=c(1,20))
myacf

# Building a time series using the absolute value of the returns
dfd<-ts(abs(dfd))
# Caculate the autocorrelation function
myacf = acf(dfd, xlim=c(1,20))
myacf

############################################################################
# This is the GARCH example in Section 3.3
############################################################################

# R package "fGarch" needs to be installed
# Loading R package "fGarch"
library(fGarch)

# Loading S&P 500 daily index returns
dfd<-read.csv("C:/Data/SPDaily.csv")

# Fitting the data to MA(1) and GARCH(1,1) models
gf <- garchFit(~ arma(0,1) + garch(1,1), data = dfd$DailyReturn, trace = FALSE)

# Output the fitted daily volatility sequence.
write.csv(gf@sigma.t, file = "C:/Data/drsigma.csv")

############################################################################
# This is the copula example in Section 4.3
############################################################################

# The R package "copula" needs to be installed first
# Loading R package "copula"
library(copula)

# Import data. The data has two fields: Quarterly change in 10 year Treasury bond yield and 
# the negated Quarterly change in Moody's Seasoned Baa corporate bond credit spread
# The credit spread is calculated as the corporate bond yield - 10 year TB yield
datar<-read.csv("C:/Data/copuladata.csv",header=TRUE)

# Experience 
n<-length(datar[,1]) 
set.seed(123) 

# Experience distribution
x<- sapply(datar, rank, ties.method = "random") / (n + 1) 

# Draw the experience data
plot(x, xlab="Change in TB Yield", ylab=" - Change in Credit Spread", main = "Empirical Correlation", cex = 0.5)
abline(h = 0.2, v = 0.2, col = "gray60")

# Fitting to a normal copula. The fitted parameter is 0.68
normal.cop <- normalCopula(c(0.6),dim=2,dispstr="un")
fit.normal <-fitCopula(normal.cop,x,method="ml")
fit.normal

#Set up copula object for copula distribution and goodness-of-fit test later 
clayton.cop <- claytonCopula(3, dim=2) 

# Fitting to a clayton copula using maximum likelihood method and using the Kendall's tau.
# The parameter used in the example is the average of the two estimates.
fit.clayton<-fitCopula(clayton.cop,x,method="ml") 
fit.clayton 
fit.clayton<-fitCopula(clayton.cop,x,method="itau") 
fit.clayton 

# Draw the simulated normal copula
n <- 500
normal.cop <- normalCopula(c(0.68),dim=2,dispstr="un")
xn <- rCopula(n, normal.cop)
plot(xn, main="Gaussian Copula (rho = 0.68)", xlab="Change in TB Yield", ylab=" - Change in Credit Spread", cex = 0.5)
abline(h = 0.2, v = 0.2, col = "gray60")

# Draw the simulated Clayton copula
n <- 500
clayton.cop <- claytonCopula(1.73, dim=2) 
cn <- rCopula(n, clayton.cop)
plot(cn, main="Clayton Copula (Theta = 1.73)", xlab="Change in TB Yield", ylab=" - Change in Credit Spread", cex = 0.5)
abline(h = 0.2, v = 0.2, col = "gray60")

############################################################################
# This is the HMM example in Section 6.2
###########################################################################

# R package "HMM" needs to be installed
# Loading R package "HMM"
library(HMM)

# Set the initial HMM. Two states: "H" and "L". Seven outcomes (No. of earthquakes in each year)
# "a"->0, "b"->1, "c"->2, "d"->3, "e"->4, "f"->5, "g"->6
# Initial state probabilities: P(H)=20%, P(L)=80%
# Trasition Probabilities: P(H|H)=70%, P(H|L)=10%, P(L|H)=30%, P(L|L)=90%
# Emission Probabilities (The probability distribution of the outcome based on the hidden state)
# State "H": P(a)=0%, P(b)=0%, P(c)=0%, P(d)=25%, P(e)=25%, P(f)=25%, P(g)=25%
# State "L": P(a)=30%, P(b)=30%, P(c)=20%, P(d)=15%, P(e)=5%, P(f)=0%, P(g)=0%
  
hmm1<-initHMM(c("H","L"),c("a","b","c","d","e","f","g"),startProbs=c(0.2,0.8),transProbs=matrix(c(0.7,0.1,0.3,0.9),2),
emissionProbs=matrix(c(0,.3,.0,.3,.0,.2,.25,.15,.25,.05,.25,0,.25,0),2))

# Read the historical earthquake frequence from 1900 to 2012
# The data has been translated from numerical values (0 ~ 6) to alphabetic values (a ~ g)
Obv<-read.csv("C:/Data/US6Ch.csv",skip=0,header=TRUE)
Obv<-Obv$USE

# Calibrate the HMM using Baum-Welch algorithm
bw=baumWelch(hmm1, Obv, maxIterations=100)
print(bw$hmm)

# Find out the most possible path of state based on the historical data using Viterbi algorithm
viterbi = viterbi(bw$hmm,Obv)
print(viterbi)

# Simulation for one period

# Update the HMM using the calibration result. 
# Two states: "H" and "L". Seven outcomes (No. of earthquakes in each year)
# "a"->0, "b"->1, "c"->2, "d"->3, "e"->4, "f"->5, "g"->6
# Initial state probabilities: P(H)=0%, P(L)=100%
# Trasition Probabilities: P(H|H)=20%, P(H|L)=7.8%, P(L|H)=80%, P(L|L)=92.2%
# Emission Probabilities (The probability distribution of the outcome based on the hidden state)
# State "H": P(a)=0%, P(b)=0%, P(c)=0%, P(d)=69.9%, P(e)=0%, P(f)=20.1%, P(g)=10%
# State "L": P(a)=46.6%, P(b)=30.1%, P(c)=18.4%, P(d)=0%, P(e)=4.9%, P(f)=0%, P(g)=0%

# Simulation assuming the intial state as "L"
hmm2<-initHMM(c("H","L"),c("a","b","c","d","e","f","g"),startProbs=c(0,1),transProbs=matrix(c(0.2,0.078,0.8,0.922),2),
emissionProbs=matrix(c(0,.466,.0,.301,.0,.184,.699,.0,.0,.049,.201,0,.1,0),2))

# No. of simulation
n<-1000

# Set the random number generation seed
set.seed(123)

# Initialize the vectors storing the simulated outcomes (sr) and states (ss)
sr<-rep(NA,n)
ss<-rep(NA,n)

# Simulating the next period earthquake frequency. 
# Two-period simulation is required as the first one is the current time period. 
for(i in 1:n){
a<-simHMM(hmm2,2)
sr[i]<-a$observation[2]
ss[i]<-a$states[2]}

# Output the simulation result
write.csv(sr,"C:/Data/sr.csv")
write.csv(ss,"C:/Data/ss.csv")

# Simulation assuming the intial state as "H"
hmm3<-initHMM(c("H","L"),c("a","b","c","d","e","f","g"),startProbs=c(1,0),transProbs=matrix(c(0.2,0.078,0.8,0.922),2),
emissionProbs=matrix(c(0,.466,.0,.301,.0,.184,.699,.0,.0,.049,.201,0,.1,0),2))

# No. of simulation
n<-1000

#Set the random number generation seed
set.seed(123)

# Initialize the vectors storing the simulated outcomes (sr) and states (ss)
sr1<-rep(NA,n)
ss1<-rep(NA,n)

# Simulating the next period earthquake frequency. 
# Two-period simulation is required as the first one is the current time period. 
for(i in 1:n){
a<-simHMM(hmm3,2)
sr1[i]<-a$observation[2]
ss1[i]<-a$states[2]}

# Output the simulation result
write.csv(sr1,"C:/Data/sr1.csv")
write.csv(ss1,"C:/Data/ss1.csv")