Forecasting Mortality Rates with a Stochastic PCA–GEE Model: An Application to Solvency Capital Requirements

Authors

Reza Dastranj

Martin Kolář

Published

October 26, 2025

Load packages

First, load the packages to be used

library(dplyr)          # Data manipulation and transformation
library(ggplot2)        # Data visualization
library(tidyverse)      # Collection of tidyverse packages
library(forecast)       # Time series forecasting
library(geepack)        # Generalized Estimating Equation Package

Dataset

Importing the dataset containing log(q_{cgxt}) values for males and females from AUT (Austria) and CZE (Czech Republic)

List of Dataset Filenames

# log(q_{cgxt}) values for the years 1991 to 2021
load("dat.RData")
# log(q_{cgxt}) values for the years 1991 to 2010
load("dattrn.RData")
M0 <- dat
MB0 <- do.call(cbind, M0)


t <- 20

M <- dattrn
MB <- do.call(cbind, M)

Construction of ASDRs DataFrame for Training Set

# Initialize vectors for training set

t <- 20


# Initialize an empty list 'kcList' to store the individual elements of kc
kcList <- list()

# Iterate through specific pairs of matrices in the list 'M'
# This loop runs for i = 1 and i = 3, thus iterating through the 1st-2nd and 3rd-4th matrices in M
for (i in c(1, 3)) {
  
  # Extract the matrices corresponding to the current pair
  matrix1 <- M[[i]]       # Get the i-th matrix from M
  matrix2 <- M[[i + 1]]   # Get the (i+1)-th matrix from M
  
  # Perform principal component analysis (PCA) on the first 15 columns of the combined matrices
  # Then repeat the first principal component (PC1) 15 times
  result1 <- rep(
    prcomp(cbind(matrix1[, 1:15], matrix2[, 1:15]), center = FALSE, scale. = FALSE)$x[, 1], 
    times = 15
  )
  
  # Perform PCA on the columns 16 to 41 of the combined matrices
  # Then repeat the first principal component (PC1) 26 times
  result2 <- rep(
    prcomp(cbind(matrix1[, 16:41], matrix2[, 16:41]), center = FALSE, scale. = FALSE)$x[, 1], 
    times = 26
  )
  
  # Perform PCA on the columns 42 to 91 of the combined matrices
  # Then repeat the first principal component (PC1) 40 times
  result3 <- rep(
    prcomp(cbind(matrix1[, 42:91], matrix2[, 42:91]), center = FALSE, scale. = FALSE)$x[, 1], 
    times = 50
  )
  
  # Combine the repeated principal component results into a single vector
  combined_results <- c(result1, result2, result3)
  
  # Repeat the combined results 2 times and append them to 'kcList'
  kcList <- c(kcList, rep(combined_results, 2))
}

# Flatten 'kcList' into a single vector 'kc1'
kc1 <- unlist(kcList)

# Create a new vector 'kc2' by squaring each element in 'kc1'
kc2 <- kc1^2



# Define age groups for the 'yngold0' vector
yngold0 <- c("Group[0,14]", "Group[15,40]", "Group[41,90]")

# Create a repeated sequence for 'yngold' with specified age groups
# The repetition pattern is based on the number of observations (t) in each age group
yngold <- rep(
  c(
    rep("Group[0,14]", t * 15),   # Repeat "Group[0,14]" t * 15 times
    rep("Group[15,40]", t * 26),  # Repeat "Group[15,40]" t * 26 times
    rep("Group[41,90]", t * 50)   # Repeat "Group[41,90]" t * 40 times
  ), 
  4  # Repeat the entire pattern 4 times
)

# Create a vector for 'gender' with alternating repetitions of "Female" and "Male"
gender <- rep(c("Female", "Male"), each = t * 91, times = 2)

# Create a vector for 'Country' with repetitions of "AUT" and "CZE"
Country <- rep(c("AUT", "CZE"), each = t * 91 * 2)

# Initialize vectors for the training set
# 'year' is repeated for each observation in the dataset
year <- rep(1991:2010, times = 4 * 91)

# Define levels for 'age' factor from 0 to 90
age_levels <- factor(0:90)

# Create 'age' vector by repeating age values across all observations
age <- rep(0:90, each = t, times = 4)

# Calculate 'cohort' by subtracting age from year
cohort <- year - age

# Combine all vectors into a data frame called 'ASDRs' for the training set
ASDRs <- data.frame(
  kc1, kc2,          # Variables 'kc1' and 'kc2' from previous calculations
  cohort,            # Cohort calculated above
  y = as.vector(MB), # Response variable 'y', converted to a vector
  age,               # Age vector
  gender,            # Gender vector
  Country,           # Country vector
  year,              # Year vector
  yngold,            # Age group vector
  stringsAsFactors = FALSE  # Do not automatically convert strings to factors
)

# Convert 'age' to a factor with specified levels
ASDRs$age <- factor(ASDRs$age, levels = age_levels)

# Convert 'age' to numeric for a separate variable 'agenum'
ASDRs$agenum <- as.numeric(ASDRs$age)

# Convert 'gender' and 'Country' to factors with specified levels
ASDRs$gender <- factor(ASDRs$gender, levels = c("Female", "Male"))
ASDRs$Country <- factor(ASDRs$Country, levels = c("AUT", "CZE"))

# Create a 'subject' variable by interacting 'Country', 'gender', and 'age'
ASDRs$subject <- interaction(ASDRs$Country, ASDRs$gender, ASDRs$age)
# Display the structure of the resulting data frame to check the data types and structure
str(ASDRs)
'data.frame':   7280 obs. of  11 variables:
 $ kc1    : num  46 46.3 45.3 46 46.7 ...
 $ kc2    : num  2118 2145 2051 2117 2184 ...
 $ cohort : int  1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 ...
 $ y      : num  -5.01 -5.03 -5.18 -5.2 -5.33 ...
 $ age    : Factor w/ 91 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ gender : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 1 1 1 1 1 ...
 $ Country: Factor w/ 2 levels "AUT","CZE": 1 1 1 1 1 1 1 1 1 1 ...
 $ year   : int  1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 ...
 $ yngold : chr  "Group[0,14]" "Group[0,14]" "Group[0,14]" "Group[0,14]" ...
 $ agenum : num  1 1 1 1 1 1 1 1 1 1 ...
 $ subject: Factor w/ 364 levels "AUT.Female.0",..: 1 1 1 1 1 1 1 1 1 1 ...
# Display the first few rows of the resulting data frame to inspect the data
head(ASDRs)
       kc1      kc2 cohort         y age gender Country year      yngold agenum
1 46.02698 2118.483   1991 -5.007169   0 Female     AUT 1991 Group[0,14]      1
2 46.31294 2144.888   1992 -5.033727   0 Female     AUT 1992 Group[0,14]      1
3 45.28995 2051.179   1993 -5.184065   0 Female     AUT 1993 Group[0,14]      1
4 46.00825 2116.759   1994 -5.196172   0 Female     AUT 1994 Group[0,14]      1
5 46.73129 2183.814   1995 -5.332980   0 Female     AUT 1995 Group[0,14]      1
6 46.91257 2200.789   1996 -5.332367   0 Female     AUT 1996 Group[0,14]      1
       subject
1 AUT.Female.0
2 AUT.Female.0
3 AUT.Female.0
4 AUT.Female.0
5 AUT.Female.0
6 AUT.Female.0

Plot of \(k_{ct}\)

# Create a new data frame 'ASDRsnw' by converting the 'year' column to numeric
ASDRsnw <- ASDRs %>%
  mutate(year = as.numeric(as.character(year)))


# Rename the 9th column to "PC1"
colnames(ASDRsnw)[9] <- "PC1"

# Generate a ggplot with points and lines
ggplot(ASDRsnw) +
  # Add points to the plot with year on the x-axis, kc1 on the y-axis, and color based on 'PC1'
  geom_point(aes(x = year, y = kc1, color = PC1), size = 1.6) +
  
  # Add lines to the plot connecting the points with the same x and y aesthetics
  geom_line(aes(x = year, y = kc1, color = PC1), linewidth = .8) +
  
  # Manually set the colors for the 'PC1' groups
  scale_color_manual(values = c("Group[0,14]" = "red", "Group[15,40]" = "green", "Group[41,90]" = "blue")) +
  
  # Set the labels for the x and y axes
  xlab("Year") +  
  ylab("Value") + 
  
  # Create a facet grid for the plot by 'Country'
  facet_grid(~Country) +
  
  # Apply a clean, white background theme to the plot
  theme_bw() +
  
  # Customize the legend to have larger points
  guides(color = guide_legend(override.aes = list(size = 3))) +
  
  # Rotate the x-axis text labels to be vertical and adjust their alignment
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  
  # Set the font size of the x-axis text labels
  theme(axis.text.x = element_text(size = 20)) +
  
  # Set the font size of the y-axis text labels
  theme(axis.text.y = element_text(size = 25)) +
  
  # Set the font size of the legend text
  theme(legend.text = element_text(size = 35)) +
  
  # Set the font size of the axis titles
  theme(axis.title = element_text(size = 25)) +
  
  # Set the font size of the strip text (facet labels)
  theme(strip.text = element_text(size = 35)) +
  
  # Set the font size of the legend title
  theme(legend.title = element_text(size = 20)) +
  
  # Position the legend at the bottom of the plot
  theme(legend.position = "bottom")

Fitting Four GEE Models using the geeglm Function

# Fit GEE model with exchangeable correlation structure

geeEx <- geeglm(
  y ~ -1 + gender + age + 
    gender:age:kc1 + 
    gender:age:kc2 + 
    cohort,
  data = ASDRs,               # Data frame containing the variables
  id = subject,               # Grouping factor for repeated measures
  waves = year,               # Time variable for repeated measures
  corstr = "exchangeable",    # Exchangeable correlation structure
  weights = agenum / mean(agenum)  # Weights based on 'agenum' normalized by its mean
)

# Perform an ANOVA (Analysis of Variance) on the fitted GEE model
anova(geeEx)
Analysis of 'Wald statistic' Table
Model: gaussian, link: identity
Response: y
Terms added sequentially (first to last)

                Df    X2 P(>|Chi|)    
gender           2  1599 < 2.2e-16 ***
age             90 44422 < 2.2e-16 ***
cohort           1  4134 < 2.2e-16 ***
gender:age:kc1 182 41956 < 2.2e-16 ***
gender:age:kc2 182 19573 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
geeExl <- geeglm(
  y ~ -1 + gender + age + 
    gender:age:kc1 +
    cohort,
  data = ASDRs,               # Data frame containing the variables
  id = subject,               # Grouping factor for repeated measures
  waves = year,               # Time variable for repeated measures
  corstr = "exchangeable",    # Exchangeable correlation structure
  weights = agenum / mean(agenum)  # Weights based on 'agenum' normalized by its mean
)



# Perform an ANOVA (Analysis of Variance) on the fitted GEE model
anova(geeExl)
Analysis of 'Wald statistic' Table
Model: gaussian, link: identity
Response: y
Terms added sequentially (first to last)

                Df    X2 P(>|Chi|)    
gender           2  1599 < 2.2e-16 ***
age             90 44422 < 2.2e-16 ***
cohort           1  4134 < 2.2e-16 ***
gender:age:kc1 182 41956 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Fit GEE model with AR(1) correlation structure

geeAr1 <- geeglm(
  y ~ -1 + gender + age + 
    gender:age:kc1 + 
    gender:age:kc2 + 
    cohort,
  data = ASDRs,               # Data frame containing the variables
  id = subject,               # Grouping factor for repeated measures
  waves = year,               # Time variable for repeated measures
  corstr = "ar1",             # AR1 (autoregressive) correlation structure
  weights = agenum / mean(agenum)  # Weights based on 'agenum' normalized by its mean
)

# Perform an ANOVA (Analysis of Variance) on the fitted GEE model
anova(geeAr1) 
Analysis of 'Wald statistic' Table
Model: gaussian, link: identity
Response: y
Terms added sequentially (first to last)

                Df    X2 P(>|Chi|)    
gender           2  1594 < 2.2e-16 ***
age             90 44332 < 2.2e-16 ***
cohort           1  2955 < 2.2e-16 ***
gender:age:kc1 182 49660 < 2.2e-16 ***
gender:age:kc2 182 90733 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
geeAr1l <- geeglm(
  y ~ -1+gender + age + 
    gender:age:kc1 + 
    cohort,
  data = ASDRs,               # Data frame containing the variables
  id = subject,               # Grouping factor for repeated measures
  waves = year,               # Time variable for repeated measures
  corstr = "ar1",             # AR1 (autoregressive) correlation structure
  weights = agenum / mean(agenum)  # Weights based on 'agenum' normalized by its mean
)


# geeAr1
# coef(geeAr1)
# vcov(geeAr1)
# summary(geeAr1)
# coef(summary(geeAr1))

# Perform an ANOVA (Analysis of Variance) on the fitted GEE model
anova(geeAr1l)
Analysis of 'Wald statistic' Table
Model: gaussian, link: identity
Response: y
Terms added sequentially (first to last)

                Df    X2 P(>|Chi|)    
gender           2  1594 < 2.2e-16 ***
age             90 44332 < 2.2e-16 ***
cohort           1  2955 < 2.2e-16 ***
gender:age:kc1 182 49660 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

QIC

# Calculate and display the QIC value for the model with an exchangeable correlation structure
QIC_Ex <- QIC(geeEx)
QIC_Ex
       QIC       QICu  Quasi Lik        CIC     params       QICC 
  803.2096  1169.9697  -127.9849   273.6199   457.0000 -3622.5167 
QIC_Exl <- QIC(geeExl)
QIC_Exl
      QIC      QICu Quasi Lik       CIC    params      QICC 
1650.4986  816.4186 -133.2093  692.0400  275.0000 3408.0158 
# Calculate and display the QIC value for the model with an AR1 correlation structure
QIC_Ar1 <- QIC(geeAr1)
QIC_Ar1
       QIC       QICu  Quasi Lik        CIC     params       QICC 
  791.1382  1169.2705  -127.6353   267.9338   457.0000 -3634.5881 
QIC_Ar1l <- QIC(geeAr1l)
QIC_Ar1l
      QIC      QICu Quasi Lik       CIC    params      QICC 
1491.1470  815.1953 -132.5977  612.9758  275.0000 3248.6642 
ASDRs$fit <- predict(geeAr1,ASDRs)

Creation of New Dataset for the GEE Model Prediction

# Initialize vectors for test set

t <- 11

# Initialize an empty list 'kcList' to store the individual elements of kc
kcList <- list()

# Iterate through specific pairs of matrices in the list 'M'
# This loop runs for i = 1 and i = 3, thus iterating through the 1st-2nd and 3rd-4th matrices in M
for (i in c(1, 3)) {
  matrix1 <- M[[i]]       # Get the i-th matrix from M
  matrix2 <- M[[i + 1]]   # Get the (i+1)-th matrix from M to pair with matrix1
  
  # Perform principal component analysis (PCA) on the first 15 columns of the combined matrices
  # Append the first principal component (PC1) to 'kcList1'
  kcList1 <- c(kcList, prcomp(cbind(matrix1[, 1:15], matrix2[, 1:15]), center = FALSE, scale. = FALSE)$x[,1])
  
  # Perform PCA on columns 16 to 41 of the combined matrices
  # Append the first principal component (PC1) to 'kcList2'
  kcList2 <- c(kcList1, prcomp(cbind(matrix1[, 16:41], matrix2[, 16:41]), center = FALSE, scale. = FALSE)$x[,1])
  
  # Perform PCA on columns 42 to 91 of the combined matrices
  # Append the first principal component (PC1) to 'kcList'
  kcList <- c(kcList2, prcomp(cbind(matrix1[, 42:91], matrix2[, 42:91]), center = FALSE, scale. = FALSE)$x[,1])
}

# Combine all elements in kcList into a single vector 'kc0'
kc0 <- unlist(kcList)
# length(kc0)  # Display the length of 'kc0' to verify the size of the combined vector

# Initialize an empty vector 'ar' for storing forecasted values
ar <- c()

# Iterate through 6 subsets of kc0 to forecast future values using random walk with drift
for (i in 1:6) {
  # Extract a subset of kc0 for the current iteration
  subset_kc0 <- kc0[((i - 1) * 20 + 1):(i * 20)]
  
  # Forecast the next 9 values using random walk with drift
  kc_forecast <- rwf(subset_kc0, h = 11, drift = TRUE, level = c(80, 95))
  
  # Extract the mean values from the forecast and append to 'ar'
  ar <- append(ar, kc_forecast$mean[1:11])
}

# Reinitialize 'kcList' to store new combinations of 'ar'
kcList <- list()

indices <- seq(1, length(ar), by = 33)

# Iterate through the indices, combining and replicating blocks of 'ar'
for (i in indices) {
  # Extract three consecutive blocks of 'ar', each of length 9
  ar1 <- ar[i:(i + 10)]
  ar2 <- ar[(i + 11):(i + 21)]
  ar3 <- ar[(i + 22):(i + 32)]
  
  # Replicate and combine the blocks according to the specified pattern
  result1 <- rep(c(rep(ar1, 15), rep(ar2, 26), rep(ar3, 50)), 2)
  
  # Append the result to 'kcList'
  kcList <- c(kcList, result1)
}
# Combine all elements in kcList into a single vector 'kc1'
kc1 <- unlist(kcList)

# Create a new vector 'kc2' by squaring each element in 'kc1'
kc2 <- kc1^2
# Set the value of 't' to 9, representing the number of years from 2011 to 2021
t<- 11

# Create a 'gender' vector with alternating repetitions of "Female" and "Male"
# Each gender is repeated 91 times per year, across 't' years, repeated 2 times (once for each country)
gender <- rep(c("Female", "Male"), each = 91 * t, times = 2)

# Create a 'Country' vector with repetitions of "AUT" and "CZE"
# Each country is repeated for 2 genders * 91 ages * 't' years
Country <- rep(c("AUT", "CZE"), each = 2 * 91 * t)

# Create a 'year' vector representing the years 2011 to 2021
# This sequence is repeated for 4 (2 countries * 2 genders) * 91 ages
year <- rep(2011:2021, times = 4 * 91)

# Define levels for the 'age' factor from 0 to 90
age_levels <- factor(0:90)

# Create an 'age' vector by repeating ages 0 to 90 for each year, repeated 4 times (2 countries * 2 genders)
age <- rep(0:90, each = 11, times = 4)

# Calculate 'cohort' by subtracting 'age' from 'year'
cohort <- year - age

# Combine all variables into a data frame called 'newASDRs' for the test set
newASDRs <- data.frame(
  kc1, kc2,               # Variables 'kc1' and 'kc2' from previous calculations
  cohort,                 # Cohort calculated above
  y = as.vector(MB0[21:31,]),  # Response variable 'y', taken from rows 21 to 29 of 'MB0'
  age,                    # Age vector
  gender,                 # Gender vector
  Country,                # Country vector
  year,                   # Year vector
  stringsAsFactors = FALSE  # Do not automatically convert strings to factors
)

# Convert 'age' to a factor with specified levels (0 to 90)
newASDRs$age <- factor(newASDRs$age, levels = age_levels)

# Convert 'gender' to a factor with specified levels ("Female", "Male")
newASDRs$gender <- factor(newASDRs$gender, levels = c("Female", "Male"))

# Convert 'Country' to a factor with specified levels ("AUT", "CZE")
newASDRs$Country <- factor(newASDRs$Country, levels = c("AUT", "CZE"))

# Convert 'year' to a factor with specified levels (2011 to 2021)
newASDRs$year <- factor(newASDRs$year, levels = 2011:2021)

# Create a 'subject' variable by interacting 'Country', 'gender', and 'age'
newASDRs$subject <- interaction(newASDRs$Country, newASDRs$gender, newASDRs$age)
head(newASDRs)
       kc1      kc2 cohort         y age gender Country year      subject
1 50.11586 2511.600   2011 -5.891180   0 Female     AUT 2011 AUT.Female.0
2 50.32031 2532.133   2012 -5.719324   0 Female     AUT 2012 AUT.Female.0
3 50.52475 2552.750   2013 -5.922384   0 Female     AUT 2013 AUT.Female.0
4 50.72919 2573.451   2014 -5.778350   0 Female     AUT 2014 AUT.Female.0
5 50.93364 2594.235   2015 -5.843578   0 Female     AUT 2015 AUT.Female.0
6 51.13808 2615.103   2016 -5.860367   0 Female     AUT 2016 AUT.Female.0
# Define constants for the observed and forecasted periods
observed_period <- 20
forecast_period <- 11
total_period <- 31

# Create labels for observed and forecasted data
observed_label <- rep("Observed", observed_period)
forecasted_label <- rep("Forecasted", forecast_period)

# Combine observed and forecasted labels for all groups
observation_type <- rep(factor(c(observed_label, forecasted_label)), 6)

# Generate a sequence for years (1991-2021) repeated for each group
year_sequence <- rep(1991:2021, 6)

# Define countries and repeat for each group and time period
countries <- rep(c("AUT", "CZE"), each = 3 * total_period)

# Combine observed data (kc0) and forecasted data (ar) into a single vector
trend_data <- c(
  kc0[1:20], ar[1:11],
  kc0[21:40], ar[12:22],
  kc0[41:60], ar[23:33],
  kc0[61:80], ar[34:44],
  kc0[81:100], ar[45:55],
  kc0[101:120], ar[56:66]
)

# Define age groups and repeat for the appropriate time periods
age_groups <- c("Group[0,14]", "Group[15,40]", "Group[41,90]")
age_group_labels <- rep(rep(age_groups, each = total_period), 2)

# Combine all data into a data frame
trend_df <- data.frame(
  Trend = trend_data,
  Year = year_sequence,
  Type = observation_type,
  Country = countries,
  AgeGroup = age_group_labels
)

# Rename the age group column for clarity
colnames(trend_df)[5] <- "PC1"

# Create the plot using ggplot2
ggplot(trend_df) +
  geom_point(aes(x = Year, y = Trend, color = PC1), size = 1.6) +
  geom_line(aes(x = Year, y = Trend, color = PC1), linewidth = 0.8) +
  scale_color_manual(values = c("Group[0,14]" = "red", "Group[15,40]" = "green", "Group[41,90]" = "blue")) +
  xlab("Year") +
  ylab("Value") +
  facet_grid(~Country) +
theme_bw() +
guides(color = guide_legend(override.aes = list(size = 1))) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  
theme(axis.text.x = element_text(size = 10)) +
  
theme(axis.text.y = element_text(size = 15)) +
  
theme(legend.text = element_text(size = 10)) +
  
theme(axis.title = element_text(size = 15)) +
  
theme(strip.text = element_text(size = 10)) +
  
theme(legend.title = element_text(size = 10)) +
  
theme(legend.position = "bottom")

# Add predictions using the GEE model

newASDRs$pred <- predict(geeAr1, newdata = newASDRs)


library(dplyr)
library(akima)
library(ggplot2)
max(exp(newASDRs$pred - newASDRs$y))
[1] 7.281146
# 1) Prep & interpolate
df <- newASDRs %>%
  filter(Country=="AUT", gender=="Female") %>%
  mutate(
    year  = as.numeric(as.character(year)),
    age   = as.numeric(as.character(age)),
    ratio = exp(pred - y)
  )

xi <- seq(min(df$year), max(df$year), length.out = 400)
yi <- seq(min(df$age),  max(df$age),  length.out = 400)
interp_res <- with(df, interp(year, age, ratio,
                              xo = xi, yo = yi,
                              duplicate = "mean"))
Warning in interp(year, age, ratio, xo = xi, yo = yi, duplicate = "mean"):
collinear points, trying to add some jitter to avoid colinearities!
Warning in interp(year, age, ratio, xo = xi, yo = yi, duplicate = "mean"):
success: collinearities reduced through jitter
grid_df <- expand.grid(year = interp_res$x,
                       age  = interp_res$y)
grid_df$ratio <- as.vector(interp_res$z)

# 2) Define jet palette & breaks
brks     <- seq(min(exp(newASDRs$pred - newASDRs$y)), max(exp(newASDRs$pred - newASDRs$y)), by = 0.05)
jet.cols <- colorRampPalette(c("blue","cyan","green","yellow","red"))(length(brks)+1)

# 3) Plot with integer years on x‐axis
ggplot(grid_df, aes(x = year, y = age)) +
  geom_tile(aes(fill = ratio)) +
  scale_fill_gradientn(
    name    = "Pred / Actual",
    colours = jet.cols,
    limits  = c(min(exp(newASDRs$pred - newASDRs$y)), max(exp(newASDRs$pred - newASDRs$y))),
    oob     = scales::squish
  ) +
  geom_contour(aes(z = ratio),
               breaks    = brks,
               colour    = "black",
               linewidth = 0.3) +
  scale_x_continuous(
    breaks = 2011:2021,
    expand = c(0, 0)
  ) +
  labs(
    x     = "Forecast Year",
    y     = "Age",
    title = "Forecast / Actual mortality, AUT ♀, 2011–2021"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    panel.grid        = element_blank(),
    legend.key.height = unit(1.2, "cm")
  )
Warning: Removed 436 rows containing non-finite outside the scale range
(`stat_contour()`).

ggsave("Projectedactual.pdf", width = 10, height = 6)
Warning: Removed 436 rows containing non-finite outside the scale range
(`stat_contour()`).
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
conversion failure on 'Forecast / Actual mortality, AUT ♀, 2011–2021' in
'mbcsToSbcs': for ♀ (U+2640)
dev.off()
null device 
          1 

Calculate 95% prediction intervals using the geeAr1 model on newASDRs

# Generate the model matrix for the GEE model terms on the newASDRs data
model_matrix_gee <- model.matrix(terms(geeAr1), newASDRs)

# Calculate the predicted mortality rates based on the GEE model coefficients
newASDRs$predicted_rate_gee <- model_matrix_gee %*% coef(geeAr1)

# Calculate the standard errors of the predicted rates
predicted_rate_variance_gee <- diag(model_matrix_gee %*% tcrossprod(geeAr1$geese$vbeta, model_matrix_gee))

# Create a data frame to store the results, including lower and upper bounds of the prediction interval
newASDRs <- data.frame(
  newASDRs,
  lower_bound_gee = newASDRs$predicted_rate_gee - 2 * sqrt(predicted_rate_variance_gee),
  upper_bound_gee = newASDRs$predicted_rate_gee + 2 * sqrt(predicted_rate_variance_gee)
)

# Display the head of the new data frame with predicted rates and prediction intervals
head(newASDRs,20)
        kc1      kc2 cohort         y age gender Country year      subject
1  50.11586 2511.600   2011 -5.891180   0 Female     AUT 2011 AUT.Female.0
2  50.32031 2532.133   2012 -5.719324   0 Female     AUT 2012 AUT.Female.0
3  50.52475 2552.750   2013 -5.922384   0 Female     AUT 2013 AUT.Female.0
4  50.72919 2573.451   2014 -5.778350   0 Female     AUT 2014 AUT.Female.0
5  50.93364 2594.235   2015 -5.843578   0 Female     AUT 2015 AUT.Female.0
6  51.13808 2615.103   2016 -5.860367   0 Female     AUT 2016 AUT.Female.0
7  51.34253 2636.055   2017 -5.863674   0 Female     AUT 2017 AUT.Female.0
8  51.54697 2657.090   2018 -6.040600   0 Female     AUT 2018 AUT.Female.0
9  51.75141 2678.209   2019 -5.960229   0 Female     AUT 2019 AUT.Female.0
10 51.95586 2699.411   2020 -5.783741   0 Female     AUT 2020 AUT.Female.0
11 52.16030 2720.697   2021 -5.852679   0 Female     AUT 2021 AUT.Female.0
12 50.11586 2511.600   2010 -8.609330   1 Female     AUT 2011 AUT.Female.1
13 50.32031 2532.133   2011 -8.623564   1 Female     AUT 2012 AUT.Female.1
14 50.52475 2552.750   2012 -8.768652   1 Female     AUT 2013 AUT.Female.1
15 50.72919 2573.451   2013 -8.779867   1 Female     AUT 2014 AUT.Female.1
16 50.93364 2594.235   2014 -8.653196   1 Female     AUT 2015 AUT.Female.1
17 51.13808 2615.103   2015 -8.229947   1 Female     AUT 2016 AUT.Female.1
18 51.34253 2636.055   2016 -8.705034   1 Female     AUT 2017 AUT.Female.1
19 51.54697 2657.090   2017 -8.478877   1 Female     AUT 2018 AUT.Female.1
20 51.75141 2678.209   2018 -8.263856   1 Female     AUT 2019 AUT.Female.1
        pred predicted_rate_gee lower_bound_gee upper_bound_gee
1  -5.794480          -5.794480       -5.987103       -5.601857
2  -5.794831          -5.794831       -5.983903       -5.605760
3  -5.793071          -5.793071       -5.979230       -5.606911
4  -5.789198          -5.789198       -5.973368       -5.605028
5  -5.783213          -5.783213       -5.966620       -5.599805
6  -5.775116          -5.775116       -5.959293       -5.590939
7  -5.764907          -5.764907       -5.951670       -5.578144
8  -5.752586          -5.752586       -5.943991       -5.561180
9  -5.738153          -5.738153       -5.936433       -5.539872
10 -5.721607          -5.721607       -5.929094       -5.514120
11 -5.702950          -5.702950       -5.922001       -5.483899
12 -8.039216          -8.039216       -8.262040       -7.816393
13 -8.034442          -8.034442       -8.268451       -7.800434
14 -8.027900          -8.027900       -8.274181       -7.781619
15 -8.019590          -8.019590       -8.279268       -7.759911
16 -8.009511          -8.009511       -8.283737       -7.735285
17 -7.997664          -7.997664       -8.287606       -7.707723
18 -7.984050          -7.984050       -8.290885       -7.677214
19 -7.968667          -7.968667       -8.293579       -7.643754
20 -7.951515          -7.951515       -8.295688       -7.607343

Evaluating Forecast Accuracy: Mean Squared Error for GEE Model Predictions

# Initialize an empty vector to store the Mean Squared Error (MSE) values
MSE_test_gee_pca <- c()

# Iterate through the 4 datasets of predictions
for (n in 1:4) {
  
  # Extract predicted values for the specific set (9 years, 91 observations each)
  start_index <- (n - 1) * (11 * 91) + 1
  end_index <- n * (11 * 91)
  gee_predictions <- exp(matrix(newASDRs$pred[start_index:end_index], 11, 91, byrow = FALSE))
  
  # Extract actual values for the corresponding set from the M0 list
  actual_values <- exp(M0[[n]][21:31,])
  
  # Calculate the residuals (errors between actual and predicted values)
  gee_errors <- actual_values - gee_predictions
  
  # Compute the Mean Squared Error (MSE) for the current set
  MSE_test_gee_n <- sum(gee_errors^2) / (91 * 11)
  
  # Append the computed MSE to the MSE vector
  MSE_test_gee_pca <- append(MSE_test_gee_pca, MSE_test_gee_n)
}

# Display the vector of MSE values for all datasets
MSE_test_gee_pca
[1] 5.289035e-06 8.993636e-06 1.179988e-05 3.396036e-05