Analysis template for assessing sustainability competencies training in academic curricula

Before start: install and activate packages

library(fmsb)
library(RColorBrewer)
library(ggplot2)
library(ggdendro)
library(ggfortify)
library(gridExtra)
library(dendextend)
library(magrittr)
library(tidyverse)

Step 1: Import data file

# Insert here the correct path to your "Scores" data file
Scores <- read.csv("C:/Users/meppinga/Downloads/Scores.csv", sep=";")

# Check if the data file is correctly inserted
View(Scores)

# Define the structure of the dataset to be analyzed
nr_courses <- nrow(Scores)
nr_years <- 6 ## In our case Ba1, Ba2, Bio_env, Inf_dat, Tech_eng, Ba3
nr_levels <- 5 ## Overall + 4 areas: Valuing sustainability, Embracing complexity, Envisioning sustainable futures, Acting for sustainability
nr_KSA_course <- 1 ## How many competences per course did you evaluate for KSA statements?

Step 2: Median GreenComp competences scores Ba1, Ba2, Ba3

# Calculate median scores per competence and per Ba years
col_median_Ba1 <- apply(Scores[Scores$Year == "Ba1",5:16], 2, median, na.rm=TRUE)
col_median_Ba2 <- apply(Scores[Scores$Year == "Ba2",5:16], 2, median, na.rm=TRUE)
col_median_Ba3 <- apply(Scores[Scores$Year == "Ba3",5:16], 2, median, na.rm=TRUE)

# Set up spider graph
Max_spider <- c(4,4,4,4,4,4,4,4,4,4,4,4)
Min_spider <- c(0,0,0,0,0,0,0,0,0,0,0,0)
Spider_median_year_comp <- rbind(Max_spider, Min_spider, col_median_Ba1, col_median_Ba2, col_median_Ba3)
rownames(Spider_median_year_comp) <- c("Max", "Min","Ba1", "Ba2", "Ba3")
Spider_median_year_comp <- na.omit(Spider_median_year_comp)
Spider_median_year_comp <- data.frame(Spider_median_year_comp)

# Create spider graph
colors_border=c(rgb(0.2,0.5,0.5,0.9), rgb(0.8,0.2,0.5,0.9) , rgb(0.7,0.5,0.1,0.9), rgb(0.5,0.5,0.1,0.9),
                rgb(0.8,0.5,0.5,0.9), rgb(0.5,0.2,0.5,0.9) , rgb(0.3,0.5,0.1,0.7), rgb(0.5,0.1,0.1,0.9))
colors_in=c( rgb(0.2,0.5,0.5,0.4), rgb(0.8,0.2,0.5,0.4) , rgb(0.7,0.5,0.1,0.4) )
radarchart(Spider_median_year_comp, axistype=1 , 
            #custom polygon
            pcol=colors_border , pfcol=NULL , plwd=4 , plty=1,
            #custom the grid
            cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,4,1), cglwd=0.8,
            #custom labels
            vlcex=0.8)

# Add a legend to the spider graph
legend(x=1.3, y=1.2, legend = rownames(Spider_median_year_comp[-c(1,2),]), bty = "n", pch=20 , col=colors_border , text.col = "black", cex=0.8, pt.cex=1)

Step 3: Average GreenComp competences general and area scores

# Calculate the summary statistics to be displayed
Barplot_average_area_competences <- data.frame(matrix(data = NA, ncol = 3, nrow = (nr_years*nr_levels)))
colnames(Barplot_average_area_competences) <- c("Year", "Area", "Mean")
Barplot_average_area_competences$Year <- c(rep("Ba1", 5), rep("Ba2", 5), rep("Bio_env", 5), rep("Inf_dat", 5), rep("Tech_eng", 5), rep("Ba3", 5))
Barplot_average_area_competences$Area <- c(rep(c("All", "Sust_val", "Emb_compl", "Sust_fut", "Act"), 6))
Barplot_average_area_competences$Mean <- c(mean(as.matrix(Scores[Scores$Year == "Ba1",5:16]), na.rm = TRUE),
                                          mean(as.matrix(Scores[Scores$Year == "Ba1",5:7]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba1",8:10]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba1",10:13]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba1",14:16]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba2" & Scores$Specialization == "Ba2",5:16]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba2" & Scores$Specialization == "Ba2",5:7]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba2" & Scores$Specialization == "Ba2",8:10]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba2" & Scores$Specialization == "Ba2",11:13]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba2" & Scores$Specialization == "Ba2",14:16]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba2" & Scores$Specialization == "Bio_env" ,5:16]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba2" & Scores$Specialization == "Bio_env" ,5:7]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba2" & Scores$Specialization == "Bio_env" ,8:10]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba2" & Scores$Specialization == "Bio_env" ,11:13]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba2" & Scores$Specialization == "Bio_env" ,14:16]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba2" & Scores$Specialization == "Inf_dat" ,5:16]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba2" & Scores$Specialization == "Inf_dat" ,5:7]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba2" & Scores$Specialization == "Inf_dat" ,8:10]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba2" & Scores$Specialization == "Inf_dat" ,11:13]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba2" & Scores$Specialization == "Inf_dat" ,14:16]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba2" & Scores$Specialization == "Tech_eng" ,5:16]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba2" & Scores$Specialization == "Tech_eng" ,5:7]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba2" & Scores$Specialization == "Tech_eng" ,8:10]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba2" & Scores$Specialization == "Tech_eng" ,11:13]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba2" & Scores$Specialization == "Tech_eng" ,14:16]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba3",5:16]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba3",5:7]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba3",8:10]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba3",11:13]), na.rm = TRUE),
                                           mean(as.matrix(Scores[Scores$Year == "Ba3",14:16]), na.rm = TRUE))
                                           
Barplot_average_area_competences$Area = factor(Barplot_average_area_competences$Area, levels = c("All", "Sust_val", "Emb_compl", "Sust_fut", "Act"), ordered = TRUE)
Barplot_average_area_competences$Year = factor(Barplot_average_area_competences$Year, levels = c("Ba1", "Ba2", "Bio_env", "Inf_dat", "Tech_eng", "Ba3"), ordered = TRUE)

# Create the bar chart
ggplot(Barplot_average_area_competences, aes(fill = Area, y = Mean, x = Year)) + geom_bar(position = "dodge", stat="identity") + 
  theme_bw()

Step 3: Identify course clusters

# Apply hierarchical clustering (here using Ward's method)
labs2   <- Scores[-c(10,25,28,29),17]   # Remove courses with NAs
levels <- Scores[-c(10,25,28,29),2]     # Remove courses with NAs
df  <-  Scores[-c(10,25,28,29),5:16]    # Remove courses with NAs
rownames(df)  <- labs2  # Set row names
dfdata        <-matrix(unlist(df),ncol = 12, nrow = 25)
df2           <- sweep(dfdata,2,colSums(dfdata),`/`)
rownames(df2) <- labs2  # Set row names
d    <- dist(df2, method="manhattan")
hc   <- hclust(d, method="ward.D2") 
dend <- df2 %>% dist(method="manhattan") %>% hclust(method="ward.D2") %>%  as.dendrogram %>%
     set("branches_k_color", k = 4, c("steelblue","aquamarine3","darkgoldenrod1","firebrick4")) %>% set("branches_lwd", 0.7) %>%
     set("labels_cex", 0.6) %>% set("labels_colors",k=4, c("steelblue","aquamarine3","darkgoldenrod1","firebrick4")) %>%
     set("leaves_pch", 19) %>% set("leaves_cex", 0.5) 
ggd1 <- as.ggdend(dend)
cp1<- ggplot(ggd1, horiz = TRUE)

# Calculate competence means for each cluster
s = cutree(hc,k=4)
Cluster <- as.factor(s)
cluster1 <- data.frame(matrix(data = NA, ncol = 2, nrow = 12))
colnames(cluster1) <- c("Mean", "Competences")
cluster1$Mean  <- colMeans(df[s==1,])
cluster1$Competences <- as.factor(1:12) 
cluster2 <- data.frame(matrix(data = NA, ncol = 2, nrow = 12))
colnames(cluster2) <- c("Mean", "Competences")
cluster2$Mean  <- colMeans(df[s==2,])
cluster2$Competences <- as.factor(1:12) 
cluster3 <- data.frame(matrix(data = NA, ncol = 2, nrow = 12))
colnames(cluster3) <- c("Mean", "Competences")
cluster3$Mean  <- colMeans(df[s==3,])
cluster3$Competences <- as.factor(1:12) 
cluster4 <- data.frame(matrix(data = NA, ncol = 2, nrow = 12))
colnames(cluster4) <- c("Mean", "Competences")
cluster4$Mean  <- colMeans(df[s==4,])
cluster4$Competences <- as.factor(1:12) 
bp1 <- ggplot(cluster1, aes(fill = NULL, y = Mean, x = Competences)) + geom_bar(position = "dodge", stat="identity", fill="firebrick4") + 
  coord_cartesian(ylim = c(0, 4)) +
  theme_bw()
bp2 <- ggplot(cluster2, aes(fill = NULL, y = Mean, x = Competences)) + geom_bar(position = "dodge", stat="identity", fill= "steelblue") + 
  coord_cartesian(ylim = c(0, 4)) +
  theme_bw()
bp3 <- ggplot(cluster3, aes(fill = NULL, y = Mean, x = Competences)) + geom_bar(position = "dodge", stat="identity", fill = "aquamarine3") + 
  coord_cartesian(ylim = c(0, 4)) +
  theme_bw()
bp4 <- ggplot(cluster4, aes(fill = NULL, y = Mean, x = Competences)) + geom_bar(position = "dodge", stat="identity", fill = "goldenrod1") + 
  coord_cartesian(ylim = c(0, 4)) +
  theme_bw()
# 
CP <- list(cp1,bp1,bp4,bp3,bp2)
# Create Multi-panel plot
grid.arrange(
    grobs = CP,
    widths = c(1, 1, 1),
    layout_matrix = matrix(c(1,1,2,1,1,3,1,1,4,1,1,5), 4, 3, byrow = T)
)

Step 4: Perform Prinicipal Component Analysis

df  <-  Scores[-c(10,25,28,29),5:16]  # Remove courses with NAs
dfdata          <-matrix(unlist(df),ncol = 12, nrow = 25)
df2             <- sweep(dfdata,2,colSums(dfdata),`/`)
data_dims <- df2
colnames(data_dims) = c(colnames(Scores[5:16]))
data <- data.frame(data_dims, Cluster = as.factor(s))
colnames(data) = c(colnames(Scores[5:16]),"Cluster")
ncomp <-2

# Perform PCA
pca.obj <- prcomp(data_dims, retx = T, center=F, scale=F)
rawLoadings <- pca.obj$rotation[,1:2] %*% diag(pca.obj$sdev, 2, 2)
# Calculate varimax rotated scores
varimax.obj <- varimax(pca.obj$rotation[,1:2], normalize = FALSE)
rotatedLoadings <- rawLoadings %*% varimax.obj$rotmat
# Substitute rotated components and loadings
pca.obj$x[,1] <- -pca.obj$x[,1]
pca.obj$x[,1:2] <- pca.obj$x[,1:2] %*% varimax(rawLoadings)$rotmat
pca.obj$rotation[,1] <- rotatedLoadings[,1]
pca.obj$rotation[,2] <- -rotatedLoadings[,2]

# Calculate average loadings per program year
CourseAvgs <- data.frame(matrix(data = NA, ncol = 2, nrow = 3))
CourseAvgs[1,] <- c( mean(pca.obj$x[levels == "Ba1",1]), mean(pca.obj$x[levels == "Ba1",2]))
CourseAvgs[2,] <- c( mean(pca.obj$x[levels == "Ba2",1]), mean(pca.obj$x[levels == "Ba2",2]))
CourseAvgs[3,] <- c( mean(pca.obj$x[levels == "Ba3",1]), mean(pca.obj$x[levels == "Ba3",2]))
CourseAvgs=as.data.frame(c(CourseAvgs, Cluster=Cluster))
# Create the PCA plot
plot1 <- autoplot(pca.obj, data=data, colour = "Cluster",
                frame = TRUE, loadings.label = TRUE, loadings = TRUE, loadings.label.size = 3.5, loadings.colour = 'black', scale = 0, loadings.label.colour = 'black', loadings.label.hjust=0.35,loadings.label.vjust=-0.05,loadings.label.repel=T, variance_percentage = T)
plot1b <- plot1 + scale_color_manual(values = c("firebrick4","steelblue","aquamarine3","goldenrod1"))+
  scale_fill_manual(values = c("firebrick4","steelblue","aquamarine3","goldenrod1"))+ theme(legend.position = "none")
plot1c <- plot1b +geom_point(data=CourseAvgs, aes(X1,X2), colour = "darkorchid")+ geom_text(data=CourseAvgs,aes(x=X1-0.015,y=X2), label = c("Year 1", "Year 2", "Year3"), colour = "darkorchid")
plot(plot1c)

Step 5: Analysis of KSA statements

# Insert here the correct path to your "Scores" data file
KSA <- read.csv("C:/Users/meppinga/Downloads/KSA.csv", sep=";")
# Check if the data file is correctly inserted
View(KSA)

# Calculate the summary statistics to be displayed
KSA_average <- data.frame(matrix(data = NA, ncol = 4, nrow = (nr_courses * nr_KSA_course * 3)))
colnames(KSA_average) <- c("Course", "Competence", "KSA", "Mean")
j <- 1

for(i in 1:nrow(KSA)){
  KSA_average$Course[j:(j+2)] <- rep(KSA$Course_name[i], 3)
  KSA_average$Competence[j:(j+2)] <- rep(KSA$Competence[i],3)
  KSA_average$KSA[j] <- "Knowledge"
  KSA_average$KSA[j+1] <- "Skills"
  KSA_average$KSA[j+2] <- "Attitudes"
  KSA_average$Mean[j] <- mean(as.matrix(KSA[i,3:8]), na.rm = TRUE)
  KSA_average$Mean[j+1] <- mean(as.matrix(KSA[i,9:14]), na.rm = TRUE)
  KSA_average$Mean[j+2] <- mean(as.matrix(KSA[i,15:19]), na.rm = TRUE)
  
  j <- j + 3
}

KSA_average <- KSA_average %>% drop_na(Course)

# Create the stacked bar chart
ggplot(KSA_average, aes(fill = KSA, y = Mean, x = Course)) + geom_bar(position = "stack", stat="identity") + 
  theme_bw() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))