Spaces:

trustlogic
/

temporary-trustlogic-batch

Running on CPU Upgrade

App Files Files Community

Wajahat698 committed on Jan 17

Commit

5951a96

verified ·

1 Parent(s): 6816a2f

Update process_data.R

Browse files

Files changed (1) hide show

process_data.R +163 -219

process_data.R CHANGED Viewed

@@ -1,4 +1,6 @@
-# Load required libraries
 library(relaimpo)
 library(readxl)
 library(readr)
@@ -6,286 +8,228 @@ library(lavaan)
 library(leaps)
 library(dplyr)
 library(tidyr)
-# Logging function
 log_message <- function(message, output_text_file) {
   cat(message, "\n")
   write(message, file = output_text_file, append = TRUE)
 }
-# Trust Driver analysis function
 trust_driver_analysis <- function(model_formula, data, output_text_file, csv_file) {
   tryCatch({
-    # Fit linear regression model
-    model <- lm(model_formula, data = data)
-    # Calculate relative importance using the lmg method
-    calc_relaimpo <- calc.relimp(model, type = "lmg", rela = TRUE)
-    # Calculate average importance
-    average_importance <- mean(calc_relaimpo$lmg)
-    # Open the output text file in append mode to add this model's output
-    file_conn <- file(output_text_file, open = "a")
-    # Capture output to include in the text file
-    full_output <- capture.output({
-      print("Trust Driver Analysis:\n")
-      print(calc_relaimpo)
-      cat("\nAverage Importance: ", average_importance, "\n")
-    })
-    # Write output to text file
-    writeLines(full_output, file_conn)
-    close(file_conn)
-    # Create data frame of predictor names and their importance
-    results <- data.frame(Predictor = names(calc_relaimpo$lmg), Importance = calc_relaimpo$lmg)
-    # Save results to CSV file
-    write.csv(results, file = csv_file, row.names = FALSE)
   }, error = function(e) {
-    log_message(paste("Error in trust_driver_analysis:", e$message), output_text_file)
   })
 }
-# Trust Builder Analysis function
-trust_builder_analysis <- function(data, data_headers, output_text_file, csv_file) {
   tryCatch({
-    # Map the questions to column names
-    question_to_column <- setNames(as.list(data_headers[1, ]), as.character(data_headers[2, ]))
-    # Number of important statements to be selected
-    p <- 6
-    # Define the list of column names
-    bucket_columns <- c("Stability", "Development", "Relationship", "Benefit", "Vision", "Competence")
-    # Select columns based on the predefined list
-    bucket <- data %>% select(all_of(bucket_columns))
-    # Select all columns from the consumer dataframe that contain "TB" in their names and assign them to the variable TB
     TB <- data %>% select(contains("TB"))
-    # Dynamically detect the number of TB statements
-    num_tb_statements <- ncol(TB)
-    # Initialize a matrix with number of TB rows (37 for Volkswagen) and 6 columns, filled with NA values
-    coef <- matrix(NA, ncol = 6, nrow = num_tb_statements)
-    # Initialize an empty list to store the predictors for each bucket column
-    bucket_predictors <- list()
-    # Loop over each of the 6 columns
     for (i in 1:6) {
-      # Extract the i-th column from 'bucket' as a matrix and assign it to 'y'
-      y <- as.matrix(pull(bucket[, i]))
-      # Convert 'TB' dataframe to a matrix and assign it to 'x'
       x <- as.matrix(TB)
-      # Perform best subset regression using 'x' as predictors and 'y' as the response variable
-      fit <- regsubsets(x, y, nbest = 1, nvmax = p)
-      # Summarize the regression subsets
-      fit_sum <- summary(fit)
-      # Store the coefficients of the best model in the i-th column of 'coef' matrix
-      coef[, i] <- fit_sum$outmat[p, ]
-      # Print the predictors used in the best model
-      predictors <- names(which(fit_sum$outmat[p, ] == "*"))
-      # Append the predictors to the bucket_predictors list
-      bucket_predictors[[bucket_columns[i]]] <- predictors
-    }
-    # Create the desired output format as model
-    model_str <- sapply(names(bucket_predictors), function(col) {
-      paste(col, "~", paste(bucket_predictors[[col]], collapse = "+"))
-    })
-    # Prepend the Trust x and y to model_str
-    model_str <- c("Trust ~ Stability + Development + Relationship + Benefit + Vision + Competence", model_str)
-    # Fit the model using sem() function
-    fit <- sem(model_str, data = data)
-    fit_summary <- summary(fit, standardized = TRUE, fit.measures = TRUE, rsquare = TRUE)
-    # Make it percentages
-    output <- fit_summary$pe[fit_summary$pe$op == "~", c("lhs", "rhs", "std.all")]
-    # Define the function to convert std.all to percentages
-    convert_to_percentage <- function(df) {
-      df %>%
-        group_by(lhs) %>%
-        mutate(abs_std = abs(std.all),
-              sum_abs_std = sum(abs_std),
-              percent_std = (abs_std / sum_abs_std) * 100) %>%
-        select(-abs_std, -sum_abs_std) %>%
-        ungroup()
-    }
-    # Convert the estimates to percentages
-    percentage_output <- convert_to_percentage(output)
-    # Extract TB column names
-    tb_column_names <- colnames(TB)
-    # Convert std.all to a wide format dataframe
-    percentage_output_wide <- percentage_output %>%
-      pivot_wider(names_from = lhs, values_from = percent_std) %>%
-      rename_with(~ gsub("std.all\\.", "", .), starts_with("std.all"))
-    # Create a new dataframe with TB columns and percentage estimates
-    result_df <- data.frame(TB = tb_column_names)
-    # Merge the result_df with percentage_estimates_wide
-    result_df <- left_join(result_df, percentage_output_wide, by = c("TB" = "rhs"))
-    # Fill NA values with 0 to ensure proper representation
-    result_df[is.na(result_df)] <- 0
-    # Add corresponding messages of TB as a new column
-    result_df$Message <- sapply(result_df$TB, function(tb_col) question_to_column[[tb_col]])
-    # Convert 'TB' column to a factor with the correct order
-    result_df$TB <- factor(result_df$TB, levels = paste0("TB", 1:37))
-    # Exclude 'est' and 'Trust' columns and merge rows by 'TB'
-    result_df <- result_df %>%
-      select(-std.all, -Trust) %>%
-      group_by(TB) %>%
-      summarise(across(everything(), ~ if(is.numeric(.)) sum(., na.rm = TRUE) else first(.))) %>%
-      arrange(TB)
-    # Reorder columns to have Message as the second column
-    result_df <- result_df %>%
-      select(TB, Message, everything())
-    # Open the output text file in append mode to add this model's output
-    file_conn <- file(output_text_file, open = "a")
-    # Capture output to include in the text file
-    full_output <- capture.output({
-      print("Trust Builder Analysis:\n")
-      print("Data header mapping:\n")
-      print(question_to_column)
-      print("Buckets:\n")
-      print(bucket)
-      print("Messages:\n")
-      print(TB)
-      print("Coefficients matrix (coef:\n")
-      print(coef)
-      print("Model:\n")
-      cat(model_str, sep = "\n")
-      print("Fit summary:\n")
-      print(fit_summary)
-      print("Output:\n")
-      print(output)
-      print("Output in percentage (%):\n")
-      print(percentage_output)
-      print("result_df:\n")
-      print(result_df)
-    })
-    # Write output to text file
-    writeLines(full_output, file_conn)
-    close(file_conn)
-    # Create data frame of predictor names and their importance
-    results <- data.frame(result_df)
-    # Save results to CSV file
-    write.csv(results, file = csv_file, row.names = FALSE)
   }, error = function(e) {
-    log_message(paste("Error in trust_builder_analysis:", e$message), output_text_file)
   })
 }
-# Read command-line arguments
 args <- commandArgs(trailingOnly = TRUE)
 input_file <- args[1]
-output_text_file <- args[2]  # Base path for output text and CSV files
-csv_output_path_trust <- args[3]
-csv_output_path_nps <- args[4]
-csv_output_path_loyalty <- args[5]
-csv_output_path_consideration <- args[6]
-csv_output_path_satisfaction <- args[7]
-csv_output_path_trustbuilder <- args[8]
-nps_present <- as.logical(tolower(args[9]))  # Expecting "TRUE" or "FALSE" as the argument
 loyalty_present <- as.logical(tolower(args[10]))
 consideration_present <- as.logical(tolower(args[11]))
 satisfaction_present <- as.logical(tolower(args[12]))
-trustbuilder_present <- as.logical(tolower(args[13]))
-# Log the starting of the script
-log_message("Starting Trust Driver and Builder Analysis Script.", output_text_file)
-########## Trust Driver Analysis ######################
-# Load the trust driver dataset (CSV or Excel)
-data_driver <- NULL
-if (grepl(".xlsx", input_file)) {
-  # Load the Excel file with the fourth row as the header
-  data_driver <- read_excel(input_file, sheet = "Driver", skip = 3)
-}
-# Process the Trust model
 trust_driver_analysis(
-  Trust ~ Stability + Development + Relationship + Benefit + Vision + Competence,
-  data_driver,
-  output_text_file,
-  csv_output_path_trust)
-# Conditionally process the NPS model
 if (nps_present) {
   trust_driver_analysis(
-    NPS ~ Stability + Development + Relationship + Benefit + Vision + Competence,
-    data_driver,
-    output_text_file,
-    csv_output_path_nps)
 }
-# Conditionally process the Loyalty model
 if (loyalty_present) {
   trust_driver_analysis(
-    Loyalty ~ Stability + Development + Relationship + Benefit + Vision + Competence,
-    data_driver,
-    output_text_file,
-    csv_output_path_loyalty)
 }
-# Conditionally process the Consideration model
 if (consideration_present) {
   trust_driver_analysis(
-    Consideration ~ Stability + Development + Relationship + Benefit + Vision + Competence,
-    data_driver,
-    output_text_file,
-    csv_output_path_consideration)
 }
-# Conditionally process the Satisfaction model
 if (satisfaction_present) {
   trust_driver_analysis(
-    Satisfaction ~ Stability + Development + Relationship + Benefit + Vision + Competence,
-    data_driver,
-    output_text_file,
-    csv_output_path_satisfaction)
 }
-########## Trust Builder Analysis ######################
-if (trustbuilder_present) {
-  data_builder <- NULL
-  if (grepl(".xlsx", input_file)) {
-    # Read the 4th and 5th rows as header mapping
-    data_builder_headers <- read_excel(input_file, sheet = "Builder", skip = 3, n_max = 2)
-    # Read the rest of the data, skipping the first 5 rows (to start from row 6)
-    data_builder_rows <- read_excel(input_file, sheet = "Builder", skip = 5)
-  }
-  # Process the Builder model
-  trust_builder_analysis(data_builder_rows, data_builder_headers, output_text_file, csv_output_path_trustbuilder)
 }
-# Log the ending of the script
-log_message("Trust Driver and Builder Analysis Script Completed.", output_text_file)

+##############################
+# LIBRARIES
+##############################
 library(relaimpo)
 library(readxl)
 library(readr)
 library(leaps)
 library(dplyr)
 library(tidyr)
+library(caret)
+##############################
+# LOGGING
+##############################
 #' Echo a status message to the console and append it to the run log.
 #'
 #' @param message Character vector to report; each element becomes its own
 #'   line in the log file.
 #' @param output_text_file Path of the text log that is appended to.
 log_message <- function(message, output_text_file) {
   # Console copy: elements separated by a space, terminated by a newline.
   cat(message, "\n", sep = " ")
   # File copy: append so that earlier entries of the same run are preserved.
   write(x = message, file = output_text_file, append = TRUE)
 }
+##############################
+# VALIDATION HELPERS
+##############################
#' Build the modelling frame for one driver regression.
#'
#' Keeps only the variables named in `formula`, drops rows with missing
#' values, checks that there are more complete rows than predictors, and
#' removes predictors flagged by `findCorrelation()` at a 0.9 cutoff.
#'
#' @param formula Two-sided model formula; its first variable is treated as
#'   the response.
#' @param data Data frame (or tibble) expected to hold every variable used in
#'   `formula`.
#' @return The cleaned data frame: response in column 1 followed by the
#'   predictors that survived the correlation screen.
validate_driver_data <- function(formula, data) {
  vars <- all.vars(formula)

  # Fail with a readable reason instead of a low-level subsetting error.
  missing_vars <- setdiff(vars, names(data))
  if (length(missing_vars) > 0) {
    stop("Missing columns: ", paste(missing_vars, collapse = ", "))
  }

  df <- data[, vars, drop = FALSE]
  df <- na.omit(df)
  if (nrow(df) <= (length(vars) - 1)) {
    stop("Insufficient observations: nobs <= predictors")
  }

  predictors <- df[, -1, drop = FALSE]
  # A pairwise correlation screen only makes sense with two or more
  # predictors; with a single predictor there is nothing to compare.
  if (ncol(predictors) >= 2) {
    cor_mat <- cor(predictors, use = "pairwise.complete.obs")
    high_corr <- findCorrelation(cor_mat, cutoff = 0.9)
    if (length(high_corr) > 0) {
      # Indices refer to the predictor-only matrix. Shift by one so the same
      # columns are removed from `df`, whose first column is the response.
      df <- df[, -(high_corr + 1), drop = FALSE]
    }
  }
  df
}
##############################
# TRUST DRIVER ANALYSIS
##############################
#' Fit one driver regression and export LMG relative importances.
#'
#' Validates the data with `validate_driver_data()`, fits `lm()` on the
#' predictors that survive validation, computes relative importance with
#' `calc.relimp(type = "lmg", rela = TRUE)` and writes a Predictor/Importance
#' table to `csv_file`. Any error is logged as SKIPPED and an empty table with
#' the same two columns is written instead.
#'
#' @param model_formula Two-sided formula; the first variable is the response.
#' @param data Data frame holding the variables of `model_formula`.
#' @param output_text_file Path of the text log appended to via `log_message()`.
#' @param csv_file Path of the CSV file to write.
trust_driver_analysis <- function(model_formula, data, output_text_file, csv_file) {
  # deparse() breaks long formulas into several strings; collapse them so that
  # every log entry is exactly one line.
  formula_label <- paste(
    trimws(deparse(model_formula, width.cutoff = 500L)),
    collapse = " "
  )

  tryCatch({
    df <- validate_driver_data(model_formula, data)

    # Validation may have removed predictors, so rebuild the formula from the
    # columns that are actually left; the original formula would reference
    # variables that no longer exist in `df`.
    formula_vars <- all.vars(model_formula)
    response <- formula_vars[1]
    if (!response %in% names(df)) {
      stop("Response variable was removed during validation")
    }
    kept <- setdiff(names(df), response)
    if (length(kept) == 0) {
      stop("No predictors left after validation")
    }
    dropped <- setdiff(formula_vars[-1], kept)
    fit_formula <- reformulate(
      sprintf("`%s`", kept),
      response = as.name(response)
    )

    model <- lm(fit_formula, data = df)
    design <- model.matrix(model)
    if (qr(design)$rank < ncol(design)) {
      stop("Model matrix is rank deficient")
    }

    rel <- calc.relimp(model, type = "lmg", rela = TRUE)
    results <- data.frame(
      Predictor = names(rel$lmg),
      Importance = rel$lmg
    )
    write.csv(results, csv_file, row.names = FALSE)

    if (length(dropped) > 0) {
      # Make it visible why the CSV has fewer rows than the formula has terms.
      log_message(
        paste0(
          "NOTE: ", formula_label,
          " | Dropped collinear predictors: ",
          paste(dropped, collapse = ", ")
        ),
        output_text_file
      )
    }
    log_message(paste("SUCCESS:", formula_label), output_text_file)
  }, error = function(e) {
    log_message(
      paste("SKIPPED:", formula_label, "| Reason:", e$message),
      output_text_file
    )
    # Still produce the file, with headers only.
    write.csv(
      data.frame(Predictor = character(), Importance = numeric()),
      csv_file,
      row.names = FALSE
    )
  })
}
+##############################
+# TRUST BUILDER ANALYSIS
+##############################
#' Select Trust Builder statements per bucket and export their SEM weights.
#'
#' For each of the six bucket columns, runs best-subset regression
#' (`regsubsets()`, up to six statements) on the columns whose name contains
#' "TB", then fits a `sem()` model of Trust on the buckets and of each bucket
#' on its selected statements. Standardised regression estimates are turned
#' into within-outcome percentages and written to `csv_file` in wide form,
#' together with the statement text looked up from `headers`. Any error is
#' logged as SKIPPED and an empty CSV is written instead.
#'
#' @param data Data frame with Trust, the six bucket columns and the TB
#'   statement columns.
#' @param headers Two-row data frame: row 1 supplies the message texts, row 2
#'   the names they are looked up by.
#' @param output_text_file Path of the text log appended to via `log_message()`.
#' @param csv_file Path of the CSV file to write.
trust_builder_analysis <- function(data, headers, output_text_file, csv_file) {
  tryCatch({
    if (nrow(data) <= ncol(data)) {
      stop("SEM skipped: insufficient sample size")
    }

    # Flat named character vector (name from row 2 -> text from row 1).
    first_as_chr <- function(x) as.character(x)[1]
    question_map <- setNames(
      vapply(headers[1, ], first_as_chr, character(1), USE.NAMES = FALSE),
      vapply(headers[2, ], first_as_chr, character(1), USE.NAMES = FALSE)
    )

    bucket_cols <- c(
      "Stability", "Development", "Relationship",
      "Benefit", "Vision", "Competence"
    )
    max_statements <- 6

    bucket <- data %>% select(all_of(bucket_cols))
    TB <- data %>% select(contains("TB"))
    x <- as.matrix(TB) # same predictor matrix for every bucket

    predictors_list <- list()
    for (bucket_name in bucket_cols) {
      fit <- regsubsets(x, bucket[[bucket_name]], nvmax = max_statements)
      fs <- summary(fit)
      # Use the largest model available, capped at `max_statements`, so a
      # shorter outmat does not raise "subscript out of bounds".
      size <- min(max_statements, nrow(fs$outmat))
      chosen <- fs$outmat[size, ] == "*"
      predictors_list[[bucket_name]] <- colnames(fs$outmat)[chosen]
    }

    model_str <- c(
      "Trust ~ Stability + Development + Relationship + Benefit + Vision + Competence",
      vapply(names(predictors_list), function(b) {
        paste(b, "~", paste(predictors_list[[b]], collapse = "+"))
      }, character(1))
    )

    fit <- sem(model_str, data = data)
    pe <- standardizedSolution(fit)

    # Share of each predictor within its outcome, from absolute std. estimates.
    output <- pe %>%
      filter(op == "~") %>%
      group_by(lhs) %>%
      mutate(percent = abs(est.std) / sum(abs(est.std)) * 100) %>%
      ungroup()

    result <- output %>%
      select(rhs, lhs, percent) %>%
      pivot_wider(names_from = lhs, values_from = percent, values_fill = 0)

    # Must be a plain character column: write.csv() cannot serialise a list
    # column. Predictors without a mapped message get NA.
    result$Message <- unname(question_map[as.character(result$rhs)])

    write.csv(result, csv_file, row.names = FALSE)
    log_message("SUCCESS: Trust Builder Model", output_text_file)
  }, error = function(e) {
    log_message(
      paste("SKIPPED: Trust Builder | Reason:", e$message),
      output_text_file
    )
    write.csv(
      data.frame(),
      csv_file,
      row.names = FALSE
    )
  })
}
##############################
# ARGUMENTS
##############################
# Positional arguments:
#   1      input workbook
#   2      text log
#   3-8    CSV outputs: trust, nps, loyalty, consideration, satisfaction,
#          builder
#   9-13   TRUE/FALSE flags: nps, loyalty, consideration, satisfaction, builder
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 13) {
  stop("Expected 13 arguments, got ", length(args), call. = FALSE)
}

input_file <- args[1]
output_text_file <- args[2]
csv_trust <- args[3]
csv_nps <- args[4]
csv_loyalty <- args[5]
csv_consideration <- args[6]
csv_satisfaction <- args[7]
csv_builder <- args[8]

# Parse a command-line flag. An unparseable value would make `if` fail with
# "missing value where TRUE/FALSE needed", so log it and fall back to FALSE.
parse_flag <- function(value, label) {
  flag <- as.logical(toupper(trimws(value)))
  if (is.na(flag)) {
    log_message(
      paste0(
        "WARNING: unrecognised value '", value, "' for ", label,
        "; treating as FALSE"
      ),
      output_text_file
    )
    return(FALSE)
  }
  flag
}

nps_present <- parse_flag(args[9], "nps_present")
loyalty_present <- parse_flag(args[10], "loyalty_present")
consideration_present <- parse_flag(args[11], "consideration_present")
satisfaction_present <- parse_flag(args[12], "satisfaction_present")
builder_present <- parse_flag(args[13], "builder_present")

log_message("Starting Trust Analysis Script", output_text_file)

##############################
# LOAD DATA
##############################
# Read one sheet of the input workbook; record the reason in the log before
# re-raising so a failed run is not silent in the log file.
read_sheet <- function(sheet, ...) {
  tryCatch(
    read_excel(input_file, sheet = sheet, ...),
    error = function(e) {
      log_message(
        paste0(
          "FAILED: could not read sheet '", sheet, "' | Reason: ",
          conditionMessage(e)
        ),
        output_text_file
      )
      stop(e)
    }
  )
}

driver_data <- read_sheet("Driver", skip = 3)

##############################
# DRIVER MODELS
##############################
driver_terms <- c(
  "Stability", "Development", "Relationship",
  "Benefit", "Vision", "Competence"
)
# One entry per outcome; Trust always runs, the others depend on their flag.
driver_models <- list(
  list(outcome = "Trust", run = TRUE, csv = csv_trust),
  list(outcome = "NPS", run = nps_present, csv = csv_nps),
  list(outcome = "Loyalty", run = loyalty_present, csv = csv_loyalty),
  list(
    outcome = "Consideration", run = consideration_present,
    csv = csv_consideration
  ),
  list(
    outcome = "Satisfaction", run = satisfaction_present,
    csv = csv_satisfaction
  )
)
for (spec in driver_models) {
  if (spec$run) {
    trust_driver_analysis(
      reformulate(driver_terms, response = spec$outcome),
      driver_data, output_text_file, spec$csv
    )
  }
}

##############################
# BUILDER MODEL
##############################
if (builder_present) {
  # Two rows after the header row give the message/name mapping; the data
  # itself is read with two more rows skipped.
  headers <- read_sheet("Builder", skip = 3, n_max = 2)
  rows <- read_sheet("Builder", skip = 5)
  trust_builder_analysis(rows, headers, output_text_file, csv_builder)
}

log_message("Trust Analysis Script Completed", output_text_file)