Wajahat698 committed on
Commit
5951a96
·
verified ·
1 Parent(s): 6816a2f

Update process_data.R

Browse files
Files changed (1) hide show
  1. process_data.R +163 -219
process_data.R CHANGED
@@ -1,4 +1,6 @@
1
- # Load required libraries
 
 
2
  library(relaimpo)
3
  library(readxl)
4
  library(readr)
@@ -6,286 +8,228 @@ library(lavaan)
6
  library(leaps)
7
  library(dplyr)
8
  library(tidyr)
 
9
 
10
- # Logging function
 
 
11
  log_message <- function(message, output_text_file) {
12
  cat(message, "\n")
13
  write(message, file = output_text_file, append = TRUE)
14
  }
15
 
16
- # Trust Driver analysis function
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  trust_driver_analysis <- function(model_formula, data, output_text_file, csv_file) {
18
  tryCatch({
19
- # Fit linear regression model
20
- model <- lm(model_formula, data = data)
21
-
22
- # Calculate relative importance using the lmg method
23
- calc_relaimpo <- calc.relimp(model, type = "lmg", rela = TRUE)
24
- # Calculate average importance
25
- average_importance <- mean(calc_relaimpo$lmg)
26
-
27
- # Open the output text file in append mode to add this model's output
28
- file_conn <- file(output_text_file, open = "a")
29
- # Capture output to include in the text file
30
- full_output <- capture.output({
31
- print("Trust Driver Analysis:\n")
32
- print(calc_relaimpo)
33
- cat("\nAverage Importance: ", average_importance, "\n")
34
- })
35
- # Write output to text file
36
- writeLines(full_output, file_conn)
37
- close(file_conn)
38
-
39
- # Create data frame of predictor names and their importance
40
- results <- data.frame(Predictor = names(calc_relaimpo$lmg), Importance = calc_relaimpo$lmg)
41
-
42
- # Save results to CSV file
43
- write.csv(results, file = csv_file, row.names = FALSE)
44
  }, error = function(e) {
45
- log_message(paste("Error in trust_driver_analysis:", e$message), output_text_file)
 
 
 
 
 
 
 
 
 
 
46
  })
47
  }
48
 
49
- # Trust Builder Analysis function
50
- trust_builder_analysis <- function(data, data_headers, output_text_file, csv_file) {
 
 
51
  tryCatch({
52
- # Map the questions to column names
53
- question_to_column <- setNames(as.list(data_headers[1, ]), as.character(data_headers[2, ]))
54
 
55
- # Number of important statements to be selected
56
- p <- 6
 
57
 
58
- # Define the list of column names
59
- bucket_columns <- c("Stability", "Development", "Relationship", "Benefit", "Vision", "Competence")
60
 
61
- # Select columns based on the predefined list
62
- bucket <- data %>% select(all_of(bucket_columns))
 
 
63
 
64
- # Select all columns from the consumer dataframe that contain "TB" in their names and assign them to the variable TB
65
  TB <- data %>% select(contains("TB"))
66
 
67
- # Dynamically detect the number of TB statements
68
- num_tb_statements <- ncol(TB)
69
-
70
- # Initialize a matrix with number of TB rows (37 for Volkswagen) and 6 columns, filled with NA values
71
- coef <- matrix(NA, ncol = 6, nrow = num_tb_statements)
72
-
73
- # Initialize an empty list to store the predictors for each bucket column
74
- bucket_predictors <- list()
75
 
76
- # Loop over each of the 6 columns
77
  for (i in 1:6) {
78
- # Extract the i-th column from 'bucket' as a matrix and assign it to 'y'
79
- y <- as.matrix(pull(bucket[, i]))
80
-
81
- # Convert 'TB' dataframe to a matrix and assign it to 'x'
82
  x <- as.matrix(TB)
83
 
84
- # Perform best subset regression using 'x' as predictors and 'y' as the response variable
85
- fit <- regsubsets(x, y, nbest = 1, nvmax = p)
86
 
87
- # Summarize the regression subsets
88
- fit_sum <- summary(fit)
 
89
 
90
- # Store the coefficients of the best model in the i-th column of 'coef' matrix
91
- coef[, i] <- fit_sum$outmat[p, ]
 
 
 
 
92
 
93
- # Print the predictors used in the best model
94
- predictors <- names(which(fit_sum$outmat[p, ] == "*"))
95
 
96
- # Append the predictors to the bucket_predictors list
97
- bucket_predictors[[bucket_columns[i]]] <- predictors
98
- }
 
 
99
 
100
- # Create the desired output format as model
101
- model_str <- sapply(names(bucket_predictors), function(col) {
102
- paste(col, "~", paste(bucket_predictors[[col]], collapse = "+"))
103
- })
104
 
105
- # Prepend the Trust x and y to model_str
106
- model_str <- c("Trust ~ Stability + Development + Relationship + Benefit + Vision + Competence", model_str)
107
 
108
- # Fit the model using sem() function
109
- fit <- sem(model_str, data = data)
110
- fit_summary <- summary(fit, standardized = TRUE, fit.measures = TRUE, rsquare = TRUE)
111
-
112
- # Make it percentages
113
- output <- fit_summary$pe[fit_summary$pe$op == "~", c("lhs", "rhs", "std.all")]
114
-
115
- # Define the function to convert std.all to percentages
116
- convert_to_percentage <- function(df) {
117
- df %>%
118
- group_by(lhs) %>%
119
- mutate(abs_std = abs(std.all),
120
- sum_abs_std = sum(abs_std),
121
- percent_std = (abs_std / sum_abs_std) * 100) %>%
122
- select(-abs_std, -sum_abs_std) %>%
123
- ungroup()
124
- }
125
 
126
- # Convert the estimates to percentages
127
- percentage_output <- convert_to_percentage(output)
128
-
129
- # Extract TB column names
130
- tb_column_names <- colnames(TB)
131
-
132
- # Convert std.all to a wide format dataframe
133
- percentage_output_wide <- percentage_output %>%
134
- pivot_wider(names_from = lhs, values_from = percent_std) %>%
135
- rename_with(~ gsub("std.all\\.", "", .), starts_with("std.all"))
136
-
137
- # Create a new dataframe with TB columns and percentage estimates
138
- result_df <- data.frame(TB = tb_column_names)
139
-
140
- # Merge the result_df with percentage_estimates_wide
141
- result_df <- left_join(result_df, percentage_output_wide, by = c("TB" = "rhs"))
142
-
143
- # Fill NA values with 0 to ensure proper representation
144
- result_df[is.na(result_df)] <- 0
145
-
146
- # Add corresponding messages of TB as a new column
147
- result_df$Message <- sapply(result_df$TB, function(tb_col) question_to_column[[tb_col]])
148
-
149
- # Convert 'TB' column to a factor with the correct order
150
- result_df$TB <- factor(result_df$TB, levels = paste0("TB", 1:37))
151
-
152
- # Exclude 'est' and 'Trust' columns and merge rows by 'TB'
153
- result_df <- result_df %>%
154
- select(-std.all, -Trust) %>%
155
- group_by(TB) %>%
156
- summarise(across(everything(), ~ if(is.numeric(.)) sum(., na.rm = TRUE) else first(.))) %>%
157
- arrange(TB)
158
-
159
- # Reorder columns to have Message as the second column
160
- result_df <- result_df %>%
161
- select(TB, Message, everything())
162
-
163
- # Open the output text file in append mode to add this model's output
164
- file_conn <- file(output_text_file, open = "a")
165
-
166
- # Capture output to include in the text file
167
- full_output <- capture.output({
168
- print("Trust Builder Analysis:\n")
169
- print("Data header mapping:\n")
170
- print(question_to_column)
171
- print("Buckets:\n")
172
- print(bucket)
173
- print("Messages:\n")
174
- print(TB)
175
- print("Coefficients matrix (coef:\n")
176
- print(coef)
177
- print("Model:\n")
178
- cat(model_str, sep = "\n")
179
- print("Fit summary:\n")
180
- print(fit_summary)
181
- print("Output:\n")
182
- print(output)
183
- print("Output in percentage (%):\n")
184
- print(percentage_output)
185
- print("result_df:\n")
186
- print(result_df)
187
- })
188
- # Write output to text file
189
- writeLines(full_output, file_conn)
190
- close(file_conn)
191
-
192
- # Create data frame of predictor names and their importance
193
- results <- data.frame(result_df)
194
-
195
- # Save results to CSV file
196
- write.csv(results, file = csv_file, row.names = FALSE)
197
  }, error = function(e) {
198
- log_message(paste("Error in trust_builder_analysis:", e$message), output_text_file)
 
 
 
 
 
 
 
 
 
 
199
  })
200
  }
201
 
202
- # Read command-line arguments
 
 
203
  args <- commandArgs(trailingOnly = TRUE)
 
204
  input_file <- args[1]
205
- output_text_file <- args[2] # Base path for output text and CSV files
206
- csv_output_path_trust <- args[3]
207
- csv_output_path_nps <- args[4]
208
- csv_output_path_loyalty <- args[5]
209
- csv_output_path_consideration <- args[6]
210
- csv_output_path_satisfaction <- args[7]
211
- csv_output_path_trustbuilder <- args[8]
212
- nps_present <- as.logical(tolower(args[9])) # Expecting "TRUE" or "FALSE" as the argument
 
213
  loyalty_present <- as.logical(tolower(args[10]))
214
  consideration_present <- as.logical(tolower(args[11]))
215
  satisfaction_present <- as.logical(tolower(args[12]))
216
- trustbuilder_present <- as.logical(tolower(args[13]))
217
 
218
- # Log the starting of the script
219
- log_message("Starting Trust Driver and Builder Analysis Script.", output_text_file)
220
 
221
- ########## Trust Driver Analysis ######################
 
 
 
222
 
223
- # Load the trust driver dataset (CSV or Excel)
224
- data_driver <- NULL
225
- if (grepl(".xlsx", input_file)) {
226
- # Load the Excel file with the fourth row as the header
227
- data_driver <- read_excel(input_file, sheet = "Driver", skip = 3)
228
- }
229
-
230
- # Process the Trust model
231
  trust_driver_analysis(
232
- Trust ~ Stability + Development + Relationship + Benefit + Vision + Competence,
233
- data_driver,
234
- output_text_file,
235
- csv_output_path_trust)
236
 
237
- # Conditionally process the NPS model
238
  if (nps_present) {
239
  trust_driver_analysis(
240
- NPS ~ Stability + Development + Relationship + Benefit + Vision + Competence,
241
- data_driver,
242
- output_text_file,
243
- csv_output_path_nps)
244
  }
245
 
246
- # Conditionally process the Loyalty model
247
  if (loyalty_present) {
248
  trust_driver_analysis(
249
- Loyalty ~ Stability + Development + Relationship + Benefit + Vision + Competence,
250
- data_driver,
251
- output_text_file,
252
- csv_output_path_loyalty)
253
  }
254
 
255
- # Conditionally process the Consideration model
256
  if (consideration_present) {
257
  trust_driver_analysis(
258
- Consideration ~ Stability + Development + Relationship + Benefit + Vision + Competence,
259
- data_driver,
260
- output_text_file,
261
- csv_output_path_consideration)
262
  }
263
 
264
- # Conditionally process the Satisfaction model
265
  if (satisfaction_present) {
266
  trust_driver_analysis(
267
- Satisfaction ~ Stability + Development + Relationship + Benefit + Vision + Competence,
268
- data_driver,
269
- output_text_file,
270
- csv_output_path_satisfaction)
271
  }
272
 
273
- ########## Trust Builder Analysis ######################
274
-
275
- if (trustbuilder_present) {
276
- data_builder <- NULL
277
-
278
- if (grepl(".xlsx", input_file)) {
279
- # Read the 4th and 5th rows as header mapping
280
- data_builder_headers <- read_excel(input_file, sheet = "Builder", skip = 3, n_max = 2)
281
- # Read the rest of the data, skipping the first 5 rows (to start from row 6)
282
- data_builder_rows <- read_excel(input_file, sheet = "Builder", skip = 5)
283
- }
284
 
285
- # Process the Builder model
286
- trust_builder_analysis(data_builder_rows, data_builder_headers, output_text_file, csv_output_path_trustbuilder)
287
 
 
288
  }
289
 
290
- # Log the ending of the script
291
- log_message("Trust Driver and Builder Analysis Script Completed.", output_text_file)
 
1
+ ##############################
2
+ # LIBRARIES
3
+ ##############################
4
  library(relaimpo)
5
  library(readxl)
6
  library(readr)
 
8
  library(leaps)
9
  library(dplyr)
10
  library(tidyr)
11
+ library(caret)
12
 
13
+ ##############################
14
+ # LOGGING
15
+ ##############################
16
  log_message <- function(message, output_text_file) {
17
  cat(message, "\n")
18
  write(message, file = output_text_file, append = TRUE)
19
  }
20
 
21
+ ##############################
22
+ # VALIDATION HELPERS
23
+ ##############################
24
# Prepare the modelling data for one driver regression.
#
# Keeps only the variables named in `formula`, removes rows with missing
# values, checks that there are more complete rows than predictors, and drops
# predictors flagged by findCorrelation() at a 0.9 cutoff.
#
# formula: two-sided model formula; the first name returned by all.vars() is
#          treated as the response (assumes plain `y ~ x1 + x2` formulas).
# data:    data frame containing every variable used in `formula`.
#
# Returns the reduced data frame. The response column is always retained.
# Stops with an error when columns are missing or there are too few rows.
validate_driver_data <- function(formula, data) {
  vars <- all.vars(formula)

  missing_vars <- setdiff(vars, names(data))
  if (length(missing_vars) > 0) {
    stop("Missing columns: ", paste(missing_vars, collapse = ", "))
  }

  predictors <- vars[-1]
  df <- na.omit(data[, vars, drop = FALSE])

  if (nrow(df) <= length(predictors)) {
    stop("Insufficient observations: nobs <= predictors")
  }

  # A correlation screen only makes sense with at least two predictors.
  if (length(predictors) >= 2) {
    cor_mat <- cor(
      df[, predictors, drop = FALSE],
      use = "pairwise.complete.obs"
    )
    high_corr <- findCorrelation(cor_mat, cutoff = 0.9)

    if (length(high_corr) > 0) {
      # The indices refer to columns of `cor_mat` (predictors only), not to
      # columns of `df`, so translate them to names before dropping. Using
      # them positionally on `df` would be off by one and could remove the
      # response.
      drop_names <- colnames(cor_mat)[high_corr]
      df <- df[, setdiff(names(df), drop_names), drop = FALSE]
    }
  }

  df
}
42
+
43
+ ##############################
44
+ # TRUST DRIVER ANALYSIS
45
+ ##############################
46
  trust_driver_analysis <- function(model_formula, data, output_text_file, csv_file) {
47
  tryCatch({
48
+
49
+ df <- validate_driver_data(model_formula, data)
50
+
51
+ model <- lm(model_formula, data = df)
52
+
53
+ if (qr(model.matrix(model))$rank < ncol(model.matrix(model))) {
54
+ stop("Model matrix is rank deficient")
55
+ }
56
+
57
+ rel <- calc.relimp(model, type = "lmg", rela = TRUE)
58
+
59
+ results <- data.frame(
60
+ Predictor = names(rel$lmg),
61
+ Importance = rel$lmg
62
+ )
63
+
64
+ write.csv(results, csv_file, row.names = FALSE)
65
+
66
+ log_message(
67
+ paste("SUCCESS:", deparse(model_formula)),
68
+ output_text_file
69
+ )
70
+
 
 
71
  }, error = function(e) {
72
+
73
+ log_message(
74
+ paste("SKIPPED:", deparse(model_formula), "| Reason:", e$message),
75
+ output_text_file
76
+ )
77
+
78
+ write.csv(
79
+ data.frame(Predictor = character(), Importance = numeric()),
80
+ csv_file,
81
+ row.names = FALSE
82
+ )
83
  })
84
  }
85
 
86
+ ##############################
87
+ # TRUST BUILDER ANALYSIS
88
+ ##############################
89
# Select Trust Builder statements per bucket, fit the SEM, and export shares.
#
# For each bucket column, regsubsets() picks up to six "TB" columns. Those
# become the regressors of that bucket in a sem() model that also regresses
# Trust on the six buckets. Standardized estimates are converted to each
# regressor's percentage share of absolute effect within its outcome and
# written to `csv_file` in wide form, one row per regressor.
#
# data:             data frame with Trust, the six bucket columns and the TB
#                   columns.
# headers:          two-row data frame; row 2 supplies the lookup keys and
#                   row 1 the looked-up values (presumably TB codes and
#                   question text; confirm against the workbook).
# output_text_file: log file path.
# csv_file:         destination CSV.
#
# Any failure is logged as SKIPPED and an empty CSV is written instead.
trust_builder_analysis <- function(data, headers, output_text_file, csv_file) {
  tryCatch({
    if (nrow(data) <= ncol(data)) {
      stop("SEM skipped: insufficient sample size")
    }

    # Coerce the key row explicitly so the lookup names are plain strings.
    question_map <- setNames(
      as.list(headers[1, ]),
      as.character(unlist(headers[2, ], use.names = FALSE))
    )

    bucket_cols <- c(
      "Stability", "Development", "Relationship",
      "Benefit", "Vision", "Competence"
    )
    max_statements <- 6

    bucket <- data %>% select(all_of(bucket_cols))
    TB <- data %>% select(contains("TB"))
    if (ncol(TB) == 0) {
      stop("No Trust Builder (TB) columns found")
    }

    # The predictor matrix is the same for every bucket, so build it once.
    x <- as.matrix(TB)
    n_select <- min(max_statements, ncol(x))

    predictors_list <- lapply(bucket_cols, function(col) {
      fs <- summary(regsubsets(x, bucket[[col]], nvmax = n_select))
      # Fewer subset sizes than requested may be returned; use the largest
      # one available rather than indexing a row that does not exist.
      best_row <- min(n_select, nrow(fs$outmat))
      colnames(fs$outmat)[fs$outmat[best_row, ] == "*"]
    })
    names(predictors_list) <- bucket_cols

    bucket_lines <- vapply(
      bucket_cols,
      function(col) {
        paste(col, "~", paste(predictors_list[[col]], collapse = "+"))
      },
      character(1),
      USE.NAMES = FALSE
    )
    model_str <- paste(
      c(
        "Trust ~ Stability + Development + Relationship + Benefit + Vision + Competence",
        bucket_lines
      ),
      collapse = "\n"
    )

    fit <- sem(model_str, data = data)
    pe <- standardizedSolution(fit)

    output <- pe %>%
      filter(op == "~") %>%
      group_by(lhs) %>%
      mutate(percent = abs(est.std) / sum(abs(est.std)) * 100) %>%
      ungroup()

    result <- output %>%
      select(rhs, lhs, percent) %>%
      pivot_wider(names_from = lhs, values_from = percent, values_fill = 0)

    # Build a plain character column: a list column cannot be written by
    # write.csv(). Regressors without a mapped message get NA.
    result$Message <- vapply(
      result$rhs,
      function(key) {
        msg <- question_map[[key]]
        if (is.null(msg) || length(msg) == 0) {
          NA_character_
        } else {
          as.character(msg[[1]])
        }
      },
      character(1),
      USE.NAMES = FALSE
    )

    write.csv(result, csv_file, row.names = FALSE)

    log_message("SUCCESS: Trust Builder Model", output_text_file)
  }, error = function(e) {
    log_message(
      paste("SKIPPED: Trust Builder | Reason:", conditionMessage(e)),
      output_text_file
    )

    write.csv(
      data.frame(),
      csv_file,
      row.names = FALSE
    )
  })
}
160
 
161
+ ##############################
162
+ # ARGUMENTS
163
+ ##############################
164
  args <- commandArgs(trailingOnly = TRUE)
165
+
166
  input_file <- args[1]
167
+ output_text_file <- args[2]
168
+ csv_trust <- args[3]
169
+ csv_nps <- args[4]
170
+ csv_loyalty <- args[5]
171
+ csv_consideration <- args[6]
172
+ csv_satisfaction <- args[7]
173
+ csv_builder <- args[8]
174
+
175
+ nps_present <- as.logical(tolower(args[9]))
176
  loyalty_present <- as.logical(tolower(args[10]))
177
  consideration_present <- as.logical(tolower(args[11]))
178
  satisfaction_present <- as.logical(tolower(args[12]))
179
+ builder_present <- as.logical(tolower(args[13]))
180
 
181
+ log_message("Starting Trust Analysis Script", output_text_file)
 
182
 
183
+ ##############################
184
+ # LOAD DATA
185
+ ##############################
186
+ driver_data <- read_excel(input_file, sheet = "Driver", skip = 3)
187
 
188
+ ##############################
189
+ # DRIVER MODELS
190
+ ##############################
 
 
 
 
 
191
  trust_driver_analysis(
192
+ Trust ~ Stability + Development + Relationship + Benefit + Vision + Competence,
193
+ driver_data, output_text_file, csv_trust
194
+ )
 
195
 
 
196
  if (nps_present) {
197
  trust_driver_analysis(
198
+ NPS ~ Stability + Development + Relationship + Benefit + Vision + Competence,
199
+ driver_data, output_text_file, csv_nps
200
+ )
 
201
  }
202
 
 
203
  if (loyalty_present) {
204
  trust_driver_analysis(
205
+ Loyalty ~ Stability + Development + Relationship + Benefit + Vision + Competence,
206
+ driver_data, output_text_file, csv_loyalty
207
+ )
 
208
  }
209
 
 
210
  if (consideration_present) {
211
  trust_driver_analysis(
212
+ Consideration ~ Stability + Development + Relationship + Benefit + Vision + Competence,
213
+ driver_data, output_text_file, csv_consideration
214
+ )
 
215
  }
216
 
 
217
  if (satisfaction_present) {
218
  trust_driver_analysis(
219
+ Satisfaction ~ Stability + Development + Relationship + Benefit + Vision + Competence,
220
+ driver_data, output_text_file, csv_satisfaction
221
+ )
 
222
  }
223
 
224
+ ##############################
225
+ # BUILDER MODEL
226
+ ##############################
227
+ if (builder_present) {
 
 
 
 
 
 
 
228
 
229
+ headers <- read_excel(input_file, sheet = "Builder", skip = 3, n_max = 2)
230
+ rows <- read_excel(input_file, sheet = "Builder", skip = 5)
231
 
232
+ trust_builder_analysis(rows, headers, output_text_file, csv_builder)
233
  }
234
 
235
+ log_message("Trust Analysis Script Completed", output_text_file)