Wajahat698 committed on
Commit
80f7025
·
verified ·
1 Parent(s): 02f5246

Update process_data.R

Browse files
Files changed (1) hide show
  1. process_data.R +267 -49
process_data.R CHANGED
@@ -1,73 +1,291 @@
1
  # Load required libraries
 
2
  library(readxl)
3
  library(readr)
 
 
 
 
4
 
5
# Process one model: one-factor analysis of the need statements followed by a
# linear regression of the factor score on all statements.
#
# Args:
#   data:             data.frame of numeric need-statement columns.
#   output_text_file: path to a text report; summary output is appended.
#   csv_file:         path where the Needs/Coefficient table is written.
#
# Side effects: appends to output_text_file and writes csv_file.
process_model <- function(data, output_text_file, csv_file) {
  # Keep only rows where every value is finite.
  # NOTE: the previous complete.cases() call only dropped NA/NaN; is.finite()
  # also drops Inf, matching the stated intent of removing all non-finite rows.
  finite_rows <- apply(is.finite(as.matrix(data)), 1, all)
  data <- data[finite_rows, , drop = FALSE]

  # Print the dimensions of the filtered data frame.
  cat("Filtered data shape: ", nrow(data), " rows, ", ncol(data), " columns\n")

  # One-factor analysis with regression-method factor scores.
  factor_scores <- factanal(data, 1, scores = "regression")

  # Rank the need statements by factor loading, highest first.
  # Use the full element name "scores"/"loadings" — no $ partial matching.
  need_var <- rownames(factor_scores$loadings)
  need_var <- need_var[order(factor_scores$loadings, decreasing = TRUE)]

  # Attach factor scores as a plain numeric column ($scores is a one-column
  # matrix; drop() avoids embedding a matrix in the data frame).
  data$score <- drop(factor_scores$scores)

  # Regress the factor score on all need statements (ranked order).
  model_formula <- as.formula(paste("score ~", paste(need_var, collapse = " + ")))
  fit <- lm(model_formula, data = data)

  # Normalize the slope coefficients so they sum to 1 (intercept excluded).
  # coef() accessor instead of the partial-matching fit$coef.
  coefs <- coef(fit)[-1]
  normalized_coef <- cbind(coefs / sum(coefs))

  # Variable names paired with their normalized coefficients.
  result <- data.frame(Needs = names(coefs), Coefficient = normalized_coef)

  # Average of the normalized coefficients, reported in the text file.
  average_coefficient <- mean(result$Coefficient)

  # Append the summary to the text report; on.exit() guarantees the
  # connection is closed even if capture.output() fails.
  file_conn <- file(output_text_file, open = "a")
  on.exit(close(file_conn), add = TRUE)
  full_output <- capture.output({
    cat("Filtered data shape:", nrow(data), "rows,", ncol(data), "columns\n")
    cat("\nAverage Coefficient: ", average_coefficient, "\n")
  })
  writeLines(full_output, file_conn)

  # Save results to CSV file.
  write.csv(result, file = csv_file, row.names = FALSE)
}
55
 
56
  # Read command-line arguments
57
  args <- commandArgs(trailingOnly = TRUE)
58
  input_file <- args[1]
59
  output_text_file <- args[2] # Base path for output text and CSV files
60
- csv_output_path_needs <- args[3]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
- # Load the dataset (CSV or Excel)
63
- data <- NULL
64
  if (grepl(".xlsx", input_file)) {
65
- data <- read_excel(input_file, sheet = "Needs and Hygiene")
66
- } else if (grepl(".csv", input_file)) {
67
- data <- read_csv(input_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  }
69
 
70
- # Implement the factor analysis and linear regression
71
- process_model(data,
72
- output_text_file,
73
- csv_output_path_needs)
 
1
  # Load required libraries
2
+ library(relaimpo)
3
  library(readxl)
4
  library(readr)
5
+ library(lavaan)
6
+ library(leaps)
7
+ library(dplyr)
8
+ library(tidyr)
9
 
10
# Emit a status message twice: to the console and, persistently, to the
# run's text log file.
#
# Args:
#   message:          character string to record.
#   output_text_file: path of the log file (appended to, created if absent).
log_message <- function(message, output_text_file) {
  # Echo to the console first, then persist to the log file.
  cat(message, "\n")
  write(x = message, file = output_text_file, append = TRUE)
}
15
 
16
# Trust Driver analysis: fit a linear model and decompose its R^2 into
# per-predictor relative importances (LMG method).
#
# Args:
#   model_formula:    formula, e.g. Trust ~ Stability + ... + Competence.
#   data:             data.frame containing every variable in the formula.
#   output_text_file: text report path (appended to).
#   csv_file:         destination CSV with Predictor/Importance columns.
#
# Errors are caught and logged via log_message() rather than propagated,
# so one failing model does not abort the batch of analyses.
trust_driver_analysis <- function(model_formula, data, output_text_file, csv_file) {
  tryCatch({
    # Fit linear regression model.
    model <- lm(model_formula, data = data)

    # Relative importance via the lmg method; rela = TRUE rescales the
    # shares so they sum to 1.
    calc_relaimpo <- calc.relimp(model, type = "lmg", rela = TRUE)
    # Average importance across predictors.
    average_importance <- mean(calc_relaimpo$lmg)

    # Append this model's report; on.exit() closes the connection even if
    # capture.output() errors (the old code leaked it in that case).
    file_conn <- file(output_text_file, open = "a")
    on.exit(close(file_conn), add = TRUE)
    full_output <- capture.output({
      # cat(), not print(): print() rendered the literal '[1] "...\n"'.
      cat("Trust Driver Analysis:\n")
      print(calc_relaimpo)
      cat("\nAverage Importance: ", average_importance, "\n")
    })
    # Write output to text file.
    writeLines(full_output, file_conn)

    # Predictor names with their relative importance shares.
    results <- data.frame(Predictor = names(calc_relaimpo$lmg),
                          Importance = calc_relaimpo$lmg)

    # Save results to CSV file.
    write.csv(results, file = csv_file, row.names = FALSE)
  }, error = function(e) {
    log_message(paste("Error in trust_driver_analysis:", e$message), output_text_file)
  })
}
48
 
49
# Trust Builder analysis: for each of six trust "buckets", select the best
# TB-statement predictors via best-subset regression, fit a structural
# equation model over the selected structure (lavaan), and export the
# per-statement standardized importances as percentages.
#
# Args:
#   data:             builder-sheet rows; must contain the six bucket columns
#                     and the TB* statement columns.
#   data_headers:     2-row frame read from the sheet header; row 1 holds the
#                     column labels and row 2 the question text — TODO confirm
#                     this row order against the actual "Builder" sheet.
#   output_text_file: text report path (appended to).
#   csv_file:         destination CSV for the final result table.
#
# Errors are caught and logged via log_message() so a failure here does not
# abort the rest of the script.
trust_builder_analysis <- function(data, data_headers, output_text_file, csv_file) {
  tryCatch({
    # Map question text -> column name (list named by the question strings).
    question_to_column <- setNames(as.list(data_headers[1, ]), as.character(data_headers[2, ]))

    # Number of predictors kept by best-subset selection for each bucket.
    p <- 6

    # The six trust "bucket" outcome columns.
    bucket_columns <- c("Stability", "Development", "Relationship", "Benefit", "Vision", "Competence")

    # Outcome columns, in the fixed bucket order.
    bucket <- data %>% select(all_of(bucket_columns))

    # Every column whose name contains "TB" is treated as a trust-builder
    # statement — NOTE(review): contains("TB") would also catch unrelated
    # columns with "TB" in the name; verify against the sheet layout.
    TB <- data %>% select(contains("TB"))

    # Dynamically detect the number of TB statements.
    num_tb_statements <- ncol(TB)

    # One column of subset-selection results per bucket.
    # NOTE(review): outmat below is a character matrix ("*" / " "), so this
    # numeric NA matrix is coerced to character on first assignment.
    coef <- matrix(NA, ncol = 6, nrow = num_tb_statements)

    # Selected predictors for each bucket, keyed by bucket name.
    bucket_predictors <- list()

    # Best-subset regression of each bucket on all TB statements.
    for (i in 1:6) {
      # i-th bucket column as the response vector.
      y <- as.matrix(pull(bucket[, i]))

      # All TB statements as the predictor matrix.
      x <- as.matrix(TB)

      # Best subsets up to size p, one best model per size.
      fit <- regsubsets(x, y, nbest = 1, nvmax = p)

      # Summarize the regression subsets.
      fit_sum <- summary(fit)

      # Row p of outmat flags ("*") the predictors of the best p-variable model.
      coef[, i] <- fit_sum$outmat[p, ]

      # Names of the predictors selected for this bucket.
      predictors <- names(which(fit_sum$outmat[p, ] == "*"))

      # Append the predictors to the bucket_predictors list.
      bucket_predictors[[bucket_columns[i]]] <- predictors
    }

    # lavaan model syntax: one regression line per bucket using its
    # selected TB predictors ...
    model_str <- sapply(names(bucket_predictors), function(col) {
      paste(col, "~", paste(bucket_predictors[[col]], collapse = "+"))
    })

    # ... preceded by the top-level Trust regression on all six buckets.
    model_str <- c("Trust ~ Stability + Development + Relationship + Benefit + Vision + Competence", model_str)

    # Fit the SEM and keep standardized estimates plus fit measures.
    fit <- sem(model_str, data = data)
    fit_summary <- summary(fit, standardized = TRUE, fit.measures = TRUE, rsquare = TRUE)

    # Keep only regression paths (op == "~"): outcome, predictor, std estimate.
    output <- fit_summary$pe[fit_summary$pe$op == "~", c("lhs", "rhs", "std.all")]

    # Within each outcome (lhs), rescale |std.all| to percentages that sum
    # to 100.
    convert_to_percentage <- function(df) {
      df %>%
        group_by(lhs) %>%
        mutate(abs_std = abs(std.all),
               sum_abs_std = sum(abs_std),
               percent_std = (abs_std / sum_abs_std) * 100) %>%
        select(-abs_std, -sum_abs_std) %>%
        ungroup()
    }

    # Convert the estimates to percentages.
    percentage_output <- convert_to_percentage(output)

    # TB column names in original sheet order.
    tb_column_names <- colnames(TB)

    # Wide format: one column per outcome. The rename_with() strips any
    # "std.all." prefix pivot_wider may have produced.
    percentage_output_wide <- percentage_output %>%
      pivot_wider(names_from = lhs, values_from = percent_std) %>%
      rename_with(~ gsub("std.all\\.", "", .), starts_with("std.all"))

    # One row per TB statement, preserving sheet order.
    result_df <- data.frame(TB = tb_column_names)

    # Join the percentage estimates onto the TB list.
    result_df <- left_join(result_df, percentage_output_wide, by = c("TB" = "rhs"))

    # Statements not selected by any bucket get 0 instead of NA.
    result_df[is.na(result_df)] <- 0

    # Attach the human-readable question text for each TB column.
    result_df$Message <- sapply(result_df$TB, function(tb_col) question_to_column[[tb_col]])

    # Order rows TB1, TB2, ... numerically rather than lexically.
    # NOTE(review): hard-coded 1:37 — any TB column beyond TB37 becomes NA
    # here despite num_tb_statements being dynamic; consider
    # paste0("TB", seq_len(num_tb_statements)). TODO confirm.
    result_df$TB <- factor(result_df$TB, levels = paste0("TB", 1:37))

    # Drop the raw estimate and Trust columns, then collapse duplicate TB
    # rows (one per outcome from the join) into a single row each.
    result_df <- result_df %>%
      select(-std.all, -Trust) %>%
      group_by(TB) %>%
      summarise(across(everything(), ~ if(is.numeric(.)) sum(., na.rm = TRUE) else first(.))) %>%
      arrange(TB)

    # Reorder columns to have Message as the second column.
    result_df <- result_df %>%
      select(TB, Message, everything())

    # Append the full diagnostic dump to the text report.
    file_conn <- file(output_text_file, open = "a")

    # Capture output to include in the text file.
    full_output <- capture.output({
      print("Trust Builder Analysis:\n")
      print("Data header mapping:\n")
      print(question_to_column)
      print("Buckets:\n")
      print(bucket)
      print("Messages:\n")
      print(TB)
      print("Coefficients matrix (coef:\n")
      print(coef)
      print("Model:\n")
      cat(model_str, sep = "\n")
      print("Fit summary:\n")
      print(fit_summary)
      print("Output:\n")
      print(output)
      print("Output in percentage (%):\n")
      print(percentage_output)
      print("result_df:\n")
      print(result_df)
    })
    # Write output to text file.
    writeLines(full_output, file_conn)
    close(file_conn)

    # Final table as a plain data.frame for write.csv().
    results <- data.frame(result_df)

    # Save results to CSV file.
    write.csv(results, file = csv_file, row.names = FALSE)
  }, error = function(e) {
    log_message(paste("Error in trust_builder_analysis:", e$message), output_text_file)
  })
}
201
 
202
# ---- Script entry point ----
#
# Expected arguments (in order):
#   1: input file (.xlsx)    2: output text report
#   3-8: CSV paths for trust, nps, loyalty, consideration, satisfaction,
#        trustbuilder outputs
#   9-13: "TRUE"/"FALSE" flags enabling the optional nps / loyalty /
#         consideration / satisfaction / trustbuilder analyses
args <- commandArgs(trailingOnly = TRUE)
input_file <- args[1]
output_text_file <- args[2] # Base path for output text and CSV files
csv_output_path_trust <- args[3]
csv_output_path_nps <- args[4]
csv_output_path_loyalty <- args[5]
csv_output_path_consideration <- args[6]
csv_output_path_satisfaction <- args[7]
csv_output_path_trustbuilder <- args[8]
# Flags arrive as strings; tolower() makes the parse case-insensitive.
nps_present <- as.logical(tolower(args[9])) # Expecting "TRUE" or "FALSE" as the argument
loyalty_present <- as.logical(tolower(args[10]))
consideration_present <- as.logical(tolower(args[11]))
satisfaction_present <- as.logical(tolower(args[12]))
trustbuilder_present <- as.logical(tolower(args[13]))

# Log the starting of the script.
log_message("Starting Trust Driver and Builder Analysis Script.", output_text_file)

########## Trust Driver Analysis ######################

# Load the trust driver dataset. Anchored, escaped extension check — an
# unescaped ".xlsx" would also match names like "my_xlsx.csv".
data_driver <- NULL
if (grepl("\\.xlsx$", input_file, ignore.case = TRUE)) {
  # The real header sits on the fourth row of the sheet.
  data_driver <- read_excel(input_file, sheet = "Driver", skip = 3)
}

if (is.null(data_driver)) {
  # Previously a NULL dataset fell through into lm() and surfaced as a
  # cryptic model error; log the real cause and skip the driver models.
  log_message(paste("Could not load Driver data from:", input_file), output_text_file)
} else {
  # Every driver model regresses one outcome on the same six trust buckets;
  # build the formula dynamically instead of repeating it five times.
  driver_predictors <- c("Stability", "Development", "Relationship",
                         "Benefit", "Vision", "Competence")
  run_driver <- function(outcome, csv_path) {
    trust_driver_analysis(
      reformulate(driver_predictors, response = outcome),
      data_driver,
      output_text_file,
      csv_path)
  }

  # Trust is always analyzed; the remaining outcomes only when flagged.
  # isTRUE() also guards against NA from an unparseable flag argument.
  run_driver("Trust", csv_output_path_trust)
  if (isTRUE(nps_present)) run_driver("NPS", csv_output_path_nps)
  if (isTRUE(loyalty_present)) run_driver("Loyalty", csv_output_path_loyalty)
  if (isTRUE(consideration_present)) run_driver("Consideration", csv_output_path_consideration)
  if (isTRUE(satisfaction_present)) run_driver("Satisfaction", csv_output_path_satisfaction)
}

########## Trust Builder Analysis ######################

if (isTRUE(trustbuilder_present)) {
  if (grepl("\\.xlsx$", input_file, ignore.case = TRUE)) {
    # Rows 4-5 hold the header mapping (column labels + question text) ...
    data_builder_headers <- read_excel(input_file, sheet = "Builder", skip = 3, n_max = 2)
    # ... and the data itself starts on row 6.
    data_builder_rows <- read_excel(input_file, sheet = "Builder", skip = 5)

    # Process the Builder model.
    trust_builder_analysis(data_builder_rows, data_builder_headers, output_text_file, csv_output_path_trustbuilder)
  } else {
    # Previously a non-xlsx input crashed on the undefined data_builder_rows
    # variable; log the problem and skip instead.
    log_message(paste("Skipping Trust Builder Analysis: unsupported input file:", input_file), output_text_file)
  }
}

# Log the ending of the script.
log_message("Trust Driver and Builder Analysis Script Completed.", output_text_file)