Wajahat698 committed on
Commit
ffedfe9
·
verified ·
1 Parent(s): a3a9e8f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -144
app.py CHANGED
@@ -272,80 +272,37 @@ def create_avg_target_display(avg_target):
272
  </div>
273
  """
274
 
275
- def create_mock_shapley_results():
276
- """
277
- Create mock Shapley analysis results when R script fails.
 
 
 
278
  """
279
- np.random.seed(42) # For consistent results
280
- factors = ['Factor 1', 'Factor 2', 'Factor 3', 'Factor 4', 'Factor 5', 'Factor 6']
281
-
282
- # Generate realistic importance values that sum to 1
283
- raw_importance = np.random.dirichlet([1, 1, 1, 1, 1, 1])
284
-
285
- mock_data = pd.DataFrame({
286
- 'Predictor': factors,
287
- 'Importance': raw_importance
288
- })
289
-
290
- return mock_data
291
 
292
- def find_target_column(df):
293
  """
294
- Find the target column in the dataset - look for various possible names.
295
  """
296
- possible_targets = [
297
- 'Consideration', 'Purchase Consideration', 'purchase_consideration',
298
- 'Intent', 'Purchase Intent', 'purchase_intent',
299
- 'Likelihood', 'Purchase Likelihood', 'purchase_likelihood',
300
- 'Probability', 'Purchase Probability', 'purchase_probability'
301
  ]
302
-
303
- # Check exact matches first
304
- for target in possible_targets:
305
- if target in df.columns:
306
- return target, target
307
-
308
- # Check case-insensitive matches
309
- df_columns_lower = [col.lower() for col in df.columns]
310
- for target in possible_targets:
311
- target_lower = target.lower()
312
- if target_lower in df_columns_lower:
313
- actual_col = df.columns[df_columns_lower.index(target_lower)]
314
- return actual_col, target
315
-
316
- # Check partial matches
317
- for col in df.columns:
318
- col_lower = col.lower()
319
- if any(keyword in col_lower for keyword in ['consider', 'intent', 'likelihood', 'probability', 'purchase']):
320
- return col, col
321
-
322
- return None, None
323
 
324
- def get_file_info(file_path):
325
- """
326
- Get information about the Excel file structure.
327
- """
328
  try:
329
- # Get sheet names
330
- xl_file = pd.ExcelFile(file_path)
331
- sheet_names = xl_file.sheet_names
332
-
333
- info = f"Excel file contains {len(sheet_names)} sheet(s): {', '.join(sheet_names)}\n\n"
334
-
335
- # Try to read each sheet and show column info
336
- for sheet in sheet_names:
337
- try:
338
- df = pd.read_excel(file_path, sheet_name=sheet, header=3, nrows=5)
339
- info += f"Sheet '{sheet}' columns: {', '.join(df.columns[:10])}"
340
- if len(df.columns) > 10:
341
- info += f" ... and {len(df.columns) - 10} more"
342
- info += "\n"
343
- except:
344
- info += f"Sheet '{sheet}': Could not read with header=3\n"
345
-
346
- return info
347
  except Exception as e:
348
- return f"Error reading file: {str(e)}"
 
349
 
350
  def analyze_prospects_data(file_path):
351
  """
@@ -357,36 +314,22 @@ def analyze_prospects_data(file_path):
357
  logger.info("Analyzing prospects file: %s", file_path)
358
 
359
  try:
360
- # First, get file info for debugging
361
- file_info = get_file_info(file_path)
362
- logger.info("File info: %s", file_info)
363
 
364
- # Try different sheet names and header positions
365
- sheet_options = ["Driver", "Data", "Sheet1", 0] # Try by name, then by index
366
- header_options = [3, 0, 1, 2] # Try different header positions
367
 
368
- df = None
369
- sheet_used = None
370
- header_used = None
371
 
372
- for sheet in sheet_options:
373
- for header in header_options:
374
- try:
375
- df = pd.read_excel(file_path, sheet_name=sheet, header=header)
376
- if len(df.columns) > 5 and len(df) > 10: # Basic validation
377
- sheet_used = sheet
378
- header_used = header
379
- logger.info(f"Successfully loaded data from sheet '{sheet}' with header={header}")
380
- break
381
- except:
382
- continue
383
- if df is not None:
384
- break
385
-
386
- if df is None:
387
- return create_error_message("Could not read Excel file. Please check the file format."), None, None, None
388
-
389
- logger.info(f"Loaded dataframe with shape {df.shape} and columns: {list(df.columns)}")
390
 
391
  # Map column names from trust buckets to factors
392
  column_mapping = {
@@ -400,60 +343,90 @@ def analyze_prospects_data(file_path):
400
 
401
  # Create a copy with renamed columns for analysis
402
  df_analysis = df.copy()
403
- factors_found = []
404
  for old_name, new_name in column_mapping.items():
405
  if old_name in df_analysis.columns:
406
  df_analysis.rename(columns={old_name: new_name}, inplace=True)
407
- factors_found.append(new_name)
408
-
409
- if len(factors_found) < 3:
410
- return create_error_message(f"Could not find enough factor columns. Found: {factors_found}. Looking for: {list(column_mapping.keys())}"), None, None, None
411
-
412
- # Find target column
413
- target_col, target_name = find_target_column(df)
414
-
415
- if target_col is None:
416
- available_cols = [col for col in df.columns if df[col].dtype in ['int64', 'float64']]
417
- return create_error_message(f"Could not find target column (Consideration, Intent, etc.). Available numeric columns: {available_cols}"), None, None, None
418
-
419
- logger.info(f"Using target column: {target_col} (interpreted as {target_name})")
420
 
421
- # Calculate R² for target model
422
- factors = factors_found
423
  X = df_analysis[factors].dropna()
424
- if len(X) == 0:
425
- return create_error_message("No valid data found in factor columns"), None, None, None
426
-
427
- y = df.loc[X.index, target_col]
428
 
429
- # Remove rows where target is missing
430
- valid_idx = ~y.isna()
431
- X = X[valid_idx]
432
- y = y[valid_idx]
433
 
434
  if len(X) < 10:
435
- return create_error_message(f"Not enough valid data points ({len(X)}). Need at least 10."), None, None, None
 
436
 
437
  model = LinearRegression()
438
  model.fit(X, y)
439
  r2 = r2_score(y, model.predict(X))
440
  r2_percent = r2 * 100
441
 
442
- # Calculate average target
443
  avg_target = y.mean()
444
 
 
 
445
  # Create visualizations
446
  r2_html = calculate_r2_image(r2_percent)
447
  avg_target_html = create_avg_target_display(avg_target)
448
 
449
- # Factor performance plot - only use available factors
450
- factor_performance_img = plot_factor_performance(df_analysis[factors], "Factor Performance (Agreement Scores)")
 
 
 
 
451
 
452
- # Create Shapley results (mock data since R script is failing)
453
- results_df = create_mock_shapley_results()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
454
 
455
- # Only include factors that were actually found
456
- results_df = results_df[results_df['Predictor'].isin(factors)]
 
 
457
 
458
  results_df["Importance_percent"] = results_df["Importance"] * 100
459
  average_value = results_df["Importance_percent"].mean()
@@ -462,23 +435,22 @@ def analyze_prospects_data(file_path):
462
  driver_analysis_img = plot_driver_analysis(
463
  results_df,
464
  average_value,
465
- f"Shapley Driver Analysis - {target_name}"
466
  )
467
 
 
 
 
 
 
 
 
468
  return r2_html, avg_target_html, factor_performance_img, driver_analysis_img
469
 
470
  except Exception as e:
471
  logger.error(f"Error analyzing data: {e}")
472
  return create_error_message(f"Analysis failed: {str(e)}"), None, None, None
473
 
474
- def create_error_message(message):
475
- """Create an HTML error message."""
476
- return f"""
477
- <div style='background-color: #ffebee; border: 1px solid #f44336; border-radius: 4px; padding: 16px; color: #c62828;'>
478
- <strong>Error:</strong> {message}
479
- </div>
480
- """
481
-
482
  def load_default_file():
483
  """Load default file on startup"""
484
  default_file = "example_files/Volkswagen Non Customers.xlsx"
@@ -516,18 +488,12 @@ function refresh() {
516
  with gr.Blocks(css=css, js=js, theme=gr.themes.Soft()) as demo:
517
  gr.Markdown("""
518
  <h2 style="text-align: center; font-size: 2.25rem; font-weight: 600;">
519
- Driver Analysis Demo - Purchase Consideration
520
  </h2>
521
  """)
522
 
523
- gr.Markdown("### Instructions")
524
- gr.Markdown("""
525
- Upload an Excel file with:
526
- - A sheet containing survey data (preferably named 'Driver')
527
- - Factor columns: Stability, Development, Relationship, Benefit, Vision, Competence
528
- - Target column: Consideration, Purchase Consideration, Intent, or similar
529
- - Data should start around row 4 (headers in row 3)
530
- """)
531
 
532
  # File upload section
533
  with gr.Row():
@@ -572,5 +538,5 @@ with gr.Blocks(css=css, js=js, theme=gr.themes.Soft()) as demo:
572
  outputs=[r2_output, avg_target_output, factor_performance_plot, driver_analysis_plot]
573
  )
574
 
575
- # Launch without the theme parameter
576
  demo.launch(server_name="0.0.0.0", share=False)
 
272
  </div>
273
  """
274
 
275
+ def create_error_message(message):
276
+ """Create an HTML error message."""
277
+ return f"""
278
+ <div style='background-color: #ffebee; border: 1px solid #f44336; border-radius: 4px; padding: 16px; color: #c62828;'>
279
+ <strong>Error:</strong> {message}
280
+ </div>
281
  """
 
 
 
 
 
 
 
 
 
 
 
 
282
 
283
+ def call_r_script_simplified(input_file, csv_output_path):
284
  """
285
+ Call R script for Shapley regression analysis on Consideration.
286
  """
287
+ command = [
288
+ "Rscript",
289
+ "process_data.R",
290
+ input_file,
291
+ csv_output_path
292
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
 
 
 
 
 
294
  try:
295
+ result = subprocess.run(command, check=True, capture_output=True, text=True)
296
+ logger.info("R script executed successfully")
297
+ return True
298
+ except subprocess.CalledProcessError as e:
299
+ logger.error("R script failed with error: %s", e)
300
+ logger.error("R script stderr: %s", e.stderr)
301
+ logger.error("R script stdout: %s", e.stdout)
302
+ return False
 
 
 
 
 
 
 
 
 
 
303
  except Exception as e:
304
+ logger.error("Error calling R script: %s", e)
305
+ return False
306
 
307
  def analyze_prospects_data(file_path):
308
  """
 
314
  logger.info("Analyzing prospects file: %s", file_path)
315
 
316
  try:
317
+ # Load Excel file
318
+ df = pd.read_excel(file_path, sheet_name="Driver", header=3)
 
319
 
320
+ # Check required columns
321
+ required_factor_columns = ["Stability", "Development", "Relationship", "Benefit", "Vision", "Competence"]
322
+ missing_factors = [col for col in required_factor_columns if col not in df.columns]
323
 
324
+ if missing_factors:
325
+ logger.error(f"Missing factor columns: {missing_factors}")
326
+ return create_error_message(f"Missing required columns: {missing_factors}"), None, None, None
327
 
328
+ # Check if Consideration column exists
329
+ if "Consideration" not in df.columns:
330
+ logger.error("Consideration column not found in dataset")
331
+ logger.info(f"Available columns: {list(df.columns)}")
332
+ return create_error_message(f"Consideration column not found. Available columns: {list(df.columns)}"), None, None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
333
 
334
  # Map column names from trust buckets to factors
335
  column_mapping = {
 
343
 
344
  # Create a copy with renamed columns for analysis
345
  df_analysis = df.copy()
 
346
  for old_name, new_name in column_mapping.items():
347
  if old_name in df_analysis.columns:
348
  df_analysis.rename(columns={old_name: new_name}, inplace=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
349
 
350
+ # Calculate R² for Consideration model
351
+ factors = ["Factor 1", "Factor 2", "Factor 3", "Factor 4", "Factor 5", "Factor 6"]
352
  X = df_analysis[factors].dropna()
353
+ y = df.loc[X.index, "Consideration"] # Use Consideration as target
 
 
 
354
 
355
+ # Remove any remaining NaN values
356
+ valid_mask = ~y.isna()
357
+ X = X[valid_mask]
358
+ y = y[valid_mask]
359
 
360
  if len(X) < 10:
361
+ logger.error(f"Not enough valid data points: {len(X)}")
362
+ return create_error_message(f"Not enough valid data points: {len(X)}. Need at least 10."), None, None, None
363
 
364
  model = LinearRegression()
365
  model.fit(X, y)
366
  r2 = r2_score(y, model.predict(X))
367
  r2_percent = r2 * 100
368
 
369
+ # Calculate average target (Consideration)
370
  avg_target = y.mean()
371
 
372
+ logger.info(f"R² Score: {r2_percent:.1f}%, Average Consideration: {avg_target:.1f}")
373
+
374
  # Create visualizations
375
  r2_html = calculate_r2_image(r2_percent)
376
  avg_target_html = create_avg_target_display(avg_target)
377
 
378
+ # Factor performance plot
379
+ factor_performance_img = plot_factor_performance(df_analysis, "Factor Performance (Agreement Scores)")
380
+
381
+ # Run Shapley analysis on Consideration
382
+ temp_dir = tempfile.mkdtemp()
383
+ csv_output_path = os.path.join(temp_dir, "consideration_results.csv")
384
 
385
+ # Call R script
386
+ r_success = call_r_script_simplified(file_path, csv_output_path)
387
+
388
+ if not r_success:
389
+ # Clean up and return error
390
+ try:
391
+ os.rmdir(temp_dir)
392
+ except:
393
+ pass
394
+ return create_error_message("R script failed to execute. Shapley analysis not available."), None, None, None
395
+
396
+ # Check if R script produced output file
397
+ if not os.path.exists(csv_output_path):
398
+ try:
399
+ os.rmdir(temp_dir)
400
+ except:
401
+ pass
402
+ return create_error_message("R script did not produce expected output file."), None, None, None
403
+
404
+ # Load results with renamed predictors
405
+ try:
406
+ results_df = pd.read_csv(csv_output_path)
407
+ except Exception as e:
408
+ logger.error(f"Error reading R script output: {e}")
409
+ try:
410
+ os.remove(csv_output_path)
411
+ os.rmdir(temp_dir)
412
+ except:
413
+ pass
414
+ return create_error_message(f"Error reading R script output: {e}"), None, None, None
415
+
416
+ # Validate R script output
417
+ if "Predictor" not in results_df.columns or "Importance" not in results_df.columns:
418
+ logger.error("R script output missing required columns")
419
+ try:
420
+ os.remove(csv_output_path)
421
+ os.rmdir(temp_dir)
422
+ except:
423
+ pass
424
+ return create_error_message("R script output is invalid - missing required columns."), None, None, None
425
 
426
+ # Map predictor names if they come from R script with original names
427
+ results_df["Predictor"] = results_df["Predictor"].map(
428
+ lambda x: column_mapping.get(x, x)
429
+ )
430
 
431
  results_df["Importance_percent"] = results_df["Importance"] * 100
432
  average_value = results_df["Importance_percent"].mean()
 
435
  driver_analysis_img = plot_driver_analysis(
436
  results_df,
437
  average_value,
438
+ "Shapley Driver Analysis - Purchase Consideration"
439
  )
440
 
441
+ # Clean up
442
+ try:
443
+ os.remove(csv_output_path)
444
+ os.rmdir(temp_dir)
445
+ except Exception as e:
446
+ logger.error(f"Error cleaning up temp files: {e}")
447
+
448
  return r2_html, avg_target_html, factor_performance_img, driver_analysis_img
449
 
450
  except Exception as e:
451
  logger.error(f"Error analyzing data: {e}")
452
  return create_error_message(f"Analysis failed: {str(e)}"), None, None, None
453
 
 
 
 
 
 
 
 
 
454
  def load_default_file():
455
  """Load default file on startup"""
456
  default_file = "example_files/Volkswagen Non Customers.xlsx"
 
488
  with gr.Blocks(css=css, js=js, theme=gr.themes.Soft()) as demo:
489
  gr.Markdown("""
490
  <h2 style="text-align: center; font-size: 2.25rem; font-weight: 600;">
491
+ Driver Analysis - Purchase Consideration
492
  </h2>
493
  """)
494
 
495
+ gr.Markdown("### Purchase Consideration Analysis")
496
+ gr.Markdown("Analysis showing what drives Purchase Consideration among prospects using Factors 1-6")
 
 
 
 
 
 
497
 
498
  # File upload section
499
  with gr.Row():
 
538
  outputs=[r2_output, avg_target_output, factor_performance_plot, driver_analysis_plot]
539
  )
540
 
541
+ # Launch the demo
542
  demo.launch(server_name="0.0.0.0", share=False)