Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -272,80 +272,37 @@ def create_avg_target_display(avg_target):
|
|
| 272 |
</div>
|
| 273 |
"""
|
| 274 |
|
| 275 |
-
def
|
| 276 |
-
"""
|
| 277 |
-
|
|
|
|
|
|
|
|
|
|
| 278 |
"""
|
| 279 |
-
np.random.seed(42) # For consistent results
|
| 280 |
-
factors = ['Factor 1', 'Factor 2', 'Factor 3', 'Factor 4', 'Factor 5', 'Factor 6']
|
| 281 |
-
|
| 282 |
-
# Generate realistic importance values that sum to 1
|
| 283 |
-
raw_importance = np.random.dirichlet([1, 1, 1, 1, 1, 1])
|
| 284 |
-
|
| 285 |
-
mock_data = pd.DataFrame({
|
| 286 |
-
'Predictor': factors,
|
| 287 |
-
'Importance': raw_importance
|
| 288 |
-
})
|
| 289 |
-
|
| 290 |
-
return mock_data
|
| 291 |
|
| 292 |
-
def
|
| 293 |
"""
|
| 294 |
-
|
| 295 |
"""
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
]
|
| 302 |
-
|
| 303 |
-
# Check exact matches first
|
| 304 |
-
for target in possible_targets:
|
| 305 |
-
if target in df.columns:
|
| 306 |
-
return target, target
|
| 307 |
-
|
| 308 |
-
# Check case-insensitive matches
|
| 309 |
-
df_columns_lower = [col.lower() for col in df.columns]
|
| 310 |
-
for target in possible_targets:
|
| 311 |
-
target_lower = target.lower()
|
| 312 |
-
if target_lower in df_columns_lower:
|
| 313 |
-
actual_col = df.columns[df_columns_lower.index(target_lower)]
|
| 314 |
-
return actual_col, target
|
| 315 |
-
|
| 316 |
-
# Check partial matches
|
| 317 |
-
for col in df.columns:
|
| 318 |
-
col_lower = col.lower()
|
| 319 |
-
if any(keyword in col_lower for keyword in ['consider', 'intent', 'likelihood', 'probability', 'purchase']):
|
| 320 |
-
return col, col
|
| 321 |
-
|
| 322 |
-
return None, None
|
| 323 |
|
| 324 |
-
def get_file_info(file_path):
|
| 325 |
-
"""
|
| 326 |
-
Get information about the Excel file structure.
|
| 327 |
-
"""
|
| 328 |
try:
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
try:
|
| 338 |
-
df = pd.read_excel(file_path, sheet_name=sheet, header=3, nrows=5)
|
| 339 |
-
info += f"Sheet '{sheet}' columns: {', '.join(df.columns[:10])}"
|
| 340 |
-
if len(df.columns) > 10:
|
| 341 |
-
info += f" ... and {len(df.columns) - 10} more"
|
| 342 |
-
info += "\n"
|
| 343 |
-
except:
|
| 344 |
-
info += f"Sheet '{sheet}': Could not read with header=3\n"
|
| 345 |
-
|
| 346 |
-
return info
|
| 347 |
except Exception as e:
|
| 348 |
-
|
|
|
|
| 349 |
|
| 350 |
def analyze_prospects_data(file_path):
|
| 351 |
"""
|
|
@@ -357,36 +314,22 @@ def analyze_prospects_data(file_path):
|
|
| 357 |
logger.info("Analyzing prospects file: %s", file_path)
|
| 358 |
|
| 359 |
try:
|
| 360 |
-
#
|
| 361 |
-
|
| 362 |
-
logger.info("File info: %s", file_info)
|
| 363 |
|
| 364 |
-
#
|
| 365 |
-
|
| 366 |
-
|
| 367 |
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
sheet_used = sheet
|
| 378 |
-
header_used = header
|
| 379 |
-
logger.info(f"Successfully loaded data from sheet '{sheet}' with header={header}")
|
| 380 |
-
break
|
| 381 |
-
except:
|
| 382 |
-
continue
|
| 383 |
-
if df is not None:
|
| 384 |
-
break
|
| 385 |
-
|
| 386 |
-
if df is None:
|
| 387 |
-
return create_error_message("Could not read Excel file. Please check the file format."), None, None, None
|
| 388 |
-
|
| 389 |
-
logger.info(f"Loaded dataframe with shape {df.shape} and columns: {list(df.columns)}")
|
| 390 |
|
| 391 |
# Map column names from trust buckets to factors
|
| 392 |
column_mapping = {
|
|
@@ -400,60 +343,90 @@ def analyze_prospects_data(file_path):
|
|
| 400 |
|
| 401 |
# Create a copy with renamed columns for analysis
|
| 402 |
df_analysis = df.copy()
|
| 403 |
-
factors_found = []
|
| 404 |
for old_name, new_name in column_mapping.items():
|
| 405 |
if old_name in df_analysis.columns:
|
| 406 |
df_analysis.rename(columns={old_name: new_name}, inplace=True)
|
| 407 |
-
factors_found.append(new_name)
|
| 408 |
-
|
| 409 |
-
if len(factors_found) < 3:
|
| 410 |
-
return create_error_message(f"Could not find enough factor columns. Found: {factors_found}. Looking for: {list(column_mapping.keys())}"), None, None, None
|
| 411 |
-
|
| 412 |
-
# Find target column
|
| 413 |
-
target_col, target_name = find_target_column(df)
|
| 414 |
-
|
| 415 |
-
if target_col is None:
|
| 416 |
-
available_cols = [col for col in df.columns if df[col].dtype in ['int64', 'float64']]
|
| 417 |
-
return create_error_message(f"Could not find target column (Consideration, Intent, etc.). Available numeric columns: {available_cols}"), None, None, None
|
| 418 |
-
|
| 419 |
-
logger.info(f"Using target column: {target_col} (interpreted as {target_name})")
|
| 420 |
|
| 421 |
-
# Calculate R² for
|
| 422 |
-
factors =
|
| 423 |
X = df_analysis[factors].dropna()
|
| 424 |
-
|
| 425 |
-
return create_error_message("No valid data found in factor columns"), None, None, None
|
| 426 |
-
|
| 427 |
-
y = df.loc[X.index, target_col]
|
| 428 |
|
| 429 |
-
# Remove
|
| 430 |
-
|
| 431 |
-
X = X[
|
| 432 |
-
y = y[
|
| 433 |
|
| 434 |
if len(X) < 10:
|
| 435 |
-
|
|
|
|
| 436 |
|
| 437 |
model = LinearRegression()
|
| 438 |
model.fit(X, y)
|
| 439 |
r2 = r2_score(y, model.predict(X))
|
| 440 |
r2_percent = r2 * 100
|
| 441 |
|
| 442 |
-
# Calculate average target
|
| 443 |
avg_target = y.mean()
|
| 444 |
|
|
|
|
|
|
|
| 445 |
# Create visualizations
|
| 446 |
r2_html = calculate_r2_image(r2_percent)
|
| 447 |
avg_target_html = create_avg_target_display(avg_target)
|
| 448 |
|
| 449 |
-
# Factor performance plot
|
| 450 |
-
factor_performance_img = plot_factor_performance(df_analysis
|
|
|
|
|
|
|
|
|
|
|
|
|
| 451 |
|
| 452 |
-
#
|
| 453 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
|
| 455 |
-
#
|
| 456 |
-
results_df = results_df[
|
|
|
|
|
|
|
| 457 |
|
| 458 |
results_df["Importance_percent"] = results_df["Importance"] * 100
|
| 459 |
average_value = results_df["Importance_percent"].mean()
|
|
@@ -462,23 +435,22 @@ def analyze_prospects_data(file_path):
|
|
| 462 |
driver_analysis_img = plot_driver_analysis(
|
| 463 |
results_df,
|
| 464 |
average_value,
|
| 465 |
-
|
| 466 |
)
|
| 467 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 468 |
return r2_html, avg_target_html, factor_performance_img, driver_analysis_img
|
| 469 |
|
| 470 |
except Exception as e:
|
| 471 |
logger.error(f"Error analyzing data: {e}")
|
| 472 |
return create_error_message(f"Analysis failed: {str(e)}"), None, None, None
|
| 473 |
|
| 474 |
-
def create_error_message(message):
    """Create an HTML error message.

    Args:
        message: Text to show to the user. It is converted to ``str`` and
            HTML-escaped before being embedded, because callers build it
            from exception text and from column names read out of the
            analysed Excel file.

    Returns:
        An HTML snippet: a red-styled ``<div>`` containing the escaped
        message after a bold "Error:" label.
    """
    # Function-scope import so this block does not rely on the module's
    # top-of-file import list.
    from html import escape

    # quote=False: the message is placed in element content, not inside an
    # attribute, so only &, < and > must be neutralised; quotes stay readable.
    safe_message = escape(str(message), quote=False)
    return f"""
    <div style='background-color: #ffebee; border: 1px solid #f44336; border-radius: 4px; padding: 16px; color: #c62828;'>
        <strong>Error:</strong> {safe_message}
    </div>
    """
|
| 481 |
-
|
| 482 |
def load_default_file():
|
| 483 |
"""Load default file on startup"""
|
| 484 |
default_file = "example_files/Volkswagen Non Customers.xlsx"
|
|
@@ -516,18 +488,12 @@ function refresh() {
|
|
| 516 |
with gr.Blocks(css=css, js=js, theme=gr.themes.Soft()) as demo:
|
| 517 |
gr.Markdown("""
|
| 518 |
<h2 style="text-align: center; font-size: 2.25rem; font-weight: 600;">
|
| 519 |
-
Driver Analysis
|
| 520 |
</h2>
|
| 521 |
""")
|
| 522 |
|
| 523 |
-
gr.Markdown("###
|
| 524 |
-
gr.Markdown(""
|
| 525 |
-
Upload an Excel file with:
|
| 526 |
-
- A sheet containing survey data (preferably named 'Driver')
|
| 527 |
-
- Factor columns: Stability, Development, Relationship, Benefit, Vision, Competence
|
| 528 |
-
- Target column: Consideration, Purchase Consideration, Intent, or similar
|
| 529 |
-
- Data should start around row 4 (headers in row 3)
|
| 530 |
-
""")
|
| 531 |
|
| 532 |
# File upload section
|
| 533 |
with gr.Row():
|
|
@@ -572,5 +538,5 @@ with gr.Blocks(css=css, js=js, theme=gr.themes.Soft()) as demo:
|
|
| 572 |
outputs=[r2_output, avg_target_output, factor_performance_plot, driver_analysis_plot]
|
| 573 |
)
|
| 574 |
|
| 575 |
-
# Launch
|
| 576 |
demo.launch(server_name="0.0.0.0", share=False)
|
|
|
|
| 272 |
</div>
|
| 273 |
"""
|
| 274 |
|
| 275 |
+
def create_error_message(message):
    """Create an HTML error message.

    Args:
        message: Text to show to the user. It is converted to ``str`` and
            HTML-escaped before being embedded, because callers build it
            from exception text and from column names read out of the
            analysed Excel file.

    Returns:
        An HTML snippet: a red-styled ``<div>`` containing the escaped
        message after a bold "Error:" label.
    """
    # Function-scope import so this block does not rely on the module's
    # top-of-file import list.
    from html import escape

    # quote=False: the message is placed in element content, not inside an
    # attribute, so only &, < and > must be neutralised; quotes stay readable.
    safe_message = escape(str(message), quote=False)
    return f"""
    <div style='background-color: #ffebee; border: 1px solid #f44336; border-radius: 4px; padding: 16px; color: #c62828;'>
        <strong>Error:</strong> {safe_message}
    </div>
    """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
|
| 283 |
+
def call_r_script_simplified(input_file, csv_output_path):
    """Call R script for Shapley regression analysis on Consideration.

    Runs ``Rscript process_data.R <input_file> <csv_output_path>`` and waits
    for the process to finish.

    Args:
        input_file: Path passed to the script as its first argument.
        csv_output_path: Path passed to the script as its second argument
            (presumably where the script writes its CSV results; confirm
            against process_data.R).

    Returns:
        True if the process exited with status 0. False if it exited with a
        non-zero status or could not be run at all. Failures are logged and
        never raised to the caller.
    """
    command = [
        "Rscript",
        "process_data.R",
        input_file,
        csv_output_path,
    ]

    try:
        # check=True converts a non-zero exit status into CalledProcessError;
        # output is captured as text so it can be logged on failure.
        subprocess.run(command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        logger.error("R script failed with error: %s", e)
        logger.error("R script stderr: %s", e.stderr)
        logger.error("R script stdout: %s", e.stdout)
        return False
    except Exception as e:
        # Anything else, e.g. the Rscript executable not being found.
        logger.error("Error calling R script: %s", e)
        return False
    else:
        logger.info("R script executed successfully")
        return True
|
| 306 |
|
| 307 |
def analyze_prospects_data(file_path):
|
| 308 |
"""
|
|
|
|
| 314 |
logger.info("Analyzing prospects file: %s", file_path)
|
| 315 |
|
| 316 |
try:
|
| 317 |
+
# Load Excel file
|
| 318 |
+
df = pd.read_excel(file_path, sheet_name="Driver", header=3)
|
|
|
|
| 319 |
|
| 320 |
+
# Check required columns
|
| 321 |
+
required_factor_columns = ["Stability", "Development", "Relationship", "Benefit", "Vision", "Competence"]
|
| 322 |
+
missing_factors = [col for col in required_factor_columns if col not in df.columns]
|
| 323 |
|
| 324 |
+
if missing_factors:
|
| 325 |
+
logger.error(f"Missing factor columns: {missing_factors}")
|
| 326 |
+
return create_error_message(f"Missing required columns: {missing_factors}"), None, None, None
|
| 327 |
|
| 328 |
+
# Check if Consideration column exists
|
| 329 |
+
if "Consideration" not in df.columns:
|
| 330 |
+
logger.error("Consideration column not found in dataset")
|
| 331 |
+
logger.info(f"Available columns: {list(df.columns)}")
|
| 332 |
+
return create_error_message(f"Consideration column not found. Available columns: {list(df.columns)}"), None, None, None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
|
| 334 |
# Map column names from trust buckets to factors
|
| 335 |
column_mapping = {
|
|
|
|
| 343 |
|
| 344 |
# Create a copy with renamed columns for analysis
|
| 345 |
df_analysis = df.copy()
|
|
|
|
| 346 |
for old_name, new_name in column_mapping.items():
|
| 347 |
if old_name in df_analysis.columns:
|
| 348 |
df_analysis.rename(columns={old_name: new_name}, inplace=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
|
| 350 |
+
# Calculate R² for Consideration model
|
| 351 |
+
factors = ["Factor 1", "Factor 2", "Factor 3", "Factor 4", "Factor 5", "Factor 6"]
|
| 352 |
X = df_analysis[factors].dropna()
|
| 353 |
+
y = df.loc[X.index, "Consideration"] # Use Consideration as target
|
|
|
|
|
|
|
|
|
|
| 354 |
|
| 355 |
+
# Remove any remaining NaN values
|
| 356 |
+
valid_mask = ~y.isna()
|
| 357 |
+
X = X[valid_mask]
|
| 358 |
+
y = y[valid_mask]
|
| 359 |
|
| 360 |
if len(X) < 10:
|
| 361 |
+
logger.error(f"Not enough valid data points: {len(X)}")
|
| 362 |
+
return create_error_message(f"Not enough valid data points: {len(X)}. Need at least 10."), None, None, None
|
| 363 |
|
| 364 |
model = LinearRegression()
|
| 365 |
model.fit(X, y)
|
| 366 |
r2 = r2_score(y, model.predict(X))
|
| 367 |
r2_percent = r2 * 100
|
| 368 |
|
| 369 |
+
# Calculate average target (Consideration)
|
| 370 |
avg_target = y.mean()
|
| 371 |
|
| 372 |
+
logger.info(f"R² Score: {r2_percent:.1f}%, Average Consideration: {avg_target:.1f}")
|
| 373 |
+
|
| 374 |
# Create visualizations
|
| 375 |
r2_html = calculate_r2_image(r2_percent)
|
| 376 |
avg_target_html = create_avg_target_display(avg_target)
|
| 377 |
|
| 378 |
+
# Factor performance plot
|
| 379 |
+
factor_performance_img = plot_factor_performance(df_analysis, "Factor Performance (Agreement Scores)")
|
| 380 |
+
|
| 381 |
+
# Run Shapley analysis on Consideration
|
| 382 |
+
temp_dir = tempfile.mkdtemp()
|
| 383 |
+
csv_output_path = os.path.join(temp_dir, "consideration_results.csv")
|
| 384 |
|
| 385 |
+
# Call R script
|
| 386 |
+
r_success = call_r_script_simplified(file_path, csv_output_path)
|
| 387 |
+
|
| 388 |
+
if not r_success:
|
| 389 |
+
# Clean up and return error
|
| 390 |
+
try:
|
| 391 |
+
os.rmdir(temp_dir)
|
| 392 |
+
except:
|
| 393 |
+
pass
|
| 394 |
+
return create_error_message("R script failed to execute. Shapley analysis not available."), None, None, None
|
| 395 |
+
|
| 396 |
+
# Check if R script produced output file
|
| 397 |
+
if not os.path.exists(csv_output_path):
|
| 398 |
+
try:
|
| 399 |
+
os.rmdir(temp_dir)
|
| 400 |
+
except:
|
| 401 |
+
pass
|
| 402 |
+
return create_error_message("R script did not produce expected output file."), None, None, None
|
| 403 |
+
|
| 404 |
+
# Load results with renamed predictors
|
| 405 |
+
try:
|
| 406 |
+
results_df = pd.read_csv(csv_output_path)
|
| 407 |
+
except Exception as e:
|
| 408 |
+
logger.error(f"Error reading R script output: {e}")
|
| 409 |
+
try:
|
| 410 |
+
os.remove(csv_output_path)
|
| 411 |
+
os.rmdir(temp_dir)
|
| 412 |
+
except:
|
| 413 |
+
pass
|
| 414 |
+
return create_error_message(f"Error reading R script output: {e}"), None, None, None
|
| 415 |
+
|
| 416 |
+
# Validate R script output
|
| 417 |
+
if "Predictor" not in results_df.columns or "Importance" not in results_df.columns:
|
| 418 |
+
logger.error("R script output missing required columns")
|
| 419 |
+
try:
|
| 420 |
+
os.remove(csv_output_path)
|
| 421 |
+
os.rmdir(temp_dir)
|
| 422 |
+
except:
|
| 423 |
+
pass
|
| 424 |
+
return create_error_message("R script output is invalid - missing required columns."), None, None, None
|
| 425 |
|
| 426 |
+
# Map predictor names if they come from R script with original names
|
| 427 |
+
results_df["Predictor"] = results_df["Predictor"].map(
|
| 428 |
+
lambda x: column_mapping.get(x, x)
|
| 429 |
+
)
|
| 430 |
|
| 431 |
results_df["Importance_percent"] = results_df["Importance"] * 100
|
| 432 |
average_value = results_df["Importance_percent"].mean()
|
|
|
|
| 435 |
driver_analysis_img = plot_driver_analysis(
|
| 436 |
results_df,
|
| 437 |
average_value,
|
| 438 |
+
"Shapley Driver Analysis - Purchase Consideration"
|
| 439 |
)
|
| 440 |
|
| 441 |
+
# Clean up
|
| 442 |
+
try:
|
| 443 |
+
os.remove(csv_output_path)
|
| 444 |
+
os.rmdir(temp_dir)
|
| 445 |
+
except Exception as e:
|
| 446 |
+
logger.error(f"Error cleaning up temp files: {e}")
|
| 447 |
+
|
| 448 |
return r2_html, avg_target_html, factor_performance_img, driver_analysis_img
|
| 449 |
|
| 450 |
except Exception as e:
|
| 451 |
logger.error(f"Error analyzing data: {e}")
|
| 452 |
return create_error_message(f"Analysis failed: {str(e)}"), None, None, None
|
| 453 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
def load_default_file():
|
| 455 |
"""Load default file on startup"""
|
| 456 |
default_file = "example_files/Volkswagen Non Customers.xlsx"
|
|
|
|
| 488 |
with gr.Blocks(css=css, js=js, theme=gr.themes.Soft()) as demo:
|
| 489 |
gr.Markdown("""
|
| 490 |
<h2 style="text-align: center; font-size: 2.25rem; font-weight: 600;">
|
| 491 |
+
Driver Analysis - Purchase Consideration
|
| 492 |
</h2>
|
| 493 |
""")
|
| 494 |
|
| 495 |
+
gr.Markdown("### Purchase Consideration Analysis")
|
| 496 |
+
gr.Markdown("Analysis showing what drives Purchase Consideration among prospects using Factors 1-6")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 497 |
|
| 498 |
# File upload section
|
| 499 |
with gr.Row():
|
|
|
|
| 538 |
outputs=[r2_output, avg_target_output, factor_performance_plot, driver_analysis_plot]
|
| 539 |
)
|
| 540 |
|
| 541 |
+
# Launch the demo
|
| 542 |
demo.launch(server_name="0.0.0.0", share=False)
|