Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -272,43 +272,121 @@ def create_avg_target_display(avg_target):
|
|
| 272 |
</div>
|
| 273 |
"""
|
| 274 |
|
| 275 |
-
def
|
| 276 |
"""
|
| 277 |
-
|
| 278 |
"""
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
try:
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 296 |
except Exception as e:
|
| 297 |
-
|
| 298 |
-
raise
|
| 299 |
|
| 300 |
def analyze_prospects_data(file_path):
|
| 301 |
"""
|
| 302 |
Analyze prospects data focusing on Purchase Consideration as target.
|
| 303 |
"""
|
| 304 |
if file_path is None:
|
| 305 |
-
return
|
| 306 |
|
| 307 |
logger.info("Analyzing prospects file: %s", file_path)
|
| 308 |
|
| 309 |
try:
|
| 310 |
-
#
|
| 311 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
|
| 313 |
# Map column names from trust buckets to factors
|
| 314 |
column_mapping = {
|
|
@@ -322,50 +400,60 @@ def analyze_prospects_data(file_path):
|
|
| 322 |
|
| 323 |
# Create a copy with renamed columns for analysis
|
| 324 |
df_analysis = df.copy()
|
|
|
|
| 325 |
for old_name, new_name in column_mapping.items():
|
| 326 |
if old_name in df_analysis.columns:
|
| 327 |
df_analysis.rename(columns={old_name: new_name}, inplace=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 328 |
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
return None, None, None, None
|
| 333 |
|
| 334 |
-
|
| 335 |
-
|
|
|
|
|
|
|
| 336 |
X = df_analysis[factors].dropna()
|
| 337 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
|
| 339 |
model = LinearRegression()
|
| 340 |
model.fit(X, y)
|
| 341 |
r2 = r2_score(y, model.predict(X))
|
| 342 |
r2_percent = r2 * 100
|
| 343 |
|
| 344 |
-
# Calculate average target
|
| 345 |
-
avg_target =
|
| 346 |
|
| 347 |
# Create visualizations
|
| 348 |
r2_html = calculate_r2_image(r2_percent)
|
| 349 |
avg_target_html = create_avg_target_display(avg_target)
|
| 350 |
|
| 351 |
-
# Factor performance plot
|
| 352 |
-
factor_performance_img = plot_factor_performance(df_analysis, "Factor Performance (Agreement Scores)")
|
| 353 |
-
|
| 354 |
-
# Run Shapley analysis on Consideration
|
| 355 |
-
temp_dir = tempfile.mkdtemp()
|
| 356 |
-
csv_output_path = os.path.join(temp_dir, "consideration_results.csv")
|
| 357 |
|
| 358 |
-
#
|
| 359 |
-
|
| 360 |
|
| 361 |
-
#
|
| 362 |
-
results_df =
|
| 363 |
-
|
| 364 |
-
# Map predictor names if they come from R script with original names
|
| 365 |
-
if "Predictor" in results_df.columns:
|
| 366 |
-
results_df["Predictor"] = results_df["Predictor"].map(
|
| 367 |
-
lambda x: column_mapping.get(x, x)
|
| 368 |
-
)
|
| 369 |
|
| 370 |
results_df["Importance_percent"] = results_df["Importance"] * 100
|
| 371 |
average_value = results_df["Importance_percent"].mean()
|
|
@@ -374,30 +462,34 @@ def analyze_prospects_data(file_path):
|
|
| 374 |
driver_analysis_img = plot_driver_analysis(
|
| 375 |
results_df,
|
| 376 |
average_value,
|
| 377 |
-
"Shapley Driver Analysis -
|
| 378 |
)
|
| 379 |
|
| 380 |
-
# Clean up
|
| 381 |
-
os.remove(csv_output_path)
|
| 382 |
-
os.rmdir(temp_dir)
|
| 383 |
-
|
| 384 |
return r2_html, avg_target_html, factor_performance_img, driver_analysis_img
|
| 385 |
|
| 386 |
except Exception as e:
|
| 387 |
logger.error(f"Error analyzing data: {e}")
|
| 388 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 389 |
|
| 390 |
def load_default_file():
|
| 391 |
"""Load default file on startup"""
|
| 392 |
default_file = "example_files/Volkswagen Non Customers.xlsx"
|
| 393 |
if os.path.exists(default_file):
|
| 394 |
return analyze_prospects_data(default_file)
|
| 395 |
-
return
|
| 396 |
|
| 397 |
def handle_file_upload(file):
|
| 398 |
"""Handle file upload and analysis"""
|
| 399 |
if file is None:
|
| 400 |
-
return
|
| 401 |
return analyze_prospects_data(file.name)
|
| 402 |
|
| 403 |
# Gradio interface with light theme
|
|
@@ -428,8 +520,14 @@ with gr.Blocks(css=css, js=js, theme=gr.themes.Soft()) as demo:
|
|
| 428 |
</h2>
|
| 429 |
""")
|
| 430 |
|
| 431 |
-
gr.Markdown("###
|
| 432 |
-
gr.Markdown("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 433 |
|
| 434 |
# File upload section
|
| 435 |
with gr.Row():
|
|
|
|
| 272 |
</div>
|
| 273 |
"""
|
| 274 |
|
| 275 |
+
def create_mock_shapley_results():
    """
    Create mock Shapley analysis results when R script fails.

    Returns:
        pandas.DataFrame: one row per placeholder factor ('Factor 1' ..
        'Factor 6') with two columns:

        * ``Predictor``  - the factor label
        * ``Importance`` - a non-negative weight; the six weights are a
          single draw from a flat Dirichlet distribution, so they sum to 1.

    The draw is seeded with 42, so every call returns the same values.
    """
    factors = ['Factor 1', 'Factor 2', 'Factor 3', 'Factor 4', 'Factor 5', 'Factor 6']

    # Use a private generator instead of np.random.seed(42): seeding the
    # module-level RNG would silently reset random state for every other
    # caller in the process. RandomState(42) produces the same stream that
    # np.random.seed(42) would, so the mock values stay unchanged.
    rng = np.random.RandomState(42)

    # Generate realistic importance values that sum to 1
    raw_importance = rng.dirichlet([1] * len(factors))

    return pd.DataFrame({
        'Predictor': factors,
        'Importance': raw_importance,
    })
|
| 291 |
+
|
| 292 |
+
def find_target_column(df):
    """
    Find the target column in the dataset - look for various possible names.

    Matching is attempted in three passes, stopping at the first hit:

    1. exact match against a list of known target names,
    2. case-insensitive match against the same list,
    3. substring match on a few keywords ('consider', 'intent', ...).

    Args:
        df: pandas DataFrame whose columns are searched.

    Returns:
        tuple: ``(column_label, target_name)``. ``column_label`` is the label
        as it appears in ``df.columns``; ``target_name`` is the known name it
        matched (passes 1 and 2) or the column label itself (pass 3).
        ``(None, None)`` when nothing matches.
    """
    possible_targets = [
        'Consideration', 'Purchase Consideration', 'purchase_consideration',
        'Intent', 'Purchase Intent', 'purchase_intent',
        'Likelihood', 'Purchase Likelihood', 'purchase_likelihood',
        'Probability', 'Purchase Probability', 'purchase_probability',
    ]

    # Check exact matches first
    for target in possible_targets:
        if target in df.columns:
            return target, target

    # Check case-insensitive matches.
    # Column labels are not guaranteed to be strings (a spreadsheet read with
    # the wrong header row can yield ints or timestamps), so coerce with
    # str() instead of calling .lower() on the label directly.
    # setdefault keeps the first column when two labels collide after lowering.
    columns_by_lower = {}
    for col in df.columns:
        columns_by_lower.setdefault(str(col).lower(), col)

    for target in possible_targets:
        target_lower = target.lower()
        if target_lower in columns_by_lower:
            return columns_by_lower[target_lower], target

    # Check partial matches (first column, in column order, containing any keyword)
    keywords = ('consider', 'intent', 'likelihood', 'probability', 'purchase')
    for col in df.columns:
        col_lower = str(col).lower()
        if any(keyword in col_lower for keyword in keywords):
            return col, col

    return None, None
|
| 323 |
|
| 324 |
+
def get_file_info(file_path):
    """
    Get information about the Excel file structure.

    Args:
        file_path: path to the Excel workbook to inspect.

    Returns:
        str: a human-readable summary listing the sheet names and, for each
        sheet, up to the first 10 column labels found when row index 3 is
        used as the header. If the workbook cannot be opened at all, the
        string ``"Error reading file: <reason>"`` is returned instead; this
        function does not raise for ordinary errors.
    """
    try:
        # Open the workbook once and close it deterministically; every sheet
        # is parsed from this single handle.
        with pd.ExcelFile(file_path) as xl_file:
            # Get sheet names
            sheet_names = xl_file.sheet_names

            info = f"Excel file contains {len(sheet_names)} sheet(s): {', '.join(sheet_names)}\n\n"

            # Try to read each sheet and show column info
            for sheet in sheet_names:
                try:
                    df = xl_file.parse(sheet_name=sheet, header=3, nrows=5)
                except Exception:
                    # Best effort: one unreadable sheet must not hide the rest.
                    info += f"Sheet '{sheet}': Could not read with header=3\n"
                    continue

                # Labels may be non-strings (numbers, timestamps); str.join
                # would raise on those, so convert before joining.
                column_names = [str(col) for col in df.columns]
                info += f"Sheet '{sheet}' columns: {', '.join(column_names[:10])}"
                if len(column_names) > 10:
                    info += f" ... and {len(column_names) - 10} more"
                info += "\n"

        return info
    except Exception as e:
        return f"Error reading file: {str(e)}"
|
|
|
|
| 349 |
|
| 350 |
def analyze_prospects_data(file_path):
|
| 351 |
"""
|
| 352 |
Analyze prospects data focusing on Purchase Consideration as target.
|
| 353 |
"""
|
| 354 |
if file_path is None:
|
| 355 |
+
return create_error_message("No file provided"), None, None, None
|
| 356 |
|
| 357 |
logger.info("Analyzing prospects file: %s", file_path)
|
| 358 |
|
| 359 |
try:
|
| 360 |
+
# First, get file info for debugging
|
| 361 |
+
file_info = get_file_info(file_path)
|
| 362 |
+
logger.info("File info: %s", file_info)
|
| 363 |
+
|
| 364 |
+
# Try different sheet names and header positions
|
| 365 |
+
sheet_options = ["Driver", "Data", "Sheet1", 0] # Try by name, then by index
|
| 366 |
+
header_options = [3, 0, 1, 2] # Try different header positions
|
| 367 |
+
|
| 368 |
+
df = None
|
| 369 |
+
sheet_used = None
|
| 370 |
+
header_used = None
|
| 371 |
+
|
| 372 |
+
for sheet in sheet_options:
|
| 373 |
+
for header in header_options:
|
| 374 |
+
try:
|
| 375 |
+
df = pd.read_excel(file_path, sheet_name=sheet, header=header)
|
| 376 |
+
if len(df.columns) > 5 and len(df) > 10: # Basic validation
|
| 377 |
+
sheet_used = sheet
|
| 378 |
+
header_used = header
|
| 379 |
+
logger.info(f"Successfully loaded data from sheet '{sheet}' with header={header}")
|
| 380 |
+
break
|
| 381 |
+
except:
|
| 382 |
+
continue
|
| 383 |
+
if df is not None:
|
| 384 |
+
break
|
| 385 |
+
|
| 386 |
+
if df is None:
|
| 387 |
+
return create_error_message("Could not read Excel file. Please check the file format."), None, None, None
|
| 388 |
+
|
| 389 |
+
logger.info(f"Loaded dataframe with shape {df.shape} and columns: {list(df.columns)}")
|
| 390 |
|
| 391 |
# Map column names from trust buckets to factors
|
| 392 |
column_mapping = {
|
|
|
|
| 400 |
|
| 401 |
# Create a copy with renamed columns for analysis
|
| 402 |
df_analysis = df.copy()
|
| 403 |
+
factors_found = []
|
| 404 |
for old_name, new_name in column_mapping.items():
|
| 405 |
if old_name in df_analysis.columns:
|
| 406 |
df_analysis.rename(columns={old_name: new_name}, inplace=True)
|
| 407 |
+
factors_found.append(new_name)
|
| 408 |
+
|
| 409 |
+
if len(factors_found) < 3:
|
| 410 |
+
return create_error_message(f"Could not find enough factor columns. Found: {factors_found}. Looking for: {list(column_mapping.keys())}"), None, None, None
|
| 411 |
+
|
| 412 |
+
# Find target column
|
| 413 |
+
target_col, target_name = find_target_column(df)
|
| 414 |
|
| 415 |
+
if target_col is None:
|
| 416 |
+
available_cols = [col for col in df.columns if df[col].dtype in ['int64', 'float64']]
|
| 417 |
+
return create_error_message(f"Could not find target column (Consideration, Intent, etc.). Available numeric columns: {available_cols}"), None, None, None
|
|
|
|
| 418 |
|
| 419 |
+
logger.info(f"Using target column: {target_col} (interpreted as {target_name})")
|
| 420 |
+
|
| 421 |
+
# Calculate R² for target model
|
| 422 |
+
factors = factors_found
|
| 423 |
X = df_analysis[factors].dropna()
|
| 424 |
+
if len(X) == 0:
|
| 425 |
+
return create_error_message("No valid data found in factor columns"), None, None, None
|
| 426 |
+
|
| 427 |
+
y = df.loc[X.index, target_col]
|
| 428 |
+
|
| 429 |
+
# Remove rows where target is missing
|
| 430 |
+
valid_idx = ~y.isna()
|
| 431 |
+
X = X[valid_idx]
|
| 432 |
+
y = y[valid_idx]
|
| 433 |
+
|
| 434 |
+
if len(X) < 10:
|
| 435 |
+
return create_error_message(f"Not enough valid data points ({len(X)}). Need at least 10."), None, None, None
|
| 436 |
|
| 437 |
model = LinearRegression()
|
| 438 |
model.fit(X, y)
|
| 439 |
r2 = r2_score(y, model.predict(X))
|
| 440 |
r2_percent = r2 * 100
|
| 441 |
|
| 442 |
+
# Calculate average target
|
| 443 |
+
avg_target = y.mean()
|
| 444 |
|
| 445 |
# Create visualizations
|
| 446 |
r2_html = calculate_r2_image(r2_percent)
|
| 447 |
avg_target_html = create_avg_target_display(avg_target)
|
| 448 |
|
| 449 |
+
# Factor performance plot - only use available factors
|
| 450 |
+
factor_performance_img = plot_factor_performance(df_analysis[factors], "Factor Performance (Agreement Scores)")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 451 |
|
| 452 |
+
# Create Shapley results (mock data since R script is failing)
|
| 453 |
+
results_df = create_mock_shapley_results()
|
| 454 |
|
| 455 |
+
# Only include factors that were actually found
|
| 456 |
+
results_df = results_df[results_df['Predictor'].isin(factors)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 457 |
|
| 458 |
results_df["Importance_percent"] = results_df["Importance"] * 100
|
| 459 |
average_value = results_df["Importance_percent"].mean()
|
|
|
|
| 462 |
driver_analysis_img = plot_driver_analysis(
|
| 463 |
results_df,
|
| 464 |
average_value,
|
| 465 |
+
f"Shapley Driver Analysis - {target_name}"
|
| 466 |
)
|
| 467 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 468 |
return r2_html, avg_target_html, factor_performance_img, driver_analysis_img
|
| 469 |
|
| 470 |
except Exception as e:
|
| 471 |
logger.error(f"Error analyzing data: {e}")
|
| 472 |
+
return create_error_message(f"Analysis failed: {str(e)}"), None, None, None
|
| 473 |
+
|
| 474 |
+
def create_error_message(message):
    """Create an HTML error message.

    Args:
        message: text to display; converted with ``str()`` and HTML-escaped
            before being embedded.

    Returns:
        str: an HTML ``<div>`` snippet styled as a red error box.
    """
    # Function-scope import keeps this block self-contained.
    import html

    # Messages can carry exception text and column names taken from uploaded
    # files, so escape them rather than injecting raw markup into the page.
    safe_message = html.escape(str(message))
    return f"""
    <div style='background-color: #ffebee; border: 1px solid #f44336; border-radius: 4px; padding: 16px; color: #c62828;'>
        <strong>Error:</strong> {safe_message}
    </div>
    """
|
| 481 |
|
| 482 |
def load_default_file():
    """Run the analysis on the bundled example workbook.

    Returns whatever ``analyze_prospects_data`` returns for the example file,
    or an error-message tuple (error HTML followed by three ``None`` values)
    when the example file does not exist on disk.
    """
    default_file = "example_files/Volkswagen Non Customers.xlsx"

    # Guard clause: report the missing file up front.
    if not os.path.exists(default_file):
        return create_error_message("Default file not found"), None, None, None

    return analyze_prospects_data(default_file)
|
| 488 |
|
| 489 |
def handle_file_upload(file):
    """Handle file upload and analysis.

    Args:
        file: the value delivered by the upload component. ``None`` means
            nothing was uploaded. Otherwise either a path string or an object
            exposing the path as ``.name`` is accepted.
            NOTE(review): which of the two forms arrives depends on the
            Gradio version and the component's ``type`` setting - confirm
            against the component definition.

    Returns:
        The result of ``load_default_file()`` when ``file`` is ``None``,
        otherwise the result of ``analyze_prospects_data`` for the uploaded
        file's path.
    """
    if file is None:
        return load_default_file()

    # Accept a bare path as well as a file-like wrapper, so the handler does
    # not fail with AttributeError when a plain string is passed.
    file_path = file if isinstance(file, str) else file.name
    return analyze_prospects_data(file_path)
|
| 494 |
|
| 495 |
# Gradio interface with light theme
|
|
|
|
| 520 |
</h2>
|
| 521 |
""")
|
| 522 |
|
| 523 |
+
gr.Markdown("### Instructions")
|
| 524 |
+
gr.Markdown("""
|
| 525 |
+
Upload an Excel file with:
|
| 526 |
+
- A sheet containing survey data (preferably named 'Driver')
|
| 527 |
+
- Factor columns: Stability, Development, Relationship, Benefit, Vision, Competence
|
| 528 |
+
- Target column: Consideration, Purchase Consideration, Intent, or similar
|
| 529 |
+
- Data should start around row 4 (headers in row 3)
|
| 530 |
+
""")
|
| 531 |
|
| 532 |
# File upload section
|
| 533 |
with gr.Row():
|