shapely

Running

App Files Files Community

Wajahat698 committed on Jul 31, 2025

Commit

ffedfe9

verified ·

1 Parent(s): a3a9e8f

Update app.py

Browse files

Files changed (1) hide show

app.py +110 -144

app.py CHANGED Viewed

@@ -272,80 +272,37 @@ def create_avg_target_display(avg_target):
     </div>
     """
-def create_mock_shapley_results():
-    """
-    Create mock Shapley analysis results when R script fails.
     """
-    np.random.seed(42)  # For consistent results
-    factors = ['Factor 1', 'Factor 2', 'Factor 3', 'Factor 4', 'Factor 5', 'Factor 6']
-    # Generate realistic importance values that sum to 1
-    raw_importance = np.random.dirichlet([1, 1, 1, 1, 1, 1])
-    mock_data = pd.DataFrame({
-        'Predictor': factors,
-        'Importance': raw_importance
-    })
-    return mock_data
-def find_target_column(df):
     """
-    Find the target column in the dataset - look for various possible names.
     """
-    possible_targets = [
-        'Consideration', 'Purchase Consideration', 'purchase_consideration',
-        'Intent', 'Purchase Intent', 'purchase_intent',
-        'Likelihood', 'Purchase Likelihood', 'purchase_likelihood',
-        'Probability', 'Purchase Probability', 'purchase_probability'
     ]
-    # Check exact matches first
-    for target in possible_targets:
-        if target in df.columns:
-            return target, target
-    # Check case-insensitive matches
-    df_columns_lower = [col.lower() for col in df.columns]
-    for target in possible_targets:
-        target_lower = target.lower()
-        if target_lower in df_columns_lower:
-            actual_col = df.columns[df_columns_lower.index(target_lower)]
-            return actual_col, target
-    # Check partial matches
-    for col in df.columns:
-        col_lower = col.lower()
-        if any(keyword in col_lower for keyword in ['consider', 'intent', 'likelihood', 'probability', 'purchase']):
-            return col, col
-    return None, None
-def get_file_info(file_path):
-    """
-    Get information about the Excel file structure.
-    """
     try:
-        # Get sheet names
-        xl_file = pd.ExcelFile(file_path)
-        sheet_names = xl_file.sheet_names
-        info = f"Excel file contains {len(sheet_names)} sheet(s): {', '.join(sheet_names)}\n\n"
-        # Try to read each sheet and show column info
-        for sheet in sheet_names:
-            try:
-                df = pd.read_excel(file_path, sheet_name=sheet, header=3, nrows=5)
-                info += f"Sheet '{sheet}' columns: {', '.join(df.columns[:10])}"
-                if len(df.columns) > 10:
-                    info += f" ... and {len(df.columns) - 10} more"
-                info += "\n"
-            except:
-                info += f"Sheet '{sheet}': Could not read with header=3\n"
-        return info
     except Exception as e:
-        return f"Error reading file: {str(e)}"
 def analyze_prospects_data(file_path):
     """
@@ -357,36 +314,22 @@ def analyze_prospects_data(file_path):
     logger.info("Analyzing prospects file: %s", file_path)
     try:
-        # First, get file info for debugging
-        file_info = get_file_info(file_path)
-        logger.info("File info: %s", file_info)
-        # Try different sheet names and header positions
-        sheet_options = ["Driver", "Data", "Sheet1", 0]  # Try by name, then by index
-        header_options = [3, 0, 1, 2]  # Try different header positions
-        df = None
-        sheet_used = None
-        header_used = None
-        for sheet in sheet_options:
-            for header in header_options:
-                try:
-                    df = pd.read_excel(file_path, sheet_name=sheet, header=header)
-                    if len(df.columns) > 5 and len(df) > 10:  # Basic validation
-                        sheet_used = sheet
-                        header_used = header
-                        logger.info(f"Successfully loaded data from sheet '{sheet}' with header={header}")
-                        break
-                except:
-                    continue
-            if df is not None:
-                break
-        if df is None:
-            return create_error_message("Could not read Excel file. Please check the file format."), None, None, None
-        logger.info(f"Loaded dataframe with shape {df.shape} and columns: {list(df.columns)}")
         # Map column names from trust buckets to factors
         column_mapping = {
@@ -400,60 +343,90 @@ def analyze_prospects_data(file_path):
         # Create a copy with renamed columns for analysis
         df_analysis = df.copy()
-        factors_found = []
         for old_name, new_name in column_mapping.items():
             if old_name in df_analysis.columns:
                 df_analysis.rename(columns={old_name: new_name}, inplace=True)
-                factors_found.append(new_name)
-        if len(factors_found) < 3:
-            return create_error_message(f"Could not find enough factor columns. Found: {factors_found}. Looking for: {list(column_mapping.keys())}"), None, None, None
-        # Find target column
-        target_col, target_name = find_target_column(df)
-        if target_col is None:
-            available_cols = [col for col in df.columns if df[col].dtype in ['int64', 'float64']]
-            return create_error_message(f"Could not find target column (Consideration, Intent, etc.). Available numeric columns: {available_cols}"), None, None, None
-        logger.info(f"Using target column: {target_col} (interpreted as {target_name})")
-        # Calculate R² for target model
-        factors = factors_found
         X = df_analysis[factors].dropna()
-        if len(X) == 0:
-            return create_error_message("No valid data found in factor columns"), None, None, None
-        y = df.loc[X.index, target_col]
-        # Remove rows where target is missing
-        valid_idx = ~y.isna()
-        X = X[valid_idx]
-        y = y[valid_idx]
         if len(X) < 10:
-            return create_error_message(f"Not enough valid data points ({len(X)}). Need at least 10."), None, None, None
         model = LinearRegression()
         model.fit(X, y)
         r2 = r2_score(y, model.predict(X))
         r2_percent = r2 * 100
-        # Calculate average target
         avg_target = y.mean()
         # Create visualizations
         r2_html = calculate_r2_image(r2_percent)
         avg_target_html = create_avg_target_display(avg_target)
-        # Factor performance plot - only use available factors
-        factor_performance_img = plot_factor_performance(df_analysis[factors], "Factor Performance (Agreement Scores)")
-        # Create Shapley results (mock data since R script is failing)
-        results_df = create_mock_shapley_results()
-        # Only include factors that were actually found
-        results_df = results_df[results_df['Predictor'].isin(factors)]
         results_df["Importance_percent"] = results_df["Importance"] * 100
         average_value = results_df["Importance_percent"].mean()
@@ -462,23 +435,22 @@ def analyze_prospects_data(file_path):
         driver_analysis_img = plot_driver_analysis(
             results_df,
             average_value,
-            f"Shapley Driver Analysis - {target_name}"
         )
         return r2_html, avg_target_html, factor_performance_img, driver_analysis_img
     except Exception as e:
         logger.error(f"Error analyzing data: {e}")
         return create_error_message(f"Analysis failed: {str(e)}"), None, None, None
-def create_error_message(message):
-    """Create an HTML error message."""
-    return f"""
-    <div style='background-color: #ffebee; border: 1px solid #f44336; border-radius: 4px; padding: 16px; color: #c62828;'>
-        <strong>Error:</strong> {message}
-    </div>
-    """
 def load_default_file():
     """Load default file on startup"""
     default_file = "example_files/Volkswagen Non Customers.xlsx"
@@ -516,18 +488,12 @@ function refresh() {
 with gr.Blocks(css=css, js=js, theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
         <h2 style="text-align: center; font-size: 2.25rem; font-weight: 600;">
-            Driver Analysis Demo - Purchase Consideration
         </h2>
     """)
-    gr.Markdown("### Instructions")
-    gr.Markdown("""
-    Upload an Excel file with:
-    - A sheet containing survey data (preferably named 'Driver')
-    - Factor columns: Stability, Development, Relationship, Benefit, Vision, Competence
-    - Target column: Consideration, Purchase Consideration, Intent, or similar
-    - Data should start around row 4 (headers in row 3)
-    """)
     # File upload section
     with gr.Row():
@@ -572,5 +538,5 @@ with gr.Blocks(css=css, js=js, theme=gr.themes.Soft()) as demo:
         outputs=[r2_output, avg_target_output, factor_performance_plot, driver_analysis_plot]
     )
-# Launch without the theme parameter
 demo.launch(server_name="0.0.0.0", share=False)

     </div>
     """
+def create_error_message(message):
+    """Create an HTML error message."""
+    return f"""
+    <div style='background-color: #ffebee; border: 1px solid #f44336; border-radius: 4px; padding: 16px; color: #c62828;'>
+        <strong>Error:</strong> {message}
+    </div>
     """
+def call_r_script_simplified(input_file, csv_output_path):
     """
+    Call R script for Shapley regression analysis on Consideration.
     """
+    command = [
+        "Rscript",
+        "process_data.R",
+        input_file,
+        csv_output_path
     ]
     try:
+        result = subprocess.run(command, check=True, capture_output=True, text=True)
+        logger.info("R script executed successfully")
+        return True
+    except subprocess.CalledProcessError as e:
+        logger.error("R script failed with error: %s", e)
+        logger.error("R script stderr: %s", e.stderr)
+        logger.error("R script stdout: %s", e.stdout)
+        return False
     except Exception as e:
+        logger.error("Error calling R script: %s", e)
+        return False
 def analyze_prospects_data(file_path):
     """
     logger.info("Analyzing prospects file: %s", file_path)
     try:
+        # Load Excel file
+        df = pd.read_excel(file_path, sheet_name="Driver", header=3)
+        # Check required columns
+        required_factor_columns = ["Stability", "Development", "Relationship", "Benefit", "Vision", "Competence"]
+        missing_factors = [col for col in required_factor_columns if col not in df.columns]
+        if missing_factors:
+            logger.error(f"Missing factor columns: {missing_factors}")
+            return create_error_message(f"Missing required columns: {missing_factors}"), None, None, None
+        # Check if Consideration column exists
+        if "Consideration" not in df.columns:
+            logger.error("Consideration column not found in dataset")
+            logger.info(f"Available columns: {list(df.columns)}")
+            return create_error_message(f"Consideration column not found. Available columns: {list(df.columns)}"), None, None, None
         # Map column names from trust buckets to factors
         column_mapping = {
         # Create a copy with renamed columns for analysis
         df_analysis = df.copy()
         for old_name, new_name in column_mapping.items():
             if old_name in df_analysis.columns:
                 df_analysis.rename(columns={old_name: new_name}, inplace=True)
+        # Calculate R² for Consideration model
+        factors = ["Factor 1", "Factor 2", "Factor 3", "Factor 4", "Factor 5", "Factor 6"]
         X = df_analysis[factors].dropna()
+        y = df.loc[X.index, "Consideration"]  # Use Consideration as target
+        # Remove any remaining NaN values
+        valid_mask = ~y.isna()
+        X = X[valid_mask]
+        y = y[valid_mask]
         if len(X) < 10:
+            logger.error(f"Not enough valid data points: {len(X)}")
+            return create_error_message(f"Not enough valid data points: {len(X)}. Need at least 10."), None, None, None
         model = LinearRegression()
         model.fit(X, y)
         r2 = r2_score(y, model.predict(X))
         r2_percent = r2 * 100
+        # Calculate average target (Consideration)
         avg_target = y.mean()
+        logger.info(f"R² Score: {r2_percent:.1f}%, Average Consideration: {avg_target:.1f}")
         # Create visualizations
         r2_html = calculate_r2_image(r2_percent)
         avg_target_html = create_avg_target_display(avg_target)
+        # Factor performance plot
+        factor_performance_img = plot_factor_performance(df_analysis, "Factor Performance (Agreement Scores)")
+        # Run Shapley analysis on Consideration
+        temp_dir = tempfile.mkdtemp()
+        csv_output_path = os.path.join(temp_dir, "consideration_results.csv")
+        # Call R script
+        r_success = call_r_script_simplified(file_path, csv_output_path)
+        if not r_success:
+            # Clean up and return error
+            try:
+                os.rmdir(temp_dir)
+            except:
+                pass
+            return create_error_message("R script failed to execute. Shapley analysis not available."), None, None, None
+        # Check if R script produced output file
+        if not os.path.exists(csv_output_path):
+            try:
+                os.rmdir(temp_dir)
+            except:
+                pass
+            return create_error_message("R script did not produce expected output file."), None, None, None
+        # Load results with renamed predictors
+        try:
+            results_df = pd.read_csv(csv_output_path)
+        except Exception as e:
+            logger.error(f"Error reading R script output: {e}")
+            try:
+                os.remove(csv_output_path)
+                os.rmdir(temp_dir)
+            except:
+                pass
+            return create_error_message(f"Error reading R script output: {e}"), None, None, None
+        # Validate R script output
+        if "Predictor" not in results_df.columns or "Importance" not in results_df.columns:
+            logger.error("R script output missing required columns")
+            try:
+                os.remove(csv_output_path)
+                os.rmdir(temp_dir)
+            except:
+                pass
+            return create_error_message("R script output is invalid - missing required columns."), None, None, None
+        # Map predictor names if they come from R script with original names
+        results_df["Predictor"] = results_df["Predictor"].map(
+            lambda x: column_mapping.get(x, x)
+        )
         results_df["Importance_percent"] = results_df["Importance"] * 100
         average_value = results_df["Importance_percent"].mean()
         driver_analysis_img = plot_driver_analysis(
             results_df,
             average_value,
+            "Shapley Driver Analysis - Purchase Consideration"
         )
+        # Clean up
+        try:
+            os.remove(csv_output_path)
+            os.rmdir(temp_dir)
+        except Exception as e:
+            logger.error(f"Error cleaning up temp files: {e}")
         return r2_html, avg_target_html, factor_performance_img, driver_analysis_img
     except Exception as e:
         logger.error(f"Error analyzing data: {e}")
         return create_error_message(f"Analysis failed: {str(e)}"), None, None, None
 def load_default_file():
     """Load default file on startup"""
     default_file = "example_files/Volkswagen Non Customers.xlsx"
 with gr.Blocks(css=css, js=js, theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
         <h2 style="text-align: center; font-size: 2.25rem; font-weight: 600;">
+            Driver Analysis - Purchase Consideration
         </h2>
     """)
+    gr.Markdown("### Purchase Consideration Analysis")
+    gr.Markdown("Analysis showing what drives Purchase Consideration among prospects using Factors 1-6")
     # File upload section
     with gr.Row():
         outputs=[r2_output, avg_target_output, factor_performance_plot, driver_analysis_plot]
     )
+# Launch the demo
 demo.launch(server_name="0.0.0.0", share=False)