Spaces:

CatLLM
/

survey-classifier

Running

App Files Files Community

chrissoria committed 9 days ago

Commit

c04a288

verified ·

1 Parent(s): 0481916

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

app.py +124 -21

app.py CHANGED Viewed

@@ -606,30 +606,126 @@ def run_classify_data(input_type, input_data, description, categories,
         return None, None, None, None, f"Error: {str(e)}"
-def create_distribution_chart(result_df, categories):
-    """Create a bar chart showing category distribution."""
-    fig, ax = plt.subplots(figsize=(10, max(4, len(categories) * 0.8)))
-    dist_data = []
     total_rows = len(result_df)
-    for i, cat in enumerate(categories, 1):
-        col_name = f"category_{i}"
-        if col_name in result_df.columns:
-            count = int(result_df[col_name].sum())
-            pct = (count / total_rows) * 100 if total_rows > 0 else 0
-            dist_data.append({"Category": cat, "Percentage": round(pct, 1)})
-    categories_list = [d["Category"] for d in dist_data][::-1]
-    percentages = [d["Percentage"] for d in dist_data][::-1]
-    bars = ax.barh(categories_list, percentages, color='#2563eb')
-    ax.set_xlim(0, 100)
-    ax.set_xlabel('Percentage (%)', fontsize=11)
-    ax.set_title('Category Distribution (%)', fontsize=14, fontweight='bold')
-    for bar, pct in zip(bars, percentages):
-        ax.text(bar.get_width() + 1, bar.get_y() + bar.get_height()/2,
-               f'{pct:.1f}%', va='center', fontsize=10)
     plt.tight_layout()
     return fig
@@ -1256,7 +1352,9 @@ with col_input:
                             'pdf_path': pdf_path,
                             'code': code,
                             'status': f"Classified {len(result_df)} items in {processing_time:.1f}s",
-                            'categories': categories_entered
                         }
                         st.success(f"Classified {len(result_df)} items in {processing_time:.1f}s")
                         st.rerun()
@@ -1270,7 +1368,12 @@ with col_output:
         results = st.session_state.results
         # Distribution chart
-        fig = create_distribution_chart(results['df'], results['categories'])
         st.pyplot(fig)
         st.caption("Note: Categories are not mutually exclusive—each item can belong to multiple categories.")

         return None, None, None, None, f"Error: {str(e)}"
def sanitize_model_name(model: str) -> str:
    """Turn a model identifier into a lowercase, column-safe suffix.

    Each character outside ``[a-zA-Z0-9]`` becomes an underscore, runs of
    underscores are squeezed to one, leading/trailing underscores are
    stripped, the text is lower-cased, and the result is cut to at most
    40 characters.

    NOTE(review): the original docstring states this mirrors catllm's own
    naming logic -- confirm against catllm before changing any step.

    Args:
        model: Raw model name (e.g. a provider/model string).

    Returns:
        The sanitized suffix, possibly empty.
    """
    import re

    suffix = model
    # First pass maps every non-alphanumeric character to "_"; the second
    # pass collapses the resulting runs of "_" into a single one.
    for pattern in (r'[^a-zA-Z0-9]', r'_+'):
        suffix = re.sub(pattern, '_', suffix)
    return suffix.strip('_').lower()[:40]
def _placeholder_figure(message):
    """Return a blank 10x4 figure that shows only *message*, centred."""
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.text(0.5, 0.5, message, ha='center', va='center', fontsize=14)
    ax.axis('off')
    return fig


def _category_percentage(result_df, col_name, total_rows):
    """Return ``sum(column) / total_rows * 100``, or None if the column is absent."""
    if col_name not in result_df.columns:
        return None
    count = int(result_df[col_name].sum())
    return (count / total_rows) * 100


def _single_series_chart(result_df, categories, total_rows, col_suffix, color, title):
    """Draw one horizontal bar per category found under ``category_<i><col_suffix>``."""
    fig, ax = plt.subplots(figsize=(10, max(4, len(categories) * 0.8)))

    labels = []
    percentages = []
    for i, cat in enumerate(categories, 1):
        pct = _category_percentage(result_df, f"category_{i}{col_suffix}", total_rows)
        # Categories whose column is missing are left out of the chart entirely.
        if pct is not None:
            labels.append(cat)
            percentages.append(round(pct, 1))

    # barh places the first item at the bottom; reverse so the first
    # category ends up at the top of the chart.
    labels.reverse()
    percentages.reverse()

    bars = ax.barh(labels, percentages, color=color)
    ax.set_xlim(0, 100)
    ax.set_xlabel('Percentage (%)', fontsize=11)
    ax.set_title(title, fontsize=14, fontweight='bold')
    for bar, pct in zip(bars, percentages):
        ax.text(bar.get_width() + 1, bar.get_y() + bar.get_height()/2,
               f'{pct:.1f}%', va='center', fontsize=10)
    return fig


def create_distribution_chart(result_df, categories, classify_mode="Single Model", models_list=None):
    """Create a bar chart showing category distribution.

    Percentages are computed as the sum of each category column divided by
    the number of rows in ``result_df``.

    Args:
        result_df: DataFrame with classification results. Columns are looked
            up as ``category_<i>`` (Single Model), ``category_<i>_consensus``
            (Ensemble) or ``category_<i>_<sanitized model name>`` (any other
            mode, treated as Model Comparison).
        categories: List of category names; position ``i`` (1-based) maps to
            the ``category_<i>...`` columns.
        classify_mode: "Single Model", "Model Comparison", or "Ensemble".
            Any value other than "Single Model"/"Ensemble" takes the
            comparison path.
        models_list: List of model names (for multi-model modes). ``None``
            is treated as an empty list.

    Returns:
        A matplotlib figure. When there are no rows, or when the comparison
        path is taken without any models, a placeholder figure with a short
        message is returned instead of a chart.
    """
    import numpy as np

    total_rows = len(result_df)
    if total_rows == 0:
        return _placeholder_figure('No data to display')

    if classify_mode == "Single Model":
        # Single model: use category_1, category_2, etc.
        fig = _single_series_chart(
            result_df, categories, total_rows,
            col_suffix="", color='#2563eb',
            title='Category Distribution (%)',
        )
    elif classify_mode == "Ensemble":
        # Ensemble: use category_1_consensus, category_2_consensus, etc.
        fig = _single_series_chart(
            result_df, categories, total_rows,
            col_suffix="_consensus", color='#16a34a',
            title='Ensemble Consensus Distribution (%)',
        )
    else:  # Model Comparison
        models = list(models_list) if models_list else []
        if not models:
            # Without this guard the bar height below would divide by zero.
            return _placeholder_figure('No models to compare')

        # Colors cycle if there are more models than entries.
        model_colors = ['#2563eb', '#dc2626', '#16a34a', '#ca8a04',
                        '#9333ea', '#0891b2', '#be185d', '#65a30d']
        n_models = len(models)
        n_categories = len(categories)
        fig, ax = plt.subplots(figsize=(12, max(5, n_categories * 1.2)))

        # Each category gets a group of bars spanning 0.8 of its slot.
        bar_height = 0.8 / n_models
        y_positions = np.arange(n_categories)

        for model_idx, model_name in enumerate(models):
            sanitized = sanitize_model_name(model_name)
            model_pcts = []
            for i in range(1, n_categories + 1):
                pct = _category_percentage(
                    result_df, f"category_{i}_{sanitized}", total_rows)
                # A missing column is plotted as 0 so the groups stay aligned.
                model_pcts.append(0 if pct is None else pct)

            # Reverse to match the reversed tick labels set below.
            model_pcts.reverse()

            # Centre the group of model bars on the category's tick.
            offset = (model_idx - n_models / 2 + 0.5) * bar_height
            color = model_colors[model_idx % len(model_colors)]
            # Use shorter display name
            display_name = model_name.split('/')[-1].split(':')[0][:20]
            ax.barh(y_positions + offset, model_pcts, bar_height * 0.9,
                    label=display_name, color=color, alpha=0.85)

        ax.set_yticks(y_positions)
        ax.set_yticklabels(categories[::-1])
        ax.set_xlim(0, 100)
        ax.set_xlabel('Percentage (%)', fontsize=11)
        ax.set_title('Category Distribution by Model (%)', fontsize=14, fontweight='bold')
        ax.legend(loc='lower right', fontsize=9)

    # Lay out the figure we built rather than pyplot's implicit current figure.
    fig.tight_layout()
    return fig
                             'pdf_path': pdf_path,
                             'code': code,
                             'status': f"Classified {len(result_df)} items in {processing_time:.1f}s",
+                            'categories': categories_entered,
+                            'classify_mode': classify_mode,
+                            'models_list': models_list,
                         }
                         st.success(f"Classified {len(result_df)} items in {processing_time:.1f}s")
                         st.rerun()
         results = st.session_state.results
         # Distribution chart
+        fig = create_distribution_chart(
+            results['df'],
+            results['categories'],
+            classify_mode=results.get('classify_mode', 'Single Model'),
+            models_list=results.get('models_list', [])
+        )
         st.pyplot(fig)
         st.caption("Note: Categories are not mutually exclusive—each item can belong to multiple categories.")