Spaces:

Mohaddz
/

Customer-classify

Runtime error

App Files Files Community

Mohaddz committed on Aug 29, 2025

Commit

40a7521

verified ·

1 Parent(s): 2403d73

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -48

app.py CHANGED Viewed

@@ -9,6 +9,7 @@ import plotly.express as px
 import plotly.graph_objects as go
 from collections import defaultdict
 import json
 class MultiClientThemeClassifier:
     def __init__(self):
@@ -20,7 +21,7 @@ class MultiClientThemeClassifier:
         """Load the embedding model"""
         if not self.model_loaded:
             try:
-                # Using a smaller, faster model for demo
                 self.model = SentenceTransformer('Qwen/Qwen3-Embedding-0.6B')
                 self.model_loaded = True
                 return "✅ Model loaded successfully!"
@@ -88,47 +89,63 @@ class MultiClientThemeClassifier:
             return "❌ Model not loaded!", "", ""
         try:
             # Read CSV
             df = pd.read_csv(io.StringIO(csv_content))
             # Validate CSV format
             if 'text' not in df.columns or 'real_tag' not in df.columns:
                 return "❌ CSV must have 'text' and 'real_tag' columns!", "", ""
             # Get unique themes from CSV
             unique_themes = df['real_tag'].unique().tolist()
             # Add themes for this client (using theme names as prototypes for demo)
-            self.add_client_themes(client_id, unique_themes)
-            # Classify all texts
             predictions = []
             confidences = []
-            for _, row in df.iterrows():
-                pred_theme, confidence, _ = self.classify_text(row['text'], client_id)
-                predictions.append(pred_theme)
-                confidences.append(confidence)
             df['predicted_tag'] = predictions
             df['confidence'] = confidences
             # Calculate metrics
-            correct = (df['real_tag'] == df['predicted_tag']).sum()
-            total = len(df)
-            accuracy = correct / total
-            # Create confusion matrix data
-            confusion_data = []
-            for real_tag in unique_themes:
-                for pred_tag in unique_themes + ['UNKNOWN_THEME']:
-                    count = len(df[(df['real_tag'] == real_tag) & (df['predicted_tag'] == pred_tag)])
-                    if count > 0:
-                        confusion_data.append({
-                            'Real': real_tag,
-                            'Predicted': pred_tag,
-                            'Count': count
-                        })
             # Generate results summary
             results_summary = f"""
@@ -138,32 +155,41 @@ class MultiClientThemeClassifier:
 - Total samples: {total}
 - Correct predictions: {correct}
 - **Accuracy: {accuracy:.2%}**
-- Average confidence: {np.mean(confidences):.3f}
 **Per-Theme Breakdown:**
 """
             for theme in unique_themes:
-                theme_df = df[df['real_tag'] == theme]
-                theme_correct = (theme_df['real_tag'] == theme_df['predicted_tag']).sum()
-                theme_total = len(theme_df)
-                theme_acc = theme_correct / theme_total if theme_total > 0 else 0
-                avg_conf = theme_df['confidence'].mean()
-                results_summary += f"- **{theme}**: {theme_acc:.2%} ({theme_correct}/{theme_total}) - Avg conf: {avg_conf:.3f}\n"
-            # Create visualization
-            fig = px.bar(
-                x=unique_themes,
-                y=[len(df[df['real_tag'] == theme]) for theme in unique_themes],
-                title="Theme Distribution in Dataset",
-                labels={'x': 'Themes', 'y': 'Count'}
-            )
-            return results_summary, df.to_csv(index=False), fig.to_html()
         except Exception as e:
-            return f"❌ Error during benchmarking: {str(e)}", "", ""
 # Initialize the classifier
 classifier = MultiClientThemeClassifier()
@@ -204,21 +230,30 @@ def benchmark_interface(csv_file, client_id: str):
         return "Please upload a CSV file!", "", ""
     try:
-        csv_content = csv_file.decode('utf-8')
         return classifier.benchmark_csv(csv_content, client_id)
     except Exception as e:
-        return f"❌ Error reading CSV: {str(e)}", "", ""
 # Create the Gradio interface
-with gr.Blocks(title="Company Custom Themes Classification MVP", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    # 🎯 Company Custom Themes Classification - MVP
     **A scalable, cost-effective solution for multi-client theme classification**
     This demo showcases an embedding-based approach that can:
     - ✅ Handle multiple clients with different themes
-    - ✅ Distinguish between similar themes (e.g., "التمويل العقاري" vs "التمويل الشخصي")
     - ✅ Process ~1M posts/day at low cost (~$500/month vs $30k/month for pure LLM)
     - ✅ Provide confidence scores and similarity breakdowns
     """)
@@ -236,7 +271,7 @@ with gr.Blocks(title="Company Custom Themes Classification MVP", theme=gr.themes
             themes_input = gr.Textbox(
                 label="Themes (one per line)",
                 lines=5,
-                placeholder="e.g.:\nالتمويل العقاري\nالتمويل الشخصي\nالتعليم الأهلي\nالرياضة"
             )
         add_themes_btn = gr.Button("Add Themes", variant="secondary")
@@ -256,7 +291,7 @@ with gr.Blocks(title="Company Custom Themes Classification MVP", theme=gr.themes
                 text_input = gr.Textbox(
                     label="Text to Classify",
                     lines=3,
-                    placeholder="Enter Arabic or English text..."
                 )
                 client_select = gr.Textbox(
                     label="Client ID",
@@ -320,7 +355,7 @@ with gr.Blocks(title="Company Custom Themes Classification MVP", theme=gr.themes
         gr.Markdown("""
         ## 🎯 Solution Overview
-        This MVP demonstrates a **hybrid embedding-based approach** for Company's Custom Themes feature:
         ### ✅ Key Advantages:
         1. **Cost Effective**: ~$500/month vs $30,000/month for pure LLM approach
@@ -330,7 +365,7 @@ with gr.Blocks(title="Company Custom Themes Classification MVP", theme=gr.themes
         5. **Confidence Scoring**: Provides transparency in predictions
         ### 🏗️ Architecture:
-        1. **Embedding Model**: SentenceTransformers for semantic understanding
         2. **Theme Prototypes**: Each client's themes represented as embedding vectors
         3. **Similarity Matching**: Cosine similarity for classification
         4. **Confidence Thresholding**: Flags uncertain predictions
@@ -348,7 +383,7 @@ with gr.Blocks(title="Company Custom Themes Classification MVP", theme=gr.themes
         ---
-        **Built for Company's Case Study | Scalable Theme Classification MVP**
         """)
 # Launch the app

 import plotly.graph_objects as go
 from collections import defaultdict
 import json
+import traceback
 class MultiClientThemeClassifier:
     def __init__(self):
         """Load the embedding model"""
         if not self.model_loaded:
             try:
+                # Using Qwen embedding model
                 self.model = SentenceTransformer('Qwen/Qwen3-Embedding-0.6B')
                 self.model_loaded = True
                 return "✅ Model loaded successfully!"
             return "❌ Model not loaded!", "", ""
         try:
+            print("Starting CSV benchmark...")
             # Read CSV
             df = pd.read_csv(io.StringIO(csv_content))
+            print(f"CSV loaded with shape: {df.shape}")
+            print(f"CSV columns: {df.columns.tolist()}")
             # Validate CSV format
             if 'text' not in df.columns or 'real_tag' not in df.columns:
                 return "❌ CSV must have 'text' and 'real_tag' columns!", "", ""
+            # Clean data
+            df = df.dropna(subset=['text', 'real_tag'])
+            df['text'] = df['text'].astype(str)
+            df['real_tag'] = df['real_tag'].astype(str)
+            print(f"After cleaning: {df.shape}")
             # Get unique themes from CSV
             unique_themes = df['real_tag'].unique().tolist()
+            print(f"Unique themes found: {len(unique_themes)} - {unique_themes}")
             # Add themes for this client (using theme names as prototypes for demo)
+            theme_add_result = self.add_client_themes(client_id, unique_themes)
+            print(f"Theme addition result: {theme_add_result}")
+            # Classify all texts with progress
             predictions = []
             confidences = []
+            print("Starting classification...")
+            for idx, row in df.iterrows():
+                try:
+                    text = str(row['text'])[:500]  # Limit text length
+                    pred_theme, confidence, _ = self.classify_text(text, client_id)
+                    predictions.append(pred_theme)
+                    confidences.append(confidence)
+                    if idx % 10 == 0:  # Progress logging
+                        print(f"Processed {idx + 1}/{len(df)} samples")
+                except Exception as e:
+                    print(f"Error classifying row {idx}: {str(e)}")
+                    predictions.append("ERROR")
+                    confidences.append(0.0)
+            print("Classification complete!")
             df['predicted_tag'] = predictions
             df['confidence'] = confidences
             # Calculate metrics
+            valid_predictions = df[df['predicted_tag'] != 'ERROR']
+            correct = (valid_predictions['real_tag'] == valid_predictions['predicted_tag']).sum()
+            total = len(valid_predictions)
+            accuracy = correct / total if total > 0 else 0
+            print(f"Metrics calculated: {correct}/{total} = {accuracy:.2%}")
             # Generate results summary
             results_summary = f"""
 - Total samples: {total}
 - Correct predictions: {correct}
 - **Accuracy: {accuracy:.2%}**
+- Average confidence: {np.mean([c for c in confidences if c > 0]):.3f}
 **Per-Theme Breakdown:**
 """
             for theme in unique_themes:
+                theme_df = valid_predictions[valid_predictions['real_tag'] == theme]
+                if len(theme_df) > 0:
+                    theme_correct = (theme_df['real_tag'] == theme_df['predicted_tag']).sum()
+                    theme_total = len(theme_df)
+                    theme_acc = theme_correct / theme_total if theme_total > 0 else 0
+                    avg_conf = theme_df['confidence'].mean()
+                    results_summary += f"- **{theme}**: {theme_acc:.2%} ({theme_correct}/{theme_total}) - Avg conf: {avg_conf:.3f}\n"
+            # Create simple visualization
+            try:
+                theme_counts = [len(df[df['real_tag'] == theme]) for theme in unique_themes]
+                fig = px.bar(
+                    x=unique_themes,
+                    y=theme_counts,
+                    title="Theme Distribution in Dataset",
+                    labels={'x': 'Themes', 'y': 'Count'}
+                )
+                visualization_html = fig.to_html()
+            except Exception as viz_error:
+                print(f"Visualization error: {viz_error}")
+                visualization_html = "<p>Visualization error occurred</p>"
+            return results_summary, df.to_csv(index=False), visualization_html
         except Exception as e:
+            error_details = traceback.format_exc()
+            print(f"Full error: {error_details}")
+            return f"❌ Error during benchmarking: {str(e)}\n\nFull traceback:\n{error_details}", "", ""
 # Initialize the classifier
 classifier = MultiClientThemeClassifier()
         return "Please upload a CSV file!", "", ""
     try:
+        # Handle both file objects and file paths
+        if hasattr(csv_file, 'read'):
+            csv_content = csv_file.read().decode('utf-8')
+        elif hasattr(csv_file, 'name'):
+            with open(csv_file.name, 'r', encoding='utf-8') as f:
+                csv_content = f.read()
+        else:
+            csv_content = str(csv_file)
         return classifier.benchmark_csv(csv_content, client_id)
     except Exception as e:
+        error_details = traceback.format_exc()
+        return f"❌ Error reading CSV: {str(e)}\n\nDetails:\n{error_details}", "", ""
 # Create the Gradio interface
+with gr.Blocks(title="Custom Themes Classification MVP", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
+    # 🎯 Custom Themes Classification - MVP
     **A scalable, cost-effective solution for multi-client theme classification**
     This demo showcases an embedding-based approach that can:
     - ✅ Handle multiple clients with different themes
+    - ✅ Distinguish between similar themes (e.g., "Real Estate Financing" vs "Personal Financing")
     - ✅ Process ~1M posts/day at low cost (~$500/month vs $30k/month for pure LLM)
     - ✅ Provide confidence scores and similarity breakdowns
     """)
             themes_input = gr.Textbox(
                 label="Themes (one per line)",
                 lines=5,
+                placeholder="e.g.:\nReal Estate Financing\nPersonal Financing\nPrivate Education\nSports"
             )
         add_themes_btn = gr.Button("Add Themes", variant="secondary")
                 text_input = gr.Textbox(
                     label="Text to Classify",
                     lines=3,
+                    placeholder="Enter text to classify..."
                 )
                 client_select = gr.Textbox(
                     label="Client ID",
         gr.Markdown("""
         ## 🎯 Solution Overview
+        This MVP demonstrates a **hybrid embedding-based approach** for Custom Themes classification:
         ### ✅ Key Advantages:
         1. **Cost Effective**: ~$500/month vs $30,000/month for pure LLM approach
         5. **Confidence Scoring**: Provides transparency in predictions
         ### 🏗️ Architecture:
+        1. **Embedding Model**: Qwen/Qwen3-Embedding-0.6B for semantic understanding
         2. **Theme Prototypes**: Each client's themes represented as embedding vectors
         3. **Similarity Matching**: Cosine similarity for classification
         4. **Confidence Thresholding**: Flags uncertain predictions
         ---
+        **Scalable Theme Classification MVP**
         """)
 # Launch the app