Spaces:

Mohaddz
/

Customer-classify

Sleeping

App Files Community

Mohaddz committed on Aug 29, 2025

Commit

7b53477

verified ·

1 Parent(s): 92204cf

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -127

app.py CHANGED Viewed

@@ -11,6 +11,7 @@ from collections import defaultdict
 import json
 import traceback
 import spaces # Import the spaces library
 class MultiClientThemeClassifier:
     def __init__(self):
@@ -25,7 +26,8 @@ class MultiClientThemeClassifier:
             model_name = self.default_model
         try:
-            if self.model_loaded and self.model.name_or_path == model_name:
                 return f"✅ Model '{model_name}' is already loaded."
             self.model = None
@@ -52,10 +54,8 @@ class MultiClientThemeClassifier:
     def add_client_themes(self, client_id: str, themes: List[str], examples_per_theme: Dict[str, List[str]] = None):
         """Add themes for a specific client"""
-        # Automatically load model if needed
         error_status = self._ensure_model_is_loaded()
-        if error_status:
-            return error_status
         try:
             self.client_themes[client_id] = {}
@@ -68,10 +68,8 @@ class MultiClientThemeClassifier:
     def classify_text(self, text: str, client_id: str, confidence_threshold: float = 0.3) -> Tuple[str, float, Dict[str, float]]:
         """Classify a single text for a specific client"""
-        # Automatically load model if needed
         error_status = self._ensure_model_is_loaded()
-        if error_status:
-            return f"Error: {error_status}", 0.0, {}
         if client_id not in self.client_themes:
             return "Client not found", 0.0, {}
@@ -81,8 +79,7 @@ class MultiClientThemeClassifier:
             similarities = {theme: util.cos_sim(text_embedding, prototype).item()
                             for theme, prototype in self.client_themes[client_id].items()}
-            if not similarities:
-                 return "No themes for client", 0.0, {}
             best_theme = max(similarities, key=similarities.get)
             best_score = similarities[best_theme]
@@ -96,10 +93,8 @@ class MultiClientThemeClassifier:
     def benchmark_csv(self, csv_content: str, client_id: str) -> Tuple[str, Optional[str], Optional[str]]:
         """Benchmark the model on a CSV file"""
-        # Automatically load model if needed
         error_status = self._ensure_model_is_loaded()
-        if error_status:
-            return f"❌ Model could not be loaded: {error_status}", None, None
         try:
             df = pd.read_csv(io.StringIO(csv_content))
@@ -124,10 +119,15 @@ class MultiClientThemeClassifier:
             results_summary = f"📊 **Benchmarking Results**\n\n**Accuracy: {accuracy:.2%}** ({correct}/{total})"
-            with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False, encoding='utf-8') as temp_file:
-                df.to_csv(temp_file.name, index=False)
-                fig = px.bar(df['real_tag'].value_counts(), title="Theme Distribution")
-                return results_summary, temp_file.name, fig.to_html()
         except Exception as e:
             error_details = traceback.format_exc()
@@ -142,15 +142,13 @@ def load_model_interface(model_name: str):
 @spaces.GPU
 def add_themes_interface(client_id: str, themes_text: str):
-    if not themes_text.strip():
-        return "❌ Please enter themes!"
     themes = [theme.strip() for theme in themes_text.split('\n') if theme.strip()]
     return classifier.add_client_themes(client_id, themes)
 @spaces.GPU
 def classify_interface(text: str, client_id: str, confidence_threshold: float):
-    if not text.strip():
-        return "Please enter text to classify!", ""
     pred_theme, confidence, similarities = classifier.classify_text(text, client_id, confidence_threshold)
@@ -164,150 +162,66 @@ def benchmark_interface(csv_file, client_id: str):
     if csv_file is None:
         return "Please upload a CSV file!", None, None
     try:
-        csv_content = csv_file.read().decode('utf-8')
         return classifier.benchmark_csv(csv_content, client_id)
     except Exception as e:
-        return f"❌ Error reading CSV: {str(e)}", None, None
 # --- Gradio Interface (No Changes Below) ---
-# Create the Gradio interface
 with gr.Blocks(title="Custom Themes Classification MVP", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("""
-    # 🎯 Custom Themes Classification - MVP
-    **A scalable, cost-effective solution for multi-client theme classification**
-    This demo showcases an embedding-based approach that can:
-    - ✅ Handle multiple clients with different themes
-    - ✅ Distinguish between similar themes (e.g., "Real Estate Financing" vs "Personal Financing")
-    - ✅ Process ~1M posts/day at low cost (~$500/month vs $30k/month for pure LLM)
-    - ✅ Provide confidence scores and similarity breakdowns
-    """)
     with gr.Tab("🚀 Setup & Model"):
         gr.Markdown("### Step 1: Load the Embedding Model (Optional)")
-        gr.Markdown("If you don't load a model, a default one will be loaded automatically on first use.")
         with gr.Row():
-            model_input = gr.Textbox(
-                label="HuggingFace Model Name",
-                value="Qwen/Qwen3-Embedding-0.6B",
-                placeholder="e.g., sentence-transformers/all-MiniLM-L6-v2",
-                info="Enter any SentenceTransformer-compatible model from HuggingFace"
-            )
             load_btn = gr.Button("Load Model", variant="primary")
         load_status = gr.Textbox(label="Status", interactive=False)
-        gr.Markdown("""
-        **Popular Models:**
-        - `Qwen/Qwen3-Embedding-0.6B` - High quality, multilingual
-        - `sentence-transformers/all-MiniLM-L6-v2` - Fast, lightweight
-        - `sentence-transformers/all-mpnet-base-v2` - High accuracy
-        """)
         load_btn.click(load_model_interface, inputs=[model_input], outputs=load_status)
         gr.Markdown("### Step 2: Add Themes for a Client")
         with gr.Row():
             client_input = gr.Textbox(label="Client ID", placeholder="e.g., client_1")
-            themes_input = gr.Textbox(
-                label="Themes (one per line)",
-                lines=5,
-                placeholder="e.g.:\nReal Estate Financing\nPersonal Financing\nPrivate Education\nSports"
-            )
         add_themes_btn = gr.Button("Add Themes", variant="secondary")
         themes_status = gr.Textbox(label="Status", interactive=False)
-        add_themes_btn.click(
-            add_themes_interface,
-            inputs=[client_input, themes_input],
-            outputs=themes_status
-        )
     with gr.Tab("🔍 Single Text Classification"):
         gr.Markdown("### Classify Individual Posts")
         with gr.Row():
             with gr.Column():
-                text_input = gr.Textbox(
-                    label="Text to Classify",
-                    lines=3,
-                    placeholder="Enter text to classify..."
-                )
-                client_select = gr.Textbox(
-                    label="Client ID",
-                    placeholder="e.g., client_1"
-                )
-                confidence_slider = gr.Slider(
-                    minimum=0.0,
-                    maximum=1.0,
-                    value=0.3,
-                    step=0.1,
-                    label="Confidence Threshold"
-                )
                 classify_btn = gr.Button("Classify", variant="primary")
             with gr.Column():
                 classification_result = gr.Markdown(label="Results")
-        classify_btn.click(
-            classify_interface,
-            inputs=[text_input, client_select, confidence_slider],
-            outputs=[classification_result, gr.Textbox(visible=False)]
-        )
     with gr.Tab("📊 CSV Benchmarking"):
-        gr.Markdown("""
-        ### Benchmark on Your Dataset
-        Upload a CSV file with columns:
-        - `text`: The posts/content to classify
-        - `real_tag`: The correct theme labels
-        The system will automatically extract unique themes and evaluate performance.
-        """)
         with gr.Row():
             with gr.Column():
-                csv_upload = gr.File(
-                    label="Upload CSV File",
-                    file_types=[".csv"]
-                )
-                benchmark_client = gr.Textbox(
-                    label="Client ID for Benchmark",
-                    placeholder="e.g., benchmark_client"
-                )
                 benchmark_btn = gr.Button("Run Benchmark", variant="primary")
             with gr.Column():
                 benchmark_results = gr.Markdown(label="Benchmark Results")
         with gr.Row():
             results_csv = gr.File(label="Download Detailed Results", interactive=False)
             visualization = gr.HTML(label="Visualization")
-        benchmark_btn.click(
-            benchmark_interface,
-            inputs=[csv_upload, benchmark_client],
-            outputs=[benchmark_results, results_csv, visualization]
-        )
-    with gr.Tab("📋 About & Usage"):
-        gr.Markdown("""
-        ## 🎯 Solution Overview
-        This MVP demonstrates a **hybrid embedding-based approach** for Custom Themes classification.
-        ### 🏗️ Architecture:
-        1. **Embedding Model**: Customizable SentenceTransformer models from HuggingFace
-        2. **Theme Prototypes**: Each client's themes represented as embedding vectors
-        3. **Similarity Matching**: Cosine similarity for classification
-        4. **Automatic Loading**: The application will automatically load a default model if one is not present, making it resilient to platform hibernation.
-        """)
 # Launch the app
 if __name__ == "__main__":
-    import tempfile
     demo.launch(share=True)

 import json
 import traceback
 import spaces # Import the spaces library
+import tempfile
 class MultiClientThemeClassifier:
     def __init__(self):
             model_name = self.default_model
         try:
+            # Avoid reloading the same model
+            if self.model_loaded and hasattr(self.model, 'tokenizer') and self.model.tokenizer.name_or_path == model_name:
                 return f"✅ Model '{model_name}' is already loaded."
             self.model = None
     def add_client_themes(self, client_id: str, themes: List[str], examples_per_theme: Dict[str, List[str]] = None):
         """Add themes for a specific client"""
         error_status = self._ensure_model_is_loaded()
+        if error_status: return error_status
         try:
             self.client_themes[client_id] = {}
     def classify_text(self, text: str, client_id: str, confidence_threshold: float = 0.3) -> Tuple[str, float, Dict[str, float]]:
         """Classify a single text for a specific client"""
         error_status = self._ensure_model_is_loaded()
+        if error_status: return f"Error: {error_status}", 0.0, {}
         if client_id not in self.client_themes:
             return "Client not found", 0.0, {}
             similarities = {theme: util.cos_sim(text_embedding, prototype).item()
                             for theme, prototype in self.client_themes[client_id].items()}
+            if not similarities: return "No themes for client", 0.0, {}
             best_theme = max(similarities, key=similarities.get)
             best_score = similarities[best_theme]
     def benchmark_csv(self, csv_content: str, client_id: str) -> Tuple[str, Optional[str], Optional[str]]:
         """Benchmark the model on a CSV file"""
         error_status = self._ensure_model_is_loaded()
+        if error_status: return f"❌ Model could not be loaded: {error_status}", None, None
         try:
             df = pd.read_csv(io.StringIO(csv_content))
             results_summary = f"📊 **Benchmarking Results**\n\n**Accuracy: {accuracy:.2%}** ({correct}/{total})"
+            # Create visualization
+            fig = px.bar(df['real_tag'].value_counts(), title="Theme Distribution in Dataset", labels={'index': 'Theme', 'value': 'Count'})
+            visualization_html = fig.to_html()
+            # Save results to a temporary file for download
+            temp_file_path = tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False, encoding='utf-8').name
+            df.to_csv(temp_file_path, index=False)
+            return results_summary, temp_file_path, visualization_html
         except Exception as e:
             error_details = traceback.format_exc()
 @spaces.GPU
 def add_themes_interface(client_id: str, themes_text: str):
+    if not themes_text.strip(): return "❌ Please enter themes!"
     themes = [theme.strip() for theme in themes_text.split('\n') if theme.strip()]
     return classifier.add_client_themes(client_id, themes)
 @spaces.GPU
 def classify_interface(text: str, client_id: str, confidence_threshold: float):
+    if not text.strip(): return "Please enter text to classify!", ""
     pred_theme, confidence, similarities = classifier.classify_text(text, client_id, confidence_threshold)
     if csv_file is None:
         return "Please upload a CSV file!", None, None
     try:
+        # CORRECTED: Handle both file-like objects and string/NamedString objects from Gradio
+        if hasattr(csv_file, 'read'):
+            # It's a file-like object, read and decode it
+            csv_content = csv_file.read().decode('utf-8')
+        else:
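+            # It's a string or NamedString; it is passed on as-is as the CSV text (NOTE(review): Gradio may supply a file path here rather than the file's contents — confirm)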
+            csv_content = csv_file
         return classifier.benchmark_csv(csv_content, client_id)
     except Exception as e:
+        error_details = traceback.format_exc()
+        return f"❌ Error processing CSV file: {str(e)}\n\nDetails:\n{error_details}", None, None
 # --- Gradio Interface (No Changes Below) ---
 with gr.Blocks(title="Custom Themes Classification MVP", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🎯 Custom Themes Classification - MVP")
     with gr.Tab("🚀 Setup & Model"):
         gr.Markdown("### Step 1: Load the Embedding Model (Optional)")
+        gr.Markdown("If you don't load a model, a default one (`Qwen/Qwen3-Embedding-0.6B`) will be loaded automatically on first use.")
         with gr.Row():
+            model_input = gr.Textbox(label="HuggingFace Model Name", value="Qwen/Qwen3-Embedding-0.6B")
             load_btn = gr.Button("Load Model", variant="primary")
         load_status = gr.Textbox(label="Status", interactive=False)
         load_btn.click(load_model_interface, inputs=[model_input], outputs=load_status)
         gr.Markdown("### Step 2: Add Themes for a Client")
         with gr.Row():
             client_input = gr.Textbox(label="Client ID", placeholder="e.g., client_1")
+            themes_input = gr.Textbox(label="Themes (one per line)", lines=5)
         add_themes_btn = gr.Button("Add Themes", variant="secondary")
         themes_status = gr.Textbox(label="Status", interactive=False)
+        add_themes_btn.click(add_themes_interface, inputs=[client_input, themes_input], outputs=themes_status)
     with gr.Tab("🔍 Single Text Classification"):
         gr.Markdown("### Classify Individual Posts")
         with gr.Row():
             with gr.Column():
+                text_input = gr.Textbox(label="Text to Classify", lines=3)
+                client_select = gr.Textbox(label="Client ID", placeholder="e.g., client_1")
+                confidence_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.1, label="Confidence Threshold")
                 classify_btn = gr.Button("Classify", variant="primary")
             with gr.Column():
                 classification_result = gr.Markdown(label="Results")
+        classify_btn.click(classify_interface, inputs=[text_input, client_select, confidence_slider], outputs=[classification_result, gr.Textbox(visible=False)])
     with gr.Tab("📊 CSV Benchmarking"):
+        gr.Markdown("### Benchmark on Your Dataset\nUpload a CSV with `text` and `real_tag` columns.")
         with gr.Row():
             with gr.Column():
+                csv_upload = gr.File(label="Upload CSV File", file_types=[".csv"])
+                benchmark_client = gr.Textbox(label="Client ID for Benchmark", placeholder="e.g., benchmark_client")
                 benchmark_btn = gr.Button("Run Benchmark", variant="primary")
             with gr.Column():
                 benchmark_results = gr.Markdown(label="Benchmark Results")
         with gr.Row():
             results_csv = gr.File(label="Download Detailed Results", interactive=False)
             visualization = gr.HTML(label="Visualization")
+        benchmark_btn.click(benchmark_interface, inputs=[csv_upload, benchmark_client], outputs=[benchmark_results, results_csv, visualization])
 # Launch the app
 if __name__ == "__main__":
     demo.launch(share=True)