Spaces:

Mohaddz
/

Customer-classify

Runtime error

App Files Files Community

Mohaddz committed on Aug 29, 2025

Commit

2c7390f

verified ·

1 Parent(s): 5a8e848

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -13

app.py CHANGED Viewed

@@ -26,7 +26,6 @@ class MultiClientThemeClassifier:
             model_name = self.default_model
         try:
-            # Avoid reloading the same model
             if self.model_loaded and hasattr(self.model, 'tokenizer') and self.model.tokenizer.name_or_path == model_name:
                 return f"✅ Model '{model_name}' is already loaded."
@@ -92,17 +91,18 @@ class MultiClientThemeClassifier:
             return f"Error: {str(e)}", 0.0, {}
     def benchmark_csv(self, csv_content: str, client_id: str) -> Tuple[str, Optional[str], Optional[str]]:
-        """Benchmark the model on a CSV file"""
         error_status = self._ensure_model_is_loaded()
         if error_status: return f"❌ Model could not be loaded: {error_status}", None, None
         try:
-            # CORRECTED: Use encoding 'utf-8-sig' to handle the invisible BOM character
-            df = pd.read_csv(io.StringIO(csv_content), encoding='utf-8-sig')
             if 'text' not in df.columns or 'real_tag' not in df.columns:
-                return "❌ CSV must have 'text' and 'real_tag' columns!", None, None
             df.dropna(subset=['text', 'real_tag'], inplace=True)
             df['text'] = df['text'].astype(str)
             df['real_tag'] = df['real_tag'].astype(str)
@@ -121,11 +121,9 @@ class MultiClientThemeClassifier:
             results_summary = f"📊 **Benchmarking Results**\n\n**Accuracy: {accuracy:.2%}** ({correct}/{total})"
-            # Create visualization
-            fig = px.bar(df['real_tag'].value_counts(), title="Theme Distribution in Dataset", labels={'index': 'Theme', 'value': 'Count'})
             visualization_html = fig.to_html()
-            # Save results to a temporary file for download
             temp_file_path = tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False, encoding='utf-8-sig').name
             df.to_csv(temp_file_path, index=False)
@@ -164,23 +162,28 @@ def benchmark_interface(csv_file, client_id: str):
     if csv_file is None:
         return "Please upload a CSV file!", None, None
     try:
         if hasattr(csv_file, 'read'):
-            csv_content = csv_file.read().decode('utf-8')
         else:
-            csv_content = csv_file
         return classifier.benchmark_csv(csv_content, client_id)
     except Exception as e:
         error_details = traceback.format_exc()
         return f"❌ Error processing CSV file: {str(e)}\n\nDetails:\n{error_details}", None, None
-# --- Gradio Interface (No Changes Below) ---
 with gr.Blocks(title="Custom Themes Classification MVP", theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🎯 Custom Themes Classification - MVP")
     with gr.Tab("🚀 Setup & Model"):
         gr.Markdown("### Step 1: Load the Embedding Model (Optional)")
-        gr.Markdown("If you don't load a model, a default one (`Qwen/Qwen3-Embedding-0.6B`) will be loaded automatically on first use.")
         with gr.Row():
             model_input = gr.Textbox(label="HuggingFace Model Name", value="Qwen/Qwen3-Embedding-0.6B")
             load_btn = gr.Button("Load Model", variant="primary")

             model_name = self.default_model
         try:
             if self.model_loaded and hasattr(self.model, 'tokenizer') and self.model.tokenizer.name_or_path == model_name:
                 return f"✅ Model '{model_name}' is already loaded."
             return f"Error: {str(e)}", 0.0, {}
     def benchmark_csv(self, csv_content: str, client_id: str) -> Tuple[str, Optional[str], Optional[str]]:
+        """Benchmark the model on a CSV file. Assumes csv_content is a clean string."""
         error_status = self._ensure_model_is_loaded()
         if error_status: return f"❌ Model could not be loaded: {error_status}", None, None
         try:
+            # The string is now clean, so no special encoding is needed here.
+            df = pd.read_csv(io.StringIO(csv_content))
+            # Check for columns after reading
             if 'text' not in df.columns or 'real_tag' not in df.columns:
+                return f"❌ CSV must have 'text' and 'real_tag' columns! Found: {df.columns.to_list()}", None, None
             df.dropna(subset=['text', 'real_tag'], inplace=True)
             df['text'] = df['text'].astype(str)
             df['real_tag'] = df['real_tag'].astype(str)
             results_summary = f"📊 **Benchmarking Results**\n\n**Accuracy: {accuracy:.2%}** ({correct}/{total})"
+            fig = px.bar(df['real_tag'].value_counts(), title="Theme Distribution", labels={'index': 'Theme', 'value': 'Count'})
             visualization_html = fig.to_html()
             temp_file_path = tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False, encoding='utf-8-sig').name
             df.to_csv(temp_file_path, index=False)
     if csv_file is None:
         return "Please upload a CSV file!", None, None
     try:
+        # CORRECTED AND FINAL FIX: Handle the BOM at the point of file reading.
         if hasattr(csv_file, 'read'):
+            # It's a file-like object (TemporaryFile), read its bytes and decode with utf-8-sig
+            csv_content = csv_file.read().decode('utf-8-sig')
         else:
+            # It's a string (NamedString), which was likely decoded with 'utf-8'.
+            # Manually remove the BOM if it exists.
+            csv_content = str(csv_file).lstrip('\ufeff')
+        # Now, pass the clean string to the benchmark function
         return classifier.benchmark_csv(csv_content, client_id)
     except Exception as e:
         error_details = traceback.format_exc()
         return f"❌ Error processing CSV file: {str(e)}\n\nDetails:\n{error_details}", None, None
+# --- Gradio Interface ---
 with gr.Blocks(title="Custom Themes Classification MVP", theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🎯 Custom Themes Classification - MVP")
     with gr.Tab("🚀 Setup & Model"):
         gr.Markdown("### Step 1: Load the Embedding Model (Optional)")
+        gr.Markdown("A default model (`Qwen/Qwen3-Embedding-0.6B`) will load automatically on first use.")
         with gr.Row():
             model_input = gr.Textbox(label="HuggingFace Model Name", value="Qwen/Qwen3-Embedding-0.6B")
             load_btn = gr.Button("Load Model", variant="primary")