Spaces:

CatLLM
/

survey-classifier

Running

chrissoria Claude committed 19 days ago

Commit

7165ab2

1 Parent(s): 1c1f244

Add 5 UX improvements: progress indicator, example dataset, better placeholders, dark mode, large file warning

- Convert classify_data() to generator for real-time progress updates during classification
- Add "Try Example Dataset" button with sample survey responses
- Update category placeholder text with diverse, helpful examples
- Add dark mode support via gr.themes.Soft()
- Show warning for datasets > 1000 rows with time estimate

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (2) hide show

app.py +83 -21
example_data.csv +5 -0

app.py CHANGED Viewed

@@ -399,6 +399,21 @@ def get_model_source(model):
     return "huggingface"
 def load_columns(file):
     if file is None:
         return gr.update(choices=[], value=None), "Please upload a file first"
@@ -411,9 +426,18 @@ def load_columns(file):
             df = pd.read_excel(file_path)
         columns = df.columns.tolist()
         return (
             gr.update(choices=columns, value=columns[0] if columns else None),
-            f"Loaded {len(df)} rows. Select column and click Classify."
         )
     except Exception as e:
         return gr.update(choices=[], value=None), f"**Error:** {str(e)}"
@@ -422,15 +446,17 @@ def load_columns(file):
 def classify_data(spreadsheet_file, spreadsheet_column,
                   cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10,
                   model_tier, model, model_source_input, api_key_input):
-    """Main classification function. Returns distribution, samples, full results, files, and status."""
     if not CATLLM_AVAILABLE:
-        return None, None, None, None, "**Error:** catllm package not available"
     all_cats = [cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10]
     categories = [c.strip() for c in all_cats if c and c.strip()]
     if not categories:
-        return None, None, None, None, "**Error:** Please enter at least one category"
     actual_model = model
@@ -440,31 +466,38 @@ def classify_data(spreadsheet_file, spreadsheet_column,
         if model in HF_ROUTED_MODELS:
             actual_api_key = os.environ.get("HF_API_KEY", "")
             if not actual_api_key:
-                return None, None, None, None, "**Error:** HuggingFace API key not configured in Space secrets"
         elif "gpt" in model.lower():
             actual_api_key = os.environ.get("OPENAI_API_KEY", "")
             if not actual_api_key:
-                return None, None, None, None, "**Error:** OpenAI API key not configured in Space secrets"
         elif "gemini" in model.lower():
             actual_api_key = os.environ.get("GOOGLE_API_KEY", "")
             if not actual_api_key:
-                return None, None, None, None, "**Error:** Google API key not configured in Space secrets"
         elif "mistral" in model.lower():
             actual_api_key = os.environ.get("MISTRAL_API_KEY", "")
             if not actual_api_key:
-                return None, None, None, None, "**Error:** Mistral API key not configured in Space secrets"
         elif "claude" in model.lower():
             actual_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
             if not actual_api_key:
-                return None, None, None, None, "**Error:** Anthropic API key not configured in Space secrets"
         elif "sonar" in model.lower():
             actual_api_key = os.environ.get("PERPLEXITY_API_KEY", "")
             if not actual_api_key:
-                return None, None, None, None, "**Error:** Perplexity API key not configured in Space secrets"
         elif "grok" in model.lower():
             actual_api_key = os.environ.get("XAI_API_KEY", "")
             if not actual_api_key:
-                return None, None, None, None, "**Error:** xAI API key not configured in Space secrets"
         else:
             actual_api_key = os.environ.get("HF_API_KEY", "")
     else:
@@ -472,7 +505,8 @@ def classify_data(spreadsheet_file, spreadsheet_column,
         if api_key_input and api_key_input.strip():
             actual_api_key = api_key_input.strip()
         else:
-            return None, None, None, None, f"**Error:** Please provide your API key for {model}"
     # Use user-selected model_source, or auto-detect if "auto"
     if model_source_input == "auto":
@@ -482,9 +516,11 @@ def classify_data(spreadsheet_file, spreadsheet_column,
     try:
         if not spreadsheet_file:
-            return None, None, None, None, "**Error:** Please upload a file"
         if not spreadsheet_column:
-            return None, None, None, None, "**Error:** Please select a column to classify"
         file_path = spreadsheet_file if isinstance(spreadsheet_file, str) else spreadsheet_file.name
         if file_path.endswith('.csv'):
@@ -493,10 +529,14 @@ def classify_data(spreadsheet_file, spreadsheet_column,
             df = pd.read_excel(file_path)
         if spreadsheet_column not in df.columns:
-            return None, None, None, None, f"**Error:** Column '{spreadsheet_column}' not found"
         input_data = df[spreadsheet_column].tolist()
         # Calculate data quality metrics before classification
         text_series = df[spreadsheet_column].dropna().astype(str)
         data_quality = {
@@ -507,6 +547,9 @@ def classify_data(spreadsheet_file, spreadsheet_column,
             'error_count': 0  # Will be updated after classification
         }
         # Capture timing
         start_time = time.time()
@@ -557,6 +600,9 @@ Provide your work in JSON format where the number belonging to each category is
             catllm_version = "unknown"
         python_version = sys.version.split()[0]
         # Generate PDF methodology report with all new data
         pdf_path = generate_methodology_report_pdf(
             categories=categories,
@@ -624,17 +670,17 @@ Provide your work in JSON format where the number belonging to each category is
             })
         sample_df = pd.DataFrame(sample_data)
-        # Return: distribution plot (visible), samples (visible), full results (visible), files, status
-        return (
             gr.update(value=distribution_fig, visible=True),
             gr.update(value=sample_df, visible=True),
             gr.update(value=result, visible=True),
             [csv_path, pdf_path],
-            f"**Success!** Classified {len(input_data)} responses in {processing_time:.1f}s"
         )
     except Exception as e:
-        return None, None, None, None, f"**Error:** {str(e)}"
 def add_category_field(current_count):
@@ -728,7 +774,7 @@ result.to_csv("classified_results.csv", index=False)
     return gr.update(value=code, visible=True)
-with gr.Blocks(title="CatLLM - Survey Response Classifier") as demo:
     gr.Image("logo.png", show_label=False, show_download_button=False, height=100, container=False)
     gr.Markdown("# CatLLM - Survey Response Classifier")
     gr.Markdown("Classify survey responses into custom categories using LLMs.")
@@ -772,6 +818,7 @@ https://github.com/chrissoria/cat-llm
                 label="Upload Survey Data (CSV or Excel)",
                 file_types=[".csv", ".xlsx", ".xls"]
             )
             spreadsheet_column = gr.Dropdown(
                 label="Column to Classify",
@@ -781,11 +828,20 @@ https://github.com/chrissoria/cat-llm
             gr.Markdown("### Categories")
             category_inputs = []
             for i in range(MAX_CATEGORIES):
                 visible = i < INITIAL_CATEGORIES
                 cat_input = gr.Textbox(
                     label=f"Category {i+1}",
-                    placeholder=f"e.g., {'Positive' if i==0 else 'Negative' if i==1 else 'Neutral'}",
                     visible=visible
                 )
                 category_inputs.append(cat_input)
@@ -872,6 +928,12 @@ https://github.com/chrissoria/cat-llm
         outputs=[spreadsheet_column, status]
     )
     add_category_btn.click(
         fn=add_category_field,
         inputs=[category_count],

     return "huggingface"
+def load_example_dataset():
+    """Load the example dataset for users to try the app."""
+    example_path = "example_data.csv"
+    try:
+        df = pd.read_csv(example_path)
+        columns = df.columns.tolist()
+        return (
+            example_path,  # file path
+            gr.update(choices=columns, value=columns[0] if columns else None),  # column dropdown
+            f"Loaded example dataset ({len(df)} rows). Select column and click Classify."  # status
+        )
+    except Exception as e:
+        return None, gr.update(choices=[], value=None), f"**Error loading example:** {str(e)}"
 def load_columns(file):
     if file is None:
         return gr.update(choices=[], value=None), "Please upload a file first"
             df = pd.read_excel(file_path)
         columns = df.columns.tolist()
+        num_rows = len(df)
+        # Warning for large datasets
+        if num_rows > 1000:
+            est_minutes = round(num_rows * 1.5 / 60)  # ~1.5 seconds per row estimate
+            status_msg = f"⚠️ **Large dataset** ({num_rows:,} rows). Classification may take ~{est_minutes} minutes. Select column and click Classify."
+        else:
+            status_msg = f"Loaded {num_rows:,} rows. Select column and click Classify."
         return (
             gr.update(choices=columns, value=columns[0] if columns else None),
+            status_msg
         )
     except Exception as e:
         return gr.update(choices=[], value=None), f"**Error:** {str(e)}"
 def classify_data(spreadsheet_file, spreadsheet_column,
                   cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10,
                   model_tier, model, model_source_input, api_key_input):
+    """Main classification function with progress updates. Yields status updates then final results."""
     if not CATLLM_AVAILABLE:
+        yield None, None, None, None, "**Error:** catllm package not available"
+        return
     all_cats = [cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10]
     categories = [c.strip() for c in all_cats if c and c.strip()]
     if not categories:
+        yield None, None, None, None, "**Error:** Please enter at least one category"
+        return
     actual_model = model
         if model in HF_ROUTED_MODELS:
             actual_api_key = os.environ.get("HF_API_KEY", "")
             if not actual_api_key:
+                yield None, None, None, None, "**Error:** HuggingFace API key not configured in Space secrets"
+                return
         elif "gpt" in model.lower():
             actual_api_key = os.environ.get("OPENAI_API_KEY", "")
             if not actual_api_key:
+                yield None, None, None, None, "**Error:** OpenAI API key not configured in Space secrets"
+                return
         elif "gemini" in model.lower():
             actual_api_key = os.environ.get("GOOGLE_API_KEY", "")
             if not actual_api_key:
+                yield None, None, None, None, "**Error:** Google API key not configured in Space secrets"
+                return
         elif "mistral" in model.lower():
             actual_api_key = os.environ.get("MISTRAL_API_KEY", "")
             if not actual_api_key:
+                yield None, None, None, None, "**Error:** Mistral API key not configured in Space secrets"
+                return
         elif "claude" in model.lower():
             actual_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
             if not actual_api_key:
+                yield None, None, None, None, "**Error:** Anthropic API key not configured in Space secrets"
+                return
         elif "sonar" in model.lower():
             actual_api_key = os.environ.get("PERPLEXITY_API_KEY", "")
             if not actual_api_key:
+                yield None, None, None, None, "**Error:** Perplexity API key not configured in Space secrets"
+                return
         elif "grok" in model.lower():
             actual_api_key = os.environ.get("XAI_API_KEY", "")
             if not actual_api_key:
+                yield None, None, None, None, "**Error:** xAI API key not configured in Space secrets"
+                return
         else:
             actual_api_key = os.environ.get("HF_API_KEY", "")
     else:
         if api_key_input and api_key_input.strip():
             actual_api_key = api_key_input.strip()
         else:
+            yield None, None, None, None, f"**Error:** Please provide your API key for {model}"
+            return
     # Use user-selected model_source, or auto-detect if "auto"
     if model_source_input == "auto":
     try:
         if not spreadsheet_file:
+            yield None, None, None, None, "**Error:** Please upload a file"
+            return
         if not spreadsheet_column:
+            yield None, None, None, None, "**Error:** Please select a column to classify"
+            return
         file_path = spreadsheet_file if isinstance(spreadsheet_file, str) else spreadsheet_file.name
         if file_path.endswith('.csv'):
             df = pd.read_excel(file_path)
         if spreadsheet_column not in df.columns:
+            yield None, None, None, None, f"**Error:** Column '{spreadsheet_column}' not found"
+            return
         input_data = df[spreadsheet_column].tolist()
+        # Progress update: data loaded
+        yield None, None, None, None, f"⏳ **Loading data...** Found {len(input_data)} responses to classify."
         # Calculate data quality metrics before classification
         text_series = df[spreadsheet_column].dropna().astype(str)
         data_quality = {
             'error_count': 0  # Will be updated after classification
         }
+        # Progress update: starting classification
+        yield None, None, None, None, f"🔄 **Classifying {len(input_data)} responses...** This may take a moment."
         # Capture timing
         start_time = time.time()
             catllm_version = "unknown"
         python_version = sys.version.split()[0]
+        # Progress update: generating report
+        yield None, None, None, None, f"📄 **Generating methodology report...** Classification complete in {processing_time:.1f}s."
         # Generate PDF methodology report with all new data
         pdf_path = generate_methodology_report_pdf(
             categories=categories,
             })
         sample_df = pd.DataFrame(sample_data)
+        # Final yield: distribution plot (visible), samples (visible), full results (visible), files, status
+        yield (
             gr.update(value=distribution_fig, visible=True),
             gr.update(value=sample_df, visible=True),
             gr.update(value=result, visible=True),
             [csv_path, pdf_path],
+            f"✅ **Success!** Classified {len(input_data)} responses in {processing_time:.1f}s"
         )
     except Exception as e:
+        yield None, None, None, None, f"**Error:** {str(e)}"
 def add_category_field(current_count):
     return gr.update(value=code, visible=True)
+with gr.Blocks(title="CatLLM - Survey Response Classifier", theme=gr.themes.Soft()) as demo:
     gr.Image("logo.png", show_label=False, show_download_button=False, height=100, container=False)
     gr.Markdown("# CatLLM - Survey Response Classifier")
     gr.Markdown("Classify survey responses into custom categories using LLMs.")
                 label="Upload Survey Data (CSV or Excel)",
                 file_types=[".csv", ".xlsx", ".xls"]
             )
+            example_btn = gr.Button("📋 Try Example Dataset", variant="secondary", size="sm")
             spreadsheet_column = gr.Dropdown(
                 label="Column to Classify",
             gr.Markdown("### Categories")
             category_inputs = []
+            placeholder_examples = [
+                "e.g., Positive sentiment",
+                "e.g., Negative sentiment",
+                "e.g., Product feedback",
+                "e.g., Service complaint",
+                "e.g., Feature request",
+                "e.g., Custom category"
+            ]
             for i in range(MAX_CATEGORIES):
                 visible = i < INITIAL_CATEGORIES
+                placeholder = placeholder_examples[i] if i < len(placeholder_examples) else "e.g., Custom category"
                 cat_input = gr.Textbox(
                     label=f"Category {i+1}",
+                    placeholder=placeholder,
                     visible=visible
                 )
                 category_inputs.append(cat_input)
         outputs=[spreadsheet_column, status]
     )
+    example_btn.click(
+        fn=load_example_dataset,
+        inputs=[],
+        outputs=[spreadsheet_file, spreadsheet_column, status]
+    )
     add_category_btn.click(
         fn=add_category_field,
         inputs=[category_count],

example_data.csv ADDED Viewed

	@@ -0,0 +1,5 @@

+Response,
+I wanted to live in San Diego,
+I really hated my apartment,
+My grandparents needed me to live nearby ,
+"Tony, my husband, got a new job at UC Berkeley",