Spaces:
Running
Running
Commit
·
7165ab2
1
Parent(s):
1c1f244
Add 5 UX improvements: progress indicator, example dataset, better placeholders, dark mode, large file warning
Browse files
- Convert classify_data() to generator for real-time progress updates during classification
- Add "Try Example Dataset" button with sample survey responses
- Update category placeholder text with diverse, helpful examples
- Add dark mode support via gr.themes.Soft()
- Show warning for datasets > 1000 rows with time estimate
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
- app.py +83 -21
- example_data.csv +5 -0
app.py
CHANGED
|
@@ -399,6 +399,21 @@ def get_model_source(model):
|
|
| 399 |
return "huggingface"
|
| 400 |
|
| 401 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 402 |
def load_columns(file):
|
| 403 |
if file is None:
|
| 404 |
return gr.update(choices=[], value=None), "Please upload a file first"
|
|
@@ -411,9 +426,18 @@ def load_columns(file):
|
|
| 411 |
df = pd.read_excel(file_path)
|
| 412 |
|
| 413 |
columns = df.columns.tolist()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 414 |
return (
|
| 415 |
gr.update(choices=columns, value=columns[0] if columns else None),
|
| 416 |
-
|
| 417 |
)
|
| 418 |
except Exception as e:
|
| 419 |
return gr.update(choices=[], value=None), f"**Error:** {str(e)}"
|
|
@@ -422,15 +446,17 @@ def load_columns(file):
|
|
| 422 |
def classify_data(spreadsheet_file, spreadsheet_column,
|
| 423 |
cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10,
|
| 424 |
model_tier, model, model_source_input, api_key_input):
|
| 425 |
-
"""Main classification function
|
| 426 |
if not CATLLM_AVAILABLE:
|
| 427 |
-
|
|
|
|
| 428 |
|
| 429 |
all_cats = [cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10]
|
| 430 |
categories = [c.strip() for c in all_cats if c and c.strip()]
|
| 431 |
|
| 432 |
if not categories:
|
| 433 |
-
|
|
|
|
| 434 |
|
| 435 |
actual_model = model
|
| 436 |
|
|
@@ -440,31 +466,38 @@ def classify_data(spreadsheet_file, spreadsheet_column,
|
|
| 440 |
if model in HF_ROUTED_MODELS:
|
| 441 |
actual_api_key = os.environ.get("HF_API_KEY", "")
|
| 442 |
if not actual_api_key:
|
| 443 |
-
|
|
|
|
| 444 |
elif "gpt" in model.lower():
|
| 445 |
actual_api_key = os.environ.get("OPENAI_API_KEY", "")
|
| 446 |
if not actual_api_key:
|
| 447 |
-
|
|
|
|
| 448 |
elif "gemini" in model.lower():
|
| 449 |
actual_api_key = os.environ.get("GOOGLE_API_KEY", "")
|
| 450 |
if not actual_api_key:
|
| 451 |
-
|
|
|
|
| 452 |
elif "mistral" in model.lower():
|
| 453 |
actual_api_key = os.environ.get("MISTRAL_API_KEY", "")
|
| 454 |
if not actual_api_key:
|
| 455 |
-
|
|
|
|
| 456 |
elif "claude" in model.lower():
|
| 457 |
actual_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
|
| 458 |
if not actual_api_key:
|
| 459 |
-
|
|
|
|
| 460 |
elif "sonar" in model.lower():
|
| 461 |
actual_api_key = os.environ.get("PERPLEXITY_API_KEY", "")
|
| 462 |
if not actual_api_key:
|
| 463 |
-
|
|
|
|
| 464 |
elif "grok" in model.lower():
|
| 465 |
actual_api_key = os.environ.get("XAI_API_KEY", "")
|
| 466 |
if not actual_api_key:
|
| 467 |
-
|
|
|
|
| 468 |
else:
|
| 469 |
actual_api_key = os.environ.get("HF_API_KEY", "")
|
| 470 |
else:
|
|
@@ -472,7 +505,8 @@ def classify_data(spreadsheet_file, spreadsheet_column,
|
|
| 472 |
if api_key_input and api_key_input.strip():
|
| 473 |
actual_api_key = api_key_input.strip()
|
| 474 |
else:
|
| 475 |
-
|
|
|
|
| 476 |
|
| 477 |
# Use user-selected model_source, or auto-detect if "auto"
|
| 478 |
if model_source_input == "auto":
|
|
@@ -482,9 +516,11 @@ def classify_data(spreadsheet_file, spreadsheet_column,
|
|
| 482 |
|
| 483 |
try:
|
| 484 |
if not spreadsheet_file:
|
| 485 |
-
|
|
|
|
| 486 |
if not spreadsheet_column:
|
| 487 |
-
|
|
|
|
| 488 |
|
| 489 |
file_path = spreadsheet_file if isinstance(spreadsheet_file, str) else spreadsheet_file.name
|
| 490 |
if file_path.endswith('.csv'):
|
|
@@ -493,10 +529,14 @@ def classify_data(spreadsheet_file, spreadsheet_column,
|
|
| 493 |
df = pd.read_excel(file_path)
|
| 494 |
|
| 495 |
if spreadsheet_column not in df.columns:
|
| 496 |
-
|
|
|
|
| 497 |
|
| 498 |
input_data = df[spreadsheet_column].tolist()
|
| 499 |
|
|
|
|
|
|
|
|
|
|
| 500 |
# Calculate data quality metrics before classification
|
| 501 |
text_series = df[spreadsheet_column].dropna().astype(str)
|
| 502 |
data_quality = {
|
|
@@ -507,6 +547,9 @@ def classify_data(spreadsheet_file, spreadsheet_column,
|
|
| 507 |
'error_count': 0 # Will be updated after classification
|
| 508 |
}
|
| 509 |
|
|
|
|
|
|
|
|
|
|
| 510 |
# Capture timing
|
| 511 |
start_time = time.time()
|
| 512 |
|
|
@@ -557,6 +600,9 @@ Provide your work in JSON format where the number belonging to each category is
|
|
| 557 |
catllm_version = "unknown"
|
| 558 |
python_version = sys.version.split()[0]
|
| 559 |
|
|
|
|
|
|
|
|
|
|
| 560 |
# Generate PDF methodology report with all new data
|
| 561 |
pdf_path = generate_methodology_report_pdf(
|
| 562 |
categories=categories,
|
|
@@ -624,17 +670,17 @@ Provide your work in JSON format where the number belonging to each category is
|
|
| 624 |
})
|
| 625 |
sample_df = pd.DataFrame(sample_data)
|
| 626 |
|
| 627 |
-
#
|
| 628 |
-
|
| 629 |
gr.update(value=distribution_fig, visible=True),
|
| 630 |
gr.update(value=sample_df, visible=True),
|
| 631 |
gr.update(value=result, visible=True),
|
| 632 |
[csv_path, pdf_path],
|
| 633 |
-
f"**Success!** Classified {len(input_data)} responses in {processing_time:.1f}s"
|
| 634 |
)
|
| 635 |
|
| 636 |
except Exception as e:
|
| 637 |
-
|
| 638 |
|
| 639 |
|
| 640 |
def add_category_field(current_count):
|
|
@@ -728,7 +774,7 @@ result.to_csv("classified_results.csv", index=False)
|
|
| 728 |
return gr.update(value=code, visible=True)
|
| 729 |
|
| 730 |
|
| 731 |
-
with gr.Blocks(title="CatLLM - Survey Response Classifier") as demo:
|
| 732 |
gr.Image("logo.png", show_label=False, show_download_button=False, height=100, container=False)
|
| 733 |
gr.Markdown("# CatLLM - Survey Response Classifier")
|
| 734 |
gr.Markdown("Classify survey responses into custom categories using LLMs.")
|
|
@@ -772,6 +818,7 @@ https://github.com/chrissoria/cat-llm
|
|
| 772 |
label="Upload Survey Data (CSV or Excel)",
|
| 773 |
file_types=[".csv", ".xlsx", ".xls"]
|
| 774 |
)
|
|
|
|
| 775 |
|
| 776 |
spreadsheet_column = gr.Dropdown(
|
| 777 |
label="Column to Classify",
|
|
@@ -781,11 +828,20 @@ https://github.com/chrissoria/cat-llm
|
|
| 781 |
|
| 782 |
gr.Markdown("### Categories")
|
| 783 |
category_inputs = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 784 |
for i in range(MAX_CATEGORIES):
|
| 785 |
visible = i < INITIAL_CATEGORIES
|
|
|
|
| 786 |
cat_input = gr.Textbox(
|
| 787 |
label=f"Category {i+1}",
|
| 788 |
-
placeholder=
|
| 789 |
visible=visible
|
| 790 |
)
|
| 791 |
category_inputs.append(cat_input)
|
|
@@ -872,6 +928,12 @@ https://github.com/chrissoria/cat-llm
|
|
| 872 |
outputs=[spreadsheet_column, status]
|
| 873 |
)
|
| 874 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 875 |
add_category_btn.click(
|
| 876 |
fn=add_category_field,
|
| 877 |
inputs=[category_count],
|
|
|
|
| 399 |
return "huggingface"
|
| 400 |
|
| 401 |
|
| 402 |
+
def load_example_dataset():
    """Read the bundled example CSV and prepare UI updates for it.

    Returns:
        A 3-tuple of (file value, column-dropdown update, status message).
        On success: the example file's path, a ``gr.update`` whose choices are
        the CSV's column names with the first one preselected (``None`` when
        the CSV has no columns), and a message reporting the row count.
        On any exception: ``None``, a ``gr.update`` that empties the dropdown,
        and a Markdown-formatted error message.
    """
    example_path = "example_data.csv"
    try:
        example_df = pd.read_csv(example_path)
        column_names = example_df.columns.tolist()

        # Preselect the first column when there is one to pick.
        default_column = column_names[0] if column_names else None
        dropdown_update = gr.update(choices=column_names, value=default_column)

        row_count = len(example_df)
        status_text = f"Loaded example dataset ({row_count} rows). Select column and click Classify."

        return example_path, dropdown_update, status_text
    except Exception as e:
        # Surface the failure in the status area instead of raising into the UI.
        cleared_dropdown = gr.update(choices=[], value=None)
        return None, cleared_dropdown, f"**Error loading example:** {e!s}"
|
| 415 |
+
|
| 416 |
+
|
| 417 |
def load_columns(file):
|
| 418 |
if file is None:
|
| 419 |
return gr.update(choices=[], value=None), "Please upload a file first"
|
|
|
|
| 426 |
df = pd.read_excel(file_path)
|
| 427 |
|
| 428 |
columns = df.columns.tolist()
|
| 429 |
+
num_rows = len(df)
|
| 430 |
+
|
| 431 |
+
# Warning for large datasets
|
| 432 |
+
if num_rows > 1000:
|
| 433 |
+
est_minutes = round(num_rows * 1.5 / 60) # ~1.5 seconds per row estimate
|
| 434 |
+
status_msg = f"⚠️ **Large dataset** ({num_rows:,} rows). Classification may take ~{est_minutes} minutes. Select column and click Classify."
|
| 435 |
+
else:
|
| 436 |
+
status_msg = f"Loaded {num_rows:,} rows. Select column and click Classify."
|
| 437 |
+
|
| 438 |
return (
|
| 439 |
gr.update(choices=columns, value=columns[0] if columns else None),
|
| 440 |
+
status_msg
|
| 441 |
)
|
| 442 |
except Exception as e:
|
| 443 |
return gr.update(choices=[], value=None), f"**Error:** {str(e)}"
|
|
|
|
| 446 |
def classify_data(spreadsheet_file, spreadsheet_column,
|
| 447 |
cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10,
|
| 448 |
model_tier, model, model_source_input, api_key_input):
|
| 449 |
+
"""Main classification function with progress updates. Yields status updates then final results."""
|
| 450 |
if not CATLLM_AVAILABLE:
|
| 451 |
+
yield None, None, None, None, "**Error:** catllm package not available"
|
| 452 |
+
return
|
| 453 |
|
| 454 |
all_cats = [cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10]
|
| 455 |
categories = [c.strip() for c in all_cats if c and c.strip()]
|
| 456 |
|
| 457 |
if not categories:
|
| 458 |
+
yield None, None, None, None, "**Error:** Please enter at least one category"
|
| 459 |
+
return
|
| 460 |
|
| 461 |
actual_model = model
|
| 462 |
|
|
|
|
| 466 |
if model in HF_ROUTED_MODELS:
|
| 467 |
actual_api_key = os.environ.get("HF_API_KEY", "")
|
| 468 |
if not actual_api_key:
|
| 469 |
+
yield None, None, None, None, "**Error:** HuggingFace API key not configured in Space secrets"
|
| 470 |
+
return
|
| 471 |
elif "gpt" in model.lower():
|
| 472 |
actual_api_key = os.environ.get("OPENAI_API_KEY", "")
|
| 473 |
if not actual_api_key:
|
| 474 |
+
yield None, None, None, None, "**Error:** OpenAI API key not configured in Space secrets"
|
| 475 |
+
return
|
| 476 |
elif "gemini" in model.lower():
|
| 477 |
actual_api_key = os.environ.get("GOOGLE_API_KEY", "")
|
| 478 |
if not actual_api_key:
|
| 479 |
+
yield None, None, None, None, "**Error:** Google API key not configured in Space secrets"
|
| 480 |
+
return
|
| 481 |
elif "mistral" in model.lower():
|
| 482 |
actual_api_key = os.environ.get("MISTRAL_API_KEY", "")
|
| 483 |
if not actual_api_key:
|
| 484 |
+
yield None, None, None, None, "**Error:** Mistral API key not configured in Space secrets"
|
| 485 |
+
return
|
| 486 |
elif "claude" in model.lower():
|
| 487 |
actual_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
|
| 488 |
if not actual_api_key:
|
| 489 |
+
yield None, None, None, None, "**Error:** Anthropic API key not configured in Space secrets"
|
| 490 |
+
return
|
| 491 |
elif "sonar" in model.lower():
|
| 492 |
actual_api_key = os.environ.get("PERPLEXITY_API_KEY", "")
|
| 493 |
if not actual_api_key:
|
| 494 |
+
yield None, None, None, None, "**Error:** Perplexity API key not configured in Space secrets"
|
| 495 |
+
return
|
| 496 |
elif "grok" in model.lower():
|
| 497 |
actual_api_key = os.environ.get("XAI_API_KEY", "")
|
| 498 |
if not actual_api_key:
|
| 499 |
+
yield None, None, None, None, "**Error:** xAI API key not configured in Space secrets"
|
| 500 |
+
return
|
| 501 |
else:
|
| 502 |
actual_api_key = os.environ.get("HF_API_KEY", "")
|
| 503 |
else:
|
|
|
|
| 505 |
if api_key_input and api_key_input.strip():
|
| 506 |
actual_api_key = api_key_input.strip()
|
| 507 |
else:
|
| 508 |
+
yield None, None, None, None, f"**Error:** Please provide your API key for {model}"
|
| 509 |
+
return
|
| 510 |
|
| 511 |
# Use user-selected model_source, or auto-detect if "auto"
|
| 512 |
if model_source_input == "auto":
|
|
|
|
| 516 |
|
| 517 |
try:
|
| 518 |
if not spreadsheet_file:
|
| 519 |
+
yield None, None, None, None, "**Error:** Please upload a file"
|
| 520 |
+
return
|
| 521 |
if not spreadsheet_column:
|
| 522 |
+
yield None, None, None, None, "**Error:** Please select a column to classify"
|
| 523 |
+
return
|
| 524 |
|
| 525 |
file_path = spreadsheet_file if isinstance(spreadsheet_file, str) else spreadsheet_file.name
|
| 526 |
if file_path.endswith('.csv'):
|
|
|
|
| 529 |
df = pd.read_excel(file_path)
|
| 530 |
|
| 531 |
if spreadsheet_column not in df.columns:
|
| 532 |
+
yield None, None, None, None, f"**Error:** Column '{spreadsheet_column}' not found"
|
| 533 |
+
return
|
| 534 |
|
| 535 |
input_data = df[spreadsheet_column].tolist()
|
| 536 |
|
| 537 |
+
# Progress update: data loaded
|
| 538 |
+
yield None, None, None, None, f"⏳ **Loading data...** Found {len(input_data)} responses to classify."
|
| 539 |
+
|
| 540 |
# Calculate data quality metrics before classification
|
| 541 |
text_series = df[spreadsheet_column].dropna().astype(str)
|
| 542 |
data_quality = {
|
|
|
|
| 547 |
'error_count': 0 # Will be updated after classification
|
| 548 |
}
|
| 549 |
|
| 550 |
+
# Progress update: starting classification
|
| 551 |
+
yield None, None, None, None, f"π **Classifying {len(input_data)} responses...** This may take a moment."
|
| 552 |
+
|
| 553 |
# Capture timing
|
| 554 |
start_time = time.time()
|
| 555 |
|
|
|
|
| 600 |
catllm_version = "unknown"
|
| 601 |
python_version = sys.version.split()[0]
|
| 602 |
|
| 603 |
+
# Progress update: generating report
|
| 604 |
+
yield None, None, None, None, f"π **Generating methodology report...** Classification complete in {processing_time:.1f}s."
|
| 605 |
+
|
| 606 |
# Generate PDF methodology report with all new data
|
| 607 |
pdf_path = generate_methodology_report_pdf(
|
| 608 |
categories=categories,
|
|
|
|
| 670 |
})
|
| 671 |
sample_df = pd.DataFrame(sample_data)
|
| 672 |
|
| 673 |
+
# Final yield: distribution plot (visible), samples (visible), full results (visible), files, status
|
| 674 |
+
yield (
|
| 675 |
gr.update(value=distribution_fig, visible=True),
|
| 676 |
gr.update(value=sample_df, visible=True),
|
| 677 |
gr.update(value=result, visible=True),
|
| 678 |
[csv_path, pdf_path],
|
| 679 |
+
f"✅ **Success!** Classified {len(input_data)} responses in {processing_time:.1f}s"
|
| 680 |
)
|
| 681 |
|
| 682 |
except Exception as e:
|
| 683 |
+
yield None, None, None, None, f"**Error:** {str(e)}"
|
| 684 |
|
| 685 |
|
| 686 |
def add_category_field(current_count):
|
|
|
|
| 774 |
return gr.update(value=code, visible=True)
|
| 775 |
|
| 776 |
|
| 777 |
+
with gr.Blocks(title="CatLLM - Survey Response Classifier", theme=gr.themes.Soft()) as demo:
|
| 778 |
gr.Image("logo.png", show_label=False, show_download_button=False, height=100, container=False)
|
| 779 |
gr.Markdown("# CatLLM - Survey Response Classifier")
|
| 780 |
gr.Markdown("Classify survey responses into custom categories using LLMs.")
|
|
|
|
| 818 |
label="Upload Survey Data (CSV or Excel)",
|
| 819 |
file_types=[".csv", ".xlsx", ".xls"]
|
| 820 |
)
|
| 821 |
+
example_btn = gr.Button("π Try Example Dataset", variant="secondary", size="sm")
|
| 822 |
|
| 823 |
spreadsheet_column = gr.Dropdown(
|
| 824 |
label="Column to Classify",
|
|
|
|
| 828 |
|
| 829 |
gr.Markdown("### Categories")
|
| 830 |
category_inputs = []
|
| 831 |
+
placeholder_examples = [
|
| 832 |
+
"e.g., Positive sentiment",
|
| 833 |
+
"e.g., Negative sentiment",
|
| 834 |
+
"e.g., Product feedback",
|
| 835 |
+
"e.g., Service complaint",
|
| 836 |
+
"e.g., Feature request",
|
| 837 |
+
"e.g., Custom category"
|
| 838 |
+
]
|
| 839 |
for i in range(MAX_CATEGORIES):
|
| 840 |
visible = i < INITIAL_CATEGORIES
|
| 841 |
+
placeholder = placeholder_examples[i] if i < len(placeholder_examples) else "e.g., Custom category"
|
| 842 |
cat_input = gr.Textbox(
|
| 843 |
label=f"Category {i+1}",
|
| 844 |
+
placeholder=placeholder,
|
| 845 |
visible=visible
|
| 846 |
)
|
| 847 |
category_inputs.append(cat_input)
|
|
|
|
| 928 |
outputs=[spreadsheet_column, status]
|
| 929 |
)
|
| 930 |
|
| 931 |
+
example_btn.click(
|
| 932 |
+
fn=load_example_dataset,
|
| 933 |
+
inputs=[],
|
| 934 |
+
outputs=[spreadsheet_file, spreadsheet_column, status]
|
| 935 |
+
)
|
| 936 |
+
|
| 937 |
add_category_btn.click(
|
| 938 |
fn=add_category_field,
|
| 939 |
inputs=[category_count],
|
example_data.csv
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Response,
|
| 2 |
+
I wanted to live in San Diego,
|
| 3 |
+
I really hated my apartment,
|
| 4 |
+
My grandparents needed me to live nearby ,
|
| 5 |
+
"Tony, my husband, got a new job at UC Berkeley",
|