Spaces:

CatLLM
/

survey-classifier

Running

chrissoria and Claude committed 21 days ago

Commit

66b262b

1 Parent(s): 47603f7

Enhance PDF codebook with comprehensive researcher documentation

- Add Sample Results table (first 5 rows) to PDF Page 1
- Add Category Distribution page (Page 2) with counts/percentages
- Expand Classification Summary (Page 3) with processing time, data quality notes, version info
- Add Prompt Template page (Page 4) showing exact prompts sent to LLM
- Update app UI to show Category Distribution as main output, with sample results below
- Add timing capture and display processing speed in status message
- Include CatLLM version and Python version in codebook

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (2) hide show

__pycache__/app.cpython-311.pyc +0 -0
app.py +299 -42

__pycache__/app.cpython-311.pyc CHANGED Viewed

Binary files a/__pycache__/app.cpython-311.pyc and b/__pycache__/app.cpython-311.pyc differ

app.py CHANGED Viewed

@@ -6,6 +6,8 @@ import gradio as gr
 import pandas as pd
 import tempfile
 import os
 from datetime import datetime
 # Import catllm
@@ -58,12 +60,14 @@ def is_free_model(model, model_tier):
     return model_tier == "Free Models"
-def generate_codebook_pdf(categories, model, column_name, num_rows, model_source, filename, success_rate):
-    """Generate a PDF codebook explaining the output columns."""
     from reportlab.lib.pagesizes import letter
     from reportlab.lib import colors
     from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
-    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak, Preformatted
     # Create temp file for PDF
     pdf_file = tempfile.NamedTemporaryFile(mode='wb', suffix='_codebook.pdf', delete=False)
@@ -74,18 +78,19 @@ def generate_codebook_pdf(categories, model, column_name, num_rows, model_source
     title_style = ParagraphStyle('Title', parent=styles['Heading1'], fontSize=18, spaceAfter=20)
     heading_style = ParagraphStyle('Heading', parent=styles['Heading2'], fontSize=14, spaceAfter=10, spaceBefore=15)
     normal_style = styles['Normal']
     story = []
-    # === PAGE 1: Title and Category Mapping ===
     story.append(Paragraph("CatLLM Classification Codebook", title_style))
     story.append(Paragraph(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", normal_style))
-    story.append(Spacer(1, 20))
     # Category mapping
     story.append(Paragraph("Category Mapping", heading_style))
     story.append(Paragraph("Each category column contains binary values: 1 = present, 0 = not present", normal_style))
-    story.append(Spacer(1, 10))
     category_data = [["Column Name", "Category Description"]]
     for i, cat in enumerate(categories, 1):
@@ -96,11 +101,46 @@ def generate_codebook_pdf(categories, model, column_name, num_rows, model_source
         ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
         ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
         ('GRID', (0, 0), (-1, -1), 1, colors.black),
-        ('PADDING', (0, 0), (-1, -1), 8),
         ('BACKGROUND', (0, 1), (0, -1), colors.lightgrey),
     ]))
     story.append(cat_table)
-    story.append(Spacer(1, 20))
     # Other columns
     story.append(Paragraph("Other Output Columns", heading_style))
@@ -109,35 +149,72 @@ def generate_codebook_pdf(categories, model, column_name, num_rows, model_source
         ["survey_input", "The original text that was classified"],
         ["model_response", "Raw response from the LLM"],
         ["json", "Extracted JSON with category assignments"],
-        ["processing_status", "'success' if classification worked, 'error' if it failed"],
-        ["categories_id", "Comma-separated list of category numbers that were assigned"],
     ]
     other_table = Table(other_cols, colWidths=[120, 330])
     other_table.setStyle(TableStyle([
         ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
         ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
         ('GRID', (0, 0), (-1, -1), 1, colors.black),
-        ('PADDING', (0, 0), (-1, -1), 8),
         ('BACKGROUND', (0, 1), (0, -1), colors.lightgrey),
     ]))
     story.append(other_table)
-    story.append(Spacer(1, 20))
-    # Citation
     story.append(Paragraph("Citation", heading_style))
     story.append(Paragraph("If you use CatLLM in your research, please cite:", normal_style))
     story.append(Spacer(1, 5))
     story.append(Paragraph("Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. https://github.com/chrissoria/cat-llm", normal_style))
-    # === PAGE 2: Classification Summary ===
     story.append(PageBreak())
     story.append(Paragraph("Classification Summary", title_style))
-    story.append(Spacer(1, 20))
     summary_data = [
         ["Source Column", column_name],
         ["Model Used", model],
         ["Model Source", model_source],
         ["Rows Classified", str(num_rows)],
         ["Number of Categories", str(len(categories))],
         ["Success Rate", f"{success_rate:.2f}%"],
@@ -146,11 +223,107 @@ def generate_codebook_pdf(categories, model, column_name, num_rows, model_source
     summary_table.setStyle(TableStyle([
         ('BACKGROUND', (0, 0), (0, -1), colors.lightgrey),
         ('GRID', (0, 0), (-1, -1), 1, colors.black),
-        ('PADDING', (0, 0), (-1, -1), 8),
     ]))
     story.append(summary_table)
-    # === PAGE 3: Reproducibility Code ===
     story.append(PageBreak())
     story.append(Paragraph("Reproducibility Code", title_style))
     story.append(Paragraph("Use the following Python code to reproduce this classification:", normal_style))
@@ -183,9 +356,6 @@ print(result)
 # Save to CSV
 result.to_csv("classified_results.csv", index=False)'''
-    # Use a monospace style for code
-    code_style = ParagraphStyle('Code', parent=styles['Normal'], fontName='Courier', fontSize=9, leftIndent=20, spaceAfter=10)
     # Split code into lines and add each as a paragraph
     for line in code_text.split('\n'):
         if line.strip() == '':
@@ -239,15 +409,15 @@ def load_columns(file):
 def classify_data(spreadsheet_file, spreadsheet_column,
                   cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10,
                   model_tier, model, model_source_input, api_key_input):
-    """Main classification function."""
     if not CATLLM_AVAILABLE:
-        return None, None, "**Error:** catllm package not available"
     all_cats = [cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10]
     categories = [c.strip() for c in all_cats if c and c.strip()]
     if not categories:
-        return None, None, "**Error:** Please enter at least one category"
     actual_model = model
@@ -257,31 +427,31 @@ def classify_data(spreadsheet_file, spreadsheet_column,
         if model in HF_ROUTED_MODELS:
             actual_api_key = os.environ.get("HF_API_KEY", "")
             if not actual_api_key:
-                return None, None, "**Error:** HuggingFace API key not configured in Space secrets"
         elif "gpt" in model.lower():
             actual_api_key = os.environ.get("OPENAI_API_KEY", "")
             if not actual_api_key:
-                return None, None, "**Error:** OpenAI API key not configured in Space secrets"
         elif "gemini" in model.lower():
             actual_api_key = os.environ.get("GOOGLE_API_KEY", "")
             if not actual_api_key:
-                return None, None, "**Error:** Google API key not configured in Space secrets"
         elif "mistral" in model.lower():
             actual_api_key = os.environ.get("MISTRAL_API_KEY", "")
             if not actual_api_key:
-                return None, None, "**Error:** Mistral API key not configured in Space secrets"
         elif "claude" in model.lower():
             actual_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
             if not actual_api_key:
-                return None, None, "**Error:** Anthropic API key not configured in Space secrets"
         elif "sonar" in model.lower():
             actual_api_key = os.environ.get("PERPLEXITY_API_KEY", "")
             if not actual_api_key:
-                return None, None, "**Error:** Perplexity API key not configured in Space secrets"
         elif "grok" in model.lower():
             actual_api_key = os.environ.get("XAI_API_KEY", "")
             if not actual_api_key:
-                return None, None, "**Error:** xAI API key not configured in Space secrets"
         else:
             actual_api_key = os.environ.get("HF_API_KEY", "")
     else:
@@ -289,7 +459,7 @@ def classify_data(spreadsheet_file, spreadsheet_column,
         if api_key_input and api_key_input.strip():
             actual_api_key = api_key_input.strip()
         else:
-            return None, None, f"**Error:** Please provide your API key for {model}"
     # Use user-selected model_source, or auto-detect if "auto"
     if model_source_input == "auto":
@@ -299,9 +469,9 @@ def classify_data(spreadsheet_file, spreadsheet_column,
     try:
         if not spreadsheet_file:
-            return None, None, "**Error:** Please upload a file"
         if not spreadsheet_column:
-            return None, None, "**Error:** Please select a column to classify"
         file_path = spreadsheet_file if isinstance(spreadsheet_file, str) else spreadsheet_file.name
         if file_path.endswith('.csv'):
@@ -310,10 +480,23 @@ def classify_data(spreadsheet_file, spreadsheet_column,
             df = pd.read_excel(file_path)
         if spreadsheet_column not in df.columns:
-            return None, None, f"**Error:** Column '{spreadsheet_column}' not found"
         input_data = df[spreadsheet_column].tolist()
         result = catllm.multi_class(
             survey_input=input_data,
             categories=categories,
@@ -322,6 +505,12 @@ def classify_data(spreadsheet_file, spreadsheet_column,
             model_source=model_source
         )
         # Save CSV for download
         with tempfile.NamedTemporaryFile(mode='w', suffix='_classified.csv', delete=False) as f:
             result.to_csv(f.name, index=False)
@@ -337,13 +526,77 @@ def classify_data(spreadsheet_file, spreadsheet_column,
         else:
             success_rate = 100.0
-        # Generate PDF codebook
-        pdf_path = generate_codebook_pdf(categories, actual_model, spreadsheet_column, len(input_data), model_source, original_filename, success_rate)
-        return result, [csv_path, pdf_path], f"**Success!** Classified {len(input_data)} responses"
     except Exception as e:
-        return None, None, f"**Error:** {str(e)}"
 def add_category_field(current_count):
@@ -374,7 +627,9 @@ def reset_all():
         "",  # api_key
         "**Free tier** - no API key required! We cover the cost while CatLLM is in review.",  # api_key_status
         "Ready to classify",  # status
-        None,  # results
         None,  # download_file
         gr.update(value="", visible=False),  # code_output
     ])
@@ -538,7 +793,9 @@ https://github.com/chrissoria/cat-llm
         with gr.Column():
             status = gr.Markdown("Ready to classify")
-            results = gr.DataFrame(label="Classification Results")
             download_file = gr.File(label="Download Results (CSV + Codebook PDF)", file_count="multiple")
             code_output = gr.Code(
                 label="Python Code",
@@ -583,7 +840,7 @@ https://github.com/chrissoria/cat-llm
     classify_btn.click(
         fn=classify_data,
         inputs=[spreadsheet_file, spreadsheet_column] + category_inputs + [model_tier, model, model_source, api_key],
-        outputs=[results, download_file, status]
     )
     see_code_btn.click(
@@ -595,7 +852,7 @@ https://github.com/chrissoria/cat-llm
     reset_btn.click(
         fn=reset_all,
         inputs=[],
-        outputs=[spreadsheet_file, spreadsheet_column] + category_inputs + [add_category_btn, category_count, model_tier, model, model_source, api_key, api_key_status, status, results, download_file, code_output]
     )

 import pandas as pd
 import tempfile
 import os
+import time
+import sys
 from datetime import datetime
 # Import catllm
     return model_tier == "Free Models"
+def generate_codebook_pdf(categories, model, column_name, num_rows, model_source, filename, success_rate,
+                          result_df=None, processing_time=None, prompt_template=None,
+                          data_quality=None, catllm_version=None, python_version=None):
+    """Generate a PDF codebook explaining the output columns with comprehensive documentation."""
     from reportlab.lib.pagesizes import letter
     from reportlab.lib import colors
     from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
     # Create temp file for PDF
     pdf_file = tempfile.NamedTemporaryFile(mode='wb', suffix='_codebook.pdf', delete=False)
     title_style = ParagraphStyle('Title', parent=styles['Heading1'], fontSize=18, spaceAfter=20)
     heading_style = ParagraphStyle('Heading', parent=styles['Heading2'], fontSize=14, spaceAfter=10, spaceBefore=15)
     normal_style = styles['Normal']
+    code_style = ParagraphStyle('Code', parent=styles['Normal'], fontName='Courier', fontSize=9, leftIndent=20, spaceAfter=3)
     story = []
+    # === PAGE 1: Title, Category Mapping, Sample Results ===
     story.append(Paragraph("CatLLM Classification Codebook", title_style))
     story.append(Paragraph(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", normal_style))
+    story.append(Spacer(1, 15))
     # Category mapping
     story.append(Paragraph("Category Mapping", heading_style))
     story.append(Paragraph("Each category column contains binary values: 1 = present, 0 = not present", normal_style))
+    story.append(Spacer(1, 8))
     category_data = [["Column Name", "Category Description"]]
     for i, cat in enumerate(categories, 1):
         ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
         ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
         ('GRID', (0, 0), (-1, -1), 1, colors.black),
+        ('PADDING', (0, 0), (-1, -1), 6),
         ('BACKGROUND', (0, 1), (0, -1), colors.lightgrey),
+        ('FONTSIZE', (0, 0), (-1, -1), 9),
     ]))
     story.append(cat_table)
+    story.append(Spacer(1, 15))
+    # Sample Results (first 5 rows)
+    if result_df is not None and len(result_df) > 0:
+        story.append(Paragraph("Sample Results (First 5 Rows)", heading_style))
+        story.append(Paragraph("Example classifications showing original text and assigned categories:", normal_style))
+        story.append(Spacer(1, 8))
+        sample_data = [["Original Text (truncated)", "Assigned Categories"]]
+        sample_df = result_df.head(5)
+        for _, row in sample_df.iterrows():
+            # Get original text, truncate to 80 chars
+            original_text = str(row.get('survey_input', ''))[:80]
+            if len(str(row.get('survey_input', ''))) > 80:
+                original_text += "..."
+            # Get assigned categories
+            assigned = row.get('categories_id', '')
+            if pd.isna(assigned) or assigned == '':
+                assigned = "None"
+            sample_data.append([original_text, str(assigned)])
+        sample_table = Table(sample_data, colWidths=[320, 130])
+        sample_table.setStyle(TableStyle([
+            ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
+            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
+            ('GRID', (0, 0), (-1, -1), 1, colors.black),
+            ('PADDING', (0, 0), (-1, -1), 6),
+            ('FONTSIZE', (0, 0), (-1, -1), 8),
+            ('VALIGN', (0, 0), (-1, -1), 'TOP'),
+        ]))
+        story.append(sample_table)
+        story.append(Spacer(1, 15))
     # Other columns
     story.append(Paragraph("Other Output Columns", heading_style))
         ["survey_input", "The original text that was classified"],
         ["model_response", "Raw response from the LLM"],
         ["json", "Extracted JSON with category assignments"],
+        ["processing_status", "'success' if classification worked, 'error' if failed"],
+        ["categories_id", "Comma-separated list of assigned category numbers"],
     ]
     other_table = Table(other_cols, colWidths=[120, 330])
     other_table.setStyle(TableStyle([
         ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
         ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
         ('GRID', (0, 0), (-1, -1), 1, colors.black),
+        ('PADDING', (0, 0), (-1, -1), 6),
         ('BACKGROUND', (0, 1), (0, -1), colors.lightgrey),
+        ('FONTSIZE', (0, 0), (-1, -1), 9),
     ]))
     story.append(other_table)
+    # === PAGE 2: Category Distribution ===
+    story.append(PageBreak())
+    story.append(Paragraph("Category Distribution", title_style))
+    story.append(Paragraph("Count and percentage of responses assigned to each category:", normal_style))
+    story.append(Spacer(1, 15))
+    if result_df is not None:
+        dist_data = [["Category", "Description", "Count", "Percentage"]]
+        total_rows = len(result_df)
+        for i, cat in enumerate(categories, 1):
+            col_name = f"category_{i}"
+            if col_name in result_df.columns:
+                count = int(result_df[col_name].sum())
+                pct = (count / total_rows) * 100 if total_rows > 0 else 0
+                dist_data.append([col_name, cat[:40], str(count), f"{pct:.1f}%"])
+            else:
+                dist_data.append([col_name, cat[:40], "N/A", "N/A"])
+        dist_table = Table(dist_data, colWidths=[80, 200, 60, 80])
+        dist_table.setStyle(TableStyle([
+            ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
+            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
+            ('GRID', (0, 0), (-1, -1), 1, colors.black),
+            ('PADDING', (0, 0), (-1, -1), 6),
+            ('FONTSIZE', (0, 0), (-1, -1), 9),
+            ('ALIGN', (2, 1), (-1, -1), 'CENTER'),
+        ]))
+        story.append(dist_table)
+        story.append(Spacer(1, 15))
+        story.append(Paragraph(f"<i>Note: Percentages may sum to more than 100% as responses can be assigned to multiple categories.</i>", normal_style))
+    # Citation on page 2
+    story.append(Spacer(1, 30))
     story.append(Paragraph("Citation", heading_style))
     story.append(Paragraph("If you use CatLLM in your research, please cite:", normal_style))
     story.append(Spacer(1, 5))
     story.append(Paragraph("Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. https://github.com/chrissoria/cat-llm", normal_style))
+    # === PAGE 3: Classification Summary (Expanded) ===
     story.append(PageBreak())
     story.append(Paragraph("Classification Summary", title_style))
+    story.append(Spacer(1, 15))
+    # Basic summary
+    story.append(Paragraph("Classification Details", heading_style))
     summary_data = [
+        ["Source File", filename],
         ["Source Column", column_name],
         ["Model Used", model],
         ["Model Source", model_source],
+        ["Temperature", "default"],
         ["Rows Classified", str(num_rows)],
         ["Number of Categories", str(len(categories))],
         ["Success Rate", f"{success_rate:.2f}%"],
     summary_table.setStyle(TableStyle([
         ('BACKGROUND', (0, 0), (0, -1), colors.lightgrey),
         ('GRID', (0, 0), (-1, -1), 1, colors.black),
+        ('PADDING', (0, 0), (-1, -1), 6),
+        ('FONTSIZE', (0, 0), (-1, -1), 9),
     ]))
     story.append(summary_table)
+    story.append(Spacer(1, 15))
+    # Processing Time
+    if processing_time is not None:
+        story.append(Paragraph("Processing Time", heading_style))
+        rows_per_min = (num_rows / processing_time) * 60 if processing_time > 0 else 0
+        avg_time = processing_time / num_rows if num_rows > 0 else 0
+        time_data = [
+            ["Total Processing Time", f"{processing_time:.1f} seconds"],
+            ["Average Time per Response", f"{avg_time:.2f} seconds"],
+            ["Processing Rate", f"{rows_per_min:.1f} rows/minute"],
+        ]
+        time_table = Table(time_data, colWidths=[180, 270])
+        time_table.setStyle(TableStyle([
+            ('BACKGROUND', (0, 0), (0, -1), colors.lightgrey),
+            ('GRID', (0, 0), (-1, -1), 1, colors.black),
+            ('PADDING', (0, 0), (-1, -1), 6),
+            ('FONTSIZE', (0, 0), (-1, -1), 9),
+        ]))
+        story.append(time_table)
+        story.append(Spacer(1, 15))
+    # Data Quality Notes
+    if data_quality is not None:
+        story.append(Paragraph("Data Quality Notes", heading_style))
+        quality_data = [
+            ["Empty/Null Inputs Skipped", str(data_quality.get('null_count', 0))],
+            ["Average Text Length", f"{data_quality.get('avg_length', 0)} characters"],
+            ["Min Text Length", f"{data_quality.get('min_length', 0)} characters"],
+            ["Max Text Length", f"{data_quality.get('max_length', 0)} characters"],
+            ["Responses with Errors", str(data_quality.get('error_count', 0))],
+        ]
+        quality_table = Table(quality_data, colWidths=[180, 270])
+        quality_table.setStyle(TableStyle([
+            ('BACKGROUND', (0, 0), (0, -1), colors.lightgrey),
+            ('GRID', (0, 0), (-1, -1), 1, colors.black),
+            ('PADDING', (0, 0), (-1, -1), 6),
+            ('FONTSIZE', (0, 0), (-1, -1), 9),
+        ]))
+        story.append(quality_table)
+        story.append(Spacer(1, 15))
+    # Version Information
+    story.append(Paragraph("Version Information", heading_style))
+    version_data = [
+        ["CatLLM Version", catllm_version or "unknown"],
+        ["Python Version", python_version or "unknown"],
+        ["Timestamp", datetime.now().strftime('%Y-%m-%d %H:%M:%S')],
+    ]
+    version_table = Table(version_data, colWidths=[180, 270])
+    version_table.setStyle(TableStyle([
+        ('BACKGROUND', (0, 0), (0, -1), colors.lightgrey),
+        ('GRID', (0, 0), (-1, -1), 1, colors.black),
+        ('PADDING', (0, 0), (-1, -1), 6),
+        ('FONTSIZE', (0, 0), (-1, -1), 9),
+    ]))
+    story.append(version_table)
+    # === PAGE 4: Prompt Template ===
+    story.append(PageBreak())
+    story.append(Paragraph("Prompt Template Used", title_style))
+    story.append(Paragraph("The following prompt template was sent to the LLM for each classification:", normal_style))
+    story.append(Spacer(1, 15))
+    if prompt_template:
+        # Show the template with placeholders
+        story.append(Paragraph("Template with Placeholders:", heading_style))
+        story.append(Spacer(1, 8))
+        for line in prompt_template.split('\n'):
+            escaped_line = line.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
+            if escaped_line.strip():
+                story.append(Paragraph(escaped_line, code_style))
+            else:
+                story.append(Spacer(1, 5))
+        story.append(Spacer(1, 20))
+        # Show example with actual categories
+        story.append(Paragraph("Example with Your Categories:", heading_style))
+        story.append(Spacer(1, 8))
+        categories_list = "\n".join([f"  {i}. {cat}" for i, cat in enumerate(categories, 1)])
+        example_prompt = f'''Categorize this survey response "[YOUR TEXT HERE]" into the following categories:
+{categories_list}
+Provide your work in JSON format where the number belonging to each category
+is the key and a 1 if the category is present and a 0 if not.'''
+        for line in example_prompt.split('\n'):
+            escaped_line = line.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
+            if escaped_line.strip():
+                story.append(Paragraph(escaped_line, code_style))
+            else:
+                story.append(Spacer(1, 5))
+    # === PAGE 5: Reproducibility Code ===
     story.append(PageBreak())
     story.append(Paragraph("Reproducibility Code", title_style))
     story.append(Paragraph("Use the following Python code to reproduce this classification:", normal_style))
 # Save to CSV
 result.to_csv("classified_results.csv", index=False)'''
     # Split code into lines and add each as a paragraph
     for line in code_text.split('\n'):
         if line.strip() == '':
 def classify_data(spreadsheet_file, spreadsheet_column,
                   cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10,
                   model_tier, model, model_source_input, api_key_input):
+    """Main classification function. Returns distribution, samples, full results, files, and status."""
     if not CATLLM_AVAILABLE:
+        return None, None, None, None, "**Error:** catllm package not available"
     all_cats = [cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10]
     categories = [c.strip() for c in all_cats if c and c.strip()]
     if not categories:
+        return None, None, None, None, "**Error:** Please enter at least one category"
     actual_model = model
         if model in HF_ROUTED_MODELS:
             actual_api_key = os.environ.get("HF_API_KEY", "")
             if not actual_api_key:
+                return None, None, None, None, "**Error:** HuggingFace API key not configured in Space secrets"
         elif "gpt" in model.lower():
             actual_api_key = os.environ.get("OPENAI_API_KEY", "")
             if not actual_api_key:
+                return None, None, None, None, "**Error:** OpenAI API key not configured in Space secrets"
         elif "gemini" in model.lower():
             actual_api_key = os.environ.get("GOOGLE_API_KEY", "")
             if not actual_api_key:
+                return None, None, None, None, "**Error:** Google API key not configured in Space secrets"
         elif "mistral" in model.lower():
             actual_api_key = os.environ.get("MISTRAL_API_KEY", "")
             if not actual_api_key:
+                return None, None, None, None, "**Error:** Mistral API key not configured in Space secrets"
         elif "claude" in model.lower():
             actual_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
             if not actual_api_key:
+                return None, None, None, None, "**Error:** Anthropic API key not configured in Space secrets"
         elif "sonar" in model.lower():
             actual_api_key = os.environ.get("PERPLEXITY_API_KEY", "")
             if not actual_api_key:
+                return None, None, None, None, "**Error:** Perplexity API key not configured in Space secrets"
         elif "grok" in model.lower():
             actual_api_key = os.environ.get("XAI_API_KEY", "")
             if not actual_api_key:
+                return None, None, None, None, "**Error:** xAI API key not configured in Space secrets"
         else:
             actual_api_key = os.environ.get("HF_API_KEY", "")
     else:
         if api_key_input and api_key_input.strip():
             actual_api_key = api_key_input.strip()
         else:
+            return None, None, None, None, f"**Error:** Please provide your API key for {model}"
     # Use user-selected model_source, or auto-detect if "auto"
     if model_source_input == "auto":
     try:
         if not spreadsheet_file:
+            return None, None, None, None, "**Error:** Please upload a file"
         if not spreadsheet_column:
+            return None, None, None, None, "**Error:** Please select a column to classify"
         file_path = spreadsheet_file if isinstance(spreadsheet_file, str) else spreadsheet_file.name
         if file_path.endswith('.csv'):
             df = pd.read_excel(file_path)
         if spreadsheet_column not in df.columns:
+            return None, None, None, None, f"**Error:** Column '{spreadsheet_column}' not found"
         input_data = df[spreadsheet_column].tolist()
+        # Calculate data quality metrics before classification
+        text_series = df[spreadsheet_column].dropna().astype(str)
+        data_quality = {
+            'null_count': int(df[spreadsheet_column].isna().sum()),
+            'avg_length': round(text_series.str.len().mean(), 1) if len(text_series) > 0 else 0,
+            'min_length': int(text_series.str.len().min()) if len(text_series) > 0 else 0,
+            'max_length': int(text_series.str.len().max()) if len(text_series) > 0 else 0,
+            'error_count': 0  # Will be updated after classification
+        }
+        # Capture timing
+        start_time = time.time()
         result = catllm.multi_class(
             survey_input=input_data,
             categories=categories,
             model_source=model_source
         )
+        processing_time = time.time() - start_time
+        # Update error count from results
+        if 'processing_status' in result.columns:
+            data_quality['error_count'] = int((result['processing_status'] == 'error').sum())
         # Save CSV for download
         with tempfile.NamedTemporaryFile(mode='w', suffix='_classified.csv', delete=False) as f:
             result.to_csv(f.name, index=False)
         else:
             success_rate = 100.0
+        # Build prompt template for documentation
+        prompt_template = '''Categorize this survey response "{response}" into the following categories:
+{categories}
+Provide your work in JSON format where the number belonging to each category
+is the key and a 1 if the category is present and a 0 if not.'''
+        # Get version info
+        try:
+            catllm_version = catllm.__version__
+        except AttributeError:
+            catllm_version = "unknown"
+        python_version = sys.version.split()[0]
+        # Generate PDF codebook with all new data
+        pdf_path = generate_codebook_pdf(
+            categories=categories,
+            model=actual_model,
+            column_name=spreadsheet_column,
+            num_rows=len(input_data),
+            model_source=model_source,
+            filename=original_filename,
+            success_rate=success_rate,
+            result_df=result,
+            processing_time=processing_time,
+            prompt_template=prompt_template,
+            data_quality=data_quality,
+            catllm_version=catllm_version,
+            python_version=python_version
+        )
+        # Build distribution summary DataFrame for display
+        dist_data = []
+        total_rows = len(result)
+        for i, cat in enumerate(categories, 1):
+            col_name = f"category_{i}"
+            if col_name in result.columns:
+                count = int(result[col_name].sum())
+                pct = (count / total_rows) * 100 if total_rows > 0 else 0
+                dist_data.append({
+                    "Category": cat,
+                    "Count": count,
+                    "Percentage": f"{pct:.1f}%"
+                })
+        distribution_df = pd.DataFrame(dist_data)
+        # Build sample results DataFrame (first 5 rows)
+        sample_data = []
+        for _, row in result.head(5).iterrows():
+            original_text = str(row.get('survey_input', ''))[:100]
+            if len(str(row.get('survey_input', ''))) > 100:
+                original_text += "..."
+            assigned = row.get('categories_id', '')
+            if pd.isna(assigned) or assigned == '':
+                assigned = "None"
+            sample_data.append({
+                "Original Text": original_text,
+                "Assigned Categories": str(assigned)
+            })
+        sample_df = pd.DataFrame(sample_data)
+        # Return: distribution (visible), samples (visible), full results (visible), files, status
+        return (
+            gr.update(value=distribution_df, visible=True),
+            gr.update(value=sample_df, visible=True),
+            gr.update(value=result, visible=True),
+            [csv_path, pdf_path],
+            f"**Success!** Classified {len(input_data)} responses in {processing_time:.1f}s"
+        )
     except Exception as e:
+        return None, None, None, None, f"**Error:** {str(e)}"
 def add_category_field(current_count):
         "",  # api_key
         "**Free tier** - no API key required! We cover the cost while CatLLM is in review.",  # api_key_status
         "Ready to classify",  # status
+        gr.update(value=None, visible=False),  # distribution_df
+        gr.update(value=None, visible=False),  # sample_results
+        gr.update(value=None, visible=False),  # results
         None,  # download_file
         gr.update(value="", visible=False),  # code_output
     ])
         with gr.Column():
             status = gr.Markdown("Ready to classify")
+            distribution_df = gr.DataFrame(label="Category Distribution Summary", visible=False)
+            sample_results = gr.DataFrame(label="Sample Results (First 5 Rows)", visible=False)
+            results = gr.DataFrame(label="Full Classification Results", visible=False)
             download_file = gr.File(label="Download Results (CSV + Codebook PDF)", file_count="multiple")
             code_output = gr.Code(
                 label="Python Code",
     classify_btn.click(
         fn=classify_data,
         inputs=[spreadsheet_file, spreadsheet_column] + category_inputs + [model_tier, model, model_source, api_key],
+        outputs=[distribution_df, sample_results, results, download_file, status]
     )
     see_code_btn.click(
     reset_btn.click(
         fn=reset_all,
         inputs=[],
+        outputs=[spreadsheet_file, spreadsheet_column] + category_inputs + [add_category_btn, category_count, model_tier, model, model_source, api_key, api_key_status, status, distribution_df, sample_results, results, download_file, code_output]
     )