chrissoria committed · verified
Commit a5c1ab5 · 1 Parent(s): dfbf34e

Upload app.py with huggingface_hub

Files changed (1): app.py (+2368 -0)
app.py ADDED (2368 lines)
"""
Streamlit app - CatVader Social Media Classifier
Migrated from Gradio for better mobile support
"""

import streamlit as st
import pandas as pd
import tempfile
import os
import time
import sys
from datetime import datetime
import matplotlib.pyplot as plt

# Import catvader
try:
    import catvader
    CATVADER_AVAILABLE = True
except ImportError as e:
    print(f"Warning: Could not import catvader: {e}")
    CATVADER_AVAILABLE = False

MAX_CATEGORIES = 10
INITIAL_CATEGORIES = 3
MAX_FILE_SIZE_MB = 100

def count_pdf_pages(pdf_path):
    """Count the number of pages in a PDF file."""
    try:
        import fitz  # PyMuPDF
        doc = fitz.open(pdf_path)
        page_count = len(doc)
        doc.close()
        return page_count
    except Exception:
        return 1  # Default to 1 if can't read


def extract_text_from_pdfs(pdf_paths):
    """Extract text from all pages of all PDFs, returning a list of page texts."""
    import fitz  # PyMuPDF
    all_texts = []
    for pdf_path in pdf_paths:
        try:
            doc = fitz.open(pdf_path)
            for page in doc:
                text = page.get_text().strip()
                if text:  # Only add non-empty pages
                    all_texts.append(text)
            doc.close()
        except Exception as e:
            print(f"Error extracting text from {pdf_path}: {e}")
    return all_texts


def extract_pdf_pages(pdf_paths, pdf_name_map, mode="image"):
    """
    Extract individual pages from PDFs.
    Returns a list of (page_data, page_label, kind) tuples; in "both" mode the
    extracted text is appended as a fourth element.
    For image mode: page_data is the path to a temp image file.
    For text mode: page_data is the extracted text.
    """
    import fitz  # PyMuPDF
    pages = []

    for pdf_path in pdf_paths:
        orig_name = pdf_name_map.get(pdf_path, os.path.basename(pdf_path).replace('.pdf', ''))
        try:
            doc = fitz.open(pdf_path)
            for page_num, page in enumerate(doc, 1):
                page_label = f"{orig_name}_p{page_num}"

                if mode == "text":
                    # Extract text
                    text = page.get_text().strip()
                    if text:
                        pages.append((text, page_label, "text"))
                else:
                    # Render as image (for "image" or "both" mode)
                    pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x zoom for better quality
                    img_path = tempfile.NamedTemporaryFile(delete=False, suffix='.png').name
                    pix.save(img_path)

                    if mode == "both":
                        text = page.get_text().strip()
                        pages.append((img_path, page_label, "image", text))
                    else:
                        pages.append((img_path, page_label, "image"))
            doc.close()
        except Exception as e:
            print(f"Error extracting pages from {pdf_path}: {e}")

    return pages

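To make the page-labeling scheme above concrete (each page becomes `<original name>_p<page number>`, with `pdf_name_map` able to override the filename-derived base), here is a standalone sketch that builds the labels without touching PyMuPDF. The helper name `page_labels` and the paths are hypothetical, introduced only for illustration:

```python
import os

def page_labels(pdf_path, num_pages, pdf_name_map=None):
    """Build per-page labels the same way extract_pdf_pages does."""
    pdf_name_map = pdf_name_map or {}
    orig_name = pdf_name_map.get(
        pdf_path, os.path.basename(pdf_path).replace('.pdf', '')
    )
    return [f"{orig_name}_p{n}" for n in range(1, num_pages + 1)]

print(page_labels("/tmp/survey.pdf", 3))
# A name-map entry overrides the filename-derived label:
print(page_labels("/tmp/x.pdf", 2, {"/tmp/x.pdf": "wave1"}))
```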
# Free models - display name -> actual API model name
FREE_MODELS_MAP = {
    "GPT-4o Mini": "gpt-4o-mini",
    "Gemini 2.5 Flash": "gemini-2.5-flash",
    "Claude 3 Haiku": "claude-3-haiku-20240307",
    "Llama 3.3 70B": "meta-llama/Llama-3.3-70B-Instruct:groq",
    "Qwen 2.5": "Qwen/Qwen2.5-72B-Instruct",
    "DeepSeek R1": "deepseek-ai/DeepSeek-R1:novita",
    "Mistral Medium": "mistral-medium-2505",
    "Grok 4 Fast": "grok-4-fast-non-reasoning",
}
FREE_MODEL_DISPLAY_NAMES = list(FREE_MODELS_MAP.keys())
FREE_MODEL_CHOICES = list(FREE_MODELS_MAP.values())  # Keep for backward compat

# Paid models (user provides their own API key)
PAID_MODEL_CHOICES = [
    "gemini-2.5-flash",
    "gemini-2.5-pro",
    "gpt-4.1",
    "gpt-4o",
    "gpt-4o-mini",
    "claude-sonnet-4-5-20250929",
    "claude-opus-4-20250514",
    "claude-3-5-haiku-20241022",
    "mistral-large-latest",
]

# Models routed through HuggingFace
HF_ROUTED_MODELS = [
    "meta-llama/Llama-3.3-70B-Instruct:groq",
    "deepseek-ai/DeepSeek-R1:novita",
]

def is_free_model(model, model_tier):
    """Check if using free tier (Space pays for API)."""
    return model_tier == "Free Models"


def get_model_source(model):
    """Auto-detect model source."""
    model_lower = model.lower()
    if "gpt" in model_lower:
        return "openai"
    elif "claude" in model_lower:
        return "anthropic"
    elif "gemini" in model_lower:
        return "google"
    elif "mistral" in model_lower and ":novita" not in model_lower:
        return "mistral"
    elif any(x in model_lower for x in [":novita", ":groq", "qwen", "llama", "deepseek"]):
        return "huggingface"
    elif "sonar" in model_lower:
        return "perplexity"
    elif "grok" in model_lower:
        return "xai"
    return "huggingface"

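The substring routing above is order-sensitive (the `mistral`-but-not-`:novita` check runs before the HuggingFace catch-all), so it helps to exercise it in isolation. This sketch reproduces the function verbatim purely so the dispatch can be tested standalone:

```python
def get_model_source(model):
    """Auto-detect the provider from the model name (copy of the app's logic)."""
    model_lower = model.lower()
    if "gpt" in model_lower:
        return "openai"
    elif "claude" in model_lower:
        return "anthropic"
    elif "gemini" in model_lower:
        return "google"
    elif "mistral" in model_lower and ":novita" not in model_lower:
        return "mistral"
    elif any(x in model_lower for x in [":novita", ":groq", "qwen", "llama", "deepseek"]):
        return "huggingface"
    elif "sonar" in model_lower:
        return "perplexity"
    elif "grok" in model_lower:
        return "xai"
    return "huggingface"

print(get_model_source("gpt-4o-mini"))                              # openai
print(get_model_source("meta-llama/Llama-3.3-70B-Instruct:groq"))   # huggingface
```

Note that provider-routed names like the `:groq`-suffixed Llama model fall through the first four branches and match on the suffix list, which is why the HF-routed free models resolve correctly.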
def get_api_key(model, model_tier, api_key_input):
    """Get the appropriate API key based on model and tier."""
    if is_free_model(model, model_tier):
        if model in HF_ROUTED_MODELS:
            return os.environ.get("HF_API_KEY", ""), "HuggingFace"
        elif "gpt" in model.lower():
            return os.environ.get("OPENAI_API_KEY", ""), "OpenAI"
        elif "gemini" in model.lower():
            return os.environ.get("GOOGLE_API_KEY", ""), "Google"
        elif "mistral" in model.lower():
            return os.environ.get("MISTRAL_API_KEY", ""), "Mistral"
        elif "claude" in model.lower():
            return os.environ.get("ANTHROPIC_API_KEY", ""), "Anthropic"
        elif "sonar" in model.lower():
            return os.environ.get("PERPLEXITY_API_KEY", ""), "Perplexity"
        elif "grok" in model.lower():
            return os.environ.get("XAI_API_KEY", ""), "xAI"
        else:
            return os.environ.get("HF_API_KEY", ""), "HuggingFace"
    else:
        if api_key_input and api_key_input.strip():
            return api_key_input.strip(), "User"
        return "", "User"


def calculate_total_file_size(files):
    """Calculate total size of uploaded files in MB."""
    if files is None:
        return 0
    if not isinstance(files, list):
        files = [files]

    total_bytes = 0
    for f in files:
        try:
            if hasattr(f, 'size'):
                total_bytes += f.size
            elif hasattr(f, 'name'):
                total_bytes += os.path.getsize(f.name)
        except (OSError, AttributeError):
            pass
    return total_bytes / (1024 * 1024)

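The bytes-to-MB conversion above can be sanity-checked with plain files on disk; a minimal sketch, with temporary files standing in for Streamlit upload objects (the `total_size_mb` helper is hypothetical and keeps only the `os.path.getsize` branch):

```python
import os
import tempfile

def total_size_mb(paths):
    """Sum on-disk file sizes and convert bytes to megabytes."""
    total_bytes = sum(os.path.getsize(p) for p in paths)
    return total_bytes / (1024 * 1024)

# Two files of exactly 1 MiB each should total 2.0 MB
paths = []
for _ in range(2):
    f = tempfile.NamedTemporaryFile(delete=False)
    f.write(b"\0" * (1024 * 1024))
    f.close()
    paths.append(f.name)

mb = total_size_mb(paths)
print(mb)  # 2.0

for p in paths:
    os.remove(p)
```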
def generate_extract_code(input_type, description, model, model_source, max_categories, mode=None):
    """Generate Python code for category extraction."""
    if input_type == "text":
        return f'''import catvader
import pandas as pd

# Load your data
df = pd.read_csv("your_data.csv")

# Extract categories from the text column
result = catvader.extract(
    input_data=df["{description}"].tolist(),
    api_key="YOUR_API_KEY",
    input_type="text",
    description="{description}",
    user_model="{model}",
    model_source="{model_source}",
    max_categories={max_categories}
)

# View extracted categories
print(result["top_categories"])
print(result["counts_df"])
'''
    elif input_type == "pdf":
        mode_line = f',\n    mode="{mode}"' if mode else ''
        return f'''import catvader

# Extract categories from PDF documents
result = catvader.extract(
    input_data="path/to/your/pdfs/",
    api_key="YOUR_API_KEY",
    input_type="pdf",
    description="{description}"{mode_line},
    user_model="{model}",
    model_source="{model_source}",
    max_categories={max_categories}
)

# View extracted categories
print(result["top_categories"])
print(result["counts_df"])
'''
    else:  # image
        return f'''import catvader

# Extract categories from images
result = catvader.extract(
    input_data="path/to/your/images/",
    api_key="YOUR_API_KEY",
    input_type="image",
    description="{description}",
    user_model="{model}",
    model_source="{model_source}",
    max_categories={max_categories}
)

# View extracted categories
print(result["top_categories"])
print(result["counts_df"])
'''


def generate_full_code(extraction_params, classify_params):
    """Generate combined extract + classify code when categories were auto-extracted."""
    ext = extraction_params
    cls = classify_params

    # Determine input data placeholder
    if ext['input_type'] == "text":
        input_placeholder = 'df["your_column"].tolist()'
        load_data = '''import pandas as pd

# Load your data
df = pd.read_csv("your_data.csv")
'''
    elif ext['input_type'] == "pdf":
        input_placeholder = '"path/to/your/pdfs/"'
        load_data = ''
    else:
        input_placeholder = '"path/to/your/images/"'
        load_data = ''

    mode_param = f',\n    mode="{ext["mode"]}"' if ext.get('mode') else ''

    # Build extract code
    extract_code = f'''# Step 1: Extract categories from your data
extract_result = catvader.extract(
    input_data={input_placeholder},
    api_key="YOUR_API_KEY",
    description="{ext['description']}",
    user_model="{ext['model']}",
    max_categories={ext['max_categories']}{mode_param}
)

categories = extract_result["top_categories"]
print(f"Extracted {{len(categories)}} categories: {{categories}}")
'''

    # Build classify code based on mode
    if cls['classify_mode'] == "Single Model":
        classify_mode_param = f',\n    mode="{cls["mode"]}"' if cls.get('mode') and ext['input_type'] == "pdf" else ''
        classify_code = f'''
# Step 2: Classify data using extracted categories
result = catvader.classify(
    input_data={input_placeholder},
    categories=categories,
    api_key="YOUR_API_KEY",
    description="{cls['description']}",
    user_model="{cls['model']}"{classify_mode_param}
)'''
    else:
        # Multi-model mode: include per-model temperatures when set
        ens_runs = cls.get('ensemble_runs')
        model_lines = []
        if ens_runs:
            for m, temp in ens_runs:
                model_lines.append(f'("{m}", "auto", "YOUR_API_KEY", {{"creativity": {temp}}})')
        else:
            model_temps = cls.get('model_temperatures', {})
            for m in cls['models_list']:
                temp = model_temps.get(m) if model_temps else None
                if temp is not None:
                    model_lines.append(f'("{m}", "auto", "YOUR_API_KEY", {{"creativity": {temp}}})')
                else:
                    model_lines.append(f'("{m}", "auto", "YOUR_API_KEY")')
        models_str = ",\n    ".join(model_lines)

        classify_mode_param = f',\n    mode="{cls["mode"]}"' if cls.get('mode') and ext['input_type'] == "pdf" else ''
        threshold_str = "majority" if cls['consensus_threshold'] == 0.5 else "two-thirds" if cls['consensus_threshold'] == 0.67 else "unanimous"
        consensus_param = f',\n    consensus_threshold="{threshold_str}"' if cls['classify_mode'] == "Ensemble" else ''

        classify_code = f'''
# Step 2: Classify data using extracted categories with {"ensemble voting" if cls['classify_mode'] == "Ensemble" else "model comparison"}
models = [
    {models_str}
]

result = catvader.classify(
    input_data={input_placeholder},
    categories=categories,
    models=models,
    description="{cls['description']}"{classify_mode_param}{consensus_param}
)'''

    return f'''import catvader
{load_data}
{extract_code}
{classify_code}

# View results
print(result)
result.to_csv("classified_results.csv", index=False)
'''


def generate_classify_code(input_type, description, categories, model, model_source, mode=None,
                           classify_mode="Single Model", models_list=None, consensus_threshold=0.5,
                           model_temperatures=None, ensemble_runs=None):
    """Generate Python code for classification."""
    categories_str = ",\n    ".join([f'"{cat}"' for cat in categories])

    # Determine input data placeholder based on type
    if input_type == "text":
        input_placeholder = 'df["your_column"].tolist()'
        load_data = '''import pandas as pd

# Load your data
df = pd.read_csv("your_data.csv")
'''
    elif input_type == "pdf":
        input_placeholder = '"path/to/your/pdfs/"'
        load_data = ''
    else:  # image
        input_placeholder = '"path/to/your/images/"'
        load_data = ''

    # Generate code based on classification mode
    if classify_mode == "Single Model":
        # Single model mode
        mode_param = f',\n    mode="{mode}"' if mode and input_type == "pdf" else ''
        return f'''import catvader
{load_data}
# Define categories
categories = [
    {categories_str}
]

# Classify data (input type is auto-detected)
result = catvader.classify(
    input_data={input_placeholder},
    categories=categories,
    api_key="YOUR_API_KEY",
    description="{description}",
    user_model="{model}"{mode_param}
)

# View results
print(result)
result.to_csv("classified_results.csv", index=False)
'''
    else:
        # Multi-model mode (Comparison or Ensemble)
        # Build model tuples with per-model temperature when set
        if ensemble_runs:
            # Ensemble with explicit (model, temp) pairs (supports duplicate models)
            model_lines = []
            for m, temp in ensemble_runs:
                model_lines.append(f'("{m}", "auto", "YOUR_API_KEY", {{"creativity": {temp}}})')
            models_str = ",\n    ".join(model_lines)
        elif models_list:
            model_lines = []
            for m in models_list:
                temp = model_temperatures.get(m) if model_temperatures else None
                if temp is not None:
                    model_lines.append(f'("{m}", "auto", "YOUR_API_KEY", {{"creativity": {temp}}})')
                else:
                    model_lines.append(f'("{m}", "auto", "YOUR_API_KEY")')
            models_str = ",\n    ".join(model_lines)
        else:
            models_str = '("gpt-4o", "auto", "YOUR_API_KEY"),\n    ("claude-sonnet-4-5-20250929", "auto", "YOUR_API_KEY")'

        mode_param = f',\n    mode="{mode}"' if mode and input_type == "pdf" else ''
        # Map numeric threshold back to string for cleaner code
        threshold_str = "majority" if consensus_threshold == 0.5 else "two-thirds" if consensus_threshold == 0.67 else "unanimous"
        consensus_param = f',\n    consensus_threshold="{threshold_str}"' if classify_mode == "Ensemble" else ''

        return f'''import catvader
{load_data}
# Define categories
categories = [
    {categories_str}
]

# Define models for {"ensemble voting" if classify_mode == "Ensemble" else "comparison"}
models = [
    {models_str}
]

# Classify with multiple models
result = catvader.classify(
    input_data={input_placeholder},
    categories=categories,
    models=models,
    description="{description}"{mode_param}{consensus_param}
)

# View results
print(result)
result.to_csv("classified_results.csv", index=False)
'''

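The numeric-to-label round trip used when emitting `consensus_threshold` (0.5 to "majority", 0.67 to "two-thirds", anything else to "unanimous") can be shown on its own. A small sketch of that mapping, rewriting the chained conditional as a dict lookup; `threshold_to_str` is a hypothetical name for illustration:

```python
THRESHOLD_LABELS = {0.5: "majority", 0.67: "two-thirds", 1.0: "unanimous"}

def threshold_to_str(consensus_threshold):
    """Mirror the chained conditional used in generate_classify_code."""
    return THRESHOLD_LABELS.get(consensus_threshold, "unanimous")

print(threshold_to_str(0.5))   # majority
print(threshold_to_str(0.67))  # two-thirds
print(threshold_to_str(0.8))   # falls through to unanimous, as in the original
```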
451
+ def generate_methodology_report_pdf(categories, model, column_name, num_rows, model_source, filename, success_rate,
452
+ result_df=None, processing_time=None, prompt_template=None,
453
+ data_quality=None, catvader_version=None, python_version=None,
454
+ task_type="assign", extracted_categories_df=None, max_categories=None,
455
+ input_type="text", description=None, classify_mode="Single Model",
456
+ models_list=None, code=None, consensus_threshold=None):
457
+ """Generate a PDF methodology report."""
458
+ from reportlab.lib.pagesizes import letter
459
+ from reportlab.lib import colors
460
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
461
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
462
+
463
+ pdf_file = tempfile.NamedTemporaryFile(mode='wb', suffix='_methodology_report.pdf', delete=False)
464
+ doc = SimpleDocTemplate(pdf_file.name, pagesize=letter)
465
+ styles = getSampleStyleSheet()
466
+
467
+ title_style = ParagraphStyle('Title', parent=styles['Heading1'], fontSize=18, spaceAfter=20)
468
+ heading_style = ParagraphStyle('Heading', parent=styles['Heading2'], fontSize=14, spaceAfter=10, spaceBefore=15)
469
+ normal_style = styles['Normal']
470
+ code_style = ParagraphStyle('Code', parent=styles['Normal'], fontName='Courier', fontSize=9, leftIndent=20, spaceAfter=3)
471
+
472
+ story = []
473
+
474
+ if task_type == "extract_and_assign":
475
+ report_title = "CatVader Extraction & Classification Report"
476
+ else:
477
+ report_title = "CatVader Classification Report"
478
+
479
+ story.append(Paragraph(report_title, title_style))
480
+ story.append(Paragraph(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", normal_style))
481
+ story.append(Spacer(1, 15))
482
+
483
+ story.append(Paragraph("About This Report", heading_style))
484
+
485
+ if task_type == "extract_and_assign":
486
+ about_text = """This methodology report documents the automated category extraction and classification process. \
487
+ CatVader first discovers categories from your data using LLMs, then classifies each item into those categories."""
488
+ else:
489
+ about_text = """This methodology report documents the classification process for reproducibility and transparency. \
490
+ CatVader restricts the prompt to a standard template that is impartial to the researcher's inclinations, ensuring \
491
+ consistent and reproducible results."""
492
+
493
+ story.append(Paragraph(about_text, normal_style))
494
+ story.append(Spacer(1, 15))
495
+
496
+ if categories:
497
+ story.append(Paragraph("Category Mapping", heading_style))
498
+
499
+ if classify_mode in ("Ensemble", "Model Comparison") and result_df is not None:
500
+ # Multi-model: show per-model columns and consensus columns
501
+ story.append(Paragraph("Each model produces its own binary columns. "
502
+ "Consensus columns show the majority vote result.", normal_style))
503
+ story.append(Spacer(1, 8))
504
+
505
+ # Detect ALL distinct model suffixes directly from the DataFrame
506
+ # (handles same-model-different-temperature cases correctly)
507
+ all_suffixes = _find_all_model_suffixes(result_df)
508
+
509
+ category_data = [["Column Name", "Category Description"]]
510
+ for i, cat in enumerate(categories, 1):
511
+ # Per-model columns (each suffix is a unique model/temperature)
512
+ for suffix in all_suffixes:
513
+ category_data.append([f"category_{i}_{suffix}", f"{cat} ({suffix})"])
514
+ # Consensus + agreement columns
515
+ category_data.append([f"category_{i}_consensus", f"{cat} (consensus)"])
516
+ category_data.append([f"category_{i}_agreement", f"{cat} (agreement score)"])
517
+
518
+ cat_table = Table(category_data, colWidths=[200, 250])
519
+ else:
520
+ # Single model: simple mapping
521
+ story.append(Paragraph("Each category column contains binary values: 1 = present, 0 = not present", normal_style))
522
+ story.append(Spacer(1, 8))
523
+
524
+ category_data = [["Column Name", "Category Description"]]
525
+ for i, cat in enumerate(categories, 1):
526
+ category_data.append([f"category_{i}", cat])
527
+
528
+ cat_table = Table(category_data, colWidths=[120, 330])
529
+
530
+ cat_table.setStyle(TableStyle([
531
+ ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
532
+ ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
533
+ ('GRID', (0, 0), (-1, -1), 1, colors.black),
534
+ ('PADDING', (0, 0), (-1, -1), 6),
535
+ ('BACKGROUND', (0, 1), (0, -1), colors.lightgrey),
536
+ ('FONTSIZE', (0, 0), (-1, -1), 9),
537
+ ]))
538
+ story.append(cat_table)
539
+ story.append(Spacer(1, 15))
540
+
541
+ story.append(Spacer(1, 30))
542
+ story.append(Paragraph("Citation", heading_style))
543
+ story.append(Paragraph("If you use CatVader in your research, please cite:", normal_style))
544
+ story.append(Spacer(1, 5))
545
+ story.append(Paragraph("Soria, C. (2025). CatVader: A Python package for LLM-based social media classification. DOI: 10.5281/zenodo.15532316", normal_style))
546
+
547
+ # Summary section
548
+ story.append(PageBreak())
549
+ story.append(Paragraph("Classification Summary", title_style))
550
+ story.append(Spacer(1, 15))
551
+
552
+ summary_data = [
553
+ ["Source File", filename],
554
+ ["Source Column", column_name],
555
+ ["Classification Mode", classify_mode],
556
+ ["Model(s) Used", model],
557
+ ["Model Source", model_source],
558
+ ["Rows Classified", str(num_rows)],
559
+ ["Number of Categories", str(len(categories)) if categories else "0"],
560
+ ["Success Rate", f"{success_rate:.2f}%"],
561
+ ]
562
+ # Add consensus threshold for ensemble mode
563
+ if classify_mode == "Ensemble" and consensus_threshold is not None:
564
+ threshold_labels = {0.5: "Majority (50%+)", 0.67: "Two-Thirds (67%+)", 1.0: "Unanimous (100%)"}
565
+ threshold_label = threshold_labels.get(consensus_threshold, f"Custom ({consensus_threshold:.0%})")
566
+ summary_data.append(["Consensus Threshold", threshold_label])
567
+
568
+ summary_table = Table(summary_data, colWidths=[150, 300])
569
+ summary_table.setStyle(TableStyle([
570
+ ('BACKGROUND', (0, 0), (0, -1), colors.lightgrey),
571
+ ('GRID', (0, 0), (-1, -1), 1, colors.black),
572
+ ('PADDING', (0, 0), (-1, -1), 6),
573
+ ('FONTSIZE', (0, 0), (-1, -1), 9),
574
+ ]))
575
+ story.append(summary_table)
576
+ story.append(Spacer(1, 15))
577
+
578
+ # Agreement scores table for ensemble mode
579
+ if classify_mode == "Ensemble" and result_df is not None and categories:
580
+ agreement_cols = [f"category_{i}_agreement" for i in range(1, len(categories) + 1)]
581
+ has_agreement = all(col in result_df.columns for col in agreement_cols)
582
+ if has_agreement:
583
+ story.append(Paragraph("Ensemble Agreement Scores", heading_style))
584
+ story.append(Paragraph(
585
+ "Agreement shows what proportion of models agreed on each category. "
586
+ "Higher scores indicate stronger consensus.", normal_style))
587
+ story.append(Spacer(1, 8))
588
+
589
+ agree_data = [["Category", "Mean Agreement", "Min Agreement"]]
590
+ for i, cat in enumerate(categories, 1):
591
+ col = f"category_{i}_agreement"
592
+ mean_val = result_df[col].mean()
593
+ min_val = result_df[col].min()
594
+ agree_data.append([cat, f"{mean_val:.1%}", f"{min_val:.1%}"])
595
+
596
+ agree_table = Table(agree_data, colWidths=[200, 125, 125])
597
+ agree_table.setStyle(TableStyle([
598
+ ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
599
+ ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
600
+ ('GRID', (0, 0), (-1, -1), 1, colors.black),
601
+ ('PADDING', (0, 0), (-1, -1), 6),
602
+ ('FONTSIZE', (0, 0), (-1, -1), 9),
603
+ ]))
604
+ story.append(agree_table)
605
+ story.append(Spacer(1, 15))
606
+
607
+ if processing_time is not None:
608
+ story.append(Paragraph("Processing Time", heading_style))
609
+ rows_per_min = (num_rows / processing_time) * 60 if processing_time > 0 else 0
610
+ avg_time = processing_time / num_rows if num_rows > 0 else 0
611
+
612
+ time_data = [
613
+ ["Total Processing Time", f"{processing_time:.1f} seconds"],
614
+ ["Average Time per Response", f"{avg_time:.2f} seconds"],
615
+ ["Processing Rate", f"{rows_per_min:.1f} rows/minute"],
616
+ ]
617
+ time_table = Table(time_data, colWidths=[180, 270])
618
+ time_table.setStyle(TableStyle([
619
+ ('BACKGROUND', (0, 0), (0, -1), colors.lightgrey),
620
+ ('GRID', (0, 0), (-1, -1), 1, colors.black),
621
+ ('PADDING', (0, 0), (-1, -1), 6),
622
+ ('FONTSIZE', (0, 0), (-1, -1), 9),
623
+ ]))
624
+ story.append(time_table)
625
+
626
+ story.append(Spacer(1, 15))
627
+ story.append(Paragraph("Version Information", heading_style))
628
+ version_data = [
629
+ ["CatVader Version", catvader_version or "unknown"],
630
+ ["Python Version", python_version or "unknown"],
631
+ ["Timestamp", datetime.now().strftime('%Y-%m-%d %H:%M:%S')],
632
+ ]
633
+ version_table = Table(version_data, colWidths=[180, 270])
634
+ version_table.setStyle(TableStyle([
635
+ ('BACKGROUND', (0, 0), (0, -1), colors.lightgrey),
636
+ ('GRID', (0, 0), (-1, -1), 1, colors.black),
637
+ ('PADDING', (0, 0), (-1, -1), 6),
638
+ ('FONTSIZE', (0, 0), (-1, -1), 9),
639
+ ]))
640
+ story.append(version_table)
641
+
642
+ # Reproducibility Code section
643
+ if code:
644
+ story.append(PageBreak())
645
+ story.append(Paragraph("Reproducibility Code", title_style))
646
+ story.append(Paragraph("Use this Python code to reproduce the classification with the CatVader package:", normal_style))
647
+ story.append(Spacer(1, 10))
648
+
649
+ # Split code into lines and add as code-formatted paragraphs
650
+ for line in code.strip().split('\n'):
651
+ # Escape special characters for reportlab
652
+ escaped_line = line.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
653
+ if escaped_line.strip():
654
+ story.append(Paragraph(escaped_line, code_style))
655
+ else:
656
+ story.append(Spacer(1, 6))
657
+
658
+ # Visualizations section
659
+ if result_df is not None and categories:
660
+ from reportlab.platypus import Image
661
+ import io
662
+
+     # Distribution chart (new page)
+     story.append(PageBreak())
+     story.append(Paragraph("Category Distribution", title_style))
+     try:
+         fig1 = create_distribution_chart(result_df, categories, classify_mode, models_list)
+         img_buffer1 = io.BytesIO()
+         fig1.savefig(img_buffer1, format='png', dpi=150, bbox_inches='tight')
+         img_buffer1.seek(0)
+         plt.close(fig1)
+
+         # Save to temp file for reportlab
+         img_temp1 = tempfile.NamedTemporaryFile(delete=False, suffix='.png')
+         img_temp1.write(img_buffer1.read())
+         img_temp1.close()
+
+         img1 = Image(img_temp1.name, width=450, height=250)
+         story.append(img1)
+         story.append(Spacer(1, 10))
+         story.append(Paragraph("Note: Categories are not mutually exclusive—each item can belong to multiple categories.", normal_style))
+     except Exception as e:
+         story.append(Paragraph(f"Could not generate distribution chart: {str(e)}", normal_style))
+
+     # Classification matrix (new page)
+     story.append(PageBreak())
+     story.append(Paragraph("Classification Matrix", title_style))
+     try:
+         fig2 = create_classification_heatmap(result_df, categories, classify_mode, models_list)
+         img_buffer2 = io.BytesIO()
+         fig2.savefig(img_buffer2, format='png', dpi=150, bbox_inches='tight')
+         img_buffer2.seek(0)
+         plt.close(fig2)
+
+         # Save to temp file for reportlab
+         img_temp2 = tempfile.NamedTemporaryFile(delete=False, suffix='.png')
+         img_temp2.write(img_buffer2.read())
+         img_temp2.close()
+
+         img2 = Image(img_temp2.name, width=450, height=300)
+         story.append(img2)
+         story.append(Spacer(1, 10))
+         story.append(Paragraph("Orange = category present, Black = not present. Each row represents one response.", normal_style))
+     except Exception as e:
+         story.append(Paragraph(f"Could not generate classification matrix: {str(e)}", normal_style))
+
+     doc.build(story)
+     return pdf_file.name
+
+
+ def run_auto_extract(input_type, input_data, description, max_categories_val,
+                      model_tier, model, api_key_input, mode=None, progress_callback=None):
+     """Extract categories from data."""
+     if not CATVADER_AVAILABLE:
+         return None, "catvader package not available"
+
+     actual_api_key, provider = get_api_key(model, model_tier, api_key_input)
+     if not actual_api_key:
+         return None, f"{provider} API key not configured"
+
+     model_source = get_model_source(model)
+
+     try:
+         if isinstance(input_data, list):
+             num_items = len(input_data)
+         else:
+             num_items = 1
+
+         if input_type == "image":
+             divisions = min(3, max(1, num_items // 5))
+             categories_per_chunk = 12
+         else:
+             divisions = max(1, num_items // 15)
+             divisions = min(divisions, 5)
+             chunk_size = num_items // max(1, divisions)
+             categories_per_chunk = max(1, min(10, chunk_size - 1))  # floor of 1 so tiny inputs don't yield 0
+
+         extract_kwargs = {
+             'input_data': input_data,
+             'api_key': actual_api_key,
+             'input_type': input_type,
+             'description': description,
+             'user_model': model,
+             'model_source': model_source,
+             'divisions': divisions,
+             'categories_per_chunk': categories_per_chunk,
+             'max_categories': int(max_categories_val)
+         }
+         if mode:
+             extract_kwargs['mode'] = mode
+
+         extract_result = catvader.extract(**extract_kwargs)
+         categories = extract_result.get('top_categories', [])
+
+         if not categories:
+             return None, "No categories were extracted"
+
+         return categories, f"Extracted {len(categories)} categories successfully!"
+
+     except Exception as e:
+         return None, f"Error: {str(e)}"
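For reviewers of this diff, the chunking heuristic in `run_auto_extract` above can be read in isolation. This is an illustrative standalone sketch (not part of `app.py`) that mirrors the text/image branching, including a floor of one category per chunk for very small inputs:

```python
def plan_chunks(num_items: int, input_type: str) -> tuple:
    """Mirror of the division/categories-per-chunk heuristic above."""
    if input_type == "image":
        # Images: at most 3 divisions, roughly one per 5 images
        divisions = min(3, max(1, num_items // 5))
        categories_per_chunk = 12
    else:
        # Text: roughly one division per 15 items, capped at 5
        divisions = min(max(1, num_items // 15), 5)
        chunk_size = num_items // max(1, divisions)
        categories_per_chunk = max(1, min(10, chunk_size - 1))
    return divisions, categories_per_chunk

print(plan_chunks(100, "text"))   # 5 divisions, 10 categories per chunk
print(plan_chunks(10, "text"))    # 1 division, 9 categories per chunk
print(plan_chunks(30, "image"))   # 3 divisions, 12 categories per chunk
```

The single-item edge case resolves to one division with one category per chunk, which is why the `max(1, ...)` floor matters.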
+
+
+ def run_classify_data(input_type, input_data, description, categories,
+                       model_tier, model, api_key_input, mode=None,
+                       original_filename="data", column_name="text",
+                       progress_callback=None):
+     """Classify data with user-provided categories."""
+     if not CATVADER_AVAILABLE:
+         return None, None, None, None, "catvader package not available"
+
+     if not categories:
+         return None, None, None, None, "Please enter at least one category"
+
+     actual_api_key, provider = get_api_key(model, model_tier, api_key_input)
+     if not actual_api_key:
+         return None, None, None, None, f"{provider} API key not configured"
+
+     model_source = get_model_source(model)
+
+     try:
+         start_time = time.time()
+
+         classify_kwargs = {
+             'input_data': input_data,
+             'categories': categories,
+             'models': [(model, model_source, actual_api_key)],
+             'description': description,
+         }
+         if mode:
+             classify_kwargs['mode'] = mode
+
+         result = catvader.classify(**classify_kwargs)
+
+         processing_time = time.time() - start_time
+         num_items = len(result)
+
+         # Save CSV
+         with tempfile.NamedTemporaryFile(mode='w', suffix='_classified.csv', delete=False) as f:
+             result.to_csv(f.name, index=False)
+             csv_path = f.name
+
+         # Calculate success rate
+         if 'processing_status' in result.columns:
+             success_count = (result['processing_status'] == 'success').sum()
+             success_rate = (success_count / len(result)) * 100
+         else:
+             success_rate = 100.0
+
+         # Get version info
+         try:
+             catvader_version = catvader.__version__
+         except AttributeError:
+             catvader_version = "unknown"
+         python_version = sys.version.split()[0]
+
+         # Generate methodology report
+         report_pdf_path = generate_methodology_report_pdf(
+             categories=categories,
+             model=model,
+             column_name=column_name,
+             num_rows=num_items,
+             model_source=model_source,
+             filename=original_filename,
+             success_rate=success_rate,
+             result_df=result,
+             processing_time=processing_time,
+             catvader_version=catvader_version,
+             python_version=python_version,
+             task_type="assign",
+             input_type=input_type,
+             description=description
+         )
+
+         # Generate reproducibility code
+         code = generate_classify_code(input_type, description, categories, model, model_source, mode)
+
+         return result, csv_path, report_pdf_path, code, f"Classified {num_items} items in {processing_time:.1f}s"
+
+     except Exception as e:
+         return None, None, None, None, f"Error: {str(e)}"
+
+
+ def sanitize_model_name(model: str) -> str:
+     """Convert model name to column-safe suffix (matches catvader logic)."""
+     import re
+     sanitized = re.sub(r'[^a-zA-Z0-9]', '_', model)
+     sanitized = re.sub(r'_+', '_', sanitized)
+     sanitized = sanitized.strip('_').lower()
+     return sanitized[:40]
+
+
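As a quick sanity check on the sanitizer above, here is a standalone re-implementation with example inputs (the model names are hypothetical, chosen only to illustrate the transformation):

```python
import re

def sanitize(model: str) -> str:
    """Replace non-alphanumerics with '_', collapse runs, lowercase, cap at 40 chars."""
    sanitized = re.sub(r'[^a-zA-Z0-9]', '_', model)
    sanitized = re.sub(r'_+', '_', sanitized)
    return sanitized.strip('_').lower()[:40]

print(sanitize("claude-haiku-4.5"))    # claude_haiku_4_5
print(sanitize("openai/gpt-4o-mini"))  # openai_gpt_4o_mini
```

Note the 40-character cap: two very long model names that differ only after position 40 would collide, which is one reason the suffix-discovery helpers below read real column names instead of recomputing them.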
+ def _find_model_column_suffix(result_df, model_name):
+     """Find the actual column suffix used for a model in the DataFrame.
+
+     catvader appends a creativity suffix (e.g. _tauto, _t50) to ensemble column
+     names, so we can't just use sanitize_model_name(). This function looks at
+     the real DataFrame columns to discover the full suffix.
+     """
+     sanitized = sanitize_model_name(model_name)
+     prefix = f"category_1_{sanitized}"
+     for col in result_df.columns:
+         if col.startswith(prefix):
+             # Return everything after "category_1_"
+             return col[len("category_1_"):]
+     # Fallback: return just the sanitized name
+     return sanitized
+
+
+ def _find_all_model_suffixes(result_df):
+     """Discover all distinct per-model column suffixes from the DataFrame.
+
+     Looks at category_1_* columns (excluding _consensus and _agreement)
+     to find every unique model suffix. Works even when the same model
+     appears multiple times with different temperature suffixes.
+
+     Returns:
+         List of suffix strings, e.g.
+         ['claude_haiku_4_5_20251001_t0', 'claude_haiku_4_5_20251001_t25', ...]
+     """
+     import re
+     suffixes = []
+     for col in result_df.columns:
+         m = re.match(r'^category_1_(.+)$', col)
+         if m:
+             suffix = m.group(1)
+             if suffix not in ('consensus', 'agreement'):
+                 suffixes.append(suffix)
+     return suffixes
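The suffix-discovery logic above only needs column names, so it can be sketched without pandas. The column names here are made-up examples in the `category_N_<suffix>` shape the helpers expect:

```python
import re

# Hypothetical result columns: one model at two temperatures, plus ensemble columns
cols = [
    "text",
    "category_1_gpt_4o_t0", "category_2_gpt_4o_t0",
    "category_1_gpt_4o_t50",
    "category_1_consensus", "category_1_agreement",
]

suffixes = []
for col in cols:
    m = re.match(r'^category_1_(.+)$', col)
    # Keep per-model suffixes; skip the ensemble summary columns
    if m and m.group(1) not in ('consensus', 'agreement'):
        suffixes.append(m.group(1))

print(suffixes)  # ['gpt_4o_t0', 'gpt_4o_t50']
```

Anchoring on `category_1_` works because catvader emits one column per category with the same suffix, so the first category is enough to enumerate models.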
+
+
+ def create_classification_heatmap(result_df, categories, classify_mode="Single Model", models_list=None):
+     """Create a binary heatmap showing classification for each row.
+
+     Args:
+         result_df: DataFrame with classification results
+         categories: List of category names
+         classify_mode: "Single Model", "Model Comparison", or "Ensemble"
+         models_list: List of model names (for multi-model modes)
+     """
+     import numpy as np
+
+     total_rows = len(result_df)
+     if total_rows == 0:
+         fig, ax = plt.subplots(figsize=(10, 4))
+         ax.text(0.5, 0.5, 'No data to display', ha='center', va='center', fontsize=14)
+         ax.axis('off')
+         return fig
+
+     # Build the binary matrix based on classify_mode
+     if classify_mode == "Ensemble":
+         # Use consensus columns
+         col_names = [f"category_{i}_consensus" for i in range(1, len(categories) + 1)]
+     elif classify_mode == "Model Comparison" and models_list:
+         # Use first model's columns (detect actual suffix from DataFrame)
+         suffix = _find_model_column_suffix(result_df, models_list[0])
+         col_names = [f"category_{i}_{suffix}" for i in range(1, len(categories) + 1)]
+     else:
+         # Single model
+         col_names = [f"category_{i}" for i in range(1, len(categories) + 1)]
+
+     # Extract the binary matrix
+     matrix_data = []
+     for col in col_names:
+         if col in result_df.columns:
+             matrix_data.append(result_df[col].astype(int).values)
+         else:
+             matrix_data.append(np.zeros(total_rows, dtype=int))
+
+     matrix = np.array(matrix_data).T  # Rows = responses, Cols = categories
+
+     # Create figure with appropriate sizing
+     fig_height = max(4, min(20, total_rows * 0.15))
+     fig_width = max(8, len(categories) * 0.8)
+     fig, ax = plt.subplots(figsize=(fig_width, fig_height))
+
+     # Create custom colormap: black (0) and orange (1) - CatVader theme
+     from matplotlib.colors import ListedColormap
+     cmap = ListedColormap(['#1a1a1a', '#E8A33C'])
+
+     # Plot heatmap
+     ax.imshow(matrix, aspect='auto', cmap=cmap, vmin=0, vmax=1)
+
+     # Set labels - remove y-axis numbers for cleaner look
+     ax.set_xticks(range(len(categories)))
+     ax.set_xticklabels(categories, rotation=45, ha='right', fontsize=9)
+     ax.set_xlabel('Categories', fontsize=11)
+     ax.set_ylabel(f'Responses (n={total_rows})', fontsize=11)
+     ax.set_yticks([])  # Remove y-axis tick marks
+
+     title = 'Classification Matrix'
+     if classify_mode == "Ensemble":
+         title += ' (Ensemble Consensus)'
+     elif classify_mode == "Model Comparison" and models_list:
+         title += f' ({models_list[0].split("/")[-1].split(":")[0][:20]})'
+     ax.set_title(title, fontsize=14, fontweight='bold')
+
+     # Add legend
+     from matplotlib.patches import Patch
+     legend_elements = [
+         Patch(facecolor='#1a1a1a', edgecolor='white', label='Not Present'),
+         Patch(facecolor='#E8A33C', edgecolor='white', label='Present')
+     ]
+     ax.legend(handles=legend_elements, loc='upper right', bbox_to_anchor=(1.15, 1))
+
+     plt.tight_layout()
+     return fig
+
+
+ def create_distribution_chart(result_df, categories, classify_mode="Single Model", models_list=None):
+     """Create a bar chart showing category distribution.
+
+     Args:
+         result_df: DataFrame with classification results
+         categories: List of category names
+         classify_mode: "Single Model", "Model Comparison", or "Ensemble"
+         models_list: List of model names (for multi-model modes)
+     """
+     import numpy as np
+
+     total_rows = len(result_df)
+     if total_rows == 0:
+         fig, ax = plt.subplots(figsize=(10, 4))
+         ax.text(0.5, 0.5, 'No data to display', ha='center', va='center', fontsize=14)
+         ax.axis('off')
+         return fig
+
+     # Define colors for different models
+     model_colors = ['#2563eb', '#dc2626', '#16a34a', '#ca8a04', '#9333ea', '#0891b2', '#be185d', '#65a30d']
+
+     if classify_mode == "Single Model":
+         # Single model: use category_1, category_2, etc.
+         fig, ax = plt.subplots(figsize=(10, max(4, len(categories) * 0.8)))
+
+         dist_data = []
+         for i, cat in enumerate(categories, 1):
+             col_name = f"category_{i}"
+             if col_name in result_df.columns:
+                 count = int(result_df[col_name].sum())
+                 pct = (count / total_rows) * 100
+                 dist_data.append({"Category": cat, "Percentage": round(pct, 1)})
+
+         categories_list = [d["Category"] for d in dist_data][::-1]
+         percentages = [d["Percentage"] for d in dist_data][::-1]
+
+         bars = ax.barh(categories_list, percentages, color='#2563eb')
+         ax.set_xlim(0, 100)
+         ax.set_xlabel('Percentage (%)', fontsize=11)
+         ax.set_title('Category Distribution (%)', fontsize=14, fontweight='bold')
+
+         for bar, pct in zip(bars, percentages):
+             ax.text(bar.get_width() + 1, bar.get_y() + bar.get_height()/2,
+                     f'{pct:.1f}%', va='center', fontsize=10)
+
+     elif classify_mode == "Ensemble":
+         # Ensemble: use category_1_consensus, category_2_consensus, etc.
+         fig, ax = plt.subplots(figsize=(10, max(4, len(categories) * 0.8)))
+
+         dist_data = []
+         for i, cat in enumerate(categories, 1):
+             col_name = f"category_{i}_consensus"
+             if col_name in result_df.columns:
+                 count = int(result_df[col_name].sum())
+                 pct = (count / total_rows) * 100
+                 dist_data.append({"Category": cat, "Percentage": round(pct, 1)})
+
+         categories_list = [d["Category"] for d in dist_data][::-1]
+         percentages = [d["Percentage"] for d in dist_data][::-1]
+
+         bars = ax.barh(categories_list, percentages, color='#16a34a')
+         ax.set_xlim(0, 100)
+         ax.set_xlabel('Percentage (%)', fontsize=11)
+         ax.set_title('Ensemble Consensus Distribution (%)', fontsize=14, fontweight='bold')
+
+         for bar, pct in zip(bars, percentages):
+             ax.text(bar.get_width() + 1, bar.get_y() + bar.get_height()/2,
+                     f'{pct:.1f}%', va='center', fontsize=10)
+
+     else:  # Model Comparison
+         # Model Comparison: grouped bars for each model
+         if not models_list:
+             models_list = []
+
+         # Detect actual column suffixes from the DataFrame
+         model_suffixes = [_find_model_column_suffix(result_df, m) for m in models_list]
+         n_models = len(model_suffixes)
+         n_categories = len(categories)
+
+         fig, ax = plt.subplots(figsize=(12, max(5, n_categories * 1.2)))
+
+         # Gather data for each model (guard against an empty model list)
+         bar_height = 0.8 / max(1, n_models)
+         y_positions = np.arange(n_categories)
+
+         for model_idx, (model_name, suffix) in enumerate(zip(models_list, model_suffixes)):
+             model_pcts = []
+             for i in range(1, n_categories + 1):
+                 col_name = f"category_{i}_{suffix}"
+                 if col_name in result_df.columns:
+                     count = int(result_df[col_name].sum())
+                     pct = (count / total_rows) * 100
+                 else:
+                     pct = 0
+                 model_pcts.append(pct)
+
+             # Reverse for horizontal bar chart
+             model_pcts = model_pcts[::-1]
+             offset = (model_idx - n_models / 2 + 0.5) * bar_height
+             color = model_colors[model_idx % len(model_colors)]
+
+             # Use shorter display name
+             display_name = model_name.split('/')[-1].split(':')[0][:20]
+             ax.barh(y_positions + offset, model_pcts, bar_height * 0.9,
+                     label=display_name, color=color, alpha=0.85)
+
+         ax.set_yticks(y_positions)
+         ax.set_yticklabels(categories[::-1])
+         ax.set_xlim(0, 100)
+         ax.set_xlabel('Percentage (%)', fontsize=11)
+         ax.set_title('Category Distribution by Model (%)', fontsize=14, fontweight='bold')
+         ax.legend(loc='lower right', fontsize=9)
+
+     plt.tight_layout()
+     return fig
+
+
+ # Page config
+ st.set_page_config(
+     page_title="CatVader - Social Media Classifier",
+     page_icon="🐱",
+     layout="wide"
+ )
+
+ # Custom CSS for enhanced styling
+ st.markdown("""
+ <style>
+ /* Import Garamond font and apply globally */
+ @import url('https://fonts.googleapis.com/css2?family=EB+Garamond:wght@400;500;600;700&display=swap');
+
+ *:not([class*="icon"]):not([data-testid="stIconMaterial"]):not(svg):not(path) {
+     font-family: 'EB Garamond', Garamond, Georgia, serif !important;
+     font-size: 17px !important;
+ }
+
+ /* Preserve Streamlit icon fonts */
+ [data-testid="stIconMaterial"], .material-icons, .material-symbols-rounded {
+     font-family: 'Material Symbols Rounded', 'Material Icons' !important;
+     font-size: 24px !important;
+ }
+
+ /* Main container styling */
+ .main .block-container {
+     padding-top: 2rem;
+     padding-bottom: 2rem;
+ }
+
+ /* Headers with gradient accent */
+ h1 {
+     background: linear-gradient(90deg, #E8A33C 0%, #D4872C 100%);
+     -webkit-background-clip: text;
+     -webkit-text-fill-color: transparent;
+     background-clip: text;
+     font-weight: 700;
+ }
+
+ /* Card-like sections */
+ .stExpander {
+     border: 1px solid #E8D5B5;
+     border-radius: 12px;
+     box-shadow: 0 2px 8px rgba(232, 163, 60, 0.08);
+ }
+
+ /* File uploader styling */
+ .stFileUploader {
+     border-radius: 12px;
+ }
+
+ .stFileUploader > div > div {
+     border: 2px dashed #E8A33C;
+     border-radius: 12px;
+     background: linear-gradient(135deg, #FEFCF9 0%, #F5EFE6 100%);
+ }
+
+ /* Button styling */
+ .stButton > button {
+     border-radius: 8px;
+     font-weight: 600;
+     transition: all 0.2s ease;
+     border: 2px solid #E8A33C;
+     background: #FEFCF9;
+     color: #D4872C;
+ }
+
+ /* Tall button for example dataset (matches file uploader height) */
+ .tall-button .stButton > button {
+     min-height: 107px;
+     border-radius: 12px;
+ }
+
+ .stButton > button:hover {
+     transform: translateY(-1px);
+     box-shadow: 0 4px 12px rgba(232, 163, 60, 0.3);
+     background: #F5EFE6;
+ }
+
+ /* Primary button */
+ .stButton > button[kind="primary"] {
+     background: linear-gradient(135deg, #E8A33C 0%, #D4872C 100%);
+     border: none;
+     color: white;
+ }
+
+ /* Success/info messages */
+ .stSuccess {
+     background-color: #E8F5E9;
+     border-left: 4px solid #4CAF50;
+     border-radius: 0 8px 8px 0;
+ }
+
+ .stInfo {
+     background-color: #FFF8E8;
+     border-left: 4px solid #E8A33C;
+     border-radius: 0 8px 8px 0;
+ }
+
+ /* Radio buttons */
+ .stRadio > div {
+     gap: 0.5rem;
+     display: flex;
+     width: 100%;
+ }
+
+ .stRadio > div > label {
+     background: #F5EFE6;
+     padding: 0.5rem 1rem;
+     border-radius: 20px;
+     border: 1px solid transparent;
+     transition: all 0.2s ease;
+     flex: 1;
+     text-align: center;
+     justify-content: center;
+ }
+
+ .stRadio > div > label:hover {
+     border-color: #E8A33C;
+ }
+
+ /* Text inputs */
+ .stTextInput > div > div > input {
+     border-radius: 8px;
+     border: 1px solid #E8D5B5;
+ }
+
+ .stTextInput > div > div > input:focus {
+     border-color: #E8A33C;
+     box-shadow: 0 0 0 2px rgba(232, 163, 60, 0.2);
+ }
+
+ /* Select boxes */
+ .stSelectbox > div > div {
+     border-radius: 8px;
+ }
+
+ /* Dataframe styling */
+ .stDataFrame {
+     border-radius: 12px;
+     overflow: hidden;
+     box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
+ }
+
+ /* Progress bar */
+ .stProgress > div > div {
+     background: linear-gradient(90deg, #E8A33C 0%, #D4872C 100%);
+     border-radius: 10px;
+ }
+
+ /* Slider */
+ .stSlider > div > div > div {
+     background: #E8A33C;
+ }
+
+ /* Divider */
+ hr {
+     border: none;
+     height: 1px;
+     background: linear-gradient(90deg, transparent, #E8D5B5, transparent);
+     margin: 1.5rem 0;
+ }
+
+ /* Code blocks */
+ .stCodeBlock {
+     border-radius: 12px;
+     border: 1px solid #E8D5B5;
+ }
+
+ /* Metric cards */
+ .stMetric {
+     background: linear-gradient(135deg, #FEFCF9 0%, #F5EFE6 100%);
+     padding: 1rem;
+     border-radius: 12px;
+     border: 1px solid #E8D5B5;
+ }
+
+ /* Download buttons */
+ .stDownloadButton > button {
+     background: #F5EFE6;
+     border: 1px solid #E8A33C;
+     color: #D4872C;
+ }
+
+ .stDownloadButton > button:hover {
+     background: #E8A33C;
+     color: white;
+ }
+
+ /* Multiselect */
+ .stMultiSelect > div > div {
+     border-radius: 8px;
+ }
+
+ /* Status indicator */
+ .stStatus {
+     border-radius: 12px;
+ }
+
+ /* Column gaps */
+ [data-testid="column"] {
+     padding: 0 0.5rem;
+ }
+
+ /* Logo and title alignment */
+ [data-testid="column"]:first-child img {
+     border-radius: 8px;
+ }
+ </style>
+ """, unsafe_allow_html=True)
+
+ # Initialize session state
+ if 'categories' not in st.session_state:
+     st.session_state.categories = [''] * MAX_CATEGORIES
+ if 'category_count' not in st.session_state:
+     st.session_state.category_count = INITIAL_CATEGORIES
+ if 'task_mode' not in st.session_state:
+     st.session_state.task_mode = None
+ if 'extracted_categories' not in st.session_state:
+     st.session_state.extracted_categories = None
+ if 'results' not in st.session_state:
+     st.session_state.results = None
+ if 'active_tab' not in st.session_state:
+     st.session_state.active_tab = "survey"
+ if 'survey_data' not in st.session_state:
+     st.session_state.survey_data = None
+ if 'pdf_data' not in st.session_state:
+     st.session_state.pdf_data = None
+ if 'image_data' not in st.session_state:
+     st.session_state.image_data = None
+ if 'extraction_params' not in st.session_state:
+     st.session_state.extraction_params = None  # Stores params when categories are auto-extracted
+ if 'bluesky_df' not in st.session_state:
+     st.session_state.bluesky_df = None
+
+ # Logo and title - use HTML for better alignment
+ st.markdown("""
+ <div style="display: flex; align-items: center; gap: 20px; margin-bottom: 10px;">
+     <img src="https://huggingface.co/spaces/CatVader/social-media-classifier/resolve/main/logo.png" width="100" style="border-radius: 8px;">
+     <div>
+         <div style="font-size: 2.2rem; font-weight: 700; color: #333; font-family: 'EB Garamond', Garamond, Georgia, serif; line-height: 1.1;">CatVader</div>
+         <div style="font-size: 1.1rem; font-weight: 500; color: #E8A33C; font-family: 'EB Garamond', Garamond, Georgia, serif; margin-bottom: 4px;">NLP for Survey Research</div>
+         <div style="font-size: 1rem; font-weight: 400; color: #666; font-family: 'EB Garamond', Garamond, Georgia, serif;">Research-grade classification of social media posts, PDFs, and images using AI models.</div>
+         <div style="font-size: 0.85rem; font-weight: 400; color: #888; font-family: 'EB Garamond', Garamond, Georgia, serif; margin-top: 4px;">Developed at UC Berkeley</div>
+     </div>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # About section
+ with st.expander("About This App"):
+     st.markdown("""
+ **Privacy Notice:** Your data is sent to third-party LLM APIs for classification. Do not upload sensitive, confidential, or personally identifiable information (PII).
+
+ ---
+
+ **CatVader** is an open-source Python package for classifying and exploring social media data using Large Language Models.
+
+ ### What It Does
+ - **Extract Categories**: Discover themes and categories in your data automatically
+ - **Assign Categories**: Classify data into your predefined categories
+ - **Extract & Assign**: Let CatVader discover categories, then classify all your data
+
+ ### Supported Providers
+ OpenAI (GPT-4o, GPT-4o Mini), Anthropic (Claude), Google (Gemini), Mistral, HuggingFace, xAI (Grok), and Perplexity. Use the free tier or bring your own API key.
+
+ ### Beta Test - We Want Your Feedback!
+ This app is currently in **beta** and **free to use** while CatVader is under active development, made possible by **Bashir Ahmed's generous fellowship support**.
+
+ - Found a bug? Have a feature request? Please open an issue on [GitHub](https://github.com/chrissoria/cat-vader)
+ - Reach out directly: [chrissoria@berkeley.edu](mailto:chrissoria@berkeley.edu)
+
+ ### Acknowledgments
+ - **Bashir Ahmed** for his generous fellowship support that makes this free beta possible
+ - **Claude Fischer** for his thoughtful feedback and collaboration on research that helped inspire this project
+ - **Kevin Collins** from Survey360 for his input
+ - **Fendi Tsim** for sharing it widely
+
+ ### Links
+ - **Website**: [christophersoria.com](https://christophersoria.com)
+ - **PyPI**: [pip install cat-vader](https://pypi.org/project/cat-vader/)
+ - **GitHub**: [github.com/chrissoria/cat-vader](https://github.com/chrissoria/cat-vader)
+
+ ### Citation
+ If you use CatVader in your research, please cite:
+ ```
+ Soria, C. (2025). CatVader: A Python package for LLM-based social media classification. DOI: 10.5281/zenodo.15532316
+ ```
+ """)
+
+ # Main layout
+ col_input, col_output = st.columns([1, 1])
+
+ with col_input:
+     # Input type selector
+     input_type_choice = st.radio(
+         "Input Type",
+         options=["Social Media Posts", "PDF Documents", "Images"],
+         horizontal=True,
+         key="input_type_radio"
+     )
+
+     # Initialize variables
+     input_data = None
+     input_type_selected = "text"
+     description = ""
+     original_filename = "data"
+     pdf_mode = "Image (visual documents)"
+
+     if input_type_choice == "Social Media Posts":
+         input_type_selected = "text"
+
+         data_source = st.radio(
+             "Data Source",
+             options=["Upload CSV/Excel", "Fetch from Bluesky"],
+             horizontal=True,
+             key="data_source_radio"
+         )
+
+         if data_source == "Upload CSV/Excel":
+             st.session_state.bluesky_df = None  # Clear any fetched data when switching sources
+             upload_col, example_col = st.columns([3, 1])
+             with upload_col:
+                 uploaded_file = st.file_uploader(
+                     "Upload Data (CSV or Excel)",
+                     type=['csv', 'xlsx', 'xls'],
+                     key="survey_file"
+                 )
+             with example_col:
+                 st.markdown("<div style='height: 27px;'></div>", unsafe_allow_html=True)  # Match "Upload Data" label height
+                 st.markdown('<div class="tall-button">', unsafe_allow_html=True)
+                 if st.button("Try Example Dataset", key="example_btn", use_container_width=True):
+                     st.session_state.example_loaded = True
+                 st.markdown('</div>', unsafe_allow_html=True)
+
+             columns = []
+             df = None
+             if uploaded_file is not None:
+                 try:
+                     if uploaded_file.name.endswith('.csv'):
+                         df = pd.read_csv(uploaded_file)
+                     else:
+                         df = pd.read_excel(uploaded_file)
+                     columns = df.columns.tolist()
+                     st.success(f"Loaded {len(df):,} rows")
+                 except Exception as e:
+                     st.error(f"Error loading file: {e}")
+             elif hasattr(st.session_state, 'example_loaded') and st.session_state.example_loaded:
+                 try:
+                     df = pd.read_csv("example_data.csv")
+                     columns = df.columns.tolist()
+                     st.success(f"Loaded example dataset ({len(df)} rows)")
+                 except Exception:
+                     pass
+
+             selected_column = st.selectbox(
+                 "Column to Process",
+                 options=columns if columns else ["Upload a file first"],
+                 disabled=not columns,
+                 key="survey_column"
+             )
+
+             description = selected_column if columns else ""
+             original_filename = uploaded_file.name if uploaded_file else "example_data.csv"
+
+             if df is not None and columns and selected_column in columns:
+                 input_data = df[selected_column].tolist()
+
+         else:  # Fetch from Bluesky
+             bsky_handle = st.text_input(
+                 "Bluesky Handle",
+                 placeholder="e.g. aoc.bsky.social or @aoc.bsky.social",
+                 key="bluesky_handle_input"
+             )
+             bsky_num_posts = st.slider(
+                 "Number of Posts to Fetch",
+                 min_value=10, max_value=250, value=50, step=10,
+                 key="bluesky_num_posts"
+             )
+             if st.button("Fetch Posts", key="fetch_bluesky_btn"):
+                 handle_clean = bsky_handle.strip().lstrip("@")
+                 if not handle_clean:
+                     st.error("Please enter a Bluesky handle.")
+                 else:
+                     with st.spinner(f"Fetching {bsky_num_posts} posts from {handle_clean}..."):
+                         try:
+                             from catvader._social_media import fetch_bluesky
+                             df_bsky = fetch_bluesky(limit=bsky_num_posts, handle=handle_clean)
+                             df_bsky = df_bsky[df_bsky["media_type"] != "REPOST_FACADE"].reset_index(drop=True)
+                             st.session_state.bluesky_df = df_bsky
+                         except Exception as e:
+                             st.error(f"Error fetching posts: {e}")
+
+             if st.session_state.bluesky_df is not None:
+                 bsky_df = st.session_state.bluesky_df
+                 st.success(f"Fetched {len(bsky_df)} posts")
+                 st.dataframe(
+                     bsky_df[["timestamp", "text", "likes", "replies"]].head(5),
+                     use_container_width=True
+                 )
+                 handle_clean = bsky_handle.strip().lstrip("@") if bsky_handle else "bluesky"
+                 input_data = bsky_df["text"].tolist()
+                 description = f"Bluesky posts from @{handle_clean}"
+                 original_filename = f"bluesky_{handle_clean.replace('.', '_')}"
+
+     elif input_type_choice == "PDF Documents":
+         input_type_selected = "pdf"
+
+         pdf_files = st.file_uploader(
+             "Upload PDF Document(s)",
+             type=['pdf'],
+             accept_multiple_files=True,
+             key="pdf_files"
+         )
+
+         pdf_description = st.text_input(
+             "Document Description",
+             placeholder="e.g., 'research papers', 'interview transcripts'",
+             help="Helps the LLM understand context",
+             key="pdf_desc"
+         )
+
+         pdf_mode = st.radio(
+             "Processing Mode",
+             options=["Image (visual documents)", "Text (text-heavy)", "Both (comprehensive)"],
+             key="pdf_mode"
+         )
+
+         if pdf_files:
+             input_data = []
+             pdf_name_map = {}  # Map temp paths to original filenames
+             for f in pdf_files:
+                 with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
+                     tmp.write(f.read())
+                     input_data.append(tmp.name)
+                     pdf_name_map[tmp.name] = f.name.replace('.pdf', '')  # Store original name without extension
+             st.session_state.pdf_name_map = pdf_name_map
+             description = pdf_description or "document"
+             original_filename = "pdf_files"
+             st.success(f"Uploaded {len(pdf_files)} PDF file(s)")
+
+     else:  # Images
+         input_type_selected = "image"
+
+         image_files = st.file_uploader(
+             "Upload Images",
+             type=['png', 'jpg', 'jpeg', 'gif', 'webp'],
+             accept_multiple_files=True,
+             key="image_files"
+         )
+
+         image_description = st.text_input(
+             "Image Description",
+             placeholder="e.g., 'product photos', 'social media posts'",
+             help="Helps the LLM understand context",
+             key="image_desc"
+         )
+
+         if image_files:
+             input_data = []
+             for f in image_files:
+                 suffix = '.' + f.name.split('.')[-1]
+                 with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+                     tmp.write(f.read())
+                     input_data.append(tmp.name)
+             description = image_description or "images"
+             original_filename = "image_files"
+             st.success(f"Uploaded {len(image_files)} image file(s)")
+
+     st.markdown("---")
+
+     # Task selection
+     st.markdown("### What would you like to do?")
+     col_btn1, col_btn2 = st.columns(2)
+     with col_btn1:
+         manual_mode = st.button("Enter Categories Manually", use_container_width=True)
+     with col_btn2:
+         auto_mode = st.button("Auto-extract Categories", use_container_width=True)
+
+     if manual_mode:
+         st.session_state.task_mode = "manual"
+     if auto_mode:
+         st.session_state.task_mode = "auto_extract"
+
+     # Auto-extract settings
+     if st.session_state.task_mode == "auto_extract":
+         st.markdown("### Auto-extract Categories")
+         st.markdown("We'll analyze your data to discover the main categories.")
+
+         max_categories = st.slider(
+             "Number of Categories to Extract",
+             min_value=3,
+             max_value=25,
+             value=12,
+             help="How many categories should be identified in your data"
+         )
+
+         specificity = st.selectbox(
+             "How specific should categories be?",
+             options=["Broad", "Moderate", "Narrow"],
+             index=0,
+             help="Broad = general themes, Moderate = balanced detail, Narrow = highly specific categories"
+         )
+
+         focus = st.text_input(
+             "What should categories be focused around? (optional)",
+             placeholder="e.g., 'decisions to move', 'emotional responses', 'financial factors'",
+             help="Guide the model to prioritize extracting categories related to this focus"
+         )
+
+         # Model selection for extraction
+         st.markdown("### Model Selection")
+         model_tier = st.radio(
+             "Model Tier",
+             options=["Free Models", "Bring Your Own Key"],
+             key="extract_model_tier"
+         )
+
+         if model_tier == "Free Models":
+             model_display = st.selectbox("Model", options=FREE_MODEL_DISPLAY_NAMES, key="extract_model")
+             model = FREE_MODELS_MAP[model_display]  # Convert to actual model name
+             api_key = ""
+         else:
+             model = st.selectbox("Model", options=PAID_MODEL_CHOICES, key="extract_model_paid")
+             api_key = st.text_input("API Key", type="password", key="extract_api_key")
+
+ if st.button("Extract Categories", type="primary"):
1613
+ if input_data is None:
1614
+ st.error("Please upload data first")
1615
+ else:
1616
+ mode = None
1617
+ if input_type_selected == "pdf":
1618
+ mode_mapping = {
1619
+ "Image (visual documents)": "image",
1620
+ "Text (text-heavy)": "text",
1621
+ "Both (comprehensive)": "both"
1622
+ }
1623
+ mode = mode_mapping.get(pdf_mode, "image")
1624
+
1625
+ actual_api_key, provider = get_api_key(model, model_tier, api_key)
1626
+ if not actual_api_key:
1627
+ st.error(f"{provider} API key not configured")
1628
+ else:
1629
+ model_source = get_model_source(model)
1630
+
1631
+ # Calculate estimated time based on input size
1632
+ num_items = len(input_data) if isinstance(input_data, list) else 1
1633
+ if input_type_selected == "pdf":
1634
+ # PDFs take longer - estimate ~5s per page
1635
+ total_pages = sum(count_pdf_pages(p) for p in (input_data if isinstance(input_data, list) else [input_data]))
1636
+ est_seconds = total_pages * 5
1637
+ elif input_type_selected == "image":
1638
+ # Images ~4s each
1639
+ est_seconds = num_items * 4
1640
+ else:
1641
+ # Text ~2s per item, but batched
1642
+ est_seconds = max(10, num_items * 0.5)
1643
+
1644
+ # Progress tracking UI
1645
+ progress_bar = st.progress(0)
1646
+ status_text = st.empty()
1647
+ start_time = time.time()
1648
+
1649
+ # Progress callback for extraction
1650
+ def extract_progress_callback(current_step, total_steps, step_label):
1651
+ progress = current_step / total_steps if total_steps > 0 else 0
1652
+ progress_bar.progress(min(progress, 1.0))
1653
+
1654
+ elapsed = time.time() - start_time
1655
+ if current_step > 0:
1656
+ avg_time = elapsed / current_step
1657
+ eta_seconds = avg_time * (total_steps - current_step)
1658
+ eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
1659
+ else:
1660
+ eta_str = ""
1661
+
1662
+ status_text.text(f"Extracting categories: {step_label} ({progress*100:.0f}%){eta_str}")
1663
+
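The ETA math in the callback above can be factored into a small pure function. This is a sketch only (the `format_eta` name is ours, not part of the app): it mirrors the elapsed/average/remaining logic and the seconds-vs-minutes display rule.

```python
def format_eta(elapsed: float, done: int, total: int) -> str:
    """Project remaining time from the average time per completed step."""
    if done <= 0:
        return ""
    eta_seconds = (elapsed / done) * (total - done)
    # Same display rule as the callback: seconds under a minute, minutes otherwise
    if eta_seconds < 60:
        return f" | ETA: {eta_seconds:.0f}s"
    return f" | ETA: {eta_seconds/60:.1f}m"

print(format_eta(10.0, 2, 6))  # 5s per completed step, 4 steps left -> " | ETA: 20s"
```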
1664
+ extract_kwargs = {
1665
+ 'input_data': input_data,
1666
+ 'api_key': actual_api_key,
1667
+ 'input_type': input_type_selected,
1668
+ 'description': description,
1669
+ 'user_model': model,
1670
+ 'model_source': model_source,
1671
+ 'max_categories': int(max_categories),
1672
+ 'specificity': specificity.lower(),
1673
+ 'progress_callback': extract_progress_callback,
1674
+ }
1675
+ if mode:
1676
+ extract_kwargs['mode'] = mode
1677
+ if focus and focus.strip():
1678
+ extract_kwargs['focus'] = focus.strip()
1679
+
1680
+ try:
1681
+ extract_result = catvader.extract(**extract_kwargs)
1682
+ categories = extract_result.get('top_categories', [])
1683
+
1684
+ processing_time = time.time() - start_time
1685
+ progress_bar.progress(1.0)
1686
+ status_text.text(f"Completed in {processing_time:.1f}s")
1687
+
1688
+ if categories:
1689
+ st.success(f"Extracted {len(categories)} categories in {processing_time:.1f}s")
1690
+ st.session_state.extracted_categories = categories
1691
+ # Store extraction params for code generation
1692
+ st.session_state.extraction_params = {
1693
+ 'model': model,
1694
+ 'model_source': model_source,
1695
+ 'max_categories': int(max_categories),
1696
+ 'input_type': input_type_selected,
1697
+ 'description': description,
1698
+ 'mode': mode,
1699
+ }
1700
+ st.session_state.task_mode = "manual"
1701
+ st.rerun()
1702
+ else:
1703
+ st.error("No categories were extracted from the data")
1704
+ except Exception as e:
1705
+ st.error(f"Error: {str(e)}")
1706
+
1707
+ # Category inputs (shown for manual mode or after extraction)
1708
+ if st.session_state.task_mode == "manual":
1709
+ st.markdown("### Categories")
1710
+ st.markdown("Enter your classification categories below.")
1711
+
1712
+ # Pre-fill with extracted categories if available
1713
+ if st.session_state.extracted_categories:
1714
+ for i, cat in enumerate(st.session_state.extracted_categories[:MAX_CATEGORIES]):
1715
+ st.session_state.categories[i] = cat
1716
+ st.session_state.category_count = min(len(st.session_state.extracted_categories), MAX_CATEGORIES)
1717
+ st.session_state.extracted_categories = None # Clear after use
1718
+
1719
+ placeholder_examples = [
1720
+ "e.g., Positive sentiment",
1721
+ "e.g., Negative sentiment",
1722
+ "e.g., Product feedback",
1723
+ "e.g., Service complaint",
1724
+ "e.g., Feature request",
1725
+ "e.g., Custom category"
1726
+ ]
1727
+
1728
+ categories_entered = []
1729
+ for i in range(st.session_state.category_count):
1730
+ placeholder = placeholder_examples[i] if i < len(placeholder_examples) else "e.g., Custom category"
1731
+ cat_value = st.text_input(
1732
+ f"Category {i+1}",
1733
+ value=st.session_state.categories[i],
1734
+ placeholder=placeholder,
1735
+ key=f"cat_{i}"
1736
+ )
1737
+ st.session_state.categories[i] = cat_value
1738
+ if cat_value.strip():
1739
+ categories_entered.append(cat_value.strip())
1740
+
1741
+ if st.session_state.category_count < MAX_CATEGORIES:
1742
+ if st.button("+ Add More"):
1743
+ st.session_state.category_count += 1
1744
+ st.rerun()
1745
+
1746
+ st.markdown("### Model Selection")
1747
+
1748
+ # Classification mode selector
1749
+ classify_mode = st.radio(
1750
+ "Classification Mode",
1751
+ options=["Single Model", "Model Comparison", "Ensemble"],
1752
+ horizontal=True,
1753
+ key="classify_mode",
1754
+ help="Single: one model. Comparison: see results from multiple models side-by-side. Ensemble: multiple models vote for consensus."
1755
+ )
1756
+
1757
+ model_tier = st.radio(
1758
+ "Model Tier",
1759
+ options=["Free Models", "Bring Your Own Key"],
1760
+ key="classify_model_tier"
1761
+ )
1762
+
1763
+ # Multi-model mode uses multiselect
1764
+ is_multi_model = classify_mode in ["Model Comparison", "Ensemble"]
1765
+ min_models = 3 if classify_mode == "Ensemble" else 2
1766
+
1767
+ # Track per-run temperatures: list of (model_name, temperature) for ensemble,
1768
+ # or dict {model_name: temperature} for model comparison
1769
+ model_temperatures = {}
1770
+ # ensemble_runs stores list of (model_name, temperature) allowing duplicate models
1771
+ ensemble_runs = []
1772
+
1773
+ if classify_mode == "Ensemble":
1774
+ # Ensemble mode: dynamic rows allowing same model multiple times with different temps
1775
+ if "ensemble_num_runs" not in st.session_state:
1776
+ st.session_state.ensemble_num_runs = 3
1777
+
1778
+ if model_tier == "Free Models":
1779
+ model_options = FREE_MODEL_DISPLAY_NAMES
1780
+ is_free = True
1781
+ else:
1782
+ model_options = PAID_MODEL_CHOICES
1783
+ is_free = False
1784
+
1785
+ st.markdown(f"**Model Runs** (select {min_models}+ runs)")
1786
+ for i in range(st.session_state.ensemble_num_runs):
1787
+ cols = st.columns([3, 1, 0.5])
1788
+ with cols[0]:
1789
+ default_idx = 0 if i < len(model_options) else i % len(model_options)
1790
+ selected = st.selectbox(
1791
+ f"Run {i+1}", options=model_options,
1792
+ index=default_idx, key=f"ensemble_model_{i}",
1793
+ label_visibility="collapsed"
1794
+ )
1795
+ with cols[1]:
1796
+ temp = st.number_input(
1797
+ "Temp", min_value=0.0, max_value=2.0, value=round(i * 0.25, 2),
1798
+ step=0.25, key=f"ensemble_temp_{i}", label_visibility="collapsed"
1799
+ )
1800
+ with cols[2]:
1801
+ if st.session_state.ensemble_num_runs > 3:
1802
+ if st.button("✕", key=f"ensemble_remove_{i}"):
1803
+ st.session_state.ensemble_num_runs -= 1
1804
+ st.rerun()
1805
+
1806
+ model_name = FREE_MODELS_MAP[selected] if is_free else selected
1807
+ ensemble_runs.append((model_name, temp))
1808
+
1809
+ if st.button("Add Run", key="add_ensemble_run"):
1810
+ st.session_state.ensemble_num_runs += 1
1811
+ st.rerun()
1812
+
1813
+ models_list = [r[0] for r in ensemble_runs]
1814
+ model_temperatures = {f"{r[0]}__run{i}": r[1] for i, r in enumerate(ensemble_runs)}
1815
+ api_key = "" if model_tier == "Free Models" else st.text_input("API Key", type="password", key="classify_api_key")
1816
+
1817
+ elif is_multi_model:
1818
+ # Model Comparison mode: multiselect (each model unique) + temperature row
1819
+ if model_tier == "Free Models":
1820
+ default_models = FREE_MODEL_DISPLAY_NAMES[:min_models]  # slicing is safe even when fewer options exist
1821
+ model_displays = st.multiselect(
1822
+ f"Models (select {min_models}+)",
1823
+ options=FREE_MODEL_DISPLAY_NAMES,
1824
+ default=default_models,
1825
+ key="classify_models_multi"
1826
+ )
1827
+ models_list = [FREE_MODELS_MAP[d] for d in model_displays]
1828
+ api_key = ""
1829
+ else:
1830
+ default_models = PAID_MODEL_CHOICES[:min_models]  # slicing is safe even when fewer options exist
1831
+ models_list = st.multiselect(
1832
+ f"Models (select {min_models}+)",
1833
+ options=PAID_MODEL_CHOICES,
1834
+ default=default_models,
1835
+ key="classify_models_multi_paid"
1836
+ )
1837
+ api_key = st.text_input("API Key", type="password", key="classify_api_key")
1838
+
1839
+ if models_list:
1840
+ st.markdown("**Model Temperature**")
1841
+ temp_cols = st.columns(len(models_list))
1842
+ for idx, (col, m) in enumerate(zip(temp_cols, models_list)):
1843
+ short_name = m.split('/')[-1].split(':')[0][:20]
1844
+ model_temperatures[m] = col.number_input(
1845
+ short_name,
1846
+ min_value=0.0,
1847
+ max_value=2.0,
1848
+ value=0.0,
1849
+ step=0.25,
1850
+ key=f"temp_{idx}",
1851
+ help=f"Temperature for {m} (0 = deterministic, higher = more creative)"
1852
+ )
1853
+ else:
1854
+ # Single model mode
1855
+ if model_tier == "Free Models":
1856
+ model_display = st.selectbox("Model", options=FREE_MODEL_DISPLAY_NAMES, key="classify_model")
1857
+ model = FREE_MODELS_MAP[model_display] # Convert to actual model name
1858
+ models_list = [model]
1859
+ api_key = ""
1860
+ else:
1861
+ model = st.selectbox("Model", options=PAID_MODEL_CHOICES, key="classify_model_paid")
1862
+ models_list = [model]
1863
+ api_key = st.text_input("API Key", type="password", key="classify_api_key")
1864
+
1865
+ # Ensemble-specific options
1866
+ consensus_threshold = 0.5 # Default
1867
+ if classify_mode == "Ensemble":
1868
+ consensus_options = {
1869
+ "Majority (50%+)": 0.5,
1870
+ "Two-Thirds (67%+)": 0.67,
1871
+ "Unanimous (100%)": 1.0,
1872
+ }
1873
+ consensus_choice = st.radio(
1874
+ "Consensus Rule",
1875
+ options=list(consensus_options.keys()),
1876
+ horizontal=True,
1877
+ key="consensus_choice",
1878
+ help="How many models must agree for a category to be marked present"
1879
+ )
1880
+ consensus_threshold = consensus_options[consensus_choice]
1881
+
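The selected threshold feeds a simple voting rule. As an illustration of the idea (a sketch, not the catvader internals), a category is marked present when the fraction of agreeing model runs meets the threshold:

```python
def consensus_vote(votes: list[int], threshold: float) -> int:
    """votes: one 0/1 judgment per model run for a single category."""
    agree_fraction = sum(votes) / len(votes)
    return int(agree_fraction >= threshold)

# Three model runs, two say the category is present:
print(consensus_vote([1, 1, 0], 0.5))  # majority rule  -> 1
print(consensus_vote([1, 1, 0], 1.0))  # unanimous rule -> 0
```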
1882
+ if st.button("Categorize Data", type="primary", use_container_width=True):
1883
+ if input_data is None:
1884
+ st.error("Please upload data first")
1885
+ elif not categories_entered:
1886
+ st.error("Please enter at least one category")
1887
+ elif classify_mode == "Model Comparison" and len(models_list) < 2:
1888
+ st.error("Please select at least 2 models for comparison mode")
1889
+ elif classify_mode == "Ensemble" and len(models_list) < 3:
1890
+ st.error("Please select at least 3 models for ensemble mode (needed for majority voting)")
1891
+ else:
1892
+ # Set up progress tracking
1893
+ mode = None
1894
+ if input_type_selected == "pdf":
1895
+ mode_mapping = {
1896
+ "Image (visual documents)": "image",
1897
+ "Text (text-heavy)": "text",
1898
+ "Both (comprehensive)": "both"
1899
+ }
1900
+ mode = mode_mapping.get(pdf_mode, "image")
1901
+
1902
+ # Build models tuples list
1903
+ # Uses 4-tuple (model, source, api_key, options) when per-model temperatures are set
1904
+ models_tuples = []
1905
+ api_key_error = None
1906
+ if ensemble_runs:
1907
+ # Ensemble mode: use ensemble_runs (model, temp) pairs directly
1908
+ for m, temp in ensemble_runs:
1909
+ actual_key, provider = get_api_key(m, model_tier, api_key)
1910
+ if not actual_key:
1911
+ api_key_error = f"{provider} API key not configured for {m}"
1912
+ break
1913
+ m_source = get_model_source(m)
1914
+ models_tuples.append((m, m_source, actual_key, {"creativity": temp}))
1915
+ else:
1916
+ for m in models_list:
1917
+ actual_key, provider = get_api_key(m, model_tier, api_key)
1918
+ if not actual_key:
1919
+ api_key_error = f"{provider} API key not configured for {m}"
1920
+ break
1921
+ m_source = get_model_source(m)
1922
+ temp = model_temperatures.get(m)
1923
+ if temp is not None and is_multi_model:
1924
+ models_tuples.append((m, m_source, actual_key, {"creativity": temp}))
1925
+ else:
1926
+ models_tuples.append((m, m_source, actual_key))
1927
+
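For reference, the tuples assembled above come in two shapes. A minimal sketch of both (the model name and "sk-..." key are placeholders; the `creativity` option carries the per-run temperature set earlier):

```python
# 3-tuple: model, source, API key -- library defaults apply
run_default = ("gpt-4o-mini", "OpenAI", "sk-...")

# 4-tuple: an extra options dict carries a per-run temperature
run_tempered = ("gpt-4o-mini", "OpenAI", "sk-...", {"creativity": 0.5})

for run in (run_default, run_tempered):
    model, source, key = run[:3]
    options = run[3] if len(run) == 4 else {}
    print(model, options.get("creativity", 0.0))
```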
1928
+ if api_key_error:
1929
+ st.error(api_key_error)
1930
+ else:
1931
+ items_list = input_data if isinstance(input_data, list) else [input_data]
1932
+
1933
+ # Progress UI
1934
+ progress_bar = st.progress(0)
1935
+ status_text = st.empty()
1936
+ start_time = time.time()
1937
+
1938
+ # For PDFs, use progress callback
1939
+ if input_type_selected == "pdf":
1940
+ # Progress callback for PDF page-by-page updates
1941
+ def pdf_progress_callback(current_idx, total_pages, page_label):
1942
+ progress = current_idx / total_pages if total_pages > 0 else 0
1943
+ progress_bar.progress(min(progress, 1.0))
1944
+
1945
+ elapsed = time.time() - start_time
1946
+ if current_idx > 0:
1947
+ avg_time = elapsed / current_idx
1948
+ eta_seconds = avg_time * (total_pages - current_idx)
1949
+ eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
1950
+ else:
1951
+ eta_str = ""
1952
+
1953
+ status_text.text(f"Processing page {current_idx+1} of {total_pages} ({page_label}) ({progress*100:.0f}%){eta_str}")
1954
+
1955
+ try:
1956
+ # Build kwargs for classify
1957
+ classify_kwargs = {
1958
+ "input_data": items_list,
1959
+ "categories": categories_entered,
1960
+ "models": models_tuples,
1961
+ "description": description,
1962
+ "mode": mode,
1963
+ "progress_callback": pdf_progress_callback,
1964
+ }
1965
+ # Add consensus_threshold for ensemble mode
1966
+ if classify_mode == "Ensemble":
1967
+ classify_kwargs["consensus_threshold"] = consensus_threshold
1968
+
1969
+ result_df = catvader.classify(**classify_kwargs)
1970
+
1971
+ processing_time = time.time() - start_time
1972
+ total_items = len(result_df)
1973
+ progress_bar.progress(1.0)
1974
+ status_text.text(f"Completed {total_items} pages in {processing_time:.1f}s")
1975
+
1976
+ # Replace temp paths with original filenames in pdf_input column
1977
+ if 'pdf_input' in result_df.columns:
1978
+ pdf_name_map = st.session_state.get('pdf_name_map', {})
1979
+ def replace_temp_path(val):
1980
+ if pd.isna(val):
1981
+ return val
1982
+ val_str = str(val)
1983
+ for temp_path, orig_name in pdf_name_map.items():
1984
+ # Check if the temp path's filename (without extension) is in the value
1985
+ temp_name = os.path.splitext(os.path.basename(temp_path))[0]
1986
+ if temp_name in val_str:
1987
+ return val_str.replace(temp_name, orig_name)
1988
+ return val_str
1989
+ result_df['pdf_input'] = result_df['pdf_input'].apply(replace_temp_path)
1990
+
1991
+ all_results = [result_df]
1992
+
1993
+ except Exception as e:
1994
+ st.error(f"Error: {str(e)}")
1995
+ all_results = []
1996
+
1997
+ else:
1998
+ # Non-PDF processing (text, images) - process all at once
1999
+ total_items = len(items_list)
2000
+
2001
+ # Progress callback for item-by-item updates
2002
+ def item_progress_callback(current_idx, total, item_label):
2003
+ progress = current_idx / total if total > 0 else 0
2004
+ progress_bar.progress(min(progress, 1.0))
2005
+
2006
+ elapsed = time.time() - start_time
2007
+ if current_idx > 0:
2008
+ avg_time = elapsed / current_idx
2009
+ eta_seconds = avg_time * (total - current_idx)
2010
+ eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
2011
+ else:
2012
+ eta_str = ""
2013
+
2014
+ status_text.text(f"Processing item {current_idx+1} of {total} ({progress*100:.0f}%){eta_str}")
2015
+
2016
+ try:
2017
+ # Build kwargs for classify
2018
+ classify_kwargs = {
2019
+ "input_data": items_list,
2020
+ "categories": categories_entered,
2021
+ "models": models_tuples,
2022
+ "description": description,
2023
+ "progress_callback": item_progress_callback,
2024
+ }
2025
+ # Add consensus_threshold for ensemble mode
2026
+ if classify_mode == "Ensemble":
2027
+ classify_kwargs["consensus_threshold"] = consensus_threshold
2028
+
2029
+ result_df = catvader.classify(**classify_kwargs)
2030
+ all_results = [result_df]
2031
+
2032
+ processing_time = time.time() - start_time
2033
+ progress_bar.progress(1.0)
2034
+ status_text.text(f"Completed {total_items} items in {processing_time:.1f}s")
2035
+
2036
+ except Exception as e:
2037
+ st.error(f"Error: {str(e)}")
2038
+ all_results = []
2039
+ processing_time = time.time() - start_time
2040
+
2041
+ if all_results:
2042
+ # Combine results
2043
+ result_df = pd.concat(all_results, ignore_index=True)
2044
+
2045
+ # Merge Bluesky engagement columns if available
2046
+ if st.session_state.get("bluesky_df") is not None:
2047
+ bsky_eng = st.session_state.bluesky_df.reset_index(drop=True)
2048
+ if len(bsky_eng) == len(result_df):
2049
+ for col in ["post_id", "timestamp", "likes", "replies", "reposts",
2050
+ "media_type", "image_url", "post_length",
2051
+ "contains_url", "contains_image", "is_repost"]:
2052
+ if col in bsky_eng.columns:
2053
+ result_df[col] = bsky_eng[col].values
2054
+
2055
+ # Save CSV
2056
+ with tempfile.NamedTemporaryFile(mode='w', suffix='_classified.csv', delete=False) as f:
2057
+ result_df.to_csv(f, index=False)  # write via the open handle (reopening f.name fails on Windows)
2058
+ csv_path = f.name
2059
+
2060
+ # Calculate success rate
2061
+ if 'processing_status' in result_df.columns:
2062
+ success_count = (result_df['processing_status'] == 'success').sum()
2063
+ success_rate = (success_count / len(result_df)) * 100
2064
+ else:
2065
+ success_rate = 100.0
2066
+
2067
+ # Get version info
2068
+ try:
2069
+ catvader_version = catvader.__version__
2070
+ except AttributeError:
2071
+ catvader_version = "unknown"
2072
+ python_version = sys.version.split()[0]
2073
+
2074
+ # For reports: create model string (single or list)
2075
+ if len(models_list) == 1:
2076
+ report_model = models_list[0]
2077
+ report_model_source = models_tuples[0][1]
2078
+ else:
2079
+ report_model = ", ".join(models_list)
2080
+ report_model_source = f"{classify_mode} ({len(models_list)} models)"
2081
+
2082
+ # Generate code first so we can include it in the PDF
2083
+ # If categories were auto-extracted, include both extract and classify code
2084
+ if st.session_state.extraction_params:
2085
+ classify_params = {
2086
+ 'model': report_model,
2087
+ 'description': description,
2088
+ 'mode': mode,
2089
+ 'classify_mode': classify_mode,
2090
+ 'models_list': models_list,
2091
+ 'consensus_threshold': consensus_threshold,
2092
+ 'model_temperatures': model_temperatures,
2093
+ 'ensemble_runs': ensemble_runs if ensemble_runs else None,
2094
+ }
2095
+ code = generate_full_code(st.session_state.extraction_params, classify_params)
2096
+ else:
2097
+ code = generate_classify_code(
2098
+ input_type_selected, description, categories_entered,
2099
+ report_model, report_model_source, mode,
2100
+ classify_mode=classify_mode, models_list=models_list,
2101
+ consensus_threshold=consensus_threshold,
2102
+ model_temperatures=model_temperatures,
2103
+ ensemble_runs=ensemble_runs if ensemble_runs else None,
2104
+ )
2105
+
2106
+ # Generate methodology report with code included
2107
+ pdf_path = generate_methodology_report_pdf(
2108
+ categories=categories_entered,
2109
+ model=report_model,
2110
+ column_name=description,
2111
+ num_rows=len(result_df),
2112
+ model_source=report_model_source,
2113
+ filename=original_filename,
2114
+ success_rate=success_rate,
2115
+ result_df=result_df,
2116
+ processing_time=processing_time,
2117
+ catvader_version=catvader_version,
2118
+ python_version=python_version,
2119
+ task_type="assign",
2120
+ input_type=input_type_selected,
2121
+ description=description,
2122
+ classify_mode=classify_mode,
2123
+ models_list=models_list,
2124
+ code=code,
2125
+ consensus_threshold=consensus_threshold if classify_mode == "Ensemble" else None,
2126
+ )
2127
+
2128
+ st.session_state.results = {
2129
+ 'df': result_df,
2130
+ 'csv_path': csv_path,
2131
+ 'pdf_path': pdf_path,
2132
+ 'code': code,
2133
+ 'status': f"Classified {len(result_df)} items in {processing_time:.1f}s",
2134
+ 'categories': categories_entered,
2135
+ 'classify_mode': classify_mode,
2136
+ 'models_list': models_list,
2137
+ 'model_temperatures': model_temperatures,
2138
+ 'ensemble_runs': ensemble_runs if ensemble_runs else None,
2139
+ }
2140
+ st.success(f"Classified {len(result_df)} items in {processing_time:.1f}s")
2141
+ st.rerun()
2142
+ else:
2143
+ st.error("No items were successfully classified")
2144
+
2145
+ with col_output:
2146
+ st.markdown("### Results")
2147
+
2148
+ if st.session_state.results:
2149
+ results = st.session_state.results
2150
+
2151
+ # Visualization selector
2152
+ viz_type = st.selectbox(
2153
+ "Visualization",
2154
+ options=["Category Distribution", "Classification Matrix"],
2155
+ key="viz_type",
2156
+ help="Distribution shows category percentages. Matrix shows each response's classifications."
2157
+ )
2158
+
2159
+ if viz_type == "Category Distribution":
2160
+ fig = create_distribution_chart(
2161
+ results['df'],
2162
+ results['categories'],
2163
+ classify_mode=results.get('classify_mode', 'Single Model'),
2164
+ models_list=results.get('models_list', [])
2165
+ )
2166
+ st.pyplot(fig)
2167
+ st.caption("Note: Categories are not mutually exclusive—each item can belong to multiple categories.")
2168
+ else:
2169
+ fig = create_classification_heatmap(
2170
+ results['df'],
2171
+ results['categories'],
2172
+ classify_mode=results.get('classify_mode', 'Single Model'),
2173
+ models_list=results.get('models_list', [])
2174
+ )
2175
+ st.pyplot(fig)
2176
+ st.caption("Green = category present, Black = not present. Each row is one response.")
2177
+
2178
+ # Results dataframe (hide technical columns from display)
2179
+ display_df = results['df'].copy()
2180
+ cols_to_hide = ['model_response', 'json', 'raw_response', 'raw_json']
2181
+ display_df = display_df.drop(columns=[c for c in cols_to_hide if c in display_df.columns])
2182
+ st.dataframe(display_df, use_container_width=True)
2183
+
2184
+ # Downloads
2185
+ col_dl1, col_dl2, col_dl3 = st.columns(3)
2186
+ with col_dl1:
2187
+ with open(results['csv_path'], 'rb') as f:
2188
+ st.download_button(
2189
+ "Download CSV",
2190
+ data=f,
2191
+ file_name="classified_results.csv",
2192
+ mime="text/csv"
2193
+ )
2194
+ with col_dl2:
2195
+ with open(results['pdf_path'], 'rb') as f:
2196
+ st.download_button(
2197
+ "Download Report",
2198
+ data=f,
2199
+ file_name="methodology_report.pdf",
2200
+ mime="application/pdf"
2201
+ )
2202
+ with col_dl3:
2203
+ # Generate both plots and save to a single PDF
2204
+ import io
2205
+ from matplotlib.backends.backend_pdf import PdfPages
2206
+
2207
+ plot_buffer = io.BytesIO()
2208
+ with PdfPages(plot_buffer) as pdf:
2209
+ # Distribution chart
2210
+ fig1 = create_distribution_chart(
2211
+ results['df'],
2212
+ results['categories'],
2213
+ classify_mode=results.get('classify_mode', 'Single Model'),
2214
+ models_list=results.get('models_list', [])
2215
+ )
2216
+ pdf.savefig(fig1, bbox_inches='tight')
2217
+ plt.close(fig1)
2218
+
2219
+ # Classification matrix
2220
+ fig2 = create_classification_heatmap(
2221
+ results['df'],
2222
+ results['categories'],
2223
+ classify_mode=results.get('classify_mode', 'Single Model'),
2224
+ models_list=results.get('models_list', [])
2225
+ )
2226
+ pdf.savefig(fig2, bbox_inches='tight')
2227
+ plt.close(fig2)
2228
+
2229
+ plot_buffer.seek(0)
2230
+ st.download_button(
2231
+ "Download Plots",
2232
+ data=plot_buffer,
2233
+ file_name="classification_plots.pdf",
2234
+ mime="application/pdf"
2235
+ )
2236
+
2237
+ # Code
2238
+ with st.expander("See the Code"):
2239
+ st.code(results['code'], language='python')
2240
+ else:
2241
+ st.info("Upload data, select categories, and click 'Categorize Data' to see results here.")
2242
+
2243
+ # Bottom buttons
2244
+ col_reset, col_code = st.columns(2)
2245
+ with col_reset:
2246
+ if st.button("Reset", type="secondary", use_container_width=True):
2247
+ st.session_state.categories = [''] * MAX_CATEGORIES
2248
+ st.session_state.category_count = INITIAL_CATEGORIES
2249
+ st.session_state.task_mode = None
2250
+ st.session_state.extracted_categories = None
2251
+ st.session_state.extraction_params = None
2252
+ st.session_state.results = None
2253
+ if hasattr(st.session_state, 'example_loaded'):
2254
+ del st.session_state.example_loaded
2255
+ st.rerun()
2256
+
2257
+ with col_code:
2258
+ if st.button("See in Code", use_container_width=True):
2259
+ st.session_state.show_code_modal = True
2260
+
2261
+ # Code modal/dialog
2262
+ if st.session_state.get('show_code_modal'):
2263
+ st.markdown("---")
2264
+ st.markdown("### Reproducibility Code")
2265
+ st.markdown("Use this code to reproduce the classification with the CatVader Python package:")
2266
+
2267
+ # Use results code if available, otherwise generate from current parameters
2268
+ if st.session_state.results:
2269
+ code_to_show = st.session_state.results['code']
2270
+ else:
2271
+ # Get current categories from session state
2272
+ current_categories = [c for c in st.session_state.categories[:st.session_state.category_count] if c.strip()]
2273
+
2274
+ # Determine current input type and description
2275
+ input_type_map = {"Social Media Posts": "text", "PDF Documents": "pdf", "Images": "image"}
2276
+ current_input_type = input_type_map.get(st.session_state.get('input_type_radio', 'Social Media Posts'), 'text')
2277
+ current_description = st.session_state.get('survey_column', '') or st.session_state.get('pdf_desc', '') or st.session_state.get('image_desc', '') or 'your_data'
2278
+
2279
+ # Get current classification mode and models
2280
+ current_classify_mode = st.session_state.get('classify_mode', 'Single Model')
2281
+ current_model_tier = st.session_state.get('classify_model_tier', 'Free Models')
2282
+
2283
+ if current_classify_mode in ["Model Comparison", "Ensemble"]:
2284
+ # Multi-model mode
2285
+ if current_model_tier == 'Free Models':
2286
+ model_displays = st.session_state.get('classify_models_multi', [])
2287
+ current_models_list = [FREE_MODELS_MAP.get(d, d) for d in model_displays]
2288
+ else:
2289
+ current_models_list = st.session_state.get('classify_models_multi_paid', [])
2290
+ current_model = ", ".join(current_models_list) if current_models_list else "gpt-4o-mini"
2291
+ current_model_source = f"{current_classify_mode} ({len(current_models_list)} models)"
2292
+ else:
2293
+ # Single model mode
2294
+ if current_model_tier == 'Free Models':
2295
+ model_display = st.session_state.get('classify_model', 'GPT-4o Mini')
2296
+ current_model = FREE_MODELS_MAP.get(model_display, 'gpt-4o-mini')
2297
+ else:
2298
+ current_model = st.session_state.get('classify_model_paid', 'gpt-4o-mini')
2299
+ current_models_list = [current_model]
2300
+ current_model_source = get_model_source(current_model)
2301
+
2302
+ # Get consensus threshold for ensemble mode
2303
+ consensus_options = {"Majority (50%+)": 0.5, "Two-Thirds (67%+)": 0.67, "Unanimous (100%)": 1.0}
2304
+ current_consensus = consensus_options.get(st.session_state.get('consensus_choice', 'Majority (50%+)'), 0.5)
2305
+
2306
+ # Get PDF mode if applicable
2307
+ current_mode = None
2308
+ if current_input_type == "pdf":
2309
+ mode_mapping = {
2310
+ "Image (visual documents)": "image",
2311
+ "Text (text-heavy)": "text",
2312
+ "Both (comprehensive)": "both"
2313
+ }
2314
+ current_mode = mode_mapping.get(st.session_state.get('pdf_mode', 'Image (visual documents)'), 'image')
2315
+
2316
+ if current_categories:
2317
+ # Check if categories were auto-extracted
2318
+ if st.session_state.extraction_params:
2319
+ current_temperatures = {}  # no completed run in this branch to reuse temperatures from
2320
+ classify_params = {
2321
+ 'model': current_model,
2322
+ 'description': current_description,
2323
+ 'mode': current_mode,
2324
+ 'classify_mode': current_classify_mode,
2325
+ 'models_list': current_models_list,
2326
+ 'consensus_threshold': current_consensus,
2327
+ 'model_temperatures': current_temperatures,
2328
+ 'ensemble_runs': None,  # no completed run in this branch to reuse
2329
+ }
2330
+ code_to_show = generate_full_code(st.session_state.extraction_params, classify_params)
2331
+ else:
2332
+ current_temperatures = {}  # no completed run in this branch to reuse temperatures from
2333
+ code_to_show = generate_classify_code(
2334
+ current_input_type, current_description, current_categories,
2335
+ current_model, current_model_source, current_mode,
2336
+ classify_mode=current_classify_mode, models_list=current_models_list,
2337
+ consensus_threshold=current_consensus,
2338
+ model_temperatures=current_temperatures,
2339
+ ensemble_runs=None,  # no completed run in this branch to reuse
2340
+ )
2341
+ else:
2342
+ code_to_show = '''import catvader
+ import pandas as pd
+
+ # Load the data you want to classify
+ df = pd.read_csv("your_data.csv")
+
2344
+ # Define your categories
2345
+ categories = [
2346
+ "Category 1",
2347
+ "Category 2",
2348
+ # Add more categories...
2349
+ ]
2350
+
2351
+ # Classify your data
2352
+ result = catvader.classify(
2353
+ input_data=df["your_column"].tolist(),
2354
+ categories=categories,
2355
+ api_key="YOUR_API_KEY",
2356
+ description="your_description",
2357
+ user_model="gpt-4o-mini"
2358
+ )
2359
+
2360
+ # View results
2361
+ print(result)
2362
+ result.to_csv("classified_results.csv", index=False)
2363
+ '''
2364
+
2365
+ st.code(code_to_show, language='python')
2366
+ if st.button("Close"):
2367
+ st.session_state.show_code_modal = False
2368
+ st.rerun()