Spaces:

CatLLM
/

survey-classifier

Running

App Files Files Community

chrissoria committed 12 days ago

Commit

c05b50f

1 Parent(s): d54c66c

Add PDF codebook generation with category mappings

Browse files

Files changed (2) hide show

app.py +100 -4
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ import gradio as gr
 import pandas as pd
 import tempfile
 import os
 # Import catllm
 try:
@@ -57,6 +58,98 @@ def is_free_model(model, model_tier):
     return model_tier == "Free Models"
 def get_model_source(model):
     """Auto-detect model source. All HF router models (novita, groq, etc) use 'huggingface'."""
     model_lower = model.lower()
@@ -180,12 +273,15 @@ def classify_data(spreadsheet_file, spreadsheet_column,
             model_source=model_source
         )
-        # Save for download
         with tempfile.NamedTemporaryFile(mode='w', suffix='_classified.csv', delete=False) as f:
             result.to_csv(f.name, index=False)
-            download_path = f.name
-        return result, download_path, f"**Success!** Classified {len(input_data)} responses"
     except Exception as e:
         return None, None, f"**Error:** {str(e)}"
@@ -358,7 +454,7 @@ https://github.com/chrissoria/cat-llm
         with gr.Column():
             status = gr.Markdown("Ready to classify")
             results = gr.DataFrame(label="Classification Results")
-            download_file = gr.File(label="Download Results")
             code_output = gr.Code(
                 label="Python Code",
                 language="python",

 import pandas as pd
 import tempfile
 import os
+from datetime import datetime
 # Import catllm
 try:
     return model_tier == "Free Models"
+def generate_codebook_pdf(categories, model, column_name, num_rows):
+    """Generate a PDF codebook explaining the output columns."""
+    from reportlab.lib.pagesizes import letter
+    from reportlab.lib import colors
+    from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
+    # Create temp file for PDF
+    pdf_file = tempfile.NamedTemporaryFile(mode='wb', suffix='_codebook.pdf', delete=False)
+    doc = SimpleDocTemplate(pdf_file.name, pagesize=letter)
+    styles = getSampleStyleSheet()
+    # Custom styles
+    title_style = ParagraphStyle('Title', parent=styles['Heading1'], fontSize=18, spaceAfter=20)
+    heading_style = ParagraphStyle('Heading', parent=styles['Heading2'], fontSize=14, spaceAfter=10, spaceBefore=15)
+    normal_style = styles['Normal']
+    story = []
+    # Title
+    story.append(Paragraph("CatLLM Classification Codebook", title_style))
+    story.append(Paragraph(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", normal_style))
+    story.append(Spacer(1, 20))
+    # Classification summary
+    story.append(Paragraph("Classification Summary", heading_style))
+    summary_data = [
+        ["Source Column", column_name],
+        ["Model Used", model],
+        ["Rows Classified", str(num_rows)],
+        ["Number of Categories", str(len(categories))],
+    ]
+    summary_table = Table(summary_data, colWidths=[150, 300])
+    summary_table.setStyle(TableStyle([
+        ('BACKGROUND', (0, 0), (0, -1), colors.lightgrey),
+        ('GRID', (0, 0), (-1, -1), 1, colors.black),
+        ('PADDING', (0, 0), (-1, -1), 8),
+    ]))
+    story.append(summary_table)
+    story.append(Spacer(1, 20))
+    # Category mapping
+    story.append(Paragraph("Category Mapping", heading_style))
+    story.append(Paragraph("Each category column contains binary values: 1 = present, 0 = not present", normal_style))
+    story.append(Spacer(1, 10))
+    category_data = [["Column Name", "Category Description"]]
+    for i, cat in enumerate(categories, 1):
+        category_data.append([f"category_{i}", cat])
+    cat_table = Table(category_data, colWidths=[120, 330])
+    cat_table.setStyle(TableStyle([
+        ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
+        ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
+        ('GRID', (0, 0), (-1, -1), 1, colors.black),
+        ('PADDING', (0, 0), (-1, -1), 8),
+        ('BACKGROUND', (0, 1), (0, -1), colors.lightgrey),
+    ]))
+    story.append(cat_table)
+    story.append(Spacer(1, 20))
+    # Other columns
+    story.append(Paragraph("Other Output Columns", heading_style))
+    other_cols = [
+        ["Column Name", "Description"],
+        ["survey_input", "The original text that was classified"],
+        ["model_response", "Raw response from the LLM"],
+        ["json", "Extracted JSON with category assignments"],
+        ["processing_status", "'success' if classification worked, 'error' if it failed"],
+        ["categories_id", "Comma-separated list of category numbers that were assigned"],
+    ]
+    other_table = Table(other_cols, colWidths=[120, 330])
+    other_table.setStyle(TableStyle([
+        ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
+        ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
+        ('GRID', (0, 0), (-1, -1), 1, colors.black),
+        ('PADDING', (0, 0), (-1, -1), 8),
+        ('BACKGROUND', (0, 1), (0, -1), colors.lightgrey),
+    ]))
+    story.append(other_table)
+    story.append(Spacer(1, 20))
+    # Citation
+    story.append(Paragraph("Citation", heading_style))
+    story.append(Paragraph("If you use CatLLM in your research, please cite:", normal_style))
+    story.append(Spacer(1, 5))
+    story.append(Paragraph("Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. https://github.com/chrissoria/cat-llm", normal_style))
+    doc.build(story)
+    return pdf_file.name
 def get_model_source(model):
     """Auto-detect model source. All HF router models (novita, groq, etc) use 'huggingface'."""
     model_lower = model.lower()
             model_source=model_source
         )
+        # Save CSV for download
         with tempfile.NamedTemporaryFile(mode='w', suffix='_classified.csv', delete=False) as f:
             result.to_csv(f.name, index=False)
+            csv_path = f.name
+        # Generate PDF codebook
+        pdf_path = generate_codebook_pdf(categories, actual_model, spreadsheet_column, len(input_data))
+        return result, [csv_path, pdf_path], f"**Success!** Classified {len(input_data)} responses"
     except Exception as e:
         return None, None, f"**Error:** {str(e)}"
         with gr.Column():
             status = gr.Markdown("Ready to classify")
             results = gr.DataFrame(label="Classification Results")
+            download_file = gr.File(label="Download Results (CSV + Codebook PDF)", file_count="multiple")
             code_output = gr.Code(
                 label="Python Code",
                 language="python",

requirements.txt CHANGED Viewed

@@ -6,3 +6,4 @@ pandas
 openpyxl
 requests
 regex

 openpyxl
 requests
 regex
+reportlab