Spaces:
Running
Running
Commit
·
5ca7833
1
Parent(s):
4a5bb08
Add reproducibility code page to PDF codebook
Browse files
app.py
CHANGED
|
@@ -58,12 +58,12 @@ def is_free_model(model, model_tier):
|
|
| 58 |
return model_tier == "Free Models"
|
| 59 |
|
| 60 |
|
| 61 |
-
def generate_codebook_pdf(categories, model, column_name, num_rows):
|
| 62 |
"""Generate a PDF codebook explaining the output columns."""
|
| 63 |
from reportlab.lib.pagesizes import letter
|
| 64 |
from reportlab.lib import colors
|
| 65 |
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
| 66 |
-
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
|
| 67 |
|
| 68 |
# Create temp file for PDF
|
| 69 |
pdf_file = tempfile.NamedTemporaryFile(mode='wb', suffix='_codebook.pdf', delete=False)
|
|
@@ -146,6 +146,51 @@ def generate_codebook_pdf(categories, model, column_name, num_rows):
|
|
| 146 |
story.append(Spacer(1, 5))
|
| 147 |
story.append(Paragraph("Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. https://github.com/chrissoria/cat-llm", normal_style))
|
| 148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
doc.build(story)
|
| 150 |
return pdf_file.name
|
| 151 |
|
|
@@ -278,8 +323,11 @@ def classify_data(spreadsheet_file, spreadsheet_column,
|
|
| 278 |
result.to_csv(f.name, index=False)
|
| 279 |
csv_path = f.name
|
| 280 |
|
|
|
|
|
|
|
|
|
|
| 281 |
# Generate PDF codebook
|
| 282 |
-
pdf_path = generate_codebook_pdf(categories, actual_model, spreadsheet_column, len(input_data))
|
| 283 |
|
| 284 |
return result, [csv_path, pdf_path], f"**Success!** Classified {len(input_data)} responses"
|
| 285 |
|
|
|
|
| 58 |
return model_tier == "Free Models"
|
| 59 |
|
| 60 |
|
| 61 |
+
def generate_codebook_pdf(categories, model, column_name, num_rows, model_source, filename):
|
| 62 |
"""Generate a PDF codebook explaining the output columns."""
|
| 63 |
from reportlab.lib.pagesizes import letter
|
| 64 |
from reportlab.lib import colors
|
| 65 |
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
| 66 |
+
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak, Preformatted
|
| 67 |
|
| 68 |
# Create temp file for PDF
|
| 69 |
pdf_file = tempfile.NamedTemporaryFile(mode='wb', suffix='_codebook.pdf', delete=False)
|
|
|
|
| 146 |
story.append(Spacer(1, 5))
|
| 147 |
story.append(Paragraph("Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. https://github.com/chrissoria/cat-llm", normal_style))
|
| 148 |
|
| 149 |
+
# Page break for reproducibility code
|
| 150 |
+
story.append(PageBreak())
|
| 151 |
+
story.append(Paragraph("Reproducibility Code", title_style))
|
| 152 |
+
story.append(Paragraph("Use the following Python code to reproduce this classification:", normal_style))
|
| 153 |
+
story.append(Spacer(1, 15))
|
| 154 |
+
|
| 155 |
+
# Build categories list string
|
| 156 |
+
categories_str = ", ".join([f'"{cat}"' for cat in categories])
|
| 157 |
+
|
| 158 |
+
code_text = f'''import catllm
|
| 159 |
+
import pandas as pd
|
| 160 |
+
|
| 161 |
+
# Load your survey data
|
| 162 |
+
df = pd.read_csv("{filename}")
|
| 163 |
+
|
| 164 |
+
# Define your categories
|
| 165 |
+
categories = [{categories_str}]
|
| 166 |
+
|
| 167 |
+
# Classify the responses
|
| 168 |
+
result = catllm.multi_class(
|
| 169 |
+
survey_input=df["{column_name}"].tolist(),
|
| 170 |
+
categories=categories,
|
| 171 |
+
api_key="YOUR_API_KEY",
|
| 172 |
+
user_model="{model}",
|
| 173 |
+
model_source="{model_source}"
|
| 174 |
+
)
|
| 175 |
+
|
| 176 |
+
# View results
|
| 177 |
+
print(result)
|
| 178 |
+
|
| 179 |
+
# Save to CSV
|
| 180 |
+
result.to_csv("classified_results.csv", index=False)'''
|
| 181 |
+
|
| 182 |
+
# Use a monospace style for code
|
| 183 |
+
code_style = ParagraphStyle('Code', parent=styles['Normal'], fontName='Courier', fontSize=9, leftIndent=20, spaceAfter=10)
|
| 184 |
+
|
| 185 |
+
# Split code into lines and add each as a paragraph
|
| 186 |
+
for line in code_text.split('\n'):
|
| 187 |
+
if line.strip() == '':
|
| 188 |
+
story.append(Spacer(1, 5))
|
| 189 |
+
else:
|
| 190 |
+
# Escape special characters for PDF
|
| 191 |
+
escaped_line = line.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
|
| 192 |
+
story.append(Paragraph(escaped_line, code_style))
|
| 193 |
+
|
| 194 |
doc.build(story)
|
| 195 |
return pdf_file.name
|
| 196 |
|
|
|
|
| 323 |
result.to_csv(f.name, index=False)
|
| 324 |
csv_path = f.name
|
| 325 |
|
| 326 |
+
# Get original filename for codebook
|
| 327 |
+
original_filename = file_path.split("/")[-1]
|
| 328 |
+
|
| 329 |
# Generate PDF codebook
|
| 330 |
+
pdf_path = generate_codebook_pdf(categories, actual_model, spreadsheet_column, len(input_data), model_source, original_filename)
|
| 331 |
|
| 332 |
return result, [csv_path, pdf_path], f"**Success!** Classified {len(input_data)} responses"
|
| 333 |
|