chrissoria committed on
Commit
5ca7833
·
1 Parent(s): 4a5bb08

Add reproducibility code page to PDF codebook

Browse files
Files changed (1) hide show
  1. app.py +51 -3
app.py CHANGED
@@ -58,12 +58,12 @@ def is_free_model(model, model_tier):
58
  return model_tier == "Free Models"
59
 
60
 
61
- def generate_codebook_pdf(categories, model, column_name, num_rows):
62
  """Generate a PDF codebook explaining the output columns."""
63
  from reportlab.lib.pagesizes import letter
64
  from reportlab.lib import colors
65
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
66
- from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
67
 
68
  # Create temp file for PDF
69
  pdf_file = tempfile.NamedTemporaryFile(mode='wb', suffix='_codebook.pdf', delete=False)
@@ -146,6 +146,51 @@ def generate_codebook_pdf(categories, model, column_name, num_rows):
146
  story.append(Spacer(1, 5))
147
  story.append(Paragraph("Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. https://github.com/chrissoria/cat-llm", normal_style))
148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  doc.build(story)
150
  return pdf_file.name
151
 
@@ -278,8 +323,11 @@ def classify_data(spreadsheet_file, spreadsheet_column,
278
  result.to_csv(f.name, index=False)
279
  csv_path = f.name
280
 
 
 
 
281
  # Generate PDF codebook
282
- pdf_path = generate_codebook_pdf(categories, actual_model, spreadsheet_column, len(input_data))
283
 
284
  return result, [csv_path, pdf_path], f"**Success!** Classified {len(input_data)} responses"
285
 
 
58
  return model_tier == "Free Models"
59
 
60
 
61
+ def generate_codebook_pdf(categories, model, column_name, num_rows, model_source, filename):
62
  """Generate a PDF codebook explaining the output columns."""
63
  from reportlab.lib.pagesizes import letter
64
  from reportlab.lib import colors
65
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
66
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak, Preformatted
67
 
68
  # Create temp file for PDF
69
  pdf_file = tempfile.NamedTemporaryFile(mode='wb', suffix='_codebook.pdf', delete=False)
 
146
  story.append(Spacer(1, 5))
147
  story.append(Paragraph("Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. https://github.com/chrissoria/cat-llm", normal_style))
148
 
149
+ # Page break for reproducibility code
150
+ story.append(PageBreak())
151
+ story.append(Paragraph("Reproducibility Code", title_style))
152
+ story.append(Paragraph("Use the following Python code to reproduce this classification:", normal_style))
153
+ story.append(Spacer(1, 15))
154
+
155
+ # Build categories list string
156
+ categories_str = ", ".join([f'"{cat}"' for cat in categories])
157
+
158
+ code_text = f'''import catllm
159
+ import pandas as pd
160
+
161
+ # Load your survey data
162
+ df = pd.read_csv("{filename}")
163
+
164
+ # Define your categories
165
+ categories = [{categories_str}]
166
+
167
+ # Classify the responses
168
+ result = catllm.multi_class(
169
+ survey_input=df["{column_name}"].tolist(),
170
+ categories=categories,
171
+ api_key="YOUR_API_KEY",
172
+ user_model="{model}",
173
+ model_source="{model_source}"
174
+ )
175
+
176
+ # View results
177
+ print(result)
178
+
179
+ # Save to CSV
180
+ result.to_csv("classified_results.csv", index=False)'''
181
+
182
+ # Use a monospace style for code
183
+ code_style = ParagraphStyle('Code', parent=styles['Normal'], fontName='Courier', fontSize=9, leftIndent=20, spaceAfter=10)
184
+
185
+ # Split code into lines and add each as a paragraph
186
+ for line in code_text.split('\n'):
187
+ if line.strip() == '':
188
+ story.append(Spacer(1, 5))
189
+ else:
190
+ # Escape special characters for PDF
191
+ escaped_line = line.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
192
+ story.append(Paragraph(escaped_line, code_style))
193
+
194
  doc.build(story)
195
  return pdf_file.name
196
 
 
323
  result.to_csv(f.name, index=False)
324
  csv_path = f.name
325
 
326
+ # Get original filename for codebook
327
+ original_filename = file_path.split("/")[-1]
328
+
329
  # Generate PDF codebook
330
+ pdf_path = generate_codebook_pdf(categories, actual_model, spreadsheet_column, len(input_data), model_source, original_filename)
331
 
332
  return result, [csv_path, pdf_path], f"**Success!** Classified {len(input_data)} responses"
333