chrissoria Claude committed on
Commit
66b262b
·
1 Parent(s): 47603f7

Enhance PDF codebook with comprehensive researcher documentation

Browse files

- Add Sample Results table (first 5 rows) to PDF Page 1
- Add Category Distribution page (Page 2) with counts/percentages
- Expand Classification Summary (Page 3) with processing time, data quality notes, version info
- Add Prompt Template page (Page 4) showing exact prompts sent to LLM
- Update app UI to show Category Distribution as main output, with sample results below
- Add timing capture and display elapsed processing time in status message
- Include CatLLM version and Python version in codebook

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (2) hide show
  1. __pycache__/app.cpython-311.pyc +0 -0
  2. app.py +299 -42
__pycache__/app.cpython-311.pyc CHANGED
Binary files a/__pycache__/app.cpython-311.pyc and b/__pycache__/app.cpython-311.pyc differ
 
app.py CHANGED
@@ -6,6 +6,8 @@ import gradio as gr
6
  import pandas as pd
7
  import tempfile
8
  import os
 
 
9
  from datetime import datetime
10
 
11
  # Import catllm
@@ -58,12 +60,14 @@ def is_free_model(model, model_tier):
58
  return model_tier == "Free Models"
59
 
60
 
61
- def generate_codebook_pdf(categories, model, column_name, num_rows, model_source, filename, success_rate):
62
- """Generate a PDF codebook explaining the output columns."""
 
 
63
  from reportlab.lib.pagesizes import letter
64
  from reportlab.lib import colors
65
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
66
- from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak, Preformatted
67
 
68
  # Create temp file for PDF
69
  pdf_file = tempfile.NamedTemporaryFile(mode='wb', suffix='_codebook.pdf', delete=False)
@@ -74,18 +78,19 @@ def generate_codebook_pdf(categories, model, column_name, num_rows, model_source
74
  title_style = ParagraphStyle('Title', parent=styles['Heading1'], fontSize=18, spaceAfter=20)
75
  heading_style = ParagraphStyle('Heading', parent=styles['Heading2'], fontSize=14, spaceAfter=10, spaceBefore=15)
76
  normal_style = styles['Normal']
 
77
 
78
  story = []
79
 
80
- # === PAGE 1: Title and Category Mapping ===
81
  story.append(Paragraph("CatLLM Classification Codebook", title_style))
82
  story.append(Paragraph(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", normal_style))
83
- story.append(Spacer(1, 20))
84
 
85
  # Category mapping
86
  story.append(Paragraph("Category Mapping", heading_style))
87
  story.append(Paragraph("Each category column contains binary values: 1 = present, 0 = not present", normal_style))
88
- story.append(Spacer(1, 10))
89
 
90
  category_data = [["Column Name", "Category Description"]]
91
  for i, cat in enumerate(categories, 1):
@@ -96,11 +101,46 @@ def generate_codebook_pdf(categories, model, column_name, num_rows, model_source
96
  ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
97
  ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
98
  ('GRID', (0, 0), (-1, -1), 1, colors.black),
99
- ('PADDING', (0, 0), (-1, -1), 8),
100
  ('BACKGROUND', (0, 1), (0, -1), colors.lightgrey),
 
101
  ]))
102
  story.append(cat_table)
103
- story.append(Spacer(1, 20))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
  # Other columns
106
  story.append(Paragraph("Other Output Columns", heading_style))
@@ -109,35 +149,72 @@ def generate_codebook_pdf(categories, model, column_name, num_rows, model_source
109
  ["survey_input", "The original text that was classified"],
110
  ["model_response", "Raw response from the LLM"],
111
  ["json", "Extracted JSON with category assignments"],
112
- ["processing_status", "'success' if classification worked, 'error' if it failed"],
113
- ["categories_id", "Comma-separated list of category numbers that were assigned"],
114
  ]
115
  other_table = Table(other_cols, colWidths=[120, 330])
116
  other_table.setStyle(TableStyle([
117
  ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
118
  ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
119
  ('GRID', (0, 0), (-1, -1), 1, colors.black),
120
- ('PADDING', (0, 0), (-1, -1), 8),
121
  ('BACKGROUND', (0, 1), (0, -1), colors.lightgrey),
 
122
  ]))
123
  story.append(other_table)
124
- story.append(Spacer(1, 20))
125
 
126
- # Citation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  story.append(Paragraph("Citation", heading_style))
128
  story.append(Paragraph("If you use CatLLM in your research, please cite:", normal_style))
129
  story.append(Spacer(1, 5))
130
  story.append(Paragraph("Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. https://github.com/chrissoria/cat-llm", normal_style))
131
 
132
- # === PAGE 2: Classification Summary ===
133
  story.append(PageBreak())
134
  story.append(Paragraph("Classification Summary", title_style))
135
- story.append(Spacer(1, 20))
136
 
 
 
137
  summary_data = [
 
138
  ["Source Column", column_name],
139
  ["Model Used", model],
140
  ["Model Source", model_source],
 
141
  ["Rows Classified", str(num_rows)],
142
  ["Number of Categories", str(len(categories))],
143
  ["Success Rate", f"{success_rate:.2f}%"],
@@ -146,11 +223,107 @@ def generate_codebook_pdf(categories, model, column_name, num_rows, model_source
146
  summary_table.setStyle(TableStyle([
147
  ('BACKGROUND', (0, 0), (0, -1), colors.lightgrey),
148
  ('GRID', (0, 0), (-1, -1), 1, colors.black),
149
- ('PADDING', (0, 0), (-1, -1), 8),
 
150
  ]))
151
  story.append(summary_table)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
 
153
- # === PAGE 3: Reproducibility Code ===
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  story.append(PageBreak())
155
  story.append(Paragraph("Reproducibility Code", title_style))
156
  story.append(Paragraph("Use the following Python code to reproduce this classification:", normal_style))
@@ -183,9 +356,6 @@ print(result)
183
  # Save to CSV
184
  result.to_csv("classified_results.csv", index=False)'''
185
 
186
- # Use a monospace style for code
187
- code_style = ParagraphStyle('Code', parent=styles['Normal'], fontName='Courier', fontSize=9, leftIndent=20, spaceAfter=10)
188
-
189
  # Split code into lines and add each as a paragraph
190
  for line in code_text.split('\n'):
191
  if line.strip() == '':
@@ -239,15 +409,15 @@ def load_columns(file):
239
  def classify_data(spreadsheet_file, spreadsheet_column,
240
  cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10,
241
  model_tier, model, model_source_input, api_key_input):
242
- """Main classification function."""
243
  if not CATLLM_AVAILABLE:
244
- return None, None, "**Error:** catllm package not available"
245
 
246
  all_cats = [cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10]
247
  categories = [c.strip() for c in all_cats if c and c.strip()]
248
 
249
  if not categories:
250
- return None, None, "**Error:** Please enter at least one category"
251
 
252
  actual_model = model
253
 
@@ -257,31 +427,31 @@ def classify_data(spreadsheet_file, spreadsheet_column,
257
  if model in HF_ROUTED_MODELS:
258
  actual_api_key = os.environ.get("HF_API_KEY", "")
259
  if not actual_api_key:
260
- return None, None, "**Error:** HuggingFace API key not configured in Space secrets"
261
  elif "gpt" in model.lower():
262
  actual_api_key = os.environ.get("OPENAI_API_KEY", "")
263
  if not actual_api_key:
264
- return None, None, "**Error:** OpenAI API key not configured in Space secrets"
265
  elif "gemini" in model.lower():
266
  actual_api_key = os.environ.get("GOOGLE_API_KEY", "")
267
  if not actual_api_key:
268
- return None, None, "**Error:** Google API key not configured in Space secrets"
269
  elif "mistral" in model.lower():
270
  actual_api_key = os.environ.get("MISTRAL_API_KEY", "")
271
  if not actual_api_key:
272
- return None, None, "**Error:** Mistral API key not configured in Space secrets"
273
  elif "claude" in model.lower():
274
  actual_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
275
  if not actual_api_key:
276
- return None, None, "**Error:** Anthropic API key not configured in Space secrets"
277
  elif "sonar" in model.lower():
278
  actual_api_key = os.environ.get("PERPLEXITY_API_KEY", "")
279
  if not actual_api_key:
280
- return None, None, "**Error:** Perplexity API key not configured in Space secrets"
281
  elif "grok" in model.lower():
282
  actual_api_key = os.environ.get("XAI_API_KEY", "")
283
  if not actual_api_key:
284
- return None, None, "**Error:** xAI API key not configured in Space secrets"
285
  else:
286
  actual_api_key = os.environ.get("HF_API_KEY", "")
287
  else:
@@ -289,7 +459,7 @@ def classify_data(spreadsheet_file, spreadsheet_column,
289
  if api_key_input and api_key_input.strip():
290
  actual_api_key = api_key_input.strip()
291
  else:
292
- return None, None, f"**Error:** Please provide your API key for {model}"
293
 
294
  # Use user-selected model_source, or auto-detect if "auto"
295
  if model_source_input == "auto":
@@ -299,9 +469,9 @@ def classify_data(spreadsheet_file, spreadsheet_column,
299
 
300
  try:
301
  if not spreadsheet_file:
302
- return None, None, "**Error:** Please upload a file"
303
  if not spreadsheet_column:
304
- return None, None, "**Error:** Please select a column to classify"
305
 
306
  file_path = spreadsheet_file if isinstance(spreadsheet_file, str) else spreadsheet_file.name
307
  if file_path.endswith('.csv'):
@@ -310,10 +480,23 @@ def classify_data(spreadsheet_file, spreadsheet_column,
310
  df = pd.read_excel(file_path)
311
 
312
  if spreadsheet_column not in df.columns:
313
- return None, None, f"**Error:** Column '{spreadsheet_column}' not found"
314
 
315
  input_data = df[spreadsheet_column].tolist()
316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
  result = catllm.multi_class(
318
  survey_input=input_data,
319
  categories=categories,
@@ -322,6 +505,12 @@ def classify_data(spreadsheet_file, spreadsheet_column,
322
  model_source=model_source
323
  )
324
 
 
 
 
 
 
 
325
  # Save CSV for download
326
  with tempfile.NamedTemporaryFile(mode='w', suffix='_classified.csv', delete=False) as f:
327
  result.to_csv(f.name, index=False)
@@ -337,13 +526,77 @@ def classify_data(spreadsheet_file, spreadsheet_column,
337
  else:
338
  success_rate = 100.0
339
 
340
- # Generate PDF codebook
341
- pdf_path = generate_codebook_pdf(categories, actual_model, spreadsheet_column, len(input_data), model_source, original_filename, success_rate)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
 
343
- return result, [csv_path, pdf_path], f"**Success!** Classified {len(input_data)} responses"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
 
345
  except Exception as e:
346
- return None, None, f"**Error:** {str(e)}"
347
 
348
 
349
  def add_category_field(current_count):
@@ -374,7 +627,9 @@ def reset_all():
374
  "", # api_key
375
  "**Free tier** - no API key required! We cover the cost while CatLLM is in review.", # api_key_status
376
  "Ready to classify", # status
377
- None, # results
 
 
378
  None, # download_file
379
  gr.update(value="", visible=False), # code_output
380
  ])
@@ -538,7 +793,9 @@ https://github.com/chrissoria/cat-llm
538
 
539
  with gr.Column():
540
  status = gr.Markdown("Ready to classify")
541
- results = gr.DataFrame(label="Classification Results")
 
 
542
  download_file = gr.File(label="Download Results (CSV + Codebook PDF)", file_count="multiple")
543
  code_output = gr.Code(
544
  label="Python Code",
@@ -583,7 +840,7 @@ https://github.com/chrissoria/cat-llm
583
  classify_btn.click(
584
  fn=classify_data,
585
  inputs=[spreadsheet_file, spreadsheet_column] + category_inputs + [model_tier, model, model_source, api_key],
586
- outputs=[results, download_file, status]
587
  )
588
 
589
  see_code_btn.click(
@@ -595,7 +852,7 @@ https://github.com/chrissoria/cat-llm
595
  reset_btn.click(
596
  fn=reset_all,
597
  inputs=[],
598
- outputs=[spreadsheet_file, spreadsheet_column] + category_inputs + [add_category_btn, category_count, model_tier, model, model_source, api_key, api_key_status, status, results, download_file, code_output]
599
  )
600
 
601
 
 
6
  import pandas as pd
7
  import tempfile
8
  import os
9
+ import time
10
+ import sys
11
  from datetime import datetime
12
 
13
  # Import catllm
 
60
  return model_tier == "Free Models"
61
 
62
 
63
+ def generate_codebook_pdf(categories, model, column_name, num_rows, model_source, filename, success_rate,
64
+ result_df=None, processing_time=None, prompt_template=None,
65
+ data_quality=None, catllm_version=None, python_version=None):
66
+ """Generate a PDF codebook explaining the output columns with comprehensive documentation."""
67
  from reportlab.lib.pagesizes import letter
68
  from reportlab.lib import colors
69
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
70
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
71
 
72
  # Create temp file for PDF
73
  pdf_file = tempfile.NamedTemporaryFile(mode='wb', suffix='_codebook.pdf', delete=False)
 
78
  title_style = ParagraphStyle('Title', parent=styles['Heading1'], fontSize=18, spaceAfter=20)
79
  heading_style = ParagraphStyle('Heading', parent=styles['Heading2'], fontSize=14, spaceAfter=10, spaceBefore=15)
80
  normal_style = styles['Normal']
81
+ code_style = ParagraphStyle('Code', parent=styles['Normal'], fontName='Courier', fontSize=9, leftIndent=20, spaceAfter=3)
82
 
83
  story = []
84
 
85
+ # === PAGE 1: Title, Category Mapping, Sample Results ===
86
  story.append(Paragraph("CatLLM Classification Codebook", title_style))
87
  story.append(Paragraph(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", normal_style))
88
+ story.append(Spacer(1, 15))
89
 
90
  # Category mapping
91
  story.append(Paragraph("Category Mapping", heading_style))
92
  story.append(Paragraph("Each category column contains binary values: 1 = present, 0 = not present", normal_style))
93
+ story.append(Spacer(1, 8))
94
 
95
  category_data = [["Column Name", "Category Description"]]
96
  for i, cat in enumerate(categories, 1):
 
101
  ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
102
  ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
103
  ('GRID', (0, 0), (-1, -1), 1, colors.black),
104
+ ('PADDING', (0, 0), (-1, -1), 6),
105
  ('BACKGROUND', (0, 1), (0, -1), colors.lightgrey),
106
+ ('FONTSIZE', (0, 0), (-1, -1), 9),
107
  ]))
108
  story.append(cat_table)
109
+ story.append(Spacer(1, 15))
110
+
111
+ # Sample Results (first 5 rows)
112
+ if result_df is not None and len(result_df) > 0:
113
+ story.append(Paragraph("Sample Results (First 5 Rows)", heading_style))
114
+ story.append(Paragraph("Example classifications showing original text and assigned categories:", normal_style))
115
+ story.append(Spacer(1, 8))
116
+
117
+ sample_data = [["Original Text (truncated)", "Assigned Categories"]]
118
+ sample_df = result_df.head(5)
119
+
120
+ for _, row in sample_df.iterrows():
121
+ # Get original text, truncate to 80 chars
122
+ original_text = str(row.get('survey_input', ''))[:80]
123
+ if len(str(row.get('survey_input', ''))) > 80:
124
+ original_text += "..."
125
+
126
+ # Get assigned categories
127
+ assigned = row.get('categories_id', '')
128
+ if pd.isna(assigned) or assigned == '':
129
+ assigned = "None"
130
+
131
+ sample_data.append([original_text, str(assigned)])
132
+
133
+ sample_table = Table(sample_data, colWidths=[320, 130])
134
+ sample_table.setStyle(TableStyle([
135
+ ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
136
+ ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
137
+ ('GRID', (0, 0), (-1, -1), 1, colors.black),
138
+ ('PADDING', (0, 0), (-1, -1), 6),
139
+ ('FONTSIZE', (0, 0), (-1, -1), 8),
140
+ ('VALIGN', (0, 0), (-1, -1), 'TOP'),
141
+ ]))
142
+ story.append(sample_table)
143
+ story.append(Spacer(1, 15))
144
 
145
  # Other columns
146
  story.append(Paragraph("Other Output Columns", heading_style))
 
149
  ["survey_input", "The original text that was classified"],
150
  ["model_response", "Raw response from the LLM"],
151
  ["json", "Extracted JSON with category assignments"],
152
+ ["processing_status", "'success' if classification worked, 'error' if failed"],
153
+ ["categories_id", "Comma-separated list of assigned category numbers"],
154
  ]
155
  other_table = Table(other_cols, colWidths=[120, 330])
156
  other_table.setStyle(TableStyle([
157
  ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
158
  ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
159
  ('GRID', (0, 0), (-1, -1), 1, colors.black),
160
+ ('PADDING', (0, 0), (-1, -1), 6),
161
  ('BACKGROUND', (0, 1), (0, -1), colors.lightgrey),
162
+ ('FONTSIZE', (0, 0), (-1, -1), 9),
163
  ]))
164
  story.append(other_table)
 
165
 
166
+ # === PAGE 2: Category Distribution ===
167
+ story.append(PageBreak())
168
+ story.append(Paragraph("Category Distribution", title_style))
169
+ story.append(Paragraph("Count and percentage of responses assigned to each category:", normal_style))
170
+ story.append(Spacer(1, 15))
171
+
172
+ if result_df is not None:
173
+ dist_data = [["Category", "Description", "Count", "Percentage"]]
174
+ total_rows = len(result_df)
175
+
176
+ for i, cat in enumerate(categories, 1):
177
+ col_name = f"category_{i}"
178
+ if col_name in result_df.columns:
179
+ count = int(result_df[col_name].sum())
180
+ pct = (count / total_rows) * 100 if total_rows > 0 else 0
181
+ dist_data.append([col_name, cat[:40], str(count), f"{pct:.1f}%"])
182
+ else:
183
+ dist_data.append([col_name, cat[:40], "N/A", "N/A"])
184
+
185
+ dist_table = Table(dist_data, colWidths=[80, 200, 60, 80])
186
+ dist_table.setStyle(TableStyle([
187
+ ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
188
+ ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
189
+ ('GRID', (0, 0), (-1, -1), 1, colors.black),
190
+ ('PADDING', (0, 0), (-1, -1), 6),
191
+ ('FONTSIZE', (0, 0), (-1, -1), 9),
192
+ ('ALIGN', (2, 1), (-1, -1), 'CENTER'),
193
+ ]))
194
+ story.append(dist_table)
195
+ story.append(Spacer(1, 15))
196
+ story.append(Paragraph(f"<i>Note: Percentages may sum to more than 100% as responses can be assigned to multiple categories.</i>", normal_style))
197
+
198
+ # Citation on page 2
199
+ story.append(Spacer(1, 30))
200
  story.append(Paragraph("Citation", heading_style))
201
  story.append(Paragraph("If you use CatLLM in your research, please cite:", normal_style))
202
  story.append(Spacer(1, 5))
203
  story.append(Paragraph("Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. https://github.com/chrissoria/cat-llm", normal_style))
204
 
205
+ # === PAGE 3: Classification Summary (Expanded) ===
206
  story.append(PageBreak())
207
  story.append(Paragraph("Classification Summary", title_style))
208
+ story.append(Spacer(1, 15))
209
 
210
+ # Basic summary
211
+ story.append(Paragraph("Classification Details", heading_style))
212
  summary_data = [
213
+ ["Source File", filename],
214
  ["Source Column", column_name],
215
  ["Model Used", model],
216
  ["Model Source", model_source],
217
+ ["Temperature", "default"],
218
  ["Rows Classified", str(num_rows)],
219
  ["Number of Categories", str(len(categories))],
220
  ["Success Rate", f"{success_rate:.2f}%"],
 
223
  summary_table.setStyle(TableStyle([
224
  ('BACKGROUND', (0, 0), (0, -1), colors.lightgrey),
225
  ('GRID', (0, 0), (-1, -1), 1, colors.black),
226
+ ('PADDING', (0, 0), (-1, -1), 6),
227
+ ('FONTSIZE', (0, 0), (-1, -1), 9),
228
  ]))
229
  story.append(summary_table)
230
+ story.append(Spacer(1, 15))
231
+
232
+ # Processing Time
233
+ if processing_time is not None:
234
+ story.append(Paragraph("Processing Time", heading_style))
235
+ rows_per_min = (num_rows / processing_time) * 60 if processing_time > 0 else 0
236
+ avg_time = processing_time / num_rows if num_rows > 0 else 0
237
+
238
+ time_data = [
239
+ ["Total Processing Time", f"{processing_time:.1f} seconds"],
240
+ ["Average Time per Response", f"{avg_time:.2f} seconds"],
241
+ ["Processing Rate", f"{rows_per_min:.1f} rows/minute"],
242
+ ]
243
+ time_table = Table(time_data, colWidths=[180, 270])
244
+ time_table.setStyle(TableStyle([
245
+ ('BACKGROUND', (0, 0), (0, -1), colors.lightgrey),
246
+ ('GRID', (0, 0), (-1, -1), 1, colors.black),
247
+ ('PADDING', (0, 0), (-1, -1), 6),
248
+ ('FONTSIZE', (0, 0), (-1, -1), 9),
249
+ ]))
250
+ story.append(time_table)
251
+ story.append(Spacer(1, 15))
252
+
253
+ # Data Quality Notes
254
+ if data_quality is not None:
255
+ story.append(Paragraph("Data Quality Notes", heading_style))
256
+ quality_data = [
257
+ ["Empty/Null Inputs Skipped", str(data_quality.get('null_count', 0))],
258
+ ["Average Text Length", f"{data_quality.get('avg_length', 0)} characters"],
259
+ ["Min Text Length", f"{data_quality.get('min_length', 0)} characters"],
260
+ ["Max Text Length", f"{data_quality.get('max_length', 0)} characters"],
261
+ ["Responses with Errors", str(data_quality.get('error_count', 0))],
262
+ ]
263
+ quality_table = Table(quality_data, colWidths=[180, 270])
264
+ quality_table.setStyle(TableStyle([
265
+ ('BACKGROUND', (0, 0), (0, -1), colors.lightgrey),
266
+ ('GRID', (0, 0), (-1, -1), 1, colors.black),
267
+ ('PADDING', (0, 0), (-1, -1), 6),
268
+ ('FONTSIZE', (0, 0), (-1, -1), 9),
269
+ ]))
270
+ story.append(quality_table)
271
+ story.append(Spacer(1, 15))
272
+
273
+ # Version Information
274
+ story.append(Paragraph("Version Information", heading_style))
275
+ version_data = [
276
+ ["CatLLM Version", catllm_version or "unknown"],
277
+ ["Python Version", python_version or "unknown"],
278
+ ["Timestamp", datetime.now().strftime('%Y-%m-%d %H:%M:%S')],
279
+ ]
280
+ version_table = Table(version_data, colWidths=[180, 270])
281
+ version_table.setStyle(TableStyle([
282
+ ('BACKGROUND', (0, 0), (0, -1), colors.lightgrey),
283
+ ('GRID', (0, 0), (-1, -1), 1, colors.black),
284
+ ('PADDING', (0, 0), (-1, -1), 6),
285
+ ('FONTSIZE', (0, 0), (-1, -1), 9),
286
+ ]))
287
+ story.append(version_table)
288
+
289
+ # === PAGE 4: Prompt Template ===
290
+ story.append(PageBreak())
291
+ story.append(Paragraph("Prompt Template Used", title_style))
292
+ story.append(Paragraph("The following prompt template was sent to the LLM for each classification:", normal_style))
293
+ story.append(Spacer(1, 15))
294
+
295
+ if prompt_template:
296
+ # Show the template with placeholders
297
+ story.append(Paragraph("Template with Placeholders:", heading_style))
298
+ story.append(Spacer(1, 8))
299
 
300
+ for line in prompt_template.split('\n'):
301
+ escaped_line = line.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
302
+ if escaped_line.strip():
303
+ story.append(Paragraph(escaped_line, code_style))
304
+ else:
305
+ story.append(Spacer(1, 5))
306
+
307
+ story.append(Spacer(1, 20))
308
+
309
+ # Show example with actual categories
310
+ story.append(Paragraph("Example with Your Categories:", heading_style))
311
+ story.append(Spacer(1, 8))
312
+
313
+ categories_list = "\n".join([f" {i}. {cat}" for i, cat in enumerate(categories, 1)])
314
+ example_prompt = f'''Categorize this survey response "[YOUR TEXT HERE]" into the following categories:
315
+ {categories_list}
316
+ Provide your work in JSON format where the number belonging to each category
317
+ is the key and a 1 if the category is present and a 0 if not.'''
318
+
319
+ for line in example_prompt.split('\n'):
320
+ escaped_line = line.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
321
+ if escaped_line.strip():
322
+ story.append(Paragraph(escaped_line, code_style))
323
+ else:
324
+ story.append(Spacer(1, 5))
325
+
326
+ # === PAGE 5: Reproducibility Code ===
327
  story.append(PageBreak())
328
  story.append(Paragraph("Reproducibility Code", title_style))
329
  story.append(Paragraph("Use the following Python code to reproduce this classification:", normal_style))
 
356
  # Save to CSV
357
  result.to_csv("classified_results.csv", index=False)'''
358
 
 
 
 
359
  # Split code into lines and add each as a paragraph
360
  for line in code_text.split('\n'):
361
  if line.strip() == '':
 
409
  def classify_data(spreadsheet_file, spreadsheet_column,
410
  cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10,
411
  model_tier, model, model_source_input, api_key_input):
412
+ """Main classification function. Returns distribution, samples, full results, files, and status."""
413
  if not CATLLM_AVAILABLE:
414
+ return None, None, None, None, "**Error:** catllm package not available"
415
 
416
  all_cats = [cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10]
417
  categories = [c.strip() for c in all_cats if c and c.strip()]
418
 
419
  if not categories:
420
+ return None, None, None, None, "**Error:** Please enter at least one category"
421
 
422
  actual_model = model
423
 
 
427
  if model in HF_ROUTED_MODELS:
428
  actual_api_key = os.environ.get("HF_API_KEY", "")
429
  if not actual_api_key:
430
+ return None, None, None, None, "**Error:** HuggingFace API key not configured in Space secrets"
431
  elif "gpt" in model.lower():
432
  actual_api_key = os.environ.get("OPENAI_API_KEY", "")
433
  if not actual_api_key:
434
+ return None, None, None, None, "**Error:** OpenAI API key not configured in Space secrets"
435
  elif "gemini" in model.lower():
436
  actual_api_key = os.environ.get("GOOGLE_API_KEY", "")
437
  if not actual_api_key:
438
+ return None, None, None, None, "**Error:** Google API key not configured in Space secrets"
439
  elif "mistral" in model.lower():
440
  actual_api_key = os.environ.get("MISTRAL_API_KEY", "")
441
  if not actual_api_key:
442
+ return None, None, None, None, "**Error:** Mistral API key not configured in Space secrets"
443
  elif "claude" in model.lower():
444
  actual_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
445
  if not actual_api_key:
446
+ return None, None, None, None, "**Error:** Anthropic API key not configured in Space secrets"
447
  elif "sonar" in model.lower():
448
  actual_api_key = os.environ.get("PERPLEXITY_API_KEY", "")
449
  if not actual_api_key:
450
+ return None, None, None, None, "**Error:** Perplexity API key not configured in Space secrets"
451
  elif "grok" in model.lower():
452
  actual_api_key = os.environ.get("XAI_API_KEY", "")
453
  if not actual_api_key:
454
+ return None, None, None, None, "**Error:** xAI API key not configured in Space secrets"
455
  else:
456
  actual_api_key = os.environ.get("HF_API_KEY", "")
457
  else:
 
459
  if api_key_input and api_key_input.strip():
460
  actual_api_key = api_key_input.strip()
461
  else:
462
+ return None, None, None, None, f"**Error:** Please provide your API key for {model}"
463
 
464
  # Use user-selected model_source, or auto-detect if "auto"
465
  if model_source_input == "auto":
 
469
 
470
  try:
471
  if not spreadsheet_file:
472
+ return None, None, None, None, "**Error:** Please upload a file"
473
  if not spreadsheet_column:
474
+ return None, None, None, None, "**Error:** Please select a column to classify"
475
 
476
  file_path = spreadsheet_file if isinstance(spreadsheet_file, str) else spreadsheet_file.name
477
  if file_path.endswith('.csv'):
 
480
  df = pd.read_excel(file_path)
481
 
482
  if spreadsheet_column not in df.columns:
483
+ return None, None, None, None, f"**Error:** Column '{spreadsheet_column}' not found"
484
 
485
  input_data = df[spreadsheet_column].tolist()
486
 
487
+ # Calculate data quality metrics before classification
488
+ text_series = df[spreadsheet_column].dropna().astype(str)
489
+ data_quality = {
490
+ 'null_count': int(df[spreadsheet_column].isna().sum()),
491
+ 'avg_length': round(text_series.str.len().mean(), 1) if len(text_series) > 0 else 0,
492
+ 'min_length': int(text_series.str.len().min()) if len(text_series) > 0 else 0,
493
+ 'max_length': int(text_series.str.len().max()) if len(text_series) > 0 else 0,
494
+ 'error_count': 0 # Will be updated after classification
495
+ }
496
+
497
+ # Capture timing
498
+ start_time = time.time()
499
+
500
  result = catllm.multi_class(
501
  survey_input=input_data,
502
  categories=categories,
 
505
  model_source=model_source
506
  )
507
 
508
+ processing_time = time.time() - start_time
509
+
510
+ # Update error count from results
511
+ if 'processing_status' in result.columns:
512
+ data_quality['error_count'] = int((result['processing_status'] == 'error').sum())
513
+
514
  # Save CSV for download
515
  with tempfile.NamedTemporaryFile(mode='w', suffix='_classified.csv', delete=False) as f:
516
  result.to_csv(f.name, index=False)
 
526
  else:
527
  success_rate = 100.0
528
 
529
+ # Build prompt template for documentation
530
+ prompt_template = '''Categorize this survey response "{response}" into the following categories:
531
+ {categories}
532
+ Provide your work in JSON format where the number belonging to each category
533
+ is the key and a 1 if the category is present and a 0 if not.'''
534
+
535
+ # Get version info
536
+ try:
537
+ catllm_version = catllm.__version__
538
+ except AttributeError:
539
+ catllm_version = "unknown"
540
+ python_version = sys.version.split()[0]
541
+
542
+ # Generate PDF codebook with all new data
543
+ pdf_path = generate_codebook_pdf(
544
+ categories=categories,
545
+ model=actual_model,
546
+ column_name=spreadsheet_column,
547
+ num_rows=len(input_data),
548
+ model_source=model_source,
549
+ filename=original_filename,
550
+ success_rate=success_rate,
551
+ result_df=result,
552
+ processing_time=processing_time,
553
+ prompt_template=prompt_template,
554
+ data_quality=data_quality,
555
+ catllm_version=catllm_version,
556
+ python_version=python_version
557
+ )
558
 
559
+ # Build distribution summary DataFrame for display
560
+ dist_data = []
561
+ total_rows = len(result)
562
+ for i, cat in enumerate(categories, 1):
563
+ col_name = f"category_{i}"
564
+ if col_name in result.columns:
565
+ count = int(result[col_name].sum())
566
+ pct = (count / total_rows) * 100 if total_rows > 0 else 0
567
+ dist_data.append({
568
+ "Category": cat,
569
+ "Count": count,
570
+ "Percentage": f"{pct:.1f}%"
571
+ })
572
+ distribution_df = pd.DataFrame(dist_data)
573
+
574
+ # Build sample results DataFrame (first 5 rows)
575
+ sample_data = []
576
+ for _, row in result.head(5).iterrows():
577
+ original_text = str(row.get('survey_input', ''))[:100]
578
+ if len(str(row.get('survey_input', ''))) > 100:
579
+ original_text += "..."
580
+ assigned = row.get('categories_id', '')
581
+ if pd.isna(assigned) or assigned == '':
582
+ assigned = "None"
583
+ sample_data.append({
584
+ "Original Text": original_text,
585
+ "Assigned Categories": str(assigned)
586
+ })
587
+ sample_df = pd.DataFrame(sample_data)
588
+
589
+ # Return: distribution (visible), samples (visible), full results (visible), files, status
590
+ return (
591
+ gr.update(value=distribution_df, visible=True),
592
+ gr.update(value=sample_df, visible=True),
593
+ gr.update(value=result, visible=True),
594
+ [csv_path, pdf_path],
595
+ f"**Success!** Classified {len(input_data)} responses in {processing_time:.1f}s"
596
+ )
597
 
598
  except Exception as e:
599
+ return None, None, None, None, f"**Error:** {str(e)}"
600
 
601
 
602
  def add_category_field(current_count):
 
627
  "", # api_key
628
  "**Free tier** - no API key required! We cover the cost while CatLLM is in review.", # api_key_status
629
  "Ready to classify", # status
630
+ gr.update(value=None, visible=False), # distribution_df
631
+ gr.update(value=None, visible=False), # sample_results
632
+ gr.update(value=None, visible=False), # results
633
  None, # download_file
634
  gr.update(value="", visible=False), # code_output
635
  ])
 
793
 
794
  with gr.Column():
795
  status = gr.Markdown("Ready to classify")
796
+ distribution_df = gr.DataFrame(label="Category Distribution Summary", visible=False)
797
+ sample_results = gr.DataFrame(label="Sample Results (First 5 Rows)", visible=False)
798
+ results = gr.DataFrame(label="Full Classification Results", visible=False)
799
  download_file = gr.File(label="Download Results (CSV + Codebook PDF)", file_count="multiple")
800
  code_output = gr.Code(
801
  label="Python Code",
 
840
  classify_btn.click(
841
  fn=classify_data,
842
  inputs=[spreadsheet_file, spreadsheet_column] + category_inputs + [model_tier, model, model_source, api_key],
843
+ outputs=[distribution_df, sample_results, results, download_file, status]
844
  )
845
 
846
  see_code_btn.click(
 
852
  reset_btn.click(
853
  fn=reset_all,
854
  inputs=[],
855
+ outputs=[spreadsheet_file, spreadsheet_column] + category_inputs + [add_category_btn, category_count, model_tier, model, model_source, api_key, api_key_status, status, distribution_df, sample_results, results, download_file, code_output]
856
  )
857
 
858