chrissoria committed on
Commit
c05b50f
·
1 Parent(s): d54c66c

Add PDF codebook generation with category mappings

Browse files
Files changed (2) hide show
  1. app.py +100 -4
  2. requirements.txt +1 -0
app.py CHANGED
@@ -6,6 +6,7 @@ import gradio as gr
6
  import pandas as pd
7
  import tempfile
8
  import os
 
9
 
10
  # Import catllm
11
  try:
@@ -57,6 +58,98 @@ def is_free_model(model, model_tier):
57
  return model_tier == "Free Models"
58
 
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  def get_model_source(model):
61
  """Auto-detect model source. All HF router models (novita, groq, etc) use 'huggingface'."""
62
  model_lower = model.lower()
@@ -180,12 +273,15 @@ def classify_data(spreadsheet_file, spreadsheet_column,
180
  model_source=model_source
181
  )
182
 
183
- # Save for download
184
  with tempfile.NamedTemporaryFile(mode='w', suffix='_classified.csv', delete=False) as f:
185
  result.to_csv(f.name, index=False)
186
- download_path = f.name
 
 
 
187
 
188
- return result, download_path, f"**Success!** Classified {len(input_data)} responses"
189
 
190
  except Exception as e:
191
  return None, None, f"**Error:** {str(e)}"
@@ -358,7 +454,7 @@ https://github.com/chrissoria/cat-llm
358
  with gr.Column():
359
  status = gr.Markdown("Ready to classify")
360
  results = gr.DataFrame(label="Classification Results")
361
- download_file = gr.File(label="Download Results")
362
  code_output = gr.Code(
363
  label="Python Code",
364
  language="python",
 
6
  import pandas as pd
7
  import tempfile
8
  import os
9
+ from datetime import datetime
10
 
11
  # Import catllm
12
  try:
 
58
  return model_tier == "Free Models"
59
 
60
 
61
def generate_codebook_pdf(categories, model, column_name, num_rows):
    """Generate a PDF codebook explaining the output columns.

    The document contains a classification summary, a table mapping each
    ``category_<n>`` column (numbered from 1) to its category description,
    a table describing the remaining output columns, and a citation.

    Args:
        categories: Iterable of category descriptions, in the order used for
            the ``category_<n>`` columns. ``None`` is treated as empty.
        model: Identifier of the model used for classification.
        column_name: Name of the source column that was classified.
        num_rows: Number of rows that were classified.

    Returns:
        Path of the generated PDF. The file is created with ``delete=False``,
        so it is not removed automatically.
    """
    from xml.sax.saxutils import escape

    from reportlab.lib.pagesizes import letter
    from reportlab.lib import colors
    from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle

    # Materialise once so len() and iteration both work for any iterable.
    categories = list(categories or [])

    # Only reserve a unique path here and close the handle immediately:
    # ReportLab opens the path itself, and keeping our own handle open would
    # leak it (and blocks the second open on Windows).
    with tempfile.NamedTemporaryFile(mode='wb', suffix='_codebook.pdf', delete=False) as pdf_file:
        pdf_path = pdf_file.name

    doc = SimpleDocTemplate(pdf_path, pagesize=letter)
    styles = getSampleStyleSheet()

    # Custom styles
    title_style = ParagraphStyle('Title', parent=styles['Heading1'], fontSize=18, spaceAfter=20)
    heading_style = ParagraphStyle('Heading', parent=styles['Heading2'], fontSize=14, spaceAfter=10, spaceBefore=15)
    normal_style = styles['Normal']

    def text_cell(value):
        # Plain strings in a Table cell are not wrapped, so long user text
        # would overflow the cell. A Paragraph wraps, but it parses inline
        # markup, hence the escaping of '&', '<' and '>'.
        text = "" if value is None else str(value)
        return Paragraph(escape(text), normal_style)

    def padding(amount):
        # Spelled out per side; NOTE(review): a bare 'PADDING' command does
        # not appear among ReportLab's documented TableStyle commands.
        return [
            (side, (0, 0), (-1, -1), amount)
            for side in ('LEFTPADDING', 'RIGHTPADDING', 'TOPPADDING', 'BOTTOMPADDING')
        ]

    # Shared look for the two tables that have a header row.
    header_table_style = TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
        ('GRID', (0, 0), (-1, -1), 1, colors.black),
        *padding(8),
        ('BACKGROUND', (0, 1), (0, -1), colors.lightgrey),
    ])

    story = []

    # Title
    story.append(Paragraph("CatLLM Classification Codebook", title_style))
    story.append(Paragraph(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", normal_style))
    story.append(Spacer(1, 20))

    # Classification summary
    story.append(Paragraph("Classification Summary", heading_style))
    summary_data = [
        ["Source Column", text_cell(column_name)],
        ["Model Used", text_cell(model)],
        ["Rows Classified", str(num_rows)],
        ["Number of Categories", str(len(categories))],
    ]
    summary_table = Table(summary_data, colWidths=[150, 300])
    summary_table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (0, -1), colors.lightgrey),
        ('GRID', (0, 0), (-1, -1), 1, colors.black),
        *padding(8),
    ]))
    story.append(summary_table)
    story.append(Spacer(1, 20))

    # Category mapping
    story.append(Paragraph("Category Mapping", heading_style))
    story.append(Paragraph("Each category column contains binary values: 1 = present, 0 = not present", normal_style))
    story.append(Spacer(1, 10))

    # Header cells stay plain strings so the TEXTCOLOR command applies to them.
    category_data = [["Column Name", "Category Description"]]
    category_data.extend(
        [f"category_{i}", text_cell(cat)] for i, cat in enumerate(categories, 1)
    )

    cat_table = Table(category_data, colWidths=[120, 330])
    cat_table.setStyle(header_table_style)
    story.append(cat_table)
    story.append(Spacer(1, 20))

    # Other columns
    story.append(Paragraph("Other Output Columns", heading_style))
    other_cols = [
        ["Column Name", "Description"],
        ["survey_input", "The original text that was classified"],
        ["model_response", "Raw response from the LLM"],
        ["json", "Extracted JSON with category assignments"],
        ["processing_status", "'success' if classification worked, 'error' if it failed"],
        ["categories_id", "Comma-separated list of category numbers that were assigned"],
    ]
    other_table = Table(other_cols, colWidths=[120, 330])
    other_table.setStyle(header_table_style)
    story.append(other_table)
    story.append(Spacer(1, 20))

    # Citation
    story.append(Paragraph("Citation", heading_style))
    story.append(Paragraph("If you use CatLLM in your research, please cite:", normal_style))
    story.append(Spacer(1, 5))
    story.append(Paragraph("Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. https://github.com/chrissoria/cat-llm", normal_style))

    doc.build(story)
    return pdf_path
151
+
152
+
153
  def get_model_source(model):
154
  """Auto-detect model source. All HF router models (novita, groq, etc) use 'huggingface'."""
155
  model_lower = model.lower()
 
273
  model_source=model_source
274
  )
275
 
276
+ # Save CSV for download
277
  with tempfile.NamedTemporaryFile(mode='w', suffix='_classified.csv', delete=False) as f:
278
  result.to_csv(f.name, index=False)
279
+ csv_path = f.name
280
+
281
+ # Generate PDF codebook
282
+ pdf_path = generate_codebook_pdf(categories, actual_model, spreadsheet_column, len(input_data))
283
 
284
+ return result, [csv_path, pdf_path], f"**Success!** Classified {len(input_data)} responses"
285
 
286
  except Exception as e:
287
  return None, None, f"**Error:** {str(e)}"
 
454
  with gr.Column():
455
  status = gr.Markdown("Ready to classify")
456
  results = gr.DataFrame(label="Classification Results")
457
+ download_file = gr.File(label="Download Results (CSV + Codebook PDF)", file_count="multiple")
458
  code_output = gr.Code(
459
  label="Python Code",
460
  language="python",
requirements.txt CHANGED
@@ -6,3 +6,4 @@ pandas
6
  openpyxl
7
  requests
8
  regex
 
 
6
  openpyxl
7
  requests
8
  regex
9
+ reportlab