chrissoria Claude committed on
Commit
c44cb7c
·
1 Parent(s): 41896b8

Add 'See the Code' feature and large file size warning

Browse files

- Add collapsible "See the Code" accordions for Extract and Classify tasks
- Generate reproducible Python code snippets for all operations
- Add file size check (>100MB) for images/PDFs with warning
- When large files detected, show warning and generated code as alternative
- Users can copy code to run locally with: pip install cat-llm

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +350 -40
app.py CHANGED
@@ -21,6 +21,167 @@ except ImportError as e:
21
 
22
  MAX_CATEGORIES = 10
23
  INITIAL_CATEGORIES = 3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  # Free models (uses Space secrets - no user API key needed)
26
  FREE_MODEL_CHOICES = [
@@ -497,12 +658,12 @@ def run_extract_categories(input_type, spreadsheet_file, spreadsheet_column,
497
  progress=gr.Progress(track_tqdm=True)):
498
  """Extract categories from data and display them in a table."""
499
  if not CATLLM_AVAILABLE:
500
- yield None, None, "**Error:** catllm package not available"
501
  return
502
 
503
  actual_api_key, provider = get_api_key(model, model_tier, api_key_input)
504
  if not actual_api_key:
505
- yield None, None, f"**Error:** {provider} API key not configured"
506
  return
507
 
508
  if model_source_input == "auto":
@@ -510,17 +671,48 @@ def run_extract_categories(input_type, spreadsheet_file, spreadsheet_column,
510
  else:
511
  model_source = model_source_input
512
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
513
  try:
514
- yield None, None, "Extracting categories from your data..."
515
 
516
  start_time = time.time()
517
 
518
  if input_type == "Survey Responses":
519
  if not spreadsheet_file:
520
- yield None, None, "**Error:** Please upload a CSV/Excel file"
521
  return
522
  if not spreadsheet_column:
523
- yield None, None, "**Error:** Please select a column"
524
  return
525
 
526
  file_path = spreadsheet_file if isinstance(spreadsheet_file, str) else spreadsheet_file.name
@@ -554,7 +746,7 @@ def run_extract_categories(input_type, spreadsheet_file, spreadsheet_column,
554
  else:
555
  pdf_input = pdf_file if isinstance(pdf_file, str) else pdf_file.name
556
  else:
557
- yield None, None, "**Error:** Please upload PDF file(s) or a folder"
558
  return
559
 
560
  mode_mapping = {
@@ -593,7 +785,7 @@ def run_extract_categories(input_type, spreadsheet_file, spreadsheet_column,
593
  else:
594
  image_input = image_file if isinstance(image_file, str) else image_file.name
595
  else:
596
- yield None, None, "**Error:** Please upload image file(s) or a folder"
597
  return
598
 
599
  # For images, use fewer divisions since each image can have multiple categories
@@ -614,7 +806,7 @@ def run_extract_categories(input_type, spreadsheet_file, spreadsheet_column,
614
  )
615
 
616
  else:
617
- yield None, None, f"**Error:** Unknown input type: {input_type}"
618
  return
619
 
620
  processing_time = time.time() - start_time
@@ -635,14 +827,25 @@ def run_extract_categories(input_type, spreadsheet_file, spreadsheet_column,
635
  categories_df.to_csv(f.name, index=False)
636
  csv_path = f.name
637
 
 
 
 
 
 
 
 
 
 
 
638
  yield (
639
  gr.update(value=categories_df, visible=True),
640
  csv_path,
 
641
  f"Extracted {len(top_categories)} categories in {processing_time:.1f}s"
642
  )
643
 
644
  except Exception as e:
645
- yield None, None, f"**Error:** {str(e)}"
646
 
647
 
648
  def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
@@ -653,19 +856,19 @@ def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
653
  progress=gr.Progress(track_tqdm=True)):
654
  """Classify data with user-provided categories."""
655
  if not CATLLM_AVAILABLE:
656
- yield None, None, None, None, "**Error:** catllm package not available"
657
  return
658
 
659
  all_cats = [cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10]
660
  categories = [c.strip() for c in all_cats if c and c.strip()]
661
 
662
  if not categories:
663
- yield None, None, None, None, "**Error:** Please enter at least one category"
664
  return
665
 
666
  actual_api_key, provider = get_api_key(model, model_tier, api_key_input)
667
  if not actual_api_key:
668
- yield None, None, None, None, f"**Error:** {provider} API key not configured"
669
  return
670
 
671
  if model_source_input == "auto":
@@ -673,17 +876,48 @@ def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
673
  else:
674
  model_source = model_source_input
675
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
676
  try:
677
- yield None, None, None, None, "Classifying your data..."
678
 
679
  start_time = time.time()
680
 
681
  if input_type == "Survey Responses":
682
  if not spreadsheet_file:
683
- yield None, None, None, None, "**Error:** Please upload a CSV/Excel file"
684
  return
685
  if not spreadsheet_column:
686
- yield None, None, None, None, "**Error:** Please select a column"
687
  return
688
 
689
  file_path = spreadsheet_file if isinstance(spreadsheet_file, str) else spreadsheet_file.name
@@ -723,7 +957,7 @@ def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
723
  pdf_input = pdf_file if isinstance(pdf_file, str) else pdf_file.name
724
  original_filename = pdf_input.split("/")[-1]
725
  else:
726
- yield None, None, None, None, "**Error:** Please upload PDF file(s) or a folder"
727
  return
728
 
729
  column_name = "PDF Pages"
@@ -763,7 +997,7 @@ def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
763
  image_input = image_file if isinstance(image_file, str) else image_file.name
764
  original_filename = image_input.split("/")[-1]
765
  else:
766
- yield None, None, None, None, "**Error:** Please upload image file(s) or a folder"
767
  return
768
 
769
  column_name = "Image Files"
@@ -779,7 +1013,7 @@ def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
779
  )
780
 
781
  else:
782
- yield None, None, None, None, f"**Error:** Unknown input type: {input_type}"
783
  return
784
 
785
  processing_time = time.time() - start_time
@@ -856,16 +1090,27 @@ Provide your work in JSON format where the number belonging to each category is
856
 
857
  plt.tight_layout()
858
 
 
 
 
 
 
 
 
 
 
 
859
  yield (
860
  gr.update(value=fig, visible=True),
861
  gr.update(value=result, visible=True),
862
  [csv_path, report_pdf_path],
 
863
  None,
864
  f"Classified {num_items} items in {processing_time:.1f}s"
865
  )
866
 
867
  except Exception as e:
868
- yield None, None, None, None, f"**Error:** {str(e)}"
869
 
870
 
871
  def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
@@ -876,12 +1121,12 @@ def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
876
  progress=gr.Progress(track_tqdm=True)):
877
  """Extract categories then classify data with them."""
878
  if not CATLLM_AVAILABLE:
879
- yield None, None, None, None, None, None, "**Error:** catllm package not available"
880
  return
881
 
882
  actual_api_key, provider = get_api_key(model, model_tier, api_key_input)
883
  if not actual_api_key:
884
- yield None, None, None, None, None, None, f"**Error:** {provider} API key not configured"
885
  return
886
 
887
  if model_source_input == "auto":
@@ -889,18 +1134,49 @@ def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
889
  else:
890
  model_source = model_source_input
891
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
892
  try:
893
  # Phase 1: Extract categories
894
- yield None, None, None, None, None, None, "Phase 1: Extracting categories..."
895
 
896
  start_time = time.time()
897
 
898
  if input_type == "Survey Responses":
899
  if not spreadsheet_file:
900
- yield None, None, None, None, None, None, "**Error:** Please upload a CSV/Excel file"
901
  return
902
  if not spreadsheet_column:
903
- yield None, None, None, None, None, None, "**Error:** Please select a column"
904
  return
905
 
906
  file_path = spreadsheet_file if isinstance(spreadsheet_file, str) else spreadsheet_file.name
@@ -933,7 +1209,7 @@ def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
933
  input_data = pdf_file if isinstance(pdf_file, str) else pdf_file.name
934
  original_filename = input_data.split("/")[-1]
935
  else:
936
- yield None, None, None, None, None, None, "**Error:** Please upload PDF file(s) or a folder"
937
  return
938
 
939
  column_name = "PDF Pages"
@@ -964,7 +1240,7 @@ def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
964
  input_data = image_file if isinstance(image_file, str) else image_file.name
965
  original_filename = input_data.split("/")[-1]
966
  else:
967
- yield None, None, None, None, None, None, "**Error:** Please upload image file(s) or a folder"
968
  return
969
 
970
  column_name = "Image Files"
@@ -973,7 +1249,7 @@ def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
973
  mode_param = None
974
 
975
  else:
976
- yield None, None, None, None, None, None, f"**Error:** Unknown input type: {input_type}"
977
  return
978
 
979
  # Calculate sensible divisions based on input size and type
@@ -1010,7 +1286,7 @@ def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
1010
  categories_df = extract_result.get('counts_df', pd.DataFrame())
1011
 
1012
  if not categories:
1013
- yield None, None, None, None, None, None, "**Error:** No categories were extracted"
1014
  return
1015
 
1016
  extract_time = time.time() - start_time
@@ -1026,10 +1302,14 @@ def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
1026
  categories_df.to_csv(f.name, index=False)
1027
  extract_csv_path = f.name
1028
 
 
 
 
1029
  yield (
1030
  gr.update(value=categories_df, visible=True),
1031
  extract_csv_path,
1032
- None, None, None, None,
 
1033
  f"Extracted {len(categories)} categories in {extract_time:.1f}s. Now classifying..."
1034
  )
1035
 
@@ -1125,18 +1405,23 @@ Provide your work in JSON format where the number belonging to each category is
1125
 
1126
  plt.tight_layout()
1127
 
 
 
 
1128
  yield (
1129
  gr.update(value=categories_df, visible=True),
1130
  extract_csv_path,
 
1131
  gr.update(value=fig, visible=True),
1132
  gr.update(value=result, visible=True),
1133
  [classify_csv_path, report_pdf_path],
 
1134
  None,
1135
  f"Extracted {len(categories)} categories and classified {num_items} items in {total_time:.1f}s"
1136
  )
1137
 
1138
  except Exception as e:
1139
- yield None, None, None, None, None, None, f"**Error:** {str(e)}"
1140
 
1141
 
1142
  def add_category_field(current_count):
@@ -1190,10 +1475,12 @@ def reset_all():
1190
  gr.update(visible=False), # extract_output_group
1191
  gr.update(value=None, visible=False), # extracted_categories
1192
  None, # extract_download
 
1193
  gr.update(visible=False), # classify_output_group
1194
  gr.update(value=None, visible=False), # distribution_plot
1195
  gr.update(value=None, visible=False), # results
1196
  None, # download_file
 
1197
  ])
1198
  return updates
1199
 
@@ -1406,6 +1693,13 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
1406
  wrap=True
1407
  )
1408
  extract_download = gr.File(label="Download Categories (CSV)")
 
 
 
 
 
 
 
1409
 
1410
  # Classify output group
1411
  with gr.Group(visible=False) as classify_output_group:
@@ -1413,6 +1707,13 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
1413
  distribution_plot = gr.Plot(label="Category Distribution (%)", visible=False)
1414
  results = gr.DataFrame(label="Full Results", visible=False)
1415
  download_file = gr.File(label="Download Results (CSV + Methodology Report)", file_count="multiple")
 
 
 
 
 
 
 
1416
 
1417
  # Event handlers
1418
  def switch_input_type(input_type_val):
@@ -1534,6 +1835,7 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
1534
  progress=gr.Progress(track_tqdm=True)):
1535
  """Dispatch to appropriate function based on task mode."""
1536
  if task == "extract":
 
1537
  for update in run_extract_categories(
1538
  input_type, spreadsheet_file, spreadsheet_column,
1539
  pdf_file, pdf_folder_val, pdf_description, pdf_mode,
@@ -1546,12 +1848,15 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
1546
  yield (
1547
  update[0], # extracted_categories
1548
  update[1], # extract_download
 
1549
  None, # distribution_plot
1550
  None, # results
1551
  None, # download_file
1552
- update[2] # status
 
1553
  )
1554
  elif task == "assign":
 
1555
  for update in run_classify_data(
1556
  input_type, spreadsheet_file, spreadsheet_column,
1557
  pdf_file, pdf_folder_val, pdf_description, pdf_mode,
@@ -1564,12 +1869,15 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
1564
  yield (
1565
  None, # extracted_categories
1566
  None, # extract_download
 
1567
  update[0], # distribution_plot
1568
  update[1], # results
1569
  update[2], # download_file
1570
- update[4] # status
 
1571
  )
1572
  elif task == "extract_and_assign":
 
1573
  for update in run_extract_and_assign(
1574
  input_type, spreadsheet_file, spreadsheet_column,
1575
  pdf_file, pdf_folder_val, pdf_description, pdf_mode,
@@ -1581,13 +1889,15 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
1581
  yield (
1582
  update[0], # extracted_categories
1583
  update[1], # extract_download
1584
- update[2], # distribution_plot
1585
- update[3], # results
1586
- update[4], # download_file
1587
- update[6] # status
 
 
1588
  )
1589
  else:
1590
- yield (None, None, None, None, None, "Please select a task first.")
1591
 
1592
  run_btn.click(
1593
  fn=dispatch_run,
@@ -1595,7 +1905,7 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
1595
  pdf_file, pdf_folder, pdf_description, pdf_mode,
1596
  image_file, image_folder, image_description,
1597
  max_categories] + category_inputs + [model_tier, model, model_source, api_key],
1598
- outputs=[extracted_categories, extract_download, distribution_plot, results, download_file, status]
1599
  )
1600
 
1601
  reset_btn.click(
@@ -1612,8 +1922,8 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
1612
  categories_group, extract_settings_group, max_categories, model_group, run_btn,
1613
  model_tier, model, model_source, api_key, api_key, api_key_status,
1614
  status,
1615
- extract_output_group, extracted_categories, extract_download,
1616
- classify_output_group, distribution_plot, results, download_file
1617
  ]
1618
  )
1619
 
 
21
 
22
  MAX_CATEGORIES = 10
23
  INITIAL_CATEGORIES = 3
24
+ MAX_FILE_SIZE_MB = 100 # Warn users if total file size exceeds this
25
+
26
+
27
def calculate_total_file_size(files):
    """Calculate total size of uploaded files in MB.

    Args:
        files: ``None``, a single upload, or a list/tuple of uploads. Each
            upload is a path (``str`` or ``os.PathLike``) or an object whose
            ``name`` attribute holds the path. A path that points at a
            directory contributes the size of every file beneath it.

    Returns:
        The combined size in megabytes (bytes / 1024**2), or ``0`` when
        ``files`` is ``None``. Entries that cannot be measured are skipped
        instead of raising, so the result is a best-effort lower bound.
    """
    if files is None:
        return 0
    if not isinstance(files, (list, tuple)):
        files = [files]

    def size_of(path):
        """Return the byte size of a file, or of all files under a directory."""
        if not os.path.isdir(path):
            return os.path.getsize(path)
        dir_bytes = 0
        # os.walk does not follow symlinked directories by default, so a
        # link cycle cannot make this loop forever.
        for root, _dirs, names in os.walk(path):
            for name in names:
                try:
                    dir_bytes += os.path.getsize(os.path.join(root, name))
                except OSError:
                    # File vanished or is unreadable: skip it, keep counting.
                    continue
        return dir_bytes

    total_bytes = 0
    for f in files:
        try:
            file_path = f if isinstance(f, (str, os.PathLike)) else f.name
            total_bytes += size_of(file_path)
        except (OSError, AttributeError, TypeError):
            # Best effort: a missing file, an object without ``name`` or a
            # ``None`` path must not block the size check.
            continue
    return total_bytes / (1024 * 1024)  # Convert to MB
42
+
43
+
44
def generate_extract_code(input_type, description, model, model_source, max_categories, mode=None):
    """Generate Python code for category extraction.

    Builds a copy-pasteable script that calls ``catllm.extract`` with the
    given settings.

    Args:
        input_type: ``"text"`` or ``"pdf"``; any other value produces the
            image snippet.
        description: Data description. For ``"text"`` it is also used as the
            DataFrame column name in the generated script.
        model: Value emitted for ``user_model``.
        model_source: Value emitted for ``model_source``.
        max_categories: Value emitted verbatim for ``max_categories``.
        mode: Optional PDF processing mode; only used for ``"pdf"`` and
            omitted from the snippet when falsy.

    Returns:
        The generated script as a string.
    """
    # Local import so the block does not depend on the file's import header.
    import json

    def quote(value):
        """Render *value* as a double-quoted, escaped Python string literal."""
        # json.dumps leaves plain text untouched but escapes quotes and
        # backslashes, so user-supplied text cannot break the generated code.
        return json.dumps(str(value), ensure_ascii=False)

    description_lit = quote(description)
    model_lit = quote(model)
    source_lit = quote(model_source)

    if input_type == "text":
        return f'''import catllm
import pandas as pd

# Load your data
df = pd.read_csv("your_data.csv")

# Extract categories from the text column
result = catllm.extract(
    input_data=df[{description_lit}].tolist(),
    api_key="YOUR_API_KEY",
    input_type="text",
    description={description_lit},
    user_model={model_lit},
    model_source={source_lit},
    max_categories={max_categories}
)

# View extracted categories
print(result["top_categories"])
print(result["counts_df"])
'''
    elif input_type == "pdf":
        # The mode argument is spliced in after ``description`` only when set.
        mode_line = f',\n    mode={quote(mode)}' if mode else ''
        return f'''import catllm

# Extract categories from PDF documents
result = catllm.extract(
    input_data="path/to/your/pdfs/",  # or list of PDF paths
    api_key="YOUR_API_KEY",
    input_type="pdf",
    description={description_lit}{mode_line},
    user_model={model_lit},
    model_source={source_lit},
    max_categories={max_categories}
)

# View extracted categories
print(result["top_categories"])
print(result["counts_df"])
'''
    else:  # image
        return f'''import catllm

# Extract categories from images
result = catllm.extract(
    input_data="path/to/your/images/",  # or list of image paths
    api_key="YOUR_API_KEY",
    input_type="image",
    description={description_lit},
    user_model={model_lit},
    model_source={source_lit},
    max_categories={max_categories}
)

# View extracted categories
print(result["top_categories"])
print(result["counts_df"])
'''
105
+
106
+
107
def generate_classify_code(input_type, description, categories, model, model_source, mode=None):
    """Generate Python code for classification.

    Builds a copy-pasteable script that calls ``catllm.classify`` with the
    given categories and settings.

    Args:
        input_type: ``"text"`` or ``"pdf"``; any other value produces the
            image snippet.
        description: Data description. For ``"text"`` it is also used as the
            DataFrame column name in the generated script.
        categories: Iterable of category names, emitted one per line in the
            generated ``categories`` list.
        model: Value emitted for ``user_model``.
        model_source: Value emitted for ``model_source``.
        mode: Optional PDF processing mode; only used for ``"pdf"`` and
            omitted from the snippet when falsy.

    Returns:
        The generated script as a string.
    """
    # Local import so the block does not depend on the file's import header.
    import json

    def quote(value):
        """Render *value* as a double-quoted, escaped Python string literal."""
        # Category names are typed by the user; escaping quotes and
        # backslashes keeps the generated script syntactically valid.
        return json.dumps(str(value), ensure_ascii=False)

    categories_str = ",\n    ".join(quote(cat) for cat in categories)
    description_lit = quote(description)
    model_lit = quote(model)
    source_lit = quote(model_source)

    if input_type == "text":
        return f'''import catllm
import pandas as pd

# Load your data
df = pd.read_csv("your_data.csv")

# Define categories
categories = [
    {categories_str}
]

# Classify the text data
result = catllm.classify(
    input_data=df[{description_lit}].tolist(),
    categories=categories,
    api_key="YOUR_API_KEY",
    input_type="text",
    description={description_lit},
    user_model={model_lit},
    model_source={source_lit}
)

# View results
print(result)
result.to_csv("classified_results.csv", index=False)
'''
    elif input_type == "pdf":
        # The mode argument is spliced in after ``description`` only when set.
        mode_line = f',\n    mode={quote(mode)}' if mode else ''
        return f'''import catllm

# Define categories
categories = [
    {categories_str}
]

# Classify PDF documents
result = catllm.classify(
    input_data="path/to/your/pdfs/",  # or list of PDF paths
    categories=categories,
    api_key="YOUR_API_KEY",
    input_type="pdf",
    description={description_lit}{mode_line},
    user_model={model_lit},
    model_source={source_lit}
)

# View results
print(result)
result.to_csv("classified_results.csv", index=False)
'''
    else:  # image
        return f'''import catllm

# Define categories
categories = [
    {categories_str}
]

# Classify images
result = catllm.classify(
    input_data="path/to/your/images/",  # or list of image paths
    categories=categories,
    api_key="YOUR_API_KEY",
    input_type="image",
    description={description_lit},
    user_model={model_lit},
    model_source={source_lit}
)

# View results
print(result)
result.to_csv("classified_results.csv", index=False)
'''
185
 
186
  # Free models (uses Space secrets - no user API key needed)
187
  FREE_MODEL_CHOICES = [
 
658
  progress=gr.Progress(track_tqdm=True)):
659
  """Extract categories from data and display them in a table."""
660
  if not CATLLM_AVAILABLE:
661
+ yield None, None, None, "**Error:** catllm package not available"
662
  return
663
 
664
  actual_api_key, provider = get_api_key(model, model_tier, api_key_input)
665
  if not actual_api_key:
666
+ yield None, None, None, f"**Error:** {provider} API key not configured"
667
  return
668
 
669
  if model_source_input == "auto":
 
671
  else:
672
  model_source = model_source_input
673
 
674
+ # Check file size for images and PDFs
675
+ files_to_check = None
676
+ if input_type == "Images":
677
+ files_to_check = image_folder if image_folder else image_file
678
+ elif input_type == "PDF Documents":
679
+ files_to_check = pdf_folder if pdf_folder else pdf_file
680
+
681
+ if files_to_check:
682
+ total_size_mb = calculate_total_file_size(files_to_check)
683
+ if total_size_mb > MAX_FILE_SIZE_MB:
684
+ # Generate the code for the user
685
+ if input_type == "Images":
686
+ code = generate_extract_code("image", image_description or "images", model, model_source, int(max_categories_val))
687
+ else:
688
+ mode_mapping = {"Image (visual documents)": "image", "Text (text-heavy)": "text", "Both (comprehensive)": "both"}
689
+ actual_mode = mode_mapping.get(pdf_mode, "image")
690
+ code = generate_extract_code("pdf", pdf_description or "document", model, model_source, int(max_categories_val), actual_mode)
691
+
692
+ warning_msg = f"""**⚠️ Large Upload Detected ({total_size_mb:.1f} MB)**
693
+
694
+ Uploads over {MAX_FILE_SIZE_MB} MB may experience performance issues or timeouts on this web app.
695
+
696
+ **Recommended:** Run the code locally using the Python package instead. See the code below, or click "See the Code" after this message.
697
+
698
+ ```
699
+ pip install cat-llm
700
+ ```
701
+ """
702
+ yield None, None, code, warning_msg
703
+ return
704
+
705
  try:
706
+ yield None, None, None, "Extracting categories from your data..."
707
 
708
  start_time = time.time()
709
 
710
  if input_type == "Survey Responses":
711
  if not spreadsheet_file:
712
+ yield None, None, None, "**Error:** Please upload a CSV/Excel file"
713
  return
714
  if not spreadsheet_column:
715
+ yield None, None, None, "**Error:** Please select a column"
716
  return
717
 
718
  file_path = spreadsheet_file if isinstance(spreadsheet_file, str) else spreadsheet_file.name
 
746
  else:
747
  pdf_input = pdf_file if isinstance(pdf_file, str) else pdf_file.name
748
  else:
749
+ yield None, None, None, "**Error:** Please upload PDF file(s) or a folder"
750
  return
751
 
752
  mode_mapping = {
 
785
  else:
786
  image_input = image_file if isinstance(image_file, str) else image_file.name
787
  else:
788
+ yield None, None, None, "**Error:** Please upload image file(s) or a folder"
789
  return
790
 
791
  # For images, use fewer divisions since each image can have multiple categories
 
806
  )
807
 
808
  else:
809
+ yield None, None, None, f"**Error:** Unknown input type: {input_type}"
810
  return
811
 
812
  processing_time = time.time() - start_time
 
827
  categories_df.to_csv(f.name, index=False)
828
  csv_path = f.name
829
 
830
+ # Generate reproducibility code
831
+ if input_type == "Survey Responses":
832
+ code = generate_extract_code("text", spreadsheet_column, model, model_source, int(max_categories_val))
833
+ elif input_type == "PDF Documents":
834
+ mode_mapping = {"Image (visual documents)": "image", "Text (text-heavy)": "text", "Both (comprehensive)": "both"}
835
+ actual_mode = mode_mapping.get(pdf_mode, "image")
836
+ code = generate_extract_code("pdf", pdf_description or "document", model, model_source, int(max_categories_val), actual_mode)
837
+ else: # Images
838
+ code = generate_extract_code("image", image_description or "images", model, model_source, int(max_categories_val))
839
+
840
  yield (
841
  gr.update(value=categories_df, visible=True),
842
  csv_path,
843
+ code,
844
  f"Extracted {len(top_categories)} categories in {processing_time:.1f}s"
845
  )
846
 
847
  except Exception as e:
848
+ yield None, None, None, f"**Error:** {str(e)}"
849
 
850
 
851
  def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
 
856
  progress=gr.Progress(track_tqdm=True)):
857
  """Classify data with user-provided categories."""
858
  if not CATLLM_AVAILABLE:
859
+ yield None, None, None, None, None, "**Error:** catllm package not available"
860
  return
861
 
862
  all_cats = [cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10]
863
  categories = [c.strip() for c in all_cats if c and c.strip()]
864
 
865
  if not categories:
866
+ yield None, None, None, None, None, "**Error:** Please enter at least one category"
867
  return
868
 
869
  actual_api_key, provider = get_api_key(model, model_tier, api_key_input)
870
  if not actual_api_key:
871
+ yield None, None, None, None, None, f"**Error:** {provider} API key not configured"
872
  return
873
 
874
  if model_source_input == "auto":
 
876
  else:
877
  model_source = model_source_input
878
 
879
+ # Check file size for images and PDFs
880
+ files_to_check = None
881
+ if input_type == "Images":
882
+ files_to_check = image_folder if image_folder else image_file
883
+ elif input_type == "PDF Documents":
884
+ files_to_check = pdf_folder if pdf_folder else pdf_file
885
+
886
+ if files_to_check:
887
+ total_size_mb = calculate_total_file_size(files_to_check)
888
+ if total_size_mb > MAX_FILE_SIZE_MB:
889
+ # Generate the code for the user
890
+ if input_type == "Images":
891
+ code = generate_classify_code("image", image_description or "images", categories, model, model_source)
892
+ else:
893
+ mode_mapping = {"Image (visual documents)": "image", "Text (text-heavy)": "text", "Both (comprehensive)": "both"}
894
+ actual_mode = mode_mapping.get(pdf_mode, "image")
895
+ code = generate_classify_code("pdf", pdf_description or "document", categories, model, model_source, actual_mode)
896
+
897
+ warning_msg = f"""**⚠️ Large Upload Detected ({total_size_mb:.1f} MB)**
898
+
899
+ Uploads over {MAX_FILE_SIZE_MB} MB may experience performance issues or timeouts on this web app.
900
+
901
+ **Recommended:** Run the code locally using the Python package instead. See the code below, or click "See the Code" after this message.
902
+
903
+ ```
904
+ pip install cat-llm
905
+ ```
906
+ """
907
+ yield None, None, None, code, None, warning_msg
908
+ return
909
+
910
  try:
911
+ yield None, None, None, None, None, "Classifying your data..."
912
 
913
  start_time = time.time()
914
 
915
  if input_type == "Survey Responses":
916
  if not spreadsheet_file:
917
+ yield None, None, None, None, None, "**Error:** Please upload a CSV/Excel file"
918
  return
919
  if not spreadsheet_column:
920
+ yield None, None, None, None, None, "**Error:** Please select a column"
921
  return
922
 
923
  file_path = spreadsheet_file if isinstance(spreadsheet_file, str) else spreadsheet_file.name
 
957
  pdf_input = pdf_file if isinstance(pdf_file, str) else pdf_file.name
958
  original_filename = pdf_input.split("/")[-1]
959
  else:
960
+ yield None, None, None, None, None, "**Error:** Please upload PDF file(s) or a folder"
961
  return
962
 
963
  column_name = "PDF Pages"
 
997
  image_input = image_file if isinstance(image_file, str) else image_file.name
998
  original_filename = image_input.split("/")[-1]
999
  else:
1000
+ yield None, None, None, None, None, "**Error:** Please upload image file(s) or a folder"
1001
  return
1002
 
1003
  column_name = "Image Files"
 
1013
  )
1014
 
1015
  else:
1016
+ yield None, None, None, None, None, f"**Error:** Unknown input type: {input_type}"
1017
  return
1018
 
1019
  processing_time = time.time() - start_time
 
1090
 
1091
  plt.tight_layout()
1092
 
1093
+ # Generate reproducibility code
1094
+ if input_type == "Survey Responses":
1095
+ code = generate_classify_code("text", spreadsheet_column, categories, model, model_source)
1096
+ elif input_type == "PDF Documents":
1097
+ mode_mapping = {"Image (visual documents)": "image", "Text (text-heavy)": "text", "Both (comprehensive)": "both"}
1098
+ actual_mode = mode_mapping.get(pdf_mode, "image")
1099
+ code = generate_classify_code("pdf", pdf_description or "document", categories, model, model_source, actual_mode)
1100
+ else: # Images
1101
+ code = generate_classify_code("image", image_description or "images", categories, model, model_source)
1102
+
1103
  yield (
1104
  gr.update(value=fig, visible=True),
1105
  gr.update(value=result, visible=True),
1106
  [csv_path, report_pdf_path],
1107
+ code,
1108
  None,
1109
  f"Classified {num_items} items in {processing_time:.1f}s"
1110
  )
1111
 
1112
  except Exception as e:
1113
+ yield None, None, None, None, None, f"**Error:** {str(e)}"
1114
 
1115
 
1116
  def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
 
1121
  progress=gr.Progress(track_tqdm=True)):
1122
  """Extract categories then classify data with them."""
1123
  if not CATLLM_AVAILABLE:
1124
+ yield None, None, None, None, None, None, None, None, "**Error:** catllm package not available"
1125
  return
1126
 
1127
  actual_api_key, provider = get_api_key(model, model_tier, api_key_input)
1128
  if not actual_api_key:
1129
+ yield None, None, None, None, None, None, None, None, f"**Error:** {provider} API key not configured"
1130
  return
1131
 
1132
  if model_source_input == "auto":
 
1134
  else:
1135
  model_source = model_source_input
1136
 
1137
+ # Check file size for images and PDFs
1138
+ files_to_check = None
1139
+ if input_type == "Images":
1140
+ files_to_check = image_folder if image_folder else image_file
1141
+ elif input_type == "PDF Documents":
1142
+ files_to_check = pdf_folder if pdf_folder else pdf_file
1143
+
1144
+ if files_to_check:
1145
+ total_size_mb = calculate_total_file_size(files_to_check)
1146
+ if total_size_mb > MAX_FILE_SIZE_MB:
1147
+ # Generate the code for the user
1148
+ if input_type == "Images":
1149
+ extract_code = generate_extract_code("image", image_description or "images", model, model_source, int(max_categories_val))
1150
+ else:
1151
+ mode_mapping = {"Image (visual documents)": "image", "Text (text-heavy)": "text", "Both (comprehensive)": "both"}
1152
+ actual_mode = mode_mapping.get(pdf_mode, "image")
1153
+ extract_code = generate_extract_code("pdf", pdf_description or "document", model, model_source, int(max_categories_val), actual_mode)
1154
+
1155
+ warning_msg = f"""**⚠️ Large Upload Detected ({total_size_mb:.1f} MB)**
1156
+
1157
+ Uploads over {MAX_FILE_SIZE_MB} MB may experience performance issues or timeouts on this web app.
1158
+
1159
+ **Recommended:** Run the code locally using the Python package instead. See the code below, or click "See the Code" after this message.
1160
+
1161
+ ```
1162
+ pip install cat-llm
1163
+ ```
1164
+ """
1165
+ yield None, None, extract_code, None, None, None, None, None, warning_msg
1166
+ return
1167
+
1168
  try:
1169
  # Phase 1: Extract categories
1170
+ yield None, None, None, None, None, None, None, None, "Phase 1: Extracting categories..."
1171
 
1172
  start_time = time.time()
1173
 
1174
  if input_type == "Survey Responses":
1175
  if not spreadsheet_file:
1176
+ yield None, None, None, None, None, None, None, None, "**Error:** Please upload a CSV/Excel file"
1177
  return
1178
  if not spreadsheet_column:
1179
+ yield None, None, None, None, None, None, None, None, "**Error:** Please select a column"
1180
  return
1181
 
1182
  file_path = spreadsheet_file if isinstance(spreadsheet_file, str) else spreadsheet_file.name
 
1209
  input_data = pdf_file if isinstance(pdf_file, str) else pdf_file.name
1210
  original_filename = input_data.split("/")[-1]
1211
  else:
1212
+ yield None, None, None, None, None, None, None, None, "**Error:** Please upload PDF file(s) or a folder"
1213
  return
1214
 
1215
  column_name = "PDF Pages"
 
1240
  input_data = image_file if isinstance(image_file, str) else image_file.name
1241
  original_filename = input_data.split("/")[-1]
1242
  else:
1243
+ yield None, None, None, None, None, None, None, None, "**Error:** Please upload image file(s) or a folder"
1244
  return
1245
 
1246
  column_name = "Image Files"
 
1249
  mode_param = None
1250
 
1251
  else:
1252
+ yield None, None, None, None, None, None, None, None, f"**Error:** Unknown input type: {input_type}"
1253
  return
1254
 
1255
  # Calculate sensible divisions based on input size and type
 
1286
  categories_df = extract_result.get('counts_df', pd.DataFrame())
1287
 
1288
  if not categories:
1289
+ yield None, None, None, None, None, None, None, None, "**Error:** No categories were extracted"
1290
  return
1291
 
1292
  extract_time = time.time() - start_time
 
1302
  categories_df.to_csv(f.name, index=False)
1303
  extract_csv_path = f.name
1304
 
1305
+ # Generate extract code
1306
+ extract_code = generate_extract_code(input_type_param, description, model, model_source, int(max_categories_val), mode_param)
1307
+
1308
  yield (
1309
  gr.update(value=categories_df, visible=True),
1310
  extract_csv_path,
1311
+ extract_code,
1312
+ None, None, None, None, None,
1313
  f"Extracted {len(categories)} categories in {extract_time:.1f}s. Now classifying..."
1314
  )
1315
 
 
1405
 
1406
  plt.tight_layout()
1407
 
1408
+ # Generate classify code
1409
+ classify_code = generate_classify_code(input_type_param, description, categories, model, model_source, mode_param)
1410
+
1411
  yield (
1412
  gr.update(value=categories_df, visible=True),
1413
  extract_csv_path,
1414
+ extract_code,
1415
  gr.update(value=fig, visible=True),
1416
  gr.update(value=result, visible=True),
1417
  [classify_csv_path, report_pdf_path],
1418
+ classify_code,
1419
  None,
1420
  f"Extracted {len(categories)} categories and classified {num_items} items in {total_time:.1f}s"
1421
  )
1422
 
1423
  except Exception as e:
1424
+ yield None, None, None, None, None, None, None, None, f"**Error:** {str(e)}"
1425
 
1426
 
1427
  def add_category_field(current_count):
 
1475
  gr.update(visible=False), # extract_output_group
1476
  gr.update(value=None, visible=False), # extracted_categories
1477
  None, # extract_download
1478
+ "# Code will be generated after extraction", # extract_code_display
1479
  gr.update(visible=False), # classify_output_group
1480
  gr.update(value=None, visible=False), # distribution_plot
1481
  gr.update(value=None, visible=False), # results
1482
  None, # download_file
1483
+ "# Code will be generated after classification", # classify_code_display
1484
  ])
1485
  return updates
1486
 
 
1693
  wrap=True
1694
  )
1695
  extract_download = gr.File(label="Download Categories (CSV)")
1696
+ with gr.Accordion("See the Code", open=False):
1697
+ extract_code_display = gr.Code(
1698
+ label="Python Code",
1699
+ language="python",
1700
+ value="# Code will be generated after extraction",
1701
+ interactive=False
1702
+ )
1703
 
1704
  # Classify output group
1705
  with gr.Group(visible=False) as classify_output_group:
 
1707
  distribution_plot = gr.Plot(label="Category Distribution (%)", visible=False)
1708
  results = gr.DataFrame(label="Full Results", visible=False)
1709
  download_file = gr.File(label="Download Results (CSV + Methodology Report)", file_count="multiple")
1710
+ with gr.Accordion("See the Code", open=False):
1711
+ classify_code_display = gr.Code(
1712
+ label="Python Code",
1713
+ language="python",
1714
+ value="# Code will be generated after classification",
1715
+ interactive=False
1716
+ )
1717
 
1718
  # Event handlers
1719
  def switch_input_type(input_type_val):
 
1835
  progress=gr.Progress(track_tqdm=True)):
1836
  """Dispatch to appropriate function based on task mode."""
1837
  if task == "extract":
1838
+ # run_extract_categories yields: (categories_df, csv_path, code, status)
1839
  for update in run_extract_categories(
1840
  input_type, spreadsheet_file, spreadsheet_column,
1841
  pdf_file, pdf_folder_val, pdf_description, pdf_mode,
 
1848
  yield (
1849
  update[0], # extracted_categories
1850
  update[1], # extract_download
1851
+ update[2], # extract_code_display
1852
  None, # distribution_plot
1853
  None, # results
1854
  None, # download_file
1855
+ None, # classify_code_display
1856
+ update[3] # status
1857
  )
1858
  elif task == "assign":
1859
+ # run_classify_data yields: (plot, df, files, code, unused, status)
1860
  for update in run_classify_data(
1861
  input_type, spreadsheet_file, spreadsheet_column,
1862
  pdf_file, pdf_folder_val, pdf_description, pdf_mode,
 
1869
  yield (
1870
  None, # extracted_categories
1871
  None, # extract_download
1872
+ None, # extract_code_display
1873
  update[0], # distribution_plot
1874
  update[1], # results
1875
  update[2], # download_file
1876
+ update[3], # classify_code_display
1877
+ update[5] # status
1878
  )
1879
  elif task == "extract_and_assign":
1880
+ # run_extract_and_assign yields: (categories_df, extract_csv, extract_code, plot, df, files, classify_code, unused, status)
1881
  for update in run_extract_and_assign(
1882
  input_type, spreadsheet_file, spreadsheet_column,
1883
  pdf_file, pdf_folder_val, pdf_description, pdf_mode,
 
1889
  yield (
1890
  update[0], # extracted_categories
1891
  update[1], # extract_download
1892
+ update[2], # extract_code_display
1893
+ update[3], # distribution_plot
1894
+ update[4], # results
1895
+ update[5], # download_file
1896
+ update[6], # classify_code_display
1897
+ update[8] # status
1898
  )
1899
  else:
1900
+ yield (None, None, None, None, None, None, None, "Please select a task first.")
1901
 
1902
  run_btn.click(
1903
  fn=dispatch_run,
 
1905
  pdf_file, pdf_folder, pdf_description, pdf_mode,
1906
  image_file, image_folder, image_description,
1907
  max_categories] + category_inputs + [model_tier, model, model_source, api_key],
1908
+ outputs=[extracted_categories, extract_download, extract_code_display, distribution_plot, results, download_file, classify_code_display, status]
1909
  )
1910
 
1911
  reset_btn.click(
 
1922
  categories_group, extract_settings_group, max_categories, model_group, run_btn,
1923
  model_tier, model, model_source, api_key, api_key, api_key_status,
1924
  status,
1925
+ extract_output_group, extracted_categories, extract_download, extract_code_display,
1926
+ classify_output_group, distribution_plot, results, download_file, classify_code_display
1927
  ]
1928
  )
1929