chrissoria Claude committed on
Commit
388d315
·
1 Parent(s): bb2d5e5

Add PDF document classification alongside text data

Browse files

- Rename app to "Research Data Classifier"
- Add input type toggle: Text Data (CSV/Excel) | PDF Documents
- Add PDF-specific inputs: file upload, description, processing mode (Image/Text/Both)
- Modify classify_data() to branch between multi_class() and pdf_multi_class()
- Update sample results and success message to handle both modes
- Update About section to mention PDF support

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +197 -78
app.py CHANGED
@@ -443,7 +443,8 @@ def load_columns(file):
443
  return gr.update(choices=[], value=None), f"**Error:** {str(e)}"
444
 
445
 
446
- def classify_data(spreadsheet_file, spreadsheet_column,
 
447
  cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10,
448
  model_tier, model, model_source_input, api_key_input):
449
  """Main classification function with progress updates. Yields status updates then final results."""
@@ -515,53 +516,132 @@ def classify_data(spreadsheet_file, spreadsheet_column,
515
  model_source = model_source_input
516
 
517
  try:
518
- if not spreadsheet_file:
519
- yield None, None, None, None, "**Error:** Please upload a file"
520
- return
521
- if not spreadsheet_column:
522
- yield None, None, None, None, "**Error:** Please select a column to classify"
523
- return
524
 
525
- file_path = spreadsheet_file if isinstance(spreadsheet_file, str) else spreadsheet_file.name
526
- if file_path.endswith('.csv'):
527
- df = pd.read_csv(file_path)
528
- else:
529
- df = pd.read_excel(file_path)
530
 
531
- if spreadsheet_column not in df.columns:
532
- yield None, None, None, None, f"**Error:** Column '{spreadsheet_column}' not found"
533
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
534
 
535
- input_data = df[spreadsheet_column].tolist()
 
 
 
536
 
537
- # Progress update: data loaded
538
- yield None, None, None, None, f"⏳ **Loading data...** Found {len(input_data)} responses to classify."
 
539
 
540
- # Calculate data quality metrics before classification
541
- text_series = df[spreadsheet_column].dropna().astype(str)
542
- data_quality = {
543
- 'null_count': int(df[spreadsheet_column].isna().sum()),
544
- 'avg_length': round(text_series.str.len().mean(), 1) if len(text_series) > 0 else 0,
545
- 'min_length': int(text_series.str.len().min()) if len(text_series) > 0 else 0,
546
- 'max_length': int(text_series.str.len().max()) if len(text_series) > 0 else 0,
547
- 'error_count': 0 # Will be updated after classification
548
- }
549
 
550
- # Progress update: starting classification
551
- yield None, None, None, None, f"🔄 **Classifying {len(input_data)} responses...** This may take a moment."
552
 
553
- # Capture timing
554
- start_time = time.time()
 
 
 
 
 
 
555
 
556
- result = catllm.multi_class(
557
- survey_input=input_data,
558
- categories=categories,
559
- api_key=actual_api_key,
560
- user_model=actual_model,
561
- model_source=model_source
562
- )
 
 
563
 
564
- processing_time = time.time() - start_time
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
565
 
566
  # Update error count from results
567
  if 'processing_status' in result.columns:
@@ -572,9 +652,6 @@ def classify_data(spreadsheet_file, spreadsheet_column,
572
  result.to_csv(f.name, index=False)
573
  csv_path = f.name
574
 
575
- # Get original filename for methodology report
576
- original_filename = file_path.split("/")[-1]
577
-
578
  # Calculate success rate
579
  if 'processing_status' in result.columns:
580
  success_count = (result['processing_status'] == 'success').sum()
@@ -582,17 +659,6 @@ def classify_data(spreadsheet_file, spreadsheet_column,
582
  else:
583
  success_rate = 100.0
584
 
585
- # Build prompt template for documentation (chain of thought - default)
586
- prompt_template = '''Categorize this survey response "{response}" into the following categories that apply:
587
- {categories}
588
-
589
- Let's think step by step:
590
- 1. First, identify the main themes mentioned in the response
591
- 2. Then, match each theme to the relevant categories
592
- 3. Finally, assign 1 to matching categories and 0 to non-matching categories
593
-
594
- Provide your work in JSON format where the number belonging to each category is the key and a 1 if the category is present and a 0 if it is not present as key values.'''
595
-
596
  # Get version info
597
  try:
598
  catllm_version = catllm.__version__
@@ -604,11 +670,11 @@ Provide your work in JSON format where the number belonging to each category is
604
  yield None, None, None, None, f"📄 **Generating methodology report...** Classification complete in {processing_time:.1f}s."
605
 
606
  # Generate PDF methodology report with all new data
607
- pdf_path = generate_methodology_report_pdf(
608
  categories=categories,
609
  model=actual_model,
610
- column_name=spreadsheet_column,
611
- num_rows=len(input_data),
612
  model_source=model_source,
613
  filename=original_filename,
614
  success_rate=success_rate,
@@ -657,26 +723,33 @@ Provide your work in JSON format where the number belonging to each category is
657
 
658
  # Build sample results DataFrame (first 5 rows)
659
  sample_data = []
 
 
 
 
660
  for _, row in result.head(5).iterrows():
661
- original_text = str(row.get('survey_input', ''))[:100]
662
- if len(str(row.get('survey_input', ''))) > 100:
663
  original_text += "..."
664
  assigned = row.get('categories_id', '')
665
  if pd.isna(assigned) or assigned == '':
666
  assigned = "None"
667
  sample_data.append({
668
- "Original Text": original_text,
669
  "Assigned Categories": str(assigned)
670
  })
671
  sample_df = pd.DataFrame(sample_data)
672
 
 
 
 
673
  # Final yield: distribution plot (visible), samples (visible), full results (visible), files, status
674
  yield (
675
  gr.update(value=distribution_fig, visible=True),
676
  gr.update(value=sample_df, visible=True),
677
  gr.update(value=result, visible=True),
678
- [csv_path, pdf_path],
679
- f"✅ **Success!** Classified {len(input_data)} responses in {processing_time:.1f}s"
680
  )
681
 
682
  except Exception as e:
@@ -696,8 +769,14 @@ def add_category_field(current_count):
696
  def reset_all():
697
  """Reset all inputs and outputs to initial state."""
698
  updates = [
 
 
 
699
  None, # spreadsheet_file
700
  gr.update(choices=[], value=None), # spreadsheet_column
 
 
 
701
  ]
702
  # Reset category inputs (first 3 visible, rest hidden, all empty)
703
  for i in range(MAX_CATEGORIES):
@@ -780,10 +859,10 @@ custom_css = """
780
  }
781
  """
782
 
783
- with gr.Blocks(title="CatLLM - Survey Response Classifier", theme=gr.themes.Soft(), css=custom_css) as demo:
784
  gr.Image("logo.png", show_label=False, show_download_button=False, height=115, container=False)
785
- gr.Markdown("# CatLLM - Survey Response Classifier")
786
- gr.Markdown("Classify survey responses into custom categories using LLMs.")
787
 
788
  with gr.Accordion("About This App", open=False):
789
  gr.Markdown("""
@@ -791,12 +870,12 @@ with gr.Blocks(title="CatLLM - Survey Response Classifier", theme=gr.themes.Soft
791
 
792
  ---
793
 
794
- **CatLLM** is an open-source Python package for classifying text data using Large Language Models.
795
 
796
  ### What It Does
797
- - Classifies survey responses, open-ended text, and other unstructured data into custom categories
798
  - Supports multiple LLM providers: OpenAI, Anthropic, Google, HuggingFace, and more
799
- - Returns structured results with category assignments for each response
800
  - Tested on over 40,000 rows of data with a 100% structured output rate (actual output rate ~99.98% due to occasional server errors)
801
 
802
  ### Beta Test - We Want Your Feedback!
@@ -823,17 +902,44 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
823
 
824
  with gr.Row():
825
  with gr.Column():
826
- spreadsheet_file = gr.File(
827
- label="Upload Survey Data (CSV or Excel)",
828
- file_types=[".csv", ".xlsx", ".xls"]
 
 
829
  )
830
- example_btn = gr.Button("📋 Try Example Dataset", variant="secondary", size="sm")
831
 
832
- spreadsheet_column = gr.Dropdown(
833
- label="Column to Classify",
834
- choices=[],
835
- info="Select the column containing text to classify"
836
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
837
 
838
  gr.Markdown("### Categories")
839
  category_inputs = []
@@ -910,6 +1016,19 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
910
  )
911
 
912
  # Event handlers
 
 
 
 
 
 
 
 
 
 
 
 
 
913
  def update_model_tier(tier):
914
  """Update model choices and API key visibility based on tier."""
915
  if tier == "Free Models":
@@ -951,7 +1070,7 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
951
 
952
  classify_btn.click(
953
  fn=classify_data,
954
- inputs=[spreadsheet_file, spreadsheet_column] + category_inputs + [model_tier, model, model_source, api_key],
955
  outputs=[distribution_plot, sample_results, results, download_file, status]
956
  )
957
 
@@ -964,7 +1083,7 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
964
  reset_btn.click(
965
  fn=reset_all,
966
  inputs=[],
967
- outputs=[spreadsheet_file, spreadsheet_column] + category_inputs + [add_category_btn, category_count, model_tier, model, model_source, api_key, api_key_status, status, distribution_plot, sample_results, results, download_file, code_output]
968
  )
969
 
970
 
 
443
  return gr.update(choices=[], value=None), f"**Error:** {str(e)}"
444
 
445
 
446
+ def classify_data(input_type, spreadsheet_file, spreadsheet_column,
447
+ pdf_file, pdf_description, pdf_mode,
448
  cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10,
449
  model_tier, model, model_source_input, api_key_input):
450
  """Main classification function with progress updates. Yields status updates then final results."""
 
516
  model_source = model_source_input
517
 
518
  try:
519
+ # Determine if we're processing text or PDF
520
+ is_pdf_mode = input_type == "PDF Documents"
 
 
 
 
521
 
522
+ if is_pdf_mode:
523
+ # PDF validation
524
+ if not pdf_file:
525
+ yield None, None, None, None, "**Error:** Please upload a PDF file"
526
+ return
527
 
528
+ pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
529
+
530
+ # Map UI mode to function parameter
531
+ mode_mapping = {
532
+ "Image (visual documents)": "image",
533
+ "Text (text-heavy)": "text",
534
+ "Both (comprehensive)": "both"
535
+ }
536
+ actual_pdf_mode = mode_mapping.get(pdf_mode, "image")
537
+
538
+ # Progress update
539
+ yield None, None, None, None, f"⏳ **Loading PDF...** Processing document."
540
+
541
+ # Data quality placeholder for PDFs
542
+ data_quality = {
543
+ 'null_count': 0,
544
+ 'avg_length': 0,
545
+ 'min_length': 0,
546
+ 'max_length': 0,
547
+ 'error_count': 0
548
+ }
549
+
550
+ # Progress update: starting classification
551
+ yield None, None, None, None, f"🔄 **Classifying PDF pages...** This may take a moment."
552
+
553
+ # Capture timing
554
+ start_time = time.time()
555
+
556
+ result = catllm.pdf_multi_class(
557
+ pdf_description=pdf_description or "document",
558
+ pdf_input=pdf_path,
559
+ categories=categories,
560
+ api_key=actual_api_key,
561
+ user_model=actual_model,
562
+ model_source=model_source,
563
+ mode=actual_pdf_mode
564
+ )
565
 
566
+ processing_time = time.time() - start_time
567
+ num_items = len(result)
568
+ original_filename = pdf_path.split("/")[-1]
569
+ column_name = "PDF Pages"
570
 
571
+ # Build prompt template for PDF
572
+ prompt_template = f'''Categorize this PDF page from "{pdf_description or 'document'}" into the following categories that apply:
573
+ {{categories}}
574
 
575
+ Let's think step by step:
576
+ 1. First, identify the main themes present in this page
577
+ 2. Then, match each theme to the relevant categories
578
+ 3. Finally, assign 1 to matching categories and 0 to non-matching categories
 
 
 
 
 
579
 
580
+ Provide your work in JSON format where the number belonging to each category is the key and a 1 if the category is present and a 0 if it is not present as key values.'''
 
581
 
582
+ else:
583
+ # Text data validation
584
+ if not spreadsheet_file:
585
+ yield None, None, None, None, "**Error:** Please upload a file"
586
+ return
587
+ if not spreadsheet_column:
588
+ yield None, None, None, None, "**Error:** Please select a column to classify"
589
+ return
590
 
591
+ file_path = spreadsheet_file if isinstance(spreadsheet_file, str) else spreadsheet_file.name
592
+ if file_path.endswith('.csv'):
593
+ df = pd.read_csv(file_path)
594
+ else:
595
+ df = pd.read_excel(file_path)
596
+
597
+ if spreadsheet_column not in df.columns:
598
+ yield None, None, None, None, f"**Error:** Column '{spreadsheet_column}' not found"
599
+ return
600
 
601
+ input_data = df[spreadsheet_column].tolist()
602
+
603
+ # Progress update: data loaded
604
+ yield None, None, None, None, f"⏳ **Loading data...** Found {len(input_data)} responses to classify."
605
+
606
+ # Calculate data quality metrics before classification
607
+ text_series = df[spreadsheet_column].dropna().astype(str)
608
+ data_quality = {
609
+ 'null_count': int(df[spreadsheet_column].isna().sum()),
610
+ 'avg_length': round(text_series.str.len().mean(), 1) if len(text_series) > 0 else 0,
611
+ 'min_length': int(text_series.str.len().min()) if len(text_series) > 0 else 0,
612
+ 'max_length': int(text_series.str.len().max()) if len(text_series) > 0 else 0,
613
+ 'error_count': 0 # Will be updated after classification
614
+ }
615
+
616
+ # Progress update: starting classification
617
+ yield None, None, None, None, f"🔄 **Classifying {len(input_data)} responses...** This may take a moment."
618
+
619
+ # Capture timing
620
+ start_time = time.time()
621
+
622
+ result = catllm.multi_class(
623
+ survey_input=input_data,
624
+ categories=categories,
625
+ api_key=actual_api_key,
626
+ user_model=actual_model,
627
+ model_source=model_source
628
+ )
629
+
630
+ processing_time = time.time() - start_time
631
+ num_items = len(input_data)
632
+ original_filename = file_path.split("/")[-1]
633
+ column_name = spreadsheet_column
634
+
635
+ # Build prompt template for documentation (chain of thought - default)
636
+ prompt_template = '''Categorize this survey response "{response}" into the following categories that apply:
637
+ {categories}
638
+
639
+ Let's think step by step:
640
+ 1. First, identify the main themes mentioned in the response
641
+ 2. Then, match each theme to the relevant categories
642
+ 3. Finally, assign 1 to matching categories and 0 to non-matching categories
643
+
644
+ Provide your work in JSON format where the number belonging to each category is the key and a 1 if the category is present and a 0 if it is not present as key values.'''
645
 
646
  # Update error count from results
647
  if 'processing_status' in result.columns:
 
652
  result.to_csv(f.name, index=False)
653
  csv_path = f.name
654
 
 
 
 
655
  # Calculate success rate
656
  if 'processing_status' in result.columns:
657
  success_count = (result['processing_status'] == 'success').sum()
 
659
  else:
660
  success_rate = 100.0
661
 
 
 
 
 
 
 
 
 
 
 
 
662
  # Get version info
663
  try:
664
  catllm_version = catllm.__version__
 
670
  yield None, None, None, None, f"📄 **Generating methodology report...** Classification complete in {processing_time:.1f}s."
671
 
672
  # Generate PDF methodology report with all new data
673
+ report_pdf_path = generate_methodology_report_pdf(
674
  categories=categories,
675
  model=actual_model,
676
+ column_name=column_name,
677
+ num_rows=num_items,
678
  model_source=model_source,
679
  filename=original_filename,
680
  success_rate=success_rate,
 
723
 
724
  # Build sample results DataFrame (first 5 rows)
725
  sample_data = []
726
+ # Determine the input column name based on mode
727
+ input_col = 'pdf_input' if is_pdf_mode else 'survey_input'
728
+ input_label = "PDF Page" if is_pdf_mode else "Original Text"
729
+
730
  for _, row in result.head(5).iterrows():
731
+ original_text = str(row.get(input_col, ''))[:100]
732
+ if len(str(row.get(input_col, ''))) > 100:
733
  original_text += "..."
734
  assigned = row.get('categories_id', '')
735
  if pd.isna(assigned) or assigned == '':
736
  assigned = "None"
737
  sample_data.append({
738
+ input_label: original_text,
739
  "Assigned Categories": str(assigned)
740
  })
741
  sample_df = pd.DataFrame(sample_data)
742
 
743
+ # Determine success message based on mode
744
+ item_type = "pages" if is_pdf_mode else "responses"
745
+
746
  # Final yield: distribution plot (visible), samples (visible), full results (visible), files, status
747
  yield (
748
  gr.update(value=distribution_fig, visible=True),
749
  gr.update(value=sample_df, visible=True),
750
  gr.update(value=result, visible=True),
751
+ [csv_path, report_pdf_path],
752
+ f"✅ **Success!** Classified {num_items} {item_type} in {processing_time:.1f}s"
753
  )
754
 
755
  except Exception as e:
 
769
  def reset_all():
770
  """Reset all inputs and outputs to initial state."""
771
  updates = [
772
+ "Text Data (CSV/Excel)", # input_type
773
+ gr.update(visible=True), # text_input_group
774
+ gr.update(visible=False), # pdf_input_group
775
  None, # spreadsheet_file
776
  gr.update(choices=[], value=None), # spreadsheet_column
777
+ None, # pdf_file
778
+ "", # pdf_description
779
+ "Image (visual documents)", # pdf_mode
780
  ]
781
  # Reset category inputs (first 3 visible, rest hidden, all empty)
782
  for i in range(MAX_CATEGORIES):
 
859
  }
860
  """
861
 
862
+ with gr.Blocks(title="CatLLM - Research Data Classifier", theme=gr.themes.Soft(), css=custom_css) as demo:
863
  gr.Image("logo.png", show_label=False, show_download_button=False, height=115, container=False)
864
+ gr.Markdown("# CatLLM - Research Data Classifier")
865
+ gr.Markdown("Classify text data (CSV/Excel) and PDF documents into custom categories using LLMs.")
866
 
867
  with gr.Accordion("About This App", open=False):
868
  gr.Markdown("""
 
870
 
871
  ---
872
 
873
+ **CatLLM** is an open-source Python package for classifying text and document data using Large Language Models.
874
 
875
  ### What It Does
876
+ - Classifies survey responses, open-ended text, PDF documents, and other unstructured data into custom categories
877
  - Supports multiple LLM providers: OpenAI, Anthropic, Google, HuggingFace, and more
878
+ - Returns structured results with category assignments for each response or PDF page
879
  - Tested on over 40,000 rows of data with a 100% structured output rate (actual output rate ~99.98% due to occasional server errors)
880
 
881
  ### Beta Test - We Want Your Feedback!
 
902
 
903
  with gr.Row():
904
  with gr.Column():
905
+ # Input type toggle
906
+ input_type = gr.Radio(
907
+ choices=["Text Data (CSV/Excel)", "PDF Documents"],
908
+ value="Text Data (CSV/Excel)",
909
+ label="Input Type"
910
  )
 
911
 
912
+ # Text data input group
913
+ with gr.Group(visible=True) as text_input_group:
914
+ spreadsheet_file = gr.File(
915
+ label="Upload Data (CSV or Excel)",
916
+ file_types=[".csv", ".xlsx", ".xls"]
917
+ )
918
+ example_btn = gr.Button("📋 Try Example Dataset", variant="secondary", size="sm")
919
+
920
+ spreadsheet_column = gr.Dropdown(
921
+ label="Column to Classify",
922
+ choices=[],
923
+ info="Select the column containing text to classify"
924
+ )
925
+
926
+ # PDF input group
927
+ with gr.Group(visible=False) as pdf_input_group:
928
+ pdf_file = gr.File(
929
+ label="Upload PDF Document",
930
+ file_types=[".pdf"]
931
+ )
932
+ pdf_description = gr.Textbox(
933
+ label="Document Description",
934
+ placeholder="e.g., 'research papers', 'interview transcripts', 'policy documents'",
935
+ info="Helps the LLM understand the context of your PDF"
936
+ )
937
+ pdf_mode = gr.Radio(
938
+ choices=["Image (visual documents)", "Text (text-heavy)", "Both (comprehensive)"],
939
+ value="Image (visual documents)",
940
+ label="Processing Mode",
941
+ info="Image mode is best for scans/charts; Text mode is faster for text-heavy docs"
942
+ )
943
 
944
  gr.Markdown("### Categories")
945
  category_inputs = []
 
1016
  )
1017
 
1018
  # Event handlers
1019
+ def switch_input_type(input_type_val):
1020
+ """Toggle visibility between text and PDF input groups."""
1021
+ if input_type_val == "Text Data (CSV/Excel)":
1022
+ return gr.update(visible=True), gr.update(visible=False), "Ready to classify text data"
1023
+ else:
1024
+ return gr.update(visible=False), gr.update(visible=True), "Ready to classify PDF document"
1025
+
1026
+ input_type.change(
1027
+ fn=switch_input_type,
1028
+ inputs=[input_type],
1029
+ outputs=[text_input_group, pdf_input_group, status]
1030
+ )
1031
+
1032
  def update_model_tier(tier):
1033
  """Update model choices and API key visibility based on tier."""
1034
  if tier == "Free Models":
 
1070
 
1071
  classify_btn.click(
1072
  fn=classify_data,
1073
+ inputs=[input_type, spreadsheet_file, spreadsheet_column, pdf_file, pdf_description, pdf_mode] + category_inputs + [model_tier, model, model_source, api_key],
1074
  outputs=[distribution_plot, sample_results, results, download_file, status]
1075
  )
1076
 
 
1083
  reset_btn.click(
1084
  fn=reset_all,
1085
  inputs=[],
1086
+ outputs=[input_type, text_input_group, pdf_input_group, spreadsheet_file, spreadsheet_column, pdf_file, pdf_description, pdf_mode] + category_inputs + [add_category_btn, category_count, model_tier, model, model_source, api_key, api_key_status, status, distribution_plot, sample_results, results, download_file, code_output]
1087
  )
1088
 
1089