Spaces:
Running
Running
Commit
·
c44cb7c
1
Parent(s):
41896b8
Add 'See the Code' feature and large file size warning
Browse files
- Add collapsible "See the Code" accordions for Extract and Classify tasks
- Generate reproducible Python code snippets for all operations
- Add file size check (>100MB) for images/PDFs with warning
- When large files detected, show warning and generated code as alternative
- Users can copy code to run locally with: pip install cat-llm
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
app.py
CHANGED
|
@@ -21,6 +21,167 @@ except ImportError as e:
|
|
| 21 |
|
| 22 |
MAX_CATEGORIES = 10
|
| 23 |
INITIAL_CATEGORIES = 3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
# Free models (uses Space secrets - no user API key needed)
|
| 26 |
FREE_MODEL_CHOICES = [
|
|
@@ -497,12 +658,12 @@ def run_extract_categories(input_type, spreadsheet_file, spreadsheet_column,
|
|
| 497 |
progress=gr.Progress(track_tqdm=True)):
|
| 498 |
"""Extract categories from data and display them in a table."""
|
| 499 |
if not CATLLM_AVAILABLE:
|
| 500 |
-
yield None, None, "**Error:** catllm package not available"
|
| 501 |
return
|
| 502 |
|
| 503 |
actual_api_key, provider = get_api_key(model, model_tier, api_key_input)
|
| 504 |
if not actual_api_key:
|
| 505 |
-
yield None, None, f"**Error:** {provider} API key not configured"
|
| 506 |
return
|
| 507 |
|
| 508 |
if model_source_input == "auto":
|
|
@@ -510,17 +671,48 @@ def run_extract_categories(input_type, spreadsheet_file, spreadsheet_column,
|
|
| 510 |
else:
|
| 511 |
model_source = model_source_input
|
| 512 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 513 |
try:
|
| 514 |
-
yield None, None, "Extracting categories from your data..."
|
| 515 |
|
| 516 |
start_time = time.time()
|
| 517 |
|
| 518 |
if input_type == "Survey Responses":
|
| 519 |
if not spreadsheet_file:
|
| 520 |
-
yield None, None, "**Error:** Please upload a CSV/Excel file"
|
| 521 |
return
|
| 522 |
if not spreadsheet_column:
|
| 523 |
-
yield None, None, "**Error:** Please select a column"
|
| 524 |
return
|
| 525 |
|
| 526 |
file_path = spreadsheet_file if isinstance(spreadsheet_file, str) else spreadsheet_file.name
|
|
@@ -554,7 +746,7 @@ def run_extract_categories(input_type, spreadsheet_file, spreadsheet_column,
|
|
| 554 |
else:
|
| 555 |
pdf_input = pdf_file if isinstance(pdf_file, str) else pdf_file.name
|
| 556 |
else:
|
| 557 |
-
yield None, None, "**Error:** Please upload PDF file(s) or a folder"
|
| 558 |
return
|
| 559 |
|
| 560 |
mode_mapping = {
|
|
@@ -593,7 +785,7 @@ def run_extract_categories(input_type, spreadsheet_file, spreadsheet_column,
|
|
| 593 |
else:
|
| 594 |
image_input = image_file if isinstance(image_file, str) else image_file.name
|
| 595 |
else:
|
| 596 |
-
yield None, None, "**Error:** Please upload image file(s) or a folder"
|
| 597 |
return
|
| 598 |
|
| 599 |
# For images, use fewer divisions since each image can have multiple categories
|
|
@@ -614,7 +806,7 @@ def run_extract_categories(input_type, spreadsheet_file, spreadsheet_column,
|
|
| 614 |
)
|
| 615 |
|
| 616 |
else:
|
| 617 |
-
yield None, None, f"**Error:** Unknown input type: {input_type}"
|
| 618 |
return
|
| 619 |
|
| 620 |
processing_time = time.time() - start_time
|
|
@@ -635,14 +827,25 @@ def run_extract_categories(input_type, spreadsheet_file, spreadsheet_column,
|
|
| 635 |
categories_df.to_csv(f.name, index=False)
|
| 636 |
csv_path = f.name
|
| 637 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 638 |
yield (
|
| 639 |
gr.update(value=categories_df, visible=True),
|
| 640 |
csv_path,
|
|
|
|
| 641 |
f"Extracted {len(top_categories)} categories in {processing_time:.1f}s"
|
| 642 |
)
|
| 643 |
|
| 644 |
except Exception as e:
|
| 645 |
-
yield None, None, f"**Error:** {str(e)}"
|
| 646 |
|
| 647 |
|
| 648 |
def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
|
|
@@ -653,19 +856,19 @@ def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
|
|
| 653 |
progress=gr.Progress(track_tqdm=True)):
|
| 654 |
"""Classify data with user-provided categories."""
|
| 655 |
if not CATLLM_AVAILABLE:
|
| 656 |
-
yield None, None, None, None, "**Error:** catllm package not available"
|
| 657 |
return
|
| 658 |
|
| 659 |
all_cats = [cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10]
|
| 660 |
categories = [c.strip() for c in all_cats if c and c.strip()]
|
| 661 |
|
| 662 |
if not categories:
|
| 663 |
-
yield None, None, None, None, "**Error:** Please enter at least one category"
|
| 664 |
return
|
| 665 |
|
| 666 |
actual_api_key, provider = get_api_key(model, model_tier, api_key_input)
|
| 667 |
if not actual_api_key:
|
| 668 |
-
yield None, None, None, None, f"**Error:** {provider} API key not configured"
|
| 669 |
return
|
| 670 |
|
| 671 |
if model_source_input == "auto":
|
|
@@ -673,17 +876,48 @@ def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
|
|
| 673 |
else:
|
| 674 |
model_source = model_source_input
|
| 675 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 676 |
try:
|
| 677 |
-
yield None, None, None, None, "Classifying your data..."
|
| 678 |
|
| 679 |
start_time = time.time()
|
| 680 |
|
| 681 |
if input_type == "Survey Responses":
|
| 682 |
if not spreadsheet_file:
|
| 683 |
-
yield None, None, None, None, "**Error:** Please upload a CSV/Excel file"
|
| 684 |
return
|
| 685 |
if not spreadsheet_column:
|
| 686 |
-
yield None, None, None, None, "**Error:** Please select a column"
|
| 687 |
return
|
| 688 |
|
| 689 |
file_path = spreadsheet_file if isinstance(spreadsheet_file, str) else spreadsheet_file.name
|
|
@@ -723,7 +957,7 @@ def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
|
|
| 723 |
pdf_input = pdf_file if isinstance(pdf_file, str) else pdf_file.name
|
| 724 |
original_filename = pdf_input.split("/")[-1]
|
| 725 |
else:
|
| 726 |
-
yield None, None, None, None, "**Error:** Please upload PDF file(s) or a folder"
|
| 727 |
return
|
| 728 |
|
| 729 |
column_name = "PDF Pages"
|
|
@@ -763,7 +997,7 @@ def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
|
|
| 763 |
image_input = image_file if isinstance(image_file, str) else image_file.name
|
| 764 |
original_filename = image_input.split("/")[-1]
|
| 765 |
else:
|
| 766 |
-
yield None, None, None, None, "**Error:** Please upload image file(s) or a folder"
|
| 767 |
return
|
| 768 |
|
| 769 |
column_name = "Image Files"
|
|
@@ -779,7 +1013,7 @@ def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
|
|
| 779 |
)
|
| 780 |
|
| 781 |
else:
|
| 782 |
-
yield None, None, None, None, f"**Error:** Unknown input type: {input_type}"
|
| 783 |
return
|
| 784 |
|
| 785 |
processing_time = time.time() - start_time
|
|
@@ -856,16 +1090,27 @@ Provide your work in JSON format where the number belonging to each category is
|
|
| 856 |
|
| 857 |
plt.tight_layout()
|
| 858 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 859 |
yield (
|
| 860 |
gr.update(value=fig, visible=True),
|
| 861 |
gr.update(value=result, visible=True),
|
| 862 |
[csv_path, report_pdf_path],
|
|
|
|
| 863 |
None,
|
| 864 |
f"Classified {num_items} items in {processing_time:.1f}s"
|
| 865 |
)
|
| 866 |
|
| 867 |
except Exception as e:
|
| 868 |
-
yield None, None, None, None, f"**Error:** {str(e)}"
|
| 869 |
|
| 870 |
|
| 871 |
def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
|
|
@@ -876,12 +1121,12 @@ def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
|
|
| 876 |
progress=gr.Progress(track_tqdm=True)):
|
| 877 |
"""Extract categories then classify data with them."""
|
| 878 |
if not CATLLM_AVAILABLE:
|
| 879 |
-
yield None, None, None, None, None, None, "**Error:** catllm package not available"
|
| 880 |
return
|
| 881 |
|
| 882 |
actual_api_key, provider = get_api_key(model, model_tier, api_key_input)
|
| 883 |
if not actual_api_key:
|
| 884 |
-
yield None, None, None, None, None, None, f"**Error:** {provider} API key not configured"
|
| 885 |
return
|
| 886 |
|
| 887 |
if model_source_input == "auto":
|
|
@@ -889,18 +1134,49 @@ def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
|
|
| 889 |
else:
|
| 890 |
model_source = model_source_input
|
| 891 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 892 |
try:
|
| 893 |
# Phase 1: Extract categories
|
| 894 |
-
yield None, None, None, None, None, None, "Phase 1: Extracting categories..."
|
| 895 |
|
| 896 |
start_time = time.time()
|
| 897 |
|
| 898 |
if input_type == "Survey Responses":
|
| 899 |
if not spreadsheet_file:
|
| 900 |
-
yield None, None, None, None, None, None, "**Error:** Please upload a CSV/Excel file"
|
| 901 |
return
|
| 902 |
if not spreadsheet_column:
|
| 903 |
-
yield None, None, None, None, None, None, "**Error:** Please select a column"
|
| 904 |
return
|
| 905 |
|
| 906 |
file_path = spreadsheet_file if isinstance(spreadsheet_file, str) else spreadsheet_file.name
|
|
@@ -933,7 +1209,7 @@ def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
|
|
| 933 |
input_data = pdf_file if isinstance(pdf_file, str) else pdf_file.name
|
| 934 |
original_filename = input_data.split("/")[-1]
|
| 935 |
else:
|
| 936 |
-
yield None, None, None, None, None, None, "**Error:** Please upload PDF file(s) or a folder"
|
| 937 |
return
|
| 938 |
|
| 939 |
column_name = "PDF Pages"
|
|
@@ -964,7 +1240,7 @@ def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
|
|
| 964 |
input_data = image_file if isinstance(image_file, str) else image_file.name
|
| 965 |
original_filename = input_data.split("/")[-1]
|
| 966 |
else:
|
| 967 |
-
yield None, None, None, None, None, None, "**Error:** Please upload image file(s) or a folder"
|
| 968 |
return
|
| 969 |
|
| 970 |
column_name = "Image Files"
|
|
@@ -973,7 +1249,7 @@ def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
|
|
| 973 |
mode_param = None
|
| 974 |
|
| 975 |
else:
|
| 976 |
-
yield None, None, None, None, None, None, f"**Error:** Unknown input type: {input_type}"
|
| 977 |
return
|
| 978 |
|
| 979 |
# Calculate sensible divisions based on input size and type
|
|
@@ -1010,7 +1286,7 @@ def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
|
|
| 1010 |
categories_df = extract_result.get('counts_df', pd.DataFrame())
|
| 1011 |
|
| 1012 |
if not categories:
|
| 1013 |
-
yield None, None, None, None, None, None, "**Error:** No categories were extracted"
|
| 1014 |
return
|
| 1015 |
|
| 1016 |
extract_time = time.time() - start_time
|
|
@@ -1026,10 +1302,14 @@ def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
|
|
| 1026 |
categories_df.to_csv(f.name, index=False)
|
| 1027 |
extract_csv_path = f.name
|
| 1028 |
|
|
|
|
|
|
|
|
|
|
| 1029 |
yield (
|
| 1030 |
gr.update(value=categories_df, visible=True),
|
| 1031 |
extract_csv_path,
|
| 1032 |
-
|
|
|
|
| 1033 |
f"Extracted {len(categories)} categories in {extract_time:.1f}s. Now classifying..."
|
| 1034 |
)
|
| 1035 |
|
|
@@ -1125,18 +1405,23 @@ Provide your work in JSON format where the number belonging to each category is
|
|
| 1125 |
|
| 1126 |
plt.tight_layout()
|
| 1127 |
|
|
|
|
|
|
|
|
|
|
| 1128 |
yield (
|
| 1129 |
gr.update(value=categories_df, visible=True),
|
| 1130 |
extract_csv_path,
|
|
|
|
| 1131 |
gr.update(value=fig, visible=True),
|
| 1132 |
gr.update(value=result, visible=True),
|
| 1133 |
[classify_csv_path, report_pdf_path],
|
|
|
|
| 1134 |
None,
|
| 1135 |
f"Extracted {len(categories)} categories and classified {num_items} items in {total_time:.1f}s"
|
| 1136 |
)
|
| 1137 |
|
| 1138 |
except Exception as e:
|
| 1139 |
-
yield None, None, None, None, None, None, f"**Error:** {str(e)}"
|
| 1140 |
|
| 1141 |
|
| 1142 |
def add_category_field(current_count):
|
|
@@ -1190,10 +1475,12 @@ def reset_all():
|
|
| 1190 |
gr.update(visible=False), # extract_output_group
|
| 1191 |
gr.update(value=None, visible=False), # extracted_categories
|
| 1192 |
None, # extract_download
|
|
|
|
| 1193 |
gr.update(visible=False), # classify_output_group
|
| 1194 |
gr.update(value=None, visible=False), # distribution_plot
|
| 1195 |
gr.update(value=None, visible=False), # results
|
| 1196 |
None, # download_file
|
|
|
|
| 1197 |
])
|
| 1198 |
return updates
|
| 1199 |
|
|
@@ -1406,6 +1693,13 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
|
|
| 1406 |
wrap=True
|
| 1407 |
)
|
| 1408 |
extract_download = gr.File(label="Download Categories (CSV)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1409 |
|
| 1410 |
# Classify output group
|
| 1411 |
with gr.Group(visible=False) as classify_output_group:
|
|
@@ -1413,6 +1707,13 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
|
|
| 1413 |
distribution_plot = gr.Plot(label="Category Distribution (%)", visible=False)
|
| 1414 |
results = gr.DataFrame(label="Full Results", visible=False)
|
| 1415 |
download_file = gr.File(label="Download Results (CSV + Methodology Report)", file_count="multiple")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1416 |
|
| 1417 |
# Event handlers
|
| 1418 |
def switch_input_type(input_type_val):
|
|
@@ -1534,6 +1835,7 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
|
|
| 1534 |
progress=gr.Progress(track_tqdm=True)):
|
| 1535 |
"""Dispatch to appropriate function based on task mode."""
|
| 1536 |
if task == "extract":
|
|
|
|
| 1537 |
for update in run_extract_categories(
|
| 1538 |
input_type, spreadsheet_file, spreadsheet_column,
|
| 1539 |
pdf_file, pdf_folder_val, pdf_description, pdf_mode,
|
|
@@ -1546,12 +1848,15 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
|
|
| 1546 |
yield (
|
| 1547 |
update[0], # extracted_categories
|
| 1548 |
update[1], # extract_download
|
|
|
|
| 1549 |
None, # distribution_plot
|
| 1550 |
None, # results
|
| 1551 |
None, # download_file
|
| 1552 |
-
|
|
|
|
| 1553 |
)
|
| 1554 |
elif task == "assign":
|
|
|
|
| 1555 |
for update in run_classify_data(
|
| 1556 |
input_type, spreadsheet_file, spreadsheet_column,
|
| 1557 |
pdf_file, pdf_folder_val, pdf_description, pdf_mode,
|
|
@@ -1564,12 +1869,15 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
|
|
| 1564 |
yield (
|
| 1565 |
None, # extracted_categories
|
| 1566 |
None, # extract_download
|
|
|
|
| 1567 |
update[0], # distribution_plot
|
| 1568 |
update[1], # results
|
| 1569 |
update[2], # download_file
|
| 1570 |
-
update[
|
|
|
|
| 1571 |
)
|
| 1572 |
elif task == "extract_and_assign":
|
|
|
|
| 1573 |
for update in run_extract_and_assign(
|
| 1574 |
input_type, spreadsheet_file, spreadsheet_column,
|
| 1575 |
pdf_file, pdf_folder_val, pdf_description, pdf_mode,
|
|
@@ -1581,13 +1889,15 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
|
|
| 1581 |
yield (
|
| 1582 |
update[0], # extracted_categories
|
| 1583 |
update[1], # extract_download
|
| 1584 |
-
update[2], #
|
| 1585 |
-
update[3], #
|
| 1586 |
-
update[4], #
|
| 1587 |
-
update[
|
|
|
|
|
|
|
| 1588 |
)
|
| 1589 |
else:
|
| 1590 |
-
yield (None, None, None, None, None, "Please select a task first.")
|
| 1591 |
|
| 1592 |
run_btn.click(
|
| 1593 |
fn=dispatch_run,
|
|
@@ -1595,7 +1905,7 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
|
|
| 1595 |
pdf_file, pdf_folder, pdf_description, pdf_mode,
|
| 1596 |
image_file, image_folder, image_description,
|
| 1597 |
max_categories] + category_inputs + [model_tier, model, model_source, api_key],
|
| 1598 |
-
outputs=[extracted_categories, extract_download, distribution_plot, results, download_file, status]
|
| 1599 |
)
|
| 1600 |
|
| 1601 |
reset_btn.click(
|
|
@@ -1612,8 +1922,8 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
|
|
| 1612 |
categories_group, extract_settings_group, max_categories, model_group, run_btn,
|
| 1613 |
model_tier, model, model_source, api_key, api_key, api_key_status,
|
| 1614 |
status,
|
| 1615 |
-
extract_output_group, extracted_categories, extract_download,
|
| 1616 |
-
classify_output_group, distribution_plot, results, download_file
|
| 1617 |
]
|
| 1618 |
)
|
| 1619 |
|
|
|
|
| 21 |
|
| 22 |
MAX_CATEGORIES = 10
|
| 23 |
INITIAL_CATEGORIES = 3
|
| 24 |
+
MAX_FILE_SIZE_MB = 100 # Warn users if total file size exceeds this
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def calculate_total_file_size(files):
    """Calculate total size of uploaded files in MB.

    Args:
        files: ``None``, a single upload, or a list/tuple of uploads. Each
            upload is either a path string or an object exposing its path
            through a ``.name`` attribute. A path that points at a directory
            is measured recursively.

    Returns:
        The combined size in megabytes (bytes / 1024**2), or ``0`` when
        ``files`` is ``None``. Entries that cannot be measured (missing
        file, object without ``.name``) are skipped rather than raising,
        because this value only drives an advisory warning.
    """
    if files is None:
        return 0
    # Accept tuples as well as lists; previously a tuple was wrapped as a
    # single "file", failed on ``.name`` and was silently counted as 0 MB.
    if not isinstance(files, (list, tuple)):
        files = [files]

    total_bytes = 0
    for f in files:
        try:
            file_path = f if isinstance(f, str) else f.name
            if os.path.isdir(file_path):
                # getsize() on a directory reports only the directory entry,
                # not its contents, so sum the files underneath instead.
                for root, _dirs, names in os.walk(file_path):
                    for name in names:
                        try:
                            total_bytes += os.path.getsize(os.path.join(root, name))
                        except OSError:
                            continue  # unreadable/vanished entry: best effort
            else:
                total_bytes += os.path.getsize(file_path)
        except (OSError, AttributeError):
            continue  # best effort: skip entries we cannot stat
    return total_bytes / (1024 * 1024)  # Convert to MB
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def generate_extract_code(input_type, description, model, model_source, max_categories, mode=None):
    """Generate Python code for category extraction.

    Builds a copy-pasteable ``catllm.extract(...)`` snippet that mirrors the
    given settings.

    Args:
        input_type: ``"text"`` or ``"pdf"``; any other value produces the
            image template.
        description: For ``"text"`` this is used both as the DataFrame
            column name and as the ``description`` argument; otherwise it is
            only the ``description`` argument.
        model: Value emitted for ``user_model``.
        model_source: Value emitted for ``model_source``.
        max_categories: Value emitted verbatim for ``max_categories``.
        mode: Optional PDF processing mode; only used by the PDF template
            and omitted from the snippet when falsy.

    Returns:
        The generated source code as a string.
    """
    import json  # local import so the block does not rely on a file-level import

    def quote(value):
        # Emit a properly escaped double-quoted literal. Interpolating the
        # raw value between quotes broke the snippet (or injected code) when
        # it contained '"', a backslash or a newline. JSON string escapes
        # are a subset of Python's, so the result is valid Python source.
        return json.dumps(str(value), ensure_ascii=False)

    description_lit = quote(description)
    model_lit = quote(model)
    source_lit = quote(model_source)

    if input_type == "text":
        return f'''import catllm
import pandas as pd

# Load your data
df = pd.read_csv("your_data.csv")

# Extract categories from the text column
result = catllm.extract(
    input_data=df[{description_lit}].tolist(),
    api_key="YOUR_API_KEY",
    input_type="text",
    description={description_lit},
    user_model={model_lit},
    model_source={source_lit},
    max_categories={max_categories}
)

# View extracted categories
print(result["top_categories"])
print(result["counts_df"])
'''
    elif input_type == "pdf":
        # The mode argument is appended to the description line only when set.
        mode_line = f',\n    mode={quote(mode)}' if mode else ''
        return f'''import catllm

# Extract categories from PDF documents
result = catllm.extract(
    input_data="path/to/your/pdfs/",  # or list of PDF paths
    api_key="YOUR_API_KEY",
    input_type="pdf",
    description={description_lit}{mode_line},
    user_model={model_lit},
    model_source={source_lit},
    max_categories={max_categories}
)

# View extracted categories
print(result["top_categories"])
print(result["counts_df"])
'''
    else:  # image
        return f'''import catllm

# Extract categories from images
result = catllm.extract(
    input_data="path/to/your/images/",  # or list of image paths
    api_key="YOUR_API_KEY",
    input_type="image",
    description={description_lit},
    user_model={model_lit},
    model_source={source_lit},
    max_categories={max_categories}
)

# View extracted categories
print(result["top_categories"])
print(result["counts_df"])
'''
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def generate_classify_code(input_type, description, categories, model, model_source, mode=None):
    """Generate Python code for classification.

    Builds a copy-pasteable ``catllm.classify(...)`` snippet that mirrors
    the given settings.

    Args:
        input_type: ``"text"`` or ``"pdf"``; any other value produces the
            image template.
        description: For ``"text"`` this is used both as the DataFrame
            column name and as the ``description`` argument; otherwise it is
            only the ``description`` argument.
        categories: Iterable of category names, emitted one per line as the
            ``categories`` list in the snippet.
        model: Value emitted for ``user_model``.
        model_source: Value emitted for ``model_source``.
        mode: Optional PDF processing mode; only used by the PDF template
            and omitted from the snippet when falsy.

    Returns:
        The generated source code as a string.
    """
    import json  # local import so the block does not rely on a file-level import

    def quote(value):
        # Emit a properly escaped double-quoted literal. Category names and
        # descriptions are free text; pasting them raw between quotes broke
        # the snippet (or injected code) when they contained '"', a
        # backslash or a newline. JSON string escapes are a subset of
        # Python's, so the result is valid Python source.
        return json.dumps(str(value), ensure_ascii=False)

    categories_str = ",\n    ".join(quote(cat) for cat in categories)
    description_lit = quote(description)
    model_lit = quote(model)
    source_lit = quote(model_source)

    if input_type == "text":
        return f'''import catllm
import pandas as pd

# Load your data
df = pd.read_csv("your_data.csv")

# Define categories
categories = [
    {categories_str}
]

# Classify the text data
result = catllm.classify(
    input_data=df[{description_lit}].tolist(),
    categories=categories,
    api_key="YOUR_API_KEY",
    input_type="text",
    description={description_lit},
    user_model={model_lit},
    model_source={source_lit}
)

# View results
print(result)
result.to_csv("classified_results.csv", index=False)
'''
    elif input_type == "pdf":
        # The mode argument is appended to the description line only when set.
        mode_line = f',\n    mode={quote(mode)}' if mode else ''
        return f'''import catllm

# Define categories
categories = [
    {categories_str}
]

# Classify PDF documents
result = catllm.classify(
    input_data="path/to/your/pdfs/",  # or list of PDF paths
    categories=categories,
    api_key="YOUR_API_KEY",
    input_type="pdf",
    description={description_lit}{mode_line},
    user_model={model_lit},
    model_source={source_lit}
)

# View results
print(result)
result.to_csv("classified_results.csv", index=False)
'''
    else:  # image
        return f'''import catllm

# Define categories
categories = [
    {categories_str}
]

# Classify images
result = catllm.classify(
    input_data="path/to/your/images/",  # or list of image paths
    categories=categories,
    api_key="YOUR_API_KEY",
    input_type="image",
    description={description_lit},
    user_model={model_lit},
    model_source={source_lit}
)

# View results
print(result)
result.to_csv("classified_results.csv", index=False)
'''
|
| 185 |
|
| 186 |
# Free models (uses Space secrets - no user API key needed)
|
| 187 |
FREE_MODEL_CHOICES = [
|
|
|
|
| 658 |
progress=gr.Progress(track_tqdm=True)):
|
| 659 |
"""Extract categories from data and display them in a table."""
|
| 660 |
if not CATLLM_AVAILABLE:
|
| 661 |
+
yield None, None, None, "**Error:** catllm package not available"
|
| 662 |
return
|
| 663 |
|
| 664 |
actual_api_key, provider = get_api_key(model, model_tier, api_key_input)
|
| 665 |
if not actual_api_key:
|
| 666 |
+
yield None, None, None, f"**Error:** {provider} API key not configured"
|
| 667 |
return
|
| 668 |
|
| 669 |
if model_source_input == "auto":
|
|
|
|
| 671 |
else:
|
| 672 |
model_source = model_source_input
|
| 673 |
|
| 674 |
+
# Check file size for images and PDFs
|
| 675 |
+
files_to_check = None
|
| 676 |
+
if input_type == "Images":
|
| 677 |
+
files_to_check = image_folder if image_folder else image_file
|
| 678 |
+
elif input_type == "PDF Documents":
|
| 679 |
+
files_to_check = pdf_folder if pdf_folder else pdf_file
|
| 680 |
+
|
| 681 |
+
if files_to_check:
|
| 682 |
+
total_size_mb = calculate_total_file_size(files_to_check)
|
| 683 |
+
if total_size_mb > MAX_FILE_SIZE_MB:
|
| 684 |
+
# Generate the code for the user
|
| 685 |
+
if input_type == "Images":
|
| 686 |
+
code = generate_extract_code("image", image_description or "images", model, model_source, int(max_categories_val))
|
| 687 |
+
else:
|
| 688 |
+
mode_mapping = {"Image (visual documents)": "image", "Text (text-heavy)": "text", "Both (comprehensive)": "both"}
|
| 689 |
+
actual_mode = mode_mapping.get(pdf_mode, "image")
|
| 690 |
+
code = generate_extract_code("pdf", pdf_description or "document", model, model_source, int(max_categories_val), actual_mode)
|
| 691 |
+
|
| 692 |
+
warning_msg = f"""**⚠️ Large Upload Detected ({total_size_mb:.1f} MB)**
|
| 693 |
+
|
| 694 |
+
Uploads over {MAX_FILE_SIZE_MB} MB may experience performance issues or timeouts on this web app.
|
| 695 |
+
|
| 696 |
+
**Recommended:** Run the code locally using the Python package instead. See the code below, or click "See the Code" after this message.
|
| 697 |
+
|
| 698 |
+
```
|
| 699 |
+
pip install cat-llm
|
| 700 |
+
```
|
| 701 |
+
"""
|
| 702 |
+
yield None, None, code, warning_msg
|
| 703 |
+
return
|
| 704 |
+
|
| 705 |
try:
|
| 706 |
+
yield None, None, None, "Extracting categories from your data..."
|
| 707 |
|
| 708 |
start_time = time.time()
|
| 709 |
|
| 710 |
if input_type == "Survey Responses":
|
| 711 |
if not spreadsheet_file:
|
| 712 |
+
yield None, None, None, "**Error:** Please upload a CSV/Excel file"
|
| 713 |
return
|
| 714 |
if not spreadsheet_column:
|
| 715 |
+
yield None, None, None, "**Error:** Please select a column"
|
| 716 |
return
|
| 717 |
|
| 718 |
file_path = spreadsheet_file if isinstance(spreadsheet_file, str) else spreadsheet_file.name
|
|
|
|
| 746 |
else:
|
| 747 |
pdf_input = pdf_file if isinstance(pdf_file, str) else pdf_file.name
|
| 748 |
else:
|
| 749 |
+
yield None, None, None, "**Error:** Please upload PDF file(s) or a folder"
|
| 750 |
return
|
| 751 |
|
| 752 |
mode_mapping = {
|
|
|
|
| 785 |
else:
|
| 786 |
image_input = image_file if isinstance(image_file, str) else image_file.name
|
| 787 |
else:
|
| 788 |
+
yield None, None, None, "**Error:** Please upload image file(s) or a folder"
|
| 789 |
return
|
| 790 |
|
| 791 |
# For images, use fewer divisions since each image can have multiple categories
|
|
|
|
| 806 |
)
|
| 807 |
|
| 808 |
else:
|
| 809 |
+
yield None, None, None, f"**Error:** Unknown input type: {input_type}"
|
| 810 |
return
|
| 811 |
|
| 812 |
processing_time = time.time() - start_time
|
|
|
|
| 827 |
categories_df.to_csv(f.name, index=False)
|
| 828 |
csv_path = f.name
|
| 829 |
|
| 830 |
+
# Generate reproducibility code
|
| 831 |
+
if input_type == "Survey Responses":
|
| 832 |
+
code = generate_extract_code("text", spreadsheet_column, model, model_source, int(max_categories_val))
|
| 833 |
+
elif input_type == "PDF Documents":
|
| 834 |
+
mode_mapping = {"Image (visual documents)": "image", "Text (text-heavy)": "text", "Both (comprehensive)": "both"}
|
| 835 |
+
actual_mode = mode_mapping.get(pdf_mode, "image")
|
| 836 |
+
code = generate_extract_code("pdf", pdf_description or "document", model, model_source, int(max_categories_val), actual_mode)
|
| 837 |
+
else: # Images
|
| 838 |
+
code = generate_extract_code("image", image_description or "images", model, model_source, int(max_categories_val))
|
| 839 |
+
|
| 840 |
yield (
|
| 841 |
gr.update(value=categories_df, visible=True),
|
| 842 |
csv_path,
|
| 843 |
+
code,
|
| 844 |
f"Extracted {len(top_categories)} categories in {processing_time:.1f}s"
|
| 845 |
)
|
| 846 |
|
| 847 |
except Exception as e:
|
| 848 |
+
yield None, None, None, f"**Error:** {str(e)}"
|
| 849 |
|
| 850 |
|
| 851 |
def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
|
|
|
|
| 856 |
progress=gr.Progress(track_tqdm=True)):
|
| 857 |
"""Classify data with user-provided categories."""
|
| 858 |
if not CATLLM_AVAILABLE:
|
| 859 |
+
yield None, None, None, None, None, "**Error:** catllm package not available"
|
| 860 |
return
|
| 861 |
|
| 862 |
all_cats = [cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10]
|
| 863 |
categories = [c.strip() for c in all_cats if c and c.strip()]
|
| 864 |
|
| 865 |
if not categories:
|
| 866 |
+
yield None, None, None, None, None, "**Error:** Please enter at least one category"
|
| 867 |
return
|
| 868 |
|
| 869 |
actual_api_key, provider = get_api_key(model, model_tier, api_key_input)
|
| 870 |
if not actual_api_key:
|
| 871 |
+
yield None, None, None, None, None, f"**Error:** {provider} API key not configured"
|
| 872 |
return
|
| 873 |
|
| 874 |
if model_source_input == "auto":
|
|
|
|
| 876 |
else:
|
| 877 |
model_source = model_source_input
|
| 878 |
|
| 879 |
+
# Check file size for images and PDFs
|
| 880 |
+
files_to_check = None
|
| 881 |
+
if input_type == "Images":
|
| 882 |
+
files_to_check = image_folder if image_folder else image_file
|
| 883 |
+
elif input_type == "PDF Documents":
|
| 884 |
+
files_to_check = pdf_folder if pdf_folder else pdf_file
|
| 885 |
+
|
| 886 |
+
if files_to_check:
|
| 887 |
+
total_size_mb = calculate_total_file_size(files_to_check)
|
| 888 |
+
if total_size_mb > MAX_FILE_SIZE_MB:
|
| 889 |
+
# Generate the code for the user
|
| 890 |
+
if input_type == "Images":
|
| 891 |
+
code = generate_classify_code("image", image_description or "images", categories, model, model_source)
|
| 892 |
+
else:
|
| 893 |
+
mode_mapping = {"Image (visual documents)": "image", "Text (text-heavy)": "text", "Both (comprehensive)": "both"}
|
| 894 |
+
actual_mode = mode_mapping.get(pdf_mode, "image")
|
| 895 |
+
code = generate_classify_code("pdf", pdf_description or "document", categories, model, model_source, actual_mode)
|
| 896 |
+
|
| 897 |
+
warning_msg = f"""**⚠️ Large Upload Detected ({total_size_mb:.1f} MB)**
|
| 898 |
+
|
| 899 |
+
Uploads over {MAX_FILE_SIZE_MB} MB may experience performance issues or timeouts on this web app.
|
| 900 |
+
|
| 901 |
+
**Recommended:** Run the code locally using the Python package instead. See the code below, or click "See the Code" after this message.
|
| 902 |
+
|
| 903 |
+
```
|
| 904 |
+
pip install cat-llm
|
| 905 |
+
```
|
| 906 |
+
"""
|
| 907 |
+
yield None, None, None, code, None, warning_msg
|
| 908 |
+
return
|
| 909 |
+
|
| 910 |
try:
|
| 911 |
+
yield None, None, None, None, None, "Classifying your data..."
|
| 912 |
|
| 913 |
start_time = time.time()
|
| 914 |
|
| 915 |
if input_type == "Survey Responses":
|
| 916 |
if not spreadsheet_file:
|
| 917 |
+
yield None, None, None, None, None, "**Error:** Please upload a CSV/Excel file"
|
| 918 |
return
|
| 919 |
if not spreadsheet_column:
|
| 920 |
+
yield None, None, None, None, None, "**Error:** Please select a column"
|
| 921 |
return
|
| 922 |
|
| 923 |
file_path = spreadsheet_file if isinstance(spreadsheet_file, str) else spreadsheet_file.name
|
|
|
|
| 957 |
pdf_input = pdf_file if isinstance(pdf_file, str) else pdf_file.name
|
| 958 |
original_filename = pdf_input.split("/")[-1]
|
| 959 |
else:
|
| 960 |
+
yield None, None, None, None, None, "**Error:** Please upload PDF file(s) or a folder"
|
| 961 |
return
|
| 962 |
|
| 963 |
column_name = "PDF Pages"
|
|
|
|
| 997 |
image_input = image_file if isinstance(image_file, str) else image_file.name
|
| 998 |
original_filename = image_input.split("/")[-1]
|
| 999 |
else:
|
| 1000 |
+
yield None, None, None, None, None, "**Error:** Please upload image file(s) or a folder"
|
| 1001 |
return
|
| 1002 |
|
| 1003 |
column_name = "Image Files"
|
|
|
|
| 1013 |
)
|
| 1014 |
|
| 1015 |
else:
|
| 1016 |
+
yield None, None, None, None, None, f"**Error:** Unknown input type: {input_type}"
|
| 1017 |
return
|
| 1018 |
|
| 1019 |
processing_time = time.time() - start_time
|
|
|
|
| 1090 |
|
| 1091 |
plt.tight_layout()
|
| 1092 |
|
| 1093 |
+
# Generate reproducibility code
|
| 1094 |
+
if input_type == "Survey Responses":
|
| 1095 |
+
code = generate_classify_code("text", spreadsheet_column, categories, model, model_source)
|
| 1096 |
+
elif input_type == "PDF Documents":
|
| 1097 |
+
mode_mapping = {"Image (visual documents)": "image", "Text (text-heavy)": "text", "Both (comprehensive)": "both"}
|
| 1098 |
+
actual_mode = mode_mapping.get(pdf_mode, "image")
|
| 1099 |
+
code = generate_classify_code("pdf", pdf_description or "document", categories, model, model_source, actual_mode)
|
| 1100 |
+
else: # Images
|
| 1101 |
+
code = generate_classify_code("image", image_description or "images", categories, model, model_source)
|
| 1102 |
+
|
| 1103 |
yield (
|
| 1104 |
gr.update(value=fig, visible=True),
|
| 1105 |
gr.update(value=result, visible=True),
|
| 1106 |
[csv_path, report_pdf_path],
|
| 1107 |
+
code,
|
| 1108 |
None,
|
| 1109 |
f"Classified {num_items} items in {processing_time:.1f}s"
|
| 1110 |
)
|
| 1111 |
|
| 1112 |
except Exception as e:
|
| 1113 |
+
yield None, None, None, None, None, f"**Error:** {str(e)}"
|
| 1114 |
|
| 1115 |
|
| 1116 |
def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
|
|
|
|
| 1121 |
progress=gr.Progress(track_tqdm=True)):
|
| 1122 |
"""Extract categories then classify data with them."""
|
| 1123 |
if not CATLLM_AVAILABLE:
|
| 1124 |
+
yield None, None, None, None, None, None, None, None, "**Error:** catllm package not available"
|
| 1125 |
return
|
| 1126 |
|
| 1127 |
actual_api_key, provider = get_api_key(model, model_tier, api_key_input)
|
| 1128 |
if not actual_api_key:
|
| 1129 |
+
yield None, None, None, None, None, None, None, None, f"**Error:** {provider} API key not configured"
|
| 1130 |
return
|
| 1131 |
|
| 1132 |
if model_source_input == "auto":
|
|
|
|
| 1134 |
else:
|
| 1135 |
model_source = model_source_input
|
| 1136 |
|
| 1137 |
+
# Check file size for images and PDFs
|
| 1138 |
+
files_to_check = None
|
| 1139 |
+
if input_type == "Images":
|
| 1140 |
+
files_to_check = image_folder if image_folder else image_file
|
| 1141 |
+
elif input_type == "PDF Documents":
|
| 1142 |
+
files_to_check = pdf_folder if pdf_folder else pdf_file
|
| 1143 |
+
|
| 1144 |
+
if files_to_check:
|
| 1145 |
+
total_size_mb = calculate_total_file_size(files_to_check)
|
| 1146 |
+
if total_size_mb > MAX_FILE_SIZE_MB:
|
| 1147 |
+
# Generate the code for the user
|
| 1148 |
+
if input_type == "Images":
|
| 1149 |
+
extract_code = generate_extract_code("image", image_description or "images", model, model_source, int(max_categories_val))
|
| 1150 |
+
else:
|
| 1151 |
+
mode_mapping = {"Image (visual documents)": "image", "Text (text-heavy)": "text", "Both (comprehensive)": "both"}
|
| 1152 |
+
actual_mode = mode_mapping.get(pdf_mode, "image")
|
| 1153 |
+
extract_code = generate_extract_code("pdf", pdf_description or "document", model, model_source, int(max_categories_val), actual_mode)
|
| 1154 |
+
|
| 1155 |
+
warning_msg = f"""**⚠️ Large Upload Detected ({total_size_mb:.1f} MB)**
|
| 1156 |
+
|
| 1157 |
+
Uploads over {MAX_FILE_SIZE_MB} MB may experience performance issues or timeouts on this web app.
|
| 1158 |
+
|
| 1159 |
+
**Recommended:** Run the code locally using the Python package instead. See the code below, or click "See the Code" after this message.
|
| 1160 |
+
|
| 1161 |
+
```
|
| 1162 |
+
pip install cat-llm
|
| 1163 |
+
```
|
| 1164 |
+
"""
|
| 1165 |
+
yield None, None, extract_code, None, None, None, None, None, warning_msg
|
| 1166 |
+
return
|
| 1167 |
+
|
| 1168 |
try:
|
| 1169 |
# Phase 1: Extract categories
|
| 1170 |
+
yield None, None, None, None, None, None, None, None, "Phase 1: Extracting categories..."
|
| 1171 |
|
| 1172 |
start_time = time.time()
|
| 1173 |
|
| 1174 |
if input_type == "Survey Responses":
|
| 1175 |
if not spreadsheet_file:
|
| 1176 |
+
yield None, None, None, None, None, None, None, None, "**Error:** Please upload a CSV/Excel file"
|
| 1177 |
return
|
| 1178 |
if not spreadsheet_column:
|
| 1179 |
+
yield None, None, None, None, None, None, None, None, "**Error:** Please select a column"
|
| 1180 |
return
|
| 1181 |
|
| 1182 |
file_path = spreadsheet_file if isinstance(spreadsheet_file, str) else spreadsheet_file.name
|
|
|
|
| 1209 |
input_data = pdf_file if isinstance(pdf_file, str) else pdf_file.name
|
| 1210 |
original_filename = input_data.split("/")[-1]
|
| 1211 |
else:
|
| 1212 |
+
yield None, None, None, None, None, None, None, None, "**Error:** Please upload PDF file(s) or a folder"
|
| 1213 |
return
|
| 1214 |
|
| 1215 |
column_name = "PDF Pages"
|
|
|
|
| 1240 |
input_data = image_file if isinstance(image_file, str) else image_file.name
|
| 1241 |
original_filename = input_data.split("/")[-1]
|
| 1242 |
else:
|
| 1243 |
+
yield None, None, None, None, None, None, None, None, "**Error:** Please upload image file(s) or a folder"
|
| 1244 |
return
|
| 1245 |
|
| 1246 |
column_name = "Image Files"
|
|
|
|
| 1249 |
mode_param = None
|
| 1250 |
|
| 1251 |
else:
|
| 1252 |
+
yield None, None, None, None, None, None, None, None, f"**Error:** Unknown input type: {input_type}"
|
| 1253 |
return
|
| 1254 |
|
| 1255 |
# Calculate sensible divisions based on input size and type
|
|
|
|
| 1286 |
categories_df = extract_result.get('counts_df', pd.DataFrame())
|
| 1287 |
|
| 1288 |
if not categories:
|
| 1289 |
+
yield None, None, None, None, None, None, None, None, "**Error:** No categories were extracted"
|
| 1290 |
return
|
| 1291 |
|
| 1292 |
extract_time = time.time() - start_time
|
|
|
|
| 1302 |
categories_df.to_csv(f.name, index=False)
|
| 1303 |
extract_csv_path = f.name
|
| 1304 |
|
| 1305 |
+
# Generate extract code
|
| 1306 |
+
extract_code = generate_extract_code(input_type_param, description, model, model_source, int(max_categories_val), mode_param)
|
| 1307 |
+
|
| 1308 |
yield (
|
| 1309 |
gr.update(value=categories_df, visible=True),
|
| 1310 |
extract_csv_path,
|
| 1311 |
+
extract_code,
|
| 1312 |
+
None, None, None, None, None,
|
| 1313 |
f"Extracted {len(categories)} categories in {extract_time:.1f}s. Now classifying..."
|
| 1314 |
)
|
| 1315 |
|
|
|
|
| 1405 |
|
| 1406 |
plt.tight_layout()
|
| 1407 |
|
| 1408 |
+
# Generate classify code
|
| 1409 |
+
classify_code = generate_classify_code(input_type_param, description, categories, model, model_source, mode_param)
|
| 1410 |
+
|
| 1411 |
yield (
|
| 1412 |
gr.update(value=categories_df, visible=True),
|
| 1413 |
extract_csv_path,
|
| 1414 |
+
extract_code,
|
| 1415 |
gr.update(value=fig, visible=True),
|
| 1416 |
gr.update(value=result, visible=True),
|
| 1417 |
[classify_csv_path, report_pdf_path],
|
| 1418 |
+
classify_code,
|
| 1419 |
None,
|
| 1420 |
f"Extracted {len(categories)} categories and classified {num_items} items in {total_time:.1f}s"
|
| 1421 |
)
|
| 1422 |
|
| 1423 |
except Exception as e:
|
| 1424 |
+
yield None, None, None, None, None, None, None, None, f"**Error:** {str(e)}"
|
| 1425 |
|
| 1426 |
|
| 1427 |
def add_category_field(current_count):
|
|
|
|
| 1475 |
gr.update(visible=False), # extract_output_group
|
| 1476 |
gr.update(value=None, visible=False), # extracted_categories
|
| 1477 |
None, # extract_download
|
| 1478 |
+
"# Code will be generated after extraction", # extract_code_display
|
| 1479 |
gr.update(visible=False), # classify_output_group
|
| 1480 |
gr.update(value=None, visible=False), # distribution_plot
|
| 1481 |
gr.update(value=None, visible=False), # results
|
| 1482 |
None, # download_file
|
| 1483 |
+
"# Code will be generated after classification", # classify_code_display
|
| 1484 |
])
|
| 1485 |
return updates
|
| 1486 |
|
|
|
|
| 1693 |
wrap=True
|
| 1694 |
)
|
| 1695 |
extract_download = gr.File(label="Download Categories (CSV)")
|
| 1696 |
+
with gr.Accordion("See the Code", open=False):
|
| 1697 |
+
extract_code_display = gr.Code(
|
| 1698 |
+
label="Python Code",
|
| 1699 |
+
language="python",
|
| 1700 |
+
value="# Code will be generated after extraction",
|
| 1701 |
+
interactive=False
|
| 1702 |
+
)
|
| 1703 |
|
| 1704 |
# Classify output group
|
| 1705 |
with gr.Group(visible=False) as classify_output_group:
|
|
|
|
| 1707 |
distribution_plot = gr.Plot(label="Category Distribution (%)", visible=False)
|
| 1708 |
results = gr.DataFrame(label="Full Results", visible=False)
|
| 1709 |
download_file = gr.File(label="Download Results (CSV + Methodology Report)", file_count="multiple")
|
| 1710 |
+
with gr.Accordion("See the Code", open=False):
|
| 1711 |
+
classify_code_display = gr.Code(
|
| 1712 |
+
label="Python Code",
|
| 1713 |
+
language="python",
|
| 1714 |
+
value="# Code will be generated after classification",
|
| 1715 |
+
interactive=False
|
| 1716 |
+
)
|
| 1717 |
|
| 1718 |
# Event handlers
|
| 1719 |
def switch_input_type(input_type_val):
|
|
|
|
| 1835 |
progress=gr.Progress(track_tqdm=True)):
|
| 1836 |
"""Dispatch to appropriate function based on task mode."""
|
| 1837 |
if task == "extract":
|
| 1838 |
+
# run_extract_categories yields: (categories_df, csv_path, code, status)
|
| 1839 |
for update in run_extract_categories(
|
| 1840 |
input_type, spreadsheet_file, spreadsheet_column,
|
| 1841 |
pdf_file, pdf_folder_val, pdf_description, pdf_mode,
|
|
|
|
| 1848 |
yield (
|
| 1849 |
update[0], # extracted_categories
|
| 1850 |
update[1], # extract_download
|
| 1851 |
+
update[2], # extract_code_display
|
| 1852 |
None, # distribution_plot
|
| 1853 |
None, # results
|
| 1854 |
None, # download_file
|
| 1855 |
+
None, # classify_code_display
|
| 1856 |
+
update[3] # status
|
| 1857 |
)
|
| 1858 |
elif task == "assign":
|
| 1859 |
+
# run_classify_data yields: (plot, df, files, code, unused, status)
|
| 1860 |
for update in run_classify_data(
|
| 1861 |
input_type, spreadsheet_file, spreadsheet_column,
|
| 1862 |
pdf_file, pdf_folder_val, pdf_description, pdf_mode,
|
|
|
|
| 1869 |
yield (
|
| 1870 |
None, # extracted_categories
|
| 1871 |
None, # extract_download
|
| 1872 |
+
None, # extract_code_display
|
| 1873 |
update[0], # distribution_plot
|
| 1874 |
update[1], # results
|
| 1875 |
update[2], # download_file
|
| 1876 |
+
update[3], # classify_code_display
|
| 1877 |
+
update[5] # status
|
| 1878 |
)
|
| 1879 |
elif task == "extract_and_assign":
|
| 1880 |
+
# run_extract_and_assign yields: (categories_df, extract_csv, extract_code, plot, df, files, classify_code, unused, status)
|
| 1881 |
for update in run_extract_and_assign(
|
| 1882 |
input_type, spreadsheet_file, spreadsheet_column,
|
| 1883 |
pdf_file, pdf_folder_val, pdf_description, pdf_mode,
|
|
|
|
| 1889 |
yield (
|
| 1890 |
update[0], # extracted_categories
|
| 1891 |
update[1], # extract_download
|
| 1892 |
+
update[2], # extract_code_display
|
| 1893 |
+
update[3], # distribution_plot
|
| 1894 |
+
update[4], # results
|
| 1895 |
+
update[5], # download_file
|
| 1896 |
+
update[6], # classify_code_display
|
| 1897 |
+
update[8] # status
|
| 1898 |
)
|
| 1899 |
else:
|
| 1900 |
+
yield (None, None, None, None, None, None, None, "Please select a task first.")
|
| 1901 |
|
| 1902 |
run_btn.click(
|
| 1903 |
fn=dispatch_run,
|
|
|
|
| 1905 |
pdf_file, pdf_folder, pdf_description, pdf_mode,
|
| 1906 |
image_file, image_folder, image_description,
|
| 1907 |
max_categories] + category_inputs + [model_tier, model, model_source, api_key],
|
| 1908 |
+
outputs=[extracted_categories, extract_download, extract_code_display, distribution_plot, results, download_file, classify_code_display, status]
|
| 1909 |
)
|
| 1910 |
|
| 1911 |
reset_btn.click(
|
|
|
|
| 1922 |
categories_group, extract_settings_group, max_categories, model_group, run_btn,
|
| 1923 |
model_tier, model, model_source, api_key, api_key, api_key_status,
|
| 1924 |
status,
|
| 1925 |
+
extract_output_group, extracted_categories, extract_download, extract_code_display,
|
| 1926 |
+
classify_output_group, distribution_plot, results, download_file, classify_code_display
|
| 1927 |
]
|
| 1928 |
)
|
| 1929 |
|