Spaces:

CatLLM
/

survey-classifier

Running

chrissoria Claude committed 13 days ago

Commit

ae1b24b

1 Parent(s): 7a755b8

Replace path textbox with folder upload option

- Add toggle between "Upload File(s)" and "Upload Folder" for PDFs and images
- Use Gradio's file_count="directory" for folder uploads
- Remove text-based path input (doesn't work for hosted apps)
- Updated all processing functions to handle folder uploads

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (1) hide show

app.py +116 -56

app.py CHANGED Viewed

@@ -486,8 +486,8 @@ def update_task_visibility(task):
 def run_extract_categories(input_type, spreadsheet_file, spreadsheet_column,
-                           pdf_file, pdf_path, pdf_description, pdf_mode,
-                           image_file, image_path, image_description,
                            model_tier, model, model_source_input, api_key_input,
                            progress=gr.Progress(track_tqdm=True)):
     """Extract categories from data and display them in a table."""
@@ -536,16 +536,19 @@ def run_extract_categories(input_type, spreadsheet_file, spreadsheet_column,
             )
         elif input_type == "PDF Documents":
-            # Use path if provided, otherwise use uploaded file
-            if pdf_path and pdf_path.strip():
-                pdf_input = pdf_path.strip()
             elif pdf_file:
                 if isinstance(pdf_file, list):
                     pdf_input = [f if isinstance(f, str) else f.name for f in pdf_file]
                 else:
                     pdf_input = pdf_file if isinstance(pdf_file, str) else pdf_file.name
             else:
-                yield None, None, "**Error:** Please upload a PDF file or enter a path"
                 return
             mode_mapping = {
@@ -566,16 +569,19 @@ def run_extract_categories(input_type, spreadsheet_file, spreadsheet_column,
             )
         elif input_type == "Images":
-            # Use path if provided, otherwise use uploaded file
-            if image_path and image_path.strip():
-                image_input = image_path.strip()
             elif image_file:
                 if isinstance(image_file, list):
                     image_input = [f if isinstance(f, str) else f.name for f in image_file]
                 else:
                     image_input = image_file if isinstance(image_file, str) else image_file.name
             else:
-                yield None, None, "**Error:** Please upload image files or enter a path"
                 return
             result = catllm.extract(
@@ -620,8 +626,8 @@ def run_extract_categories(input_type, spreadsheet_file, spreadsheet_column,
 def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
-                      pdf_file, pdf_path, pdf_description, pdf_mode,
-                      image_file, image_path, image_description,
                       cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10,
                       model_tier, model, model_source_input, api_key_input,
                       progress=gr.Progress(track_tqdm=True)):
@@ -681,10 +687,14 @@ def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
             )
         elif input_type == "PDF Documents":
-            # Use path if provided, otherwise use uploaded file
-            if pdf_path and pdf_path.strip():
-                pdf_input = pdf_path.strip()
-                original_filename = pdf_input.split("/")[-1]
             elif pdf_file:
                 if isinstance(pdf_file, list):
                     pdf_input = [f if isinstance(f, str) else f.name for f in pdf_file]
@@ -693,7 +703,7 @@ def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
                     pdf_input = pdf_file if isinstance(pdf_file, str) else pdf_file.name
                     original_filename = pdf_input.split("/")[-1]
             else:
-                yield None, None, None, None, "**Error:** Please upload a PDF file or enter a path"
                 return
             column_name = "PDF Pages"
@@ -717,10 +727,14 @@ def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
             )
         elif input_type == "Images":
-            # Use path if provided, otherwise use uploaded file
-            if image_path and image_path.strip():
-                image_input = image_path.strip()
-                original_filename = image_input.split("/")[-1]
             elif image_file:
                 if isinstance(image_file, list):
                     image_input = [f if isinstance(f, str) else f.name for f in image_file]
@@ -729,7 +743,7 @@ def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
                     image_input = image_file if isinstance(image_file, str) else image_file.name
                     original_filename = image_input.split("/")[-1]
             else:
-                yield None, None, None, None, "**Error:** Please upload image files or enter a path"
                 return
             column_name = "Image Files"
@@ -835,8 +849,8 @@ Provide your work in JSON format where the number belonging to each category is
 def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
-                           pdf_file, pdf_path, pdf_description, pdf_mode,
-                           image_file, image_path, image_description,
                            model_tier, model, model_source_input, api_key_input,
                            progress=gr.Progress(track_tqdm=True)):
     """Extract categories then classify data with them."""
@@ -882,10 +896,14 @@ def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
             mode_param = None
         elif input_type == "PDF Documents":
-            # Use path if provided, otherwise use uploaded file
-            if pdf_path and pdf_path.strip():
-                input_data = pdf_path.strip()
-                original_filename = input_data.split("/")[-1]
             elif pdf_file:
                 if isinstance(pdf_file, list):
                     input_data = [f if isinstance(f, str) else f.name for f in pdf_file]
@@ -894,7 +912,7 @@ def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
                     input_data = pdf_file if isinstance(pdf_file, str) else pdf_file.name
                     original_filename = input_data.split("/")[-1]
             else:
-                yield None, None, None, None, None, None, "**Error:** Please upload a PDF file or enter a path"
                 return
             column_name = "PDF Pages"
@@ -909,10 +927,14 @@ def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
             mode_param = mode_mapping.get(pdf_mode, "image")
         elif input_type == "Images":
-            # Use path if provided, otherwise use uploaded file
-            if image_path and image_path.strip():
-                input_data = image_path.strip()
-                original_filename = input_data.split("/")[-1]
             elif image_file:
                 if isinstance(image_file, list):
                     input_data = [f if isinstance(f, str) else f.name for f in image_file]
@@ -921,7 +943,7 @@ def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
                     input_data = image_file if isinstance(image_file, str) else image_file.name
                     original_filename = input_data.split("/")[-1]
             else:
-                yield None, None, None, None, None, None, "**Error:** Please upload image files or enter a path"
                 return
             column_name = "Image Files"
@@ -1098,12 +1120,14 @@ def reset_all():
         gr.update(visible=False),  # image_input_group
         None,  # spreadsheet_file
         gr.update(choices=[], value=None),  # spreadsheet_column
         None,  # pdf_file
-        "",  # pdf_path
         "",  # pdf_description
         "Image (visual documents)",  # pdf_mode
         None,  # image_file
-        "",  # image_path
         "",  # image_description
         None,  # task_mode
     ]
@@ -1206,15 +1230,20 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
             # PDF input group
             with gr.Group(visible=False) as pdf_input_group:
                 pdf_file = gr.File(
                     label="Upload PDF Document(s)",
                     file_types=[".pdf"],
                     file_count="multiple"
                 )
-                pdf_path = gr.Textbox(
-                    label="Or Enter File/Directory Path",
-                    placeholder="e.g., /path/to/documents/ or /path/to/file.pdf",
-                    info="Local path to PDF file or directory containing PDFs"
                 )
                 pdf_description = gr.Textbox(
                     label="Document Description",
@@ -1229,15 +1258,20 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
             # Image input group
             with gr.Group(visible=False) as image_input_group:
                 image_file = gr.File(
                     label="Upload Images",
                     file_types=["image"],
                     file_count="multiple"
                 )
-                image_path = gr.Textbox(
-                    label="Or Enter File/Directory Path",
-                    placeholder="e.g., /path/to/images/ or /path/to/image.jpg",
-                    info="Local path to image file or directory containing images"
                 )
                 image_description = gr.Textbox(
                     label="Image Description",
@@ -1374,6 +1408,32 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
         outputs=[spreadsheet_file, spreadsheet_column, status]
     )
     add_category_btn.click(
         fn=add_category_field,
         inputs=[category_count],
@@ -1410,8 +1470,8 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
     # Main run button handler - dispatches based on task_mode
     def dispatch_run(task, input_type, spreadsheet_file, spreadsheet_column,
-                     pdf_file, pdf_path_val, pdf_description, pdf_mode,
-                     image_file, image_path_val, image_description,
                      cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10,
                      model_tier, model, model_source, api_key,
                      progress=gr.Progress(track_tqdm=True)):
@@ -1419,8 +1479,8 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
         if task == "extract":
             for update in run_extract_categories(
                 input_type, spreadsheet_file, spreadsheet_column,
-                pdf_file, pdf_path_val, pdf_description, pdf_mode,
-                image_file, image_path_val, image_description,
                 model_tier, model, model_source, api_key,
                 progress
             ):
@@ -1436,8 +1496,8 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
         elif task == "assign":
             for update in run_classify_data(
                 input_type, spreadsheet_file, spreadsheet_column,
-                pdf_file, pdf_path_val, pdf_description, pdf_mode,
-                image_file, image_path_val, image_description,
                 cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10,
                 model_tier, model, model_source, api_key,
                 progress
@@ -1454,8 +1514,8 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
         elif task == "extract_and_assign":
             for update in run_extract_and_assign(
                 input_type, spreadsheet_file, spreadsheet_column,
-                pdf_file, pdf_path_val, pdf_description, pdf_mode,
-                image_file, image_path_val, image_description,
                 model_tier, model, model_source, api_key,
                 progress
             ):
@@ -1473,8 +1533,8 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
     run_btn.click(
         fn=dispatch_run,
         inputs=[task_mode, input_type, spreadsheet_file, spreadsheet_column,
-                pdf_file, pdf_path, pdf_description, pdf_mode,
-                image_file, image_path, image_description] + category_inputs + [model_tier, model, model_source, api_key],
         outputs=[extracted_categories, extract_download, distribution_plot, results, download_file, status]
     )
@@ -1484,8 +1544,8 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
         outputs=[
             input_type, text_input_group, pdf_input_group, image_input_group,
             spreadsheet_file, spreadsheet_column,
-            pdf_file, pdf_path, pdf_description, pdf_mode,
-            image_file, image_path, image_description,
             task_mode
         ] + category_inputs + [
             add_category_btn, category_count,

 def run_extract_categories(input_type, spreadsheet_file, spreadsheet_column,
+                           pdf_file, pdf_folder, pdf_description, pdf_mode,
+                           image_file, image_folder, image_description,
                            model_tier, model, model_source_input, api_key_input,
                            progress=gr.Progress(track_tqdm=True)):
     """Extract categories from data and display them in a table."""
             )
         elif input_type == "PDF Documents":
+            # Use folder if provided, otherwise use uploaded files
+            if pdf_folder:
+                if isinstance(pdf_folder, list):
+                    pdf_input = [f if isinstance(f, str) else f.name for f in pdf_folder if str(f.name if hasattr(f, 'name') else f).lower().endswith('.pdf')]
+                else:
+                    pdf_input = pdf_folder if isinstance(pdf_folder, str) else pdf_folder.name
             elif pdf_file:
                 if isinstance(pdf_file, list):
                     pdf_input = [f if isinstance(f, str) else f.name for f in pdf_file]
                 else:
                     pdf_input = pdf_file if isinstance(pdf_file, str) else pdf_file.name
             else:
+                yield None, None, "**Error:** Please upload PDF file(s) or a folder"
                 return
             mode_mapping = {
             )
         elif input_type == "Images":
+            # Use folder if provided, otherwise use uploaded files
+            if image_folder:
+                if isinstance(image_folder, list):
+                    image_input = [f if isinstance(f, str) else f.name for f in image_folder]
+                else:
+                    image_input = image_folder if isinstance(image_folder, str) else image_folder.name
             elif image_file:
                 if isinstance(image_file, list):
                     image_input = [f if isinstance(f, str) else f.name for f in image_file]
                 else:
                     image_input = image_file if isinstance(image_file, str) else image_file.name
             else:
+                yield None, None, "**Error:** Please upload image file(s) or a folder"
                 return
             result = catllm.extract(
 def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
+                      pdf_file, pdf_folder, pdf_description, pdf_mode,
+                      image_file, image_folder, image_description,
                       cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10,
                       model_tier, model, model_source_input, api_key_input,
                       progress=gr.Progress(track_tqdm=True)):
             )
         elif input_type == "PDF Documents":
+            # Use folder if provided, otherwise use uploaded files
+            if pdf_folder:
+                if isinstance(pdf_folder, list):
+                    pdf_input = [f if isinstance(f, str) else f.name for f in pdf_folder if str(f.name if hasattr(f, 'name') else f).lower().endswith('.pdf')]
+                    original_filename = "pdf_folder"
+                else:
+                    pdf_input = pdf_folder if isinstance(pdf_folder, str) else pdf_folder.name
+                    original_filename = pdf_input.split("/")[-1]
             elif pdf_file:
                 if isinstance(pdf_file, list):
                     pdf_input = [f if isinstance(f, str) else f.name for f in pdf_file]
                     pdf_input = pdf_file if isinstance(pdf_file, str) else pdf_file.name
                     original_filename = pdf_input.split("/")[-1]
             else:
+                yield None, None, None, None, "**Error:** Please upload PDF file(s) or a folder"
                 return
             column_name = "PDF Pages"
             )
         elif input_type == "Images":
+            # Use folder if provided, otherwise use uploaded files
+            if image_folder:
+                if isinstance(image_folder, list):
+                    image_input = [f if isinstance(f, str) else f.name for f in image_folder]
+                    original_filename = "image_folder"
+                else:
+                    image_input = image_folder if isinstance(image_folder, str) else image_folder.name
+                    original_filename = image_input.split("/")[-1]
             elif image_file:
                 if isinstance(image_file, list):
                     image_input = [f if isinstance(f, str) else f.name for f in image_file]
                     image_input = image_file if isinstance(image_file, str) else image_file.name
                     original_filename = image_input.split("/")[-1]
             else:
+                yield None, None, None, None, "**Error:** Please upload image file(s) or a folder"
                 return
             column_name = "Image Files"
 def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
+                           pdf_file, pdf_folder, pdf_description, pdf_mode,
+                           image_file, image_folder, image_description,
                            model_tier, model, model_source_input, api_key_input,
                            progress=gr.Progress(track_tqdm=True)):
     """Extract categories then classify data with them."""
             mode_param = None
         elif input_type == "PDF Documents":
+            # Use folder if provided, otherwise use uploaded files
+            if pdf_folder:
+                if isinstance(pdf_folder, list):
+                    input_data = [f if isinstance(f, str) else f.name for f in pdf_folder if str(f.name if hasattr(f, 'name') else f).lower().endswith('.pdf')]
+                    original_filename = "pdf_folder"
+                else:
+                    input_data = pdf_folder if isinstance(pdf_folder, str) else pdf_folder.name
+                    original_filename = input_data.split("/")[-1]
             elif pdf_file:
                 if isinstance(pdf_file, list):
                     input_data = [f if isinstance(f, str) else f.name for f in pdf_file]
                     input_data = pdf_file if isinstance(pdf_file, str) else pdf_file.name
                     original_filename = input_data.split("/")[-1]
             else:
+                yield None, None, None, None, None, None, "**Error:** Please upload PDF file(s) or a folder"
                 return
             column_name = "PDF Pages"
             mode_param = mode_mapping.get(pdf_mode, "image")
         elif input_type == "Images":
+            # Use folder if provided, otherwise use uploaded files
+            if image_folder:
+                if isinstance(image_folder, list):
+                    input_data = [f if isinstance(f, str) else f.name for f in image_folder]
+                    original_filename = "image_folder"
+                else:
+                    input_data = image_folder if isinstance(image_folder, str) else image_folder.name
+                    original_filename = input_data.split("/")[-1]
             elif image_file:
                 if isinstance(image_file, list):
                     input_data = [f if isinstance(f, str) else f.name for f in image_file]
                     input_data = image_file if isinstance(image_file, str) else image_file.name
                     original_filename = input_data.split("/")[-1]
             else:
+                yield None, None, None, None, None, None, "**Error:** Please upload image file(s) or a folder"
                 return
             column_name = "Image Files"
         gr.update(visible=False),  # image_input_group
         None,  # spreadsheet_file
         gr.update(choices=[], value=None),  # spreadsheet_column
+        "Upload File(s)",  # pdf_upload_type
         None,  # pdf_file
+        None,  # pdf_folder
         "",  # pdf_description
         "Image (visual documents)",  # pdf_mode
+        "Upload File(s)",  # image_upload_type
         None,  # image_file
+        None,  # image_folder
         "",  # image_description
         None,  # task_mode
     ]
             # PDF input group
             with gr.Group(visible=False) as pdf_input_group:
+                pdf_upload_type = gr.Radio(
+                    choices=["Upload File(s)", "Upload Folder"],
+                    value="Upload File(s)",
+                    label="Upload Type"
+                )
                 pdf_file = gr.File(
                     label="Upload PDF Document(s)",
                     file_types=[".pdf"],
                     file_count="multiple"
                 )
+                pdf_folder = gr.File(
+                    label="Upload PDF Folder",
+                    file_count="directory",
+                    visible=False
                 )
                 pdf_description = gr.Textbox(
                     label="Document Description",
             # Image input group
             with gr.Group(visible=False) as image_input_group:
+                image_upload_type = gr.Radio(
+                    choices=["Upload File(s)", "Upload Folder"],
+                    value="Upload File(s)",
+                    label="Upload Type"
+                )
                 image_file = gr.File(
                     label="Upload Images",
                     file_types=["image"],
                     file_count="multiple"
                 )
+                image_folder = gr.File(
+                    label="Upload Image Folder",
+                    file_count="directory",
+                    visible=False
                 )
                 image_description = gr.Textbox(
                     label="Image Description",
         outputs=[spreadsheet_file, spreadsheet_column, status]
     )
+    # Toggle between file and folder upload for PDFs
+    def toggle_pdf_upload(upload_type):
+        """Return visibility updates for the PDF file picker and folder picker, in that order."""
+        # The returned pair is positional: the first update is for the file
+        # upload widget, the second for the folder upload widget.
+        if upload_type == "Upload File(s)":
+            return gr.update(visible=True), gr.update(visible=False)
+        else:
+            # Any value other than "Upload File(s)" shows the folder picker
+            # and hides the file picker.
+            return gr.update(visible=False), gr.update(visible=True)
+    pdf_upload_type.change(
+        fn=toggle_pdf_upload,
+        inputs=[pdf_upload_type],
+        outputs=[pdf_file, pdf_folder]
+    )
+    # Toggle between file and folder upload for Images
+    def toggle_image_upload(upload_type):
+        """Return visibility updates for the image file picker and folder picker, in that order."""
+        # The returned pair is positional: the first update is for the file
+        # upload widget, the second for the folder upload widget.
+        if upload_type == "Upload File(s)":
+            return gr.update(visible=True), gr.update(visible=False)
+        else:
+            # Any value other than "Upload File(s)" shows the folder picker
+            # and hides the file picker.
+            return gr.update(visible=False), gr.update(visible=True)
+    image_upload_type.change(
+        fn=toggle_image_upload,
+        inputs=[image_upload_type],
+        outputs=[image_file, image_folder]
+    )
     add_category_btn.click(
         fn=add_category_field,
         inputs=[category_count],
     # Main run button handler - dispatches based on task_mode
     def dispatch_run(task, input_type, spreadsheet_file, spreadsheet_column,
+                     pdf_file, pdf_folder_val, pdf_description, pdf_mode,
+                     image_file, image_folder_val, image_description,
                      cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10,
                      model_tier, model, model_source, api_key,
                      progress=gr.Progress(track_tqdm=True)):
         if task == "extract":
             for update in run_extract_categories(
                 input_type, spreadsheet_file, spreadsheet_column,
+                pdf_file, pdf_folder_val, pdf_description, pdf_mode,
+                image_file, image_folder_val, image_description,
                 model_tier, model, model_source, api_key,
                 progress
             ):
         elif task == "assign":
             for update in run_classify_data(
                 input_type, spreadsheet_file, spreadsheet_column,
+                pdf_file, pdf_folder_val, pdf_description, pdf_mode,
+                image_file, image_folder_val, image_description,
                 cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10,
                 model_tier, model, model_source, api_key,
                 progress
         elif task == "extract_and_assign":
             for update in run_extract_and_assign(
                 input_type, spreadsheet_file, spreadsheet_column,
+                pdf_file, pdf_folder_val, pdf_description, pdf_mode,
+                image_file, image_folder_val, image_description,
                 model_tier, model, model_source, api_key,
                 progress
             ):
     run_btn.click(
         fn=dispatch_run,
         inputs=[task_mode, input_type, spreadsheet_file, spreadsheet_column,
+                pdf_file, pdf_folder, pdf_description, pdf_mode,
+                image_file, image_folder, image_description] + category_inputs + [model_tier, model, model_source, api_key],
         outputs=[extracted_categories, extract_download, distribution_plot, results, download_file, status]
     )
         outputs=[
             input_type, text_input_group, pdf_input_group, image_input_group,
             spreadsheet_file, spreadsheet_column,
+            pdf_upload_type, pdf_file, pdf_folder, pdf_description, pdf_mode,
+            image_upload_type, image_file, image_folder, image_description,
             task_mode
         ] + category_inputs + [
             add_category_btn, category_count,