Spaces:

CatLLM
/

survey-classifier

Running

chrissoria Claude committed 13 days ago

Commit

7a755b8

1 Parent(s): 6a41c27

Add file/directory path input option for PDFs and images

- Add text input for entering local file or directory paths
- Path input works as an alternative to file upload
- Supports single files, multiple files, or directories
- Update all processing functions to handle path input

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (1) hide show

app.py +109 -60

app.py CHANGED Viewed

@@ -486,8 +486,8 @@ def update_task_visibility(task):
 def run_extract_categories(input_type, spreadsheet_file, spreadsheet_column,
-                           pdf_file, pdf_description, pdf_mode,
-                           image_file, image_description,
                            model_tier, model, model_source_input, api_key_input,
                            progress=gr.Progress(track_tqdm=True)):
     """Extract categories from data and display them in a table."""
@@ -536,11 +536,18 @@ def run_extract_categories(input_type, spreadsheet_file, spreadsheet_column,
             )
         elif input_type == "PDF Documents":
-            if not pdf_file:
-                yield None, None, "**Error:** Please upload a PDF file"
                 return
-            pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
             mode_mapping = {
                 "Image (visual documents)": "image",
                 "Text (text-heavy)": "text",
@@ -549,7 +556,7 @@ def run_extract_categories(input_type, spreadsheet_file, spreadsheet_column,
             actual_mode = mode_mapping.get(pdf_mode, "image")
             result = catllm.extract(
-                input_data=pdf_path,
                 api_key=actual_api_key,
                 input_type="pdf",
                 description=pdf_description or "document",
@@ -559,18 +566,20 @@ def run_extract_categories(input_type, spreadsheet_file, spreadsheet_column,
             )
         elif input_type == "Images":
-            if not image_file:
-                yield None, None, "**Error:** Please upload image files"
-                return
-            # Handle single or multiple image files
-            if isinstance(image_file, list):
-                image_paths = [f if isinstance(f, str) else f.name for f in image_file]
             else:
-                image_paths = image_file if isinstance(image_file, str) else image_file.name
             result = catllm.extract(
-                input_data=image_paths,
                 api_key=actual_api_key,
                 input_type="image",
                 description=image_description or "images",
@@ -611,8 +620,8 @@ def run_extract_categories(input_type, spreadsheet_file, spreadsheet_column,
 def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
-                      pdf_file, pdf_description, pdf_mode,
-                      image_file, image_description,
                       cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10,
                       model_tier, model, model_source_input, api_key_input,
                       progress=gr.Progress(track_tqdm=True)):
@@ -672,12 +681,21 @@ def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
             )
         elif input_type == "PDF Documents":
-            if not pdf_file:
-                yield None, None, None, None, "**Error:** Please upload a PDF file"
                 return
-            pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
-            original_filename = pdf_path.split("/")[-1]
             column_name = "PDF Pages"
             mode_mapping = {
@@ -688,7 +706,7 @@ def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
             actual_mode = mode_mapping.get(pdf_mode, "image")
             result = catllm.classify(
-                input_data=pdf_path,
                 categories=categories,
                 api_key=actual_api_key,
                 input_type="pdf",
@@ -699,20 +717,25 @@ def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
             )
         elif input_type == "Images":
-            if not image_file:
-                yield None, None, None, None, "**Error:** Please upload image files"
-                return
-            if isinstance(image_file, list):
-                image_paths = [f if isinstance(f, str) else f.name for f in image_file]
             else:
-                image_paths = image_file if isinstance(image_file, str) else image_file.name
-            original_filename = "images"
             column_name = "Image Files"
             result = catllm.classify(
-                input_data=image_paths,
                 categories=categories,
                 api_key=actual_api_key,
                 input_type="image",
@@ -812,8 +835,8 @@ Provide your work in JSON format where the number belonging to each category is
 def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
-                           pdf_file, pdf_description, pdf_mode,
-                           image_file, image_description,
                            model_tier, model, model_source_input, api_key_input,
                            progress=gr.Progress(track_tqdm=True)):
     """Extract categories then classify data with them."""
@@ -859,13 +882,21 @@ def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
             mode_param = None
         elif input_type == "PDF Documents":
-            if not pdf_file:
-                yield None, None, None, None, None, None, "**Error:** Please upload a PDF file"
                 return
-            pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
-            input_data = pdf_path
-            original_filename = pdf_path.split("/")[-1]
             column_name = "PDF Pages"
             input_type_param = "pdf"
             description = pdf_description or "document"
@@ -878,16 +909,21 @@ def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
             mode_param = mode_mapping.get(pdf_mode, "image")
         elif input_type == "Images":
-            if not image_file:
-                yield None, None, None, None, None, None, "**Error:** Please upload image files"
-                return
-            if isinstance(image_file, list):
-                input_data = [f if isinstance(f, str) else f.name for f in image_file]
             else:
-                input_data = image_file if isinstance(image_file, str) else image_file.name
-            original_filename = "images"
             column_name = "Image Files"
             input_type_param = "image"
             description = image_description or "images"
@@ -1063,9 +1099,11 @@ def reset_all():
         None,  # spreadsheet_file
         gr.update(choices=[], value=None),  # spreadsheet_column
         None,  # pdf_file
         "",  # pdf_description
         "Image (visual documents)",  # pdf_mode
         None,  # image_file
         "",  # image_description
         None,  # task_mode
     ]
@@ -1169,8 +1207,14 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
             # PDF input group
             with gr.Group(visible=False) as pdf_input_group:
                 pdf_file = gr.File(
-                    label="Upload PDF Document",
-                    file_types=[".pdf"]
                 )
                 pdf_description = gr.Textbox(
                     label="Document Description",
@@ -1190,6 +1234,11 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
                     file_types=["image"],
                     file_count="multiple"
                 )
                 image_description = gr.Textbox(
                     label="Image Description",
                     placeholder="e.g., 'product photos', 'social media posts'",
@@ -1361,8 +1410,8 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
     # Main run button handler - dispatches based on task_mode
     def dispatch_run(task, input_type, spreadsheet_file, spreadsheet_column,
-                     pdf_file, pdf_description, pdf_mode,
-                     image_file, image_description,
                      cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10,
                      model_tier, model, model_source, api_key,
                      progress=gr.Progress(track_tqdm=True)):
@@ -1370,8 +1419,8 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
         if task == "extract":
             for update in run_extract_categories(
                 input_type, spreadsheet_file, spreadsheet_column,
-                pdf_file, pdf_description, pdf_mode,
-                image_file, image_description,
                 model_tier, model, model_source, api_key,
                 progress
             ):
@@ -1387,8 +1436,8 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
         elif task == "assign":
             for update in run_classify_data(
                 input_type, spreadsheet_file, spreadsheet_column,
-                pdf_file, pdf_description, pdf_mode,
-                image_file, image_description,
                 cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10,
                 model_tier, model, model_source, api_key,
                 progress
@@ -1405,8 +1454,8 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
         elif task == "extract_and_assign":
             for update in run_extract_and_assign(
                 input_type, spreadsheet_file, spreadsheet_column,
-                pdf_file, pdf_description, pdf_mode,
-                image_file, image_description,
                 model_tier, model, model_source, api_key,
                 progress
             ):
@@ -1424,8 +1473,8 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
     run_btn.click(
         fn=dispatch_run,
         inputs=[task_mode, input_type, spreadsheet_file, spreadsheet_column,
-                pdf_file, pdf_description, pdf_mode,
-                image_file, image_description] + category_inputs + [model_tier, model, model_source, api_key],
         outputs=[extracted_categories, extract_download, distribution_plot, results, download_file, status]
     )
@@ -1435,8 +1484,8 @@ Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DO
         outputs=[
             input_type, text_input_group, pdf_input_group, image_input_group,
             spreadsheet_file, spreadsheet_column,
-            pdf_file, pdf_description, pdf_mode,
-            image_file, image_description,
             task_mode
         ] + category_inputs + [
             add_category_btn, category_count,

 def run_extract_categories(input_type, spreadsheet_file, spreadsheet_column,
+                           pdf_file, pdf_path, pdf_description, pdf_mode,
+                           image_file, image_path, image_description,
                            model_tier, model, model_source_input, api_key_input,
                            progress=gr.Progress(track_tqdm=True)):
     """Extract categories from data and display them in a table."""
             )
         elif input_type == "PDF Documents":
+            # Use path if provided, otherwise use uploaded file
+            if pdf_path and pdf_path.strip():
+                pdf_input = pdf_path.strip()
+            elif pdf_file:
+                if isinstance(pdf_file, list):
+                    pdf_input = [f if isinstance(f, str) else f.name for f in pdf_file]
+                else:
+                    pdf_input = pdf_file if isinstance(pdf_file, str) else pdf_file.name
+            else:
+                yield None, None, "**Error:** Please upload a PDF file or enter a path"
                 return
             mode_mapping = {
                 "Image (visual documents)": "image",
                 "Text (text-heavy)": "text",
             actual_mode = mode_mapping.get(pdf_mode, "image")
             result = catllm.extract(
+                input_data=pdf_input,
                 api_key=actual_api_key,
                 input_type="pdf",
                 description=pdf_description or "document",
             )
         elif input_type == "Images":
+            # Use path if provided, otherwise use uploaded file
+            if image_path and image_path.strip():
+                image_input = image_path.strip()
+            elif image_file:
+                if isinstance(image_file, list):
+                    image_input = [f if isinstance(f, str) else f.name for f in image_file]
+                else:
+                    image_input = image_file if isinstance(image_file, str) else image_file.name
             else:
+                yield None, None, "**Error:** Please upload image files or enter a path"
+                return
             result = catllm.extract(
+                input_data=image_input,
                 api_key=actual_api_key,
                 input_type="image",
                 description=image_description or "images",
 def run_classify_data(input_type, spreadsheet_file, spreadsheet_column,
+                      pdf_file, pdf_path, pdf_description, pdf_mode,
+                      image_file, image_path, image_description,
                       cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10,
                       model_tier, model, model_source_input, api_key_input,
                       progress=gr.Progress(track_tqdm=True)):
             )
         elif input_type == "PDF Documents":
+            # Use path if provided, otherwise use uploaded file
+            if pdf_path and pdf_path.strip():
+                pdf_input = pdf_path.strip()
+                original_filename = pdf_input.split("/")[-1]
+            elif pdf_file:
+                if isinstance(pdf_file, list):
+                    pdf_input = [f if isinstance(f, str) else f.name for f in pdf_file]
+                    original_filename = "multiple_pdfs"
+                else:
+                    pdf_input = pdf_file if isinstance(pdf_file, str) else pdf_file.name
+                    original_filename = pdf_input.split("/")[-1]
+            else:
+                yield None, None, None, None, "**Error:** Please upload a PDF file or enter a path"
                 return
             column_name = "PDF Pages"
             mode_mapping = {
             actual_mode = mode_mapping.get(pdf_mode, "image")
             result = catllm.classify(
+                input_data=pdf_input,
                 categories=categories,
                 api_key=actual_api_key,
                 input_type="pdf",
             )
         elif input_type == "Images":
+            # Use path if provided, otherwise use uploaded file
+            if image_path and image_path.strip():
+                image_input = image_path.strip()
+                original_filename = image_input.split("/")[-1]
+            elif image_file:
+                if isinstance(image_file, list):
+                    image_input = [f if isinstance(f, str) else f.name for f in image_file]
+                    original_filename = "multiple_images"
+                else:
+                    image_input = image_file if isinstance(image_file, str) else image_file.name
+                    original_filename = image_input.split("/")[-1]
             else:
+                yield None, None, None, None, "**Error:** Please upload image files or enter a path"
+                return
             column_name = "Image Files"
             result = catllm.classify(
+                input_data=image_input,
                 categories=categories,
                 api_key=actual_api_key,
                 input_type="image",
 def run_extract_and_assign(input_type, spreadsheet_file, spreadsheet_column,
+                           pdf_file, pdf_path, pdf_description, pdf_mode,
+                           image_file, image_path, image_description,
                            model_tier, model, model_source_input, api_key_input,
                            progress=gr.Progress(track_tqdm=True)):
     """Extract categories then classify data with them."""
             mode_param = None
         elif input_type == "PDF Documents":
+            # Use path if provided, otherwise use uploaded file
+            if pdf_path and pdf_path.strip():
+                input_data = pdf_path.strip()
+                original_filename = input_data.split("/")[-1]
+            elif pdf_file:
+                if isinstance(pdf_file, list):
+                    input_data = [f if isinstance(f, str) else f.name for f in pdf_file]
+                    original_filename = "multiple_pdfs"
+                else:
+                    input_data = pdf_file if isinstance(pdf_file, str) else pdf_file.name
+                    original_filename = input_data.split("/")[-1]
+            else:
+                yield None, None, None, None, None, None, "**Error:** Please upload a PDF file or enter a path"
                 return
             column_name = "PDF Pages"
             input_type_param = "pdf"
             description = pdf_description or "document"
             mode_param = mode_mapping.get(pdf_mode, "image")
         elif input_type == "Images":
+            # Use path if provided, otherwise use uploaded file
+            if image_path and image_path.strip():
+                input_data = image_path.strip()
+                original_filename = input_data.split("/")[-1]
+            elif image_file:
+                if isinstance(image_file, list):
+                    input_data = [f if isinstance(f, str) else f.name for f in image_file]
+                    original_filename = "multiple_images"
+                else:
+                    input_data = image_file if isinstance(image_file, str) else image_file.name
+                    original_filename = input_data.split("/")[-1]
             else:
+                yield None, None, None, None, None, None, "**Error:** Please upload image files or enter a path"
+                return
             column_name = "Image Files"
             input_type_param = "image"
             description = image_description or "images"
         None,  # spreadsheet_file
         gr.update(choices=[], value=None),  # spreadsheet_column
         None,  # pdf_file
+        "",  # pdf_path
         "",  # pdf_description
         "Image (visual documents)",  # pdf_mode
         None,  # image_file
+        "",  # image_path
         "",  # image_description
         None,  # task_mode
     ]
             # PDF input group
             with gr.Group(visible=False) as pdf_input_group:
                 pdf_file = gr.File(
+                    label="Upload PDF Document(s)",
+                    file_types=[".pdf"],
+                    file_count="multiple"
+                )
+                pdf_path = gr.Textbox(
+                    label="Or Enter File/Directory Path",
+                    placeholder="e.g., /path/to/documents/ or /path/to/file.pdf",
+                    info="Local path to PDF file or directory containing PDFs"
                 )
                 pdf_description = gr.Textbox(
                     label="Document Description",
                     file_types=["image"],
                     file_count="multiple"
                 )
+                image_path = gr.Textbox(
+                    label="Or Enter File/Directory Path",
+                    placeholder="e.g., /path/to/images/ or /path/to/image.jpg",
+                    info="Local path to image file or directory containing images"
+                )
                 image_description = gr.Textbox(
                     label="Image Description",
                     placeholder="e.g., 'product photos', 'social media posts'",
     # Main run button handler - dispatches based on task_mode
     def dispatch_run(task, input_type, spreadsheet_file, spreadsheet_column,
+                     pdf_file, pdf_path_val, pdf_description, pdf_mode,
+                     image_file, image_path_val, image_description,
                      cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10,
                      model_tier, model, model_source, api_key,
                      progress=gr.Progress(track_tqdm=True)):
         if task == "extract":
             for update in run_extract_categories(
                 input_type, spreadsheet_file, spreadsheet_column,
+                pdf_file, pdf_path_val, pdf_description, pdf_mode,
+                image_file, image_path_val, image_description,
                 model_tier, model, model_source, api_key,
                 progress
             ):
         elif task == "assign":
             for update in run_classify_data(
                 input_type, spreadsheet_file, spreadsheet_column,
+                pdf_file, pdf_path_val, pdf_description, pdf_mode,
+                image_file, image_path_val, image_description,
                 cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, cat10,
                 model_tier, model, model_source, api_key,
                 progress
         elif task == "extract_and_assign":
             for update in run_extract_and_assign(
                 input_type, spreadsheet_file, spreadsheet_column,
+                pdf_file, pdf_path_val, pdf_description, pdf_mode,
+                image_file, image_path_val, image_description,
                 model_tier, model, model_source, api_key,
                 progress
             ):
     run_btn.click(
         fn=dispatch_run,
         inputs=[task_mode, input_type, spreadsheet_file, spreadsheet_column,
+                pdf_file, pdf_path, pdf_description, pdf_mode,
+                image_file, image_path, image_description] + category_inputs + [model_tier, model, model_source, api_key],
         outputs=[extracted_categories, extract_download, distribution_plot, results, download_file, status]
     )
         outputs=[
             input_type, text_input_group, pdf_input_group, image_input_group,
             spreadsheet_file, spreadsheet_column,
+            pdf_file, pdf_path, pdf_description, pdf_mode,
+            image_file, image_path, image_description,
             task_mode
         ] + category_inputs + [
             add_category_btn, category_count,