Spaces:

CatLLM
/

survey-classifier

Running

App Files Files Community

chrissoria committed on Jan 9

Commit

f3d18a8

verified ·

1 Parent(s): b4d8ab0

Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

CLAUDE.md +15 -0
__pycache__/app.cpython-311.pyc +0 -0
app.py +27 -52
requirements.txt +1 -1

CLAUDE.md ADDED Viewed

	@@ -0,0 +1,15 @@

+# CatLLM HuggingFace Space
+## Deployment
+Push to: https://huggingface.co/spaces/CatLLM/survey-classifier
+## Authentication
+HuggingFace token is stored in `.env` in this directory:
+- Variable: `CATLLM_HF_TOKEN`
+- Used for pushing to the CatLLM organization space
+## Architecture
+The HuggingFace app should always use the `catllm` Python package for core functions (classification, extraction, etc.). Do not duplicate catllm logic in the app; instead, import and call the package's public functions directly (e.g., `catllm.classify()`, `catllm.extract()`, or `catllm.multi_class_ensemble()`, which `app.py` uses for classification). If new functionality is needed (e.g., progress callbacks), add it to the catllm package first, then use it in the app.

__pycache__/app.cpython-311.pyc CHANGED Viewed

Binary files a/__pycache__/app.cpython-311.pyc and b/__pycache__/app.cpython-311.pyc differ

app.py CHANGED Viewed

@@ -547,18 +547,15 @@ def run_classify_data(input_type, input_data, description, categories,
         start_time = time.time()
         classify_kwargs = {
-            'input_data': input_data,
             'categories': categories,
-            'api_key': actual_api_key,
-            'input_type': input_type,
-            'description': description,
-            'user_model': model,
-            'model_source': model_source
         }
         if mode:
-            classify_kwargs['mode'] = mode
-        result = catllm.classify(**classify_kwargs)
         processing_time = time.time() - start_time
         num_items = len(result)
@@ -1061,15 +1058,12 @@ with col_input:
                             status_text.text(f"Processing page {current_idx+1} of {total_pages} ({page_label}) ({progress*100:.0f}%){eta_str}")
                         try:
-                            result_df = catllm.classify(
-                                input_data=items_list,
                                 categories=categories_entered,
-                                api_key=actual_api_key,
-                                input_type="pdf",
-                                description=description,
-                                user_model=model,
-                                model_source=model_source,
-                                mode=mode,
                                 progress_callback=pdf_progress_callback
                             )
@@ -1100,46 +1094,27 @@ with col_input:
                             all_results = []
                     else:
-                        # Non-PDF processing (text, images) - item by item
-                        all_results = []
                         total_items = len(items_list)
-                        for i, item in enumerate(items_list):
-                            progress = i / total_items if total_items > 0 else 0
-                            progress_bar.progress(min(progress, 1.0))
-                            elapsed = time.time() - start_time
-                            if i > 0:
-                                avg_time = elapsed / i
-                                eta_seconds = avg_time * (total_items - i)
-                                eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
-                            else:
-                                eta_str = ""
-                            status_text.text(f"Processing item {i+1} of {total_items} ({progress*100:.0f}%){eta_str}")
-                            try:
-                                item_result = catllm.classify(
-                                    input_data=[item],
-                                    categories=categories_entered,
-                                    api_key=actual_api_key,
-                                    input_type=input_type_selected,
-                                    description=description,
-                                    user_model=model,
-                                    model_source=model_source
-                                )
-                                all_results.append(item_result)
-                                progress = (i + 1) / total_items if total_items > 0 else 1.0
-                                progress_bar.progress(min(progress, 1.0))
-                            except Exception as e:
-                                st.warning(f"Error on item {i+1}: {str(e)}")
-                                continue
-                        processing_time = time.time() - start_time
-                        progress_bar.progress(1.0)
-                        status_text.text(f"Completed {total_items} items in {processing_time:.1f}s")
                     if all_results:
                         # Combine results

         start_time = time.time()
         classify_kwargs = {
+            'survey_input': input_data,
             'categories': categories,
+            'models': [(model, model_source, actual_api_key)],
+            'input_description': description,
         }
         if mode:
+            classify_kwargs['pdf_mode'] = mode
+        result = catllm.multi_class_ensemble(**classify_kwargs)
         processing_time = time.time() - start_time
         num_items = len(result)
                             status_text.text(f"Processing page {current_idx+1} of {total_pages} ({page_label}) ({progress*100:.0f}%){eta_str}")
                         try:
+                            result_df = catllm.multi_class_ensemble(
+                                survey_input=items_list,
                                 categories=categories_entered,
+                                models=[(model, model_source, actual_api_key)],
+                                input_description=description,
+                                pdf_mode=mode,
                                 progress_callback=pdf_progress_callback
                             )
                             all_results = []
                     else:
+                        # Non-PDF processing (text, images) - process all at once
                         total_items = len(items_list)
+                        status_text.text(f"Processing {total_items} items...")
+                        try:
+                            result_df = catllm.multi_class_ensemble(
+                                survey_input=items_list,
+                                categories=categories_entered,
+                                models=[(model, model_source, actual_api_key)],
+                                input_description=description,
+                            )
+                            all_results = [result_df]
+                            processing_time = time.time() - start_time
+                            progress_bar.progress(1.0)
+                            status_text.text(f"Completed {total_items} items in {processing_time:.1f}s")
+                        except Exception as e:
+                            st.error(f"Error: {str(e)}")
+                            all_results = []
+                            processing_time = time.time() - start_time
                     if all_results:
                         # Combine results

requirements.txt CHANGED Viewed

@@ -1,5 +1,5 @@
 streamlit>=1.32.0
-cat-llm[pdf]>=0.1.8
 mistralai
 pydantic==2.10.6
 huggingface_hub<0.27.0

 streamlit>=1.32.0
+cat-llm[pdf]>=0.1.9
 mistralai
 pydantic==2.10.6
 huggingface_hub<0.27.0