chrissoria committed on
Commit 5542b9b · 1 Parent(s): 34012b0

For PDF text mode extraction: extract all text first, then process it as random chunks like a text corpus

Files changed (1)
  1. app.py +178 -80
app.py CHANGED
@@ -35,6 +35,23 @@ def count_pdf_pages(pdf_path):
    except Exception:
        return 1 # Default to 1 if can't read

+
+def extract_text_from_pdfs(pdf_paths):
+    """Extract text from all pages of all PDFs, returning list of page texts."""
+    import fitz # PyMuPDF
+    all_texts = []
+    for pdf_path in pdf_paths:
+        try:
+            doc = fitz.open(pdf_path)
+            for page in doc:
+                text = page.get_text().strip()
+                if text: # Only add non-empty pages
+                    all_texts.append(text)
+            doc.close()
+        except Exception as e:
+            print(f"Error extracting text from {pdf_path}: {e}")
+    return all_texts
+
# Free models (uses Space secrets - no user API key needed)
FREE_MODEL_CHOICES = [
    "Qwen/Qwen3-VL-235B-A22B-Instruct:novita",
@@ -832,91 +849,172 @@ with col_input:
            st.error(f"{provider} API key not configured")
        else:
            model_source = get_model_source(model)
-            num_items = len(input_data) if isinstance(input_data, list) else 1
-
-            # Calculate divisions (same logic as run_auto_extract)
-            if input_type_selected == "image":
-                divisions = min(3, max(1, num_items // 5))
-                categories_per_chunk = 12
-            else:
-                divisions = max(1, num_items // 15)
-                divisions = min(divisions, 5)
-                chunk_size = num_items // max(1, divisions)
-                categories_per_chunk = min(10, chunk_size - 1)
-
-            # Progress UI
-            progress_bar = st.progress(0)
-            status_text = st.empty()
-            start_time = time.time()
-
-            # Process each division
-            all_categories = []
            items_list = input_data if isinstance(input_data, list) else [input_data]
-            chunk_size = len(items_list) // divisions if divisions > 0 else len(items_list)

-            for div_idx in range(divisions):
-                # Update progress
-                progress = div_idx / divisions
-                progress_bar.progress(progress)

-                # Calculate ETA
-                elapsed = time.time() - start_time
-                if div_idx > 0:
-                    avg_time = elapsed / div_idx
-                    eta_seconds = avg_time * (divisions - div_idx)
-                    eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
-                else:
-                    eta_str = ""

-                status_text.text(f"Analyzing chunk {div_idx + 1} of {divisions} ({progress*100:.0f}%){eta_str}")
-
-                # Get chunk of data
-                start_idx = div_idx * chunk_size
-                end_idx = start_idx + chunk_size if div_idx < divisions - 1 else len(items_list)
-                chunk_data = items_list[start_idx:end_idx]
-
-                if not chunk_data:
-                    continue
-
-                try:
-                    extract_kwargs = {
-                        'input_data': chunk_data,
-                        'api_key': actual_api_key,
-                        'input_type': input_type_selected,
-                        'description': description,
-                        'user_model': model,
-                        'model_source': model_source,
-                        'divisions': 1, # Process as single chunk
-                        'categories_per_chunk': categories_per_chunk,
-                        'max_categories': int(max_categories)
-                    }
-                    if mode:
-                        extract_kwargs['mode'] = mode
-
-                    chunk_result = catllm.extract(**extract_kwargs)
-                    chunk_cats = chunk_result.get('top_categories', [])
-                    all_categories.extend(chunk_cats)
-                except Exception as e:
-                    st.warning(f"Error on chunk {div_idx + 1}: {str(e)}")
-                    continue
-
-            # Complete progress
-            progress_bar.progress(1.0)
-            processing_time = time.time() - start_time
-            status_text.text(f"Extraction complete in {processing_time:.1f}s")
-
-            if all_categories:
-                # Deduplicate and get top categories
-                from collections import Counter
-                cat_counts = Counter(all_categories)
-                top_cats = [cat for cat, _ in cat_counts.most_common(int(max_categories))]
-
-                st.session_state.extracted_categories = top_cats
-                st.session_state.task_mode = "manual"
-                st.success(f"Extracted {len(top_cats)} categories in {processing_time:.1f}s")
-                st.rerun()
+            # For PDF text mode, extract text and process like text corpus
+            if input_type_selected == "pdf" and mode == "text":
+                status_text = st.empty()
+                status_text.text("Extracting text from PDFs...")

+                # Extract text from all PDF pages
+                extracted_texts = extract_text_from_pdfs(items_list)

+                if not extracted_texts:
+                    st.error("No text could be extracted from the PDFs")
+                else:
+                    # Now process as text corpus with random chunks
+                    import random
+                    random.shuffle(extracted_texts) # Randomize order
+
+                    num_items = len(extracted_texts)
+                    divisions = max(1, num_items // 15)
+                    divisions = min(divisions, 5)
+                    chunk_size_calc = num_items // max(1, divisions)
+                    categories_per_chunk = min(10, chunk_size_calc - 1) if chunk_size_calc > 1 else 5
+
+                    # Progress UI
+                    progress_bar = st.progress(0)
+                    start_time = time.time()
+
+                    all_categories = []
+                    chunk_size = len(extracted_texts) // divisions if divisions > 0 else len(extracted_texts)
+
+                    for div_idx in range(divisions):
+                        progress = div_idx / divisions
+                        progress_bar.progress(progress)
+
+                        elapsed = time.time() - start_time
+                        if div_idx > 0:
+                            avg_time = elapsed / div_idx
+                            eta_seconds = avg_time * (divisions - div_idx)
+                            eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
+                        else:
+                            eta_str = ""
+
+                        status_text.text(f"Analyzing text chunk {div_idx + 1} of {divisions} ({progress*100:.0f}%){eta_str}")
+
+                        start_idx = div_idx * chunk_size
+                        end_idx = start_idx + chunk_size if div_idx < divisions - 1 else len(extracted_texts)
+                        chunk_data = extracted_texts[start_idx:end_idx]
+
+                        if not chunk_data:
+                            continue
+
+                        try:
+                            # Process as text type since we extracted the text
+                            extract_kwargs = {
+                                'input_data': chunk_data,
+                                'api_key': actual_api_key,
+                                'input_type': 'text', # Treat as text now
+                                'description': description,
+                                'user_model': model,
+                                'model_source': model_source,
+                                'divisions': 1,
+                                'categories_per_chunk': categories_per_chunk,
+                                'max_categories': int(max_categories)
+                            }
+
+                            chunk_result = catllm.extract(**extract_kwargs)
+                            chunk_cats = chunk_result.get('top_categories', [])
+                            all_categories.extend(chunk_cats)
+                        except Exception as e:
+                            st.warning(f"Error on chunk {div_idx + 1}: {str(e)}")
+                            continue
+
+                    progress_bar.progress(1.0)
+                    processing_time = time.time() - start_time
+                    status_text.text(f"Extraction complete in {processing_time:.1f}s")
+
+                    if all_categories:
+                        from collections import Counter
+                        cat_counts = Counter(all_categories)
+                        top_cats = [cat for cat, _ in cat_counts.most_common(int(max_categories))]
+
+                        st.session_state.extracted_categories = top_cats
+                        st.session_state.task_mode = "manual"
+                        st.success(f"Extracted {len(top_cats)} categories from {len(extracted_texts)} pages in {processing_time:.1f}s")
+                        st.rerun()
+                    else:
+                        st.error("No categories were extracted")
            else:
-                st.error("No categories were extracted")
+                # Original logic for images, PDFs in image/both mode, and text
+                num_items = len(items_list)
+
+                if input_type_selected == "image":
+                    divisions = min(3, max(1, num_items // 5))
+                    categories_per_chunk = 12
+                else:
+                    divisions = max(1, num_items // 15)
+                    divisions = min(divisions, 5)
+                    chunk_size_calc = num_items // max(1, divisions)
+                    categories_per_chunk = min(10, chunk_size_calc - 1) if chunk_size_calc > 1 else 5
+
+                # Progress UI
+                progress_bar = st.progress(0)
+                status_text = st.empty()
+                start_time = time.time()
+
+                all_categories = []
+                chunk_size = len(items_list) // divisions if divisions > 0 else len(items_list)
+
+                for div_idx in range(divisions):
+                    progress = div_idx / divisions
+                    progress_bar.progress(progress)
+
+                    elapsed = time.time() - start_time
+                    if div_idx > 0:
+                        avg_time = elapsed / div_idx
+                        eta_seconds = avg_time * (divisions - div_idx)
+                        eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
+                    else:
+                        eta_str = ""
+
+                    status_text.text(f"Analyzing chunk {div_idx + 1} of {divisions} ({progress*100:.0f}%){eta_str}")
+
+                    start_idx = div_idx * chunk_size
+                    end_idx = start_idx + chunk_size if div_idx < divisions - 1 else len(items_list)
+                    chunk_data = items_list[start_idx:end_idx]
+
+                    if not chunk_data:
+                        continue
+
+                    try:
+                        extract_kwargs = {
+                            'input_data': chunk_data,
+                            'api_key': actual_api_key,
+                            'input_type': input_type_selected,
+                            'description': description,
+                            'user_model': model,
+                            'model_source': model_source,
+                            'divisions': 1,
+                            'categories_per_chunk': categories_per_chunk,
+                            'max_categories': int(max_categories)
+                        }
+                        if mode:
+                            extract_kwargs['mode'] = mode
+
+                        chunk_result = catllm.extract(**extract_kwargs)
+                        chunk_cats = chunk_result.get('top_categories', [])
+                        all_categories.extend(chunk_cats)
+                    except Exception as e:
+                        st.warning(f"Error on chunk {div_idx + 1}: {str(e)}")
+                        continue
+
+                progress_bar.progress(1.0)
+                processing_time = time.time() - start_time
+                status_text.text(f"Extraction complete in {processing_time:.1f}s")
+
+                if all_categories:
+                    from collections import Counter
+                    cat_counts = Counter(all_categories)
+                    top_cats = [cat for cat, _ in cat_counts.most_common(int(max_categories))]
+
+                    st.session_state.extracted_categories = top_cats
+                    st.session_state.task_mode = "manual"
+                    st.success(f"Extracted {len(top_cats)} categories in {processing_time:.1f}s")
+                    st.rerun()
+                else:
+                    st.error("No categories were extracted")

    # Category inputs (shown for manual mode or after extraction)
    if st.session_state.task_mode == "manual":
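
For reference, a minimal standalone sketch of the flow this commit wires into app.py. It assumes PyMuPDF (fitz) is installed and that catllm.extract accepts the same keyword arguments and returns the same 'top_categories' key as in the calls above; the wrapper function, its name, and its parameters are illustrative only and are not part of the commit.

import random
from collections import Counter

import fitz  # PyMuPDF
import catllm


def pdf_text_categories(pdf_paths, api_key, description, model, model_source, max_categories=10):
    # 1. Pull the text of every non-empty page out of every PDF.
    pages = []
    for pdf_path in pdf_paths:
        doc = fitz.open(pdf_path)
        for page in doc:
            text = page.get_text().strip()
            if text:
                pages.append(text)
        doc.close()
    if not pages:
        return []

    # 2. Shuffle pages so each chunk mixes content from all documents, then split
    #    into at most 5 chunks of roughly 15 pages (same sizing rule as app.py).
    random.shuffle(pages)
    divisions = min(max(1, len(pages) // 15), 5)
    chunk_size = len(pages) // divisions
    categories_per_chunk = min(10, chunk_size - 1) if chunk_size > 1 else 5

    # 3. Ask the model for candidate categories per chunk, treating pages as plain text.
    all_categories = []
    for i in range(divisions):
        start = i * chunk_size
        end = start + chunk_size if i < divisions - 1 else len(pages)
        result = catllm.extract(
            input_data=pages[start:end],
            api_key=api_key,
            input_type="text",
            description=description,
            user_model=model,
            model_source=model_source,
            divisions=1,
            categories_per_chunk=categories_per_chunk,
            max_categories=max_categories,
        )
        all_categories.extend(result.get("top_categories", []))

    # 4. Keep the categories that recur most often across chunks.
    return [cat for cat, _ in Counter(all_categories).most_common(max_categories)]

In app.py itself these steps run inline inside the Streamlit button handler, wrapped in the st.progress and ETA reporting shown in the diff, and the original per-item logic still handles images, text input, and PDFs processed in image or both mode.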