chrissoria committed on
Commit
af04675
·
1 Parent(s): a9678de

Process PDF pages one at a time for real page-by-page progress

Browse files
Files changed (1) hide show
  1. app.py +139 -63
app.py CHANGED
@@ -52,6 +52,46 @@ def extract_text_from_pdfs(pdf_paths):
52
  print(f"Error extracting text from {pdf_path}: {e}")
53
  return all_texts
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  # Free models - display name -> actual API model name
56
  FREE_MODELS_MAP = {
57
  "Qwen3 235B": "Qwen/Qwen3-VL-235B-A22B-Instruct:novita",
@@ -984,14 +1024,6 @@ with col_input:
984
  model_source = get_model_source(model)
985
  items_list = input_data if isinstance(input_data, list) else [input_data]
986
 
987
- # Progress tracking
988
- total_items = len(items_list)
989
-
990
- # For PDFs, also get page counts for display
991
- if input_type_selected == "pdf":
992
- pdf_page_counts = [(pdf_path, count_pdf_pages(pdf_path)) for pdf_path in items_list]
993
- total_pages = sum(pc for _, pc in pdf_page_counts)
994
-
995
  # Progress UI
996
  progress_bar = st.progress(0)
997
  status_text = st.empty()
@@ -999,56 +1031,114 @@ with col_input:
999
  all_results = []
1000
  start_time = time.time()
1001
 
1002
- for i, item in enumerate(items_list):
1003
- # Update progress before processing
1004
- progress = i / total_items if total_items > 0 else 0
1005
- progress_bar.progress(min(progress, 1.0))
1006
-
1007
- # Calculate ETA
1008
- elapsed = time.time() - start_time
1009
- if i > 0:
1010
- avg_time_per_item = elapsed / i
1011
- remaining_items = total_items - i
1012
- eta_seconds = avg_time_per_item * remaining_items
1013
- eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
1014
- else:
1015
- eta_str = ""
1016
 
1017
- if input_type_selected == "pdf":
1018
- doc_pages = pdf_page_counts[i][1]
1019
- status_text.text(f"Processing document {i+1} of {total_items} ({doc_pages} pages) ({progress*100:.0f}%){eta_str}")
1020
- else:
1021
- status_text.text(f"Processing item {i+1} of {total_items} ({progress*100:.0f}%){eta_str}")
1022
 
1023
- try:
1024
- classify_kwargs = {
1025
- 'input_data': [item],
1026
- 'categories': categories_entered,
1027
- 'api_key': actual_api_key,
1028
- 'input_type': input_type_selected,
1029
- 'description': description,
1030
- 'user_model': model,
1031
- 'model_source': model_source
1032
- }
1033
- if mode:
1034
- classify_kwargs['mode'] = mode
1035
-
1036
- item_result = catllm.classify(**classify_kwargs)
1037
- all_results.append(item_result)
1038
-
1039
- # Update progress after processing
1040
- progress = (i + 1) / total_items if total_items > 0 else 1.0
1041
  progress_bar.progress(min(progress, 1.0))
1042
 
1043
- except Exception as e:
1044
- st.warning(f"Error on item {i+1}: {str(e)}")
1045
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1046
 
1047
  # Complete progress
1048
  progress_bar.progress(1.0)
1049
  processing_time = time.time() - start_time
1050
  if input_type_selected == "pdf":
1051
- status_text.text(f"Completed {total_items} document(s) ({total_pages} pages) in {processing_time:.1f}s")
1052
  else:
1053
  status_text.text(f"Completed {total_items} items in {processing_time:.1f}s")
1054
 
@@ -1056,20 +1146,6 @@ with col_input:
1056
  # Combine results
1057
  result_df = pd.concat(all_results, ignore_index=True)
1058
 
1059
- # For PDFs, replace temp file paths with original filenames
1060
- if input_type_selected == "pdf" and 'pdf_input' in result_df.columns:
1061
- pdf_name_map = st.session_state.get('pdf_name_map', {})
1062
- def replace_temp_path(val):
1063
- if pd.isna(val):
1064
- return val
1065
- val_str = str(val)
1066
- for temp_path, orig_name in pdf_name_map.items():
1067
- if temp_path in val_str:
1068
- # Replace temp path with original name, keep page suffix
1069
- return val_str.replace(temp_path, orig_name)
1070
- return val_str
1071
- result_df['pdf_input'] = result_df['pdf_input'].apply(replace_temp_path)
1072
-
1073
  # Save CSV
1074
  with tempfile.NamedTemporaryFile(mode='w', suffix='_classified.csv', delete=False) as f:
1075
  result_df.to_csv(f.name, index=False)
 
52
  print(f"Error extracting text from {pdf_path}: {e}")
53
  return all_texts
54
 
55
+
56
+ def extract_pdf_pages(pdf_paths, pdf_name_map, mode="image"):
57
+ """
58
+ Extract individual pages from PDFs.
59
+ Returns list of (page_data, page_label) tuples.
60
+ For image mode: page_data is path to temp image file
61
+ For text mode: page_data is extracted text
62
+ """
63
+ import fitz # PyMuPDF
64
+ pages = []
65
+
66
+ for pdf_path in pdf_paths:
67
+ orig_name = pdf_name_map.get(pdf_path, os.path.basename(pdf_path).replace('.pdf', ''))
68
+ try:
69
+ doc = fitz.open(pdf_path)
70
+ for page_num, page in enumerate(doc, 1):
71
+ page_label = f"{orig_name}_p{page_num}"
72
+
73
+ if mode == "text":
74
+ # Extract text
75
+ text = page.get_text().strip()
76
+ if text:
77
+ pages.append((text, page_label, "text"))
78
+ else:
79
+ # Render as image (for image or both mode)
80
+ pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2x zoom for better quality
81
+ img_path = tempfile.NamedTemporaryFile(delete=False, suffix='.png').name
82
+ pix.save(img_path)
83
+
84
+ if mode == "both":
85
+ text = page.get_text().strip()
86
+ pages.append((img_path, page_label, "image", text))
87
+ else:
88
+ pages.append((img_path, page_label, "image"))
89
+ doc.close()
90
+ except Exception as e:
91
+ print(f"Error extracting pages from {pdf_path}: {e}")
92
+
93
+ return pages
94
+
95
  # Free models - display name -> actual API model name
96
  FREE_MODELS_MAP = {
97
  "Qwen3 235B": "Qwen/Qwen3-VL-235B-A22B-Instruct:novita",
 
1024
  model_source = get_model_source(model)
1025
  items_list = input_data if isinstance(input_data, list) else [input_data]
1026
 
 
 
 
 
 
 
 
 
1027
  # Progress UI
1028
  progress_bar = st.progress(0)
1029
  status_text = st.empty()
 
1031
  all_results = []
1032
  start_time = time.time()
1033
 
1034
+ # For PDFs, extract pages and process one at a time
1035
+ if input_type_selected == "pdf":
1036
+ pdf_name_map = st.session_state.get('pdf_name_map', {})
1037
+ status_text.text("Extracting PDF pages...")
 
 
 
 
 
 
 
 
 
 
1038
 
1039
+ pages = extract_pdf_pages(items_list, pdf_name_map, mode)
1040
+ total_pages = len(pages)
 
 
 
1041
 
1042
+ for i, page_data in enumerate(pages):
1043
+ # Update progress
1044
+ progress = i / total_pages if total_pages > 0 else 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1045
  progress_bar.progress(min(progress, 1.0))
1046
 
1047
+ # Calculate ETA
1048
+ elapsed = time.time() - start_time
1049
+ if i > 0:
1050
+ avg_time = elapsed / i
1051
+ eta_seconds = avg_time * (total_pages - i)
1052
+ eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
1053
+ else:
1054
+ eta_str = ""
1055
+
1056
+ page_label = page_data[1]
1057
+ status_text.text(f"Processing page {i+1} of {total_pages} ({page_label}) ({progress*100:.0f}%){eta_str}")
1058
+
1059
+ try:
1060
+ if page_data[2] == "text":
1061
+ # Text mode - classify as text
1062
+ result = catllm.classify(
1063
+ input_data=[page_data[0]],
1064
+ categories=categories_entered,
1065
+ api_key=actual_api_key,
1066
+ input_type="text",
1067
+ description=description,
1068
+ user_model=model,
1069
+ model_source=model_source
1070
+ )
1071
+ else:
1072
+ # Image mode - classify as image
1073
+ result = catllm.classify(
1074
+ input_data=[page_data[0]],
1075
+ categories=categories_entered,
1076
+ api_key=actual_api_key,
1077
+ input_type="image",
1078
+ description=description,
1079
+ user_model=model,
1080
+ model_source=model_source
1081
+ )
1082
+
1083
+ # Replace the input column with the page label
1084
+ if 'image_input' in result.columns:
1085
+ result['pdf_input'] = page_label
1086
+ result = result.drop(columns=['image_input'])
1087
+ elif 'text_input' in result.columns:
1088
+ result['pdf_input'] = page_label
1089
+ result = result.drop(columns=['text_input'])
1090
+ else:
1091
+ result['pdf_input'] = page_label
1092
+
1093
+ all_results.append(result)
1094
+ except Exception as e:
1095
+ st.warning(f"Error on {page_label}: {str(e)}")
1096
+ continue
1097
+
1098
+ total_items = total_pages
1099
+ else:
1100
+ # Non-PDF processing (text, images)
1101
+ total_items = len(items_list)
1102
+
1103
+ for i, item in enumerate(items_list):
1104
+ progress = i / total_items if total_items > 0 else 0
1105
+ progress_bar.progress(min(progress, 1.0))
1106
+
1107
+ elapsed = time.time() - start_time
1108
+ if i > 0:
1109
+ avg_time = elapsed / i
1110
+ eta_seconds = avg_time * (total_items - i)
1111
+ eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
1112
+ else:
1113
+ eta_str = ""
1114
+
1115
+ status_text.text(f"Processing item {i+1} of {total_items} ({progress*100:.0f}%){eta_str}")
1116
+
1117
+ try:
1118
+ item_result = catllm.classify(
1119
+ input_data=[item],
1120
+ categories=categories_entered,
1121
+ api_key=actual_api_key,
1122
+ input_type=input_type_selected,
1123
+ description=description,
1124
+ user_model=model,
1125
+ model_source=model_source
1126
+ )
1127
+ all_results.append(item_result)
1128
+
1129
+ # Update progress after processing
1130
+ progress = (i + 1) / total_items if total_items > 0 else 1.0
1131
+ progress_bar.progress(min(progress, 1.0))
1132
+
1133
+ except Exception as e:
1134
+ st.warning(f"Error on item {i+1}: {str(e)}")
1135
+ continue
1136
 
1137
  # Complete progress
1138
  progress_bar.progress(1.0)
1139
  processing_time = time.time() - start_time
1140
  if input_type_selected == "pdf":
1141
+ status_text.text(f"Completed {total_items} pages in {processing_time:.1f}s")
1142
  else:
1143
  status_text.text(f"Completed {total_items} items in {processing_time:.1f}s")
1144
 
 
1146
  # Combine results
1147
  result_df = pd.concat(all_results, ignore_index=True)
1148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1149
  # Save CSV
1150
  with tempfile.NamedTemporaryFile(mode='w', suffix='_classified.csv', delete=False) as f:
1151
  result_df.to_csv(f.name, index=False)