Spaces:
Sleeping
Sleeping
| import pdfplumber | |
# Note: The Gemini call logic is now centralized in gemini_utils.py.
# This file now contains only PDF-specific processing.
def preprocess_pdf_text(
    pdf_path: str,
    *,
    x_tolerance: float = 2,
    y_tolerance: float = 2,
    layout: bool = True,
) -> str:
    """
    Extract the text of every page of a PDF using pdfplumber.

    Each page contributes one section of the form
    ``"\\n--- PAGE <n> ---\\n<text>"`` (pages are numbered from 1), and the
    sections are concatenated in page order. A page that yields no text, or
    only whitespace, contributes the placeholder
    ``[No text extracted from this page]`` instead, so every page stays
    visible in the output.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.
        x_tolerance: Forwarded unchanged to ``page.extract_text``.
        y_tolerance: Forwarded unchanged to ``page.extract_text``.
        layout: Forwarded unchanged to ``page.extract_text``; the default
            ``True`` asks pdfplumber to attempt to preserve the page layout
            for better LLM understanding.

    Returns:
        The concatenated page sections. If anything raises while opening or
        reading the PDF, the error is printed and the error message string
        is returned instead of raising, so callers always receive a ``str``.
    """
    sections = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                page_text = page.extract_text(
                    x_tolerance=x_tolerance,
                    y_tolerance=y_tolerance,
                    layout=layout,
                )
                # Treat None, "" and whitespace-only output alike: a page of
                # blank padding carries no text worth passing downstream.
                if not page_text or not page_text.strip():
                    page_text = "[No text extracted from this page]"
                sections.append(f"\n--- PAGE {page_number} ---\n{page_text}")
        return "".join(sections)
    except Exception as e:
        # Deliberate best-effort boundary: report the failure as text rather
        # than raising, which is the contract existing callers rely on.
        error_msg = f"Error processing PDF with pdfplumber: {type(e).__name__} - {e}"
        print(error_msg)
        return error_msg
# The get_structured_data_with_gemini call stays in the main app flow, but it is
# imported here from gemini_utils so that all API calls are kept together.
| from gemini_utils import get_structured_data_with_gemini |