Spaces:
Sleeping
Sleeping
"""Isolated PDF parser to avoid import conflicts in deployment."""
| import os | |
| import re | |
| from pathlib import Path | |
| from typing import List, Dict, Any | |
def simple_pdf_text_extract(pdf_path: str) -> str:
    """Extract the plain text of a PDF using only PyMuPDF.

    PyMuPDF is imported lazily inside the function so that merely importing
    this module never triggers the dependency (and so that a missing or
    broken installation surfaces as the same ``RuntimeError`` as any other
    extraction failure).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined with newlines, with leading and
        trailing whitespace stripped. Pages whose extracted text is not a
        ``str`` are skipped.

    Raises:
        RuntimeError: If PyMuPDF cannot be imported, the document cannot be
            opened, or text extraction fails. The original exception is
            attached as ``__cause__``.
    """
    try:
        import fitz  # PyMuPDF

        doc = fitz.open(pdf_path)
        try:
            page_texts = [page.get_text() for page in doc]
        finally:
            # Release the document handle even when a page fails to extract.
            doc.close()
    except Exception as e:
        raise RuntimeError(f"Error extracting text from PDF: {e}") from e

    # Guard against non-string results before joining.
    return "\n".join(t for t in page_texts if isinstance(t, str)).strip()
def fallback_parse_document(pdf_path: str) -> Dict[str, Any]:
    """Parse a PDF through the simple text extractor and report the result.

    The function does not propagate extraction errors: on failure it returns
    a result dictionary with empty content, zeroed counters, and the error
    message stored under ``metadata['error']``.

    Args:
        pdf_path: Path of the PDF file to parse.

    Returns:
        A dictionary with the keys ``document_name``, ``content``,
        ``total_pages``, ``parsing_method``, ``processing_time`` and
        ``metadata``. Page and element counts are fixed placeholders
        (1 on success, 0 on failure) because pages are not tracked in
        this simple mode.
    """
    try:
        extracted = simple_pdf_text_extract(pdf_path)
        success_stats = dict(
            total_elements=1,
            text_elements=1,
            table_elements=0,
            pages_processed=1,
            characters_extracted=len(extracted),
        )
        return dict(
            document_name=os.path.basename(pdf_path),
            content=extracted,
            total_pages=1,  # We don't track pages in simple mode
            parsing_method='simple_fallback',
            processing_time=0,
            metadata=success_stats,
        )
    except Exception as err:
        failure_stats = dict(
            total_elements=0,
            text_elements=0,
            table_elements=0,
            pages_processed=0,
            characters_extracted=0,
            error=str(err),
        )
        return dict(
            document_name=os.path.basename(pdf_path),
            content="",
            total_pages=0,
            parsing_method='fallback_error',
            processing_time=0,
            metadata=failure_stats,
        )