"""
Isolated PDF parser to avoid import conflicts in deployment
"""

import os
import re
from pathlib import Path
from typing import List, Dict, Any


def simple_pdf_text_extract(pdf_path: str) -> str:
    """
    Extract the plain text of a PDF using only PyMuPDF.

    PyMuPDF (``fitz``) is imported inside the function so that importing this
    module never fails when the package is missing; the failure surfaces as a
    ``RuntimeError`` at call time instead.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of every page, each page followed by a newline, with leading
        and trailing whitespace stripped from the combined result.

    Raises:
        RuntimeError: If PyMuPDF cannot be imported, or if opening or reading
            the document fails. The original exception is attached as the
            cause.
    """
    try:
        import fitz  # PyMuPDF

        doc = fitz.open(pdf_path)
        try:
            page_texts = []
            for page in doc:
                page_text = page.get_text()
                # Ensure page_text is a string; anything else is skipped.
                if isinstance(page_text, str):
                    page_texts.append(page_text)
        finally:
            # Release the document even when a page fails to extract.
            doc.close()

        # Joining with "\n" and stripping gives the same result as appending
        # "<page text>\n" per page and stripping, without repeated string
        # concatenation.
        return "\n".join(page_texts).strip()
    except Exception as e:
        raise RuntimeError(f"Error extracting text from PDF: {e}") from e


def fallback_parse_document(pdf_path: str) -> Dict[str, Any]:
    """
    Fallback PDF parsing function that avoids complex dependencies.

    Wraps ``simple_pdf_text_extract`` and reports the outcome as a result
    dictionary. Extraction errors are not propagated: they are converted into
    an error-shaped result so callers always receive a dictionary.

    Args:
        pdf_path: Path of the PDF file to parse.

    Returns:
        A dictionary with the keys ``document_name``, ``content``,
        ``total_pages``, ``parsing_method``, ``processing_time`` and
        ``metadata``.

        On success ``parsing_method`` is ``'simple_fallback'`` and the page
        and element counts are fixed placeholders of 1 (pages are not tracked
        in simple mode). On failure ``parsing_method`` is
        ``'fallback_error'``, ``content`` is empty, all counts are 0 and
        ``metadata['error']`` holds the error message. ``processing_time`` is
        always 0; it is not measured here.
    """
    document_name = os.path.basename(pdf_path)

    try:
        text_content = simple_pdf_text_extract(pdf_path)
    except Exception as e:
        # Deliberate best-effort: report the failure in the result instead of
        # raising.
        return {
            'document_name': document_name,
            'content': "",
            'total_pages': 0,
            'parsing_method': 'fallback_error',
            'processing_time': 0,
            'metadata': {
                'total_elements': 0,
                'text_elements': 0,
                'table_elements': 0,
                'pages_processed': 0,
                'characters_extracted': 0,
                'error': str(e),
            },
        }

    return {
        'document_name': document_name,
        'content': text_content,
        'total_pages': 1,  # We don't track pages in simple mode
        'parsing_method': 'simple_fallback',
        'processing_time': 0,
        'metadata': {
            'total_elements': 1,
            'text_elements': 1,
            'table_elements': 0,
            'pages_processed': 1,
            'characters_extracted': len(text_content),
        },
    }