Rs_mini_projrct / src /simple_pdf_parser.py
Harshilforworks's picture
Upload 14 files
e681f27 verified
"""
Isolated PDF parser to avoid import conflicts in deployment
"""
import os
import re
from pathlib import Path
from typing import List, Dict, Any
def simple_pdf_text_extract(pdf_path: str) -> str:
    """
    Extract the plain text of every page of a PDF using only PyMuPDF.

    Each page's text is followed by a newline, the pages are concatenated
    in document order, and leading/trailing whitespace is stripped from the
    combined result. Pages whose ``get_text()`` result is not a string are
    skipped.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The concatenated, stripped text of the document.

    Raises:
        RuntimeError: For any failure while importing PyMuPDF, opening the
            document, or reading its pages. The original exception is
            attached as ``__cause__``.
    """
    try:
        # Imported inside the try block so that a missing or broken PyMuPDF
        # install is reported through the same RuntimeError as parse errors.
        import fitz  # PyMuPDF

        doc = fitz.open(pdf_path)
        try:
            page_texts = []
            for page in doc:
                page_text = page.get_text()
                # Ensure page_text is a string before keeping it
                if isinstance(page_text, str):
                    page_texts.append(page_text + "\n")
        finally:
            # Release the document even when reading a page fails.
            doc.close()
        return "".join(page_texts).strip()
    except Exception as e:
        # Chain explicitly so the underlying error stays in the traceback.
        raise RuntimeError(f"Error extracting text from PDF: {e}") from e
def fallback_parse_document(pdf_path: str) -> Dict[str, Any]:
    """
    Parse a PDF through the simple text extractor and describe the outcome.

    The function does not let extraction errors escape: when
    ``simple_pdf_text_extract`` raises, an "empty" result is returned whose
    metadata carries the error message instead.

    Args:
        pdf_path: Path of the PDF file to parse.

    Returns:
        A dict with the keys ``document_name``, ``content``, ``total_pages``,
        ``parsing_method``, ``processing_time`` and ``metadata``. On success
        ``parsing_method`` is ``'simple_fallback'`` and the page/element
        counters are fixed at 1 (pages are not tracked in simple mode); on
        failure it is ``'fallback_error'``, the counters are 0, ``content``
        is empty and ``metadata['error']`` holds the exception text.
    """
    failure = None
    extracted = ""
    try:
        extracted = simple_pdf_text_extract(pdf_path)
    except Exception as exc:
        failure = exc

    succeeded = failure is None
    # Simple mode reports the whole document as a single page/element.
    unit = 1 if succeeded else 0

    stats: Dict[str, Any] = {
        'total_elements': unit,
        'text_elements': unit,
        'table_elements': 0,
        'pages_processed': unit,
        'characters_extracted': len(extracted),
    }
    if not succeeded:
        stats['error'] = str(failure)

    return {
        'document_name': os.path.basename(pdf_path),
        'content': extracted,
        'total_pages': unit,
        'parsing_method': 'simple_fallback' if succeeded else 'fallback_error',
        'processing_time': 0,
        'metadata': stats,
    }