lt_space / app /models /document_processor.py
Arsive2's picture
Updated comments
d0d0352
import logging
from pathlib import Path
import fitz # PyMuPDF
logger = logging.getLogger(__name__)
class DocumentProcessor:
    """Simplified document processor for the API service.

    Text is extracted from PDF files with PyMuPDF. Image extensions pass the
    format check, but this simplified version has no OCR backend, so no text
    is ever extracted from an image.
    """

    def __init__(self):
        """Initialize the document processor."""
        # Lower-case file extensions (leading dot included) that
        # process_document accepts.
        self.supported_formats = {'.pdf', '.png', '.jpg', '.jpeg', '.tiff', '.bmp'}

    def process_document(
        self,
        file_data: bytes,
        filename: str,
        use_ocr: bool = False
    ) -> str:
        """
        Extract text from document (PDF or image)

        The handler is chosen from the extension of ``filename``, compared
        case-insensitively; ``file_data`` is only read on the PDF path.

        Args:
            file_data: Raw file content
            filename: Original filename; only its extension is inspected
            use_ocr: Whether to use OCR (not implemented in this simplified
                version)

        Returns:
            For a PDF, the extracted text as a string. For an image with
            ``use_ocr`` False, a fixed notice string saying that OCR is
            required; that string is not text extracted from the file.

        Raises:
            ValueError: The file extension is not in ``supported_formats``.
            NotImplementedError: The file is an image and ``use_ocr`` is True.
            Exception: Any error raised while reading the PDF is logged and
                re-raised unchanged.
        """
        try:
            file_ext = Path(filename).suffix.lower()
            logger.info("Processing file: %s with extension: %s", filename, file_ext)

            if file_ext not in self.supported_formats:
                raise ValueError(f"Unsupported file format: {file_ext}")

            if file_ext == '.pdf':
                return self._process_pdf(file_data)

            # Every remaining supported extension is an image format.
            if use_ocr:
                raise NotImplementedError("OCR for images not implemented")
            return "Text extraction from images requires OCR to be enabled"
        except Exception as e:
            # Log at this boundary, then let the caller see the original error.
            logger.error("Error processing document: %s", e)
            raise

    def _process_pdf(self, file_data: bytes) -> str:
        """Process PDF to extract text using PyMuPDF

        Args:
            file_data: Raw PDF content, opened from memory as a stream

        Returns:
            The text of every page, in page order, with consecutive pages
            separated by a blank line ("\\n\\n").

        Raises:
            Exception: Any error raised while opening or reading the PDF is
                logged and re-raised unchanged.
        """
        try:
            with fitz.open(stream=file_data, filetype="pdf") as pdf_doc:
                # Iterating the document yields its pages in order; the join
                # runs before the context manager closes the document.
                return "\n\n".join(page.get_text() for page in pdf_doc)
        except Exception as e:
            logger.error("Error processing PDF: %s", e)
            raise