import logging
from pathlib import Path

import fitz  # PyMuPDF

logger = logging.getLogger(__name__)


class DocumentProcessor:
    """Simplified document processor for the API service.

    Extracts text from PDF files via PyMuPDF. Image formats are recognised
    as supported extensions, but no OCR backend is implemented here.
    """

    def __init__(self):
        """Initialize the document processor."""
        # Lower-cased file extensions (including the leading dot) accepted
        # by process_document().
        self.supported_formats = {'.pdf', '.png', '.jpg', '.jpeg', '.tiff', '.bmp'}

    def process_document(
        self,
        file_data: bytes,
        filename: str,
        use_ocr: bool = False
    ) -> str:
        """Extract text from a document (PDF or image).

        The handler is chosen from the extension of ``filename`` (compared
        case-insensitively); the content of ``file_data`` is not sniffed.

        Args:
            file_data: Raw file content.
            filename: Original filename; only its suffix is used.
            use_ocr: Whether to use OCR (not implemented in this simplified
                version). Ignored for PDFs.

        Returns:
            The extracted text for PDFs. For image formats with
            ``use_ocr=False``, a fixed explanatory message is returned
            instead of extracted text.

        Raises:
            ValueError: If the file extension is not in
                ``self.supported_formats``.
            NotImplementedError: If an image is submitted with
                ``use_ocr=True``.
            Exception: Any error raised while parsing the PDF is logged and
                re-raised unchanged.
        """
        try:
            file_ext = Path(filename).suffix.lower()
            logger.info(
                "Processing file: %s with extension: %s", filename, file_ext
            )

            if file_ext not in self.supported_formats:
                raise ValueError(f"Unsupported file format: {file_ext}")

            if file_ext == '.pdf':
                return self._process_pdf(file_data)

            # Every remaining supported extension is an image format.
            if use_ocr:
                raise NotImplementedError("OCR for images not implemented")
            return "Text extraction from images requires OCR to be enabled"
        except Exception as e:
            # Service-boundary log; the exception still propagates so the
            # caller can decide how to respond.
            logger.error("Error processing document: %s", e)
            raise

    def _process_pdf(self, file_data: bytes) -> str:
        """Extract text from an in-memory PDF using PyMuPDF.

        Args:
            file_data: Raw bytes of the PDF file.

        Returns:
            The text of every page, in page order, with pages separated by
            a blank line (``"\\n\\n"``).

        Raises:
            Exception: Any error raised by PyMuPDF while opening or reading
                the document is logged with its traceback and re-raised.
        """
        try:
            # The context manager closes the document even if a page fails.
            with fitz.open(stream=file_data, filetype="pdf") as pdf_doc:
                # Iterating the document yields its pages in order; the join
                # is evaluated while the document is still open.
                return "\n\n".join(page.get_text() for page in pdf_doc)
        except Exception as e:
            # logger.exception keeps the ERROR level and message but also
            # records the traceback at the point of failure.
            logger.exception("Error processing PDF: %s", e)
            raise