import io
import logging
from typing import List

import pdfplumber
import pypdf
from docx import Document as DocxDocument

from app.ports.document_processor import DocumentProcessorPort, ExtractedText

logger = logging.getLogger(__name__)


class DocumentProcessorAdapter(DocumentProcessorPort):
    """Extract text from PDF and DOCX files."""

    _PDF_CONTENT_TYPE = "application/pdf"
    # NOTE(review): "application/msword" usually denotes the legacy binary
    # .doc format, while python-docx is documented for .docx packages only.
    # Confirm callers only send OOXML payloads under that type.
    _WORD_CONTENT_TYPES = frozenset(
        {
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "application/msword",
        }
    )

    SUPPORTED_TYPES = {_PDF_CONTENT_TYPE, *_WORD_CONTENT_TYPES}

    async def extract(self, file_bytes: bytes, content_type: str) -> ExtractedText:
        """Extract text from *file_bytes*, choosing the extractor by content type.

        Args:
            file_bytes: Raw bytes of the uploaded document.
            content_type: MIME type used to select the extractor.

        Returns:
            An ``ExtractedText`` holding the extracted content, a metadata
            dict with the ``content_type``, and a page count.

        Raises:
            ValueError: If *content_type* is not one of ``SUPPORTED_TYPES``.
        """
        # NOTE(review): parsing below is synchronous even though this method
        # is a coroutine; consider offloading large files to a worker thread.
        if content_type == self._PDF_CONTENT_TYPE:
            content = self._extract_pdf(file_bytes)
            return ExtractedText(
                content=content,
                metadata={"content_type": content_type},
                page_count=self._get_pdf_page_count(file_bytes),
            )

        if content_type in self._WORD_CONTENT_TYPES:
            content = self._extract_docx(file_bytes)
            return ExtractedText(
                content=content,
                metadata={"content_type": content_type},
                # No pagination is computed for Word documents; the page
                # count is always reported as 1.
                page_count=1,
            )

        raise ValueError(f"Unsupported content type: {content_type}")

    def supports(self, content_type: str) -> bool:
        """Return True if *content_type* is one of ``SUPPORTED_TYPES``."""
        return content_type in self.SUPPORTED_TYPES

    def _extract_pdf(self, file_bytes: bytes, use_pdfplumber: bool = True) -> str:
        """Extract text from a PDF, preferring pdfplumber over pypdf.

        Args:
            file_bytes: Raw bytes of the PDF.
            use_pdfplumber: When True (default), try pdfplumber first and fall
                back to pypdf if it raises; when False, use pypdf directly.

        Returns:
            The extracted text with surrounding whitespace stripped.
        """
        if use_pdfplumber:
            try:
                return self._extract_with_pdfplumber(file_bytes)
            except Exception:
                # Deliberate best-effort fallback, but record why the
                # preferred extractor failed instead of hiding it.
                logger.warning(
                    "pdfplumber extraction failed; falling back to pypdf",
                    exc_info=True,
                )
        return self._extract_with_pypdf(file_bytes)

    def _extract_with_pdfplumber(self, file_bytes: bytes) -> str:
        """Return the text of every non-empty page, joined by newlines."""
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # Pages that yield no text (None or "") are skipped entirely.
        return "\n".join(text for text in page_texts if text).strip()

    def _extract_with_pypdf(self, file_bytes: bytes) -> str:
        """Return the text of every page, one newline between pages."""
        pdf = pypdf.PdfReader(io.BytesIO(file_bytes))
        # `or ""` guards against a page returning None instead of a string,
        # which would otherwise raise TypeError during concatenation.
        return "\n".join(page.extract_text() or "" for page in pdf.pages).strip()

    def _get_pdf_page_count(self, file_bytes: bytes) -> int:
        """Return the number of pages in the PDF, or 1 if it cannot be read."""
        try:
            return len(pypdf.PdfReader(io.BytesIO(file_bytes)).pages)
        except Exception:
            # Best-effort: the page count is auxiliary, so a failure to parse
            # falls back to 1 rather than aborting the extraction.
            logger.warning(
                "Could not determine PDF page count; defaulting to 1",
                exc_info=True,
            )
            return 1

    def _extract_docx(self, file_bytes: bytes) -> str:
        """Return the non-blank paragraphs of a DOCX file, joined by newlines.

        Only ``doc.paragraphs`` is read; other content such as tables is not
        collected here.
        """
        doc = DocxDocument(io.BytesIO(file_bytes))
        return "\n".join(para.text for para in doc.paragraphs if para.text.strip())


def get_document_processor() -> DocumentProcessorPort:
    """Return a document processor for dependency injection.

    A new ``DocumentProcessorAdapter`` is created on every call; this is a
    factory, not a cached singleton.
    """
    return DocumentProcessorAdapter()