Spaces:
Sleeping
Sleeping
| import io | |
| from typing import List | |
| import pypdf | |
| import pdfplumber | |
| from docx import Document as DocxDocument | |
| from app.ports.document_processor import DocumentProcessorPort, ExtractedText | |
class DocumentProcessorAdapter(DocumentProcessorPort):
    """Extract plain text from PDF and Word documents.

    PDF text is read with pdfplumber first, falling back to pypdf when
    pdfplumber raises. Word documents are read with python-docx.
    """

    _PDF_TYPE = "application/pdf"
    _WORD_TYPES = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        # NOTE(review): this MIME type is routed to the python-docx based
        # extractor below; confirm python-docx can read the files that
        # actually arrive with it (legacy binary .doc is a different format).
        "application/msword",
    )

    # Single source of truth for supports() and extract(); kept as a plain
    # set so existing callers see the same kind of object as before.
    SUPPORTED_TYPES = {_PDF_TYPE, *_WORD_TYPES}

    async def extract(self, file_bytes: bytes, content_type: str) -> ExtractedText:
        """Extract text from ``file_bytes`` according to ``content_type``.

        Args:
            file_bytes: Raw bytes of the document.
            content_type: MIME type declared for the document.

        Returns:
            An ``ExtractedText`` carrying the extracted text, a metadata dict
            with the content type, and a page count (reported as 1 for Word
            documents).

        Raises:
            ValueError: If ``content_type`` is not one of ``SUPPORTED_TYPES``.
        """
        # NOTE(review): the extractors below are synchronous, so parsing runs
        # directly inside this coroutine; consider offloading large files.
        if content_type == self._PDF_TYPE:
            return ExtractedText(
                content=self._extract_pdf(file_bytes),
                metadata={"content_type": content_type},
                page_count=self._get_pdf_page_count(file_bytes),
            )

        if content_type in self._WORD_TYPES:
            return ExtractedText(
                content=self._extract_docx(file_bytes),
                metadata={"content_type": content_type},
                page_count=1,
            )

        raise ValueError(f"Unsupported content type: {content_type}")

    def supports(self, content_type: str) -> bool:
        """Return True when ``content_type`` is listed in ``SUPPORTED_TYPES``."""
        return content_type in self.SUPPORTED_TYPES

    def _extract_pdf(self, file_bytes: bytes, use_pdfplumber: bool = True) -> str:
        """Extract text from a PDF.

        Args:
            file_bytes: Raw bytes of the PDF.
            use_pdfplumber: Try pdfplumber first when True; otherwise go
                straight to pypdf.

        Returns:
            The extracted text with surrounding whitespace stripped.
        """
        if use_pdfplumber:
            try:
                return self._extract_with_pdfplumber(file_bytes)
            except Exception:
                # Deliberate best-effort: any pdfplumber failure falls
                # through to the pypdf extractor below.
                pass
        return self._extract_with_pypdf(file_bytes)

    def _extract_with_pdfplumber(self, file_bytes: bytes) -> str:
        """Return the text of every non-empty page, one page per line group."""
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # Pages that yield no text are skipped entirely.
        return "\n".join(chunk for chunk in page_texts if chunk).strip()

    def _extract_with_pypdf(self, file_bytes: bytes) -> str:
        """Return the text of every page joined by newlines, using pypdf."""
        reader = pypdf.PdfReader(io.BytesIO(file_bytes))
        # ``or ""`` keeps a falsy extraction result from breaking the join,
        # mirroring the guard in the pdfplumber path.
        return "\n".join(page.extract_text() or "" for page in reader.pages).strip()

    def _get_pdf_page_count(self, file_bytes: bytes) -> int:
        """Return the number of pages in the PDF, or 1 if it cannot be read."""
        try:
            return len(pypdf.PdfReader(io.BytesIO(file_bytes)).pages)
        except Exception:
            # Best-effort fallback; unlike a bare ``except`` this lets
            # KeyboardInterrupt and SystemExit propagate.
            return 1

    def _extract_docx(self, file_bytes: bytes) -> str:
        """Return the non-blank paragraphs of a DOCX file joined by newlines."""
        document = DocxDocument(io.BytesIO(file_bytes))
        return "\n".join(
            para.text for para in document.paragraphs if para.text.strip()
        )
# Factory used for dependency injection: builds a new adapter on every call
# (no instance is cached here).
def get_document_processor() -> DocumentProcessorPort:
    """Return a freshly constructed ``DocumentProcessorAdapter``."""
    processor: DocumentProcessorPort = DocumentProcessorAdapter()
    return processor