# Extracted from a web file view (Spaces: Sleeping | File size: 3,299 Bytes | f02c5b9).
import io
from typing import List
import pypdf
import pdfplumber
from docx import Document as DocxDocument
from app.ports.document_processor import DocumentProcessorPort, ExtractedText
class DocumentProcessorAdapter(DocumentProcessorPort):
    """Extract text from PDF and DOCX files.

    PDFs are read with pdfplumber first and fall back to pypdf if pdfplumber
    raises; Word documents are read with python-docx.
    """

    _PDF_TYPE = "application/pdf"
    # NOTE(review): "application/msword" is the legacy binary .doc type, while
    # python-docx documents support for .docx only. Such uploads are still
    # routed to the DOCX extractor (unchanged behavior) and will presumably
    # raise there -- confirm whether .doc should be advertised as supported.
    _WORD_TYPES = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/msword",
    )
    SUPPORTED_TYPES = {_PDF_TYPE, *_WORD_TYPES}

    async def extract(self, file_bytes: bytes, content_type: str) -> ExtractedText:
        """Route to the appropriate extractor based on content type.

        Args:
            file_bytes: Raw bytes of the uploaded document.
            content_type: MIME type used to select the extractor.

        Returns:
            An ``ExtractedText`` holding the text, a ``content_type`` metadata
            entry and a page count (always 1 for Word documents).

        Raises:
            ValueError: If ``content_type`` is not one of ``SUPPORTED_TYPES``.
        """
        if content_type == self._PDF_TYPE:
            return ExtractedText(
                content=self._extract_pdf(file_bytes),
                metadata={"content_type": content_type},
                page_count=self._get_pdf_page_count(file_bytes),
            )
        if content_type in self._WORD_TYPES:
            return ExtractedText(
                content=self._extract_docx(file_bytes),
                metadata={"content_type": content_type},
                # Word documents are reported with a fixed page count of 1.
                page_count=1,
            )
        raise ValueError(f"Unsupported content type: {content_type}")

    def supports(self, content_type: str) -> bool:
        """Return True if ``content_type`` is one of ``SUPPORTED_TYPES``."""
        return content_type in self.SUPPORTED_TYPES

    def _extract_pdf(self, file_bytes: bytes, use_pdfplumber: bool = True) -> str:
        """Extract text from a PDF using pdfplumber or pypdf.

        Args:
            file_bytes: Raw PDF bytes.
            use_pdfplumber: When True (default), try pdfplumber first and fall
                back to pypdf if it raises; when False, use pypdf directly.

        Returns:
            The extracted text with surrounding whitespace stripped.
        """
        if use_pdfplumber:
            try:
                return self._extract_with_pdfplumber(file_bytes)
            except Exception:
                # Deliberate best-effort: any pdfplumber failure falls through
                # to the pypdf extractor below.
                pass
        return self._extract_with_pypdf(file_bytes)

    def _extract_with_pdfplumber(self, file_bytes: bytes) -> str:
        """Return the text of all pages with content, one page per line group."""
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # Pages that yield no text (None or "") are skipped entirely.
        return "\n".join(text for text in page_texts if text).strip()

    def _extract_with_pypdf(self, file_bytes: bytes) -> str:
        """Return the text of all pages, newline-separated, using pypdf."""
        reader = pypdf.PdfReader(io.BytesIO(file_bytes))
        # `or ""` guards against a page yielding None instead of a string,
        # which would otherwise raise TypeError and abort the whole document.
        return "\n".join(page.extract_text() or "" for page in reader.pages).strip()

    def _get_pdf_page_count(self, file_bytes: bytes) -> int:
        """Return the number of pages in the PDF, or 1 if it cannot be read."""
        try:
            return len(pypdf.PdfReader(io.BytesIO(file_bytes)).pages)
        except Exception:
            # Best-effort fallback; `Exception` (not a bare except) so that
            # KeyboardInterrupt / SystemExit still propagate.
            return 1

    def _extract_docx(self, file_bytes: bytes) -> str:
        """Return the non-blank paragraphs of a DOCX file joined by newlines."""
        doc = DocxDocument(io.BytesIO(file_bytes))
        return "\n".join(para.text for para in doc.paragraphs if para.text.strip())
# Provider function for dependency injection.
# NOTE: a new adapter is constructed on every call; no instance is cached here.
def get_document_processor() -> DocumentProcessorPort:
    """Return a newly constructed ``DocumentProcessorAdapter``."""
    processor = DocumentProcessorAdapter()
    return processor
|