Ragora-Server / app /services /document_processor.py
Peterase's picture
deploy: initial deployment to Hugging Face Spaces
f02c5b9
import io
from typing import List
import pypdf
import pdfplumber
from docx import Document as DocxDocument
from app.ports.document_processor import DocumentProcessorPort, ExtractedText
class DocumentProcessorAdapter(DocumentProcessorPort):
    """Extract plain text from PDF and Word documents held in memory.

    ``extract`` routes a payload to a parser by its MIME content type and
    wraps the result in an ``ExtractedText``; ``supports`` reports whether a
    content type is one of the routable types.
    """

    _PDF_TYPE = "application/pdf"
    # NOTE(review): both Word types are handed to python-docx. That library
    # targets the OOXML (.docx) format, so a legacy binary .doc sent as
    # "application/msword" will presumably fail to open -- confirm whether
    # that type should stay advertised as supported.
    _WORD_TYPES = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/msword",
    )

    # Derived from the routing constants above so that ``supports`` and
    # ``extract`` cannot drift apart.
    SUPPORTED_TYPES = {_PDF_TYPE, *_WORD_TYPES}

    async def extract(self, file_bytes: bytes, content_type: str) -> ExtractedText:
        """Route to the appropriate extractor based on content type.

        Args:
            file_bytes: Raw bytes of the uploaded document.
            content_type: MIME type used to select the parser.

        Returns:
            An ``ExtractedText`` carrying the extracted text, a metadata dict
            with the ``content_type``, and a page count. The page count is
            read from the PDF for PDFs and is fixed at 1 for Word documents.

        Raises:
            ValueError: If ``content_type`` is not a supported type.
        """
        # NOTE(review): the parsers below run synchronously inside this
        # coroutine; consider offloading to a worker thread if large files
        # stall the event loop.
        if content_type == self._PDF_TYPE:
            return ExtractedText(
                content=self._extract_pdf(file_bytes),
                metadata={"content_type": content_type},
                page_count=self._get_pdf_page_count(file_bytes),
            )

        if content_type in self._WORD_TYPES:
            return ExtractedText(
                content=self._extract_docx(file_bytes),
                metadata={"content_type": content_type},
                page_count=1,
            )

        raise ValueError(f"Unsupported content type: {content_type}")

    def supports(self, content_type: str) -> bool:
        """Return True if ``content_type`` is in ``SUPPORTED_TYPES``."""
        return content_type in self.SUPPORTED_TYPES

    def _extract_pdf(self, file_bytes: bytes, use_pdfplumber: bool = True) -> str:
        """Extract text from a PDF with pdfplumber, falling back to pypdf.

        Args:
            file_bytes: Raw bytes of the PDF.
            use_pdfplumber: When True (default) try pdfplumber first and use
                pypdf only if pdfplumber raises; when False use pypdf directly.

        Returns:
            The extracted text with surrounding whitespace stripped.
        """
        if use_pdfplumber:
            try:
                return self._extract_with_pdfplumber(file_bytes)
            except Exception:
                # Deliberate best-effort: any pdfplumber failure falls
                # through to the pypdf extractor below.
                pass
        return self._extract_with_pypdf(file_bytes)

    def _extract_with_pdfplumber(self, file_bytes: bytes) -> str:
        """Return newline-joined text of every page that yields text."""
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            # Pages must be read while the document is still open.
            extracted = [page.extract_text() for page in pdf.pages]
        page_texts: List[str] = [text for text in extracted if text]
        return "\n".join(page_texts).strip()

    def _extract_with_pypdf(self, file_bytes: bytes) -> str:
        """Return newline-joined text of every page, read with pypdf."""
        reader = pypdf.PdfReader(io.BytesIO(file_bytes))
        # ``or ""`` keeps a page with no extractable text from raising
        # TypeError if the extractor hands back a falsy value.
        page_texts: List[str] = [page.extract_text() or "" for page in reader.pages]
        return "\n".join(page_texts).strip()

    def _get_pdf_page_count(self, file_bytes: bytes) -> int:
        """Return the number of pages in the PDF, or 1 if it cannot be read."""
        try:
            return len(pypdf.PdfReader(io.BytesIO(file_bytes)).pages)
        except Exception:
            # Best-effort fallback; ``Exception`` (not a bare ``except``) so
            # KeyboardInterrupt / SystemExit still propagate.
            return 1

    def _extract_docx(self, file_bytes: bytes) -> str:
        """Return the non-blank paragraphs of a DOCX file joined by newlines.

        Only ``doc.paragraphs`` is read; paragraphs whose text is empty or
        whitespace-only are skipped.
        """
        doc = DocxDocument(io.BytesIO(file_bytes))
        return "\n".join(para.text for para in doc.paragraphs if para.text.strip())
# Singleton instance for dependency injection, created lazily on first use.
_document_processor_instance = None


def get_document_processor() -> DocumentProcessorPort:
    """Return the shared ``DocumentProcessorAdapter`` instance.

    The adapter is constructed on the first call and the same object is
    returned on every later call.
    """
    global _document_processor_instance
    if _document_processor_instance is None:
        _document_processor_instance = DocumentProcessorAdapter()
    return _document_processor_instance