Spaces:

Peterase
/

Ragora-Server

Sleeping

File size: 3,299 Bytes

f02c5b9

import io
from typing import List
import pypdf
import pdfplumber
from docx import Document as DocxDocument
from app.ports.document_processor import DocumentProcessorPort, ExtractedText


class DocumentProcessorAdapter(DocumentProcessorPort):
    """Extract plain text from PDF and Word documents.

    PDF bytes are read with pdfplumber, falling back to pypdf when
    pdfplumber raises; Word bytes are read with python-docx.
    """

    # MIME types are declared once so that ``supports()`` and ``extract()``
    # cannot drift apart.
    _PDF_TYPE = "application/pdf"
    # NOTE(review): "application/msword" is the legacy binary .doc MIME type.
    # python-docx is documented for .docx files only, so such uploads
    # presumably fail inside ``_extract_docx`` -- confirm whether this type
    # should remain advertised as supported.
    _WORD_TYPES = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/msword",
    )

    SUPPORTED_TYPES = {_PDF_TYPE, *_WORD_TYPES}

    async def extract(self, file_bytes: bytes, content_type: str) -> ExtractedText:
        """Route to the appropriate extractor based on content type.

        Args:
            file_bytes: Raw bytes of the uploaded document.
            content_type: MIME type used to pick the extractor.

        Returns:
            An ``ExtractedText`` carrying the extracted content, a metadata
            dict holding the content type, and a page count (always 1 for
            Word documents, where no page count is computed).

        Raises:
            ValueError: If ``content_type`` is not one of the supported types.
        """
        if content_type == self._PDF_TYPE:
            return ExtractedText(
                content=self._extract_pdf(file_bytes),
                metadata={"content_type": content_type},
                page_count=self._get_pdf_page_count(file_bytes),
            )
        if content_type in self._WORD_TYPES:
            return ExtractedText(
                content=self._extract_docx(file_bytes),
                metadata={"content_type": content_type},
                page_count=1,
            )
        raise ValueError(f"Unsupported content type: {content_type}")

    def supports(self, content_type: str) -> bool:
        """Return True if ``content_type`` is in ``SUPPORTED_TYPES``."""
        return content_type in self.SUPPORTED_TYPES

    def _extract_pdf(self, file_bytes: bytes, use_pdfplumber: bool = True) -> str:
        """Extract text from a PDF with pdfplumber or pypdf.

        Args:
            file_bytes: Raw PDF bytes.
            use_pdfplumber: When True (default) try pdfplumber first and
                fall back to pypdf if it raises; when False use pypdf only.

        Returns:
            The extracted text with surrounding whitespace stripped.
        """
        if use_pdfplumber:
            try:
                return self._extract_with_pdfplumber(file_bytes)
            except Exception:
                # Deliberate best-effort: any pdfplumber failure falls
                # through to the pypdf extractor below.
                pass
        return self._extract_with_pypdf(file_bytes)

    def _extract_with_pdfplumber(self, file_bytes: bytes) -> str:
        """Return the newline-joined text of every non-empty page (pdfplumber)."""
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # Pages that yield no text (None or "") are skipped entirely.
        return "\n".join(text for text in page_texts if text).strip()

    def _extract_with_pypdf(self, file_bytes: bytes) -> str:
        """Return the newline-joined text of every page (pypdf)."""
        reader = pypdf.PdfReader(io.BytesIO(file_bytes))
        # ``or ""`` guards against a page yielding no text object at all;
        # empty pages still contribute a blank line, as before.
        return "\n".join(page.extract_text() or "" for page in reader.pages).strip()

    def _get_pdf_page_count(self, file_bytes: bytes) -> int:
        """Get the number of pages in a PDF, or 1 if it cannot be parsed."""
        try:
            return len(pypdf.PdfReader(io.BytesIO(file_bytes)).pages)
        except Exception:
            # Best-effort: an unreadable PDF is reported as a single page.
            # ``Exception`` (not a bare except) so that KeyboardInterrupt
            # and SystemExit still propagate.
            return 1

    def _extract_docx(self, file_bytes: bytes) -> str:
        """Extract text from a DOCX file.

        Only ``doc.paragraphs`` is read; paragraphs that are empty or
        whitespace-only are dropped and the rest are joined with newlines.
        """
        doc = DocxDocument(io.BytesIO(file_bytes))
        return "\n".join(para.text for para in doc.paragraphs if para.text.strip())


# Provider function for dependency injection.
def get_document_processor() -> DocumentProcessorPort:
    """Return a document processor for injection.

    A new ``DocumentProcessorAdapter`` is constructed on every call; no
    instance is cached or shared between callers.
    """
    processor = DocumentProcessorAdapter()
    return processor