Spaces:

Peterase
/

Ragora-Server

Sleeping

File size: 2,290 Bytes

"""Document processor adapter — text extraction runs in thread pool."""
from app.ports.document_processor import DocumentProcessorPort
from typing import BinaryIO
from pathlib import Path
import asyncio
import logging

logger = logging.getLogger(__name__)

# Shared executor for I/O-bound extraction work
_executor = None

def _get_executor():
    global _executor
    if _executor is None:
        from concurrent.futures import ThreadPoolExecutor
        _executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="doc_processor")
    return _executor


def _extract_pdf_sync(data: bytes) -> str:
    """Synchronous PDF extraction — runs in thread pool."""
    from io import BytesIO
    from PyPDF2 import PdfReader
    reader = PdfReader(BytesIO(data))
    text = ""
    for page in reader.pages:
        text += (page.extract_text() or "") + "\n"
    if not text.strip():
        raise ValueError("No text could be extracted from PDF")
    logger.info(f"Extracted {len(text)} characters from PDF")
    return text.strip()


def _extract_docx_sync(data: bytes) -> str:
    """Synchronous DOCX extraction — runs in thread pool."""
    from io import BytesIO
    from docx import Document
    doc = Document(BytesIO(data))
    text = "\n".join(p.text for p in doc.paragraphs)
    if not text.strip():
        raise ValueError("No text could be extracted from DOCX")
    logger.info(f"Extracted {len(text)} characters from DOCX")
    return text.strip()


class DocumentProcessorAdapter(DocumentProcessorPort):
    """Non-blocking text extraction via thread pool."""

    async def extract_text(self, file: BinaryIO, filename: str) -> str:
        """Extract text without blocking the event loop."""
        ext = Path(filename).suffix.lower()
        data = file.read()  # read bytes once

        loop = asyncio.get_event_loop()

        if ext == ".pdf":
            return await loop.run_in_executor(_get_executor(), _extract_pdf_sync, data)
        elif ext in (".docx", ".doc"):
            return await loop.run_in_executor(_get_executor(), _extract_docx_sync, data)
        else:
            raise ValueError(f"Unsupported file type: {ext}")

    def supports_file(self, filename: str) -> bool:
        return Path(filename).suffix.lower() in (".pdf", ".docx", ".doc")