""" MediGuard AI — PDF Parser Service Production PDF parsing with Docling (preferred) falling back to PyPDF. Returns structured text with section metadata. """ from __future__ import annotations import hashlib import logging from dataclasses import dataclass, field from functools import lru_cache from pathlib import Path logger = logging.getLogger(__name__) @dataclass class ParsedSection: """One logical section extracted from a PDF.""" title: str text: str page_numbers: list[int] = field(default_factory=list) @dataclass class ParsedDocument: """Result of parsing a single PDF.""" filename: str content_hash: str full_text: str sections: list[ParsedSection] = field(default_factory=list) page_count: int = 0 error: str | None = None class PDFParserService: """Unified PDF parsing with Docling → PyPDF fallback.""" def __init__(self) -> None: self._has_docling = self._check_docling() @staticmethod def _check_docling() -> bool: try: import docling # noqa: F401 return True except ImportError: logger.info("Docling not installed — using PyPDF fallback") return False def parse(self, path: Path) -> ParsedDocument: """Parse a PDF file and return structured text.""" if not path.exists(): return ParsedDocument( filename=path.name, content_hash="", full_text="", error=f"File not found: {path}", ) content_hash = hashlib.sha256(path.read_bytes()).hexdigest() if self._has_docling: return self._parse_with_docling(path, content_hash) return self._parse_with_pypdf(path, content_hash) # ------------------------------------------------------------------ # # Docling (preferred) # ------------------------------------------------------------------ # def _parse_with_docling(self, path: Path, content_hash: str) -> ParsedDocument: try: from docling.document_converter import DocumentConverter converter = DocumentConverter() result = converter.convert(str(path)) doc = result.document sections: list[ParsedSection] = [] full_parts: list[str] = [] for element in doc.iterate_items(): text = element.text if hasattr(element, "text") else str(element) if text.strip(): full_parts.append(text.strip()) sections.append( ParsedSection( title=getattr(element, "label", ""), text=text.strip(), ) ) full_text = "\n\n".join(full_parts) return ParsedDocument( filename=path.name, content_hash=content_hash, full_text=full_text, sections=sections, page_count=getattr(doc, "num_pages", 0), ) except Exception as exc: logger.warning("Docling failed for %s — falling back to PyPDF: %s", path.name, exc) return self._parse_with_pypdf(path, content_hash) # ------------------------------------------------------------------ # # PyPDF fallback # ------------------------------------------------------------------ # def _parse_with_pypdf(self, path: Path, content_hash: str) -> ParsedDocument: try: from pypdf import PdfReader reader = PdfReader(str(path)) pages_text: list[str] = [] for i, page in enumerate(reader.pages): text = page.extract_text() or "" if text.strip(): pages_text.append(text.strip()) full_text = "\n\n".join(pages_text) sections = [ ParsedSection(title=f"Page {i + 1}", text=t, page_numbers=[i + 1]) for i, t in enumerate(pages_text) ] return ParsedDocument( filename=path.name, content_hash=content_hash, full_text=full_text, sections=sections, page_count=len(reader.pages), ) except Exception as exc: logger.error("PyPDF failed for %s: %s", path.name, exc) return ParsedDocument( filename=path.name, content_hash=content_hash, full_text="", error=str(exc), ) # ------------------------------------------------------------------ # # Batch # ------------------------------------------------------------------ # def parse_directory(self, directory: Path) -> list[ParsedDocument]: """Parse all PDFs in a directory.""" results: list[ParsedDocument] = [] for pdf_path in sorted(directory.glob("*.pdf")): logger.info("Parsing %s …", pdf_path.name) results.append(self.parse(pdf_path)) return results @lru_cache(maxsize=1) def make_pdf_parser_service() -> PDFParserService: return PDFParserService()