"""
MediGuard AI — PDF Parser Service
Production PDF parsing with Docling (preferred) falling back to PyPDF.
Returns structured text with section metadata.
"""
from __future__ import annotations
import hashlib
import logging
from dataclasses import dataclass, field
from functools import lru_cache
from pathlib import Path
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
@dataclass
class ParsedSection:
    """A single logical chunk of text pulled out of a PDF.

    Attributes:
        title: Short label for the chunk (for example ``"Page 3"`` when the
            chunk is a whole page).
        text: The extracted text of the chunk.
        page_numbers: Pages the chunk was taken from; empty when unknown.
    """

    title: str
    text: str
    # default_factory gives every instance its own list (no shared mutable default).
    page_numbers: list[int] = field(default_factory=list)
@dataclass
class ParsedDocument:
    """Outcome of parsing one PDF file.

    Attributes:
        filename: Base name of the parsed file.
        content_hash: Hex digest identifying the file's bytes; empty when the
            file could not be hashed.
        full_text: All extracted text joined into one string.
        sections: The individual extracted sections, in document order.
        page_count: Number of pages in the document (0 when unknown).
        error: Failure description, or ``None`` when parsing succeeded.
    """

    filename: str
    content_hash: str
    full_text: str
    # default_factory gives every instance its own list (no shared mutable default).
    sections: list[ParsedSection] = field(default_factory=list)
    page_count: int = 0
    error: str | None = None
class PDFParserService:
    """Unified PDF parsing with Docling → PyPDF fallback.

    Docling is used when it can be imported; if Docling fails on a given file
    that file is retried with PyPDF. Failures are reported through
    ``ParsedDocument.error`` instead of being raised to the caller.
    """

    def __init__(self) -> None:
        # Probe for Docling once so parse() only has to check a flag.
        self._has_docling = self._check_docling()

    @staticmethod
    def _check_docling() -> bool:
        """Return ``True`` when the optional ``docling`` package is importable."""
        try:
            import docling  # noqa: F401
        except ImportError:
            logger.info("Docling not installed — using PyPDF fallback")
            return False
        return True

    def parse(self, path: Path | str) -> ParsedDocument:
        """Parse a PDF file and return structured text.

        Args:
            path: Location of the PDF on disk.

        Returns:
            A ``ParsedDocument``. When the path is missing, is not a regular
            file, or cannot be read, the document has empty text, an empty
            ``content_hash`` and ``error`` set.
        """
        path = Path(path)

        # is_file() (not exists()) so a directory never reaches read_bytes().
        if not path.is_file():
            reason = "Not a file" if path.exists() else "File not found"
            return ParsedDocument(
                filename=path.name,
                content_hash="",
                full_text="",
                error=f"{reason}: {path}",
            )

        try:
            content_hash = hashlib.sha256(path.read_bytes()).hexdigest()
        except OSError as exc:
            # e.g. permission denied, or the file vanished after the check.
            logger.error("Could not read %s: %s", path.name, exc)
            return ParsedDocument(
                filename=path.name,
                content_hash="",
                full_text="",
                error=str(exc),
            )

        if self._has_docling:
            return self._parse_with_docling(path, content_hash)
        return self._parse_with_pypdf(path, content_hash)

    # ------------------------------------------------------------------ #
    #  Docling (preferred)
    # ------------------------------------------------------------------ #
    @staticmethod
    def _section_from_docling_item(element: object) -> ParsedSection | None:
        """Convert one ``iterate_items()`` element into a section, or ``None``.

        Elements without non-blank string text (pictures, tables, groups) are
        skipped rather than stringified, so object reprs never leak into the
        extracted text.
        """
        # Docling yields (item, level) tuples; tolerate bare items as well.
        item = element[0] if isinstance(element, tuple) and element else element

        raw_text = getattr(item, "text", None)
        if not isinstance(raw_text, str):
            return None
        text = raw_text.strip()
        if not text:
            return None

        # The label is an enum in Docling; store its plain string value.
        label = getattr(item, "label", "")
        title = str(getattr(label, "value", label) or "")

        # Provenance entries carry the page each item was found on.
        provenance = getattr(item, "prov", None) or []
        pages = sorted(
            {
                prov.page_no
                for prov in provenance
                if isinstance(getattr(prov, "page_no", None), int)
            }
        )
        return ParsedSection(title=title, text=text, page_numbers=pages)

    def _parse_with_docling(self, path: Path, content_hash: str) -> ParsedDocument:
        """Parse ``path`` with Docling, falling back to PyPDF on any failure.

        Args:
            path: PDF file to convert.
            content_hash: Pre-computed hash of the file, copied into the result.

        Returns:
            The Docling-derived document, or the PyPDF result if Docling raised.
        """
        try:
            from docling.document_converter import DocumentConverter

            converter = DocumentConverter()
            doc = converter.convert(str(path)).document

            sections: list[ParsedSection] = []
            for element in doc.iterate_items():
                section = self._section_from_docling_item(element)
                if section is not None:
                    sections.append(section)

            # num_pages is a method on DoclingDocument; accept a plain
            # attribute too so a differing API shape still yields an int.
            num_pages = getattr(doc, "num_pages", 0)
            if callable(num_pages):
                num_pages = num_pages()

            return ParsedDocument(
                filename=path.name,
                content_hash=content_hash,
                full_text="\n\n".join(section.text for section in sections),
                sections=sections,
                page_count=int(num_pages or 0),
            )
        except Exception as exc:
            # Deliberately broad: any Docling problem should degrade to PyPDF.
            logger.warning("Docling failed for %s — falling back to PyPDF: %s", path.name, exc)
            return self._parse_with_pypdf(path, content_hash)

    # ------------------------------------------------------------------ #
    #  PyPDF fallback
    # ------------------------------------------------------------------ #
    def _parse_with_pypdf(self, path: Path, content_hash: str) -> ParsedDocument:
        """Parse ``path`` with PyPDF, producing one section per non-blank page.

        Args:
            path: PDF file to read.
            content_hash: Pre-computed hash of the file, copied into the result.

        Returns:
            A document whose sections are titled ``"Page N"`` with ``N`` being
            the real 1-based page number. On failure, a document with empty
            text and ``error`` set.
        """
        try:
            from pypdf import PdfReader

            reader = PdfReader(str(path))
            sections: list[ParsedSection] = []
            # Number pages before filtering so blank pages do not shift the
            # page numbers of the pages that follow them.
            for page_no, page in enumerate(reader.pages, start=1):
                text = (page.extract_text() or "").strip()
                if text:
                    sections.append(
                        ParsedSection(
                            title=f"Page {page_no}",
                            text=text,
                            page_numbers=[page_no],
                        )
                    )

            return ParsedDocument(
                filename=path.name,
                content_hash=content_hash,
                full_text="\n\n".join(section.text for section in sections),
                sections=sections,
                page_count=len(reader.pages),
            )
        except Exception as exc:
            # Last parser in the chain: report the failure in the result.
            logger.error("PyPDF failed for %s: %s", path.name, exc)
            return ParsedDocument(
                filename=path.name,
                content_hash=content_hash,
                full_text="",
                error=str(exc),
            )

    # ------------------------------------------------------------------ #
    #  Batch
    # ------------------------------------------------------------------ #
    def parse_directory(self, directory: Path | str) -> list[ParsedDocument]:
        """Parse all PDFs directly inside a directory (non-recursive).

        Files are matched on a case-insensitive ``.pdf`` suffix and processed
        in sorted path order.

        Args:
            directory: Folder to scan.

        Returns:
            One ``ParsedDocument`` per PDF file; an empty list when
            ``directory`` is not an existing directory.
        """
        directory = Path(directory)
        if not directory.is_dir():
            logger.warning("Not a directory: %s", directory)
            return []

        pdf_paths = sorted(
            entry
            for entry in directory.iterdir()
            if entry.is_file() and entry.suffix.lower() == ".pdf"
        )

        results: list[ParsedDocument] = []
        for pdf_path in pdf_paths:
            logger.info("Parsing %s …", pdf_path.name)
            results.append(self.parse(pdf_path))
        return results
@lru_cache(maxsize=1)
def make_pdf_parser_service() -> PDFParserService:
    """Return the shared ``PDFParserService``, building it on the first call.

    The single-entry cache means later calls get the same instance back.
    """
    service = PDFParserService()
    return service
|