File size: 5,231 Bytes
1e732dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
696f787
1e732dd
 
 
 
 
 
 
 
 
696f787
1e732dd
696f787
1e732dd
 
 
 
 
 
 
 
 
 
 
 
9659593
1e732dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9659593
1e732dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
696f787
1e732dd
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
"""
MediGuard AI — PDF Parser Service

Production PDF parsing with Docling (preferred) falling back to PyPDF.
Returns structured text with section metadata.
"""

from __future__ import annotations

import hashlib
import logging
from dataclasses import dataclass, field
from functools import lru_cache
from pathlib import Path

logger = logging.getLogger(__name__)


@dataclass
class ParsedSection:
    """A single titled chunk of text pulled out of a PDF.

    Instances are plain value objects: two sections compare equal when all
    three fields are equal.
    """

    # Heading for the chunk (the parsers in this module use an element label
    # or a "Page N" string).
    title: str
    # Extracted text belonging to this chunk.
    text: str
    # Pages the chunk was taken from; each instance gets its own list and the
    # list may be empty.
    page_numbers: list[int] = field(default_factory=list)


@dataclass
class ParsedDocument:
    """Outcome of running one PDF through the parser.

    A failed parse is represented by the same type: ``error`` carries the
    message and the text fields are left empty.
    """

    # Base name of the source file.
    filename: str
    # Digest identifying the file content (PDFParserService.parse stores a
    # SHA-256 hex digest, or "" when the file could not be found).
    content_hash: str
    # All extracted text joined into one string.
    full_text: str
    # Per-section breakdown of the text; each instance gets its own list.
    sections: list[ParsedSection] = field(default_factory=list)
    # Number of pages in the source document; 0 when not known.
    page_count: int = 0
    # Failure description, or None when no error was recorded.
    error: str | None = None


class PDFParserService:
    """Unified PDF parsing with Docling → PyPDF fallback.

    Docling is used when the package can be imported. If it is missing, or
    if it raises while converting a file, the file is parsed with PyPDF
    instead. Failures are reported through ``ParsedDocument.error`` rather
    than raised, so one bad file does not abort a batch.
    """

    def __init__(self) -> None:
        # Probe once at construction so parse() does not retry the import
        # for every file.
        self._has_docling = self._check_docling()

    @staticmethod
    def _check_docling() -> bool:
        """Return True when the ``docling`` package can be imported."""
        try:
            import docling  # noqa: F401
        except ImportError:
            logger.info("Docling not installed — using PyPDF fallback")
            return False
        return True

    def parse(self, path: Path) -> ParsedDocument:
        """Parse a PDF file and return structured text.

        Args:
            path: Location of the PDF on disk.

        Returns:
            The parsed document. When the path does not exist, cannot be
            read (for example it is a directory or access is denied), or
            every parser fails, ``error`` is set and ``full_text`` is empty.
        """
        if not path.exists():
            return ParsedDocument(
                filename=path.name,
                content_hash="",
                full_text="",
                error=f"File not found: {path}",
            )

        try:
            content_hash = hashlib.sha256(path.read_bytes()).hexdigest()
        except OSError as exc:
            # exists() is also true for directories and unreadable files;
            # report those like any other parse failure instead of raising.
            logger.error("Cannot read %s: %s", path.name, exc)
            return ParsedDocument(
                filename=path.name,
                content_hash="",
                full_text="",
                error=f"Cannot read file: {exc}",
            )

        if self._has_docling:
            return self._parse_with_docling(path, content_hash)
        return self._parse_with_pypdf(path, content_hash)

    # ------------------------------------------------------------------ #
    # Docling (preferred)
    # ------------------------------------------------------------------ #

    @staticmethod
    def _unwrap_docling_item(element: object) -> object:
        """Return the document item from one ``iterate_items()`` result.

        Docling yields ``(item, level)`` tuples; anything that is not a
        non-empty tuple is passed through unchanged.
        """
        if isinstance(element, tuple) and element:
            return element[0]
        return element

    @staticmethod
    def _docling_page_numbers(item: object) -> list[int]:
        """Return the sorted, de-duplicated page numbers in ``item.prov``.

        Entries without an integer ``page_no`` are ignored; an item with no
        usable provenance yields an empty list.
        """
        provenance = getattr(item, "prov", None) or []
        try:
            entries = list(provenance)
        except TypeError:
            # Provenance is optional metadata; never fail the parse over it.
            return []
        pages = {
            entry.page_no
            for entry in entries
            if isinstance(getattr(entry, "page_no", None), int)
        }
        return sorted(pages)

    @staticmethod
    def _docling_page_count(doc: object) -> int:
        """Return the page count of a Docling document, or 0 if unavailable.

        ``num_pages`` is a method on Docling documents, so it is called when
        callable; a plain integer attribute is accepted as well.
        """
        num_pages = getattr(doc, "num_pages", 0)
        if callable(num_pages):
            num_pages = num_pages()
        return num_pages if isinstance(num_pages, int) else 0

    def _parse_with_docling(self, path: Path, content_hash: str) -> ParsedDocument:
        """Parse ``path`` with Docling, falling back to PyPDF on any failure.

        Args:
            path: PDF to convert.
            content_hash: Digest of the file, copied into the result.

        Returns:
            One section per non-blank document item, titled with the item's
            ``label`` (or "" when it has none).
        """
        try:
            from docling.document_converter import DocumentConverter

            converter = DocumentConverter()
            result = converter.convert(str(path))
            doc = result.document

            sections: list[ParsedSection] = []
            for element in doc.iterate_items():
                item = self._unwrap_docling_item(element)
                # Items without a ``text`` attribute fall back to their
                # string form.
                raw = item.text if hasattr(item, "text") else str(item)
                text = raw.strip()
                if not text:
                    continue
                sections.append(
                    ParsedSection(
                        title=getattr(item, "label", ""),
                        text=text,
                        page_numbers=self._docling_page_numbers(item),
                    )
                )

            full_text = "\n\n".join(section.text for section in sections)
            return ParsedDocument(
                filename=path.name,
                content_hash=content_hash,
                full_text=full_text,
                sections=sections,
                page_count=self._docling_page_count(doc),
            )
        except Exception as exc:
            # Deliberately broad: whatever Docling raises, PyPDF gets a try.
            logger.warning("Docling failed for %s — falling back to PyPDF: %s", path.name, exc)
            return self._parse_with_pypdf(path, content_hash)

    # ------------------------------------------------------------------ #
    # PyPDF fallback
    # ------------------------------------------------------------------ #

    @staticmethod
    def _nonblank_pages(page_texts: list[str | None]) -> list[tuple[int, str]]:
        """Pair each non-blank page text with its 1-based page number.

        Numbering follows the position in ``page_texts``, so blank or
        ``None`` pages are dropped without shifting the numbers of the
        pages after them. Returned text is stripped.
        """
        return [
            (number, text.strip())
            for number, text in enumerate(page_texts, start=1)
            if text and text.strip()
        ]

    def _parse_with_pypdf(self, path: Path, content_hash: str) -> ParsedDocument:
        """Parse ``path`` with PyPDF, producing one section per non-blank page.

        Args:
            path: PDF to read.
            content_hash: Digest of the file, copied into the result.

        Returns:
            The parsed document. If PyPDF cannot be imported or raises, the
            result has empty text and ``error`` set to the exception message.
        """
        try:
            from pypdf import PdfReader

            reader = PdfReader(str(path))
            numbered = self._nonblank_pages([page.extract_text() for page in reader.pages])

            sections = [
                ParsedSection(title=f"Page {number}", text=text, page_numbers=[number])
                for number, text in numbered
            ]
            full_text = "\n\n".join(text for _, text in numbered)

            return ParsedDocument(
                filename=path.name,
                content_hash=content_hash,
                full_text=full_text,
                sections=sections,
                page_count=len(reader.pages),
            )
        except Exception as exc:
            logger.error("PyPDF failed for %s: %s", path.name, exc)
            return ParsedDocument(
                filename=path.name,
                content_hash=content_hash,
                full_text="",
                error=str(exc),
            )

    # ------------------------------------------------------------------ #
    # Batch
    # ------------------------------------------------------------------ #

    def parse_directory(self, directory: Path) -> list[ParsedDocument]:
        """Parse all PDFs in a directory.

        Only entries directly inside ``directory`` that match ``*.pdf`` are
        considered (no recursion), and they are processed in sorted path
        order.

        Args:
            directory: Folder to scan.

        Returns:
            One ParsedDocument per matching entry, in processing order.
        """
        results: list[ParsedDocument] = []
        for pdf_path in sorted(directory.glob("*.pdf")):
            logger.info("Parsing %s …", pdf_path.name)
            results.append(self.parse(pdf_path))
        return results


@lru_cache(maxsize=1)
def make_pdf_parser_service() -> PDFParserService:
    """Return the shared PDFParserService instance.

    The function takes no arguments and is wrapped in ``lru_cache``, so the
    service is constructed on the first call and the same object is handed
    back on later calls until the cache is cleared.
    """
    service = PDFParserService()
    return service