hmc-rag / scripts /extract_pdf.py
webmuppet
Initial commit — health marketing compliance RAG
bad8b6c
"""
PDF/HTML extraction with layout-aware preference and graceful fallback.
Default path: **Docling** (IBM, layout-aware). Preserves heading hierarchy,
list ordering, table structure, and de-duplicates page-header repetition.
Critical for multi-column source documents (e.g. Physiotherapy Standards
framework PDF, where pymupdf scrambled bullet ordering).
Fallback path: **markitdown**. Used when Docling import or convert fails
on a specific source. Markitdown is faster and lighter; it just doesn't
handle multi-column layouts well.
Usage in build scripts::
from extract_pdf import extract_to_markdown
body = extract_to_markdown(cache_path)
The function handles both PDFs and HTML uniformly. PDFs go through Docling
by default; HTML always uses markitdown (Docling's HTML support is less
mature than its PDF support, and our HTML pages are mostly SilverStripe
with semantic markup that markitdown handles well).
"""
from __future__ import annotations
from pathlib import Path
# Optional-import flag — Docling availability is checked once, at module load
try:
from docling.document_converter import DocumentConverter
_DOCLING_AVAILABLE = True
except ImportError:
_DOCLING_AVAILABLE = False
DocumentConverter = None # type: ignore[assignment]
def extract_to_markdown(path: str | Path, format_hint: str | None = None) -> str:
    """Convert a PDF or HTML file to markdown text.

    Args:
        path: Filesystem path to the source file.
        format_hint: Optional format name such as "pdf" or "html". Compared
            case-insensitively and with any leading dot removed, so "PDF"
            and ".pdf" behave like "pdf". If omitted (or empty), the format
            is inferred from the file extension.

    Returns:
        Markdown text content (stripped of leading/trailing whitespace).

    Strategy:
        - pdf → Docling when it is importable; markitdown fallback if
          Docling raises
        - html / htm → markitdown
        - other → markitdown (general-purpose)
    """
    p = Path(path)
    # Normalise an explicit hint exactly like the suffix; otherwise a hint of
    # "PDF" or ".pdf" would silently bypass the Docling path.
    fmt = (format_hint or p.suffix).lstrip(".").lower()
    if fmt == "pdf" and _DOCLING_AVAILABLE:
        try:
            return _docling_extract(p)
        except Exception as e:
            # Deliberate best-effort boundary: any Docling failure is reported
            # and the file is retried with markitdown below.
            print(f" ⚠ Docling failed on {p.name}: {e!r}; falling back to markitdown")
    return _markitdown_extract(p)
def _docling_extract(path: Path) -> str:
    """Convert ``path`` with a fresh Docling converter and return stripped markdown."""
    conversion = DocumentConverter().convert(str(path))
    markdown = conversion.document.export_to_markdown()
    return markdown.strip()
def _markitdown_extract(path: Path) -> str:
    """Convert ``path`` with markitdown and return its stripped text content."""
    # Imported inside the function, so markitdown is only loaded when this
    # extractor actually runs.
    from markitdown import MarkItDown

    converted = MarkItDown().convert(str(path))
    return converted.text_content.strip()
def is_docling_available() -> bool:
    """Return the module-level Docling availability flag.

    Useful for build scripts wanting to log which extractor is in use.
    """
    docling_ready = _DOCLING_AVAILABLE
    return docling_ready
# ---- Heading demotion --------------------------------------------------
import re as _re
_HEADING_LINE_RE = _re.compile(r"^(#{1,6}) ", _re.MULTILINE)
# Opening/closing marker of a fenced code block: up to three spaces of indent
# followed by a run of at least three backticks or tildes.
_FENCE_LINE_RE = _re.compile(r"^ {0,3}(`{3,}|~{3,})")


def demote_headings(text: str, levels: int = 1, max_depth: int = 6) -> str:
    """Demote markdown headings by ``levels`` levels, capped at H``max_depth``.

    Used by build scripts to nest extracted body content under a higher-level
    heading injected by the build script. Without demotion, extracted body
    H2s collide with the build script's source-level H2 wrapper at the same
    tree depth.

    Example::

        ## Source Title (build script wrapper)
        ## Introduction   <-- collides at H2 with other sources' Introductions
        Body...

    becomes::

        ## Source Title (build script wrapper)
        ### Introduction  <-- now a child of the wrapper
        Body...

    Multiple sources can each have their own "### Introduction" without
    colliding because each is scoped under its own H2 parent.

    Args:
        text: Markdown text to rewrite.
        levels: Number of ``#`` characters to add to each heading.
        max_depth: Largest number of ``#`` characters a rewritten heading may
            have (default 6, since markdown beyond H6 is treated as paragraph
            text).

    Returns:
        ``text`` with every heading line (one to six ``#`` followed by a
        space, at the start of a line) rewritten. Lines inside fenced code
        blocks (``` or ~~~) are left untouched so that ``# comment`` lines in
        code samples are not mistaken for headings. A rewritten heading always
        keeps at least one ``#``, even for negative ``levels`` or a
        ``max_depth`` below 1.
    """
    def _demote(m: "_re.Match[str]") -> str:
        # Clamp to [1, max_depth]: zero hashes would destroy the heading marker.
        new_count = max(1, min(len(m.group(1)) + levels, max_depth))
        return "#" * new_count + " "

    rewritten: list[str] = []
    open_fence = ""  # marker that opened the current code fence; "" outside one
    # Split on "\n" only, so line starts are exactly those that ``^`` matches
    # under re.MULTILINE (form feeds etc. do not start a new line).
    for line in text.split("\n"):
        fence = _FENCE_LINE_RE.match(line)
        marker = fence.group(1) if fence else ""
        trailer = line[fence.end():] if fence else ""
        if open_fence:
            # A fence closes on the same character, at least as long as the
            # opener, with nothing but whitespace after it.
            if (
                marker.startswith(open_fence[0])
                and len(marker) >= len(open_fence)
                and not trailer.strip()
            ):
                open_fence = ""
        elif marker and not (marker[0] == "`" and "`" in trailer):
            # A backtick run followed by more backticks on the same line is an
            # inline code span, not a fence opener.
            open_fence = marker
        else:
            line = _HEADING_LINE_RE.sub(_demote, line)
        rewritten.append(line)
    return "\n".join(rewritten)