""" PDF/HTML extraction with layout-aware preference and graceful fallback. Default path: **Docling** (IBM, layout-aware). Preserves heading hierarchy, list ordering, table structure, and de-duplicates page-header repetition. Critical for multi-column source documents (e.g. Physiotherapy Standards framework PDF, where pymupdf scrambled bullet ordering). Fallback path: **markitdown**. Used when Docling import or convert fails on a specific source. Markitdown is faster and lighter; it just doesn't handle multi-column layouts well. Usage in build scripts:: from extract_pdf import extract_to_markdown body = extract_to_markdown(cache_path) The function handles both PDFs and HTML uniformly. PDFs go through Docling by default; HTML always uses markitdown (Docling's HTML support is less mature than its PDF support, and our HTML pages are mostly SilverStripe with semantic markup that markitdown handles well). """ from __future__ import annotations from pathlib import Path # Lazy-import flags — checked once, cached at module load try: from docling.document_converter import DocumentConverter _DOCLING_AVAILABLE = True except ImportError: _DOCLING_AVAILABLE = False DocumentConverter = None # type: ignore[assignment] def extract_to_markdown(path: str | Path, format_hint: str | None = None) -> str: """Convert a PDF or HTML file to markdown text. Args: path: filesystem path to the source file format_hint: optional "pdf" or "html". If omitted, inferred from file extension. Returns: Markdown text content (stripped of leading/trailing whitespace). Strategy: - .pdf → Docling preferred, markitdown fallback if Docling errors - .html / .htm → markitdown (Docling's HTML support less mature than PDF) - other → markitdown (general-purpose) """ p = Path(path) fmt = format_hint or p.suffix.lstrip(".").lower() if fmt == "pdf" and _DOCLING_AVAILABLE: try: return _docling_extract(p) except Exception as e: print(f" ⚠ Docling failed on {p.name}: {e!r}; falling back to markitdown") return _markitdown_extract(p) def _docling_extract(path: Path) -> str: """Run Docling and return the markdown export.""" converter = DocumentConverter() result = converter.convert(str(path)) return result.document.export_to_markdown().strip() def _markitdown_extract(path: Path) -> str: """Run markitdown and return the text content.""" from markitdown import MarkItDown md = MarkItDown() result = md.convert(str(path)) return result.text_content.strip() def is_docling_available() -> bool: """Public flag — useful for build scripts wanting to log which extractor is in use.""" return _DOCLING_AVAILABLE # ---- Heading demotion -------------------------------------------------- import re as _re _HEADING_LINE_RE = _re.compile(r"^(#{1,6}) ", _re.MULTILINE) def demote_headings(text: str, levels: int = 1, max_depth: int = 6) -> str: """Demote markdown headings by ``levels`` levels, capped at H``max_depth``. Used by build scripts to nest extracted body content under a higher-level heading injected by the build script. Without demotion, Docling-extracted body H2s collide with the build script's source-level H2 wrapper at the same tree depth. Example:: ## Source Title (build script wrapper) ## Introduction <-- collides at H2 with other sources' Introductions Body... becomes:: ## Source Title (build script wrapper) ### Introduction <-- now a child of the wrapper Body... Multiple sources can each have their own "### Introduction" without colliding because each is scoped under its own H2 parent. Caps demotion at ``max_depth`` (default H6) since markdown beyond H6 is treated as paragraph text. """ def _demote(m: "_re.Match[str]") -> str: hashes = m.group(1) new_count = min(len(hashes) + levels, max_depth) return "#" * new_count + " " return _HEADING_LINE_RE.sub(_demote, text)