Spaces:

webmuppetnz
/

hmc-rag

Running

File size: 4,154 Bytes

bad8b6c

"""
PDF/HTML extraction with layout-aware preference and graceful fallback.

Default path: **Docling** (IBM, layout-aware). Preserves heading hierarchy,
list ordering, table structure, and de-duplicates page-header repetition.
Critical for multi-column source documents (e.g. Physiotherapy Standards
framework PDF, where pymupdf scrambled bullet ordering).

Fallback path: **markitdown**. Used when Docling import or convert fails
on a specific source. Markitdown is faster and lighter; it just doesn't
handle multi-column layouts well.

Usage in build scripts::

    from extract_pdf import extract_to_markdown

    body = extract_to_markdown(cache_path)

The function handles both PDFs and HTML uniformly. PDFs go through Docling
by default; HTML always uses markitdown (Docling's HTML support is less
mature than its PDF support, and our HTML pages are mostly SilverStripe
with semantic markup that markitdown handles well).
"""

from __future__ import annotations

from pathlib import Path

# Optional-dependency flag — Docling is imported (eagerly) once at module load and the outcome is cached here
try:
    from docling.document_converter import DocumentConverter
    _DOCLING_AVAILABLE = True
except ImportError:
    _DOCLING_AVAILABLE = False
    DocumentConverter = None  # type: ignore[assignment]


def extract_to_markdown(path: str | Path, format_hint: str | None = None) -> str:
    """Convert a PDF or HTML file to markdown text.

    Args:
        path: filesystem path to the source file
        format_hint: optional "pdf" or "html". Matched case-insensitively and
            with surrounding whitespace / a leading dot ignored, so "PDF" and
            ".pdf" behave like "pdf". If omitted (or empty after
            normalisation), the format is inferred from the file extension.

    Returns:
        Markdown text content (stripped of leading/trailing whitespace).

    Strategy:
        - .pdf → Docling preferred, markitdown fallback if Docling errors
        - .html / .htm → markitdown (Docling's HTML support less mature than PDF)
        - other → markitdown (general-purpose)
    """
    p = Path(path)

    # Normalise the hint the same way the extension is normalised; otherwise a
    # caller passing "PDF" or ".pdf" would silently bypass the Docling route.
    hint = (format_hint or "").strip().lstrip(".").lower()
    fmt = hint or p.suffix.lstrip(".").lower()

    if fmt == "pdf" and _DOCLING_AVAILABLE:
        try:
            return _docling_extract(p)
        except Exception as e:
            # Deliberate best-effort: any Docling failure on this source is
            # reported and the file is retried below with markitdown.
            print(f"  ⚠ Docling failed on {p.name}: {e!r}; falling back to markitdown")

    return _markitdown_extract(p)


def _docling_extract(path: Path) -> str:
    """Convert ``path`` with a new Docling converter and return stripped markdown.

    Any exception raised by Docling propagates to the caller.
    """
    conversion = DocumentConverter().convert(str(path))
    markdown = conversion.document.export_to_markdown()
    return markdown.strip()


def _markitdown_extract(path: Path) -> str:
    """Convert ``path`` with markitdown and return its text content, stripped."""
    # Function-local import: markitdown is only imported when this extractor
    # is actually called.
    from markitdown import MarkItDown

    converted = MarkItDown().convert(str(path))
    text = converted.text_content
    return text.strip()


def is_docling_available() -> bool:
    """Report whether the Docling import attempted at module load succeeded.

    Handy for build scripts that want to log which extractor PDFs will use.
    """
    docling_ready = _DOCLING_AVAILABLE
    return docling_ready


# ---- Heading demotion --------------------------------------------------

import re as _re

# ATX heading marker at the start of a line: 1-6 hashes followed by a space.
_HEADING_LINE_RE = _re.compile(r"^(#{1,6}) ", _re.MULTILINE)

# Fenced-code-block delimiter: up to three spaces of indent, then a run of at
# least three backticks or tildes. Group 1 is the run, group 2 the remainder.
_FENCE_LINE_RE = _re.compile(r"^ {0,3}(`{3,}|~{3,})(.*)$")


def demote_headings(text: str, levels: int = 1, max_depth: int = 6) -> str:
    """Demote markdown headings by ``levels`` levels, capped at H``max_depth``.

    Used by build scripts to nest extracted body content under a higher-level
    heading injected by the build script. Without demotion, Docling-extracted
    body H2s collide with the build script's source-level H2 wrapper at the
    same tree depth.

    Example::

        ## Source Title (build script wrapper)
        ## Introduction        <-- collides at H2 with other sources' Introductions
        Body...

    becomes::

        ## Source Title (build script wrapper)
        ### Introduction       <-- now a child of the wrapper
        Body...

    Multiple sources can each have their own "### Introduction" without
    colliding because each is scoped under its own H2 parent.

    Caps demotion at ``max_depth`` (default H6) since markdown beyond H6 is
    treated as paragraph text.

    Args:
        text: markdown source. Only lines that begin with 1-6 ``#`` characters
            followed by a space are treated as headings.
        levels: number of levels to add to each heading. A negative value
            promotes headings instead, never going above H1.
        max_depth: deepest level demotion may produce. A heading that is
            already deeper than this is left unchanged rather than promoted.

    Returns:
        ``text`` with heading markers adjusted. Lines inside fenced code blocks
        (backtick or tilde fences) are returned untouched, so ``# comment``
        lines in code samples are not mistaken for headings.
    """
    def _demote(m: "_re.Match[str]") -> str:
        depth = len(m.group(1))
        new_depth = min(depth + levels, max_depth)
        if levels >= 0:
            # The cap limits demotion only; it must never promote a heading
            # that already sits deeper than max_depth.
            new_depth = max(new_depth, depth)
        # A heading needs at least one '#', otherwise the marker disappears.
        new_depth = max(new_depth, 1)
        return "#" * new_depth + " "

    rewritten = []
    open_fence = ""  # marker run of the currently open fence; "" when outside
    # Split on "\n" only (not str.splitlines) so line starts are exactly the
    # positions where the MULTILINE "^" anchor would match.
    for line in text.split("\n"):
        fence = _FENCE_LINE_RE.match(line)
        if open_fence:
            # A closing fence uses the same character, is at least as long as
            # the opening run, and carries nothing but whitespace after it.
            if (
                fence
                and fence.group(1)[0] == open_fence[0]
                and len(fence.group(1)) >= len(open_fence)
                and not fence.group(2).strip()
            ):
                open_fence = ""
            rewritten.append(line)
            continue
        # The info string of a backtick fence may not itself contain backticks
        # (that would be an inline code span, not a fence).
        if fence and not (fence.group(1)[0] == "`" and "`" in fence.group(2)):
            open_fence = fence.group(1)
            rewritten.append(line)
            continue
        rewritten.append(_HEADING_LINE_RE.sub(_demote, line, count=1))

    return "\n".join(rewritten)