Spaces:

webmuppetnz
/

hmc-rag

Running

webmuppet

Initial commit — health marketing compliance RAG

bad8b6c 5 days ago

4.15 kB

	"""
	PDF/HTML extraction with layout-aware preference and graceful fallback.

	Default path: Docling (IBM, layout-aware). Preserves heading hierarchy,
	list ordering, table structure, and de-duplicates page-header repetition.
	Critical for multi-column source documents (e.g. Physiotherapy Standards
	framework PDF, where pymupdf scrambled bullet ordering).

	Fallback path: markitdown. Used when Docling import or convert fails
	on a specific source. Markitdown is faster and lighter; it just doesn't
	handle multi-column layouts well.

	Usage in build scripts::

	    from extract_pdf import extract_to_markdown

	    body = extract_to_markdown(cache_path)

	The function handles both PDFs and HTML uniformly. PDFs go through Docling
	by default; HTML always uses markitdown (Docling's HTML support is less
	mature than its PDF support, and our HTML pages are mostly SilverStripe
	with semantic markup that markitdown handles well).
	"""

	from __future__ import annotations

	from pathlib import Path

	# Optional-dependency probe. Docling is imported eagerly, exactly once, when
	# this module loads; the flag records whether that import succeeded so the
	# extractor choice below never has to retry the import.
	try:
	    from docling.document_converter import DocumentConverter
	except ImportError:
	    # Keep the name bound so later references never raise NameError.
	    DocumentConverter = None  # type: ignore[assignment]
	    _DOCLING_AVAILABLE = False
	else:
	    _DOCLING_AVAILABLE = True


	def extract_to_markdown(path: str | Path, format_hint: str | None = None) -> str:
	    """Convert a PDF or HTML file to markdown text.

	    Args:
	        path: Filesystem path to the source file.
	        format_hint: Optional "pdf" or "html". Case and a leading dot are
	            ignored, so "PDF" and ".pdf" behave like "pdf". If omitted (or
	            empty), the format is inferred from the file extension.

	    Returns:
	        Markdown text content (stripped of leading/trailing whitespace).

	    Strategy:
	        - pdf → Docling preferred; markitdown is used instead when Docling
	          is not installed or raises on this file
	        - html / htm → markitdown (Docling's HTML support less mature than PDF)
	        - other → markitdown (general-purpose)

	    Errors raised by the markitdown path are not caught here and propagate
	    to the caller.
	    """
	    p = Path(path)
	    # Normalise the hint exactly like the suffix so "PDF" / ".pdf" still
	    # select the Docling path.
	    fmt = (format_hint or p.suffix).lstrip(".").lower()

	    if fmt == "pdf" and _DOCLING_AVAILABLE:
	        try:
	            return _docling_extract(p)
	        except Exception as e:
	            # Deliberate best-effort: any Docling failure on one source is
	            # reported and then retried with the lighter extractor below.
	            print(f" ⚠ Docling failed on {p.name}: {e!r}; falling back to markitdown")

	    return _markitdown_extract(p)


	def _docling_extract(path: Path) -> str:
	    """Run Docling on ``path`` and return its markdown export, stripped.

	    Raises:
	        ImportError: if Docling could not be imported when this module loaded.

	    Any exception raised by Docling during conversion propagates unchanged.
	    """
	    if DocumentConverter is None:
	        # Fail with a clear message instead of "'NoneType' object is not callable".
	        raise ImportError("docling is not installed; the Docling extractor is unavailable")

	    converter = DocumentConverter()
	    result = converter.convert(str(path))
	    return result.document.export_to_markdown().strip()


	def _markitdown_extract(path: Path) -> str:
	    """Run markitdown on ``path`` and return its text content, stripped.

	    Any exception raised while importing or running markitdown propagates
	    unchanged.
	    """
	    # Imported at call time, so merely importing this module does not
	    # require markitdown to be installed.
	    from markitdown import MarkItDown

	    md = MarkItDown()
	    result = md.convert(str(path))
	    return result.text_content.strip()


	def is_docling_available() -> bool:
	    """Return True when Docling was importable at module load.

	    Public flag — useful for build scripts wanting to log which extractor
	    is in use.
	    """
	    return _DOCLING_AVAILABLE


	# ---- Heading demotion --------------------------------------------------

	import re as _re

	# An ATX heading marker: one to six "#" at the start of a line, then a space.
	_HEADING_LINE_RE = _re.compile(r"^(#{1,6}) ", _re.MULTILINE)

	# A code-fence line: up to three spaces, then a run of 3+ backticks or tildes.
	# Group 1 is the fence run; group 2 is whatever follows it on the line.
	_FENCE_LINE_RE = _re.compile(r" {0,3}(`{3,}|~{3,})(.*)")


	def demote_headings(text: str, levels: int = 1, max_depth: int = 6) -> str:
	    """Demote markdown headings by ``levels`` levels, capped at H``max_depth``.

	    Used by build scripts to nest extracted body content under a higher-level
	    heading injected by the build script. Without demotion, Docling-extracted
	    body H2s collide with the build script's source-level H2 wrapper at the
	    same tree depth.

	    Example::

	        ## Source Title (build script wrapper)
	        ## Introduction   <-- collides at H2 with other sources' Introductions
	        Body...

	    becomes::

	        ## Source Title (build script wrapper)
	        ### Introduction  <-- now a child of the wrapper
	        Body...

	    Multiple sources can each have their own "### Introduction" without
	    colliding because each is scoped under its own H2 parent.

	    Caps demotion at ``max_depth`` (default H6) since markdown beyond H6 is
	    treated as paragraph text.

	    Lines inside fenced code blocks (``` or ~~~) are left untouched, so a
	    ``# comment`` in an extracted code sample is not rewritten as a heading.
	    An unclosed fence protects everything up to the end of ``text``.

	    Args:
	        text: Markdown text to rewrite.
	        levels: Number of levels to add to every heading.
	        max_depth: Deepest heading level to emit.

	    Returns:
	        ``text`` with heading markers adjusted. A heading always keeps at
	        least one ``#``, even if ``levels`` is negative or ``max_depth`` is
	        below 1.
	    """
	    def _demote(m: "_re.Match[str]") -> str:
	        depth = min(len(m.group(1)) + levels, max_depth)
	        # Never drop below one "#": zero hashes would erase the heading marker.
	        return "#" * max(depth, 1) + " "

	    open_fence = ""  # the fence run that opened the current code block, if any
	    rewritten = []
	    # Split on "\n" only: that is the sole line boundary "^" recognises, so
	    # other separators (e.g. form feeds from PDFs) never start a new line here.
	    for line in text.split("\n"):
	        fence = _FENCE_LINE_RE.match(line)
	        if open_fence:
	            # A closing fence uses the same character, is at least as long as
	            # the opening run, and carries no info string.
	            if (
	                fence
	                and fence.group(1)[0] == open_fence[0]
	                and len(fence.group(1)) >= len(open_fence)
	                and not fence.group(2).strip()
	            ):
	                open_fence = ""
	        elif fence:
	            open_fence = fence.group(1)
	        else:
	            line = _HEADING_LINE_RE.sub(_demote, line)
	        rewritten.append(line)

	    return "\n".join(rewritten)