# hmc-rag/scripts/propagate_source_urls.py
# Provenance: webmuppet — "Initial commit — health marketing compliance RAG" (bad8b6c)
"""
Propagate Source: URLs down through each index tree.
Problem: corpus files only write `Source: URL` lines at the top of each major
section, so only top-level H2 nodes carry an inline URL. Deep descendants
end up with no URL and fall through to the domain-level fallback in
src/config.py — which means nearly every citation for ero / reform-context /
cross-regulator / legislation points to the same generic hub page (or worse,
a 404).
Fix: walk each index tree top-down, inherit the nearest ancestor's Source URL,
and write it as `node["source_url"]` on every node. The retriever can then
prefer this field over text extraction.
Zero API cost. Non-destructive — writes to `.patched` first unless --apply.
"""
from __future__ import annotations
import argparse
import json
import re
import sys
from pathlib import Path
INDEX_DIR = Path(__file__).resolve().parents[1] / "indexes"
# Matches a whole "Source: <url>" line. No MULTILINE flag: the pattern is only
# ever applied to individual, newline-free lines (see extract_source_url), so
# the flag in the original was inert.
SOURCE_LINE_RE = re.compile(r"^\s*Source:\s*(https?://\S+)\s*$")


def extract_source_url(text: str) -> str | None:
    """Return the first `Source: <url>` URL in the first 5 lines of *text*.

    Only the head of the node text is scanned because corpus files put the
    Source: line at the top of each section. Returns None for empty text or
    when no Source: line appears in the scanned window.
    """
    if not text:
        return None
    for line in text.splitlines()[:5]:
        m = SOURCE_LINE_RE.match(line)
        if m:
            # `\S+` cannot capture whitespace, so the original .strip() on
            # the captured group was a no-op and is dropped.
            return m.group(1)
    return None
def propagate(node: dict, inherited: str | None, stats: dict) -> None:
    """Recursively walk a node tree, assigning `source_url` to every node.

    Each node's own `Source:` line (if present) takes precedence and becomes
    the inherited URL for its descendants. Otherwise the inherited URL flows
    through unchanged.

    Every node lands in exactly one stats bucket: `own_source` (the node
    carries its own Source: line), `inherited` (URL came from an ancestor),
    or `no_url_available` (no URL anywhere up the chain). `assigned` counts
    nodes whose stored `source_url` actually changed.
    """
    own = extract_source_url(node.get("text", ""))
    effective = own or inherited
    if effective is not None:
        if node.get("source_url") != effective:
            stats["assigned"] += 1
        node["source_url"] = effective
        # Bucket the node exactly once. The original bumped own/inherited
        # unconditionally, so URL-less nodes were double-counted as both
        # "inherited" and "no_url_available", corrupting the report.
        if own:
            stats["own_source"] += 1
        else:
            stats["inherited"] += 1
    else:
        stats["no_url_available"] += 1
    for child in node.get("nodes", []) or []:
        # count() in process_index silently skips non-dict children; mirror
        # that here instead of raising AttributeError on malformed entries.
        if isinstance(child, dict):
            propagate(child, effective, stats)
def process_index(path: Path, apply: bool) -> dict:
    """Propagate source URLs through one index file.

    Loads the JSON index at *path*, runs propagate() over every root in its
    top-level "structure" list, and serializes the result — in place when
    *apply* is True, otherwise to a `.patched` sibling.

    Returns the stats dict, extended with a "wrote" key naming the output
    file relative to the indexes directory's parent.
    """
    # Explicit UTF-8 on both read and write: json.dumps(ensure_ascii=False)
    # below emits raw non-ASCII characters, which crashes write_text() under
    # a non-UTF-8 default locale (e.g. cp1252 on Windows).
    data = json.loads(path.read_text(encoding="utf-8"))
    stats = {
        "nodes_total": 0,
        "own_source": 0,
        "inherited": 0,
        "no_url_available": 0,
        "assigned": 0,
    }

    def count(node):
        # Tally dict nodes only; non-dict children are ignored.
        if isinstance(node, dict):
            stats["nodes_total"] += 1
            for c in node.get("nodes", []) or []:
                count(c)

    structure = data.get("structure", [])
    for root in structure:
        count(root)
        propagate(root, None, stats)
    out = path if apply else path.with_suffix(path.suffix + ".patched")
    out.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8")
    stats["wrote"] = str(out.relative_to(path.parent.parent))
    return stats
def main() -> None:
    """CLI entry point: patch all (or selected) index files under indexes/.

    Preview mode (default) writes `.patched` siblings; --apply overwrites
    the real index files. Positional args filter by file stem.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--apply",
        action="store_true",
        help="Overwrite the real index files (default: write to .patched sibling)",
    )
    parser.add_argument(
        "indexes",
        nargs="*",
        help="Specific index stems to process (default: all)",
    )
    args = parser.parse_args()
    targets = sorted(INDEX_DIR.glob("*.json"))
    if args.indexes:
        wanted = set(args.indexes)
        targets = [t for t in targets if t.stem in wanted]
    if not targets:
        print("No index files found.", file=sys.stderr)
        sys.exit(1)
    print(f"{'Apply' if args.apply else 'Preview'} mode — {len(targets)} index file(s)")
    print()
    for ix in targets:
        stats = process_index(ix, apply=args.apply)
        print(f" {ix.stem}")
        print(f" nodes: {stats['nodes_total']:4d}")
        print(f" own Source: {stats['own_source']:4d}")
        print(f" inherited: {stats['inherited']:4d}")
        print(f" no URL at all: {stats['no_url_available']:4d}")
        # "assigned" was computed by propagate() but never reported, leaving
        # preview mode with no signal of how many nodes would change.
        print(f" assigned: {stats['assigned']:4d}")
        print(f" wrote: {stats['wrote']}")
        print()


if __name__ == "__main__":
    main()