# hmc-rag/scripts/propagate_source_urls.py
# Provenance: webmuppet — "Initial commit — health marketing compliance RAG" (bad8b6c)
"""
Propagate Source: URLs down through each index tree.
Problem: corpus files only write `Source: URL` lines at the top of each major
section, so only top-level H2 nodes carry an inline URL. Deep descendants
end up with no URL and fall through to the domain-level fallback in
src/config.py — which means nearly every citation for ero / reform-context /
cross-regulator / legislation points to the same generic hub page (or worse,
a 404).
Fix: walk each index tree top-down, inherit the nearest ancestor's Source URL,
and write it as `node["source_url"]` on every node. The retriever can then
prefer this field over text extraction.
Zero API cost. Non-destructive — writes to `.patched` first unless --apply.
"""
from __future__ import annotations
import argparse
import json
import re
import sys
from pathlib import Path
INDEX_DIR = Path(__file__).resolve().parents[1] / "indexes"
# Matches a whole "Source: <url>" line. No MULTILINE flag: the pattern is only
# ever applied to individual, newline-free lines (see extract_source_url), so
# the flag in the original was inert.
SOURCE_LINE_RE = re.compile(r"^\s*Source:\s*(https?://\S+)\s*$")


def extract_source_url(text: str) -> str | None:
    """Return the first `Source: <url>` URL in the first 5 lines of *text*.

    Only the head of the node text is scanned because corpus files put the
    Source: line at the top of each section. Returns None for empty text or
    when no Source: line appears in the scanned window.
    """
    if not text:
        return None
    for line in text.splitlines()[:5]:
        m = SOURCE_LINE_RE.match(line)
        if m:
            # `\S+` cannot capture whitespace, so the original .strip() on
            # the captured group was a no-op and is dropped.
            return m.group(1)
    return None
def propagate(node: dict, inherited: str | None, stats: dict) -> None:
    """Recursively walk a node tree, assigning `source_url` to every node.

    Each node's own `Source:` line (if present) takes precedence and becomes
    the inherited URL for its descendants. Otherwise the inherited URL flows
    through unchanged.

    Every node lands in exactly one stats bucket: `own_source` (the node
    carries its own Source: line), `inherited` (URL came from an ancestor),
    or `no_url_available` (no URL anywhere up the chain). `assigned` counts
    nodes whose stored `source_url` actually changed.
    """
    own = extract_source_url(node.get("text", ""))
    effective = own or inherited
    if effective is not None:
        if node.get("source_url") != effective:
            stats["assigned"] += 1
        node["source_url"] = effective
        # Bucket the node exactly once. The original bumped own/inherited
        # unconditionally, so URL-less nodes were double-counted as both
        # "inherited" and "no_url_available", corrupting the report.
        if own:
            stats["own_source"] += 1
        else:
            stats["inherited"] += 1
    else:
        stats["no_url_available"] += 1
    for child in node.get("nodes", []) or []:
        # count() in process_index silently skips non-dict children; mirror
        # that here instead of raising AttributeError on malformed entries.
        if isinstance(child, dict):
            propagate(child, effective, stats)
def process_index(path: Path, apply: bool) -> dict:
    """Propagate source URLs through one index file.

    Loads the JSON index at *path*, runs propagate() over every root in its
    top-level "structure" list, and serializes the result — in place when
    *apply* is True, otherwise to a `.patched` sibling.

    Returns the stats dict, extended with a "wrote" key naming the output
    file relative to the indexes directory's parent.
    """
    # Explicit UTF-8 on both read and write: json.dumps(ensure_ascii=False)
    # below emits raw non-ASCII characters, which crashes write_text() under
    # a non-UTF-8 default locale (e.g. cp1252 on Windows).
    data = json.loads(path.read_text(encoding="utf-8"))
    stats = {
        "nodes_total": 0,
        "own_source": 0,
        "inherited": 0,
        "no_url_available": 0,
        "assigned": 0,
    }

    def count(node):
        # Tally dict nodes only; non-dict children are ignored.
        if isinstance(node, dict):
            stats["nodes_total"] += 1
            for c in node.get("nodes", []) or []:
                count(c)

    structure = data.get("structure", [])
    for root in structure:
        count(root)
        propagate(root, None, stats)
    out = path if apply else path.with_suffix(path.suffix + ".patched")
    out.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8")
    stats["wrote"] = str(out.relative_to(path.parent.parent))
    return stats
def main() -> None:
    """CLI entry point: patch all (or selected) index files under indexes/.

    Preview mode (default) writes `.patched` siblings; --apply overwrites
    the real index files. Positional args filter by file stem.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--apply",
        action="store_true",
        help="Overwrite the real index files (default: write to .patched sibling)",
    )
    parser.add_argument(
        "indexes",
        nargs="*",
        help="Specific index stems to process (default: all)",
    )
    args = parser.parse_args()
    targets = sorted(INDEX_DIR.glob("*.json"))
    if args.indexes:
        wanted = set(args.indexes)
        targets = [t for t in targets if t.stem in wanted]
    if not targets:
        print("No index files found.", file=sys.stderr)
        sys.exit(1)
    print(f"{'Apply' if args.apply else 'Preview'} mode — {len(targets)} index file(s)")
    print()
    for ix in targets:
        stats = process_index(ix, apply=args.apply)
        print(f" {ix.stem}")
        print(f" nodes: {stats['nodes_total']:4d}")
        print(f" own Source: {stats['own_source']:4d}")
        print(f" inherited: {stats['inherited']:4d}")
        print(f" no URL at all: {stats['no_url_available']:4d}")
        # "assigned" was computed by propagate() but never reported, leaving
        # preview mode with no signal of how many nodes would change.
        print(f" assigned: {stats['assigned']:4d}")
        print(f" wrote: {stats['wrote']}")
        print()


if __name__ == "__main__":
    main()