""" Propagate Source: URLs down through each index tree. Problem: corpus files only write `Source: URL` lines at the top of each major section, so only top-level H2 nodes carry an inline URL. Deep descendants end up with no URL and fall through to the domain-level fallback in src/config.py — which means nearly every citation for ero / reform-context / cross-regulator / legislation points to the same generic hub page (or worse, a 404). Fix: walk each index tree top-down, inherit the nearest ancestor's Source URL, and write it as `node["source_url"]` on every node. The retriever can then prefer this field over text extraction. Zero API cost. Non-destructive — writes to `.patched` first unless --apply. """ from __future__ import annotations import argparse import json import re import sys from pathlib import Path INDEX_DIR = Path(__file__).resolve().parents[1] / "indexes" SOURCE_LINE_RE = re.compile(r"^\s*Source:\s*(https?://\S+)\s*$", re.MULTILINE) def extract_source_url(text: str) -> str | None: """Return the first Source: URL found in the first 5 lines of text.""" if not text: return None for line in text.splitlines()[:5]: m = SOURCE_LINE_RE.match(line) if m: return m.group(1).strip() return None def propagate(node: dict, inherited: str | None, stats: dict) -> None: """Recursively walk a node tree, assigning `source_url` to every node. Each node's own `Source:` line (if present) takes precedence and becomes the inherited URL for its descendants. Otherwise the inherited URL flows through unchanged. """ own = extract_source_url(node.get("text", "")) effective = own or inherited if effective is not None: if node.get("source_url") != effective: stats["assigned"] += 1 node["source_url"] = effective else: stats["no_url_available"] += 1 if own: stats["own_source"] += 1 else: stats["inherited"] += 1 for child in node.get("nodes", []) or []: propagate(child, effective, stats) def process_index(path: Path, apply: bool) -> dict: data = json.loads(path.read_text()) stats = { "nodes_total": 0, "own_source": 0, "inherited": 0, "no_url_available": 0, "assigned": 0, } def count(node): if isinstance(node, dict): stats["nodes_total"] += 1 for c in node.get("nodes", []) or []: count(c) structure = data.get("structure", []) for root in structure: count(root) propagate(root, None, stats) out = path if apply else path.with_suffix(path.suffix + ".patched") out.write_text(json.dumps(data, ensure_ascii=False)) stats["wrote"] = str(out.relative_to(path.parent.parent)) return stats def main(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( "--apply", action="store_true", help="Overwrite the real index files (default: write to .patched sibling)", ) parser.add_argument( "indexes", nargs="*", help="Specific index stems to process (default: all)", ) args = parser.parse_args() targets = sorted(INDEX_DIR.glob("*.json")) if args.indexes: wanted = set(args.indexes) targets = [t for t in targets if t.stem in wanted] if not targets: print("No index files found.", file=sys.stderr) sys.exit(1) print(f"{'Apply' if args.apply else 'Preview'} mode — {len(targets)} index file(s)") print() for ix in targets: stats = process_index(ix, apply=args.apply) print(f" {ix.stem}") print(f" nodes: {stats['nodes_total']:4d}") print(f" own Source: {stats['own_source']:4d}") print(f" inherited: {stats['inherited']:4d}") print(f" no URL at all: {stats['no_url_available']:4d}") print(f" wrote: {stats['wrote']}") print() if __name__ == "__main__": main()