Spaces:

webmuppetnz
/

hmc-rag

Running

File size: 4,065 Bytes

bad8b6c

"""
Propagate Source: URLs down through each index tree.

Problem: corpus files only write `Source: URL` lines at the top of each major
section, so only top-level H2 nodes carry an inline URL. Deep descendants
end up with no URL and fall through to the domain-level fallback in
src/config.py — which means nearly every citation for ero / reform-context /
cross-regulator / legislation points to the same generic hub page (or worse,
a 404).

Fix: walk each index tree top-down, inherit the nearest ancestor's Source URL,
and write it as `node["source_url"]` on every node. The retriever can then
prefer this field over text extraction.

Zero API cost. Non-destructive — writes to `.patched` first unless --apply.
"""

from __future__ import annotations

import argparse
import json
import re
import sys
from pathlib import Path

# Directory scanned for `*.json` index files: the `indexes` folder that is a
# sibling of the directory containing this script (parents[1] is one level
# above the script's own directory).
INDEX_DIR = Path(__file__).resolve().parents[1] / "indexes"

SOURCE_LINE_RE = re.compile(r"^\s*Source:\s*(https?://\S+)\s*$", re.MULTILINE)


def extract_source_url(text: str) -> str | None:
    """Look for a ``Source: <url>`` line near the top of *text*.

    Only the first five lines are inspected. A line qualifies when it holds
    nothing but optional whitespace, the ``Source:`` label and a single
    http(s) URL. The URL of the first qualifying line is returned; ``None``
    is returned when *text* is empty/falsy or no such line exists.
    """
    head = text.splitlines()[:5] if text else []
    hits = (SOURCE_LINE_RE.match(candidate) for candidate in head)
    # A failed match is None (falsy), so this picks the first successful one.
    found = next((hit for hit in hits if hit), None)
    return found.group(1).strip() if found else None


def propagate(node: dict, inherited: str | None, stats: dict) -> None:
    """Recursively walk a node tree, assigning `source_url` to every node.

    Each node's own `Source:` line (if present) takes precedence and becomes
    the inherited URL for its descendants. Otherwise the inherited URL flows
    through unchanged. Nodes are modified in place.

    Args:
        node: Tree node; its ``"text"`` is scanned for a ``Source:`` line and
            its ``"nodes"`` list (if any) holds the children.
        inherited: URL handed down from the nearest ancestor that had one,
            or ``None`` when no ancestor did.
        stats: Mutable counters, updated in place. Every visited node lands
            in exactly one of ``own_source``, ``inherited`` or
            ``no_url_available``; ``assigned`` additionally counts nodes
            whose ``source_url`` value was added or changed.
    """
    # Mirror the guard used when counting nodes: ignore malformed entries
    # instead of crashing on `.get`.
    if not isinstance(node, dict):
        return

    own = extract_source_url(node.get("text", ""))
    effective = own or inherited

    if effective is None:
        # Neither this node nor any ancestor has a URL. Counted only here,
        # not also under "inherited", so the three buckets are disjoint.
        stats["no_url_available"] += 1
    else:
        if node.get("source_url") != effective:
            stats["assigned"] += 1
        node["source_url"] = effective
        if own:
            stats["own_source"] += 1
        else:
            stats["inherited"] += 1

    # `or []` covers a present-but-null "nodes" value.
    for child in node.get("nodes", []) or []:
        propagate(child, effective, stats)


def process_index(path: Path, apply: bool) -> dict:
    """Propagate source URLs through one index file and write the result.

    Args:
        path: JSON index file. Root nodes are read from its top-level
            ``"structure"`` key (treated as empty when the key is absent).
        apply: When true, overwrite *path* itself; otherwise write to a
            sibling file with ``.patched`` appended to the full file name.

    Returns:
        The counters ``nodes_total``, ``own_source``, ``inherited``,
        ``no_url_available`` and ``assigned``, plus ``wrote``: the output
        path relative to the grandparent directory of *path*.
    """
    # Read and write as UTF-8 explicitly. The output is dumped with
    # ensure_ascii=False, so relying on the locale's default encoding could
    # mis-decode the input or fail to encode the output on non-UTF-8 systems.
    data = json.loads(path.read_text(encoding="utf-8"))
    stats = {
        "nodes_total": 0,
        "own_source": 0,
        "inherited": 0,
        "no_url_available": 0,
        "assigned": 0,
    }

    def count(node):
        # Tally dict nodes only; anything else in the tree is ignored.
        if isinstance(node, dict):
            stats["nodes_total"] += 1
            for c in node.get("nodes", []) or []:
                count(c)

    structure = data.get("structure", [])
    for root in structure:
        count(root)
        propagate(root, None, stats)

    out = path if apply else path.with_suffix(path.suffix + ".patched")
    out.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8")
    stats["wrote"] = str(out.relative_to(path.parent.parent))
    return stats


def main() -> None:
    """Command-line entry point: patch all, or the named, index files.

    Collects the ``*.json`` files in ``INDEX_DIR``, optionally restricted to
    the stems given on the command line, runs ``process_index`` on each and
    prints its counters. Exits with status 1 when no file is selected.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's paragraphs intact in --help output.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--apply",
        action="store_true",
        help="Overwrite the real index files (default: write to .patched sibling)",
    )
    parser.add_argument(
        "indexes",
        nargs="*",
        help="Specific index stems to process (default: all)",
    )
    args = parser.parse_args()

    targets = sorted(INDEX_DIR.glob("*.json"))
    if args.indexes:
        wanted = set(args.indexes)
        targets = [t for t in targets if t.stem in wanted]
        # Surface typos: a requested stem with no matching file would
        # otherwise be dropped without any notice.
        missing = sorted(wanted - {t.stem for t in targets})
        if missing:
            print(
                f"Warning: no index file for: {', '.join(missing)}",
                file=sys.stderr,
            )

    if not targets:
        print("No index files found.", file=sys.stderr)
        sys.exit(1)

    print(f"{'Apply' if args.apply else 'Preview'} mode — {len(targets)} index file(s)")
    print()

    for ix in targets:
        stats = process_index(ix, apply=args.apply)
        print(f"  {ix.stem}")
        print(f"    nodes:          {stats['nodes_total']:4d}")
        print(f"    own Source:     {stats['own_source']:4d}")
        print(f"    inherited:      {stats['inherited']:4d}")
        print(f"    no URL at all:  {stats['no_url_available']:4d}")
        # Number of nodes whose source_url was added or changed by this run.
        print(f"    changed:        {stats['assigned']:4d}")
        print(f"    wrote:          {stats['wrote']}")
        print()


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()