Spaces:
Running
Running
| """ | |
| Propagate Source: URLs down through each index tree. | |
| Problem: corpus files only write `Source: URL` lines at the top of each major | |
| section, so only top-level H2 nodes carry an inline URL. Deep descendants | |
| end up with no URL and fall through to the domain-level fallback in | |
| src/config.py — which means nearly every citation for ero / reform-context / | |
| cross-regulator / legislation points to the same generic hub page (or worse, | |
| a 404). | |
| Fix: walk each index tree top-down, inherit the nearest ancestor's Source URL, | |
| and write it as `node["source_url"]` on every node. The retriever can then | |
| prefer this field over text extraction. | |
| Zero API cost. Non-destructive — writes to `.patched` first unless --apply. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import re | |
| import sys | |
| from pathlib import Path | |
# Index JSON files live in the "indexes" folder that sits one level above
# the directory containing this script.
INDEX_DIR = Path(__file__).resolve().parents[1].joinpath("indexes")

# Matches a line consisting solely of `Source: <http(s) URL>` (surrounding
# whitespace allowed). Group 1 captures the URL itself.
SOURCE_LINE_RE = re.compile(
    r"^\s*Source:\s*(https?://\S+)\s*$",
    flags=re.MULTILINE,
)
def extract_source_url(text: str) -> str | None:
    """Find the `Source:` URL declared near the top of a node's text.

    Only the first five lines of `text` are inspected. The URL from the
    first line that matches `SOURCE_LINE_RE` is returned; `None` is returned
    when `text` is empty/falsy or none of those lines match.
    """
    # Restrict the search to the head of the text; empty input has no lines.
    head = text.splitlines()[:5] if text else []
    attempts = (SOURCE_LINE_RE.match(candidate) for candidate in head)
    hit = next((found for found in attempts if found), None)
    if hit is None:
        return None
    return hit.group(1).strip()
| def propagate(node: dict, inherited: str | None, stats: dict) -> None: | |
| """Recursively walk a node tree, assigning `source_url` to every node. | |
| Each node's own `Source:` line (if present) takes precedence and becomes | |
| the inherited URL for its descendants. Otherwise the inherited URL flows | |
| through unchanged. | |
| """ | |
| own = extract_source_url(node.get("text", "")) | |
| effective = own or inherited | |
| if effective is not None: | |
| if node.get("source_url") != effective: | |
| stats["assigned"] += 1 | |
| node["source_url"] = effective | |
| else: | |
| stats["no_url_available"] += 1 | |
| if own: | |
| stats["own_source"] += 1 | |
| else: | |
| stats["inherited"] += 1 | |
| for child in node.get("nodes", []) or []: | |
| propagate(child, effective, stats) | |
def process_index(path: Path, apply: bool) -> dict:
    """Propagate source URLs through one index file and write the result.

    Args:
        path: Index JSON file. It is read as a JSON object whose
            `structure` key holds the list of root nodes; a missing or null
            `structure` is treated as an empty list.
        apply: When true, overwrite `path` itself. Otherwise write to a
            sibling file with `.patched` appended to the file name.

    Returns:
        A stats dict with the counters `nodes_total`, `own_source`,
        `inherited`, `no_url_available` and `assigned`, plus `wrote`: the
        output path as a string, relative to the grandparent directory of
        `path`.
    """
    # The output is serialised with ensure_ascii=False, so raw non-ASCII
    # characters reach the file. Pin UTF-8 on both read and write so the
    # result does not depend on the platform's locale encoding.
    data = json.loads(path.read_text(encoding="utf-8"))

    stats = {
        "nodes_total": 0,
        "own_source": 0,
        "inherited": 0,
        "no_url_available": 0,
        "assigned": 0,
    }

    def count(node):
        # Tally dict nodes only; anything else in the tree is ignored.
        if isinstance(node, dict):
            stats["nodes_total"] += 1
            for c in node.get("nodes") or []:
                count(c)

    for root in data.get("structure") or []:
        count(root)
        propagate(root, None, stats)

    out = path if apply else path.with_suffix(path.suffix + ".patched")
    out.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8")
    stats["wrote"] = str(out.relative_to(path.parent.parent))
    return stats
def main():
    """Command-line entry point.

    Parses the arguments, selects the index files under `INDEX_DIR`
    (optionally restricted to the stems named on the command line), runs
    `process_index` on each, and prints a per-file summary. Exits with
    status 1 when no index file is selected.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--apply",
        action="store_true",
        help="Overwrite the real index files (default: write to .patched sibling)",
    )
    cli.add_argument(
        "indexes",
        nargs="*",
        help="Specific index stems to process (default: all)",
    )
    args = cli.parse_args()

    # An empty request set means "no filter": keep every index file found.
    requested = set(args.indexes)
    targets = [
        candidate
        for candidate in sorted(INDEX_DIR.glob("*.json"))
        if not requested or candidate.stem in requested
    ]

    if not targets:
        print("No index files found.", file=sys.stderr)
        sys.exit(1)

    print(f"{'Apply' if args.apply else 'Preview'} mode — {len(targets)} index file(s)")
    print()

    for ix in targets:
        stats = process_index(ix, apply=args.apply)
        print(f" {ix.stem}")
        print(f" nodes: {stats['nodes_total']:4d}")
        print(f" own Source: {stats['own_source']:4d}")
        print(f" inherited: {stats['inherited']:4d}")
        print(f" no URL at all: {stats['no_url_available']:4d}")
        print(f" wrote: {stats['wrote']}")
        print()
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()