Spaces:
Running
Running
File size: 4,065 Bytes
"""
Propagate Source: URLs down through each index tree.
Problem: corpus files only write `Source: URL` lines at the top of each major
section, so only top-level H2 nodes carry an inline URL. Deep descendants
end up with no URL and fall through to the domain-level fallback in
src/config.py — which means nearly every citation for ero / reform-context /
cross-regulator / legislation points to the same generic hub page (or worse,
a 404).
Fix: walk each index tree top-down, inherit the nearest ancestor's Source URL,
and write it as `node["source_url"]` on every node. The retriever can then
prefer this field over text extraction.
Zero API cost. Non-destructive — writes to `.patched` first unless --apply.
"""
from __future__ import annotations
import argparse
import json
import re
import sys
from pathlib import Path
# Directory containing the index JSON files: the "indexes" folder located one
# level above the directory this script lives in.
INDEX_DIR = Path(__file__).resolve().parents[1].joinpath("indexes")

# Matches a line made up solely of `Source: <http(s) URL>` (leading/trailing
# whitespace allowed). Group 1 captures the URL.
SOURCE_LINE_RE = re.compile(
    r"^\s*Source:\s*(https?://\S+)\s*$",
    flags=re.MULTILINE,
)
def extract_source_url(text: str) -> str | None:
    """Look for a `Source: <url>` line near the top of *text*.

    Only the first five lines are inspected. Returns the URL captured from
    the earliest matching line, or None when *text* is empty/falsy or none
    of those lines match SOURCE_LINE_RE.
    """
    head = text.splitlines()[:5] if text else []
    attempts = (SOURCE_LINE_RE.match(candidate) for candidate in head)
    # First successful match wins; the generator stops as soon as one is found.
    found = next((hit for hit in attempts if hit), None)
    return found.group(1).strip() if found else None
def propagate(node: dict, inherited: str | None, stats: dict) -> None:
    """Recursively walk a node tree, assigning `source_url` to every node.

    Each node's own `Source:` line (if present) takes precedence and becomes
    the inherited URL for its descendants. Otherwise the inherited URL flows
    through unchanged.

    Args:
        node: Index node. Its "text" and "nodes" keys are read and its
            "source_url" key is written. Non-dict entries are ignored.
        inherited: URL of the nearest ancestor that had one, or None.
        stats: Counter dict mutated in place. The keys "assigned",
            "no_url_available", "own_source" and "inherited" must exist.
            Every visited dict node lands in exactly one of "own_source",
            "inherited" or "no_url_available".
    """
    if not isinstance(node, dict):
        # Malformed entries (e.g. null or a bare string in a "nodes" list)
        # carry no text or children; skip them rather than crash on .get().
        return

    own = extract_source_url(node.get("text", ""))
    effective = own or inherited

    if effective is None:
        stats["no_url_available"] += 1
    else:
        # "assigned" counts only nodes whose stored URL actually changes.
        if node.get("source_url") != effective:
            stats["assigned"] += 1
        node["source_url"] = effective

    if own:
        stats["own_source"] += 1
    elif effective is not None:
        # Count as inherited only when a URL really was inherited; nodes with
        # no URL at all are already tallied under "no_url_available".
        stats["inherited"] += 1

    for child in node.get("nodes") or []:
        propagate(child, effective, stats)
def process_index(path: Path, apply: bool) -> dict:
    """Propagate Source URLs through one index file and write the result.

    The JSON document at *path* is loaded, every tree under its top-level
    "structure" list is walked with `propagate`, and the updated document is
    serialised back to disk.

    Args:
        path: Index JSON file to process.
        apply: When true, overwrite *path* itself; otherwise write to a
            sibling file whose name is *path* with ".patched" appended.

    Returns:
        The stats dict: the counters "nodes_total", "own_source",
        "inherited", "no_url_available" and "assigned", plus "wrote", the
        output path relative to the grandparent directory of *path*.
    """
    # Read and write explicitly as UTF-8: the output is dumped with
    # ensure_ascii=False, so a locale-dependent default encoding could fail
    # on, or corrupt, non-ASCII text.
    data = json.loads(path.read_text(encoding="utf-8"))
    stats = {
        "nodes_total": 0,
        "own_source": 0,
        "inherited": 0,
        "no_url_available": 0,
        "assigned": 0,
    }

    def count(node):
        # Tally every dict node in the subtree, root included.
        if isinstance(node, dict):
            stats["nodes_total"] += 1
            for c in node.get("nodes", []) or []:
                count(c)

    # `or []` also covers an explicit `"structure": null`.
    for root in data.get("structure") or []:
        count(root)
        propagate(root, None, stats)

    out = path if apply else path.with_suffix(path.suffix + ".patched")
    out.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8")
    stats["wrote"] = str(out.relative_to(path.parent.parent))
    return stats
def main():
    """Command-line entry point.

    Processes every `*.json` file in INDEX_DIR, or only those whose stem was
    named on the command line, and prints per-file statistics. Without
    `--apply` the results go to `.patched` sibling files. Exits with status 1
    when no index file is selected.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--apply",
        action="store_true",
        help="Overwrite the real index files (default: write to .patched sibling)",
    )
    parser.add_argument(
        "indexes",
        nargs="*",
        help="Specific index stems to process (default: all)",
    )
    args = parser.parse_args()

    targets = sorted(INDEX_DIR.glob("*.json"))
    if args.indexes:
        wanted = set(args.indexes)
        # Surface typos: a requested stem with no matching file would
        # otherwise be dropped silently whenever another stem matched.
        unknown = sorted(wanted - {t.stem for t in targets})
        if unknown:
            print(
                f"Warning: no index file for: {', '.join(unknown)}",
                file=sys.stderr,
            )
        targets = [t for t in targets if t.stem in wanted]
    if not targets:
        print("No index files found.", file=sys.stderr)
        sys.exit(1)

    print(f"{'Apply' if args.apply else 'Preview'} mode — {len(targets)} index file(s)")
    print()
    for ix in targets:
        stats = process_index(ix, apply=args.apply)
        print(f" {ix.stem}")
        print(f" nodes: {stats['nodes_total']:4d}")
        print(f" own Source: {stats['own_source']:4d}")
        print(f" inherited: {stats['inherited']:4d}")
        print(f" no URL at all: {stats['no_url_available']:4d}")
        # Number of nodes whose stored source_url differs from before the run.
        print(f" changed: {stats['assigned']:4d}")
        print(f" wrote: {stats['wrote']}")
        print()
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|