File size: 2,089 Bytes
657d287
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""Extract source passages from parsed docs for both modules.

Output:
  data/eval/source_passages/compliance_passages.json    (25 passages)
  data/eval/source_passages/credit_passages.json        (25 passages)
  data/eval/source_passages/_summary.json               (per-module counts)
"""
from __future__ import annotations

import json
import sys
from pathlib import Path

# Parent of the directory containing this script (parents[1] of the resolved
# file path); every data path below is built from it.
ROOT = Path(__file__).resolve().parents[1]
# Put ROOT first on the import path -- presumably so the `evaluation` package
# imported below resolves when this file is run directly as a script.
sys.path.insert(0, str(ROOT))

from evaluation.passage_extractor import extract_passages

# Parsed documents are read from PROCESSED/<module>/parsed/*.json.
PROCESSED = ROOT / "data" / "processed"
# Destination for the per-module passage files and _summary.json.
OUT_DIR = ROOT / "data" / "eval" / "source_passages"
# NOTE: this runs at import time, so merely importing the module creates the directory.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Value passed as n_passages to extract_passages for each module.
N_PASSAGES_PER_MODULE = 25


def main() -> int:
    """Extract passages for both modules and write them, plus a summary, to disk.

    For each of the ``compliance`` and ``credit`` modules this:

    1. loads every ``*.json`` file under ``PROCESSED/<module>/parsed`` in
       sorted path order,
    2. passes the decoded documents to ``extract_passages`` with
       ``n_passages=N_PASSAGES_PER_MODULE``,
    3. prints how the returned passages are distributed across source
       documents and document types,
    4. writes the passages to ``OUT_DIR/<module>_passages.json``.

    After both modules are processed, the per-module counts are written to
    ``OUT_DIR/_summary.json``.

    Returns:
        ``0`` on completion. No exceptions are caught here, so I/O or JSON
        decoding errors propagate to the caller.
    """
    summary: dict = {}
    for module in ("compliance", "credit"):
        parsed_dir = PROCESSED / module / "parsed"
        # JSON text is UTF-8; decode explicitly instead of relying on the
        # platform's locale encoding, which differs between systems.
        docs = [
            json.loads(doc_path.read_text(encoding="utf-8"))
            for doc_path in sorted(parsed_dir.glob("*.json"))
        ]
        # Only the document count is logged; the documents themselves are
        # handed to the extractor unmodified.
        print(f"\n[{module}] {len(docs)} parsed docs")

        passages = extract_passages(docs, n_passages=N_PASSAGES_PER_MODULE)
        print(f"  → extracted {len(passages)} passages")

        # Distribution check: passages per source document and per doc type.
        by_doc: dict[str, int] = {}
        by_doc_type: dict[str, int] = {}
        for passage in passages:
            by_doc[passage.source_doc_id] = by_doc.get(passage.source_doc_id, 0) + 1
            by_doc_type[passage.source_doc_type] = (
                by_doc_type.get(passage.source_doc_type, 0) + 1
            )
        print(f"  by doc_type: {dict(sorted(by_doc_type.items(), key=lambda x: -x[1]))}")
        print(f"  unique source docs: {len(by_doc)} (max per doc: {max(by_doc.values(), default=0)})")

        # Save
        out_path = OUT_DIR / f"{module}_passages.json"
        out_path.write_text(
            json.dumps([passage.to_dict() for passage in passages], indent=2),
            encoding="utf-8",
        )
        print(f"  → {out_path.relative_to(ROOT)}")

        summary[module] = {
            "n_passages": len(passages),
            "by_doc_type": by_doc_type,
            "n_unique_docs": len(by_doc),
        }

    (OUT_DIR / "_summary.json").write_text(
        json.dumps(summary, indent=2), encoding="utf-8"
    )
    return 0


# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())