File size: 4,393 Bytes
7728916
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
"""
Build draft evaluation questions from processed chunks.

Generates candidate Q/A pairs from actual chunk content. These are DRAFTS
and MUST be reviewed and corrected before being used as gold evaluation data.

Output is written to eval/qa_50_draft_needs_review.jsonl by default (override with --output).

Usage:
  python scripts/build_eval_draft_from_chunks.py --data-dir data/processed
  python scripts/build_eval_draft_from_chunks.py --data-dir data/processed --document-id YOUR_ID
"""

import argparse
import json
import os
import re
import sys
from pathlib import Path
from typing import Any, Dict, List


def load_chunks(processed_dir: str, document_id: str) -> list:
    """Read the decoded ``chunks.json`` of one processed document.

    The file is looked up at ``<processed_dir>/<document_id>/chunks.json``.

    Args:
        processed_dir: Root directory holding one subdirectory per document.
        document_id: Name of the document's subdirectory.

    Returns:
        The parsed JSON content of the file, or an empty list when the file
        does not exist.
    """
    chunks_path = Path(processed_dir).joinpath(document_id, "chunks.json")
    if chunks_path.exists():
        return json.loads(chunks_path.read_text(encoding="utf-8"))
    return []


def list_documents(processed_dir: str) -> list:
    """Return the names of processed documents found under ``processed_dir``.

    A document is any immediate subdirectory that contains a ``chunks.json``
    file. Names are returned in sorted order: directory iteration order is
    arbitrary, so sorting gives callers that sample from the result the same
    output on every machine and run.

    Args:
        processed_dir: Root directory holding one subdirectory per document.

    Returns:
        Sorted list of document directory names; empty when ``processed_dir``
        is missing or is not a directory.
    """
    base = Path(processed_dir)
    # is_dir() (rather than exists()) also rejects a regular file, which
    # would otherwise make iterdir() raise NotADirectoryError.
    if not base.is_dir():
        return []
    return sorted(
        entry.name
        for entry in base.iterdir()
        if entry.is_dir() and (entry / "chunks.json").exists()
    )


def extract_key_terms(text: str) -> List[str]:
    """Extract up to five candidate key terms from chunk text.

    Candidates are runs of one to four capitalized words (each at least three
    letters long), followed by all-caps acronyms of two to six letters.
    Duplicates are dropped and terms of two characters or fewer are discarded.

    Terms come back in order of first appearance (phrases first, then
    acronyms). Deduplicating through a ``set`` would make the order, and
    therefore which five terms survive the cut, depend on string hash
    randomization and change from run to run.

    Args:
        text: Raw chunk text to scan.

    Returns:
        At most five distinct terms, each longer than two characters.
    """
    phrases = re.findall(r'[A-Z][a-zA-Z]{2,}(?:\s+[A-Z][a-zA-Z]{2,}){0,3}', text)
    acronyms = re.findall(r'\b[A-Z]{2,6}\b', text)
    # dict.fromkeys removes duplicates while keeping first-seen order.
    unique_terms = dict.fromkeys(phrases + acronyms)
    return [term for term in unique_terms if len(term) > 2][:5]


def generate_question_from_chunk(chunk: dict, chunk_index: int) -> Dict[str, Any]:
    """Generate a draft question record from a single chunk.

    The chunk text is read from the ``content`` key, falling back to ``text``.
    The question is built around the first key term found in the text; when
    there are no key terms it refers to the chunk's page number instead. The
    gold answer is only a placeholder made from the first sentence and is
    prefixed with ``DRAFT_NEEDS_REVIEW``.

    Args:
        chunk: Chunk mapping. Keys read: ``content``/``text``,
            ``chunk_id``/``id`` and ``page_number``.
        chunk_index: Zero-based position of the chunk in the selection; used
            for the question id and as the fallback chunk id.

    Returns:
        A dict with the keys ``id``, ``question``, ``gold_answer``,
        ``relevant_chunk_ids``, ``expected_terms``, ``difficulty``,
        ``_source_page`` and ``_source_preview``.
    """
    raw_content = chunk.get("content", chunk.get("text", ""))
    # A null or non-string payload must not crash draft generation; coerce it
    # to text so the string operations below are always valid.
    content = "" if raw_content is None else str(raw_content)
    chunk_id = chunk.get("chunk_id", chunk.get("id", f"chunk_{chunk_index}"))
    page = chunk.get("page_number", None)

    # Use the first sentence as the answer placeholder. re.split always
    # returns at least one element, so indexing [0] is safe even for "".
    first_sentence = re.split(r'(?<=[.!?])\s+', content.strip())[0]

    key_terms = extract_key_terms(content)
    if key_terms:
        main_term = key_terms[0]
        question = f"What does the document say about {main_term}?"
    else:
        question = f"What is discussed in the content on page {page or 'N/A'}?"

    return {
        "id": f"q{chunk_index + 1:03d}",
        "question": question,
        "gold_answer": f"DRAFT_NEEDS_REVIEW: {first_sentence[:200]}",
        "relevant_chunk_ids": [str(chunk_id)],
        "expected_terms": [t.lower() for t in key_terms[:4]],
        "difficulty": "medium",
        "_source_page": page,
        "_source_preview": content[:150],
    }


def _sample_evenly(items: list, limit: int) -> list:
    """Return at most ``limit`` items spread evenly across ``items``."""
    total = len(items)
    if total <= limit:
        return list(items)
    # Scaling each index by total/limit keeps the picks spread over the whole
    # list even when total is not a multiple of limit.
    return [items[i * total // limit] for i in range(limit)]


def main():
    """Command-line entry point: build draft questions and write them as JSONL.

    Flags:
        --data-dir: Root of the processed documents (default ``data/processed``).
        --document-id: Restrict the run to a single document.
        --max-questions: Upper bound on generated questions; must be >= 1.
        --output: Destination JSONL file.

    Exits with status 0 after printing a message when no matching processed
    document is found, and with an argparse usage error when
    ``--max-questions`` is not positive.
    """
    parser = argparse.ArgumentParser(description="Build draft eval questions from chunks")
    parser.add_argument("--data-dir", default="data/processed")
    parser.add_argument("--document-id", default=None)
    parser.add_argument("--max-questions", type=int, default=50)
    parser.add_argument("--output", default="eval/qa_50_draft_needs_review.jsonl")
    args = parser.parse_args()

    # Zero would divide by zero during sampling and a negative value would
    # silently produce a meaningless slice, so reject both up front.
    if args.max_questions < 1:
        parser.error("--max-questions must be a positive integer")

    docs = list_documents(args.data_dir)
    if args.document_id:
        docs = [d for d in docs if d == args.document_id]

    if not docs:
        print(f"No processed documents found in {args.data_dir}")
        sys.exit(0)

    all_chunks = []
    for doc_id in docs:
        for chunk in load_chunks(args.data_dir, doc_id):
            # Entries that are not mappings cannot be read with .get() below,
            # so skip them instead of crashing later.
            if not isinstance(chunk, dict):
                continue
            chunk["_document_id"] = doc_id
            all_chunks.append(chunk)

    print(f"Found {len(all_chunks)} chunks across {len(docs)} documents")

    # Keep only substantive chunks (at least 100 characters of content).
    good_chunks = [
        chunk
        for chunk in all_chunks
        if len(str(chunk.get("content", chunk.get("text", "")))) >= 100
    ]

    selected = _sample_evenly(good_chunks, args.max_questions)
    questions = [
        generate_question_from_chunk(chunk, index)
        for index, chunk in enumerate(selected)
    ]

    # dirname is "" for a bare filename, and os.makedirs("") raises, so only
    # create the directory when the output path actually has one.
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(args.output, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(q, ensure_ascii=False) + "\n" for q in questions)

    print(f"Wrote {len(questions)} draft questions to {args.output}")
    print("\nIMPORTANT: These are DRAFTS. Review and correct before using as gold evaluation data.")

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()