"""
Build draft evaluation questions from processed chunks.
Generates candidate Q/A pairs from actual chunk content. These are DRAFTS
and MUST be reviewed and corrected before being used as gold evaluation data.
Output is written to: eval/qa_50_draft_needs_review.jsonl
Usage:
python scripts/build_eval_draft_from_chunks.py --data-dir data/processed
python scripts/build_eval_draft_from_chunks.py --data-dir data/processed --document-id YOUR_ID
"""
import argparse
import json
import os
import re
import sys
from pathlib import Path
from typing import Any, Dict, List
def load_chunks(processed_dir: str, document_id: str) -> list:
    """Read the stored chunks of one processed document.

    Looks for ``<processed_dir>/<document_id>/chunks.json`` and returns its
    decoded JSON content. Returns an empty list when that file does not exist.
    """
    chunks_path = Path(processed_dir).joinpath(document_id, "chunks.json")
    if chunks_path.exists():
        with chunks_path.open("r", encoding="utf-8") as handle:
            return json.load(handle)
    return []
def list_documents(processed_dir: str) -> list:
    """Return the names of processed documents found under ``processed_dir``.

    A document is any direct sub-directory that contains a ``chunks.json``
    file. Names are returned sorted so that repeated runs see the documents
    in the same order.

    Returns an empty list when ``processed_dir`` is missing or is not a
    directory.
    """
    base = Path(processed_dir)
    # is_dir() also rejects a path that exists but is a regular file, for
    # which iterdir() would raise NotADirectoryError.
    if not base.is_dir():
        return []
    # iterdir() yields entries in arbitrary, filesystem-dependent order.
    return sorted(
        entry.name
        for entry in base.iterdir()
        if entry.is_dir() and (entry / "chunks.json").exists()
    )
def extract_key_terms(text: str) -> List[str]:
    """Extract up to five candidate key terms from chunk text.

    Candidates are runs of one to four capitalised words (each at least
    three letters long) followed by all-caps acronyms of two to six letters.
    Duplicates are removed, terms of two characters or fewer are discarded,
    and the first five survivors are returned.

    The result is deterministic: terms keep the order in which they were
    collected (capitalised phrases in text order, then acronyms in text
    order).
    """
    phrases = re.findall(r'[A-Z][a-zA-Z]{2,}(?:\s+[A-Z][a-zA-Z]{2,}){0,3}', text)
    # Also get acronyms
    acronyms = re.findall(r'\b[A-Z]{2,6}\b', text)
    # dict.fromkeys de-duplicates while keeping insertion order; going through
    # set() made the surviving five terms depend on string hash randomisation
    # and therefore change from one interpreter run to the next.
    unique_terms = dict.fromkeys(phrases + acronyms)
    return [term for term in unique_terms if len(term) > 2][:5]
def generate_question_from_chunk(chunk: dict, chunk_index: int) -> Dict[str, Any]:
    """Generate a draft question record from a chunk.

    Args:
        chunk: Chunk dictionary. The text is read from ``content`` (falling
            back to ``text``), the identifier from ``chunk_id`` (falling back
            to ``id``, then to ``chunk_<chunk_index>``), and the page from
            ``page_number``.
        chunk_index: Zero-based position of the chunk in the selection; used
            to build the question id (``q001``, ``q002``, ...).

    Returns:
        A dictionary with the draft question, a gold answer prefixed with
        ``DRAFT_NEEDS_REVIEW:``, the source chunk id, up to four lower-cased
        expected terms, a fixed ``medium`` difficulty, and the source page
        and a 150-character preview for the reviewer.
    """
    # Fall back to "text" when "content" is missing or empty, and coerce
    # non-string payloads so the string operations below cannot fail.
    raw_content = chunk.get("content") or chunk.get("text") or ""
    content = raw_content if isinstance(raw_content, str) else str(raw_content)
    chunk_id = chunk.get("chunk_id", chunk.get("id", f"chunk_{chunk_index}"))
    page = chunk.get("page_number")

    # Try to form the answer from the first sentence. re.split always returns
    # at least one element, so indexing [0] is safe even for empty content.
    first_sentence = re.split(r'(?<=[.!?])\s+', content.strip(), maxsplit=1)[0]

    key_terms = extract_key_terms(content)
    if key_terms:
        main_term = key_terms[0]
        question = f"What does the document say about {main_term}?"
    else:
        # Only a missing page becomes "N/A"; page 0 is reported as 0.
        page_label = page if page is not None else 'N/A'
        question = f"What is discussed in the content on page {page_label}?"

    return {
        "id": f"q{chunk_index + 1:03d}",
        "question": question,
        "gold_answer": f"DRAFT_NEEDS_REVIEW: {first_sentence[:200]}",
        "relevant_chunk_ids": [str(chunk_id)],
        "expected_terms": [t.lower() for t in key_terms[:4]],
        "difficulty": "medium",
        "_source_page": page,
        "_source_preview": content[:150],
    }
def _sample_evenly(items: list, limit: int) -> list:
    """Return at most ``limit`` items spread evenly over ``items``, in order."""
    total = len(items)
    if limit <= 0:
        return []
    if total <= limit:
        return list(items)
    # Integer spacing covers the whole list, so the tail is still reached
    # when ``total`` is not a multiple of ``limit``. Indices are strictly
    # increasing because total > limit here.
    return [items[i * total // limit] for i in range(limit)]


def main():
    """Command-line entry point: write draft eval questions as JSON Lines.

    Steps:
      1. Parse ``--data-dir``, ``--document-id``, ``--max-questions`` and
         ``--output``.
      2. Collect the dictionary chunks of every processed document (or only
         the requested one), tagging each with ``_document_id``.
      3. Keep chunks whose text is at least 100 characters long and pick up
         to ``--max-questions`` of them, evenly spaced.
      4. Generate one draft question per selected chunk and write one JSON
         object per line to ``--output``.

    Exits with status 0 after printing a message when no documents are
    found, and with an argparse usage error when ``--max-questions`` is not
    positive.
    """
    parser = argparse.ArgumentParser(description="Build draft eval questions from chunks")
    parser.add_argument("--data-dir", default="data/processed")
    parser.add_argument("--document-id", default=None)
    parser.add_argument("--max-questions", type=int, default=50)
    parser.add_argument("--output", default="eval/qa_50_draft_needs_review.jsonl")
    args = parser.parse_args()

    # A zero limit used to raise ZeroDivisionError in the sampling step and a
    # negative one silently mis-sliced the list; reject both up front.
    if args.max_questions < 1:
        parser.error("--max-questions must be a positive integer")

    docs = list_documents(args.data_dir)
    if args.document_id:
        docs = [d for d in docs if d == args.document_id]
    if not docs:
        print(f"No processed documents found in {args.data_dir}")
        sys.exit(0)

    all_chunks = []
    for doc_id in docs:
        for c in load_chunks(args.data_dir, doc_id):
            # Only dictionary chunks can be tagged and queried with .get().
            if isinstance(c, dict):
                c["_document_id"] = doc_id
                all_chunks.append(c)
    print(f"Found {len(all_chunks)} chunks across {len(docs)} documents")

    # Filter to substantive chunks
    good_chunks = [
        c for c in all_chunks
        if len(str(c.get("content", c.get("text", "")))) >= 100
    ]

    # Sample evenly
    selected = _sample_evenly(good_chunks, args.max_questions)
    questions = [generate_question_from_chunk(chunk, i) for i, chunk in enumerate(selected)]

    # Path.parent of a bare filename is ".", so this also works when --output
    # has no directory part (os.makedirs("") raises FileNotFoundError).
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as f:
        for q in questions:
            f.write(json.dumps(q, ensure_ascii=False) + "\n")

    print(f"Wrote {len(questions)} draft questions to {args.output}")
    print("\nIMPORTANT: These are DRAFTS. Review and correct before using as gold evaluation data.")


if __name__ == "__main__":
    main()
|