Spaces:
Sleeping
Sleeping
| """ | |
| Build draft evaluation questions from processed chunks. | |
| Generates candidate Q/A pairs from actual chunk content. These are DRAFTS | |
| and MUST be reviewed and corrected before being used as gold evaluation data. | |
| Output is written to: eval/qa_50_draft_needs_review.jsonl | |
| Usage: | |
| python scripts/build_eval_draft_from_chunks.py --data-dir data/processed | |
| python scripts/build_eval_draft_from_chunks.py --data-dir data/processed --document-id YOUR_ID | |
| """ | |
| import argparse | |
| import json | |
| import os | |
| import re | |
| import sys | |
| from pathlib import Path | |
| from typing import Any, Dict, List | |
def load_chunks(processed_dir: str, document_id: str) -> list:
    """Load the chunk list stored for one processed document.

    Reads ``<processed_dir>/<document_id>/chunks.json``.

    Args:
        processed_dir: Directory that holds one sub-directory per document.
        document_id: Name of the document's sub-directory.

    Returns:
        The parsed JSON list. An empty list is returned when the file does
        not exist or when its top-level JSON value is not a list.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    chunks_file = Path(processed_dir) / document_id / "chunks.json"
    if not chunks_file.exists():
        return []

    with open(chunks_file, "r", encoding="utf-8") as f:
        payload = json.load(f)

    # Honour the declared return type: callers iterate the result and expect
    # chunk objects, so a dict/null/scalar payload must not leak through.
    if not isinstance(payload, list):
        print(
            f"Warning: {chunks_file} does not contain a JSON list; ignoring it",
            file=sys.stderr,
        )
        return []
    return payload
def list_documents(processed_dir: str) -> list:
    """List the document IDs that have a ``chunks.json`` file.

    A document ID is the name of a sub-directory of ``processed_dir`` that
    contains a ``chunks.json`` file.

    Args:
        processed_dir: Directory that holds one sub-directory per document.

    Returns:
        Document IDs in sorted order, or an empty list when ``processed_dir``
        is missing or is not a directory.
    """
    base = Path(processed_dir)
    # is_dir() also rejects a regular file, which iterdir() would choke on.
    if not base.is_dir():
        return []
    # iterdir() yields entries in arbitrary order; sort so that anything
    # derived from this list is reproducible across runs and machines.
    return sorted(
        entry.name
        for entry in base.iterdir()
        if entry.is_dir() and (entry / "chunks.json").exists()
    )
def extract_key_terms(text: str) -> List[str]:
    """Extract up to five candidate key terms from chunk text.

    Candidates are runs of one to four capitalized words (each at least
    three letters long) and all-caps acronyms of two to six letters.

    Args:
        text: The chunk text to scan.

    Returns:
        At most five unique terms longer than two characters. Capitalized
        phrases come first in order of first appearance, followed by
        acronyms in order of first appearance.
    """
    phrases = re.findall(r'[A-Z][a-zA-Z]{2,}(?:\s+[A-Z][a-zA-Z]{2,}){0,3}', text)
    # Also get acronyms
    acronyms = re.findall(r'\b[A-Z]{2,6}\b', text)

    # De-duplicate with dict.fromkeys rather than set(): set ordering depends
    # on per-process string-hash randomization, which made both the five
    # returned terms and the "main" term vary from run to run.
    unique_terms = dict.fromkeys(phrases + acronyms)

    # The length filter only ever removes two-letter acronyms.
    return [term for term in unique_terms if len(term) > 2][:5]
def generate_question_from_chunk(chunk: dict, chunk_index: int) -> Dict[str, Any]:
    """Generate one draft Q/A record from a chunk.

    The question is templated around the first key term found in the chunk
    text, or around the page number when no key term is found. The draft
    answer is the first sentence of the chunk, prefixed with
    ``DRAFT_NEEDS_REVIEW:``.

    Args:
        chunk: Chunk mapping. Text is read from ``content`` (falling back to
            ``text``), the identifier from ``chunk_id`` (falling back to
            ``id``), and the page from ``page_number``.
        chunk_index: Zero-based position of the chunk among the selected
            chunks; used for the question ID and the fallback chunk ID.

    Returns:
        A dict with the keys ``id``, ``question``, ``gold_answer``,
        ``relevant_chunk_ids``, ``expected_terms``, ``difficulty``,
        ``_source_page`` and ``_source_preview``.
    """
    # Fall back to "text" when "content" is missing, null or empty, and coerce
    # to str so a null or non-string value cannot crash the string handling.
    content = str(chunk.get("content") or chunk.get("text") or "")

    # Use the first identifier that is actually set; a null "chunk_id" must
    # not end up as the literal string "None".
    chunk_id = chunk.get("chunk_id")
    if chunk_id is None:
        chunk_id = chunk.get("id")
    if chunk_id is None:
        chunk_id = f"chunk_{chunk_index}"

    page = chunk.get("page_number")

    # Try to form the draft answer from the first sentence. re.split always
    # returns at least one element, so no fallback is needed.
    first_sentence = re.split(r'(?<=[.!?])\s+', content.strip(), maxsplit=1)[0]

    key_terms = extract_key_terms(content)
    if key_terms:
        main_term = key_terms[0]
        question = f"What does the document say about {main_term}?"
    else:
        # Test against None explicitly so that page 0 is not shown as "N/A".
        page_label = page if page is not None else 'N/A'
        question = f"What is discussed in the content on page {page_label}?"

    return {
        "id": f"q{chunk_index + 1:03d}",
        "question": question,
        "gold_answer": f"DRAFT_NEEDS_REVIEW: {first_sentence[:200]}",
        "relevant_chunk_ids": [str(chunk_id)],
        "expected_terms": [t.lower() for t in key_terms[:4]],
        "difficulty": "medium",
        "_source_page": page,
        "_source_preview": content[:150],
    }
def _sample_evenly(items: List[Any], count: int) -> List[Any]:
    """Return up to ``count`` items spread evenly over ``items``, in order."""
    total = len(items)
    if total <= count:
        return list(items)
    # Indices i * total // count are strictly increasing and span the whole
    # list, unlike a fixed stride followed by truncation.
    return [items[i * total // count] for i in range(count)]


def main():
    """Command-line entry point: write draft eval questions as JSONL.

    Collects chunks from every processed document (or only the one named by
    ``--document-id``), keeps chunks whose text is at least 100 characters
    long, samples up to ``--max-questions`` of them evenly, and writes one
    draft question per line to ``--output``.

    Exits with status 0 after printing a message when no matching processed
    document is found, and with an argparse usage error when
    ``--max-questions`` is smaller than 1.
    """
    parser = argparse.ArgumentParser(description="Build draft eval questions from chunks")
    parser.add_argument("--data-dir", default="data/processed")
    parser.add_argument("--document-id", default=None)
    parser.add_argument("--max-questions", type=int, default=50)
    parser.add_argument("--output", default="eval/qa_50_draft_needs_review.jsonl")
    args = parser.parse_args()

    # Zero used to raise ZeroDivisionError and negative values silently sliced
    # from the end of the list; reject both up front.
    if args.max_questions < 1:
        parser.error("--max-questions must be a positive integer")

    docs = list_documents(args.data_dir)
    if args.document_id:
        docs = [d for d in docs if d == args.document_id]
    if not docs:
        print(f"No processed documents found in {args.data_dir}")
        sys.exit(0)

    all_chunks = []
    skipped = 0
    for doc_id in docs:
        loaded = load_chunks(args.data_dir, doc_id)
        if not isinstance(loaded, list):
            print(f"Skipping {doc_id}: chunks.json is not a JSON list", file=sys.stderr)
            continue
        for c in loaded:
            # Only chunk objects can be processed further; anything else
            # would fail on the .get() calls below.
            if not isinstance(c, dict):
                skipped += 1
                continue
            c["_document_id"] = doc_id
            all_chunks.append(c)
    print(f"Found {len(all_chunks)} chunks across {len(docs)} documents")
    if skipped:
        print(f"Skipped {skipped} chunk entries that are not JSON objects", file=sys.stderr)

    # Filter to substantive chunks
    good_chunks = [
        c for c in all_chunks
        if len(str(c.get("content", c.get("text", "")))) >= 100
    ]

    # Sample evenly
    selected = _sample_evenly(good_chunks, args.max_questions)

    questions = [generate_question_from_chunk(chunk, i) for i, chunk in enumerate(selected)]

    # dirname is "" for a bare filename, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(args.output, "w", encoding="utf-8") as f:
        for q in questions:
            f.write(json.dumps(q, ensure_ascii=False) + "\n")

    print(f"Wrote {len(questions)} draft questions to {args.output}")
    print("\nIMPORTANT: These are DRAFTS. Review and correct before using as gold evaluation data.")
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()