""" List candidate chunks from processed documents for evaluation labeling. Helps you fill relevant_chunk_ids in QA evaluation files by showing: - document ID - chunk ID - page metadata - chunk text preview Usage: python scripts/list_eval_candidate_chunks.py --data-dir data/processed python scripts/list_eval_candidate_chunks.py --data-dir data/processed --document-id YOUR_ID --search "RAG" """ import argparse import json import os import sys from pathlib import Path from typing import Optional def load_chunks(processed_dir: str, document_id: str): """Load chunks from a processed document directory.""" doc_dir = Path(processed_dir) / document_id chunks_file = doc_dir / "chunks.json" if not chunks_file.exists(): return None with open(chunks_file, "r", encoding="utf-8") as f: return json.load(f) def list_documents(processed_dir: str): """List all document IDs in the processed directory.""" base = Path(processed_dir) if not base.exists(): return [] return [d.name for d in base.iterdir() if d.is_dir() and (d / "chunks.json").exists()] def main(): parser = argparse.ArgumentParser(description="List candidate chunks for eval labeling") parser.add_argument("--data-dir", default="data/processed", help="Processed data directory") parser.add_argument("--document-id", default=None, help="Specific document ID to inspect") parser.add_argument("--search", default=None, help="Filter chunks by text search term") parser.add_argument("--limit", type=int, default=50, help="Max chunks to show per document") parser.add_argument("--preview-length", type=int, default=200, help="Text preview character limit") args = parser.parse_args() docs = list_documents(args.data_dir) if not docs: print(f"No processed documents found in {args.data_dir}") sys.exit(0) if args.document_id: docs = [d for d in docs if d == args.document_id] if not docs: print(f"Document {args.document_id} not found. Available: {list_documents(args.data_dir)}") sys.exit(1) for doc_id in docs: chunks = load_chunks(args.data_dir, doc_id) if chunks is None: continue print(f"\n{'='*80}") print(f"Document: {doc_id}") print(f"Total chunks: {len(chunks)}") print(f"{'='*80}") shown = 0 for chunk in chunks: if isinstance(chunk, dict): chunk_id = chunk.get("chunk_id", chunk.get("id", "unknown")) content = chunk.get("content", chunk.get("text", "")) page = chunk.get("page_number", "N/A") content_type = chunk.get("content_type", "text") else: chunk_id = getattr(chunk, "chunk_id", "unknown") content = getattr(chunk, "content", "") page = getattr(chunk, "page_number", "N/A") content_type = getattr(chunk, "content_type", "text") if args.search and args.search.lower() not in str(content).lower(): continue preview = str(content)[:args.preview_length].replace("\n", " ").strip() print(f"\n chunk_id: {chunk_id}") print(f" page: {page}") print(f" content_type: {content_type}") print(f" preview: {preview}") shown += 1 if shown >= args.limit: remaining = len(chunks) - shown if remaining > 0: print(f"\n ... {remaining} more chunks. Use --limit to see more.") break if __name__ == "__main__": main()