# GraphResearcher / scripts / list_eval_candidate_chunks.py
# Commit 7728916 (yugbirla): Add evaluation framework, ablation tests, and lean README
# File size: 3.7 kB
"""
List candidate chunks from processed documents for evaluation labeling.
Helps you fill relevant_chunk_ids in QA evaluation files by showing:
- document ID
- chunk ID
- page metadata
- chunk text preview
Usage:
python scripts/list_eval_candidate_chunks.py --data-dir data/processed
python scripts/list_eval_candidate_chunks.py --data-dir data/processed --document-id YOUR_ID --search "RAG"
"""
import argparse
import json
import os
import sys
from pathlib import Path
from typing import Optional
def load_chunks(processed_dir: str, document_id: str):
    """Load the chunk records of one processed document.

    Reads ``<processed_dir>/<document_id>/chunks.json`` and returns the parsed
    JSON value unchanged.

    Args:
        processed_dir: Directory holding one sub-directory per document.
        document_id: Name of the document's sub-directory.

    Returns:
        The decoded contents of ``chunks.json``, or ``None`` when the file is
        missing, is not a regular file, or cannot be read or decoded. In the
        last case a warning is written to stderr.
    """
    chunks_file = Path(processed_dir) / document_id / "chunks.json"
    # is_file() also rejects a directory that happens to be named chunks.json,
    # which open() would otherwise fail on.
    if not chunks_file.is_file():
        return None
    try:
        with open(chunks_file, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses. Returning None lets a caller skip this document instead
        # of aborting on a single corrupt file; the warning keeps it visible.
        print(f"Warning: could not load {chunks_file}: {exc}", file=sys.stderr)
        return None
def list_documents(processed_dir: str):
    """Return the IDs of all processed documents under ``processed_dir``.

    A document is any immediate sub-directory that contains a ``chunks.json``
    entry; its ID is the sub-directory name.

    Args:
        processed_dir: Directory holding one sub-directory per document.

    Returns:
        Document IDs sorted alphabetically, or an empty list when
        ``processed_dir`` does not exist or is not a directory.
    """
    base = Path(processed_dir)
    # is_dir() covers both "missing" and "exists but is a regular file"; the
    # latter would make iterdir() raise NotADirectoryError.
    if not base.is_dir():
        return []
    # iterdir() yields entries in arbitrary order, so sort for stable output.
    return sorted(
        entry.name
        for entry in base.iterdir()
        if entry.is_dir() and (entry / "chunks.json").exists()
    )
def _chunk_fields(chunk):
    """Return ``(chunk_id, content, page, content_type)`` for one chunk record."""
    if isinstance(chunk, dict):
        return (
            chunk.get("chunk_id", chunk.get("id", "unknown")),
            chunk.get("content", chunk.get("text", "")),
            chunk.get("page_number", "N/A"),
            chunk.get("content_type", "text"),
        )
    # Non-dict records are read through attributes with the same fallbacks.
    return (
        getattr(chunk, "chunk_id", "unknown"),
        getattr(chunk, "content", ""),
        getattr(chunk, "page_number", "N/A"),
        getattr(chunk, "content_type", "text"),
    )


def main():
    """Print candidate chunks for evaluation labeling.

    Command-line options:
        --data-dir: processed data directory (default ``data/processed``).
        --document-id: restrict output to a single document.
        --search: keep only chunks whose text contains this term
            (case-insensitive).
        --limit: maximum number of chunks printed per document (default 50).
        --preview-length: maximum number of characters of chunk text shown
            (default 200).

    Exits with status 0 when no processed documents exist and with status 1
    when ``--document-id`` names a document that is not present.
    """
    parser = argparse.ArgumentParser(description="List candidate chunks for eval labeling")
    parser.add_argument("--data-dir", default="data/processed", help="Processed data directory")
    parser.add_argument("--document-id", default=None, help="Specific document ID to inspect")
    parser.add_argument("--search", default=None, help="Filter chunks by text search term")
    parser.add_argument("--limit", type=int, default=50, help="Max chunks to show per document")
    parser.add_argument("--preview-length", type=int, default=200, help="Text preview character limit")
    args = parser.parse_args()

    available = list_documents(args.data_dir)
    if not available:
        print(f"No processed documents found in {args.data_dir}")
        sys.exit(0)

    docs = available
    if args.document_id:
        docs = [d for d in available if d == args.document_id]
        if not docs:
            print(f"Document {args.document_id} not found. Available: {available}")
            sys.exit(1)

    # Clamp so that a zero or negative value means "show nothing" rather than
    # printing one chunk anyway or slicing from the end of the text.
    limit = max(args.limit, 0)
    preview_length = max(args.preview_length, 0)
    needle = args.search.lower() if args.search else None

    for doc_id in docs:
        chunks = load_chunks(args.data_dir, doc_id)
        if chunks is None:
            continue

        print(f"\n{'='*80}")
        print(f"Document: {doc_id}")
        print(f"Total chunks: {len(chunks)}")
        print(f"{'='*80}")

        rows = [_chunk_fields(chunk) for chunk in chunks]
        if needle is not None:
            # Filter before applying the limit so the "more chunks" count
            # below reflects matching chunks only.
            rows = [row for row in rows if needle in str(row[1]).lower()]

        for chunk_id, content, page, content_type in rows[:limit]:
            preview = str(content)[:preview_length].replace("\n", " ").strip()
            print(f"\n chunk_id: {chunk_id}")
            print(f" page: {page}")
            print(f" content_type: {content_type}")
            print(f" preview: {preview}")

        remaining = len(rows) - limit
        if remaining > 0:
            print(f"\n ... {remaining} more chunks. Use --limit to see more.")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()