Spaces:

yugbirla
/

GraphResearcher

Sleeping

App Files Files Community

GraphResearcher / scripts /list_eval_candidate_chunks.py

yugbirla

Add evaluation framework, ablation tests, and lean README

7728916 7 days ago

Raw

History Blame Contribute Delete

3.7 kB

	"""
	List candidate chunks from processed documents for evaluation labeling.

	Helps you fill relevant_chunk_ids in QA evaluation files by showing:
	- document ID
	- chunk ID
	- page metadata
	- chunk text preview

	Usage:
	python scripts/list_eval_candidate_chunks.py --data-dir data/processed
	python scripts/list_eval_candidate_chunks.py --data-dir data/processed --document-id YOUR_ID --search "RAG"
	"""

	import argparse
	import json
	import os
	import sys
	from pathlib import Path
	from typing import Optional


	def load_chunks(processed_dir: str, document_id: str):
	"""Load chunks from a processed document directory."""
	doc_dir = Path(processed_dir) / document_id
	chunks_file = doc_dir / "chunks.json"

	if not chunks_file.exists():
	return None

	with open(chunks_file, "r", encoding="utf-8") as f:
	return json.load(f)


	def list_documents(processed_dir: str):
	"""List all document IDs in the processed directory."""
	base = Path(processed_dir)
	if not base.exists():
	return []
	return [d.name for d in base.iterdir() if d.is_dir() and (d / "chunks.json").exists()]


	def main():
	parser = argparse.ArgumentParser(description="List candidate chunks for eval labeling")
	parser.add_argument("--data-dir", default="data/processed", help="Processed data directory")
	parser.add_argument("--document-id", default=None, help="Specific document ID to inspect")
	parser.add_argument("--search", default=None, help="Filter chunks by text search term")
	parser.add_argument("--limit", type=int, default=50, help="Max chunks to show per document")
	parser.add_argument("--preview-length", type=int, default=200, help="Text preview character limit")
	args = parser.parse_args()

	docs = list_documents(args.data_dir)

	if not docs:
	print(f"No processed documents found in {args.data_dir}")
	sys.exit(0)

	if args.document_id:
	docs = [d for d in docs if d == args.document_id]
	if not docs:
	print(f"Document {args.document_id} not found. Available: {list_documents(args.data_dir)}")
	sys.exit(1)

	for doc_id in docs:
	chunks = load_chunks(args.data_dir, doc_id)
	if chunks is None:
	continue

	print(f"\n{'='*80}")
	print(f"Document: {doc_id}")
	print(f"Total chunks: {len(chunks)}")
	print(f"{'='*80}")

	shown = 0
	for chunk in chunks:
	if isinstance(chunk, dict):
	chunk_id = chunk.get("chunk_id", chunk.get("id", "unknown"))
	content = chunk.get("content", chunk.get("text", ""))
	page = chunk.get("page_number", "N/A")
	content_type = chunk.get("content_type", "text")
	else:
	chunk_id = getattr(chunk, "chunk_id", "unknown")
	content = getattr(chunk, "content", "")
	page = getattr(chunk, "page_number", "N/A")
	content_type = getattr(chunk, "content_type", "text")

	if args.search and args.search.lower() not in str(content).lower():
	continue

	preview = str(content)[:args.preview_length].replace("\n", " ").strip()
	print(f"\n chunk_id: {chunk_id}")
	print(f" page: {page}")
	print(f" content_type: {content_type}")
	print(f" preview: {preview}")

	shown += 1
	if shown >= args.limit:
	remaining = len(chunks) - shown
	if remaining > 0:
	print(f"\n ... {remaining} more chunks. Use --limit to see more.")
	break


	if __name__ == "__main__":
	main()