Spaces:
Running
Running
File size: 3,697 Bytes
"""
List candidate chunks from processed documents for evaluation labeling.
Helps you fill relevant_chunk_ids in QA evaluation files by showing:
- document ID
- chunk ID
- page metadata
- chunk text preview
Usage:
python scripts/list_eval_candidate_chunks.py --data-dir data/processed
python scripts/list_eval_candidate_chunks.py --data-dir data/processed --document-id YOUR_ID --search "RAG"
"""
import argparse
import json
import os
import sys
from pathlib import Path
from typing import Optional
def load_chunks(processed_dir: str, document_id: str):
    """Read the parsed ``chunks.json`` of one processed document.

    Args:
        processed_dir: Root directory that holds one sub-directory per document.
        document_id: Name of the document's sub-directory.

    Returns:
        The decoded JSON content of ``<processed_dir>/<document_id>/chunks.json``,
        or ``None`` when that file does not exist.
    """
    chunks_path = Path(processed_dir, document_id, "chunks.json")
    if chunks_path.exists():
        with chunks_path.open("r", encoding="utf-8") as handle:
            return json.loads(handle.read())
    return None
def list_documents(processed_dir: str):
    """List the document IDs found in the processed directory.

    A document is any immediate sub-directory of ``processed_dir`` that
    contains a ``chunks.json`` entry; its directory name is the document ID.

    Args:
        processed_dir: Root directory that holds one sub-directory per document.

    Returns:
        Alphabetically sorted list of document IDs. Empty when ``processed_dir``
        is missing or is not a directory.
    """
    base = Path(processed_dir)
    # is_dir() rather than exists(): a path that points at a regular file
    # would otherwise reach iterdir() and raise NotADirectoryError.
    if not base.is_dir():
        return []
    # iterdir() yields entries in arbitrary order; sort so output is stable
    # from run to run.
    return sorted(
        entry.name
        for entry in base.iterdir()
        if entry.is_dir() and (entry / "chunks.json").exists()
    )
def _chunk_fields(chunk):
    """Return ``(chunk_id, content, page, content_type)`` for one chunk record.

    Dict records are read by key (with ``id`` / ``text`` as fallback keys);
    any other object is read by attribute. Missing values fall back to
    ``"unknown"``, ``""``, ``"N/A"`` and ``"text"`` respectively.
    """
    if isinstance(chunk, dict):
        return (
            chunk.get("chunk_id", chunk.get("id", "unknown")),
            chunk.get("content", chunk.get("text", "")),
            chunk.get("page_number", "N/A"),
            chunk.get("content_type", "text"),
        )
    return (
        getattr(chunk, "chunk_id", "unknown"),
        getattr(chunk, "content", ""),
        getattr(chunk, "page_number", "N/A"),
        getattr(chunk, "content_type", "text"),
    )


def main():
    """Command-line entry point: print candidate chunks for eval labeling.

    Parses ``--data-dir``, ``--document-id``, ``--search``, ``--limit`` and
    ``--preview-length``, then prints, for each selected document, a header
    followed by the ID, page, content type and a text preview of each chunk.

    Exits with status 0 when the data directory holds no processed documents,
    and with status 1 when ``--document-id`` names a document that is not
    present.
    """
    parser = argparse.ArgumentParser(description="List candidate chunks for eval labeling")
    parser.add_argument("--data-dir", default="data/processed", help="Processed data directory")
    parser.add_argument("--document-id", default=None, help="Specific document ID to inspect")
    parser.add_argument("--search", default=None, help="Filter chunks by text search term")
    parser.add_argument("--limit", type=int, default=50, help="Max chunks to show per document")
    parser.add_argument("--preview-length", type=int, default=200, help="Text preview character limit")
    args = parser.parse_args()

    available = list_documents(args.data_dir)
    if not available:
        print(f"No processed documents found in {args.data_dir}")
        sys.exit(0)

    docs = available
    if args.document_id:
        docs = [d for d in available if d == args.document_id]
        if not docs:
            # Reuse the listing already fetched instead of scanning the disk again.
            print(f"Document {args.document_id} not found. Available: {available}")
            sys.exit(1)

    # Lower-case the search term once; an empty or absent term disables filtering.
    needle = args.search.lower() if args.search else None
    # Clamp so that a zero or negative limit shows nothing rather than one chunk.
    limit = max(args.limit, 0)

    for doc_id in docs:
        chunks = load_chunks(args.data_dir, doc_id)
        if chunks is None:
            continue
        print(f"\n{'='*80}")
        print(f"Document: {doc_id}")
        print(f"Total chunks: {len(chunks)}")
        print(f"{'='*80}")

        # Filter before applying the limit so the "more chunks" count below
        # covers only chunks that match the search term.
        matches = [
            fields
            for fields in map(_chunk_fields, chunks)
            if needle is None or needle in str(fields[1]).lower()
        ]

        for chunk_id, content, page, content_type in matches[:limit]:
            preview = str(content)[:args.preview_length].replace("\n", " ").strip()
            print(f"\n chunk_id: {chunk_id}")
            print(f" page: {page}")
            print(f" content_type: {content_type}")
            print(f" preview: {preview}")

        remaining = len(matches) - limit
        if remaining > 0:
            print(f"\n ... {remaining} more chunks. Use --limit to see more.")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|