Spaces:
Running
Running
| """Sample-data helper. | |
| This repository ships with sample data already at | |
| ``data/samples/custom_rag_examples.jsonl``. This script verifies the file | |
| exists and reports basic statistics. We deliberately do not fetch external | |
| datasets here, because each requires conversion to the universal schema. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import sys | |
| from pathlib import Path | |
SAMPLE_PATH = Path("data/samples/custom_rag_examples.jsonl")


def main() -> int:
    """Verify the bundled sample file and print basic statistics.

    ``SAMPLE_PATH`` is read as JSON Lines: every non-blank line must hold one
    JSON object. The function counts the objects and sums the lengths of each
    object's ``gold_claim_labels`` and ``contexts`` values (a missing or
    falsy value contributes zero).

    Returns:
        0 when the file was read and the statistics were printed; 1 when the
        file is missing or a line is not a valid JSON object. Problems are
        reported on stderr.
    """
    # is_file() rather than exists(): a directory at this path would pass an
    # existence check and then fail inside open().
    if not SAMPLE_PATH.is_file():
        print(f"ERROR: sample file not found at {SAMPLE_PATH}.", file=sys.stderr)
        print("Re-clone the repository or restore the file from git.", file=sys.stderr)
        return 1

    n_examples = 0
    n_claims = 0
    n_contexts = 0
    with open(SAMPLE_PATH, "r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Blank lines are tolerated and do not count as examples.
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as exc:
                # Report the offending line instead of dying with a traceback.
                print(
                    f"ERROR: {SAMPLE_PATH}:{lineno}: invalid JSON ({exc.msg}).",
                    file=sys.stderr,
                )
                return 1
            if not isinstance(obj, dict):
                # The .get() lookups below need a JSON object per line.
                print(
                    f"ERROR: {SAMPLE_PATH}:{lineno}: expected a JSON object, "
                    f"got {type(obj).__name__}.",
                    file=sys.stderr,
                )
                return 1
            n_examples += 1
            # "or []" treats both a missing key and an explicit null as empty.
            n_claims += len(obj.get("gold_claim_labels") or [])
            n_contexts += len(obj.get("contexts") or [])

    print(f"Sample file: {SAMPLE_PATH}")
    print(f" examples: {n_examples}")
    print(f" total claims: {n_claims}")
    print(f" total contexts: {n_contexts}")
    print("\nAll sample examples are clearly marked as synthetic.")
    return 0
# Script entry point: run the check only when executed directly, and hand
# main()'s return value to the interpreter as the process exit status.
if __name__ == "__main__":
    sys.exit(main())