| { | |
| "metadata": { | |
| "source_file": "data/processed/all_merged.jsonl", | |
| "output_directory": "data/processed" | |
| }, | |
| "summary": { | |
| "total": 13670, | |
| "gold": 82, | |
| "silver": 64, | |
| "bronze": 13524, | |
| "issues_distribution": { | |
| "language_mismatch_target": 7512, | |
| "misaligned_lengths": 3552, | |
| "language_mismatch_source": 6012 | |
| }, | |
| "tier_files": { | |
| "gold": "data/processed/data_gold_tier.jsonl", | |
| "silver": "data/processed/data_silver_tier.jsonl", | |
| "bronze": "data/processed/data_bronze_tier.jsonl" | |
| } | |
| }, | |
| "issues_distribution": { | |
| "language_mismatch_target": 7512, | |
| "misaligned_lengths": 3552, | |
| "language_mismatch_source": 6012 | |
| }, | |
| "recommendations": [ | |
| "\u26a0\ufe0f Low gold tier ratio (82/13670). Consider native speaker review for high-confidence pairs before training.", | |
| "\u26a0\ufe0f High language detection mismatches. Verify source language coding.", | |
| "\u26a0\ufe0f Many length misalignments. Check pair alignment in source data.", | |
| "\u2139\ufe0f Contains 13524 bronze-tier pairs for cultural grounding/instruction tuning only." | |
| ] | |
| } |
Xet Storage Details
- Size:
- 1.13 kB
- Xet hash:
- ccc5bcc769c9f6807c22e59596129b1b96e64673b29303c2811e0040a3a6da5b
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.