Buckets:
"""Sample N stratified MJHQ-30k prompts for the quant benchmark.

Writes outputs/eval/prompts.json = [{id, idx, prompt, category}] (idx = stable gen order + seed).
Usage: python3 scripts/31_prep_mjhq.py N [SEED]

Every category contributes the same quota, ``max(1, N // number_of_categories)``,
so the number of prompts written can be smaller than N when N is not a multiple
of the category count (or when a category holds fewer prompts than the quota).
"""
import json
import os
import random
import sys


def load_meta():
    """Download MJHQ-30K's ``meta_data.json`` from the Hugging Face Hub and parse it.

    Returns:
        The parsed JSON document. The rest of this script iterates it with
        ``.items()`` and reads the ``'category'`` and ``'prompt'`` keys of
        every value.
    """
    # Imported here rather than at module level so the pure sampling helpers
    # below stay importable on machines without huggingface_hub installed.
    from huggingface_hub import hf_hub_download

    path = hf_hub_download('playgroundai/MJHQ-30K', 'meta_data.json', repo_type='dataset')
    # Explicit encoding: JSON files are UTF-8, the platform default may not be.
    with open(path, encoding='utf-8') as fh:
        return json.load(fh)


def group_by_category(meta):
    """Group metadata entries by category.

    Args:
        meta: Mapping of prompt id -> record with ``'category'`` and
            ``'prompt'`` keys.

    Returns:
        Dict mapping each category to a list of ``(id, prompt)`` tuples, in
        the iteration order of ``meta``.
    """
    by = {}
    for key, record in meta.items():
        by.setdefault(record['category'], []).append((key, record['prompt']))
    return by


def sample_prompts(by, n, seed):
    """Draw an equal number of prompts from every category.

    Categories are visited in sorted order; each category's items are
    shuffled with one shared ``random.Random(seed)`` and the first ``percat``
    are kept. The combined selection is shuffled again, truncated to ``n``
    and numbered with a positional ``idx``.

    Args:
        by: Dict mapping category -> list of ``(id, prompt)`` tuples. The
            lists are copied before shuffling, so ``by`` is not modified.
        n: Requested number of prompts.
        seed: Seed for the random generator.

    Returns:
        ``(selection, percat)`` where ``selection`` is a list of
        ``{'id', 'prompt', 'category', 'idx'}`` dicts and ``percat`` is the
        per-category quota that was used (0 when ``by`` is empty).
    """
    if not by:
        # No categories: avoid dividing by zero when computing the quota.
        return [], 0

    rng = random.Random(seed)
    percat = max(1, n // len(by))
    selection = []
    for cat, items in sorted(by.items()):
        pool = list(items)
        rng.shuffle(pool)
        selection.extend(
            {'id': key, 'prompt': prompt, 'category': cat}
            for key, prompt in pool[:percat]
        )
    # Interleave the categories, then trim: when n is smaller than the number
    # of categories the quota of 1 per category still over-selects.
    rng.shuffle(selection)
    selection = selection[:n]
    for i, entry in enumerate(selection):
        entry['idx'] = i
    return selection, percat


def main(argv=None):
    """Run the script: parse ``N [SEED]``, sample prompts and write the JSON file.

    Args:
        argv: Argument vector in ``sys.argv`` layout (program name first).
            Defaults to ``sys.argv``. N defaults to 1000 and SEED to 0.
    """
    if argv is None:
        argv = sys.argv
    n = int(argv[1]) if len(argv) > 1 else 1000
    seed = int(argv[2]) if len(argv) > 2 else 0

    by = group_by_category(load_meta())
    sel, percat = sample_prompts(by, n, seed)

    os.makedirs('outputs/eval', exist_ok=True)
    # Context manager guarantees the file is flushed and closed before we
    # report success.
    with open('outputs/eval/prompts.json', 'w', encoding='utf-8') as fh:
        json.dump(sel, fh, indent=1)

    print(f"wrote {len(sel)} prompts -> outputs/eval/prompts.json ({percat}/cat across {len(by)} cats)")
    if sel:
        # Nothing to preview when the selection is empty (e.g. N == 0).
        print("sample:", sel[0]['category'], '::', sel[0]['prompt'][:90])


if __name__ == "__main__":
    main()
Xet Storage Details
- Size:
- 1.17 kB
- Xet hash:
- 898bb5cd7f0037b23b7d04c28991cb441a9897b2ddc18eb8d480ff40de978569
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.