{ "batch_size": 128, "grad_accum_steps": 1, "epochs": 10, "lr": 0.0003, "betas": [ 0.9, 0.95 ], "weight_decay": 0.1, "warmup_steps": 500, "max_steps": null, "clip_grad": 1.0, "min_lr": 1e-06, "label_smoothing": 0.0, "mixed_precision": "bf16", "log_dir": "./runs/crystal_beeper", "log_interval": 50, "ckpt_dir": "./checkpoints_crystal", "export_dir": "./export_crystal", "resume": true, "resume_strict": false, "resume_tag": "best_model.safetensors", "hf_repo": "AbstractPhil/beeper-ascii-v1", "upload_to_hub": true, "add_bos_eos": true, "span_corrupt_frac": 0.0, "val_ratio": 0.01, "test_ratio": 0.01, "max_rows_per_dataset": null, "dataset_cache_verbose": true, "lambda_route": 0.2, "route_topk": 32, "lambda_geom": 0.3, "lambda_geom_angle": 0.8, "lambda_geom_var": 0.3, "lambda_geom_edge": 0.3, "lambda_geom_vol": 0.6, "lambda_geom_minrel": 1.0, "geom_min_edge_rel": 0.6, "geom_vol_lower_frac": 0.85, "geom_sample_classes": 64, "lambda_rose": 0.1, "rose_scale": 1.8, "contrast_warmup": 800, "pent_temp": 0.1, "lambda_contrast": 0.25, "punctuation": { "enable": true, "chars": [ ".", ",", ";", ":", "!", "?", "'", "\"", "(", ")", "[", "]", "{", "}", "-", "\u2014", "\u2026" ], "alpha_soft": 0.6, "hard_mask_gate": false, "apply_to_coarse_ids": "ALL" }, "harmony": { "apply": true, "system": "You are Crystal-Beeper, a helpful, honest, precise assistant.", "style": "concise" }, "stages": [ { "name": "bootstrap", "epochs": 1, "lambda_route": 0.05, "lambda_geom": 0.2, "gate_tau": 0.1, "punct_alpha": 0.6, "hard_mask_gate": false, "mix_sdpa": 1.0 }, { "name": "crystal_warmup", "epochs": 2, "lambda_route": 0.2, "lambda_geom": 0.3, "gate_tau": 0.08, "punct_alpha": 0.7, "hard_mask_gate": false, "mix_sdpa": 1.0 }, { "name": "dictionary_crystals", "epochs": 2, "lambda_route": 0.25, "lambda_geom": 0.35, "gate_tau": 0.06, "punct_alpha": 0.85, "hard_mask_gate": false, "mix_sdpa": 0.9 }, { "name": "stability_tuning", "epochs": 3, "lambda_route": 0.3, "lambda_geom": 0.4, "gate_tau": 0.05, "punct_alpha": 1.0, "hard_mask_gate": false, "mix_sdpa": 0.8 } ], "corpus": [ { "name": "TinyStories", "path": "roneneldan/TinyStories", "split": "train[30%:50%]", "weight": 0.1, "dialect": [ 0.6, 0.1, 0.05, 0.05, 0.2 ] }, { "name": "WikipediaEN", "path": "wikimedia/wikipedia", "config": "20231101.en", "split": "train[5%:15%]", "weight": 0.5, "dialect": [ 0.12, 0.58, 0.1, 0.1, 0.1 ] }, { "name": "AGNews", "path": "ag_news", "split": "train[:]", "weight": 0.1, "dialect": [ 0.2, 0.5, 0.1, 0.1, 0.1 ] }, { "name": "GSM8K", "path": "openai/gsm8k", "config": "main", "split": "train[40%:60%]", "weight": 0.6, "dialect": [ 0.1, 0.15, 0.5, 0.15, 0.1 ] }, { "name": "AI2-ARC-Easy", "path": "allenai/ai2_arc", "config": "ARC-Easy", "split": "train[30%:60%]", "weight": 0.6, "dialect": [ 0.05, 0.15, 0.4, 0.25, 0.15 ] }, { "name": "HH-RLHF", "path": "Anthropic/hh-rlhf", "split": "train[5%:10%]", "weight": 0.5, "dialect": [ 0.1, 0.25, 0.2, 0.25, 0.2 ] }, { "name": "SVAMP", "path": "ChilleD/SVAMP", "split": "train", "weight": 0.25, "dialect": [ 0.1, 0.15, 0.55, 0.15, 0.05 ] }, { "name": "MATH-500", "path": "HuggingFaceH4/MATH-500", "split": "test", "weight": 0.25, "dialect": [ 0.05, 0.15, 0.6, 0.15, 0.05 ] }, { "name": "SEP", "path": "AiresPucrs/stanford-encyclopedia-philosophy", "split": "train", "weight": 0.3, "dialect": [ 0.05, 0.45, 0.18, 0.22, 0.1 ] } ], "_alive_entries": [ { "name": "TinyStories", "path": "roneneldan/TinyStories", "split": "train[30%:50%]", "weight": 0.1, "dialect": [ 0.6000000238418579, 0.10000000149011612, 0.05000000074505806, 0.05000000074505806, 0.20000000298023224 ], "class_id": 0, "p": 0.03125000000000001 }, { "name": "WikipediaEN", "path": "wikimedia/wikipedia", "config": "20231101.en", "split": "train[5%:15%]", "weight": 0.5, "dialect": [ 0.11999999731779099, 0.5799999833106995, 0.10000000149011612, 0.10000000149011612, 0.10000000149011612 ], "class_id": 1, "p": 0.15625 }, { "name": "AGNews", "path": "ag_news", "split": "train[:]", "weight": 0.1, "dialect": [ 0.20000000298023224, 0.5, 0.10000000149011612, 0.10000000149011612, 0.10000000149011612 ], "class_id": 2, "p": 0.03125000000000001 }, { "name": "GSM8K", "path": "openai/gsm8k", "config": "main", "split": "train[40%:60%]", "weight": 0.6, "dialect": [ 0.10000000149011612, 0.15000000596046448, 0.5, 0.15000000596046448, 0.10000000149011612 ], "class_id": 3, "p": 0.1875 }, { "name": "AI2-ARC-Easy", "path": "allenai/ai2_arc", "config": "ARC-Easy", "split": "train[30%:60%]", "weight": 0.6, "dialect": [ 0.05000000074505806, 0.15000000596046448, 0.4000000059604645, 0.25, 0.15000000596046448 ], "class_id": 4, "p": 0.1875 }, { "name": "HH-RLHF", "path": "Anthropic/hh-rlhf", "split": "train[5%:10%]", "weight": 0.5, "dialect": [ 0.10000000149011612, 0.25, 0.20000000298023224, 0.25, 0.20000000298023224 ], "class_id": 5, "p": 0.15625 }, { "name": "SVAMP", "path": "ChilleD/SVAMP", "split": "train", "weight": 0.25, "dialect": [ 0.10000000149011612, 0.15000000596046448, 0.550000011920929, 0.15000000596046448, 0.05000000074505806 ], "class_id": 6, "p": 0.078125 }, { "name": "MATH-500", "path": "HuggingFaceH4/MATH-500", "split": "test", "weight": 0.25, "dialect": [ 0.05000000074505806, 0.15000000596046448, 0.6000000238418579, 0.15000000596046448, 0.05000000074505806 ], "class_id": 7, "p": 0.078125 }, { "name": "SEP", "path": "AiresPucrs/stanford-encyclopedia-philosophy", "split": "train", "weight": 0.3, "dialect": [ 0.05000000074505806, 0.44999998807907104, 0.18000000715255737, 0.2199999988079071, 0.10000000149011612 ], "class_id": 8, "p": 0.09375 } ] }