| { |
| "version": "CPT-v3", |
| "model": "BioMistral/BioMistral-7B", |
| "gpu": "NVIDIA H200", |
| "vram_gb": 150.110011392, |
| "train_tokens": 2092364628, |
| "val_tokens": 10923190, |
| "train_notes": 2608855, |
| "val_notes": 13339, |
| "max_seq_len": 4096, |
| "effective_batch": 16, |
| "batch_size": 2, |
| "grad_accum": 8, |
| "lora_r": 16, |
| "lora_alpha": 32, |
| "lora_dropout": 0.1, |
| "trainable_params": 41943040, |
| "total_params": 7283675136, |
| "attention": "flash_attention_2", |
| "compute_dtype": "torch.bfloat16", |
| "epochs": 1, |
| "lr": 0.0002, |
| "best_val_ppl": 2.516552927049002, |
| "best_step": 900, |
| "final_val_ppl": 2.417028769703714, |
| "final_val_nll": 0.8825390047136883, |
| "total_steps": 900, |
| "total_tokens": 58982400, |
| "training_min": 154.6, |
| "avg_tok_per_sec": 6359.2, |
| "oom_skips": 0, |
| "paths": { |
| "best_model": "/mnt/biomistral/best_model", |
| "final_model": "/mnt/biomistral/final_model", |
| "checkpoints": "/mnt/biomistral/checkpoints", |
| "plots": "/mnt/biomistral/plots", |
| "logs": "/mnt/biomistral/logs", |
| "cache": "/mnt/biomistral/cache" |
| }, |
| "v3_changes_vs_v2": [ |
| "grad_accum 24\u21928 (3\u00d7 more optim steps per unit time)", |
| "max_optimizer_steps=900 (cosine fully anneals in 5h budget)", |
| "warmup_ratio 0.03\u21920.05 (faster warmup on short run)", |
| "eval_every 200\u219250, save_every 1000\u2192150 (tighter tracking)", |
| "wall-clock 22h\u21924.5h (hard guarantee w/ buffer for final eval)", |
| "early-stop patience 5\u21924 (bail faster)", |
| "kept from v2: batch=2, lora_r=16, dropout=0.10, flash-attn assert" |
| ] |
| } |