{ "version": "CPT-v3", "model": "BioMistral/BioMistral-7B", "gpu": "NVIDIA H200", "vram_gb": 150.110011392, "train_tokens": 2092364628, "val_tokens": 10923190, "train_notes": 2608855, "val_notes": 13339, "max_seq_len": 4096, "effective_batch": 16, "batch_size": 2, "grad_accum": 8, "lora_r": 16, "lora_alpha": 32, "lora_dropout": 0.1, "trainable_params": 41943040, "total_params": 7283675136, "attention": "flash_attention_2", "compute_dtype": "torch.bfloat16", "epochs": 1, "lr": 0.0002, "best_val_ppl": 2.516552927049002, "best_step": 900, "final_val_ppl": 2.417028769703714, "final_val_nll": 0.8825390047136883, "total_steps": 900, "total_tokens": 58982400, "training_min": 154.6, "avg_tok_per_sec": 6359.2, "oom_skips": 0, "paths": { "best_model": "/mnt/biomistral/best_model", "final_model": "/mnt/biomistral/final_model", "checkpoints": "/mnt/biomistral/checkpoints", "plots": "/mnt/biomistral/plots", "logs": "/mnt/biomistral/logs", "cache": "/mnt/biomistral/cache" }, "v3_changes_vs_v2": [ "grad_accum 24\u21928 (3\u00d7 more optim steps per unit time)", "max_optimizer_steps=900 (cosine fully anneals in 5h budget)", "warmup_ratio 0.03\u21920.05 (faster warmup on short run)", "eval_every 200\u219250, save_every 1000\u2192150 (tighter tracking)", "wall-clock 22h\u21924.5h (hard guarantee w/ buffer for final eval)", "early-stop patience 5\u21924 (bail faster)", "kept from v2: batch=2, lora_r=16, dropout=0.10, flash-attn assert" ] }