{
"version": "CPT-v3",
"model": "BioMistral/BioMistral-7B",
"gpu": "NVIDIA H200",
"vram_gb": 150.110011392,
"train_tokens": 2092364628,
"val_tokens": 10923190,
"train_notes": 2608855,
"val_notes": 13339,
"max_seq_len": 4096,
"effective_batch": 16,
"batch_size": 2,
"grad_accum": 8,
"lora_r": 16,
"lora_alpha": 32,
"lora_dropout": 0.1,
"trainable_params": 41943040,
"total_params": 7283675136,
"attention": "flash_attention_2",
"compute_dtype": "torch.bfloat16",
"epochs": 1,
"lr": 0.0002,
"best_val_ppl": 2.516552927049002,
"best_step": 900,
"final_val_ppl": 2.417028769703714,
"final_val_nll": 0.8825390047136883,
"total_steps": 900,
"total_tokens": 58982400,
"training_min": 154.6,
"avg_tok_per_sec": 6359.2,
"oom_skips": 0,
"paths": {
"best_model": "/mnt/biomistral/best_model",
"final_model": "/mnt/biomistral/final_model",
"checkpoints": "/mnt/biomistral/checkpoints",
"plots": "/mnt/biomistral/plots",
"logs": "/mnt/biomistral/logs",
"cache": "/mnt/biomistral/cache"
},
"v3_changes_vs_v2": [
"grad_accum 24\u21928 (3\u00d7 more optim steps per unit time)",
"max_optimizer_steps=900 (cosine fully anneals in 5h budget)",
"warmup_ratio 0.03\u21920.05 (faster warmup on short run)",
"eval_every 200\u219250, save_every 1000\u2192150 (tighter tracking)",
"wall-clock 22h\u21924.5h (hard guarantee w/ buffer for final eval)",
"early-stop patience 5\u21924 (bail faster)",
"kept from v2: batch=2, lora_r=16, dropout=0.10, flash-attn assert"
]
}