{
  "version": "CPT-v3",
  "model": "BioMistral/BioMistral-7B",
  "gpu": "NVIDIA H200",
  "vram_gb": 150.110011392,
  "train_tokens": 2092364628,
  "val_tokens": 10923190,
  "train_notes": 2608855,
  "val_notes": 13339,
  "max_seq_len": 4096,
  "effective_batch": 16,
  "batch_size": 2,
  "grad_accum": 8,
  "lora_r": 16,
  "lora_alpha": 32,
  "lora_dropout": 0.1,
  "trainable_params": 41943040,
  "total_params": 7283675136,
  "attention": "flash_attention_2",
  "compute_dtype": "torch.bfloat16",
  "epochs": 1,
  "lr": 0.0002,
  "best_val_ppl": 2.516552927049002,
  "best_step": 900,
  "final_val_ppl": 2.417028769703714,
  "final_val_nll": 0.8825390047136883,
  "total_steps": 900,
  "total_tokens": 58982400,
  "training_min": 154.6,
  "avg_tok_per_sec": 6359.2,
  "oom_skips": 0,
  "paths": {
    "best_model": "/mnt/biomistral/best_model",
    "final_model": "/mnt/biomistral/final_model",
    "checkpoints": "/mnt/biomistral/checkpoints",
    "plots": "/mnt/biomistral/plots",
    "logs": "/mnt/biomistral/logs",
    "cache": "/mnt/biomistral/cache"
  },
  "v3_changes_vs_v2": [
    "grad_accum 24\u21928 (3\u00d7 more optim steps per unit time)",
    "max_optimizer_steps=900 (cosine fully anneals in 5h budget)",
    "warmup_ratio 0.03\u21920.05 (faster warmup on short run)",
    "eval_every 200\u219250, save_every 1000\u2192150 (tighter tracking)",
    "wall-clock 22h\u21924.5h (hard guarantee w/ buffer for final eval)",
    "early-stop patience 5\u21924 (bail faster)",
    "kept from v2: batch=2, lora_r=16, dropout=0.10, flash-attn assert"
  ]
}