{
"version": "CPT-v3",
"model": "BioMistral/BioMistral-7B",
"gpu": "NVIDIA H200",
"vram_gb": 150.110011392,
"train_tokens": 2092364628,
"val_tokens": 10923190,
"train_notes": 2608855,
"val_notes": 13339,
"max_seq_len": 4096,
"effective_batch": 16,
"batch_size": 2,
"grad_accum": 8,
"lora_r": 16,
"lora_alpha": 32,
"lora_dropout": 0.1,
"trainable_params": 41943040,
"total_params": 7283675136,
"attention": "flash_attention_2",
"compute_dtype": "torch.bfloat16",
"epochs": 1,
"lr": 0.0002,
"best_val_ppl": 2.516552927049002,
"best_step": 900,
"final_val_ppl": 2.417028769703714,
"final_val_nll": 0.8825390047136883,
"total_steps": 900,
"total_tokens": 58982400,
"training_min": 154.6,
"avg_tok_per_sec": 6359.2,
"oom_skips": 0,
"paths": {
"best_model": "/mnt/biomistral/best_model",
"final_model": "/mnt/biomistral/final_model",
"checkpoints": "/mnt/biomistral/checkpoints",
"plots": "/mnt/biomistral/plots",
"logs": "/mnt/biomistral/logs",
"cache": "/mnt/biomistral/cache"
},
"v3_changes_vs_v2": [
"grad_accum 24\u21928 (3\u00d7 more optim steps per unit time)",
"max_optimizer_steps=900 (cosine fully anneals in 5h budget)",
"warmup_ratio 0.03\u21920.05 (faster warmup on short run)",
"eval_every 200\u219250, save_every 1000\u2192150 (tighter tracking)",
"wall-clock 22h\u21924.5h (hard guarantee w/ buffer for final eval)",
"early-stop patience 5\u21924 (bail faster)",
"kept from v2: batch=2, lora_r=16, dropout=0.10, flash-attn assert"
]
}