Mistral-7B-v0.3-CoDA-GQA-L / training_config.json
anthonym21's picture
Mistral 7B v0.3 + CoDA-GQA-L: trained in two phases (unbounded + bounded)
a1acae5 verified
raw
history blame contribute delete
787 Bytes
{
"model": "mistralai/Mistral-7B-v0.3",
"dataset": "wikitext-103",
"max_steps": 2000,
"batch_size": 1,
"grad_accum": 8,
"seq_len": 2048,
"lr": 5e-05,
"lr_coda": 0.001,
"warmup_steps": 100,
"weight_decay": 0.01,
"freeze": "attention",
"eval_every": 500,
"save_every": 500,
"eval_tokens": 50000,
"eval_bounded_tokens": 20000,
"no_eval_bounded": false,
"head_norm_mode": "identity",
"theta_init": 0.0,
"bounded_steps": 2000,
"bounded_config": "medium",
"bounded_lr_scale": 0.5,
"bounded_block_size": 512,
"no_detach_evicted": false,
"dtype": "bf16",
"gradient_checkpointing": true,
"output_dir": "/workspace/runs/mistral7b",
"total_params": 7248023552,
"trainable_params": 1346438144,
"n_adapters": 32,
"eff_batch_tokens": 16384
}