picochat / config.json
BarryFutureman's picture
Upload config.json with huggingface_hub
7f67447 verified
{
"step": 2165,
"val_bpb": 0.8475956669593342,
"model_config": {
"sequence_len": 512,
"vocab_size": 32768,
"n_layer": 18,
"n_head": 9,
"n_kv_head": 9,
"n_embd": 1152,
"window_pattern": "L",
"use_mla": false,
"kv_lora_rank": 512,
"qk_lora_rank": 1536,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"use_ssa": true
},
"user_config": {
"run": "nanochat",
"device_type": "",
"fp8": false,
"fp8_recipe": "tensorwise",
"depth": 18,
"aspect_ratio": 64,
"head_dim": 128,
"max_seq_len": 512,
"window_pattern": "L",
"no_muon": false,
"mla": false,
"ssa": true,
"kv_lora_rank": 512,
"qk_lora_rank": 1536,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_iterations": -1,
"target_flops": -1.0,
"target_param_data_ratio": 7.0,
"device_batch_size": 128,
"total_batch_size": -1,
"embedding_lr": 0.3,
"unembedding_lr": 0.004,
"weight_decay": 0.2,
"matrix_lr": 0.02,
"adam_beta1": 0.8,
"adam_beta2": 0.95,
"warmup_ratio": 0.0,
"warmdown_ratio": 0.5,
"final_lr_frac": 0.0,
"resume_from_step": -1,
"eval_every": 250,
"eval_tokens": 20971520,
"core_metric_every": 2000,
"core_metric_max_per_task": 500,
"sample_every": 2000,
"save_every": -1,
"model_tag": "picochat",
"load_weights_from": null,
"load_weights_step": null
},
"device_batch_size": 128,
"max_seq_len": 512,
"total_batch_size": 1048576,
"dataloader_state_dict": {
"pq_idx": 104,
"rg_idx": 47,
"epoch": 1
},
"loop_state": {
"min_val_bpb": 0.8475956669593342,
"smooth_train_loss": 2.8003673421570308,
"total_training_time": 26251.267766714096
}
}