500m-dense-base / blueprint.json
Efe2898's picture
Upload blueprint.json with huggingface_hub
0a09f47 verified
{
  "model_config": {
    "tokenizer_id": "Qwen/Qwen3.5-2B-Base",
    "hf_repo_id": "Efe2898/500m-dense-base",
    "output_dir": "/kaggle/working/efe-500m-dense-base",
    "vocab_size": null,
    "hidden_size": 1024,
    "intermediate_size": 4096,
    "num_hidden_layers": 24,
    "num_attention_heads": 16,
    "num_key_value_heads": 4,
    "max_position_embeddings": 65536,
    "rope_theta": 1000000.0,
    "rope_scaling": {
      "type": "yarn",
      "factor": 8.0,
      "original_max_position_embeddings": 8192,
      "rope_theta": 1000000.0,
      "rope_type": "yarn"
    },
    "use_sliding_window": true,
    "sliding_window": 4096,
    "max_window_layers": 20,
    "rms_norm_eps": 1e-06,
    "hidden_act": "silu",
    "attention_dropout": 0.0,
    "tie_word_embeddings": true,
    "initializer_range": 0.02,
    "torch_dtype": "bfloat16",
    "push_to_hub": false,
    "hf_token": null
  },
  "training_ref": {
    "mesh_shape": [
      8
    ],
    "mesh_axis_names": [
      "data"
    ],
    "attn_implementation": "eager",
    "cpt_seq_len": 2048,
    "cpt_global_batch": 256,
    "cpt_grad_accum": 1,
    "cpt_tokens_target": 10000000000,
    "sft_seq_len": 4096,
    "sft_global_batch": 128,
    "sft_grad_accum": 1,
    "optimizer": "Adafactor",
    "lr": 0.0003,
    "weight_decay": 0.01,
    "clip_threshold": 1.0,
    "relative_step": false,
    "scale_parameter": false,
    "warmup_steps": 500,
    "lr_scheduler": "cosine",
    "min_lr_ratio": 0.1,
    "dtype": "bfloat16",
    "gradient_checkpointing": true,
    "log_every_n_steps": 10,
    "save_every_n_steps": 500,
    "eval_every_n_steps": 1000
  },
  "param_count": 618988544,
  "param_count_M": 619.0,
  "mlp_fraction": 0.8275,
  "vocab_size": 248044,
  "status": "untrained_blueprint",
  "next_step": "CPT notebook"
}