| { |
| "model_config": { |
| "tokenizer_id": "Qwen/Qwen3.5-2B-Base", |
| "hf_repo_id": "Efe2898/500m-dense-base", |
| "output_dir": "/kaggle/working/efe-500m-dense-base", |
| "vocab_size": null, |
| "hidden_size": 1024, |
| "intermediate_size": 4096, |
| "num_hidden_layers": 24, |
| "num_attention_heads": 16, |
| "num_key_value_heads": 4, |
| "max_position_embeddings": 65536, |
| "rope_theta": 1000000.0, |
| "rope_scaling": { |
| "type": "yarn", |
| "factor": 8.0, |
| "original_max_position_embeddings": 8192, |
| "rope_theta": 1000000.0, |
| "rope_type": "yarn" |
| }, |
| "use_sliding_window": true, |
| "sliding_window": 4096, |
| "max_window_layers": 20, |
| "rms_norm_eps": 1e-06, |
| "hidden_act": "silu", |
| "attention_dropout": 0.0, |
| "tie_word_embeddings": true, |
| "initializer_range": 0.02, |
| "torch_dtype": "bfloat16", |
| "push_to_hub": false, |
| "hf_token": null |
| }, |
| "training_ref": { |
| "mesh_shape": [ |
| 8 |
| ], |
| "mesh_axis_names": [ |
| "data" |
| ], |
| "attn_implementation": "eager", |
| "cpt_seq_len": 2048, |
| "cpt_global_batch": 256, |
| "cpt_grad_accum": 1, |
| "cpt_tokens_target": 10000000000, |
| "sft_seq_len": 4096, |
| "sft_global_batch": 128, |
| "sft_grad_accum": 1, |
| "optimizer": "Adafactor", |
| "lr": 0.0003, |
| "weight_decay": 0.01, |
| "clip_threshold": 1.0, |
| "relative_step": false, |
| "scale_parameter": false, |
| "warmup_steps": 500, |
| "lr_scheduler": "cosine", |
| "min_lr_ratio": 0.1, |
| "dtype": "bfloat16", |
| "gradient_checkpointing": true, |
| "log_every_n_steps": 10, |
| "save_every_n_steps": 500, |
| "eval_every_n_steps": 1000 |
| }, |
| "param_count": 618988544, |
| "param_count_M": 619.0, |
| "mlp_fraction": 0.8275, |
| "vocab_size": 248044, |
| "status": "untrained_blueprint", |
| "next_step": "CPT notebook" |
| } |