Efe2898
/

500m-dense-base

Model card Files Files and versions

500m-dense-base / blueprint.json

Efe2898's picture

Upload blueprint.json with huggingface_hub

0a09f47 verified 26 days ago

history blame contribute delete

1.8 kB

	{
	"model_config": {
	"tokenizer_id": "Qwen/Qwen3.5-2B-Base",
	"hf_repo_id": "Efe2898/500m-dense-base",
	"output_dir": "/kaggle/working/efe-500m-dense-base",
	"vocab_size": null,
	"hidden_size": 1024,
	"intermediate_size": 4096,
	"num_hidden_layers": 24,
	"num_attention_heads": 16,
	"num_key_value_heads": 4,
	"max_position_embeddings": 65536,
	"rope_theta": 1000000.0,
	"rope_scaling": {
	"type": "yarn",
	"factor": 8.0,
	"original_max_position_embeddings": 8192,
	"rope_theta": 1000000.0,
	"rope_type": "yarn"
	},
	"use_sliding_window": true,
	"sliding_window": 4096,
	"max_window_layers": 20,
	"rms_norm_eps": 1e-06,
	"hidden_act": "silu",
	"attention_dropout": 0.0,
	"tie_word_embeddings": true,
	"initializer_range": 0.02,
	"torch_dtype": "bfloat16",
	"push_to_hub": false,
	"hf_token": null
	},
	"training_ref": {
	"mesh_shape": [
	8
	],
	"mesh_axis_names": [
	"data"
	],
	"attn_implementation": "eager",
	"cpt_seq_len": 2048,
	"cpt_global_batch": 256,
	"cpt_grad_accum": 1,
	"cpt_tokens_target": 10000000000,
	"sft_seq_len": 4096,
	"sft_global_batch": 128,
	"sft_grad_accum": 1,
	"optimizer": "Adafactor",
	"lr": 0.0003,
	"weight_decay": 0.01,
	"clip_threshold": 1.0,
	"relative_step": false,
	"scale_parameter": false,
	"warmup_steps": 500,
	"lr_scheduler": "cosine",
	"min_lr_ratio": 0.1,
	"dtype": "bfloat16",
	"gradient_checkpointing": true,
	"log_every_n_steps": 10,
	"save_every_n_steps": 500,
	"eval_every_n_steps": 1000
	},
	"param_count": 618988544,
	"param_count_M": 619.0,
	"mlp_fraction": 0.8275,
	"vocab_size": 248044,
	"status": "untrained_blueprint",
	"next_step": "CPT notebook"
	}