Efe2898 committed on
Commit
0a09f47
·
verified ·
1 Parent(s): c671e02

Upload blueprint.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. blueprint.json +69 -0
blueprint.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_config": {
3
+ "tokenizer_id": "Qwen/Qwen3.5-2B-Base",
4
+ "hf_repo_id": "Efe2898/500m-dense-base",
5
+ "output_dir": "/kaggle/working/efe-500m-dense-base",
6
+ "vocab_size": null,
7
+ "hidden_size": 1024,
8
+ "intermediate_size": 4096,
9
+ "num_hidden_layers": 24,
10
+ "num_attention_heads": 16,
11
+ "num_key_value_heads": 4,
12
+ "max_position_embeddings": 65536,
13
+ "rope_theta": 1000000.0,
14
+ "rope_scaling": {
15
+ "type": "yarn",
16
+ "factor": 8.0,
17
+ "original_max_position_embeddings": 8192,
18
+ "rope_theta": 1000000.0,
19
+ "rope_type": "yarn"
20
+ },
21
+ "use_sliding_window": true,
22
+ "sliding_window": 4096,
23
+ "max_window_layers": 20,
24
+ "rms_norm_eps": 1e-06,
25
+ "hidden_act": "silu",
26
+ "attention_dropout": 0.0,
27
+ "tie_word_embeddings": true,
28
+ "initializer_range": 0.02,
29
+ "torch_dtype": "bfloat16",
30
+ "push_to_hub": false,
31
+ "hf_token": null
32
+ },
33
+ "training_ref": {
34
+ "mesh_shape": [
35
+ 8
36
+ ],
37
+ "mesh_axis_names": [
38
+ "data"
39
+ ],
40
+ "attn_implementation": "eager",
41
+ "cpt_seq_len": 2048,
42
+ "cpt_global_batch": 256,
43
+ "cpt_grad_accum": 1,
44
+ "cpt_tokens_target": 10000000000,
45
+ "sft_seq_len": 4096,
46
+ "sft_global_batch": 128,
47
+ "sft_grad_accum": 1,
48
+ "optimizer": "Adafactor",
49
+ "lr": 0.0003,
50
+ "weight_decay": 0.01,
51
+ "clip_threshold": 1.0,
52
+ "relative_step": false,
53
+ "scale_parameter": false,
54
+ "warmup_steps": 500,
55
+ "lr_scheduler": "cosine",
56
+ "min_lr_ratio": 0.1,
57
+ "dtype": "bfloat16",
58
+ "gradient_checkpointing": true,
59
+ "log_every_n_steps": 10,
60
+ "save_every_n_steps": 500,
61
+ "eval_every_n_steps": 1000
62
+ },
63
+ "param_count": 618988544,
64
+ "param_count_M": 619.0,
65
+ "mlp_fraction": 0.8275,
66
+ "vocab_size": 248044,
67
+ "status": "untrained_blueprint",
68
+ "next_step": "CPT notebook"
69
+ }