Charlie81 commited on
Commit
ccb26b5
·
1 Parent(s): bc05da9

delete checkpoints

Browse files
checkpoints/checkpoint-200/config.json DELETED
@@ -1,38 +0,0 @@
1
- {
2
- "architectures": [
3
- "MyOlmoeForCausalLM"
4
- ],
5
- "attention_bias": false,
6
- "attention_dropout": 0.0,
7
- "clip_qkv": null,
8
- "eos_token_id": 50279,
9
- "hidden_act": "silu",
10
- "hidden_size": 2048,
11
- "initializer_range": 0.02,
12
- "intermediate_size": 1024,
13
- "max_position_embeddings": 4096,
14
- "max_small_expert_count": 64,
15
- "model_type": "olmoe",
16
- "norm_topk_prob": false,
17
- "num_attention_heads": 16,
18
- "num_experts": 64,
19
- "num_experts_per_tok": 2,
20
- "num_hidden_layers": 16,
21
- "num_key_value_heads": 16,
22
- "num_small_experts": 64,
23
- "output_router_logits": false,
24
- "pad_token_id": 1,
25
- "rms_norm_eps": 1e-05,
26
- "rope_scaling": null,
27
- "rope_theta": 10000.0,
28
- "router_aux_loss_coef": 0.01,
29
- "small_expert_count": 64,
30
- "small_expert_intermediate_ratio": 64,
31
- "small_expert_intermediate_size": 0,
32
- "small_expert_sparsity_coef": 0.1,
33
- "tie_word_embeddings": false,
34
- "torch_dtype": "bfloat16",
35
- "transformers_version": "4.55.4",
36
- "use_cache": true,
37
- "vocab_size": 50304
38
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/checkpoint-200/generation_config.json DELETED
@@ -1,6 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "eos_token_id": 50279,
4
- "pad_token_id": 1,
5
- "transformers_version": "4.55.4"
6
- }
 
 
 
 
 
 
 
checkpoints/checkpoint-200/model-00001-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a1b9651bad1a045a178e22cf198d5070c94d4374f6331086e38801fe8d88ca3
3
- size 4997482624
 
 
 
 
checkpoints/checkpoint-200/model-00002-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2e21faaebae001426a76f6a3c25309632bba780041e37b7b0b22c610cbdcfbdf
3
- size 4997867120
 
 
 
 
checkpoints/checkpoint-200/model-00003-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:12d42d85ae49df1bd0baaa04b9ee2464265f3ae089ed723ed7910b0a0721a2ae
3
- size 3856242664
 
 
 
 
checkpoints/checkpoint-200/model.safetensors.index.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/checkpoint-200/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba36ff3cf5aa103f2c5a162efe65287c247dccc43ee228fd9bcb3a14b6d695ae
3
- size 25858571
 
 
 
 
checkpoints/checkpoint-200/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
3
- size 14645
 
 
 
 
checkpoints/checkpoint-200/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e1a87144f92021f7270fcad4a86cd226d4640d1deb1f882885045e9d8fc1212f
3
- size 1465
 
 
 
 
checkpoints/checkpoint-200/trainer_state.json DELETED
@@ -1,174 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 0.009811316126737676,
6
- "eval_steps": 500,
7
- "global_step": 200,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.0004905658063368838,
14
- "grad_norm": 52.75,
15
- "learning_rate": 1.471550032701112e-07,
16
- "loss": 104.1645,
17
- "step": 10
18
- },
19
- {
20
- "epoch": 0.0009811316126737675,
21
- "grad_norm": 57.5,
22
- "learning_rate": 3.106605624591236e-07,
23
- "loss": 105.2674,
24
- "step": 20
25
- },
26
- {
27
- "epoch": 0.0014716974190106514,
28
- "grad_norm": 46.25,
29
- "learning_rate": 4.7416612164813603e-07,
30
- "loss": 105.4015,
31
- "step": 30
32
- },
33
- {
34
- "epoch": 0.001962263225347535,
35
- "grad_norm": 36.25,
36
- "learning_rate": 6.376716808371485e-07,
37
- "loss": 105.1723,
38
- "step": 40
39
- },
40
- {
41
- "epoch": 0.002452829031684419,
42
- "grad_norm": 32.75,
43
- "learning_rate": 8.011772400261609e-07,
44
- "loss": 105.2332,
45
- "step": 50
46
- },
47
- {
48
- "epoch": 0.0029433948380213027,
49
- "grad_norm": 42.25,
50
- "learning_rate": 9.646827992151733e-07,
51
- "loss": 104.8549,
52
- "step": 60
53
- },
54
- {
55
- "epoch": 0.003433960644358187,
56
- "grad_norm": 35.5,
57
- "learning_rate": 1.1281883584041859e-06,
58
- "loss": 103.9609,
59
- "step": 70
60
- },
61
- {
62
- "epoch": 0.00392452645069507,
63
- "grad_norm": 37.25,
64
- "learning_rate": 1.2916939175931983e-06,
65
- "loss": 104.4941,
66
- "step": 80
67
- },
68
- {
69
- "epoch": 0.0044150922570319545,
70
- "grad_norm": 40.0,
71
- "learning_rate": 1.4551994767822106e-06,
72
- "loss": 104.7141,
73
- "step": 90
74
- },
75
- {
76
- "epoch": 0.004905658063368838,
77
- "grad_norm": 41.5,
78
- "learning_rate": 1.618705035971223e-06,
79
- "loss": 104.9211,
80
- "step": 100
81
- },
82
- {
83
- "epoch": 0.005396223869705722,
84
- "grad_norm": 51.25,
85
- "learning_rate": 1.7822105951602354e-06,
86
- "loss": 103.7137,
87
- "step": 110
88
- },
89
- {
90
- "epoch": 0.0058867896760426055,
91
- "grad_norm": 39.5,
92
- "learning_rate": 1.945716154349248e-06,
93
- "loss": 105.1651,
94
- "step": 120
95
- },
96
- {
97
- "epoch": 0.006377355482379489,
98
- "grad_norm": 37.75,
99
- "learning_rate": 2.1092217135382606e-06,
100
- "loss": 105.2588,
101
- "step": 130
102
- },
103
- {
104
- "epoch": 0.006867921288716374,
105
- "grad_norm": 38.25,
106
- "learning_rate": 2.2727272727272728e-06,
107
- "loss": 104.3058,
108
- "step": 140
109
- },
110
- {
111
- "epoch": 0.007358487095053257,
112
- "grad_norm": 40.0,
113
- "learning_rate": 2.4362328319162854e-06,
114
- "loss": 104.7016,
115
- "step": 150
116
- },
117
- {
118
- "epoch": 0.00784905290139014,
119
- "grad_norm": 41.0,
120
- "learning_rate": 2.5997383911052975e-06,
121
- "loss": 103.8965,
122
- "step": 160
123
- },
124
- {
125
- "epoch": 0.008339618707727025,
126
- "grad_norm": 41.0,
127
- "learning_rate": 2.76324395029431e-06,
128
- "loss": 105.4974,
129
- "step": 170
130
- },
131
- {
132
- "epoch": 0.008830184514063909,
133
- "grad_norm": 32.5,
134
- "learning_rate": 2.9267495094833227e-06,
135
- "loss": 104.178,
136
- "step": 180
137
- },
138
- {
139
- "epoch": 0.009320750320400792,
140
- "grad_norm": 35.0,
141
- "learning_rate": 3.090255068672335e-06,
142
- "loss": 104.7224,
143
- "step": 190
144
- },
145
- {
146
- "epoch": 0.009811316126737676,
147
- "grad_norm": 40.5,
148
- "learning_rate": 3.253760627861348e-06,
149
- "loss": 105.291,
150
- "step": 200
151
- }
152
- ],
153
- "logging_steps": 10,
154
- "max_steps": 61155,
155
- "num_input_tokens_seen": 0,
156
- "num_train_epochs": 3,
157
- "save_steps": 20,
158
- "stateful_callbacks": {
159
- "TrainerControl": {
160
- "args": {
161
- "should_epoch_stop": false,
162
- "should_evaluate": false,
163
- "should_log": false,
164
- "should_save": true,
165
- "should_training_stop": false
166
- },
167
- "attributes": {}
168
- }
169
- },
170
- "total_flos": 5.365480915206144e+17,
171
- "train_batch_size": 2,
172
- "trial_name": null,
173
- "trial_params": null
174
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/checkpoint-200/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ffa490bde32401dd6d70c4f1b1cff8f5df114f94b79824d3a47b2ae8c00b822d
3
- size 5713
 
 
 
 
checkpoints/checkpoint-220/config.json DELETED
@@ -1,38 +0,0 @@
1
- {
2
- "architectures": [
3
- "MyOlmoeForCausalLM"
4
- ],
5
- "attention_bias": false,
6
- "attention_dropout": 0.0,
7
- "clip_qkv": null,
8
- "eos_token_id": 50279,
9
- "hidden_act": "silu",
10
- "hidden_size": 2048,
11
- "initializer_range": 0.02,
12
- "intermediate_size": 1024,
13
- "max_position_embeddings": 4096,
14
- "max_small_expert_count": 64,
15
- "model_type": "olmoe",
16
- "norm_topk_prob": false,
17
- "num_attention_heads": 16,
18
- "num_experts": 64,
19
- "num_experts_per_tok": 2,
20
- "num_hidden_layers": 16,
21
- "num_key_value_heads": 16,
22
- "num_small_experts": 64,
23
- "output_router_logits": false,
24
- "pad_token_id": 1,
25
- "rms_norm_eps": 1e-05,
26
- "rope_scaling": null,
27
- "rope_theta": 10000.0,
28
- "router_aux_loss_coef": 0.01,
29
- "small_expert_count": 64,
30
- "small_expert_intermediate_ratio": 64,
31
- "small_expert_intermediate_size": 0,
32
- "small_expert_sparsity_coef": 0.1,
33
- "tie_word_embeddings": false,
34
- "torch_dtype": "bfloat16",
35
- "transformers_version": "4.55.4",
36
- "use_cache": true,
37
- "vocab_size": 50304
38
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/checkpoint-220/generation_config.json DELETED
@@ -1,6 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "eos_token_id": 50279,
4
- "pad_token_id": 1,
5
- "transformers_version": "4.55.4"
6
- }
 
 
 
 
 
 
 
checkpoints/checkpoint-220/model-00001-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a1b9651bad1a045a178e22cf198d5070c94d4374f6331086e38801fe8d88ca3
3
- size 4997482624
 
 
 
 
checkpoints/checkpoint-220/model-00002-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0576e3f3539a0e79d172894fe362678c80b44a28cda8147ddd89c81a5f31886e
3
- size 4997867120
 
 
 
 
checkpoints/checkpoint-220/model-00003-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3a1605301ccd4f90faaf68572324a548bf4c5e73dcd6f90f475b808a1a247096
3
- size 3856242664
 
 
 
 
checkpoints/checkpoint-220/model.safetensors.index.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/checkpoint-220/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:361be11371d75e9b52d86c0ddaa928ba076a71f4b7a40ebd110351bd6b0d01c3
3
- size 25858571
 
 
 
 
checkpoints/checkpoint-220/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
3
- size 14645
 
 
 
 
checkpoints/checkpoint-220/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b3c0107f4752f47d660ab9481faad64ff112b078ad06a634204030d5c8d97e9
3
- size 1465
 
 
 
 
checkpoints/checkpoint-220/trainer_state.json DELETED
@@ -1,188 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 0.010792447739411444,
6
- "eval_steps": 500,
7
- "global_step": 220,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.0004905658063368838,
14
- "grad_norm": 52.75,
15
- "learning_rate": 1.471550032701112e-07,
16
- "loss": 104.1645,
17
- "step": 10
18
- },
19
- {
20
- "epoch": 0.0009811316126737675,
21
- "grad_norm": 57.5,
22
- "learning_rate": 3.106605624591236e-07,
23
- "loss": 105.2674,
24
- "step": 20
25
- },
26
- {
27
- "epoch": 0.0014716974190106514,
28
- "grad_norm": 46.25,
29
- "learning_rate": 4.7416612164813603e-07,
30
- "loss": 105.4015,
31
- "step": 30
32
- },
33
- {
34
- "epoch": 0.001962263225347535,
35
- "grad_norm": 36.25,
36
- "learning_rate": 6.376716808371485e-07,
37
- "loss": 105.1723,
38
- "step": 40
39
- },
40
- {
41
- "epoch": 0.002452829031684419,
42
- "grad_norm": 32.75,
43
- "learning_rate": 8.011772400261609e-07,
44
- "loss": 105.2332,
45
- "step": 50
46
- },
47
- {
48
- "epoch": 0.0029433948380213027,
49
- "grad_norm": 42.25,
50
- "learning_rate": 9.646827992151733e-07,
51
- "loss": 104.8549,
52
- "step": 60
53
- },
54
- {
55
- "epoch": 0.003433960644358187,
56
- "grad_norm": 35.5,
57
- "learning_rate": 1.1281883584041859e-06,
58
- "loss": 103.9609,
59
- "step": 70
60
- },
61
- {
62
- "epoch": 0.00392452645069507,
63
- "grad_norm": 37.25,
64
- "learning_rate": 1.2916939175931983e-06,
65
- "loss": 104.4941,
66
- "step": 80
67
- },
68
- {
69
- "epoch": 0.0044150922570319545,
70
- "grad_norm": 40.0,
71
- "learning_rate": 1.4551994767822106e-06,
72
- "loss": 104.7141,
73
- "step": 90
74
- },
75
- {
76
- "epoch": 0.004905658063368838,
77
- "grad_norm": 41.5,
78
- "learning_rate": 1.618705035971223e-06,
79
- "loss": 104.9211,
80
- "step": 100
81
- },
82
- {
83
- "epoch": 0.005396223869705722,
84
- "grad_norm": 51.25,
85
- "learning_rate": 1.7822105951602354e-06,
86
- "loss": 103.7137,
87
- "step": 110
88
- },
89
- {
90
- "epoch": 0.0058867896760426055,
91
- "grad_norm": 39.5,
92
- "learning_rate": 1.945716154349248e-06,
93
- "loss": 105.1651,
94
- "step": 120
95
- },
96
- {
97
- "epoch": 0.006377355482379489,
98
- "grad_norm": 37.75,
99
- "learning_rate": 2.1092217135382606e-06,
100
- "loss": 105.2588,
101
- "step": 130
102
- },
103
- {
104
- "epoch": 0.006867921288716374,
105
- "grad_norm": 38.25,
106
- "learning_rate": 2.2727272727272728e-06,
107
- "loss": 104.3058,
108
- "step": 140
109
- },
110
- {
111
- "epoch": 0.007358487095053257,
112
- "grad_norm": 40.0,
113
- "learning_rate": 2.4362328319162854e-06,
114
- "loss": 104.7016,
115
- "step": 150
116
- },
117
- {
118
- "epoch": 0.00784905290139014,
119
- "grad_norm": 41.0,
120
- "learning_rate": 2.5997383911052975e-06,
121
- "loss": 103.8965,
122
- "step": 160
123
- },
124
- {
125
- "epoch": 0.008339618707727025,
126
- "grad_norm": 41.0,
127
- "learning_rate": 2.76324395029431e-06,
128
- "loss": 105.4974,
129
- "step": 170
130
- },
131
- {
132
- "epoch": 0.008830184514063909,
133
- "grad_norm": 32.5,
134
- "learning_rate": 2.9267495094833227e-06,
135
- "loss": 104.178,
136
- "step": 180
137
- },
138
- {
139
- "epoch": 0.009320750320400792,
140
- "grad_norm": 35.0,
141
- "learning_rate": 3.090255068672335e-06,
142
- "loss": 104.7224,
143
- "step": 190
144
- },
145
- {
146
- "epoch": 0.009811316126737676,
147
- "grad_norm": 40.5,
148
- "learning_rate": 3.253760627861348e-06,
149
- "loss": 105.291,
150
- "step": 200
151
- },
152
- {
153
- "epoch": 0.01030188193307456,
154
- "grad_norm": 51.5,
155
- "learning_rate": 3.41726618705036e-06,
156
- "loss": 104.8932,
157
- "step": 210
158
- },
159
- {
160
- "epoch": 0.010792447739411444,
161
- "grad_norm": 40.5,
162
- "learning_rate": 3.5807717462393727e-06,
163
- "loss": 104.8132,
164
- "step": 220
165
- }
166
- ],
167
- "logging_steps": 10,
168
- "max_steps": 61155,
169
- "num_input_tokens_seen": 0,
170
- "num_train_epochs": 3,
171
- "save_steps": 20,
172
- "stateful_callbacks": {
173
- "TrainerControl": {
174
- "args": {
175
- "should_epoch_stop": false,
176
- "should_evaluate": false,
177
- "should_log": false,
178
- "should_save": true,
179
- "should_training_stop": false
180
- },
181
- "attributes": {}
182
- }
183
- },
184
- "total_flos": 5.902029006726758e+17,
185
- "train_batch_size": 2,
186
- "trial_name": null,
187
- "trial_params": null
188
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/checkpoint-220/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ffa490bde32401dd6d70c4f1b1cff8f5df114f94b79824d3a47b2ae8c00b822d
3
- size 5713