Auto-save flat update: checkpoint-100
Browse files- config.json +5 -5
- model.safetensors +2 -2
- optimizer.pt +2 -2
- scheduler.pt +1 -1
- tokenizer.json +101 -1
- trainer_state.json +43 -323
- training_args.bin +1 -1
config.json
CHANGED
|
@@ -8,18 +8,18 @@
|
|
| 8 |
"bos_token_id": 1,
|
| 9 |
"dtype": "float32",
|
| 10 |
"eos_token_id": 2,
|
| 11 |
-
"head_dim":
|
| 12 |
"hidden_act": "silu",
|
| 13 |
"hidden_size": 256,
|
| 14 |
"initializer_range": 0.02,
|
| 15 |
"intermediate_size": 1024,
|
| 16 |
-
"max_position_embeddings":
|
| 17 |
"max_window_layers": 28,
|
| 18 |
"mlp_bias": false,
|
| 19 |
"model_type": "qwen2",
|
| 20 |
-
"num_attention_heads":
|
| 21 |
"num_hidden_layers": 1,
|
| 22 |
-
"num_key_value_heads":
|
| 23 |
"pad_token_id": 3,
|
| 24 |
"pretraining_tp": 1,
|
| 25 |
"rms_norm_eps": 1e-05,
|
|
@@ -31,5 +31,5 @@
|
|
| 31 |
"transformers_version": "4.48.3",
|
| 32 |
"use_cache": false,
|
| 33 |
"use_sliding_window": false,
|
| 34 |
-
"vocab_size":
|
| 35 |
}
|
|
|
|
| 8 |
"bos_token_id": 1,
|
| 9 |
"dtype": "float32",
|
| 10 |
"eos_token_id": 2,
|
| 11 |
+
"head_dim": 32,
|
| 12 |
"hidden_act": "silu",
|
| 13 |
"hidden_size": 256,
|
| 14 |
"initializer_range": 0.02,
|
| 15 |
"intermediate_size": 1024,
|
| 16 |
+
"max_position_embeddings": 1024,
|
| 17 |
"max_window_layers": 28,
|
| 18 |
"mlp_bias": false,
|
| 19 |
"model_type": "qwen2",
|
| 20 |
+
"num_attention_heads": 8,
|
| 21 |
"num_hidden_layers": 1,
|
| 22 |
+
"num_key_value_heads": 8,
|
| 23 |
"pad_token_id": 3,
|
| 24 |
"pretraining_tp": 1,
|
| 25 |
"rms_norm_eps": 1e-05,
|
|
|
|
| 31 |
"transformers_version": "4.48.3",
|
| 32 |
"use_cache": false,
|
| 33 |
"use_sliding_window": false,
|
| 34 |
+
"vocab_size": 192
|
| 35 |
}
|
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b6b0a725fafdc4cbc9ff3e3dd898c7b32faaea0147dd5188701fcf792ce45084
|
| 3 |
+
size 4398536
|
optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cd6cabe1dde2585f2289245c3f51d734eea81900d782207f109b03f385742dd5
|
| 3 |
+
size 8806533
|
scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d82c58c32b204ed6cf1be47fcccac4a2997bdd7e1431fe3a6ec925f0a86a9891
|
| 3 |
size 1465
|
tokenizer.json
CHANGED
|
@@ -231,7 +231,27 @@
|
|
| 231 |
"Ġpro": 168,
|
| 232 |
"ch": 169,
|
| 233 |
"ow": 170,
|
| 234 |
-
"tic": 171
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
},
|
| 236 |
"merges": [
|
| 237 |
[
|
|
@@ -525,6 +545,86 @@
|
|
| 525 |
[
|
| 526 |
"ti",
|
| 527 |
"c"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 528 |
]
|
| 529 |
]
|
| 530 |
}
|
|
|
|
| 231 |
"Ġpro": 168,
|
| 232 |
"ch": 169,
|
| 233 |
"ow": 170,
|
| 234 |
+
"tic": 171,
|
| 235 |
+
"Ġcon": 172,
|
| 236 |
+
"qu": 173,
|
| 237 |
+
"Ġh": 174,
|
| 238 |
+
"per": 175,
|
| 239 |
+
"Ġon": 176,
|
| 240 |
+
"ig": 177,
|
| 241 |
+
"am": 178,
|
| 242 |
+
"res": 179,
|
| 243 |
+
"Ġwith": 180,
|
| 244 |
+
"Ġthat": 181,
|
| 245 |
+
"ĠW": 182,
|
| 246 |
+
"ver": 183,
|
| 247 |
+
"um": 184,
|
| 248 |
+
"Ġ$": 185,
|
| 249 |
+
"il": 186,
|
| 250 |
+
"Ġex": 187,
|
| 251 |
+
"ut": 188,
|
| 252 |
+
"se": 189,
|
| 253 |
+
"ot": 190,
|
| 254 |
+
"ate": 191
|
| 255 |
},
|
| 256 |
"merges": [
|
| 257 |
[
|
|
|
|
| 545 |
[
|
| 546 |
"ti",
|
| 547 |
"c"
|
| 548 |
+
],
|
| 549 |
+
[
|
| 550 |
+
"Ġc",
|
| 551 |
+
"on"
|
| 552 |
+
],
|
| 553 |
+
[
|
| 554 |
+
"q",
|
| 555 |
+
"u"
|
| 556 |
+
],
|
| 557 |
+
[
|
| 558 |
+
"Ġ",
|
| 559 |
+
"h"
|
| 560 |
+
],
|
| 561 |
+
[
|
| 562 |
+
"p",
|
| 563 |
+
"er"
|
| 564 |
+
],
|
| 565 |
+
[
|
| 566 |
+
"Ġ",
|
| 567 |
+
"on"
|
| 568 |
+
],
|
| 569 |
+
[
|
| 570 |
+
"i",
|
| 571 |
+
"g"
|
| 572 |
+
],
|
| 573 |
+
[
|
| 574 |
+
"a",
|
| 575 |
+
"m"
|
| 576 |
+
],
|
| 577 |
+
[
|
| 578 |
+
"re",
|
| 579 |
+
"s"
|
| 580 |
+
],
|
| 581 |
+
[
|
| 582 |
+
"Ġw",
|
| 583 |
+
"ith"
|
| 584 |
+
],
|
| 585 |
+
[
|
| 586 |
+
"Ġth",
|
| 587 |
+
"at"
|
| 588 |
+
],
|
| 589 |
+
[
|
| 590 |
+
"Ġ",
|
| 591 |
+
"W"
|
| 592 |
+
],
|
| 593 |
+
[
|
| 594 |
+
"v",
|
| 595 |
+
"er"
|
| 596 |
+
],
|
| 597 |
+
[
|
| 598 |
+
"u",
|
| 599 |
+
"m"
|
| 600 |
+
],
|
| 601 |
+
[
|
| 602 |
+
"Ġ",
|
| 603 |
+
"$"
|
| 604 |
+
],
|
| 605 |
+
[
|
| 606 |
+
"i",
|
| 607 |
+
"l"
|
| 608 |
+
],
|
| 609 |
+
[
|
| 610 |
+
"Ġe",
|
| 611 |
+
"x"
|
| 612 |
+
],
|
| 613 |
+
[
|
| 614 |
+
"u",
|
| 615 |
+
"t"
|
| 616 |
+
],
|
| 617 |
+
[
|
| 618 |
+
"s",
|
| 619 |
+
"e"
|
| 620 |
+
],
|
| 621 |
+
[
|
| 622 |
+
"o",
|
| 623 |
+
"t"
|
| 624 |
+
],
|
| 625 |
+
[
|
| 626 |
+
"at",
|
| 627 |
+
"e"
|
| 628 |
]
|
| 629 |
]
|
| 630 |
}
|
trainer_state.json
CHANGED
|
@@ -1,432 +1,152 @@
|
|
| 1 |
{
|
| 2 |
"best_metric": null,
|
| 3 |
"best_model_checkpoint": null,
|
| 4 |
-
"epoch": 0.
|
| 5 |
"eval_steps": 500,
|
| 6 |
-
"global_step":
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
| 10 |
"log_history": [
|
| 11 |
{
|
| 12 |
"epoch": 0.0005,
|
| 13 |
-
"grad_norm": 1.
|
| 14 |
"learning_rate": 0.0001,
|
| 15 |
-
"loss": 5.
|
| 16 |
"step": 5
|
| 17 |
},
|
| 18 |
{
|
| 19 |
"epoch": 0.001,
|
| 20 |
-
"grad_norm": 1.
|
| 21 |
"learning_rate": 0.0002,
|
| 22 |
-
"loss":
|
| 23 |
"step": 10
|
| 24 |
},
|
| 25 |
{
|
| 26 |
"epoch": 0.0015,
|
| 27 |
-
"grad_norm": 0.
|
| 28 |
"learning_rate": 0.0001998998998998999,
|
| 29 |
-
"loss": 4.
|
| 30 |
"step": 15
|
| 31 |
},
|
| 32 |
{
|
| 33 |
"epoch": 0.002,
|
| 34 |
-
"grad_norm": 0.
|
| 35 |
"learning_rate": 0.0001997997997997998,
|
| 36 |
-
"loss": 4.
|
| 37 |
"step": 20
|
| 38 |
},
|
| 39 |
{
|
| 40 |
"epoch": 0.0025,
|
| 41 |
-
"grad_norm": 0.
|
| 42 |
"learning_rate": 0.0001996996996996997,
|
| 43 |
-
"loss": 4.
|
| 44 |
"step": 25
|
| 45 |
},
|
| 46 |
{
|
| 47 |
"epoch": 0.003,
|
| 48 |
-
"grad_norm": 0.
|
| 49 |
"learning_rate": 0.0001995995995995996,
|
| 50 |
-
"loss": 4.
|
| 51 |
"step": 30
|
| 52 |
},
|
| 53 |
{
|
| 54 |
"epoch": 0.0035,
|
| 55 |
-
"grad_norm": 0.
|
| 56 |
"learning_rate": 0.0001994994994994995,
|
| 57 |
-
"loss": 4.
|
| 58 |
"step": 35
|
| 59 |
},
|
| 60 |
{
|
| 61 |
"epoch": 0.004,
|
| 62 |
-
"grad_norm": 0.
|
| 63 |
"learning_rate": 0.0001993993993993994,
|
| 64 |
-
"loss":
|
| 65 |
"step": 40
|
| 66 |
},
|
| 67 |
{
|
| 68 |
"epoch": 0.0045,
|
| 69 |
-
"grad_norm": 0.
|
| 70 |
"learning_rate": 0.00019929929929929932,
|
| 71 |
-
"loss":
|
| 72 |
"step": 45
|
| 73 |
},
|
| 74 |
{
|
| 75 |
"epoch": 0.005,
|
| 76 |
-
"grad_norm": 0.
|
| 77 |
"learning_rate": 0.0001991991991991992,
|
| 78 |
-
"loss": 3.
|
| 79 |
"step": 50
|
| 80 |
},
|
| 81 |
{
|
| 82 |
"epoch": 0.0055,
|
| 83 |
-
"grad_norm": 0.
|
| 84 |
"learning_rate": 0.00019909909909909912,
|
| 85 |
-
"loss": 3.
|
| 86 |
"step": 55
|
| 87 |
},
|
| 88 |
{
|
| 89 |
"epoch": 0.006,
|
| 90 |
-
"grad_norm": 0.
|
| 91 |
"learning_rate": 0.000198998998998999,
|
| 92 |
-
"loss": 3.
|
| 93 |
"step": 60
|
| 94 |
},
|
| 95 |
{
|
| 96 |
"epoch": 0.0065,
|
| 97 |
-
"grad_norm": 0.
|
| 98 |
"learning_rate": 0.0001988988988988989,
|
| 99 |
-
"loss": 3.
|
| 100 |
"step": 65
|
| 101 |
},
|
| 102 |
{
|
| 103 |
"epoch": 0.007,
|
| 104 |
-
"grad_norm": 0.
|
| 105 |
"learning_rate": 0.0001987987987987988,
|
| 106 |
-
"loss": 3.
|
| 107 |
"step": 70
|
| 108 |
},
|
| 109 |
{
|
| 110 |
"epoch": 0.0075,
|
| 111 |
-
"grad_norm": 0.
|
| 112 |
"learning_rate": 0.0001986986986986987,
|
| 113 |
-
"loss": 3.
|
| 114 |
"step": 75
|
| 115 |
},
|
| 116 |
{
|
| 117 |
"epoch": 0.008,
|
| 118 |
-
"grad_norm": 0.
|
| 119 |
"learning_rate": 0.0001985985985985986,
|
| 120 |
-
"loss": 3.
|
| 121 |
"step": 80
|
| 122 |
},
|
| 123 |
{
|
| 124 |
"epoch": 0.0085,
|
| 125 |
-
"grad_norm": 0.
|
| 126 |
"learning_rate": 0.0001984984984984985,
|
| 127 |
-
"loss": 3.
|
| 128 |
"step": 85
|
| 129 |
},
|
| 130 |
{
|
| 131 |
"epoch": 0.009,
|
| 132 |
-
"grad_norm": 0.
|
| 133 |
"learning_rate": 0.0001983983983983984,
|
| 134 |
-
"loss": 3.
|
| 135 |
"step": 90
|
| 136 |
},
|
| 137 |
{
|
| 138 |
"epoch": 0.0095,
|
| 139 |
-
"grad_norm": 0.
|
| 140 |
"learning_rate": 0.00019829829829829833,
|
| 141 |
-
"loss": 3.
|
| 142 |
"step": 95
|
| 143 |
},
|
| 144 |
{
|
| 145 |
"epoch": 0.01,
|
| 146 |
-
"grad_norm": 0.
|
| 147 |
"learning_rate": 0.0001981981981981982,
|
| 148 |
-
"loss": 3.
|
| 149 |
"step": 100
|
| 150 |
-
},
|
| 151 |
-
{
|
| 152 |
-
"epoch": 0.0105,
|
| 153 |
-
"grad_norm": 0.3497621715068817,
|
| 154 |
-
"learning_rate": 0.00019809809809809813,
|
| 155 |
-
"loss": 3.5253,
|
| 156 |
-
"step": 105
|
| 157 |
-
},
|
| 158 |
-
{
|
| 159 |
-
"epoch": 0.011,
|
| 160 |
-
"grad_norm": 0.3970584273338318,
|
| 161 |
-
"learning_rate": 0.000197997997997998,
|
| 162 |
-
"loss": 3.513,
|
| 163 |
-
"step": 110
|
| 164 |
-
},
|
| 165 |
-
{
|
| 166 |
-
"epoch": 0.0115,
|
| 167 |
-
"grad_norm": 0.47932690382003784,
|
| 168 |
-
"learning_rate": 0.0001978978978978979,
|
| 169 |
-
"loss": 3.4934,
|
| 170 |
-
"step": 115
|
| 171 |
-
},
|
| 172 |
-
{
|
| 173 |
-
"epoch": 0.012,
|
| 174 |
-
"grad_norm": 0.3744785487651825,
|
| 175 |
-
"learning_rate": 0.0001977977977977978,
|
| 176 |
-
"loss": 3.4994,
|
| 177 |
-
"step": 120
|
| 178 |
-
},
|
| 179 |
-
{
|
| 180 |
-
"epoch": 0.0125,
|
| 181 |
-
"grad_norm": 0.35583263635635376,
|
| 182 |
-
"learning_rate": 0.0001976976976976977,
|
| 183 |
-
"loss": 3.4676,
|
| 184 |
-
"step": 125
|
| 185 |
-
},
|
| 186 |
-
{
|
| 187 |
-
"epoch": 0.013,
|
| 188 |
-
"grad_norm": 0.3067843019962311,
|
| 189 |
-
"learning_rate": 0.0001975975975975976,
|
| 190 |
-
"loss": 3.4778,
|
| 191 |
-
"step": 130
|
| 192 |
-
},
|
| 193 |
-
{
|
| 194 |
-
"epoch": 0.0135,
|
| 195 |
-
"grad_norm": 0.4709765315055847,
|
| 196 |
-
"learning_rate": 0.0001974974974974975,
|
| 197 |
-
"loss": 3.4547,
|
| 198 |
-
"step": 135
|
| 199 |
-
},
|
| 200 |
-
{
|
| 201 |
-
"epoch": 0.014,
|
| 202 |
-
"grad_norm": 0.6164122223854065,
|
| 203 |
-
"learning_rate": 0.00019739739739739739,
|
| 204 |
-
"loss": 3.4351,
|
| 205 |
-
"step": 140
|
| 206 |
-
},
|
| 207 |
-
{
|
| 208 |
-
"epoch": 0.0145,
|
| 209 |
-
"grad_norm": 0.41007131338119507,
|
| 210 |
-
"learning_rate": 0.0001972972972972973,
|
| 211 |
-
"loss": 3.4244,
|
| 212 |
-
"step": 145
|
| 213 |
-
},
|
| 214 |
-
{
|
| 215 |
-
"epoch": 0.015,
|
| 216 |
-
"grad_norm": 0.6154835224151611,
|
| 217 |
-
"learning_rate": 0.0001971971971971972,
|
| 218 |
-
"loss": 3.4039,
|
| 219 |
-
"step": 150
|
| 220 |
-
},
|
| 221 |
-
{
|
| 222 |
-
"epoch": 0.0155,
|
| 223 |
-
"grad_norm": 0.4073669910430908,
|
| 224 |
-
"learning_rate": 0.00019709709709709713,
|
| 225 |
-
"loss": 3.395,
|
| 226 |
-
"step": 155
|
| 227 |
-
},
|
| 228 |
-
{
|
| 229 |
-
"epoch": 0.016,
|
| 230 |
-
"grad_norm": 0.5838276147842407,
|
| 231 |
-
"learning_rate": 0.00019699699699699701,
|
| 232 |
-
"loss": 3.3642,
|
| 233 |
-
"step": 160
|
| 234 |
-
},
|
| 235 |
-
{
|
| 236 |
-
"epoch": 0.0165,
|
| 237 |
-
"grad_norm": 0.49278542399406433,
|
| 238 |
-
"learning_rate": 0.0001968968968968969,
|
| 239 |
-
"loss": 3.3515,
|
| 240 |
-
"step": 165
|
| 241 |
-
},
|
| 242 |
-
{
|
| 243 |
-
"epoch": 0.017,
|
| 244 |
-
"grad_norm": 0.4297572374343872,
|
| 245 |
-
"learning_rate": 0.00019679679679679681,
|
| 246 |
-
"loss": 3.3261,
|
| 247 |
-
"step": 170
|
| 248 |
-
},
|
| 249 |
-
{
|
| 250 |
-
"epoch": 0.0175,
|
| 251 |
-
"grad_norm": 0.43436136841773987,
|
| 252 |
-
"learning_rate": 0.0001966966966966967,
|
| 253 |
-
"loss": 3.2953,
|
| 254 |
-
"step": 175
|
| 255 |
-
},
|
| 256 |
-
{
|
| 257 |
-
"epoch": 0.018,
|
| 258 |
-
"grad_norm": 0.4154890179634094,
|
| 259 |
-
"learning_rate": 0.00019659659659659661,
|
| 260 |
-
"loss": 3.2588,
|
| 261 |
-
"step": 180
|
| 262 |
-
},
|
| 263 |
-
{
|
| 264 |
-
"epoch": 0.0185,
|
| 265 |
-
"grad_norm": 0.6486464142799377,
|
| 266 |
-
"learning_rate": 0.0001964964964964965,
|
| 267 |
-
"loss": 3.229,
|
| 268 |
-
"step": 185
|
| 269 |
-
},
|
| 270 |
-
{
|
| 271 |
-
"epoch": 0.019,
|
| 272 |
-
"grad_norm": 0.5434504151344299,
|
| 273 |
-
"learning_rate": 0.0001963963963963964,
|
| 274 |
-
"loss": 3.2005,
|
| 275 |
-
"step": 190
|
| 276 |
-
},
|
| 277 |
-
{
|
| 278 |
-
"epoch": 0.0195,
|
| 279 |
-
"grad_norm": 0.6403669714927673,
|
| 280 |
-
"learning_rate": 0.0001962962962962963,
|
| 281 |
-
"loss": 3.1609,
|
| 282 |
-
"step": 195
|
| 283 |
-
},
|
| 284 |
-
{
|
| 285 |
-
"epoch": 0.02,
|
| 286 |
-
"grad_norm": 0.5148853063583374,
|
| 287 |
-
"learning_rate": 0.00019619619619619621,
|
| 288 |
-
"loss": 3.1362,
|
| 289 |
-
"step": 200
|
| 290 |
-
},
|
| 291 |
-
{
|
| 292 |
-
"epoch": 0.0205,
|
| 293 |
-
"grad_norm": 0.6012855768203735,
|
| 294 |
-
"learning_rate": 0.00019609609609609613,
|
| 295 |
-
"loss": 3.1118,
|
| 296 |
-
"step": 205
|
| 297 |
-
},
|
| 298 |
-
{
|
| 299 |
-
"epoch": 0.021,
|
| 300 |
-
"grad_norm": 0.6342504620552063,
|
| 301 |
-
"learning_rate": 0.00019599599599599602,
|
| 302 |
-
"loss": 3.0452,
|
| 303 |
-
"step": 210
|
| 304 |
-
},
|
| 305 |
-
{
|
| 306 |
-
"epoch": 0.0215,
|
| 307 |
-
"grad_norm": 0.7762932777404785,
|
| 308 |
-
"learning_rate": 0.0001958958958958959,
|
| 309 |
-
"loss": 3.0401,
|
| 310 |
-
"step": 215
|
| 311 |
-
},
|
| 312 |
-
{
|
| 313 |
-
"epoch": 0.022,
|
| 314 |
-
"grad_norm": 0.6487250924110413,
|
| 315 |
-
"learning_rate": 0.00019579579579579582,
|
| 316 |
-
"loss": 3.0074,
|
| 317 |
-
"step": 220
|
| 318 |
-
},
|
| 319 |
-
{
|
| 320 |
-
"epoch": 0.0225,
|
| 321 |
-
"grad_norm": 0.7411482334136963,
|
| 322 |
-
"learning_rate": 0.0001956956956956957,
|
| 323 |
-
"loss": 2.9665,
|
| 324 |
-
"step": 225
|
| 325 |
-
},
|
| 326 |
-
{
|
| 327 |
-
"epoch": 0.023,
|
| 328 |
-
"grad_norm": 0.727695643901825,
|
| 329 |
-
"learning_rate": 0.00019559559559559562,
|
| 330 |
-
"loss": 2.9418,
|
| 331 |
-
"step": 230
|
| 332 |
-
},
|
| 333 |
-
{
|
| 334 |
-
"epoch": 0.0235,
|
| 335 |
-
"grad_norm": 0.6558846235275269,
|
| 336 |
-
"learning_rate": 0.0001954954954954955,
|
| 337 |
-
"loss": 2.8922,
|
| 338 |
-
"step": 235
|
| 339 |
-
},
|
| 340 |
-
{
|
| 341 |
-
"epoch": 0.024,
|
| 342 |
-
"grad_norm": 0.7584027051925659,
|
| 343 |
-
"learning_rate": 0.0001953953953953954,
|
| 344 |
-
"loss": 2.8897,
|
| 345 |
-
"step": 240
|
| 346 |
-
},
|
| 347 |
-
{
|
| 348 |
-
"epoch": 0.0245,
|
| 349 |
-
"grad_norm": 0.6296901106834412,
|
| 350 |
-
"learning_rate": 0.0001952952952952953,
|
| 351 |
-
"loss": 2.8531,
|
| 352 |
-
"step": 245
|
| 353 |
-
},
|
| 354 |
-
{
|
| 355 |
-
"epoch": 0.025,
|
| 356 |
-
"grad_norm": 0.6529428362846375,
|
| 357 |
-
"learning_rate": 0.0001951951951951952,
|
| 358 |
-
"loss": 2.8375,
|
| 359 |
-
"step": 250
|
| 360 |
-
},
|
| 361 |
-
{
|
| 362 |
-
"epoch": 0.0255,
|
| 363 |
-
"grad_norm": 0.6653200387954712,
|
| 364 |
-
"learning_rate": 0.0001950950950950951,
|
| 365 |
-
"loss": 2.796,
|
| 366 |
-
"step": 255
|
| 367 |
-
},
|
| 368 |
-
{
|
| 369 |
-
"epoch": 0.026,
|
| 370 |
-
"grad_norm": 0.6050741076469421,
|
| 371 |
-
"learning_rate": 0.00019499499499499502,
|
| 372 |
-
"loss": 2.787,
|
| 373 |
-
"step": 260
|
| 374 |
-
},
|
| 375 |
-
{
|
| 376 |
-
"epoch": 0.0265,
|
| 377 |
-
"grad_norm": 0.6170589923858643,
|
| 378 |
-
"learning_rate": 0.0001948948948948949,
|
| 379 |
-
"loss": 2.7591,
|
| 380 |
-
"step": 265
|
| 381 |
-
},
|
| 382 |
-
{
|
| 383 |
-
"epoch": 0.027,
|
| 384 |
-
"grad_norm": 0.6681796908378601,
|
| 385 |
-
"learning_rate": 0.00019479479479479482,
|
| 386 |
-
"loss": 2.7431,
|
| 387 |
-
"step": 270
|
| 388 |
-
},
|
| 389 |
-
{
|
| 390 |
-
"epoch": 0.0275,
|
| 391 |
-
"grad_norm": 0.6189929246902466,
|
| 392 |
-
"learning_rate": 0.0001946946946946947,
|
| 393 |
-
"loss": 2.7374,
|
| 394 |
-
"step": 275
|
| 395 |
-
},
|
| 396 |
-
{
|
| 397 |
-
"epoch": 0.028,
|
| 398 |
-
"grad_norm": 0.6890608668327332,
|
| 399 |
-
"learning_rate": 0.00019459459459459462,
|
| 400 |
-
"loss": 2.6941,
|
| 401 |
-
"step": 280
|
| 402 |
-
},
|
| 403 |
-
{
|
| 404 |
-
"epoch": 0.0285,
|
| 405 |
-
"grad_norm": 0.6476343274116516,
|
| 406 |
-
"learning_rate": 0.0001944944944944945,
|
| 407 |
-
"loss": 2.6852,
|
| 408 |
-
"step": 285
|
| 409 |
-
},
|
| 410 |
-
{
|
| 411 |
-
"epoch": 0.029,
|
| 412 |
-
"grad_norm": 0.7976285815238953,
|
| 413 |
-
"learning_rate": 0.0001943943943943944,
|
| 414 |
-
"loss": 2.6704,
|
| 415 |
-
"step": 290
|
| 416 |
-
},
|
| 417 |
-
{
|
| 418 |
-
"epoch": 0.0295,
|
| 419 |
-
"grad_norm": 0.8300926089286804,
|
| 420 |
-
"learning_rate": 0.0001942942942942943,
|
| 421 |
-
"loss": 2.645,
|
| 422 |
-
"step": 295
|
| 423 |
-
},
|
| 424 |
-
{
|
| 425 |
-
"epoch": 0.03,
|
| 426 |
-
"grad_norm": 0.7338405251502991,
|
| 427 |
-
"learning_rate": 0.0001941941941941942,
|
| 428 |
-
"loss": 2.6236,
|
| 429 |
-
"step": 300
|
| 430 |
}
|
| 431 |
],
|
| 432 |
"logging_steps": 5,
|
|
@@ -446,7 +166,7 @@
|
|
| 446 |
"attributes": {}
|
| 447 |
}
|
| 448 |
},
|
| 449 |
-
"total_flos":
|
| 450 |
"train_batch_size": 4,
|
| 451 |
"trial_name": null,
|
| 452 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
"best_metric": null,
|
| 3 |
"best_model_checkpoint": null,
|
| 4 |
+
"epoch": 0.01,
|
| 5 |
"eval_steps": 500,
|
| 6 |
+
"global_step": 100,
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
| 10 |
"log_history": [
|
| 11 |
{
|
| 12 |
"epoch": 0.0005,
|
| 13 |
+
"grad_norm": 1.354914903640747,
|
| 14 |
"learning_rate": 0.0001,
|
| 15 |
+
"loss": 5.3068,
|
| 16 |
"step": 5
|
| 17 |
},
|
| 18 |
{
|
| 19 |
"epoch": 0.001,
|
| 20 |
+
"grad_norm": 1.0461070537567139,
|
| 21 |
"learning_rate": 0.0002,
|
| 22 |
+
"loss": 5.0784,
|
| 23 |
"step": 10
|
| 24 |
},
|
| 25 |
{
|
| 26 |
"epoch": 0.0015,
|
| 27 |
+
"grad_norm": 0.7310259938240051,
|
| 28 |
"learning_rate": 0.0001998998998998999,
|
| 29 |
+
"loss": 4.8251,
|
| 30 |
"step": 15
|
| 31 |
},
|
| 32 |
{
|
| 33 |
"epoch": 0.002,
|
| 34 |
+
"grad_norm": 0.82170170545578,
|
| 35 |
"learning_rate": 0.0001997997997997998,
|
| 36 |
+
"loss": 4.6949,
|
| 37 |
"step": 20
|
| 38 |
},
|
| 39 |
{
|
| 40 |
"epoch": 0.0025,
|
| 41 |
+
"grad_norm": 0.9640143513679504,
|
| 42 |
"learning_rate": 0.0001996996996996997,
|
| 43 |
+
"loss": 4.5294,
|
| 44 |
"step": 25
|
| 45 |
},
|
| 46 |
{
|
| 47 |
"epoch": 0.003,
|
| 48 |
+
"grad_norm": 0.6337556838989258,
|
| 49 |
"learning_rate": 0.0001995995995995996,
|
| 50 |
+
"loss": 4.3776,
|
| 51 |
"step": 30
|
| 52 |
},
|
| 53 |
{
|
| 54 |
"epoch": 0.0035,
|
| 55 |
+
"grad_norm": 0.5715162754058838,
|
| 56 |
"learning_rate": 0.0001994994994994995,
|
| 57 |
+
"loss": 4.251,
|
| 58 |
"step": 35
|
| 59 |
},
|
| 60 |
{
|
| 61 |
"epoch": 0.004,
|
| 62 |
+
"grad_norm": 0.47545069456100464,
|
| 63 |
"learning_rate": 0.0001993993993993994,
|
| 64 |
+
"loss": 4.142,
|
| 65 |
"step": 40
|
| 66 |
},
|
| 67 |
{
|
| 68 |
"epoch": 0.0045,
|
| 69 |
+
"grad_norm": 0.43138620257377625,
|
| 70 |
"learning_rate": 0.00019929929929929932,
|
| 71 |
+
"loss": 4.0538,
|
| 72 |
"step": 45
|
| 73 |
},
|
| 74 |
{
|
| 75 |
"epoch": 0.005,
|
| 76 |
+
"grad_norm": 0.41834330558776855,
|
| 77 |
"learning_rate": 0.0001991991991991992,
|
| 78 |
+
"loss": 3.9896,
|
| 79 |
"step": 50
|
| 80 |
},
|
| 81 |
{
|
| 82 |
"epoch": 0.0055,
|
| 83 |
+
"grad_norm": 0.3807925283908844,
|
| 84 |
"learning_rate": 0.00019909909909909912,
|
| 85 |
+
"loss": 3.9316,
|
| 86 |
"step": 55
|
| 87 |
},
|
| 88 |
{
|
| 89 |
"epoch": 0.006,
|
| 90 |
+
"grad_norm": 0.4051252603530884,
|
| 91 |
"learning_rate": 0.000198998998998999,
|
| 92 |
+
"loss": 3.8816,
|
| 93 |
"step": 60
|
| 94 |
},
|
| 95 |
{
|
| 96 |
"epoch": 0.0065,
|
| 97 |
+
"grad_norm": 0.3600367307662964,
|
| 98 |
"learning_rate": 0.0001988988988988989,
|
| 99 |
+
"loss": 3.8327,
|
| 100 |
"step": 65
|
| 101 |
},
|
| 102 |
{
|
| 103 |
"epoch": 0.007,
|
| 104 |
+
"grad_norm": 0.3089018762111664,
|
| 105 |
"learning_rate": 0.0001987987987987988,
|
| 106 |
+
"loss": 3.7908,
|
| 107 |
"step": 70
|
| 108 |
},
|
| 109 |
{
|
| 110 |
"epoch": 0.0075,
|
| 111 |
+
"grad_norm": 0.2999509572982788,
|
| 112 |
"learning_rate": 0.0001986986986986987,
|
| 113 |
+
"loss": 3.7632,
|
| 114 |
"step": 75
|
| 115 |
},
|
| 116 |
{
|
| 117 |
"epoch": 0.008,
|
| 118 |
+
"grad_norm": 0.29107317328453064,
|
| 119 |
"learning_rate": 0.0001985985985985986,
|
| 120 |
+
"loss": 3.7366,
|
| 121 |
"step": 80
|
| 122 |
},
|
| 123 |
{
|
| 124 |
"epoch": 0.0085,
|
| 125 |
+
"grad_norm": 0.3126203417778015,
|
| 126 |
"learning_rate": 0.0001984984984984985,
|
| 127 |
+
"loss": 3.7243,
|
| 128 |
"step": 85
|
| 129 |
},
|
| 130 |
{
|
| 131 |
"epoch": 0.009,
|
| 132 |
+
"grad_norm": 0.3028947710990906,
|
| 133 |
"learning_rate": 0.0001983983983983984,
|
| 134 |
+
"loss": 3.6909,
|
| 135 |
"step": 90
|
| 136 |
},
|
| 137 |
{
|
| 138 |
"epoch": 0.0095,
|
| 139 |
+
"grad_norm": 0.3013005554676056,
|
| 140 |
"learning_rate": 0.00019829829829829833,
|
| 141 |
+
"loss": 3.6686,
|
| 142 |
"step": 95
|
| 143 |
},
|
| 144 |
{
|
| 145 |
"epoch": 0.01,
|
| 146 |
+
"grad_norm": 0.26517948508262634,
|
| 147 |
"learning_rate": 0.0001981981981981982,
|
| 148 |
+
"loss": 3.6513,
|
| 149 |
"step": 100
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
}
|
| 151 |
],
|
| 152 |
"logging_steps": 5,
|
|
|
|
| 166 |
"attributes": {}
|
| 167 |
}
|
| 168 |
},
|
| 169 |
+
"total_flos": 41292084019200.0,
|
| 170 |
"train_batch_size": 4,
|
| 171 |
"trial_name": null,
|
| 172 |
"trial_params": null
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 5713
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:87f7094c9781b5c9394410d447866dce36653e1a7dc4508ca501767ea42b00ab
|
| 3 |
size 5713
|