Azrail committed on
Commit
06918f2
·
verified ·
1 Parent(s): 5e94c44

Model save

Browse files
README.md ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags:
4
+ - generated_from_trainer
5
+ model-index:
6
+ - name: smallm_350
7
+ results: []
8
+ ---
9
+
10
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
11
+ should probably proofread and complete it, then remove this comment. -->
12
+
13
+ # smallm_350
14
+
15
+ This model is a fine-tuned version of an unspecified base model, trained on an unspecified dataset.
16
+ It achieves the following results on the evaluation set:
17
+ - Loss: 1.9510
18
+ - Num Input Tokens Seen: 73388446624
19
+
20
+ ## Model description
21
+
22
+ More information needed
23
+
24
+ ## Intended uses & limitations
25
+
26
+ More information needed
27
+
28
+ ## Training and evaluation data
29
+
30
+ More information needed
31
+
32
+ ## Training procedure
33
+
34
+ ### Training hyperparameters
35
+
36
+ The following hyperparameters were used during training:
37
+ - learning_rate: 0.001
38
+ - train_batch_size: 32
39
+ - eval_batch_size: 4
40
+ - seed: 42
41
+ - gradient_accumulation_steps: 16
42
+ - total_train_batch_size: 512
43
+ - optimizer: Use OptimizerNames.ADAMW_APEX_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
44
+ - lr_scheduler_type: warmup_stable_decay
45
+ - lr_scheduler_warmup_steps: 500
46
+ - training_steps: 140000
47
+
48
+ ### Training results
49
+
50
+ | Training Loss | Epoch | Step | Validation Loss | Input Tokens Seen |
51
+ |:-------------:|:------:|:------:|:---------------:|:-----------------:|
52
+ | 3.8584 | 0.0048 | 500 | 3.6764 | 262104320 |
53
+ | 3.1457 | 0.0095 | 1000 | 3.0411 | 524218944 |
54
+ | 2.9472 | 0.0143 | 1500 | 2.8491 | 786331040 |
55
+ | 2.8357 | 0.0191 | 2000 | 2.7469 | 1048424032 |
56
+ | 2.7564 | 0.0239 | 2500 | 2.6824 | 1310500000 |
57
+ | 2.719 | 0.0286 | 3000 | 2.6300 | 1572599776 |
58
+ | 2.664 | 0.0334 | 3500 | 2.5908 | 1834732864 |
59
+ | 2.6351 | 0.0382 | 4000 | 2.5541 | 2096832320 |
60
+ | 2.5897 | 0.0429 | 4500 | 2.5187 | 2358960576 |
61
+ | 2.5672 | 0.0477 | 5000 | 2.4908 | 2621089856 |
62
+ | 2.5412 | 0.0525 | 5500 | 2.4639 | 2883166048 |
63
+ | 2.5194 | 0.0572 | 6000 | 2.4423 | 3145255744 |
64
+ | 2.5105 | 0.0620 | 6500 | 2.4206 | 3407356352 |
65
+ | 2.4816 | 0.0668 | 7000 | 2.4051 | 3669468320 |
66
+ | 2.4688 | 0.0716 | 7500 | 2.3871 | 3931543616 |
67
+ | 2.4446 | 0.0763 | 8000 | 2.3720 | 4193649312 |
68
+ | 2.4318 | 0.0811 | 8500 | 2.3614 | 4455747616 |
69
+ | 2.4292 | 0.0859 | 9000 | 2.3485 | 4717856864 |
70
+ | 2.4151 | 0.0906 | 9500 | 2.3371 | 4979955776 |
71
+ | 2.4056 | 0.0954 | 10000 | 2.3263 | 5242058496 |
72
+ | 2.3912 | 0.1002 | 10500 | 2.3170 | 5504131840 |
73
+ | 2.3781 | 0.1049 | 11000 | 2.3075 | 5766222432 |
74
+ | 2.3776 | 0.1097 | 11500 | 2.2976 | 6028313408 |
75
+ | 2.3759 | 0.1145 | 12000 | 2.2900 | 6290430912 |
76
+ | 2.3501 | 0.1193 | 12500 | 2.2824 | 6552526688 |
77
+ | 2.3572 | 0.1240 | 13000 | 2.2758 | 6814626336 |
78
+ | 2.3451 | 0.1288 | 13500 | 2.2677 | 7076718080 |
79
+ | 2.3437 | 0.1336 | 14000 | 2.2614 | 7338830528 |
80
+ | 2.328 | 0.1383 | 14500 | 2.2562 | 7600938432 |
81
+ | 2.3288 | 0.1431 | 15000 | 2.2487 | 7863022432 |
82
+ | 2.3259 | 0.1479 | 15500 | 2.2430 | 8125131456 |
83
+ | 2.3103 | 0.1526 | 16000 | 2.2366 | 8387218560 |
84
+ | 2.3137 | 0.1574 | 16500 | 2.2321 | 8649321536 |
85
+ | 2.3148 | 0.1622 | 17000 | 2.2286 | 8911431360 |
86
+ | 2.3076 | 0.1670 | 17500 | 2.2226 | 9173533056 |
87
+ | 2.2963 | 0.1717 | 18000 | 2.2166 | 9435637536 |
88
+ | 2.3052 | 0.1765 | 18500 | 2.2123 | 9697738752 |
89
+ | 2.2941 | 0.1813 | 19000 | 2.2090 | 9959851776 |
90
+ | 2.2888 | 0.1860 | 19500 | 2.2118 | 10221963136 |
91
+ | 2.2894 | 0.1908 | 20000 | 2.2017 | 10484059168 |
92
+ | 2.2882 | 0.1956 | 20500 | 2.1974 | 10746164768 |
93
+ | 2.2677 | 0.2003 | 21000 | 2.1927 | 11008255872 |
94
+ | 2.2577 | 0.2051 | 21500 | 2.1901 | 11270362240 |
95
+ | 2.2725 | 0.2099 | 22000 | 2.1856 | 11532457408 |
96
+ | 2.2519 | 0.2147 | 22500 | 2.1839 | 11794558656 |
97
+ | 2.266 | 0.2194 | 23000 | 2.1793 | 12056655104 |
98
+ | 2.2531 | 0.2242 | 23500 | 2.1767 | 12318747360 |
99
+ | 2.2522 | 0.2290 | 24000 | 2.1732 | 12580853504 |
100
+ | 2.2604 | 0.2337 | 24500 | 2.1710 | 12842964128 |
101
+ | 2.253 | 0.2385 | 25000 | 2.1673 | 13105069824 |
102
+ | 2.2388 | 0.2433 | 25500 | 2.1654 | 13367175456 |
103
+ | 2.2511 | 0.2480 | 26000 | 2.1629 | 13629260960 |
104
+ | 2.2453 | 0.2528 | 26500 | 2.1592 | 13891381408 |
105
+ | 2.2302 | 0.2576 | 27000 | 2.1568 | 14153506688 |
106
+ | 2.2305 | 0.2624 | 27500 | 2.1540 | 14415584288 |
107
+ | 2.2285 | 0.2671 | 28000 | 2.1511 | 14677696896 |
108
+ | 2.23 | 0.2719 | 28500 | 2.1498 | 14939780320 |
109
+ | 2.2136 | 0.2767 | 29000 | 2.1479 | 15201894176 |
110
+ | 2.2333 | 0.2814 | 29500 | 2.1446 | 15463988928 |
111
+ | 2.2241 | 0.2862 | 30000 | 2.1426 | 15726072896 |
112
+ | 2.2318 | 0.2910 | 30500 | 2.1404 | 15988180544 |
113
+ | 2.2156 | 0.2957 | 31000 | 2.1403 | 16250292672 |
114
+ | 2.2184 | 0.3005 | 31500 | 2.1374 | 16512390176 |
115
+ | 2.2261 | 0.3053 | 32000 | 2.1346 | 16774478112 |
116
+ | 2.2091 | 0.3101 | 32500 | 2.1335 | 17036582752 |
117
+ | 2.2222 | 0.3148 | 33000 | 2.1309 | 17298706464 |
118
+ | 2.2172 | 0.3196 | 33500 | 2.1288 | 17560813216 |
119
+ | 2.2003 | 0.3244 | 34000 | 2.1275 | 17822921152 |
120
+ | 2.2152 | 0.3291 | 34500 | 2.1281 | 18085018528 |
121
+ | 2.2076 | 0.3339 | 35000 | 2.1236 | 18347147168 |
122
+ | 2.2005 | 0.3387 | 35500 | 2.1224 | 18609222656 |
123
+ | 2.1975 | 0.3434 | 36000 | 2.1199 | 18871341760 |
124
+ | 2.201 | 0.3482 | 36500 | 2.1179 | 19133442016 |
125
+ | 2.1968 | 0.3530 | 37000 | 2.1169 | 19395541920 |
126
+ | 2.204 | 0.3578 | 37500 | 2.1144 | 19657633088 |
127
+ | 2.1979 | 0.3625 | 38000 | 2.1132 | 19919735840 |
128
+ | 2.1898 | 0.3673 | 38500 | 2.1110 | 20181833600 |
129
+ | 2.1859 | 0.3721 | 39000 | 2.1110 | 20443920960 |
130
+ | 2.188 | 0.3768 | 39500 | 2.1096 | 20706018592 |
131
+ | 2.1932 | 0.3816 | 40000 | 2.1082 | 20968112960 |
132
+ | 2.1933 | 0.3864 | 40500 | 2.1045 | 21230223584 |
133
+ | 2.1907 | 0.3911 | 41000 | 2.1057 | 21492310496 |
134
+ | 2.1806 | 0.3959 | 41500 | 2.1030 | 21754409120 |
135
+ | 2.1834 | 0.4007 | 42000 | 2.1014 | 22016505376 |
136
+ | 2.1914 | 0.4055 | 42500 | 2.1016 | 22278605888 |
137
+ | 2.1932 | 0.4102 | 43000 | 2.0990 | 22540715296 |
138
+ | 2.2209 | 0.4150 | 43500 | 2.1086 | 22802815776 |
139
+ | 2.1856 | 0.4198 | 44000 | 2.0981 | 23064909408 |
140
+ | 2.1823 | 0.4245 | 44500 | 2.0960 | 23327017760 |
141
+ | 2.1862 | 0.4293 | 45000 | 2.0936 | 23589115072 |
142
+ | 2.182 | 0.4341 | 45500 | 2.0927 | 23851210336 |
143
+ | 2.1729 | 0.4388 | 46000 | 2.0917 | 24113281184 |
144
+ | 2.177 | 0.4436 | 46500 | 2.0904 | 24375397792 |
145
+ | 2.1674 | 0.4484 | 47000 | 2.0890 | 24637513248 |
146
+ | 2.1608 | 0.4532 | 47500 | 2.0887 | 24899608000 |
147
+ | 2.1808 | 0.4579 | 48000 | 2.0938 | 25161718656 |
148
+ | 2.1811 | 0.4627 | 48500 | 2.0870 | 25423801984 |
149
+ | 2.1621 | 0.4675 | 49000 | 2.0852 | 25685912544 |
150
+ | 2.1722 | 0.4722 | 49500 | 2.0832 | 25948022560 |
151
+ | 2.1745 | 0.4770 | 50000 | 2.0824 | 26210133120 |
152
+ | 2.1529 | 0.4818 | 50500 | 2.0812 | 26472227840 |
153
+ | 2.169 | 0.4865 | 51000 | 2.0815 | 26734340064 |
154
+ | 2.1738 | 0.4913 | 51500 | 2.0796 | 26996432960 |
155
+ | 2.169 | 0.4961 | 52000 | 2.0802 | 27258524544 |
156
+ | 2.1557 | 0.5009 | 52500 | 2.0776 | 27520636736 |
157
+ | 2.1765 | 0.5056 | 53000 | 2.0828 | 27782732608 |
158
+ | 2.1616 | 0.5104 | 53500 | 2.0767 | 28044839456 |
159
+ | 2.1569 | 0.5152 | 54000 | 2.0758 | 28306946368 |
160
+ | 2.1561 | 0.5199 | 54500 | 2.0746 | 28569047936 |
161
+ | 2.1554 | 0.5247 | 55000 | 2.0725 | 28831152896 |
162
+ | 2.1505 | 0.5295 | 55500 | 2.0716 | 29093257888 |
163
+ | 2.1491 | 0.5342 | 56000 | 2.0714 | 29355372320 |
164
+ | 2.1471 | 0.5390 | 56500 | 2.0707 | 29617485024 |
165
+ | 2.1465 | 0.5438 | 57000 | 2.0692 | 29879599072 |
166
+ | 2.1511 | 0.5486 | 57500 | 2.0681 | 30141698752 |
167
+ | 2.1456 | 0.5533 | 58000 | 2.0688 | 30403788864 |
168
+ | 2.1591 | 0.5581 | 58500 | 2.0664 | 30665890560 |
169
+ | 2.1508 | 0.5629 | 59000 | 2.0671 | 30927998464 |
170
+ | 2.1466 | 0.5676 | 59500 | 2.0664 | 31190116608 |
171
+ | 2.1457 | 0.5724 | 60000 | 2.0640 | 31452217632 |
172
+ | 2.1496 | 0.5772 | 60500 | 2.0636 | 31714314848 |
173
+ | 2.1418 | 0.5819 | 61000 | 2.0649 | 31976431072 |
174
+ | 2.1477 | 0.5867 | 61500 | 2.0638 | 32238532768 |
175
+ | 2.137 | 0.5915 | 62000 | 2.0610 | 32500617568 |
176
+ | 2.1415 | 0.5963 | 62500 | 2.0606 | 32762704928 |
177
+ | 2.1459 | 0.6010 | 63000 | 2.0603 | 33024820736 |
178
+ | 2.1389 | 0.6058 | 63500 | 2.0586 | 33286935872 |
179
+ | 2.1367 | 0.6106 | 64000 | 2.0588 | 33549034848 |
180
+ | 2.147 | 0.6153 | 64500 | 2.0593 | 33811149696 |
181
+ | 2.1415 | 0.6201 | 65000 | 2.0580 | 34073267168 |
182
+ | 2.1426 | 0.6249 | 65500 | 2.0569 | 34335361632 |
183
+ | 2.1483 | 0.6296 | 66000 | 2.0545 | 34597457472 |
184
+ | 2.1409 | 0.6344 | 66500 | 2.0548 | 34859578368 |
185
+ | 2.1368 | 0.6392 | 67000 | 2.0555 | 35121682240 |
186
+ | 2.1366 | 0.6440 | 67500 | 2.0543 | 35383796224 |
187
+ | 2.137 | 0.6487 | 68000 | 2.0545 | 35645894016 |
188
+ | 2.1342 | 0.6535 | 68500 | 2.0521 | 35907993664 |
189
+ | 2.1388 | 0.6583 | 69000 | 2.0507 | 36170105088 |
190
+ | 2.1339 | 0.6630 | 69500 | 2.0502 | 36432197248 |
191
+ | 2.118 | 0.6678 | 70000 | 2.0507 | 36694303392 |
192
+ | 2.134 | 0.6726 | 70500 | 2.0488 | 36956418880 |
193
+ | 2.1461 | 0.6773 | 71000 | 2.0492 | 37218518048 |
194
+ | 2.1361 | 0.6821 | 71500 | 2.0479 | 37480608672 |
195
+ | 2.1369 | 0.6869 | 72000 | 2.0480 | 37742700480 |
196
+ | 2.1369 | 0.6917 | 72500 | 2.0477 | 38004789088 |
197
+ | 2.1379 | 0.6964 | 73000 | 2.0459 | 38266876896 |
198
+ | 2.1235 | 0.7012 | 73500 | 2.0458 | 38528990144 |
199
+ | 2.127 | 0.7060 | 74000 | 2.0454 | 38791103008 |
200
+ | 2.1362 | 0.7107 | 74500 | 2.0453 | 39053184384 |
201
+ | 2.1096 | 0.7155 | 75000 | 2.0448 | 39315259072 |
202
+ | 2.1273 | 0.7203 | 75500 | 2.0433 | 39577349376 |
203
+ | 2.1325 | 0.7250 | 76000 | 2.0433 | 39839459232 |
204
+ | 2.1261 | 0.7298 | 76500 | 2.0422 | 40101539296 |
205
+ | 2.1284 | 0.7346 | 77000 | 2.0418 | 40363651328 |
206
+ | 2.1152 | 0.7394 | 77500 | 2.0417 | 40625741536 |
207
+ | 2.1353 | 0.7441 | 78000 | 2.0413 | 40887844000 |
208
+ | 2.1214 | 0.7489 | 78500 | 2.0390 | 41149941248 |
209
+ | 2.1184 | 0.7537 | 79000 | 2.0395 | 41412054816 |
210
+ | 2.1199 | 0.7584 | 79500 | 2.0389 | 41674139648 |
211
+ | 2.129 | 0.7632 | 80000 | 2.0402 | 41936220160 |
212
+ | 2.1247 | 0.7680 | 80500 | 2.0385 | 42198339200 |
213
+ | 2.1147 | 0.7727 | 81000 | 2.0367 | 42460451136 |
214
+ | 2.1265 | 0.7775 | 81500 | 2.0365 | 42722562592 |
215
+ | 2.1254 | 0.7823 | 82000 | 2.0358 | 42984652928 |
216
+ | 2.1121 | 0.7871 | 82500 | 2.0354 | 43246727552 |
217
+ | 2.1235 | 0.7918 | 83000 | 2.0352 | 43508842912 |
218
+ | 2.1192 | 0.7966 | 83500 | 2.0352 | 43770947360 |
219
+ | 2.1303 | 0.8014 | 84000 | 2.0382 | 44033053408 |
220
+ | 2.1247 | 0.8061 | 84500 | 2.0337 | 44295182016 |
221
+ | 2.125 | 0.8109 | 85000 | 2.0327 | 44557289600 |
222
+ | 2.1218 | 0.8157 | 85500 | 2.0317 | 44819414752 |
223
+ | 2.1122 | 0.8204 | 86000 | 2.0312 | 45081505024 |
224
+ | 2.1293 | 0.8252 | 86500 | 2.0313 | 45343592896 |
225
+ | 2.1161 | 0.8300 | 87000 | 2.0335 | 45605708608 |
226
+ | 2.1237 | 0.8348 | 87500 | 2.0365 | 45867808448 |
227
+ | 2.1206 | 0.8395 | 88000 | 2.0313 | 46129884928 |
228
+ | 2.1191 | 0.8443 | 88500 | 2.0305 | 46391967104 |
229
+ | 2.1075 | 0.8491 | 89000 | 2.0287 | 46654069856 |
230
+ | 2.1109 | 0.8538 | 89500 | 2.0292 | 46916134592 |
231
+ | 2.1208 | 0.8586 | 90000 | 2.0290 | 47178248768 |
232
+ | 2.1056 | 0.8634 | 90500 | 2.0283 | 47440355904 |
233
+ | 2.1071 | 0.8681 | 91000 | 2.0275 | 47702460032 |
234
+ | 2.1069 | 0.8729 | 91500 | 2.0270 | 47964538176 |
235
+ | 2.1003 | 0.8777 | 92000 | 2.0305 | 48226649920 |
236
+ | 2.102 | 0.8825 | 92500 | 2.0255 | 48488770560 |
237
+ | 2.1141 | 0.8872 | 93000 | 2.0259 | 48750829152 |
238
+ | 2.1241 | 0.8920 | 93500 | 2.0255 | 49012940480 |
239
+ | 2.1154 | 0.8968 | 94000 | 2.0230 | 49275046560 |
240
+ | 2.0967 | 0.9015 | 94500 | 2.0232 | 49537142912 |
241
+ | 2.1101 | 0.9063 | 95000 | 2.0240 | 49799231680 |
242
+ | 2.1087 | 0.9111 | 95500 | 2.0230 | 50061331936 |
243
+ | 2.1011 | 0.9158 | 96000 | 2.0229 | 50323443136 |
244
+ | 2.1052 | 0.9206 | 96500 | 2.0226 | 50585537568 |
245
+ | 2.1024 | 0.9254 | 97000 | 2.0225 | 50847631776 |
246
+ | 2.1032 | 0.9302 | 97500 | 2.0224 | 51109732512 |
247
+ | 2.0984 | 0.9349 | 98000 | 2.0211 | 51371830432 |
248
+ | 2.1133 | 0.9397 | 98500 | 2.0206 | 51633931744 |
249
+ | 2.1038 | 0.9445 | 99000 | 2.0205 | 51896030848 |
250
+ | 2.1021 | 0.9492 | 99500 | 2.0222 | 52158147168 |
251
+ | 2.1116 | 0.9540 | 100000 | 2.0211 | 52420247968 |
252
+ | 2.1026 | 0.9588 | 100500 | 2.0197 | 52682343360 |
253
+ | 2.1031 | 0.9635 | 101000 | 2.0206 | 52944447232 |
254
+ | 2.1127 | 0.9683 | 101500 | 2.0185 | 53206553568 |
255
+ | 2.0992 | 0.9731 | 102000 | 2.0180 | 53468658336 |
256
+ | 2.0968 | 0.9779 | 102500 | 2.0211 | 53730772800 |
257
+ | 2.092 | 0.9826 | 103000 | 2.0180 | 53992863264 |
258
+ | 2.1016 | 0.9874 | 103500 | 2.0163 | 54254966464 |
259
+ | 2.1098 | 0.9922 | 104000 | 2.0174 | 54517083360 |
260
+ | 2.1103 | 0.9969 | 104500 | 2.0176 | 54779191328 |
261
+ | 2.0879 | 1.0017 | 105000 | 2.0179 | 55041423104 |
262
+ | 2.0964 | 1.0065 | 105500 | 2.0158 | 55303527552 |
263
+ | 2.0989 | 1.0112 | 106000 | 2.0157 | 55565623584 |
264
+ | 2.1079 | 1.0160 | 106500 | 2.0186 | 55827708736 |
265
+ | 2.1069 | 1.0208 | 107000 | 2.0148 | 56089794304 |
266
+ | 2.0987 | 1.0256 | 107500 | 2.0147 | 56351910752 |
267
+ | 2.095 | 1.0303 | 108000 | 2.0153 | 56614015392 |
268
+ | 2.097 | 1.0351 | 108500 | 2.0137 | 56876128320 |
269
+ | 2.099 | 1.0399 | 109000 | 2.0129 | 57138237248 |
270
+ | 2.0952 | 1.0446 | 109500 | 2.0124 | 57400333280 |
271
+ | 2.0895 | 1.0494 | 110000 | 2.0132 | 57662439520 |
272
+ | 2.0945 | 1.0542 | 110500 | 2.0113 | 57924525696 |
273
+ | 2.0978 | 1.0589 | 111000 | 2.0107 | 58186625280 |
274
+ | 2.0873 | 1.0637 | 111500 | 2.0106 | 58448739744 |
275
+ | 2.0768 | 1.0685 | 112000 | 2.0110 | 58710836096 |
276
+ | 2.0901 | 1.0733 | 112500 | 2.0121 | 58972918688 |
277
+ | 2.0889 | 1.0780 | 113000 | 2.0116 | 59235047232 |
278
+ | 2.0985 | 1.0828 | 113500 | 2.0100 | 59497148000 |
279
+ | 2.1012 | 1.0876 | 114000 | 2.0105 | 59759249088 |
280
+ | 2.0837 | 1.0923 | 114500 | 2.0083 | 60021363392 |
281
+ | 2.0992 | 1.0971 | 115000 | 2.0077 | 60283464768 |
282
+ | 2.0916 | 1.1019 | 115500 | 2.0105 | 60545534656 |
283
+ | 2.1125 | 1.1066 | 116000 | 2.0121 | 60807636160 |
284
+ | 2.0889 | 1.1114 | 116500 | 2.0075 | 61069752768 |
285
+ | 2.0879 | 1.1162 | 117000 | 2.0054 | 61331831488 |
286
+ | 2.0759 | 1.1210 | 117500 | 2.0039 | 61593927520 |
287
+ | 2.0801 | 1.1257 | 118000 | 2.0020 | 61856020192 |
288
+ | 2.0836 | 1.1305 | 118500 | 2.0045 | 62118128864 |
289
+ | 2.0858 | 1.1353 | 119000 | 2.0006 | 62380238112 |
290
+ | 2.0836 | 1.1400 | 119500 | 1.9990 | 62642339520 |
291
+ | 2.0956 | 1.1448 | 120000 | 2.0033 | 62904447680 |
292
+ | 2.0796 | 1.1496 | 120500 | 1.9961 | 63166554368 |
293
+ | 2.077 | 1.1543 | 121000 | 1.9940 | 63428647904 |
294
+ | 2.0938 | 1.1591 | 121500 | 1.9930 | 63690757024 |
295
+ | 2.072 | 1.1639 | 122000 | 1.9911 | 63952872768 |
296
+ | 2.0684 | 1.1687 | 122500 | 1.9892 | 64214942816 |
297
+ | 2.0564 | 1.1734 | 123000 | 1.9881 | 64477051392 |
298
+ | 2.066 | 1.1782 | 123500 | 1.9865 | 64739145440 |
299
+ | 2.0639 | 1.1830 | 124000 | 1.9843 | 65001257824 |
300
+ | 2.0675 | 1.1877 | 124500 | 1.9832 | 65263378112 |
301
+ | 2.0692 | 1.1925 | 125000 | 1.9822 | 65525493280 |
302
+ | 2.0645 | 1.1973 | 125500 | 1.9799 | 65787582144 |
303
+ | 2.0556 | 1.2020 | 126000 | 1.9784 | 66049692768 |
304
+ | 2.0581 | 1.2068 | 126500 | 1.9767 | 66311770112 |
305
+ | 2.0633 | 1.2116 | 127000 | 1.9741 | 66573856704 |
306
+ | 2.0625 | 1.2164 | 127500 | 1.9729 | 66835970592 |
307
+ | 2.0557 | 1.2211 | 128000 | 1.9715 | 67098059328 |
308
+ | 2.0475 | 1.2259 | 128500 | 1.9702 | 67360131616 |
309
+ | 2.0558 | 1.2307 | 129000 | 1.9687 | 67622231264 |
310
+ | 2.0405 | 1.2354 | 129500 | 1.9668 | 67884327168 |
311
+ | 2.038 | 1.2402 | 130000 | 1.9652 | 68146442176 |
312
+ | 2.0338 | 1.2450 | 130500 | 1.9644 | 68408534464 |
313
+ | 2.0506 | 1.2497 | 131000 | 1.9626 | 68670633664 |
314
+ | 2.0432 | 1.2545 | 131500 | 1.9614 | 68932735776 |
315
+ | 2.0361 | 1.2593 | 132000 | 1.9603 | 69194840608 |
316
+ | 2.0347 | 1.2641 | 132500 | 1.9590 | 69456943424 |
317
+ | 2.0354 | 1.2688 | 133000 | 1.9577 | 69719047840 |
318
+ | 2.0419 | 1.2736 | 133500 | 1.9569 | 69981165632 |
319
+ | 2.0345 | 1.2784 | 134000 | 1.9554 | 70243253472 |
320
+ | 2.0393 | 1.2831 | 134500 | 1.9546 | 70505353504 |
321
+ | 2.0401 | 1.2879 | 135000 | 1.9541 | 70767457344 |
322
+ | 2.037 | 1.2927 | 135500 | 1.9531 | 71029545920 |
323
+ | 2.0302 | 1.2974 | 136000 | 1.9527 | 71291638272 |
324
+ | 2.0173 | 1.3022 | 136500 | 1.9525 | 71553726688 |
325
+ | 2.0287 | 1.3070 | 137000 | 1.9519 | 71815816608 |
326
+ | 2.0254 | 1.3118 | 137500 | 1.9515 | 72077935168 |
327
+ | 2.0286 | 1.3165 | 138000 | 1.9512 | 72340003200 |
328
+ | 2.0285 | 1.3213 | 138500 | 1.9511 | 72602127392 |
329
+ | 2.0353 | 1.3261 | 139000 | 1.9511 | 72864248896 |
330
+ | 2.0366 | 1.3308 | 139500 | 1.9510 | 73126349984 |
331
+ | 2.0315 | 1.3356 | 140000 | 1.9510 | 73388446624 |
332
+
333
+
334
+ ### Framework versions
335
+
336
+ - Transformers 4.50.3
337
+ - Pytorch 2.6.0+cu126
338
+ - Datasets 3.5.0
339
+ - Tokenizers 0.21.1
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 0,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.50.3"
7
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|beginoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<|endoftext|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<|beginoftext|>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "<|reserved_token_1|>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<|reserved_token_2|>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "<|reserved_token_3|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "<|reserved_token_4|>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "6": {
52
+ "content": "<|reserved_token_5|>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "7": {
60
+ "content": "<|reserved_token_6|>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "8": {
68
+ "content": "<|reserved_token_7|>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "9": {
76
+ "content": "<|reserved_token_8|>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "10": {
84
+ "content": "<|reserved_token_9|>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "11": {
92
+ "content": "<|reserved_token_10|>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "12": {
100
+ "content": "<|reserved_token_11|>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ }
107
+ },
108
+ "bos_token": "<|beginoftext|>",
109
+ "clean_up_tokenization_spaces": false,
110
+ "eos_token": "<|endoftext|>",
111
+ "extra_special_tokens": {},
112
+ "model_max_length": 1000000000000000019884624838656,
113
+ "pad_token": "<|endoftext|>",
114
+ "tokenizer_class": "PreTrainedTokenizer"
115
+ }
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff