CocoRoF commited on
Commit
1cc5ddc
·
verified ·
1 Parent(s): 6b003f2

Training in progress, step 2000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f95abb4f35b22a7e758afd27aca2a69f419698ad1276e97da477eb487b8e37f1
3
  size 791869518
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ddca534d10503034b593de387e205daca04a072f5ccbe17faac957a202b96d5
3
  size 791869518
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:048bbd1cabc0a45ad09ea3651c3b6e3d5d885845f3d358d903b243b17193ba3a
3
  size 2375752250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9cb1c32a2fe7e4ba0a80cc2f7de739a5497222f3987f01b2d711b570de3fbe5c
3
  size 2375752250
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0c461c9d337dfc684e9352ec72bfa344e2f5d377f7cfc4475de9acae294dca89
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06fea830cf5ad73ec00d500ea6fb952740ac936f18e93fa2d32abde1ea3ead92
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fae392ec6232cbf9da21d6ed12bc8247d0d24e7f3a3606acd23be00f3e8bbfc5
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be561d1df19be227394d8ea607c54262a06c9bf880af0aa5e04a52596a2a6cb0
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cbf3e7ca9991a58b0b16574a3c653483c551c270aa05aba06c162ea593f7b0f2
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03f3e24417a59435f5a8450a4aeb0f09cc92734b5c3b45a0701b2c043c415c05
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c695bebf6bcb75cbe26378bfe0ab7e2a33c49f713b9d6e4d10632b24322977e7
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bea02744c29f30024590ab1629a0e7b7dabbf1e8476456c2e7c5ce46dc35c28
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5ebb13c71265c5464c9aa9bb9b66f07764d73befe6cd63a2aaf8e781bf0a374
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:041be966454b60c86af576fc1eb7f34189114689abff8f9622b947110f7334c8
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:12cc6e245e189be568c8dfd43a4dd8f04bb3dbd9f17f41458107935d2c2a6a9d
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b85766f6596d15a810177d77dd259d9b50588cf100ec5f8ebff5fed881d57957
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36086646e9a8f76fea69f8a227112e83bb63524964ccdfb82f4cdad88b90e5e4
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8be75d04b1ebe614241b88fd010a5dda1b7bf703c00c6ebe310ca07975830fe7
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b44153bacf860d0ca6ce4c6b9380a199feab8a72ca613e6745bfb671b02c4e4
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4699833a7ab4cb692996ef7567f934c0bac79d6a067963a873f89a38e412bd48
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1c784d1e14f175eecd1cb8a33bf7e3edbddb5399a3760000ee27c1d7309b565d
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06d017eeef42e0127e56fc73579acf75949a31de7dbc0f95bf4428c95dd75f92
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.34676318241910664,
5
  "eval_steps": 500,
6
- "global_step": 1500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2131,6 +2131,714 @@
2131
  "eval_samples_per_second": 609.887,
2132
  "eval_steps_per_second": 38.118,
2133
  "step": 1500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2134
  }
2135
  ],
2136
  "logging_steps": 5,
@@ -2150,7 +2858,7 @@
2150
  "attributes": {}
2151
  }
2152
  },
2153
- "total_flos": 6.498536989183181e+18,
2154
  "train_batch_size": 4,
2155
  "trial_name": null,
2156
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.4623509098921422,
5
  "eval_steps": 500,
6
+ "global_step": 2000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2131
  "eval_samples_per_second": 609.887,
2132
  "eval_steps_per_second": 38.118,
2133
  "step": 1500
2134
+ },
2135
+ {
2136
+ "epoch": 0.347919059693837,
2137
+ "grad_norm": 167.875,
2138
+ "learning_rate": 7.245632065775952e-06,
2139
+ "loss": 71.7923,
2140
+ "step": 1505
2141
+ },
2142
+ {
2143
+ "epoch": 0.34907493696856734,
2144
+ "grad_norm": 180.625,
2145
+ "learning_rate": 7.2327852004111e-06,
2146
+ "loss": 72.9641,
2147
+ "step": 1510
2148
+ },
2149
+ {
2150
+ "epoch": 0.3502308142432977,
2151
+ "grad_norm": 161.875,
2152
+ "learning_rate": 7.21993833504625e-06,
2153
+ "loss": 71.7615,
2154
+ "step": 1515
2155
+ },
2156
+ {
2157
+ "epoch": 0.3513866915180281,
2158
+ "grad_norm": 183.5,
2159
+ "learning_rate": 7.2070914696813985e-06,
2160
+ "loss": 71.8146,
2161
+ "step": 1520
2162
+ },
2163
+ {
2164
+ "epoch": 0.3525425687927584,
2165
+ "grad_norm": 162.0,
2166
+ "learning_rate": 7.194244604316547e-06,
2167
+ "loss": 71.0128,
2168
+ "step": 1525
2169
+ },
2170
+ {
2171
+ "epoch": 0.3536984460674888,
2172
+ "grad_norm": 163.375,
2173
+ "learning_rate": 7.181397738951696e-06,
2174
+ "loss": 71.2744,
2175
+ "step": 1530
2176
+ },
2177
+ {
2178
+ "epoch": 0.3548543233422191,
2179
+ "grad_norm": 173.75,
2180
+ "learning_rate": 7.168550873586846e-06,
2181
+ "loss": 72.7448,
2182
+ "step": 1535
2183
+ },
2184
+ {
2185
+ "epoch": 0.3560102006169495,
2186
+ "grad_norm": 182.625,
2187
+ "learning_rate": 7.155704008221994e-06,
2188
+ "loss": 72.3184,
2189
+ "step": 1540
2190
+ },
2191
+ {
2192
+ "epoch": 0.35716607789167987,
2193
+ "grad_norm": 169.625,
2194
+ "learning_rate": 7.1428571428571436e-06,
2195
+ "loss": 73.2466,
2196
+ "step": 1545
2197
+ },
2198
+ {
2199
+ "epoch": 0.3583219551664102,
2200
+ "grad_norm": 169.375,
2201
+ "learning_rate": 7.1300102774922916e-06,
2202
+ "loss": 72.9034,
2203
+ "step": 1550
2204
+ },
2205
+ {
2206
+ "epoch": 0.35947783244114057,
2207
+ "grad_norm": 155.0,
2208
+ "learning_rate": 7.117163412127441e-06,
2209
+ "loss": 70.7591,
2210
+ "step": 1555
2211
+ },
2212
+ {
2213
+ "epoch": 0.36063370971587094,
2214
+ "grad_norm": 192.25,
2215
+ "learning_rate": 7.104316546762591e-06,
2216
+ "loss": 72.9981,
2217
+ "step": 1560
2218
+ },
2219
+ {
2220
+ "epoch": 0.36178958699060126,
2221
+ "grad_norm": 167.375,
2222
+ "learning_rate": 7.091469681397739e-06,
2223
+ "loss": 71.7857,
2224
+ "step": 1565
2225
+ },
2226
+ {
2227
+ "epoch": 0.36294546426533164,
2228
+ "grad_norm": 160.5,
2229
+ "learning_rate": 7.078622816032889e-06,
2230
+ "loss": 71.0528,
2231
+ "step": 1570
2232
+ },
2233
+ {
2234
+ "epoch": 0.36410134154006196,
2235
+ "grad_norm": 178.75,
2236
+ "learning_rate": 7.0657759506680375e-06,
2237
+ "loss": 72.1775,
2238
+ "step": 1575
2239
+ },
2240
+ {
2241
+ "epoch": 0.36525721881479234,
2242
+ "grad_norm": 161.5,
2243
+ "learning_rate": 7.052929085303186e-06,
2244
+ "loss": 71.9378,
2245
+ "step": 1580
2246
+ },
2247
+ {
2248
+ "epoch": 0.3664130960895227,
2249
+ "grad_norm": 158.25,
2250
+ "learning_rate": 7.040082219938336e-06,
2251
+ "loss": 71.7813,
2252
+ "step": 1585
2253
+ },
2254
+ {
2255
+ "epoch": 0.36756897336425304,
2256
+ "grad_norm": 160.0,
2257
+ "learning_rate": 7.027235354573485e-06,
2258
+ "loss": 71.8137,
2259
+ "step": 1590
2260
+ },
2261
+ {
2262
+ "epoch": 0.3687248506389834,
2263
+ "grad_norm": 165.0,
2264
+ "learning_rate": 7.014388489208634e-06,
2265
+ "loss": 73.0897,
2266
+ "step": 1595
2267
+ },
2268
+ {
2269
+ "epoch": 0.36988072791371374,
2270
+ "grad_norm": 166.125,
2271
+ "learning_rate": 7.0015416238437826e-06,
2272
+ "loss": 71.867,
2273
+ "step": 1600
2274
+ },
2275
+ {
2276
+ "epoch": 0.3710366051884441,
2277
+ "grad_norm": 152.375,
2278
+ "learning_rate": 6.988694758478932e-06,
2279
+ "loss": 72.0306,
2280
+ "step": 1605
2281
+ },
2282
+ {
2283
+ "epoch": 0.3721924824631745,
2284
+ "grad_norm": 153.625,
2285
+ "learning_rate": 6.97584789311408e-06,
2286
+ "loss": 71.2797,
2287
+ "step": 1610
2288
+ },
2289
+ {
2290
+ "epoch": 0.3733483597379048,
2291
+ "grad_norm": 157.75,
2292
+ "learning_rate": 6.96300102774923e-06,
2293
+ "loss": 70.6556,
2294
+ "step": 1615
2295
+ },
2296
+ {
2297
+ "epoch": 0.3745042370126352,
2298
+ "grad_norm": 206.25,
2299
+ "learning_rate": 6.950154162384378e-06,
2300
+ "loss": 70.427,
2301
+ "step": 1620
2302
+ },
2303
+ {
2304
+ "epoch": 0.37566011428736557,
2305
+ "grad_norm": 198.0,
2306
+ "learning_rate": 6.937307297019528e-06,
2307
+ "loss": 70.8553,
2308
+ "step": 1625
2309
+ },
2310
+ {
2311
+ "epoch": 0.3768159915620959,
2312
+ "grad_norm": 172.375,
2313
+ "learning_rate": 6.924460431654677e-06,
2314
+ "loss": 72.5536,
2315
+ "step": 1630
2316
+ },
2317
+ {
2318
+ "epoch": 0.37797186883682626,
2319
+ "grad_norm": 162.25,
2320
+ "learning_rate": 6.911613566289825e-06,
2321
+ "loss": 71.3941,
2322
+ "step": 1635
2323
+ },
2324
+ {
2325
+ "epoch": 0.3791277461115566,
2326
+ "grad_norm": 154.75,
2327
+ "learning_rate": 6.898766700924975e-06,
2328
+ "loss": 70.9165,
2329
+ "step": 1640
2330
+ },
2331
+ {
2332
+ "epoch": 0.38028362338628696,
2333
+ "grad_norm": 181.75,
2334
+ "learning_rate": 6.885919835560124e-06,
2335
+ "loss": 72.0093,
2336
+ "step": 1645
2337
+ },
2338
+ {
2339
+ "epoch": 0.38143950066101734,
2340
+ "grad_norm": 150.0,
2341
+ "learning_rate": 6.873072970195273e-06,
2342
+ "loss": 71.9246,
2343
+ "step": 1650
2344
+ },
2345
+ {
2346
+ "epoch": 0.38259537793574766,
2347
+ "grad_norm": 164.875,
2348
+ "learning_rate": 6.860226104830422e-06,
2349
+ "loss": 71.8933,
2350
+ "step": 1655
2351
+ },
2352
+ {
2353
+ "epoch": 0.38375125521047804,
2354
+ "grad_norm": 157.0,
2355
+ "learning_rate": 6.847379239465571e-06,
2356
+ "loss": 72.0258,
2357
+ "step": 1660
2358
+ },
2359
+ {
2360
+ "epoch": 0.38490713248520836,
2361
+ "grad_norm": 177.375,
2362
+ "learning_rate": 6.834532374100719e-06,
2363
+ "loss": 72.3605,
2364
+ "step": 1665
2365
+ },
2366
+ {
2367
+ "epoch": 0.38606300975993874,
2368
+ "grad_norm": 175.625,
2369
+ "learning_rate": 6.821685508735869e-06,
2370
+ "loss": 71.7816,
2371
+ "step": 1670
2372
+ },
2373
+ {
2374
+ "epoch": 0.3872188870346691,
2375
+ "grad_norm": 178.375,
2376
+ "learning_rate": 6.808838643371019e-06,
2377
+ "loss": 71.8166,
2378
+ "step": 1675
2379
+ },
2380
+ {
2381
+ "epoch": 0.38837476430939943,
2382
+ "grad_norm": 164.0,
2383
+ "learning_rate": 6.795991778006167e-06,
2384
+ "loss": 72.0027,
2385
+ "step": 1680
2386
+ },
2387
+ {
2388
+ "epoch": 0.3895306415841298,
2389
+ "grad_norm": 168.0,
2390
+ "learning_rate": 6.783144912641316e-06,
2391
+ "loss": 71.8431,
2392
+ "step": 1685
2393
+ },
2394
+ {
2395
+ "epoch": 0.3906865188588602,
2396
+ "grad_norm": 165.375,
2397
+ "learning_rate": 6.770298047276464e-06,
2398
+ "loss": 71.9873,
2399
+ "step": 1690
2400
+ },
2401
+ {
2402
+ "epoch": 0.3918423961335905,
2403
+ "grad_norm": 161.25,
2404
+ "learning_rate": 6.757451181911614e-06,
2405
+ "loss": 72.0379,
2406
+ "step": 1695
2407
+ },
2408
+ {
2409
+ "epoch": 0.3929982734083209,
2410
+ "grad_norm": 175.125,
2411
+ "learning_rate": 6.744604316546764e-06,
2412
+ "loss": 72.1231,
2413
+ "step": 1700
2414
+ },
2415
+ {
2416
+ "epoch": 0.3941541506830512,
2417
+ "grad_norm": 177.375,
2418
+ "learning_rate": 6.731757451181912e-06,
2419
+ "loss": 70.1524,
2420
+ "step": 1705
2421
+ },
2422
+ {
2423
+ "epoch": 0.3953100279577816,
2424
+ "grad_norm": 167.75,
2425
+ "learning_rate": 6.7189105858170614e-06,
2426
+ "loss": 71.1926,
2427
+ "step": 1710
2428
+ },
2429
+ {
2430
+ "epoch": 0.39646590523251196,
2431
+ "grad_norm": 148.125,
2432
+ "learning_rate": 6.70606372045221e-06,
2433
+ "loss": 72.7071,
2434
+ "step": 1715
2435
+ },
2436
+ {
2437
+ "epoch": 0.3976217825072423,
2438
+ "grad_norm": 162.875,
2439
+ "learning_rate": 6.693216855087359e-06,
2440
+ "loss": 71.857,
2441
+ "step": 1720
2442
+ },
2443
+ {
2444
+ "epoch": 0.39877765978197266,
2445
+ "grad_norm": 198.625,
2446
+ "learning_rate": 6.680369989722508e-06,
2447
+ "loss": 72.0683,
2448
+ "step": 1725
2449
+ },
2450
+ {
2451
+ "epoch": 0.399933537056703,
2452
+ "grad_norm": 155.625,
2453
+ "learning_rate": 6.667523124357658e-06,
2454
+ "loss": 70.8161,
2455
+ "step": 1730
2456
+ },
2457
+ {
2458
+ "epoch": 0.40108941433143336,
2459
+ "grad_norm": 164.0,
2460
+ "learning_rate": 6.654676258992806e-06,
2461
+ "loss": 70.9434,
2462
+ "step": 1735
2463
+ },
2464
+ {
2465
+ "epoch": 0.40224529160616374,
2466
+ "grad_norm": 159.375,
2467
+ "learning_rate": 6.641829393627955e-06,
2468
+ "loss": 71.407,
2469
+ "step": 1740
2470
+ },
2471
+ {
2472
+ "epoch": 0.40340116888089406,
2473
+ "grad_norm": 161.875,
2474
+ "learning_rate": 6.628982528263105e-06,
2475
+ "loss": 71.7674,
2476
+ "step": 1745
2477
+ },
2478
+ {
2479
+ "epoch": 0.40455704615562443,
2480
+ "grad_norm": 162.625,
2481
+ "learning_rate": 6.616135662898253e-06,
2482
+ "loss": 72.8434,
2483
+ "step": 1750
2484
+ },
2485
+ {
2486
+ "epoch": 0.40571292343035475,
2487
+ "grad_norm": 205.75,
2488
+ "learning_rate": 6.603288797533403e-06,
2489
+ "loss": 71.8435,
2490
+ "step": 1755
2491
+ },
2492
+ {
2493
+ "epoch": 0.40686880070508513,
2494
+ "grad_norm": 149.625,
2495
+ "learning_rate": 6.590441932168551e-06,
2496
+ "loss": 71.5685,
2497
+ "step": 1760
2498
+ },
2499
+ {
2500
+ "epoch": 0.4080246779798155,
2501
+ "grad_norm": 144.625,
2502
+ "learning_rate": 6.5775950668037005e-06,
2503
+ "loss": 71.5258,
2504
+ "step": 1765
2505
+ },
2506
+ {
2507
+ "epoch": 0.40918055525454583,
2508
+ "grad_norm": 171.5,
2509
+ "learning_rate": 6.564748201438849e-06,
2510
+ "loss": 70.664,
2511
+ "step": 1770
2512
+ },
2513
+ {
2514
+ "epoch": 0.4103364325292762,
2515
+ "grad_norm": 177.875,
2516
+ "learning_rate": 6.551901336073998e-06,
2517
+ "loss": 71.853,
2518
+ "step": 1775
2519
+ },
2520
+ {
2521
+ "epoch": 0.4114923098040066,
2522
+ "grad_norm": 168.5,
2523
+ "learning_rate": 6.539054470709148e-06,
2524
+ "loss": 71.6207,
2525
+ "step": 1780
2526
+ },
2527
+ {
2528
+ "epoch": 0.4126481870787369,
2529
+ "grad_norm": 166.375,
2530
+ "learning_rate": 6.526207605344297e-06,
2531
+ "loss": 71.5577,
2532
+ "step": 1785
2533
+ },
2534
+ {
2535
+ "epoch": 0.4138040643534673,
2536
+ "grad_norm": 187.375,
2537
+ "learning_rate": 6.5133607399794455e-06,
2538
+ "loss": 71.5951,
2539
+ "step": 1790
2540
+ },
2541
+ {
2542
+ "epoch": 0.4149599416281976,
2543
+ "grad_norm": 167.625,
2544
+ "learning_rate": 6.500513874614594e-06,
2545
+ "loss": 72.7333,
2546
+ "step": 1795
2547
+ },
2548
+ {
2549
+ "epoch": 0.416115818902928,
2550
+ "grad_norm": 177.375,
2551
+ "learning_rate": 6.487667009249744e-06,
2552
+ "loss": 71.4623,
2553
+ "step": 1800
2554
+ },
2555
+ {
2556
+ "epoch": 0.41727169617765836,
2557
+ "grad_norm": 176.25,
2558
+ "learning_rate": 6.474820143884892e-06,
2559
+ "loss": 70.88,
2560
+ "step": 1805
2561
+ },
2562
+ {
2563
+ "epoch": 0.4184275734523887,
2564
+ "grad_norm": 178.25,
2565
+ "learning_rate": 6.461973278520042e-06,
2566
+ "loss": 70.7191,
2567
+ "step": 1810
2568
+ },
2569
+ {
2570
+ "epoch": 0.41958345072711906,
2571
+ "grad_norm": 190.75,
2572
+ "learning_rate": 6.4491264131551915e-06,
2573
+ "loss": 71.4613,
2574
+ "step": 1815
2575
+ },
2576
+ {
2577
+ "epoch": 0.4207393280018494,
2578
+ "grad_norm": 175.75,
2579
+ "learning_rate": 6.4362795477903395e-06,
2580
+ "loss": 71.9579,
2581
+ "step": 1820
2582
+ },
2583
+ {
2584
+ "epoch": 0.42189520527657975,
2585
+ "grad_norm": 159.625,
2586
+ "learning_rate": 6.423432682425489e-06,
2587
+ "loss": 70.9512,
2588
+ "step": 1825
2589
+ },
2590
+ {
2591
+ "epoch": 0.42305108255131013,
2592
+ "grad_norm": 171.0,
2593
+ "learning_rate": 6.410585817060637e-06,
2594
+ "loss": 71.0018,
2595
+ "step": 1830
2596
+ },
2597
+ {
2598
+ "epoch": 0.42420695982604045,
2599
+ "grad_norm": 168.625,
2600
+ "learning_rate": 6.397738951695787e-06,
2601
+ "loss": 69.9553,
2602
+ "step": 1835
2603
+ },
2604
+ {
2605
+ "epoch": 0.42536283710077083,
2606
+ "grad_norm": 161.875,
2607
+ "learning_rate": 6.384892086330936e-06,
2608
+ "loss": 71.8582,
2609
+ "step": 1840
2610
+ },
2611
+ {
2612
+ "epoch": 0.4265187143755012,
2613
+ "grad_norm": 159.375,
2614
+ "learning_rate": 6.3720452209660846e-06,
2615
+ "loss": 71.7597,
2616
+ "step": 1845
2617
+ },
2618
+ {
2619
+ "epoch": 0.42767459165023153,
2620
+ "grad_norm": 168.75,
2621
+ "learning_rate": 6.359198355601233e-06,
2622
+ "loss": 72.3329,
2623
+ "step": 1850
2624
+ },
2625
+ {
2626
+ "epoch": 0.4288304689249619,
2627
+ "grad_norm": 227.625,
2628
+ "learning_rate": 6.346351490236383e-06,
2629
+ "loss": 70.8971,
2630
+ "step": 1855
2631
+ },
2632
+ {
2633
+ "epoch": 0.4299863461996922,
2634
+ "grad_norm": 162.125,
2635
+ "learning_rate": 6.333504624871531e-06,
2636
+ "loss": 72.014,
2637
+ "step": 1860
2638
+ },
2639
+ {
2640
+ "epoch": 0.4311422234744226,
2641
+ "grad_norm": 167.375,
2642
+ "learning_rate": 6.320657759506681e-06,
2643
+ "loss": 70.4521,
2644
+ "step": 1865
2645
+ },
2646
+ {
2647
+ "epoch": 0.432298100749153,
2648
+ "grad_norm": 176.625,
2649
+ "learning_rate": 6.3078108941418305e-06,
2650
+ "loss": 71.6845,
2651
+ "step": 1870
2652
+ },
2653
+ {
2654
+ "epoch": 0.4334539780238833,
2655
+ "grad_norm": 160.5,
2656
+ "learning_rate": 6.2949640287769785e-06,
2657
+ "loss": 70.4146,
2658
+ "step": 1875
2659
+ },
2660
+ {
2661
+ "epoch": 0.4346098552986137,
2662
+ "grad_norm": 169.0,
2663
+ "learning_rate": 6.282117163412128e-06,
2664
+ "loss": 70.7766,
2665
+ "step": 1880
2666
+ },
2667
+ {
2668
+ "epoch": 0.435765732573344,
2669
+ "grad_norm": 166.125,
2670
+ "learning_rate": 6.269270298047278e-06,
2671
+ "loss": 70.6441,
2672
+ "step": 1885
2673
+ },
2674
+ {
2675
+ "epoch": 0.4369216098480744,
2676
+ "grad_norm": 169.875,
2677
+ "learning_rate": 6.256423432682426e-06,
2678
+ "loss": 70.6906,
2679
+ "step": 1890
2680
+ },
2681
+ {
2682
+ "epoch": 0.43807748712280475,
2683
+ "grad_norm": 189.0,
2684
+ "learning_rate": 6.2435765673175756e-06,
2685
+ "loss": 69.9695,
2686
+ "step": 1895
2687
+ },
2688
+ {
2689
+ "epoch": 0.4392333643975351,
2690
+ "grad_norm": 163.5,
2691
+ "learning_rate": 6.2307297019527236e-06,
2692
+ "loss": 72.2346,
2693
+ "step": 1900
2694
+ },
2695
+ {
2696
+ "epoch": 0.44038924167226545,
2697
+ "grad_norm": 168.25,
2698
+ "learning_rate": 6.217882836587873e-06,
2699
+ "loss": 70.2958,
2700
+ "step": 1905
2701
+ },
2702
+ {
2703
+ "epoch": 0.44154511894699583,
2704
+ "grad_norm": 187.875,
2705
+ "learning_rate": 6.205035971223022e-06,
2706
+ "loss": 69.8792,
2707
+ "step": 1910
2708
+ },
2709
+ {
2710
+ "epoch": 0.44270099622172615,
2711
+ "grad_norm": 149.625,
2712
+ "learning_rate": 6.192189105858171e-06,
2713
+ "loss": 72.0271,
2714
+ "step": 1915
2715
+ },
2716
+ {
2717
+ "epoch": 0.4438568734964565,
2718
+ "grad_norm": 160.625,
2719
+ "learning_rate": 6.17934224049332e-06,
2720
+ "loss": 71.8988,
2721
+ "step": 1920
2722
+ },
2723
+ {
2724
+ "epoch": 0.44501275077118685,
2725
+ "grad_norm": 186.875,
2726
+ "learning_rate": 6.1664953751284695e-06,
2727
+ "loss": 71.6762,
2728
+ "step": 1925
2729
+ },
2730
+ {
2731
+ "epoch": 0.4461686280459172,
2732
+ "grad_norm": 166.125,
2733
+ "learning_rate": 6.1536485097636175e-06,
2734
+ "loss": 70.4196,
2735
+ "step": 1930
2736
+ },
2737
+ {
2738
+ "epoch": 0.4473245053206476,
2739
+ "grad_norm": 169.625,
2740
+ "learning_rate": 6.140801644398767e-06,
2741
+ "loss": 70.9007,
2742
+ "step": 1935
2743
+ },
2744
+ {
2745
+ "epoch": 0.4484803825953779,
2746
+ "grad_norm": 166.25,
2747
+ "learning_rate": 6.127954779033917e-06,
2748
+ "loss": 69.9268,
2749
+ "step": 1940
2750
+ },
2751
+ {
2752
+ "epoch": 0.4496362598701083,
2753
+ "grad_norm": 151.625,
2754
+ "learning_rate": 6.115107913669065e-06,
2755
+ "loss": 71.6101,
2756
+ "step": 1945
2757
+ },
2758
+ {
2759
+ "epoch": 0.4507921371448386,
2760
+ "grad_norm": 169.125,
2761
+ "learning_rate": 6.102261048304215e-06,
2762
+ "loss": 72.2735,
2763
+ "step": 1950
2764
+ },
2765
+ {
2766
+ "epoch": 0.451948014419569,
2767
+ "grad_norm": 161.375,
2768
+ "learning_rate": 6.0894141829393634e-06,
2769
+ "loss": 71.2896,
2770
+ "step": 1955
2771
+ },
2772
+ {
2773
+ "epoch": 0.4531038916942994,
2774
+ "grad_norm": 174.875,
2775
+ "learning_rate": 6.076567317574512e-06,
2776
+ "loss": 71.3177,
2777
+ "step": 1960
2778
+ },
2779
+ {
2780
+ "epoch": 0.4542597689690297,
2781
+ "grad_norm": 170.5,
2782
+ "learning_rate": 6.063720452209661e-06,
2783
+ "loss": 70.5251,
2784
+ "step": 1965
2785
+ },
2786
+ {
2787
+ "epoch": 0.4554156462437601,
2788
+ "grad_norm": 156.875,
2789
+ "learning_rate": 6.05087358684481e-06,
2790
+ "loss": 70.9012,
2791
+ "step": 1970
2792
+ },
2793
+ {
2794
+ "epoch": 0.45657152351849045,
2795
+ "grad_norm": 156.0,
2796
+ "learning_rate": 6.03802672147996e-06,
2797
+ "loss": 70.7252,
2798
+ "step": 1975
2799
+ },
2800
+ {
2801
+ "epoch": 0.4577274007932208,
2802
+ "grad_norm": 152.75,
2803
+ "learning_rate": 6.0251798561151085e-06,
2804
+ "loss": 70.9783,
2805
+ "step": 1980
2806
+ },
2807
+ {
2808
+ "epoch": 0.45888327806795115,
2809
+ "grad_norm": 159.5,
2810
+ "learning_rate": 6.012332990750257e-06,
2811
+ "loss": 70.2681,
2812
+ "step": 1985
2813
+ },
2814
+ {
2815
+ "epoch": 0.46003915534268147,
2816
+ "grad_norm": 178.0,
2817
+ "learning_rate": 5.999486125385406e-06,
2818
+ "loss": 70.291,
2819
+ "step": 1990
2820
+ },
2821
+ {
2822
+ "epoch": 0.46119503261741185,
2823
+ "grad_norm": 176.875,
2824
+ "learning_rate": 5.986639260020556e-06,
2825
+ "loss": 70.9205,
2826
+ "step": 1995
2827
+ },
2828
+ {
2829
+ "epoch": 0.4623509098921422,
2830
+ "grad_norm": 188.75,
2831
+ "learning_rate": 5.973792394655704e-06,
2832
+ "loss": 72.2977,
2833
+ "step": 2000
2834
+ },
2835
+ {
2836
+ "epoch": 0.4623509098921422,
2837
+ "eval_loss": NaN,
2838
+ "eval_runtime": 382.3957,
2839
+ "eval_samples_per_second": 609.667,
2840
+ "eval_steps_per_second": 38.105,
2841
+ "step": 2000
2842
  }
2843
  ],
2844
  "logging_steps": 5,
 
2858
  "attributes": {}
2859
  }
2860
  },
2861
+ "total_flos": 8.664715985577574e+18,
2862
  "train_batch_size": 4,
2863
  "trial_name": null,
2864
  "trial_params": null