ZurabDz commited on
Commit
686b6d2
·
verified ·
1 Parent(s): 53c2225

Training in progress, step 8000

Browse files
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:94cc0b49081e080919d2be8f5e4a517f58ccdc361d43f153ac49ba5755081b84
3
  size 44644496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:adf638b849568db5d9d7a536bdb1edafa1e8d90c38459b1d7b3a6552d6da7ad0
3
  size 44644496
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d6ad718095808c9c5f6a957cf2ca65f59b9ed75b8a432e2fa2fb65c556f2d360
3
  size 11230198
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abda9e7a12534ef2affd8d0c860673e26661a5152bce292672896e64d2a0cdaf
3
  size 11230198
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d138cfe3a4adf21f048848ee35837c9a757a0a3616ff7adbb45b69aac247435
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b3ee827a7a00012c0a116546df467feee35e70376d81a7a85b1a70eb90414d3
3
  size 14244
runs/Jun07_12-33-16_DESKTOP-69FPKCK/events.out.tfevents.1717788805.DESKTOP-69FPKCK CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eff3fe11f185c6d041edc58c2d8b069ad576bfecba786d61fc1bac322d07a7f6
3
- size 88649
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a3c8d5d8b582a0b0b9fd14feef2ad6f7e897a52df0c558e8aa72703bc07e62b
3
+ size 89915
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4aead1720be621f91a94a09ab21ddeec8c9e93c1a9e50cc6992710fbf1fedb49
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b35aaeefae1777c5b0cc2a6a699a6e86dbf10049e0c78d4a59c18dcf3571dfd
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.02714698736307738,
5
  "eval_steps": 2000,
6
- "global_step": 6000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2134,6 +2134,715 @@
2134
  "eval_samples_per_second": 2784.158,
2135
  "eval_steps_per_second": 10.878,
2136
  "step": 6000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2137
  }
2138
  ],
2139
  "logging_steps": 20,
@@ -2141,7 +2850,7 @@
2141
  "num_input_tokens_seen": 0,
2142
  "num_train_epochs": 3,
2143
  "save_steps": 100,
2144
- "total_flos": 2157543161856000.0,
2145
  "train_batch_size": 256,
2146
  "trial_name": null,
2147
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.03619598315076984,
5
  "eval_steps": 2000,
6
+ "global_step": 8000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2134
  "eval_samples_per_second": 2784.158,
2135
  "eval_steps_per_second": 10.878,
2136
  "step": 6000
2137
+ },
2138
+ {
2139
+ "epoch": 0.027237477320954308,
2140
+ "grad_norm": 7.100822925567627,
2141
+ "learning_rate": 8.167134196000362e-05,
2142
+ "loss": 8.4131,
2143
+ "step": 6020
2144
+ },
2145
+ {
2146
+ "epoch": 0.02732796727883123,
2147
+ "grad_norm": 8.460954666137695,
2148
+ "learning_rate": 8.194281060537508e-05,
2149
+ "loss": 8.4087,
2150
+ "step": 6040
2151
+ },
2152
+ {
2153
+ "epoch": 0.027418457236708157,
2154
+ "grad_norm": 7.642125129699707,
2155
+ "learning_rate": 8.221427925074653e-05,
2156
+ "loss": 8.3806,
2157
+ "step": 6060
2158
+ },
2159
+ {
2160
+ "epoch": 0.02750894719458508,
2161
+ "grad_norm": 8.104974746704102,
2162
+ "learning_rate": 8.2485747896118e-05,
2163
+ "loss": 8.404,
2164
+ "step": 6080
2165
+ },
2166
+ {
2167
+ "epoch": 0.027599437152462006,
2168
+ "grad_norm": 8.082459449768066,
2169
+ "learning_rate": 8.275721654148946e-05,
2170
+ "loss": 8.3865,
2171
+ "step": 6100
2172
+ },
2173
+ {
2174
+ "epoch": 0.02768992711033893,
2175
+ "grad_norm": 8.786911010742188,
2176
+ "learning_rate": 8.302868518686092e-05,
2177
+ "loss": 8.3475,
2178
+ "step": 6120
2179
+ },
2180
+ {
2181
+ "epoch": 0.027780417068215855,
2182
+ "grad_norm": 7.780808925628662,
2183
+ "learning_rate": 8.330015383223237e-05,
2184
+ "loss": 8.3798,
2185
+ "step": 6140
2186
+ },
2187
+ {
2188
+ "epoch": 0.02787090702609278,
2189
+ "grad_norm": 10.508188247680664,
2190
+ "learning_rate": 8.357162247760384e-05,
2191
+ "loss": 8.3718,
2192
+ "step": 6160
2193
+ },
2194
+ {
2195
+ "epoch": 0.027961396983969704,
2196
+ "grad_norm": 9.833992004394531,
2197
+ "learning_rate": 8.38430911229753e-05,
2198
+ "loss": 8.3952,
2199
+ "step": 6180
2200
+ },
2201
+ {
2202
+ "epoch": 0.02805188694184663,
2203
+ "grad_norm": 9.917244911193848,
2204
+ "learning_rate": 8.411455976834675e-05,
2205
+ "loss": 8.3828,
2206
+ "step": 6200
2207
+ },
2208
+ {
2209
+ "epoch": 0.028142376899723553,
2210
+ "grad_norm": 8.893899917602539,
2211
+ "learning_rate": 8.438602841371821e-05,
2212
+ "loss": 8.3853,
2213
+ "step": 6220
2214
+ },
2215
+ {
2216
+ "epoch": 0.02823286685760048,
2217
+ "grad_norm": 8.206876754760742,
2218
+ "learning_rate": 8.465749705908967e-05,
2219
+ "loss": 8.3686,
2220
+ "step": 6240
2221
+ },
2222
+ {
2223
+ "epoch": 0.028323356815477402,
2224
+ "grad_norm": 6.771660327911377,
2225
+ "learning_rate": 8.492896570446114e-05,
2226
+ "loss": 8.3699,
2227
+ "step": 6260
2228
+ },
2229
+ {
2230
+ "epoch": 0.028413846773354328,
2231
+ "grad_norm": 8.602880477905273,
2232
+ "learning_rate": 8.52004343498326e-05,
2233
+ "loss": 8.3388,
2234
+ "step": 6280
2235
+ },
2236
+ {
2237
+ "epoch": 0.02850433673123125,
2238
+ "grad_norm": 12.602445602416992,
2239
+ "learning_rate": 8.547190299520405e-05,
2240
+ "loss": 8.3127,
2241
+ "step": 6300
2242
+ },
2243
+ {
2244
+ "epoch": 0.028594826689108177,
2245
+ "grad_norm": 6.581843852996826,
2246
+ "learning_rate": 8.57433716405755e-05,
2247
+ "loss": 8.3345,
2248
+ "step": 6320
2249
+ },
2250
+ {
2251
+ "epoch": 0.0286853166469851,
2252
+ "grad_norm": 11.11732292175293,
2253
+ "learning_rate": 8.601484028594696e-05,
2254
+ "loss": 8.3442,
2255
+ "step": 6340
2256
+ },
2257
+ {
2258
+ "epoch": 0.028775806604862026,
2259
+ "grad_norm": 7.795157432556152,
2260
+ "learning_rate": 8.628630893131843e-05,
2261
+ "loss": 8.3477,
2262
+ "step": 6360
2263
+ },
2264
+ {
2265
+ "epoch": 0.02886629656273895,
2266
+ "grad_norm": 7.013496398925781,
2267
+ "learning_rate": 8.655777757668989e-05,
2268
+ "loss": 8.3444,
2269
+ "step": 6380
2270
+ },
2271
+ {
2272
+ "epoch": 0.028956786520615875,
2273
+ "grad_norm": 7.039948463439941,
2274
+ "learning_rate": 8.682924622206135e-05,
2275
+ "loss": 8.3242,
2276
+ "step": 6400
2277
+ },
2278
+ {
2279
+ "epoch": 0.029047276478492798,
2280
+ "grad_norm": 9.261716842651367,
2281
+ "learning_rate": 8.710071486743282e-05,
2282
+ "loss": 8.3209,
2283
+ "step": 6420
2284
+ },
2285
+ {
2286
+ "epoch": 0.029137766436369724,
2287
+ "grad_norm": 7.255875587463379,
2288
+ "learning_rate": 8.737218351280428e-05,
2289
+ "loss": 8.304,
2290
+ "step": 6440
2291
+ },
2292
+ {
2293
+ "epoch": 0.029228256394246647,
2294
+ "grad_norm": 7.955538749694824,
2295
+ "learning_rate": 8.764365215817573e-05,
2296
+ "loss": 8.2953,
2297
+ "step": 6460
2298
+ },
2299
+ {
2300
+ "epoch": 0.029318746352123573,
2301
+ "grad_norm": 9.364811897277832,
2302
+ "learning_rate": 8.791512080354718e-05,
2303
+ "loss": 8.2936,
2304
+ "step": 6480
2305
+ },
2306
+ {
2307
+ "epoch": 0.0294092363100005,
2308
+ "grad_norm": 9.385396957397461,
2309
+ "learning_rate": 8.818658944891864e-05,
2310
+ "loss": 8.3276,
2311
+ "step": 6500
2312
+ },
2313
+ {
2314
+ "epoch": 0.029499726267877422,
2315
+ "grad_norm": 8.448295593261719,
2316
+ "learning_rate": 8.84580580942901e-05,
2317
+ "loss": 8.2975,
2318
+ "step": 6520
2319
+ },
2320
+ {
2321
+ "epoch": 0.02959021622575435,
2322
+ "grad_norm": 9.282604217529297,
2323
+ "learning_rate": 8.872952673966157e-05,
2324
+ "loss": 8.3217,
2325
+ "step": 6540
2326
+ },
2327
+ {
2328
+ "epoch": 0.02968070618363127,
2329
+ "grad_norm": 7.898446559906006,
2330
+ "learning_rate": 8.900099538503303e-05,
2331
+ "loss": 8.3006,
2332
+ "step": 6560
2333
+ },
2334
+ {
2335
+ "epoch": 0.029771196141508197,
2336
+ "grad_norm": 9.186493873596191,
2337
+ "learning_rate": 8.927246403040448e-05,
2338
+ "loss": 8.2981,
2339
+ "step": 6580
2340
+ },
2341
+ {
2342
+ "epoch": 0.02986168609938512,
2343
+ "grad_norm": 9.346575736999512,
2344
+ "learning_rate": 8.954393267577595e-05,
2345
+ "loss": 8.2883,
2346
+ "step": 6600
2347
+ },
2348
+ {
2349
+ "epoch": 0.029952176057262046,
2350
+ "grad_norm": 6.458785057067871,
2351
+ "learning_rate": 8.981540132114741e-05,
2352
+ "loss": 8.2966,
2353
+ "step": 6620
2354
+ },
2355
+ {
2356
+ "epoch": 0.03004266601513897,
2357
+ "grad_norm": 8.704976081848145,
2358
+ "learning_rate": 9.008686996651886e-05,
2359
+ "loss": 8.2986,
2360
+ "step": 6640
2361
+ },
2362
+ {
2363
+ "epoch": 0.030133155973015895,
2364
+ "grad_norm": 7.744259357452393,
2365
+ "learning_rate": 9.035833861189032e-05,
2366
+ "loss": 8.2868,
2367
+ "step": 6660
2368
+ },
2369
+ {
2370
+ "epoch": 0.030223645930892818,
2371
+ "grad_norm": 8.345844268798828,
2372
+ "learning_rate": 9.062980725726179e-05,
2373
+ "loss": 8.2931,
2374
+ "step": 6680
2375
+ },
2376
+ {
2377
+ "epoch": 0.030314135888769744,
2378
+ "grad_norm": 7.604759216308594,
2379
+ "learning_rate": 9.090127590263323e-05,
2380
+ "loss": 8.2847,
2381
+ "step": 6700
2382
+ },
2383
+ {
2384
+ "epoch": 0.030404625846646667,
2385
+ "grad_norm": 10.3920259475708,
2386
+ "learning_rate": 9.11727445480047e-05,
2387
+ "loss": 8.273,
2388
+ "step": 6720
2389
+ },
2390
+ {
2391
+ "epoch": 0.030495115804523593,
2392
+ "grad_norm": 7.095389366149902,
2393
+ "learning_rate": 9.144421319337616e-05,
2394
+ "loss": 8.2768,
2395
+ "step": 6740
2396
+ },
2397
+ {
2398
+ "epoch": 0.030585605762400516,
2399
+ "grad_norm": 7.211811542510986,
2400
+ "learning_rate": 9.171568183874762e-05,
2401
+ "loss": 8.2918,
2402
+ "step": 6760
2403
+ },
2404
+ {
2405
+ "epoch": 0.030676095720277442,
2406
+ "grad_norm": 8.639713287353516,
2407
+ "learning_rate": 9.198715048411909e-05,
2408
+ "loss": 8.2845,
2409
+ "step": 6780
2410
+ },
2411
+ {
2412
+ "epoch": 0.03076658567815437,
2413
+ "grad_norm": 7.687414169311523,
2414
+ "learning_rate": 9.225861912949055e-05,
2415
+ "loss": 8.2992,
2416
+ "step": 6800
2417
+ },
2418
+ {
2419
+ "epoch": 0.03085707563603129,
2420
+ "grad_norm": 8.479426383972168,
2421
+ "learning_rate": 9.2530087774862e-05,
2422
+ "loss": 8.2848,
2423
+ "step": 6820
2424
+ },
2425
+ {
2426
+ "epoch": 0.030947565593908218,
2427
+ "grad_norm": 8.185149192810059,
2428
+ "learning_rate": 9.280155642023345e-05,
2429
+ "loss": 8.3037,
2430
+ "step": 6840
2431
+ },
2432
+ {
2433
+ "epoch": 0.03103805555178514,
2434
+ "grad_norm": 8.295937538146973,
2435
+ "learning_rate": 9.307302506560491e-05,
2436
+ "loss": 8.3179,
2437
+ "step": 6860
2438
+ },
2439
+ {
2440
+ "epoch": 0.031128545509662067,
2441
+ "grad_norm": 10.772727012634277,
2442
+ "learning_rate": 9.334449371097638e-05,
2443
+ "loss": 8.264,
2444
+ "step": 6880
2445
+ },
2446
+ {
2447
+ "epoch": 0.03121903546753899,
2448
+ "grad_norm": 8.465076446533203,
2449
+ "learning_rate": 9.361596235634784e-05,
2450
+ "loss": 8.2303,
2451
+ "step": 6900
2452
+ },
2453
+ {
2454
+ "epoch": 0.031309525425415916,
2455
+ "grad_norm": 9.096773147583008,
2456
+ "learning_rate": 9.38874310017193e-05,
2457
+ "loss": 8.2473,
2458
+ "step": 6920
2459
+ },
2460
+ {
2461
+ "epoch": 0.03140001538329284,
2462
+ "grad_norm": 10.57555866241455,
2463
+ "learning_rate": 9.415889964709077e-05,
2464
+ "loss": 8.27,
2465
+ "step": 6940
2466
+ },
2467
+ {
2468
+ "epoch": 0.03149050534116976,
2469
+ "grad_norm": 7.5089850425720215,
2470
+ "learning_rate": 9.443036829246222e-05,
2471
+ "loss": 8.27,
2472
+ "step": 6960
2473
+ },
2474
+ {
2475
+ "epoch": 0.03158099529904669,
2476
+ "grad_norm": 10.865699768066406,
2477
+ "learning_rate": 9.470183693783368e-05,
2478
+ "loss": 8.2451,
2479
+ "step": 6980
2480
+ },
2481
+ {
2482
+ "epoch": 0.031671485256923614,
2483
+ "grad_norm": 12.514881134033203,
2484
+ "learning_rate": 9.497330558320513e-05,
2485
+ "loss": 8.259,
2486
+ "step": 7000
2487
+ },
2488
+ {
2489
+ "epoch": 0.031761975214800536,
2490
+ "grad_norm": 9.914373397827148,
2491
+ "learning_rate": 9.524477422857659e-05,
2492
+ "loss": 8.2727,
2493
+ "step": 7020
2494
+ },
2495
+ {
2496
+ "epoch": 0.03185246517267746,
2497
+ "grad_norm": 7.3313984870910645,
2498
+ "learning_rate": 9.551624287394806e-05,
2499
+ "loss": 8.2421,
2500
+ "step": 7040
2501
+ },
2502
+ {
2503
+ "epoch": 0.03194295513055439,
2504
+ "grad_norm": 5.989616394042969,
2505
+ "learning_rate": 9.578771151931952e-05,
2506
+ "loss": 8.2363,
2507
+ "step": 7060
2508
+ },
2509
+ {
2510
+ "epoch": 0.03203344508843131,
2511
+ "grad_norm": 7.4773430824279785,
2512
+ "learning_rate": 9.605918016469098e-05,
2513
+ "loss": 8.2718,
2514
+ "step": 7080
2515
+ },
2516
+ {
2517
+ "epoch": 0.032123935046308234,
2518
+ "grad_norm": 6.605820655822754,
2519
+ "learning_rate": 9.633064881006243e-05,
2520
+ "loss": 8.257,
2521
+ "step": 7100
2522
+ },
2523
+ {
2524
+ "epoch": 0.03221442500418516,
2525
+ "grad_norm": 8.294914245605469,
2526
+ "learning_rate": 9.658854402316532e-05,
2527
+ "loss": 8.2478,
2528
+ "step": 7120
2529
+ },
2530
+ {
2531
+ "epoch": 0.03230491496206209,
2532
+ "grad_norm": 10.011855125427246,
2533
+ "learning_rate": 9.686001266853678e-05,
2534
+ "loss": 8.2525,
2535
+ "step": 7140
2536
+ },
2537
+ {
2538
+ "epoch": 0.03239540491993901,
2539
+ "grad_norm": 7.529365062713623,
2540
+ "learning_rate": 9.713148131390823e-05,
2541
+ "loss": 8.2728,
2542
+ "step": 7160
2543
+ },
2544
+ {
2545
+ "epoch": 0.03248589487781593,
2546
+ "grad_norm": 8.781538009643555,
2547
+ "learning_rate": 9.74029499592797e-05,
2548
+ "loss": 8.2305,
2549
+ "step": 7180
2550
+ },
2551
+ {
2552
+ "epoch": 0.03257638483569286,
2553
+ "grad_norm": 12.758204460144043,
2554
+ "learning_rate": 9.767441860465116e-05,
2555
+ "loss": 8.2382,
2556
+ "step": 7200
2557
+ },
2558
+ {
2559
+ "epoch": 0.032666874793569785,
2560
+ "grad_norm": 10.523704528808594,
2561
+ "learning_rate": 9.794588725002262e-05,
2562
+ "loss": 8.2364,
2563
+ "step": 7220
2564
+ },
2565
+ {
2566
+ "epoch": 0.03275736475144671,
2567
+ "grad_norm": 6.50457239151001,
2568
+ "learning_rate": 9.821735589539409e-05,
2569
+ "loss": 8.2384,
2570
+ "step": 7240
2571
+ },
2572
+ {
2573
+ "epoch": 0.03284785470932363,
2574
+ "grad_norm": 9.191271781921387,
2575
+ "learning_rate": 9.848882454076555e-05,
2576
+ "loss": 8.2148,
2577
+ "step": 7260
2578
+ },
2579
+ {
2580
+ "epoch": 0.03293834466720056,
2581
+ "grad_norm": 8.93270206451416,
2582
+ "learning_rate": 9.8760293186137e-05,
2583
+ "loss": 8.2352,
2584
+ "step": 7280
2585
+ },
2586
+ {
2587
+ "epoch": 0.03302883462507748,
2588
+ "grad_norm": 9.895100593566895,
2589
+ "learning_rate": 9.903176183150845e-05,
2590
+ "loss": 8.2376,
2591
+ "step": 7300
2592
+ },
2593
+ {
2594
+ "epoch": 0.033119324582954406,
2595
+ "grad_norm": 10.420171737670898,
2596
+ "learning_rate": 9.930323047687991e-05,
2597
+ "loss": 8.2479,
2598
+ "step": 7320
2599
+ },
2600
+ {
2601
+ "epoch": 0.03320981454083133,
2602
+ "grad_norm": 9.649170875549316,
2603
+ "learning_rate": 9.957469912225138e-05,
2604
+ "loss": 8.2557,
2605
+ "step": 7340
2606
+ },
2607
+ {
2608
+ "epoch": 0.03330030449870826,
2609
+ "grad_norm": 7.854948043823242,
2610
+ "learning_rate": 9.984616776762284e-05,
2611
+ "loss": 8.2145,
2612
+ "step": 7360
2613
+ },
2614
+ {
2615
+ "epoch": 0.03339079445658518,
2616
+ "grad_norm": 8.486404418945312,
2617
+ "learning_rate": 0.0001001176364129943,
2618
+ "loss": 8.2132,
2619
+ "step": 7380
2620
+ },
2621
+ {
2622
+ "epoch": 0.033481284414462104,
2623
+ "grad_norm": 11.286945343017578,
2624
+ "learning_rate": 0.00010038910505836577,
2625
+ "loss": 8.2169,
2626
+ "step": 7400
2627
+ },
2628
+ {
2629
+ "epoch": 0.033571774372339026,
2630
+ "grad_norm": 6.662302494049072,
2631
+ "learning_rate": 0.00010066057370373721,
2632
+ "loss": 8.2318,
2633
+ "step": 7420
2634
+ },
2635
+ {
2636
+ "epoch": 0.033662264330215956,
2637
+ "grad_norm": 10.467026710510254,
2638
+ "learning_rate": 0.00010093204234910868,
2639
+ "loss": 8.2089,
2640
+ "step": 7440
2641
+ },
2642
+ {
2643
+ "epoch": 0.03375275428809288,
2644
+ "grad_norm": 12.113288879394531,
2645
+ "learning_rate": 0.00010120351099448013,
2646
+ "loss": 8.2194,
2647
+ "step": 7460
2648
+ },
2649
+ {
2650
+ "epoch": 0.0338432442459698,
2651
+ "grad_norm": 13.295260429382324,
2652
+ "learning_rate": 0.00010147497963985159,
2653
+ "loss": 8.2526,
2654
+ "step": 7480
2655
+ },
2656
+ {
2657
+ "epoch": 0.03393373420384673,
2658
+ "grad_norm": 9.79587173461914,
2659
+ "learning_rate": 0.00010174644828522305,
2660
+ "loss": 8.2253,
2661
+ "step": 7500
2662
+ },
2663
+ {
2664
+ "epoch": 0.034024224161723654,
2665
+ "grad_norm": 10.251439094543457,
2666
+ "learning_rate": 0.00010201791693059452,
2667
+ "loss": 8.2248,
2668
+ "step": 7520
2669
+ },
2670
+ {
2671
+ "epoch": 0.03411471411960058,
2672
+ "grad_norm": 10.583033561706543,
2673
+ "learning_rate": 0.00010228938557596597,
2674
+ "loss": 8.211,
2675
+ "step": 7540
2676
+ },
2677
+ {
2678
+ "epoch": 0.0342052040774775,
2679
+ "grad_norm": 10.661384582519531,
2680
+ "learning_rate": 0.00010256085422133743,
2681
+ "loss": 8.2053,
2682
+ "step": 7560
2683
+ },
2684
+ {
2685
+ "epoch": 0.03429569403535443,
2686
+ "grad_norm": 8.133881568908691,
2687
+ "learning_rate": 0.0001028323228667089,
2688
+ "loss": 8.1948,
2689
+ "step": 7580
2690
+ },
2691
+ {
2692
+ "epoch": 0.03438618399323135,
2693
+ "grad_norm": 9.278162002563477,
2694
+ "learning_rate": 0.00010310379151208036,
2695
+ "loss": 8.2235,
2696
+ "step": 7600
2697
+ },
2698
+ {
2699
+ "epoch": 0.034476673951108275,
2700
+ "grad_norm": 10.354171752929688,
2701
+ "learning_rate": 0.00010337526015745181,
2702
+ "loss": 8.1704,
2703
+ "step": 7620
2704
+ },
2705
+ {
2706
+ "epoch": 0.0345671639089852,
2707
+ "grad_norm": 9.4600830078125,
2708
+ "learning_rate": 0.00010364672880282327,
2709
+ "loss": 8.2008,
2710
+ "step": 7640
2711
+ },
2712
+ {
2713
+ "epoch": 0.03465765386686213,
2714
+ "grad_norm": 10.290422439575195,
2715
+ "learning_rate": 0.00010391819744819473,
2716
+ "loss": 8.2084,
2717
+ "step": 7660
2718
+ },
2719
+ {
2720
+ "epoch": 0.03474814382473905,
2721
+ "grad_norm": 9.98493480682373,
2722
+ "learning_rate": 0.00010418966609356618,
2723
+ "loss": 8.1878,
2724
+ "step": 7680
2725
+ },
2726
+ {
2727
+ "epoch": 0.03483863378261597,
2728
+ "grad_norm": 8.021723747253418,
2729
+ "learning_rate": 0.00010446113473893765,
2730
+ "loss": 8.1865,
2731
+ "step": 7700
2732
+ },
2733
+ {
2734
+ "epoch": 0.034929123740492896,
2735
+ "grad_norm": 6.915677070617676,
2736
+ "learning_rate": 0.00010473260338430911,
2737
+ "loss": 8.1795,
2738
+ "step": 7720
2739
+ },
2740
+ {
2741
+ "epoch": 0.035019613698369825,
2742
+ "grad_norm": 9.64877986907959,
2743
+ "learning_rate": 0.00010500407202968057,
2744
+ "loss": 8.1756,
2745
+ "step": 7740
2746
+ },
2747
+ {
2748
+ "epoch": 0.03511010365624675,
2749
+ "grad_norm": 9.673460960388184,
2750
+ "learning_rate": 0.00010527554067505204,
2751
+ "loss": 8.1877,
2752
+ "step": 7760
2753
+ },
2754
+ {
2755
+ "epoch": 0.03520059361412367,
2756
+ "grad_norm": 10.429800033569336,
2757
+ "learning_rate": 0.0001055470093204235,
2758
+ "loss": 8.1803,
2759
+ "step": 7780
2760
+ },
2761
+ {
2762
+ "epoch": 0.0352910835720006,
2763
+ "grad_norm": 9.610269546508789,
2764
+ "learning_rate": 0.00010581847796579494,
2765
+ "loss": 8.214,
2766
+ "step": 7800
2767
+ },
2768
+ {
2769
+ "epoch": 0.03538157352987752,
2770
+ "grad_norm": 9.696439743041992,
2771
+ "learning_rate": 0.0001060899466111664,
2772
+ "loss": 8.1585,
2773
+ "step": 7820
2774
+ },
2775
+ {
2776
+ "epoch": 0.035472063487754446,
2777
+ "grad_norm": 10.302108764648438,
2778
+ "learning_rate": 0.00010636141525653786,
2779
+ "loss": 8.1495,
2780
+ "step": 7840
2781
+ },
2782
+ {
2783
+ "epoch": 0.03556255344563137,
2784
+ "grad_norm": 10.439906120300293,
2785
+ "learning_rate": 0.00010663288390190933,
2786
+ "loss": 8.1636,
2787
+ "step": 7860
2788
+ },
2789
+ {
2790
+ "epoch": 0.0356530434035083,
2791
+ "grad_norm": 13.941293716430664,
2792
+ "learning_rate": 0.00010690435254728079,
2793
+ "loss": 8.1674,
2794
+ "step": 7880
2795
+ },
2796
+ {
2797
+ "epoch": 0.03574353336138522,
2798
+ "grad_norm": 11.378789901733398,
2799
+ "learning_rate": 0.00010717582119265225,
2800
+ "loss": 8.1704,
2801
+ "step": 7900
2802
+ },
2803
+ {
2804
+ "epoch": 0.035834023319262144,
2805
+ "grad_norm": 10.802684783935547,
2806
+ "learning_rate": 0.00010744728983802372,
2807
+ "loss": 8.1902,
2808
+ "step": 7920
2809
+ },
2810
+ {
2811
+ "epoch": 0.03592451327713907,
2812
+ "grad_norm": 13.995284080505371,
2813
+ "learning_rate": 0.00010771875848339517,
2814
+ "loss": 8.1502,
2815
+ "step": 7940
2816
+ },
2817
+ {
2818
+ "epoch": 0.036015003235016,
2819
+ "grad_norm": 11.473008155822754,
2820
+ "learning_rate": 0.00010799022712876663,
2821
+ "loss": 8.2082,
2822
+ "step": 7960
2823
+ },
2824
+ {
2825
+ "epoch": 0.03610549319289292,
2826
+ "grad_norm": 9.314510345458984,
2827
+ "learning_rate": 0.00010826169577413808,
2828
+ "loss": 8.19,
2829
+ "step": 7980
2830
+ },
2831
+ {
2832
+ "epoch": 0.03619598315076984,
2833
+ "grad_norm": 11.141118049621582,
2834
+ "learning_rate": 0.00010853316441950954,
2835
+ "loss": 8.2093,
2836
+ "step": 8000
2837
+ },
2838
+ {
2839
+ "epoch": 0.03619598315076984,
2840
+ "eval_accuracy": 0.11013720949528932,
2841
+ "eval_loss": 8.173333168029785,
2842
+ "eval_runtime": 219.4541,
2843
+ "eval_samples_per_second": 2769.782,
2844
+ "eval_steps_per_second": 10.822,
2845
+ "step": 8000
2846
  }
2847
  ],
2848
  "logging_steps": 20,
 
2850
  "num_input_tokens_seen": 0,
2851
  "num_train_epochs": 3,
2852
  "save_steps": 100,
2853
+ "total_flos": 2876724215808000.0,
2854
  "train_batch_size": 256,
2855
  "trial_name": null,
2856
  "trial_params": null