CocoRoF committed on
Commit
0c5330b
·
verified ·
1 Parent(s): d8ea414

Training in progress, step 3000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:52bd40e312864d51c8095645c37fe8af576516dc9f393de626a8443d07a0adff
3
  size 737580392
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0336b9d4a5405b35eb41810e914f8235995602c3b470eb98cb5172e5614a1617
3
  size 737580392
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5d3cdcded26c7cf7845b27eeae770160912071edb99917b5e8879b5d146204b1
3
  size 1475248442
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb14199c8ed85d5530890aaca81a88b88623101addc71c4dba17e1262410aecb
3
  size 1475248442
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1e59fd29507b0a3f94de60acf1485068cfbd28d4220459a98545dc01f241293d
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9819055317e0aa1215ad120239bc4cecc175225c0dc18c98ca0bffe9f465133f
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9aa7a9e3bf08de96609cd484c170a699c16678f8aed43f535a9ffcc2c3940322
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7947fe218b4344129921368e2448c6474704c87d577f328a448eabc5c93d4cc3
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.1715089034676662,
5
  "eval_steps": 100,
6
- "global_step": 2500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2157,6 +2157,436 @@
2157
  "eval_spearman_manhattan": 0.8206493433328421,
2158
  "eval_steps_per_second": 18.5,
2159
  "step": 2500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2160
  }
2161
  ],
2162
  "logging_steps": 10,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.4058106841611997,
5
  "eval_steps": 100,
6
+ "global_step": 3000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2157
  "eval_spearman_manhattan": 0.8206493433328421,
2158
  "eval_steps_per_second": 18.5,
2159
  "step": 2500
2160
+ },
2161
+ {
2162
+ "epoch": 1.176194939081537,
2163
+ "grad_norm": 1.3612301349639893,
2164
+ "learning_rate": 4.63243908153702e-05,
2165
+ "loss": 0.1851,
2166
+ "step": 2510
2167
+ },
2168
+ {
2169
+ "epoch": 1.1808809746954076,
2170
+ "grad_norm": 1.376557469367981,
2171
+ "learning_rate": 4.630974695407685e-05,
2172
+ "loss": 0.1848,
2173
+ "step": 2520
2174
+ },
2175
+ {
2176
+ "epoch": 1.1855670103092784,
2177
+ "grad_norm": 1.7185298204421997,
2178
+ "learning_rate": 4.6295103092783506e-05,
2179
+ "loss": 0.1797,
2180
+ "step": 2530
2181
+ },
2182
+ {
2183
+ "epoch": 1.190253045923149,
2184
+ "grad_norm": 1.4754388332366943,
2185
+ "learning_rate": 4.628045923149017e-05,
2186
+ "loss": 0.1684,
2187
+ "step": 2540
2188
+ },
2189
+ {
2190
+ "epoch": 1.1949390815370198,
2191
+ "grad_norm": 1.6029070615768433,
2192
+ "learning_rate": 4.6265815370196815e-05,
2193
+ "loss": 0.2004,
2194
+ "step": 2550
2195
+ },
2196
+ {
2197
+ "epoch": 1.1996251171508903,
2198
+ "grad_norm": 1.8501805067062378,
2199
+ "learning_rate": 4.625117150890347e-05,
2200
+ "loss": 0.1961,
2201
+ "step": 2560
2202
+ },
2203
+ {
2204
+ "epoch": 1.204311152764761,
2205
+ "grad_norm": 1.721009373664856,
2206
+ "learning_rate": 4.623652764761012e-05,
2207
+ "loss": 0.2011,
2208
+ "step": 2570
2209
+ },
2210
+ {
2211
+ "epoch": 1.2089971883786317,
2212
+ "grad_norm": 1.4581866264343262,
2213
+ "learning_rate": 4.622188378631678e-05,
2214
+ "loss": 0.1767,
2215
+ "step": 2580
2216
+ },
2217
+ {
2218
+ "epoch": 1.2136832239925024,
2219
+ "grad_norm": 1.7804685831069946,
2220
+ "learning_rate": 4.620723992502343e-05,
2221
+ "loss": 0.2001,
2222
+ "step": 2590
2223
+ },
2224
+ {
2225
+ "epoch": 1.218369259606373,
2226
+ "grad_norm": 1.4118379354476929,
2227
+ "learning_rate": 4.6192596063730086e-05,
2228
+ "loss": 0.1828,
2229
+ "step": 2600
2230
+ },
2231
+ {
2232
+ "epoch": 1.218369259606373,
2233
+ "eval_loss": 0.03557514026761055,
2234
+ "eval_pearson_cosine": 0.8245837692334135,
2235
+ "eval_pearson_dot": 0.7684124588445229,
2236
+ "eval_pearson_euclidean": 0.8153574244852146,
2237
+ "eval_pearson_manhattan": 0.8162493723270217,
2238
+ "eval_runtime": 5.8306,
2239
+ "eval_samples_per_second": 257.263,
2240
+ "eval_spearman_cosine": 0.8266121706817625,
2241
+ "eval_spearman_dot": 0.7673501753109182,
2242
+ "eval_spearman_euclidean": 0.8215235813615567,
2243
+ "eval_spearman_manhattan": 0.8216899493490951,
2244
+ "eval_steps_per_second": 16.122,
2245
+ "step": 2600
2246
+ },
2247
+ {
2248
+ "epoch": 1.2230552952202436,
2249
+ "grad_norm": 2.07022762298584,
2250
+ "learning_rate": 4.617795220243674e-05,
2251
+ "loss": 0.2239,
2252
+ "step": 2610
2253
+ },
2254
+ {
2255
+ "epoch": 1.2277413308341143,
2256
+ "grad_norm": 1.2374054193496704,
2257
+ "learning_rate": 4.6163308341143395e-05,
2258
+ "loss": 0.2037,
2259
+ "step": 2620
2260
+ },
2261
+ {
2262
+ "epoch": 1.232427366447985,
2263
+ "grad_norm": 1.5975677967071533,
2264
+ "learning_rate": 4.614866447985005e-05,
2265
+ "loss": 0.178,
2266
+ "step": 2630
2267
+ },
2268
+ {
2269
+ "epoch": 1.2371134020618557,
2270
+ "grad_norm": 1.7808302640914917,
2271
+ "learning_rate": 4.61340206185567e-05,
2272
+ "loss": 0.2349,
2273
+ "step": 2640
2274
+ },
2275
+ {
2276
+ "epoch": 1.2417994376757264,
2277
+ "grad_norm": 1.4256142377853394,
2278
+ "learning_rate": 4.611937675726336e-05,
2279
+ "loss": 0.1775,
2280
+ "step": 2650
2281
+ },
2282
+ {
2283
+ "epoch": 1.246485473289597,
2284
+ "grad_norm": 1.526659607887268,
2285
+ "learning_rate": 4.610473289597001e-05,
2286
+ "loss": 0.1929,
2287
+ "step": 2660
2288
+ },
2289
+ {
2290
+ "epoch": 1.2511715089034676,
2291
+ "grad_norm": 1.5862327814102173,
2292
+ "learning_rate": 4.6090089034676666e-05,
2293
+ "loss": 0.1936,
2294
+ "step": 2670
2295
+ },
2296
+ {
2297
+ "epoch": 1.2558575445173383,
2298
+ "grad_norm": 1.5287206172943115,
2299
+ "learning_rate": 4.607544517338332e-05,
2300
+ "loss": 0.2,
2301
+ "step": 2680
2302
+ },
2303
+ {
2304
+ "epoch": 1.260543580131209,
2305
+ "grad_norm": 1.8908789157867432,
2306
+ "learning_rate": 4.6060801312089974e-05,
2307
+ "loss": 0.181,
2308
+ "step": 2690
2309
+ },
2310
+ {
2311
+ "epoch": 1.2652296157450795,
2312
+ "grad_norm": 1.5627946853637695,
2313
+ "learning_rate": 4.604615745079663e-05,
2314
+ "loss": 0.2069,
2315
+ "step": 2700
2316
+ },
2317
+ {
2318
+ "epoch": 1.2652296157450795,
2319
+ "eval_loss": 0.036834895610809326,
2320
+ "eval_pearson_cosine": 0.81713630802534,
2321
+ "eval_pearson_dot": 0.7548895992723743,
2322
+ "eval_pearson_euclidean": 0.8115847405548919,
2323
+ "eval_pearson_manhattan": 0.8127917876351205,
2324
+ "eval_runtime": 4.9833,
2325
+ "eval_samples_per_second": 301.004,
2326
+ "eval_spearman_cosine": 0.8195765518778169,
2327
+ "eval_spearman_dot": 0.7543605112183749,
2328
+ "eval_spearman_euclidean": 0.8179210742039561,
2329
+ "eval_spearman_manhattan": 0.8186720969386952,
2330
+ "eval_steps_per_second": 18.863,
2331
+ "step": 2700
2332
+ },
2333
+ {
2334
+ "epoch": 1.2699156513589505,
2335
+ "grad_norm": 1.3397817611694336,
2336
+ "learning_rate": 4.603151358950328e-05,
2337
+ "loss": 0.1853,
2338
+ "step": 2710
2339
+ },
2340
+ {
2341
+ "epoch": 1.274601686972821,
2342
+ "grad_norm": 1.2182811498641968,
2343
+ "learning_rate": 4.601686972820994e-05,
2344
+ "loss": 0.1605,
2345
+ "step": 2720
2346
+ },
2347
+ {
2348
+ "epoch": 1.2792877225866917,
2349
+ "grad_norm": 1.3814051151275635,
2350
+ "learning_rate": 4.600222586691659e-05,
2351
+ "loss": 0.1891,
2352
+ "step": 2730
2353
+ },
2354
+ {
2355
+ "epoch": 1.2839737582005624,
2356
+ "grad_norm": 1.5402768850326538,
2357
+ "learning_rate": 4.5987582005623246e-05,
2358
+ "loss": 0.2003,
2359
+ "step": 2740
2360
+ },
2361
+ {
2362
+ "epoch": 1.2886597938144329,
2363
+ "grad_norm": 2.1603922843933105,
2364
+ "learning_rate": 4.59729381443299e-05,
2365
+ "loss": 0.2135,
2366
+ "step": 2750
2367
+ },
2368
+ {
2369
+ "epoch": 1.2933458294283038,
2370
+ "grad_norm": 1.5168280601501465,
2371
+ "learning_rate": 4.5958294283036554e-05,
2372
+ "loss": 0.2082,
2373
+ "step": 2760
2374
+ },
2375
+ {
2376
+ "epoch": 1.2980318650421743,
2377
+ "grad_norm": 1.4614285230636597,
2378
+ "learning_rate": 4.594365042174321e-05,
2379
+ "loss": 0.1666,
2380
+ "step": 2770
2381
+ },
2382
+ {
2383
+ "epoch": 1.302717900656045,
2384
+ "grad_norm": 1.7025336027145386,
2385
+ "learning_rate": 4.592900656044986e-05,
2386
+ "loss": 0.1968,
2387
+ "step": 2780
2388
+ },
2389
+ {
2390
+ "epoch": 1.3074039362699157,
2391
+ "grad_norm": 1.6786755323410034,
2392
+ "learning_rate": 4.591436269915652e-05,
2393
+ "loss": 0.1807,
2394
+ "step": 2790
2395
+ },
2396
+ {
2397
+ "epoch": 1.3120899718837864,
2398
+ "grad_norm": 1.3550318479537964,
2399
+ "learning_rate": 4.589971883786317e-05,
2400
+ "loss": 0.1957,
2401
+ "step": 2800
2402
+ },
2403
+ {
2404
+ "epoch": 1.3120899718837864,
2405
+ "eval_loss": 0.039760004729032516,
2406
+ "eval_pearson_cosine": 0.8185279801679428,
2407
+ "eval_pearson_dot": 0.747374497646847,
2408
+ "eval_pearson_euclidean": 0.816034100619575,
2409
+ "eval_pearson_manhattan": 0.8168064451822818,
2410
+ "eval_runtime": 6.0018,
2411
+ "eval_samples_per_second": 249.926,
2412
+ "eval_spearman_cosine": 0.8216325084204806,
2413
+ "eval_spearman_dot": 0.7458600705129738,
2414
+ "eval_spearman_euclidean": 0.8234026528716193,
2415
+ "eval_spearman_manhattan": 0.8240265815824191,
2416
+ "eval_steps_per_second": 15.662,
2417
+ "step": 2800
2418
+ },
2419
+ {
2420
+ "epoch": 1.316776007497657,
2421
+ "grad_norm": 1.5915249586105347,
2422
+ "learning_rate": 4.5885074976569826e-05,
2423
+ "loss": 0.1972,
2424
+ "step": 2810
2425
+ },
2426
+ {
2427
+ "epoch": 1.3214620431115276,
2428
+ "grad_norm": 1.377953290939331,
2429
+ "learning_rate": 4.587043111527648e-05,
2430
+ "loss": 0.178,
2431
+ "step": 2820
2432
+ },
2433
+ {
2434
+ "epoch": 1.3261480787253983,
2435
+ "grad_norm": 1.458254337310791,
2436
+ "learning_rate": 4.5855787253983134e-05,
2437
+ "loss": 0.1795,
2438
+ "step": 2830
2439
+ },
2440
+ {
2441
+ "epoch": 1.330834114339269,
2442
+ "grad_norm": 1.6020787954330444,
2443
+ "learning_rate": 4.584114339268979e-05,
2444
+ "loss": 0.1925,
2445
+ "step": 2840
2446
+ },
2447
+ {
2448
+ "epoch": 1.3355201499531397,
2449
+ "grad_norm": 1.5305688381195068,
2450
+ "learning_rate": 4.5826499531396436e-05,
2451
+ "loss": 0.1764,
2452
+ "step": 2850
2453
+ },
2454
+ {
2455
+ "epoch": 1.3402061855670104,
2456
+ "grad_norm": 1.6562731266021729,
2457
+ "learning_rate": 4.581185567010309e-05,
2458
+ "loss": 0.1838,
2459
+ "step": 2860
2460
+ },
2461
+ {
2462
+ "epoch": 1.344892221180881,
2463
+ "grad_norm": 1.2753900289535522,
2464
+ "learning_rate": 4.5797211808809745e-05,
2465
+ "loss": 0.1814,
2466
+ "step": 2870
2467
+ },
2468
+ {
2469
+ "epoch": 1.3495782567947516,
2470
+ "grad_norm": 1.702071189880371,
2471
+ "learning_rate": 4.5782567947516406e-05,
2472
+ "loss": 0.1946,
2473
+ "step": 2880
2474
+ },
2475
+ {
2476
+ "epoch": 1.3542642924086223,
2477
+ "grad_norm": 1.2827301025390625,
2478
+ "learning_rate": 4.576792408622306e-05,
2479
+ "loss": 0.1597,
2480
+ "step": 2890
2481
+ },
2482
+ {
2483
+ "epoch": 1.358950328022493,
2484
+ "grad_norm": 1.7702302932739258,
2485
+ "learning_rate": 4.5753280224929714e-05,
2486
+ "loss": 0.1917,
2487
+ "step": 2900
2488
+ },
2489
+ {
2490
+ "epoch": 1.358950328022493,
2491
+ "eval_loss": 0.03551251068711281,
2492
+ "eval_pearson_cosine": 0.8240420723863338,
2493
+ "eval_pearson_dot": 0.7591715678324817,
2494
+ "eval_pearson_euclidean": 0.8108149056355918,
2495
+ "eval_pearson_manhattan": 0.8124676242018225,
2496
+ "eval_runtime": 6.0763,
2497
+ "eval_samples_per_second": 246.859,
2498
+ "eval_spearman_cosine": 0.8255553040970393,
2499
+ "eval_spearman_dot": 0.7606600789247023,
2500
+ "eval_spearman_euclidean": 0.8186055770843518,
2501
+ "eval_spearman_manhattan": 0.8199282003068918,
2502
+ "eval_steps_per_second": 15.47,
2503
+ "step": 2900
2504
+ },
2505
+ {
2506
+ "epoch": 1.3636363636363638,
2507
+ "grad_norm": 1.4522427320480347,
2508
+ "learning_rate": 4.573863636363637e-05,
2509
+ "loss": 0.1994,
2510
+ "step": 2910
2511
+ },
2512
+ {
2513
+ "epoch": 1.3683223992502342,
2514
+ "grad_norm": 1.6747633218765259,
2515
+ "learning_rate": 4.572399250234302e-05,
2516
+ "loss": 0.1608,
2517
+ "step": 2920
2518
+ },
2519
+ {
2520
+ "epoch": 1.373008434864105,
2521
+ "grad_norm": 1.9356324672698975,
2522
+ "learning_rate": 4.570934864104968e-05,
2523
+ "loss": 0.2216,
2524
+ "step": 2930
2525
+ },
2526
+ {
2527
+ "epoch": 1.3776944704779757,
2528
+ "grad_norm": 1.3150924444198608,
2529
+ "learning_rate": 4.569470477975633e-05,
2530
+ "loss": 0.1755,
2531
+ "step": 2940
2532
+ },
2533
+ {
2534
+ "epoch": 1.3823805060918464,
2535
+ "grad_norm": 1.466535210609436,
2536
+ "learning_rate": 4.568006091846298e-05,
2537
+ "loss": 0.2097,
2538
+ "step": 2950
2539
+ },
2540
+ {
2541
+ "epoch": 1.387066541705717,
2542
+ "grad_norm": 1.5124356746673584,
2543
+ "learning_rate": 4.566541705716963e-05,
2544
+ "loss": 0.1874,
2545
+ "step": 2960
2546
+ },
2547
+ {
2548
+ "epoch": 1.3917525773195876,
2549
+ "grad_norm": 1.4657456874847412,
2550
+ "learning_rate": 4.565077319587629e-05,
2551
+ "loss": 0.1818,
2552
+ "step": 2970
2553
+ },
2554
+ {
2555
+ "epoch": 1.3964386129334583,
2556
+ "grad_norm": 1.0131442546844482,
2557
+ "learning_rate": 4.563612933458294e-05,
2558
+ "loss": 0.2018,
2559
+ "step": 2980
2560
+ },
2561
+ {
2562
+ "epoch": 1.401124648547329,
2563
+ "grad_norm": 1.6073145866394043,
2564
+ "learning_rate": 4.5621485473289596e-05,
2565
+ "loss": 0.1823,
2566
+ "step": 2990
2567
+ },
2568
+ {
2569
+ "epoch": 1.4058106841611997,
2570
+ "grad_norm": 1.0797163248062134,
2571
+ "learning_rate": 4.560684161199626e-05,
2572
+ "loss": 0.1944,
2573
+ "step": 3000
2574
+ },
2575
+ {
2576
+ "epoch": 1.4058106841611997,
2577
+ "eval_loss": 0.03550059348344803,
2578
+ "eval_pearson_cosine": 0.8271464556841153,
2579
+ "eval_pearson_dot": 0.7621466950928664,
2580
+ "eval_pearson_euclidean": 0.8148266069217982,
2581
+ "eval_pearson_manhattan": 0.8163309873805886,
2582
+ "eval_runtime": 6.1003,
2583
+ "eval_samples_per_second": 245.888,
2584
+ "eval_spearman_cosine": 0.8292279657552445,
2585
+ "eval_spearman_dot": 0.7643385930827495,
2586
+ "eval_spearman_euclidean": 0.822998286159295,
2587
+ "eval_spearman_manhattan": 0.8242741076199507,
2588
+ "eval_steps_per_second": 15.409,
2589
+ "step": 3000
2590
  }
2591
  ],
2592
  "logging_steps": 10,