CocoRoF commited on
Commit
bb61318
·
verified ·
1 Parent(s): 6071833

Training in progress, step 4000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:878397b694b0a8de341b4b8d86e9a615129650afe4a1ebc2f9f3ccfb75d0c0c8
3
  size 737580392
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34566692b409c5ac91614f85431dcd34c66b34e43f4a92470ac88cb8a7f59789
3
  size 737580392
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0acf7430500cf883fa2666bfba6735859811567ffb5b4b4b2939205ee3547014
3
  size 1475248442
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4dde92a9190daf4d6cde776e30b6451c543ea444edee3b9afc951fa9a8b0c5c7
3
  size 1475248442
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:59efe62e4ca0647678855566a69eaafc20fb9e01c9af7b6b454bf0717d7bf5f7
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f9aa43992237c34da047eae3e4635545e3cbee9026436669a8ec61ef48f58c1
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d4f48bf96def541f86640977b0dc57c5078e1aaca13e1c80e28041dac90f6386
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a5a37a50084edc8f378e7d4e65f501b6827a819c6aec4a25edf84ae7f0723a0
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.4058106841611997,
5
  "eval_steps": 250,
6
- "global_step": 3000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2299,6 +2299,770 @@
2299
  "eval_spearman_manhattan": 0.8142678938453525,
2300
  "eval_steps_per_second": 27.696,
2301
  "step": 3000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2302
  }
2303
  ],
2304
  "logging_steps": 10,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.8744142455482662,
5
  "eval_steps": 250,
6
+ "global_step": 4000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2299
  "eval_spearman_manhattan": 0.8142678938453525,
2300
  "eval_steps_per_second": 27.696,
2301
  "step": 3000
2302
+ },
2303
+ {
2304
+ "epoch": 1.4104967197750704,
2305
+ "grad_norm": 2.0516934394836426,
2306
+ "learning_rate": 1.911843955014058e-05,
2307
+ "loss": 0.2469,
2308
+ "step": 3010
2309
+ },
2310
+ {
2311
+ "epoch": 1.415182755388941,
2312
+ "grad_norm": 1.8646856546401978,
2313
+ "learning_rate": 1.9115510777881914e-05,
2314
+ "loss": 0.2434,
2315
+ "step": 3020
2316
+ },
2317
+ {
2318
+ "epoch": 1.4198687910028116,
2319
+ "grad_norm": 1.746596097946167,
2320
+ "learning_rate": 1.9112582005623243e-05,
2321
+ "loss": 0.2392,
2322
+ "step": 3030
2323
+ },
2324
+ {
2325
+ "epoch": 1.4245548266166823,
2326
+ "grad_norm": 1.7546746730804443,
2327
+ "learning_rate": 1.9109653233364576e-05,
2328
+ "loss": 0.2141,
2329
+ "step": 3040
2330
+ },
2331
+ {
2332
+ "epoch": 1.429240862230553,
2333
+ "grad_norm": 1.7862520217895508,
2334
+ "learning_rate": 1.9106724461105905e-05,
2335
+ "loss": 0.2424,
2336
+ "step": 3050
2337
+ },
2338
+ {
2339
+ "epoch": 1.4339268978444237,
2340
+ "grad_norm": 2.089353322982788,
2341
+ "learning_rate": 1.910379568884724e-05,
2342
+ "loss": 0.2493,
2343
+ "step": 3060
2344
+ },
2345
+ {
2346
+ "epoch": 1.4386129334582942,
2347
+ "grad_norm": 1.7390618324279785,
2348
+ "learning_rate": 1.9100866916588568e-05,
2349
+ "loss": 0.2317,
2350
+ "step": 3070
2351
+ },
2352
+ {
2353
+ "epoch": 1.443298969072165,
2354
+ "grad_norm": 1.9297877550125122,
2355
+ "learning_rate": 1.90979381443299e-05,
2356
+ "loss": 0.2335,
2357
+ "step": 3080
2358
+ },
2359
+ {
2360
+ "epoch": 1.4479850046860356,
2361
+ "grad_norm": 1.872578740119934,
2362
+ "learning_rate": 1.909500937207123e-05,
2363
+ "loss": 0.222,
2364
+ "step": 3090
2365
+ },
2366
+ {
2367
+ "epoch": 1.4526710402999063,
2368
+ "grad_norm": 1.646843433380127,
2369
+ "learning_rate": 1.909208059981256e-05,
2370
+ "loss": 0.2154,
2371
+ "step": 3100
2372
+ },
2373
+ {
2374
+ "epoch": 1.457357075913777,
2375
+ "grad_norm": 1.8208719491958618,
2376
+ "learning_rate": 1.9089151827553893e-05,
2377
+ "loss": 0.2887,
2378
+ "step": 3110
2379
+ },
2380
+ {
2381
+ "epoch": 1.4620431115276475,
2382
+ "grad_norm": 1.6147174835205078,
2383
+ "learning_rate": 1.9086223055295222e-05,
2384
+ "loss": 0.2292,
2385
+ "step": 3120
2386
+ },
2387
+ {
2388
+ "epoch": 1.4667291471415183,
2389
+ "grad_norm": 2.147585391998291,
2390
+ "learning_rate": 1.908329428303655e-05,
2391
+ "loss": 0.2598,
2392
+ "step": 3130
2393
+ },
2394
+ {
2395
+ "epoch": 1.471415182755389,
2396
+ "grad_norm": 2.17818284034729,
2397
+ "learning_rate": 1.9080365510777884e-05,
2398
+ "loss": 0.2177,
2399
+ "step": 3140
2400
+ },
2401
+ {
2402
+ "epoch": 1.4761012183692597,
2403
+ "grad_norm": 1.7800393104553223,
2404
+ "learning_rate": 1.9077436738519214e-05,
2405
+ "loss": 0.2091,
2406
+ "step": 3150
2407
+ },
2408
+ {
2409
+ "epoch": 1.4807872539831304,
2410
+ "grad_norm": 1.8269144296646118,
2411
+ "learning_rate": 1.9074507966260543e-05,
2412
+ "loss": 0.241,
2413
+ "step": 3160
2414
+ },
2415
+ {
2416
+ "epoch": 1.4854732895970009,
2417
+ "grad_norm": 1.4544728994369507,
2418
+ "learning_rate": 1.9071579194001876e-05,
2419
+ "loss": 0.2481,
2420
+ "step": 3170
2421
+ },
2422
+ {
2423
+ "epoch": 1.4901593252108716,
2424
+ "grad_norm": 2.5941991806030273,
2425
+ "learning_rate": 1.9068650421743206e-05,
2426
+ "loss": 0.266,
2427
+ "step": 3180
2428
+ },
2429
+ {
2430
+ "epoch": 1.4948453608247423,
2431
+ "grad_norm": 1.9068180322647095,
2432
+ "learning_rate": 1.906572164948454e-05,
2433
+ "loss": 0.2546,
2434
+ "step": 3190
2435
+ },
2436
+ {
2437
+ "epoch": 1.499531396438613,
2438
+ "grad_norm": 1.590065598487854,
2439
+ "learning_rate": 1.9062792877225868e-05,
2440
+ "loss": 0.2292,
2441
+ "step": 3200
2442
+ },
2443
+ {
2444
+ "epoch": 1.5042174320524837,
2445
+ "grad_norm": 1.9283113479614258,
2446
+ "learning_rate": 1.9059864104967198e-05,
2447
+ "loss": 0.2485,
2448
+ "step": 3210
2449
+ },
2450
+ {
2451
+ "epoch": 1.5089034676663542,
2452
+ "grad_norm": 2.1952388286590576,
2453
+ "learning_rate": 1.905693533270853e-05,
2454
+ "loss": 0.2197,
2455
+ "step": 3220
2456
+ },
2457
+ {
2458
+ "epoch": 1.513589503280225,
2459
+ "grad_norm": 1.554611086845398,
2460
+ "learning_rate": 1.905400656044986e-05,
2461
+ "loss": 0.2136,
2462
+ "step": 3230
2463
+ },
2464
+ {
2465
+ "epoch": 1.5182755388940956,
2466
+ "grad_norm": 1.5786999464035034,
2467
+ "learning_rate": 1.9051077788191193e-05,
2468
+ "loss": 0.2333,
2469
+ "step": 3240
2470
+ },
2471
+ {
2472
+ "epoch": 1.522961574507966,
2473
+ "grad_norm": 2.7858917713165283,
2474
+ "learning_rate": 1.9048149015932523e-05,
2475
+ "loss": 0.2328,
2476
+ "step": 3250
2477
+ },
2478
+ {
2479
+ "epoch": 1.522961574507966,
2480
+ "eval_loss": 0.04040240868926048,
2481
+ "eval_pearson_cosine": 0.8187130517711054,
2482
+ "eval_pearson_dot": 0.7426107734380025,
2483
+ "eval_pearson_euclidean": 0.8069722435152613,
2484
+ "eval_pearson_manhattan": 0.8083599378238802,
2485
+ "eval_runtime": 3.6772,
2486
+ "eval_samples_per_second": 407.917,
2487
+ "eval_spearman_cosine": 0.8202782064729509,
2488
+ "eval_spearman_dot": 0.7414284976652127,
2489
+ "eval_spearman_euclidean": 0.8153801634076588,
2490
+ "eval_spearman_manhattan": 0.8165145388144227,
2491
+ "eval_steps_per_second": 25.563,
2492
+ "step": 3250
2493
+ },
2494
+ {
2495
+ "epoch": 1.527647610121837,
2496
+ "grad_norm": 1.9804577827453613,
2497
+ "learning_rate": 1.9045220243673855e-05,
2498
+ "loss": 0.252,
2499
+ "step": 3260
2500
+ },
2501
+ {
2502
+ "epoch": 1.5323336457357075,
2503
+ "grad_norm": 1.8716363906860352,
2504
+ "learning_rate": 1.9042291471415185e-05,
2505
+ "loss": 0.2266,
2506
+ "step": 3270
2507
+ },
2508
+ {
2509
+ "epoch": 1.5370196813495782,
2510
+ "grad_norm": 1.834250569343567,
2511
+ "learning_rate": 1.9039362699156518e-05,
2512
+ "loss": 0.2314,
2513
+ "step": 3280
2514
+ },
2515
+ {
2516
+ "epoch": 1.541705716963449,
2517
+ "grad_norm": 2.0380077362060547,
2518
+ "learning_rate": 1.9036433926897847e-05,
2519
+ "loss": 0.2545,
2520
+ "step": 3290
2521
+ },
2522
+ {
2523
+ "epoch": 1.5463917525773194,
2524
+ "grad_norm": 1.8950886726379395,
2525
+ "learning_rate": 1.9033505154639177e-05,
2526
+ "loss": 0.2902,
2527
+ "step": 3300
2528
+ },
2529
+ {
2530
+ "epoch": 1.5510777881911904,
2531
+ "grad_norm": 1.4216679334640503,
2532
+ "learning_rate": 1.9030576382380506e-05,
2533
+ "loss": 0.2344,
2534
+ "step": 3310
2535
+ },
2536
+ {
2537
+ "epoch": 1.5557638238050608,
2538
+ "grad_norm": 1.2056218385696411,
2539
+ "learning_rate": 1.902764761012184e-05,
2540
+ "loss": 0.252,
2541
+ "step": 3320
2542
+ },
2543
+ {
2544
+ "epoch": 1.5604498594189316,
2545
+ "grad_norm": 1.8112496137619019,
2546
+ "learning_rate": 1.902471883786317e-05,
2547
+ "loss": 0.2406,
2548
+ "step": 3330
2549
+ },
2550
+ {
2551
+ "epoch": 1.5651358950328023,
2552
+ "grad_norm": 1.9375700950622559,
2553
+ "learning_rate": 1.9021790065604498e-05,
2554
+ "loss": 0.2728,
2555
+ "step": 3340
2556
+ },
2557
+ {
2558
+ "epoch": 1.569821930646673,
2559
+ "grad_norm": 2.4203391075134277,
2560
+ "learning_rate": 1.901886129334583e-05,
2561
+ "loss": 0.2434,
2562
+ "step": 3350
2563
+ },
2564
+ {
2565
+ "epoch": 1.5745079662605437,
2566
+ "grad_norm": 2.114474058151245,
2567
+ "learning_rate": 1.901593252108716e-05,
2568
+ "loss": 0.2535,
2569
+ "step": 3360
2570
+ },
2571
+ {
2572
+ "epoch": 1.5791940018744142,
2573
+ "grad_norm": 1.5152201652526855,
2574
+ "learning_rate": 1.9013003748828493e-05,
2575
+ "loss": 0.2537,
2576
+ "step": 3370
2577
+ },
2578
+ {
2579
+ "epoch": 1.5838800374882849,
2580
+ "grad_norm": 1.8051055669784546,
2581
+ "learning_rate": 1.9010074976569823e-05,
2582
+ "loss": 0.2271,
2583
+ "step": 3380
2584
+ },
2585
+ {
2586
+ "epoch": 1.5885660731021556,
2587
+ "grad_norm": 2.005028247833252,
2588
+ "learning_rate": 1.9007146204311156e-05,
2589
+ "loss": 0.2508,
2590
+ "step": 3390
2591
+ },
2592
+ {
2593
+ "epoch": 1.5932521087160263,
2594
+ "grad_norm": 1.534379005432129,
2595
+ "learning_rate": 1.9004217432052485e-05,
2596
+ "loss": 0.2358,
2597
+ "step": 3400
2598
+ },
2599
+ {
2600
+ "epoch": 1.597938144329897,
2601
+ "grad_norm": 1.2152713537216187,
2602
+ "learning_rate": 1.9001288659793815e-05,
2603
+ "loss": 0.203,
2604
+ "step": 3410
2605
+ },
2606
+ {
2607
+ "epoch": 1.6026241799437675,
2608
+ "grad_norm": 1.584352970123291,
2609
+ "learning_rate": 1.8998359887535148e-05,
2610
+ "loss": 0.2369,
2611
+ "step": 3420
2612
+ },
2613
+ {
2614
+ "epoch": 1.6073102155576382,
2615
+ "grad_norm": 1.8603837490081787,
2616
+ "learning_rate": 1.8995431115276477e-05,
2617
+ "loss": 0.2576,
2618
+ "step": 3430
2619
+ },
2620
+ {
2621
+ "epoch": 1.611996251171509,
2622
+ "grad_norm": 1.300493597984314,
2623
+ "learning_rate": 1.899250234301781e-05,
2624
+ "loss": 0.2048,
2625
+ "step": 3440
2626
+ },
2627
+ {
2628
+ "epoch": 1.6166822867853796,
2629
+ "grad_norm": 1.6629600524902344,
2630
+ "learning_rate": 1.898957357075914e-05,
2631
+ "loss": 0.2305,
2632
+ "step": 3450
2633
+ },
2634
+ {
2635
+ "epoch": 1.6213683223992503,
2636
+ "grad_norm": 2.555297374725342,
2637
+ "learning_rate": 1.8986644798500473e-05,
2638
+ "loss": 0.2487,
2639
+ "step": 3460
2640
+ },
2641
+ {
2642
+ "epoch": 1.6260543580131208,
2643
+ "grad_norm": 2.425975799560547,
2644
+ "learning_rate": 1.8983716026241802e-05,
2645
+ "loss": 0.241,
2646
+ "step": 3470
2647
+ },
2648
+ {
2649
+ "epoch": 1.6307403936269915,
2650
+ "grad_norm": 1.912858247756958,
2651
+ "learning_rate": 1.898078725398313e-05,
2652
+ "loss": 0.2074,
2653
+ "step": 3480
2654
+ },
2655
+ {
2656
+ "epoch": 1.6354264292408622,
2657
+ "grad_norm": 2.066469430923462,
2658
+ "learning_rate": 1.897785848172446e-05,
2659
+ "loss": 0.2157,
2660
+ "step": 3490
2661
+ },
2662
+ {
2663
+ "epoch": 1.640112464854733,
2664
+ "grad_norm": 1.5405519008636475,
2665
+ "learning_rate": 1.8974929709465794e-05,
2666
+ "loss": 0.2052,
2667
+ "step": 3500
2668
+ },
2669
+ {
2670
+ "epoch": 1.640112464854733,
2671
+ "eval_loss": 0.03896905109286308,
2672
+ "eval_pearson_cosine": 0.8147239668269464,
2673
+ "eval_pearson_dot": 0.74259279719071,
2674
+ "eval_pearson_euclidean": 0.803459120860885,
2675
+ "eval_pearson_manhattan": 0.8045289290680273,
2676
+ "eval_runtime": 3.5517,
2677
+ "eval_samples_per_second": 422.338,
2678
+ "eval_spearman_cosine": 0.8164243711017356,
2679
+ "eval_spearman_dot": 0.742209739118843,
2680
+ "eval_spearman_euclidean": 0.8121850450179654,
2681
+ "eval_spearman_manhattan": 0.8128758362528613,
2682
+ "eval_steps_per_second": 26.467,
2683
+ "step": 3500
2684
+ },
2685
+ {
2686
+ "epoch": 1.6447985004686037,
2687
+ "grad_norm": 1.4605026245117188,
2688
+ "learning_rate": 1.8972000937207123e-05,
2689
+ "loss": 0.2329,
2690
+ "step": 3510
2691
+ },
2692
+ {
2693
+ "epoch": 1.6494845360824741,
2694
+ "grad_norm": 1.496071457862854,
2695
+ "learning_rate": 1.8969072164948453e-05,
2696
+ "loss": 0.2171,
2697
+ "step": 3520
2698
+ },
2699
+ {
2700
+ "epoch": 1.6541705716963448,
2701
+ "grad_norm": 1.5330686569213867,
2702
+ "learning_rate": 1.8966143392689786e-05,
2703
+ "loss": 0.235,
2704
+ "step": 3530
2705
+ },
2706
+ {
2707
+ "epoch": 1.6588566073102156,
2708
+ "grad_norm": 1.77309250831604,
2709
+ "learning_rate": 1.8963214620431115e-05,
2710
+ "loss": 0.2398,
2711
+ "step": 3540
2712
+ },
2713
+ {
2714
+ "epoch": 1.6635426429240863,
2715
+ "grad_norm": 2.6333131790161133,
2716
+ "learning_rate": 1.8960285848172448e-05,
2717
+ "loss": 0.2444,
2718
+ "step": 3550
2719
+ },
2720
+ {
2721
+ "epoch": 1.668228678537957,
2722
+ "grad_norm": 1.7444895505905151,
2723
+ "learning_rate": 1.8957357075913778e-05,
2724
+ "loss": 0.2308,
2725
+ "step": 3560
2726
+ },
2727
+ {
2728
+ "epoch": 1.6729147141518275,
2729
+ "grad_norm": 1.9555528163909912,
2730
+ "learning_rate": 1.895442830365511e-05,
2731
+ "loss": 0.2177,
2732
+ "step": 3570
2733
+ },
2734
+ {
2735
+ "epoch": 1.6776007497656982,
2736
+ "grad_norm": 1.8581876754760742,
2737
+ "learning_rate": 1.895149953139644e-05,
2738
+ "loss": 0.2287,
2739
+ "step": 3580
2740
+ },
2741
+ {
2742
+ "epoch": 1.6822867853795689,
2743
+ "grad_norm": 1.9251590967178345,
2744
+ "learning_rate": 1.894857075913777e-05,
2745
+ "loss": 0.2547,
2746
+ "step": 3590
2747
+ },
2748
+ {
2749
+ "epoch": 1.6869728209934396,
2750
+ "grad_norm": 1.6071839332580566,
2751
+ "learning_rate": 1.8945641986879102e-05,
2752
+ "loss": 0.2409,
2753
+ "step": 3600
2754
+ },
2755
+ {
2756
+ "epoch": 1.6916588566073103,
2757
+ "grad_norm": 1.735236406326294,
2758
+ "learning_rate": 1.8942713214620432e-05,
2759
+ "loss": 0.2315,
2760
+ "step": 3610
2761
+ },
2762
+ {
2763
+ "epoch": 1.6963448922211808,
2764
+ "grad_norm": 1.5945345163345337,
2765
+ "learning_rate": 1.8939784442361765e-05,
2766
+ "loss": 0.2283,
2767
+ "step": 3620
2768
+ },
2769
+ {
2770
+ "epoch": 1.7010309278350515,
2771
+ "grad_norm": 1.8697439432144165,
2772
+ "learning_rate": 1.8936855670103094e-05,
2773
+ "loss": 0.2382,
2774
+ "step": 3630
2775
+ },
2776
+ {
2777
+ "epoch": 1.7057169634489222,
2778
+ "grad_norm": 1.9775031805038452,
2779
+ "learning_rate": 1.8933926897844427e-05,
2780
+ "loss": 0.2167,
2781
+ "step": 3640
2782
+ },
2783
+ {
2784
+ "epoch": 1.710402999062793,
2785
+ "grad_norm": 1.7976644039154053,
2786
+ "learning_rate": 1.8930998125585757e-05,
2787
+ "loss": 0.2252,
2788
+ "step": 3650
2789
+ },
2790
+ {
2791
+ "epoch": 1.7150890346766636,
2792
+ "grad_norm": 1.9334297180175781,
2793
+ "learning_rate": 1.8928069353327086e-05,
2794
+ "loss": 0.207,
2795
+ "step": 3660
2796
+ },
2797
+ {
2798
+ "epoch": 1.7197750702905341,
2799
+ "grad_norm": 1.661363124847412,
2800
+ "learning_rate": 1.892514058106842e-05,
2801
+ "loss": 0.2531,
2802
+ "step": 3670
2803
+ },
2804
+ {
2805
+ "epoch": 1.7244611059044048,
2806
+ "grad_norm": 2.2192280292510986,
2807
+ "learning_rate": 1.892221180880975e-05,
2808
+ "loss": 0.2552,
2809
+ "step": 3680
2810
+ },
2811
+ {
2812
+ "epoch": 1.7291471415182755,
2813
+ "grad_norm": 1.9910961389541626,
2814
+ "learning_rate": 1.8919283036551078e-05,
2815
+ "loss": 0.2288,
2816
+ "step": 3690
2817
+ },
2818
+ {
2819
+ "epoch": 1.7338331771321462,
2820
+ "grad_norm": 1.6120171546936035,
2821
+ "learning_rate": 1.8916354264292408e-05,
2822
+ "loss": 0.2122,
2823
+ "step": 3700
2824
+ },
2825
+ {
2826
+ "epoch": 1.738519212746017,
2827
+ "grad_norm": 1.8387460708618164,
2828
+ "learning_rate": 1.891342549203374e-05,
2829
+ "loss": 0.2292,
2830
+ "step": 3710
2831
+ },
2832
+ {
2833
+ "epoch": 1.7432052483598874,
2834
+ "grad_norm": 1.9124442338943481,
2835
+ "learning_rate": 1.891049671977507e-05,
2836
+ "loss": 0.2469,
2837
+ "step": 3720
2838
+ },
2839
+ {
2840
+ "epoch": 1.7478912839737581,
2841
+ "grad_norm": 2.267001152038574,
2842
+ "learning_rate": 1.8907567947516403e-05,
2843
+ "loss": 0.2139,
2844
+ "step": 3730
2845
+ },
2846
+ {
2847
+ "epoch": 1.7525773195876289,
2848
+ "grad_norm": 1.9495887756347656,
2849
+ "learning_rate": 1.8904639175257732e-05,
2850
+ "loss": 0.2476,
2851
+ "step": 3740
2852
+ },
2853
+ {
2854
+ "epoch": 1.7572633552014996,
2855
+ "grad_norm": 1.8650192022323608,
2856
+ "learning_rate": 1.8901710402999065e-05,
2857
+ "loss": 0.262,
2858
+ "step": 3750
2859
+ },
2860
+ {
2861
+ "epoch": 1.7572633552014996,
2862
+ "eval_loss": 0.04190748557448387,
2863
+ "eval_pearson_cosine": 0.8188060652498521,
2864
+ "eval_pearson_dot": 0.7306266271006336,
2865
+ "eval_pearson_euclidean": 0.8067178845162717,
2866
+ "eval_pearson_manhattan": 0.8079651581429825,
2867
+ "eval_runtime": 3.6866,
2868
+ "eval_samples_per_second": 406.882,
2869
+ "eval_spearman_cosine": 0.8203796294288378,
2870
+ "eval_spearman_dot": 0.7294293478148326,
2871
+ "eval_spearman_euclidean": 0.8157749153170877,
2872
+ "eval_spearman_manhattan": 0.8170299724489123,
2873
+ "eval_steps_per_second": 25.498,
2874
+ "step": 3750
2875
+ },
2876
+ {
2877
+ "epoch": 1.7619493908153703,
2878
+ "grad_norm": 1.78036367893219,
2879
+ "learning_rate": 1.8898781630740395e-05,
2880
+ "loss": 0.2441,
2881
+ "step": 3760
2882
+ },
2883
+ {
2884
+ "epoch": 1.7666354264292408,
2885
+ "grad_norm": 2.0995302200317383,
2886
+ "learning_rate": 1.8895852858481728e-05,
2887
+ "loss": 0.2528,
2888
+ "step": 3770
2889
+ },
2890
+ {
2891
+ "epoch": 1.7713214620431117,
2892
+ "grad_norm": 1.7772722244262695,
2893
+ "learning_rate": 1.8892924086223057e-05,
2894
+ "loss": 0.2305,
2895
+ "step": 3780
2896
+ },
2897
+ {
2898
+ "epoch": 1.7760074976569822,
2899
+ "grad_norm": 2.158984661102295,
2900
+ "learning_rate": 1.8889995313964387e-05,
2901
+ "loss": 0.2374,
2902
+ "step": 3790
2903
+ },
2904
+ {
2905
+ "epoch": 1.780693533270853,
2906
+ "grad_norm": 1.888215184211731,
2907
+ "learning_rate": 1.888706654170572e-05,
2908
+ "loss": 0.2458,
2909
+ "step": 3800
2910
+ },
2911
+ {
2912
+ "epoch": 1.7853795688847236,
2913
+ "grad_norm": 2.109557867050171,
2914
+ "learning_rate": 1.888413776944705e-05,
2915
+ "loss": 0.2463,
2916
+ "step": 3810
2917
+ },
2918
+ {
2919
+ "epoch": 1.790065604498594,
2920
+ "grad_norm": 1.715156078338623,
2921
+ "learning_rate": 1.8881208997188382e-05,
2922
+ "loss": 0.2421,
2923
+ "step": 3820
2924
+ },
2925
+ {
2926
+ "epoch": 1.794751640112465,
2927
+ "grad_norm": 2.860567808151245,
2928
+ "learning_rate": 1.887828022492971e-05,
2929
+ "loss": 0.2728,
2930
+ "step": 3830
2931
+ },
2932
+ {
2933
+ "epoch": 1.7994376757263355,
2934
+ "grad_norm": 1.527709722518921,
2935
+ "learning_rate": 1.887535145267104e-05,
2936
+ "loss": 0.1982,
2937
+ "step": 3840
2938
+ },
2939
+ {
2940
+ "epoch": 1.8041237113402062,
2941
+ "grad_norm": 2.2306032180786133,
2942
+ "learning_rate": 1.8872422680412374e-05,
2943
+ "loss": 0.2269,
2944
+ "step": 3850
2945
+ },
2946
+ {
2947
+ "epoch": 1.808809746954077,
2948
+ "grad_norm": 1.285987377166748,
2949
+ "learning_rate": 1.8869493908153703e-05,
2950
+ "loss": 0.1879,
2951
+ "step": 3860
2952
+ },
2953
+ {
2954
+ "epoch": 1.8134957825679474,
2955
+ "grad_norm": 1.7815241813659668,
2956
+ "learning_rate": 1.8866565135895033e-05,
2957
+ "loss": 0.2337,
2958
+ "step": 3870
2959
+ },
2960
+ {
2961
+ "epoch": 1.8181818181818183,
2962
+ "grad_norm": 1.5507057905197144,
2963
+ "learning_rate": 1.8863636363636366e-05,
2964
+ "loss": 0.2542,
2965
+ "step": 3880
2966
+ },
2967
+ {
2968
+ "epoch": 1.8228678537956888,
2969
+ "grad_norm": 1.8540301322937012,
2970
+ "learning_rate": 1.8860707591377695e-05,
2971
+ "loss": 0.2668,
2972
+ "step": 3890
2973
+ },
2974
+ {
2975
+ "epoch": 1.8275538894095595,
2976
+ "grad_norm": 1.5156047344207764,
2977
+ "learning_rate": 1.8857778819119025e-05,
2978
+ "loss": 0.2438,
2979
+ "step": 3900
2980
+ },
2981
+ {
2982
+ "epoch": 1.8322399250234302,
2983
+ "grad_norm": 1.752375841140747,
2984
+ "learning_rate": 1.8854850046860358e-05,
2985
+ "loss": 0.2324,
2986
+ "step": 3910
2987
+ },
2988
+ {
2989
+ "epoch": 1.8369259606373007,
2990
+ "grad_norm": 1.9103461503982544,
2991
+ "learning_rate": 1.8851921274601687e-05,
2992
+ "loss": 0.2049,
2993
+ "step": 3920
2994
+ },
2995
+ {
2996
+ "epoch": 1.8416119962511717,
2997
+ "grad_norm": 2.043072462081909,
2998
+ "learning_rate": 1.884899250234302e-05,
2999
+ "loss": 0.2178,
3000
+ "step": 3930
3001
+ },
3002
+ {
3003
+ "epoch": 1.8462980318650422,
3004
+ "grad_norm": 1.7910971641540527,
3005
+ "learning_rate": 1.884606373008435e-05,
3006
+ "loss": 0.2527,
3007
+ "step": 3940
3008
+ },
3009
+ {
3010
+ "epoch": 1.8509840674789129,
3011
+ "grad_norm": 2.146057605743408,
3012
+ "learning_rate": 1.8843134957825682e-05,
3013
+ "loss": 0.2235,
3014
+ "step": 3950
3015
+ },
3016
+ {
3017
+ "epoch": 1.8556701030927836,
3018
+ "grad_norm": 2.3780410289764404,
3019
+ "learning_rate": 1.8840206185567012e-05,
3020
+ "loss": 0.2122,
3021
+ "step": 3960
3022
+ },
3023
+ {
3024
+ "epoch": 1.860356138706654,
3025
+ "grad_norm": 1.7693490982055664,
3026
+ "learning_rate": 1.8837277413308345e-05,
3027
+ "loss": 0.2663,
3028
+ "step": 3970
3029
+ },
3030
+ {
3031
+ "epoch": 1.865042174320525,
3032
+ "grad_norm": 1.9694123268127441,
3033
+ "learning_rate": 1.8834348641049674e-05,
3034
+ "loss": 0.2357,
3035
+ "step": 3980
3036
+ },
3037
+ {
3038
+ "epoch": 1.8697282099343955,
3039
+ "grad_norm": 1.9589118957519531,
3040
+ "learning_rate": 1.8831419868791004e-05,
3041
+ "loss": 0.2549,
3042
+ "step": 3990
3043
+ },
3044
+ {
3045
+ "epoch": 1.8744142455482662,
3046
+ "grad_norm": 1.7447434663772583,
3047
+ "learning_rate": 1.8828491096532337e-05,
3048
+ "loss": 0.2269,
3049
+ "step": 4000
3050
+ },
3051
+ {
3052
+ "epoch": 1.8744142455482662,
3053
+ "eval_loss": 0.03927910327911377,
3054
+ "eval_pearson_cosine": 0.8218360838151568,
3055
+ "eval_pearson_dot": 0.7383648469379267,
3056
+ "eval_pearson_euclidean": 0.79845453202784,
3057
+ "eval_pearson_manhattan": 0.8001969992210543,
3058
+ "eval_runtime": 3.608,
3059
+ "eval_samples_per_second": 415.741,
3060
+ "eval_spearman_cosine": 0.8235262235943273,
3061
+ "eval_spearman_dot": 0.7374591899505454,
3062
+ "eval_spearman_euclidean": 0.8094068736950194,
3063
+ "eval_spearman_manhattan": 0.8111787861762638,
3064
+ "eval_steps_per_second": 26.053,
3065
+ "step": 4000
3066
  }
3067
  ],
3068
  "logging_steps": 10,