| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 4.980443285528032, |
| "eval_steps": 500, |
| "global_step": 955, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.005215123859191656, |
| "grad_norm": 8.640251838986332, |
| "learning_rate": 8.333333333333333e-07, |
| "loss": 1.3109, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.010430247718383311, |
| "grad_norm": 8.50165895125568, |
| "learning_rate": 1.6666666666666667e-06, |
| "loss": 1.2827, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.01564537157757497, |
| "grad_norm": 8.535323811460232, |
| "learning_rate": 2.5e-06, |
| "loss": 1.2971, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.020860495436766623, |
| "grad_norm": 8.005773755772996, |
| "learning_rate": 3.3333333333333333e-06, |
| "loss": 1.2835, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.02607561929595828, |
| "grad_norm": 6.542532290299446, |
| "learning_rate": 4.166666666666667e-06, |
| "loss": 1.2424, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.03129074315514994, |
| "grad_norm": 3.513863098870053, |
| "learning_rate": 5e-06, |
| "loss": 1.1855, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.03650586701434159, |
| "grad_norm": 2.7215053265085425, |
| "learning_rate": 5.833333333333334e-06, |
| "loss": 1.1771, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.041720990873533245, |
| "grad_norm": 6.45835816206034, |
| "learning_rate": 6.666666666666667e-06, |
| "loss": 1.1818, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.0469361147327249, |
| "grad_norm": 6.896643054762279, |
| "learning_rate": 7.500000000000001e-06, |
| "loss": 1.1927, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.05215123859191656, |
| "grad_norm": 7.106049193713323, |
| "learning_rate": 8.333333333333334e-06, |
| "loss": 1.192, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.05736636245110821, |
| "grad_norm": 5.8848429339749755, |
| "learning_rate": 9.166666666666666e-06, |
| "loss": 1.1358, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.06258148631029987, |
| "grad_norm": 5.16279034442262, |
| "learning_rate": 1e-05, |
| "loss": 1.1187, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.06779661016949153, |
| "grad_norm": 3.4057013372590914, |
| "learning_rate": 1.0833333333333334e-05, |
| "loss": 1.1044, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.07301173402868318, |
| "grad_norm": 2.2087972161452516, |
| "learning_rate": 1.1666666666666668e-05, |
| "loss": 1.0906, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.07822685788787484, |
| "grad_norm": 2.1528478641437006, |
| "learning_rate": 1.25e-05, |
| "loss": 1.0749, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.08344198174706649, |
| "grad_norm": 2.0837468590704984, |
| "learning_rate": 1.3333333333333333e-05, |
| "loss": 1.041, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.08865710560625815, |
| "grad_norm": 2.3435856245272064, |
| "learning_rate": 1.416666666666667e-05, |
| "loss": 1.0308, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.0938722294654498, |
| "grad_norm": 1.7734881445436932, |
| "learning_rate": 1.5000000000000002e-05, |
| "loss": 1.0104, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.09908735332464146, |
| "grad_norm": 1.2852782680220982, |
| "learning_rate": 1.5833333333333333e-05, |
| "loss": 1.0159, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.10430247718383312, |
| "grad_norm": 1.4664459559013807, |
| "learning_rate": 1.6666666666666667e-05, |
| "loss": 0.9997, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.10951760104302477, |
| "grad_norm": 1.2747674832880032, |
| "learning_rate": 1.7500000000000002e-05, |
| "loss": 0.9999, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.11473272490221642, |
| "grad_norm": 1.1325966855930794, |
| "learning_rate": 1.8333333333333333e-05, |
| "loss": 0.9845, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.11994784876140809, |
| "grad_norm": 1.178213874446251, |
| "learning_rate": 1.916666666666667e-05, |
| "loss": 0.9754, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.12516297262059975, |
| "grad_norm": 1.0070300787437625, |
| "learning_rate": 2e-05, |
| "loss": 0.9916, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.1303780964797914, |
| "grad_norm": 1.1301814564159125, |
| "learning_rate": 2.0833333333333336e-05, |
| "loss": 0.9535, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.13559322033898305, |
| "grad_norm": 1.1582102527170561, |
| "learning_rate": 2.1666666666666667e-05, |
| "loss": 0.9559, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.1408083441981747, |
| "grad_norm": 1.28096007382199, |
| "learning_rate": 2.25e-05, |
| "loss": 0.953, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.14602346805736635, |
| "grad_norm": 1.3963901299642703, |
| "learning_rate": 2.3333333333333336e-05, |
| "loss": 0.9467, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.15123859191655803, |
| "grad_norm": 1.2008437237669338, |
| "learning_rate": 2.4166666666666667e-05, |
| "loss": 0.9319, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.15645371577574968, |
| "grad_norm": 1.4579402450445522, |
| "learning_rate": 2.5e-05, |
| "loss": 0.9583, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.16166883963494133, |
| "grad_norm": 1.332654985022459, |
| "learning_rate": 2.5833333333333336e-05, |
| "loss": 0.9222, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.16688396349413298, |
| "grad_norm": 1.0118249538528512, |
| "learning_rate": 2.6666666666666667e-05, |
| "loss": 0.919, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.17209908735332463, |
| "grad_norm": 1.9244594897055562, |
| "learning_rate": 2.75e-05, |
| "loss": 0.9432, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.1773142112125163, |
| "grad_norm": 1.1988103983333642, |
| "learning_rate": 2.833333333333334e-05, |
| "loss": 0.9309, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.18252933507170796, |
| "grad_norm": 1.375838275777245, |
| "learning_rate": 2.9166666666666666e-05, |
| "loss": 0.9164, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.1877444589308996, |
| "grad_norm": 1.662174214128309, |
| "learning_rate": 3.0000000000000004e-05, |
| "loss": 0.9116, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.19295958279009126, |
| "grad_norm": 19.880826748141878, |
| "learning_rate": 3.0833333333333335e-05, |
| "loss": 0.9136, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.1981747066492829, |
| "grad_norm": 2.1089494461311897, |
| "learning_rate": 3.1666666666666666e-05, |
| "loss": 0.9288, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.2033898305084746, |
| "grad_norm": 1.0754107081344952, |
| "learning_rate": 3.2500000000000004e-05, |
| "loss": 0.9173, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.20860495436766624, |
| "grad_norm": 3.9611232218120964, |
| "learning_rate": 3.3333333333333335e-05, |
| "loss": 0.9227, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.2138200782268579, |
| "grad_norm": 3.0041123063486306, |
| "learning_rate": 3.4166666666666666e-05, |
| "loss": 0.9319, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.21903520208604954, |
| "grad_norm": 2.7847797531368066, |
| "learning_rate": 3.5000000000000004e-05, |
| "loss": 0.9233, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.2242503259452412, |
| "grad_norm": 2.2845773931740725, |
| "learning_rate": 3.5833333333333335e-05, |
| "loss": 0.9286, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.22946544980443284, |
| "grad_norm": 2.9550586599454363, |
| "learning_rate": 3.6666666666666666e-05, |
| "loss": 0.9194, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.23468057366362452, |
| "grad_norm": 1.713180532466099, |
| "learning_rate": 3.7500000000000003e-05, |
| "loss": 0.8909, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.23989569752281617, |
| "grad_norm": 3.6245500815283984, |
| "learning_rate": 3.833333333333334e-05, |
| "loss": 0.9061, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.24511082138200782, |
| "grad_norm": 2.7780460507158034, |
| "learning_rate": 3.9166666666666665e-05, |
| "loss": 0.9165, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.2503259452411995, |
| "grad_norm": 3.3854708521313563, |
| "learning_rate": 4e-05, |
| "loss": 0.9076, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.25554106910039115, |
| "grad_norm": 2.788766938646062, |
| "learning_rate": 4.0833333333333334e-05, |
| "loss": 0.9086, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.2607561929595828, |
| "grad_norm": 3.769154968802035, |
| "learning_rate": 4.166666666666667e-05, |
| "loss": 0.8985, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.26597131681877445, |
| "grad_norm": 3.6676032273552783, |
| "learning_rate": 4.25e-05, |
| "loss": 0.9186, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.2711864406779661, |
| "grad_norm": 2.300855067051272, |
| "learning_rate": 4.3333333333333334e-05, |
| "loss": 0.8785, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.27640156453715775, |
| "grad_norm": 2.0855212942964156, |
| "learning_rate": 4.416666666666667e-05, |
| "loss": 0.9044, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.2816166883963494, |
| "grad_norm": 3.1541260770808144, |
| "learning_rate": 4.5e-05, |
| "loss": 0.9134, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.28683181225554105, |
| "grad_norm": 2.3439141139233635, |
| "learning_rate": 4.5833333333333334e-05, |
| "loss": 0.8849, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.2920469361147327, |
| "grad_norm": 3.622853136869387, |
| "learning_rate": 4.666666666666667e-05, |
| "loss": 0.8968, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.29726205997392435, |
| "grad_norm": 3.3444739650293793, |
| "learning_rate": 4.75e-05, |
| "loss": 0.9119, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.30247718383311606, |
| "grad_norm": 2.4056743976018007, |
| "learning_rate": 4.8333333333333334e-05, |
| "loss": 0.8978, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.3076923076923077, |
| "grad_norm": 2.0165924530855692, |
| "learning_rate": 4.916666666666667e-05, |
| "loss": 0.8842, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.31290743155149936, |
| "grad_norm": 3.199274379857031, |
| "learning_rate": 5e-05, |
| "loss": 0.8926, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.318122555410691, |
| "grad_norm": 2.145560466910737, |
| "learning_rate": 5.0833333333333333e-05, |
| "loss": 0.8875, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.32333767926988266, |
| "grad_norm": 3.3364603622456923, |
| "learning_rate": 5.166666666666667e-05, |
| "loss": 0.8891, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.3285528031290743, |
| "grad_norm": 3.088241155388676, |
| "learning_rate": 5.25e-05, |
| "loss": 0.8989, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.33376792698826596, |
| "grad_norm": 2.2703016974586836, |
| "learning_rate": 5.333333333333333e-05, |
| "loss": 0.8848, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.3389830508474576, |
| "grad_norm": 2.1056335353722875, |
| "learning_rate": 5.416666666666667e-05, |
| "loss": 0.8895, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.34419817470664926, |
| "grad_norm": 2.195474617684906, |
| "learning_rate": 5.5e-05, |
| "loss": 0.8855, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.3494132985658409, |
| "grad_norm": 1.8388111784516368, |
| "learning_rate": 5.583333333333333e-05, |
| "loss": 0.88, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.3546284224250326, |
| "grad_norm": 2.205537218893776, |
| "learning_rate": 5.666666666666668e-05, |
| "loss": 0.8923, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.35984354628422427, |
| "grad_norm": 1.43087259231813, |
| "learning_rate": 5.75e-05, |
| "loss": 0.8783, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.3650586701434159, |
| "grad_norm": 3.170207545618331, |
| "learning_rate": 5.833333333333333e-05, |
| "loss": 0.8979, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.37027379400260757, |
| "grad_norm": 2.4965183591219686, |
| "learning_rate": 5.916666666666668e-05, |
| "loss": 0.887, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.3754889178617992, |
| "grad_norm": 2.18025601493941, |
| "learning_rate": 6.000000000000001e-05, |
| "loss": 0.8885, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.38070404172099087, |
| "grad_norm": 2.320595439694094, |
| "learning_rate": 6.083333333333333e-05, |
| "loss": 0.8815, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.3859191655801825, |
| "grad_norm": 2.0185818109580467, |
| "learning_rate": 6.166666666666667e-05, |
| "loss": 0.8831, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.39113428943937417, |
| "grad_norm": 2.2105832623786745, |
| "learning_rate": 6.25e-05, |
| "loss": 0.8666, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.3963494132985658, |
| "grad_norm": 1.9340187658703214, |
| "learning_rate": 6.333333333333333e-05, |
| "loss": 0.888, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.4015645371577575, |
| "grad_norm": 1.811786813233452, |
| "learning_rate": 6.416666666666668e-05, |
| "loss": 0.889, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.4067796610169492, |
| "grad_norm": 2.0985830559238563, |
| "learning_rate": 6.500000000000001e-05, |
| "loss": 0.8945, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.41199478487614083, |
| "grad_norm": 2.7706210707741885, |
| "learning_rate": 6.583333333333334e-05, |
| "loss": 0.903, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.4172099087353325, |
| "grad_norm": 2.328367128642788, |
| "learning_rate": 6.666666666666667e-05, |
| "loss": 0.886, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.42242503259452413, |
| "grad_norm": 2.7310201710261857, |
| "learning_rate": 6.75e-05, |
| "loss": 0.8728, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.4276401564537158, |
| "grad_norm": 2.012060559904938, |
| "learning_rate": 6.833333333333333e-05, |
| "loss": 0.8807, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.43285528031290743, |
| "grad_norm": 1.8442023724268992, |
| "learning_rate": 6.916666666666668e-05, |
| "loss": 0.8832, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.4380704041720991, |
| "grad_norm": 3.3879560119725225, |
| "learning_rate": 7.000000000000001e-05, |
| "loss": 0.8848, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.44328552803129073, |
| "grad_norm": 2.0302673957424457, |
| "learning_rate": 7.083333333333334e-05, |
| "loss": 0.8716, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.4485006518904824, |
| "grad_norm": 3.238439147300149, |
| "learning_rate": 7.166666666666667e-05, |
| "loss": 0.8896, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.45371577574967403, |
| "grad_norm": 2.718682181237891, |
| "learning_rate": 7.25e-05, |
| "loss": 0.902, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.4589308996088657, |
| "grad_norm": 2.4102500849494035, |
| "learning_rate": 7.333333333333333e-05, |
| "loss": 0.8865, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.4641460234680574, |
| "grad_norm": 2.057159117257536, |
| "learning_rate": 7.416666666666668e-05, |
| "loss": 0.8714, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.46936114732724904, |
| "grad_norm": 2.303058894217337, |
| "learning_rate": 7.500000000000001e-05, |
| "loss": 0.875, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.4745762711864407, |
| "grad_norm": 1.853131768077206, |
| "learning_rate": 7.583333333333334e-05, |
| "loss": 0.8697, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.47979139504563234, |
| "grad_norm": 1.5089115884426068, |
| "learning_rate": 7.666666666666668e-05, |
| "loss": 0.8672, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.485006518904824, |
| "grad_norm": 3.288728672540363, |
| "learning_rate": 7.75e-05, |
| "loss": 0.8787, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.49022164276401564, |
| "grad_norm": 2.8218607718403197, |
| "learning_rate": 7.833333333333333e-05, |
| "loss": 0.8886, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.4954367666232073, |
| "grad_norm": 1.966995373674031, |
| "learning_rate": 7.916666666666668e-05, |
| "loss": 0.8738, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.500651890482399, |
| "grad_norm": 4.193763943813693, |
| "learning_rate": 8e-05, |
| "loss": 0.8813, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.5058670143415906, |
| "grad_norm": 2.2253144313432514, |
| "learning_rate": 7.99997324882088e-05, |
| "loss": 0.8778, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.5110821382007823, |
| "grad_norm": 4.059040973559077, |
| "learning_rate": 7.999892995641334e-05, |
| "loss": 0.8844, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.516297262059974, |
| "grad_norm": 2.814215374527903, |
| "learning_rate": 7.999759241534794e-05, |
| "loss": 0.8899, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.5215123859191656, |
| "grad_norm": 3.1446586623159383, |
| "learning_rate": 7.9995719882903e-05, |
| "loss": 0.8933, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.5267275097783573, |
| "grad_norm": 2.5831777088882335, |
| "learning_rate": 7.999331238412474e-05, |
| "loss": 0.885, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.5319426336375489, |
| "grad_norm": 2.571515738465628, |
| "learning_rate": 7.99903699512149e-05, |
| "loss": 0.8773, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.5371577574967406, |
| "grad_norm": 2.028582558881839, |
| "learning_rate": 7.998689262353024e-05, |
| "loss": 0.875, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.5423728813559322, |
| "grad_norm": 1.9833279742128762, |
| "learning_rate": 7.998288044758206e-05, |
| "loss": 0.8864, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.5475880052151239, |
| "grad_norm": 1.595344120443328, |
| "learning_rate": 7.99783334770356e-05, |
| "loss": 0.8772, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.5528031290743155, |
| "grad_norm": 2.3698581665478025, |
| "learning_rate": 7.997325177270926e-05, |
| "loss": 0.8587, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.5580182529335072, |
| "grad_norm": 1.8069354168584124, |
| "learning_rate": 7.996763540257382e-05, |
| "loss": 0.8786, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.5632333767926988, |
| "grad_norm": 2.8802931859293968, |
| "learning_rate": 7.996148444175155e-05, |
| "loss": 0.8651, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.5684485006518905, |
| "grad_norm": 2.163396645739927, |
| "learning_rate": 7.99547989725152e-05, |
| "loss": 0.8779, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.5736636245110821, |
| "grad_norm": 1.7103752366008835, |
| "learning_rate": 7.994757908428683e-05, |
| "loss": 0.8627, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.5788787483702738, |
| "grad_norm": 2.6236911355349366, |
| "learning_rate": 7.99398248736367e-05, |
| "loss": 0.8775, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.5840938722294654, |
| "grad_norm": 1.7695200626233123, |
| "learning_rate": 7.993153644428198e-05, |
| "loss": 0.8675, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.5893089960886571, |
| "grad_norm": 3.1189819236023597, |
| "learning_rate": 7.992271390708529e-05, |
| "loss": 0.864, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.5945241199478487, |
| "grad_norm": 2.519671466442974, |
| "learning_rate": 7.991335738005325e-05, |
| "loss": 0.8753, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.5997392438070405, |
| "grad_norm": 2.445885184903824, |
| "learning_rate": 7.990346698833493e-05, |
| "loss": 0.8791, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.6049543676662321, |
| "grad_norm": 2.6107239711114634, |
| "learning_rate": 7.989304286422016e-05, |
| "loss": 0.8688, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.6101694915254238, |
| "grad_norm": 2.3280917526561926, |
| "learning_rate": 7.988208514713773e-05, |
| "loss": 0.8558, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.6153846153846154, |
| "grad_norm": 2.01951998260186, |
| "learning_rate": 7.987059398365358e-05, |
| "loss": 0.8611, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.6205997392438071, |
| "grad_norm": 2.67003693356188, |
| "learning_rate": 7.98585695274688e-05, |
| "loss": 0.8599, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.6258148631029987, |
| "grad_norm": 1.3989184556669023, |
| "learning_rate": 7.984601193941757e-05, |
| "loss": 0.8619, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.6310299869621904, |
| "grad_norm": 2.9657726548979007, |
| "learning_rate": 7.983292138746504e-05, |
| "loss": 0.8533, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.636245110821382, |
| "grad_norm": 2.2111378936066717, |
| "learning_rate": 7.981929804670505e-05, |
| "loss": 0.8647, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.6414602346805737, |
| "grad_norm": 1.7933021378995146, |
| "learning_rate": 7.980514209935783e-05, |
| "loss": 0.8572, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.6466753585397653, |
| "grad_norm": 2.6146019104563036, |
| "learning_rate": 7.97904537347675e-05, |
| "loss": 0.8629, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.651890482398957, |
| "grad_norm": 1.7226667254129786, |
| "learning_rate": 7.977523314939961e-05, |
| "loss": 0.8728, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.6571056062581486, |
| "grad_norm": 2.4681433772246804, |
| "learning_rate": 7.975948054683847e-05, |
| "loss": 0.8772, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.6623207301173403, |
| "grad_norm": 1.8531960262958524, |
| "learning_rate": 7.974319613778441e-05, |
| "loss": 0.8568, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.6675358539765319, |
| "grad_norm": 3.3273564870681067, |
| "learning_rate": 7.972638014005102e-05, |
| "loss": 0.8674, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.6727509778357236, |
| "grad_norm": 1.6565232049947463, |
| "learning_rate": 7.970903277856216e-05, |
| "loss": 0.8593, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.6779661016949152, |
| "grad_norm": 3.2772231491550157, |
| "learning_rate": 7.969115428534904e-05, |
| "loss": 0.8689, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.6831812255541069, |
| "grad_norm": 2.9366197326510175, |
| "learning_rate": 7.967274489954703e-05, |
| "loss": 0.8562, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.6883963494132985, |
| "grad_norm": 1.579970961599737, |
| "learning_rate": 7.965380486739253e-05, |
| "loss": 0.8422, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.6936114732724902, |
| "grad_norm": 3.1594495489774967, |
| "learning_rate": 7.963433444221964e-05, |
| "loss": 0.875, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.6988265971316818, |
| "grad_norm": 2.3403703396896423, |
| "learning_rate": 7.961433388445676e-05, |
| "loss": 0.8591, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.7040417209908736, |
| "grad_norm": 1.8764160357986022, |
| "learning_rate": 7.959380346162314e-05, |
| "loss": 0.8585, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.7092568448500652, |
| "grad_norm": 2.954668141615744, |
| "learning_rate": 7.957274344832533e-05, |
| "loss": 0.8558, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.7144719687092569, |
| "grad_norm": 2.1132431617150647, |
| "learning_rate": 7.955115412625337e-05, |
| "loss": 0.8597, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.7196870925684485, |
| "grad_norm": 1.9269531529548758, |
| "learning_rate": 7.952903578417719e-05, |
| "loss": 0.8553, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.7249022164276402, |
| "grad_norm": 1.9600506890917169, |
| "learning_rate": 7.950638871794268e-05, |
| "loss": 0.8498, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.7301173402868318, |
| "grad_norm": 1.295131341774357, |
| "learning_rate": 7.948321323046766e-05, |
| "loss": 0.862, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.7353324641460235, |
| "grad_norm": 2.598262766807541, |
| "learning_rate": 7.945950963173797e-05, |
| "loss": 0.8743, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.7405475880052151, |
| "grad_norm": 1.7173227811522496, |
| "learning_rate": 7.943527823880321e-05, |
| "loss": 0.857, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.7457627118644068, |
| "grad_norm": 2.778103098366881, |
| "learning_rate": 7.941051937577255e-05, |
| "loss": 0.8644, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.7509778357235984, |
| "grad_norm": 1.9721804979121416, |
| "learning_rate": 7.938523337381036e-05, |
| "loss": 0.8641, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.7561929595827901, |
| "grad_norm": 2.534467842230536, |
| "learning_rate": 7.935942057113185e-05, |
| "loss": 0.8607, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.7614080834419817, |
| "grad_norm": 2.420990126278799, |
| "learning_rate": 7.933308131299846e-05, |
| "loss": 0.8532, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.7666232073011734, |
| "grad_norm": 2.0250401094262775, |
| "learning_rate": 7.93062159517133e-05, |
| "loss": 0.8463, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.771838331160365, |
| "grad_norm": 1.949074222131184, |
| "learning_rate": 7.92788248466164e-05, |
| "loss": 0.8505, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.7770534550195567, |
| "grad_norm": 1.9136794648235018, |
| "learning_rate": 7.925090836407997e-05, |
| "loss": 0.8539, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.7822685788787483, |
| "grad_norm": 1.3862902439859852, |
| "learning_rate": 7.922246687750341e-05, |
| "loss": 0.8511, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.78748370273794, |
| "grad_norm": 2.238512517144782, |
| "learning_rate": 7.919350076730836e-05, |
| "loss": 0.846, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.7926988265971316, |
| "grad_norm": 2.0835619443377134, |
| "learning_rate": 7.916401042093361e-05, |
| "loss": 0.8461, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.7979139504563233, |
| "grad_norm": 1.8060011410999197, |
| "learning_rate": 7.913399623282997e-05, |
| "loss": 0.833, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.803129074315515, |
| "grad_norm": 2.0703223704187357, |
| "learning_rate": 7.910345860445487e-05, |
| "loss": 0.8447, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.8083441981747066, |
| "grad_norm": 2.299135311289539, |
| "learning_rate": 7.90723979442671e-05, |
| "loss": 0.8533, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.8135593220338984, |
| "grad_norm": 1.8730518820522357, |
| "learning_rate": 7.90408146677213e-05, |
| "loss": 0.8372, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.81877444589309, |
| "grad_norm": 1.5402936247797026, |
| "learning_rate": 7.900870919726244e-05, |
| "loss": 0.8408, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.8239895697522817, |
| "grad_norm": 2.378224013864058, |
| "learning_rate": 7.897608196232007e-05, |
| "loss": 0.8492, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.8292046936114733, |
| "grad_norm": 1.8241375846314414, |
| "learning_rate": 7.894293339930272e-05, |
| "loss": 0.8338, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.834419817470665, |
| "grad_norm": 2.2711975231391826, |
| "learning_rate": 7.890926395159197e-05, |
| "loss": 0.8385, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.8396349413298566, |
| "grad_norm": 1.6195285561704085, |
| "learning_rate": 7.887507406953651e-05, |
| "loss": 0.8489, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.8448500651890483, |
| "grad_norm": 2.2453846392163768, |
| "learning_rate": 7.884036421044618e-05, |
| "loss": 0.8487, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.8500651890482399, |
| "grad_norm": 1.940408392169657, |
| "learning_rate": 7.880513483858583e-05, |
| "loss": 0.8398, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.8552803129074316, |
| "grad_norm": 2.2159286823222675, |
| "learning_rate": 7.876938642516905e-05, |
| "loss": 0.8492, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.8604954367666232, |
| "grad_norm": 1.4842300453687591, |
| "learning_rate": 7.873311944835195e-05, |
| "loss": 0.8376, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.8657105606258149, |
| "grad_norm": 2.491755340230041, |
| "learning_rate": 7.869633439322674e-05, |
| "loss": 0.8386, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.8709256844850065, |
| "grad_norm": 1.8127014745278216, |
| "learning_rate": 7.865903175181521e-05, |
| "loss": 0.8318, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.8761408083441982, |
| "grad_norm": 1.5105514574695558, |
| "learning_rate": 7.862121202306217e-05, |
| "loss": 0.8317, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.8813559322033898, |
| "grad_norm": 2.485520131584514, |
| "learning_rate": 7.858287571282882e-05, |
| "loss": 0.8496, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.8865710560625815, |
| "grad_norm": 1.7516963763143856, |
| "learning_rate": 7.854402333388587e-05, |
| "loss": 0.8433, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.8917861799217731, |
| "grad_norm": 2.462296826145561, |
| "learning_rate": 7.850465540590684e-05, |
| "loss": 0.8537, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.8970013037809648, |
| "grad_norm": 1.1958019189200906, |
| "learning_rate": 7.846477245546094e-05, |
| "loss": 0.8323, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.9022164276401564, |
| "grad_norm": 2.6026749193200693, |
| "learning_rate": 7.842437501600616e-05, |
| "loss": 0.8516, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.9074315514993481, |
| "grad_norm": 1.7619131106705077, |
| "learning_rate": 7.838346362788206e-05, |
| "loss": 0.838, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.9126466753585397, |
| "grad_norm": 2.4386182828768312, |
| "learning_rate": 7.834203883830259e-05, |
| "loss": 0.8444, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.9178617992177314, |
| "grad_norm": 2.0969277412774274, |
| "learning_rate": 7.830010120134873e-05, |
| "loss": 0.8622, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.9230769230769231, |
| "grad_norm": 1.4692953467652599, |
| "learning_rate": 7.825765127796108e-05, |
| "loss": 0.8338, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.9282920469361148, |
| "grad_norm": 1.5923417027165594, |
| "learning_rate": 7.821468963593242e-05, |
| "loss": 0.8391, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.9335071707953064, |
| "grad_norm": 2.2009607061094942, |
| "learning_rate": 7.817121684990004e-05, |
| "loss": 0.8589, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.9387222946544981, |
| "grad_norm": 1.5116758680331852, |
| "learning_rate": 7.812723350133805e-05, |
| "loss": 0.8341, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.9439374185136897, |
| "grad_norm": 2.038874732993399, |
| "learning_rate": 7.80827401785497e-05, |
| "loss": 0.8547, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.9491525423728814, |
| "grad_norm": 2.2985530062759514, |
| "learning_rate": 7.80377374766594e-05, |
| "loss": 0.8391, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.954367666232073, |
| "grad_norm": 1.257017121878146, |
| "learning_rate": 7.799222599760481e-05, |
| "loss": 0.8325, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.9595827900912647, |
| "grad_norm": 2.873446007386103, |
| "learning_rate": 7.794620635012883e-05, |
| "loss": 0.853, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.9647979139504563, |
| "grad_norm": 2.0836694911738207, |
| "learning_rate": 7.789967914977134e-05, |
| "loss": 0.8554, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.970013037809648, |
| "grad_norm": 2.095712201456811, |
| "learning_rate": 7.785264501886108e-05, |
| "loss": 0.8439, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.9752281616688396, |
| "grad_norm": 1.668972201159559, |
| "learning_rate": 7.78051045865073e-05, |
| "loss": 0.8442, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.9804432855280313, |
| "grad_norm": 2.6548529934832787, |
| "learning_rate": 7.77570584885913e-05, |
| "loss": 0.8509, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.9856584093872229, |
| "grad_norm": 1.6440679506158635, |
| "learning_rate": 7.770850736775796e-05, |
| "loss": 0.8367, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.9908735332464146, |
| "grad_norm": 2.895095407046929, |
| "learning_rate": 7.765945187340715e-05, |
| "loss": 0.8476, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.9960886571056062, |
| "grad_norm": 2.1880375444766655, |
| "learning_rate": 7.760989266168503e-05, |
| "loss": 0.8466, |
| "step": 191 |
| }, |
| { |
| "epoch": 1.001303780964798, |
| "grad_norm": 2.491373495658068, |
| "learning_rate": 7.755983039547528e-05, |
| "loss": 1.0483, |
| "step": 192 |
| }, |
| { |
| "epoch": 1.0065189048239895, |
| "grad_norm": 2.1065510338703195, |
| "learning_rate": 7.750926574439019e-05, |
| "loss": 0.8317, |
| "step": 193 |
| }, |
| { |
| "epoch": 1.0117340286831813, |
| "grad_norm": 1.6048527323347803, |
| "learning_rate": 7.745819938476184e-05, |
| "loss": 0.8309, |
| "step": 194 |
| }, |
| { |
| "epoch": 1.0169491525423728, |
| "grad_norm": 1.3686572978259008, |
| "learning_rate": 7.740663199963284e-05, |
| "loss": 0.8207, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.0221642764015646, |
| "grad_norm": 1.6395534429827354, |
| "learning_rate": 7.73545642787474e-05, |
| "loss": 0.824, |
| "step": 196 |
| }, |
| { |
| "epoch": 1.0273794002607561, |
| "grad_norm": 1.829519017394267, |
| "learning_rate": 7.730199691854198e-05, |
| "loss": 0.8346, |
| "step": 197 |
| }, |
| { |
| "epoch": 1.032594524119948, |
| "grad_norm": 2.086222113236519, |
| "learning_rate": 7.724893062213602e-05, |
| "loss": 0.8204, |
| "step": 198 |
| }, |
| { |
| "epoch": 1.0378096479791394, |
| "grad_norm": 1.6543925285010792, |
| "learning_rate": 7.71953660993225e-05, |
| "loss": 0.8238, |
| "step": 199 |
| }, |
| { |
| "epoch": 1.0430247718383312, |
| "grad_norm": 1.6861891172176104, |
| "learning_rate": 7.71413040665585e-05, |
| "loss": 0.8215, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.0482398956975227, |
| "grad_norm": 2.313274305017442, |
| "learning_rate": 7.708674524695559e-05, |
| "loss": 0.8082, |
| "step": 201 |
| }, |
| { |
| "epoch": 1.0534550195567145, |
| "grad_norm": 1.8334252249650502, |
| "learning_rate": 7.703169037027014e-05, |
| "loss": 0.824, |
| "step": 202 |
| }, |
| { |
| "epoch": 1.058670143415906, |
| "grad_norm": 1.7126012022750563, |
| "learning_rate": 7.697614017289357e-05, |
| "loss": 0.8105, |
| "step": 203 |
| }, |
| { |
| "epoch": 1.0638852672750978, |
| "grad_norm": 1.3173869406989056, |
| "learning_rate": 7.692009539784255e-05, |
| "loss": 0.8303, |
| "step": 204 |
| }, |
| { |
| "epoch": 1.0691003911342893, |
| "grad_norm": 2.0262932335464012, |
| "learning_rate": 7.686355679474898e-05, |
| "loss": 0.8197, |
| "step": 205 |
| }, |
| { |
| "epoch": 1.074315514993481, |
| "grad_norm": 2.2714312463405397, |
| "learning_rate": 7.680652511985e-05, |
| "loss": 0.8196, |
| "step": 206 |
| }, |
| { |
| "epoch": 1.0795306388526726, |
| "grad_norm": 1.5002234528402432, |
| "learning_rate": 7.674900113597787e-05, |
| "loss": 0.8141, |
| "step": 207 |
| }, |
| { |
| "epoch": 1.0847457627118644, |
| "grad_norm": 2.059909369219652, |
| "learning_rate": 7.669098561254983e-05, |
| "loss": 0.826, |
| "step": 208 |
| }, |
| { |
| "epoch": 1.0899608865710562, |
| "grad_norm": 1.6415840594311673, |
| "learning_rate": 7.663247932555767e-05, |
| "loss": 0.8314, |
| "step": 209 |
| }, |
| { |
| "epoch": 1.0951760104302477, |
| "grad_norm": 2.094580841761285, |
| "learning_rate": 7.65734830575575e-05, |
| "loss": 0.8405, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.1003911342894395, |
| "grad_norm": 1.4276266650819935, |
| "learning_rate": 7.651399759765915e-05, |
| "loss": 0.8172, |
| "step": 211 |
| }, |
| { |
| "epoch": 1.105606258148631, |
| "grad_norm": 2.406763290618582, |
| "learning_rate": 7.645402374151575e-05, |
| "loss": 0.8166, |
| "step": 212 |
| }, |
| { |
| "epoch": 1.1108213820078228, |
| "grad_norm": 1.4071963692309266, |
| "learning_rate": 7.639356229131298e-05, |
| "loss": 0.8252, |
| "step": 213 |
| }, |
| { |
| "epoch": 1.1160365058670143, |
| "grad_norm": 1.434300709398784, |
| "learning_rate": 7.633261405575838e-05, |
| "loss": 0.8342, |
| "step": 214 |
| }, |
| { |
| "epoch": 1.121251629726206, |
| "grad_norm": 2.6026163076276223, |
| "learning_rate": 7.627117985007052e-05, |
| "loss": 0.8277, |
| "step": 215 |
| }, |
| { |
| "epoch": 1.1264667535853976, |
| "grad_norm": 1.545445917096449, |
| "learning_rate": 7.620926049596814e-05, |
| "loss": 0.8224, |
| "step": 216 |
| }, |
| { |
| "epoch": 1.1316818774445894, |
| "grad_norm": 2.349483330688553, |
| "learning_rate": 7.61468568216591e-05, |
| "loss": 0.8317, |
| "step": 217 |
| }, |
| { |
| "epoch": 1.136897001303781, |
| "grad_norm": 1.6493926282653744, |
| "learning_rate": 7.60839696618293e-05, |
| "loss": 0.8235, |
| "step": 218 |
| }, |
| { |
| "epoch": 1.1421121251629727, |
| "grad_norm": 2.523447089439544, |
| "learning_rate": 7.602059985763165e-05, |
| "loss": 0.8259, |
| "step": 219 |
| }, |
| { |
| "epoch": 1.1473272490221642, |
| "grad_norm": 1.910781874766756, |
| "learning_rate": 7.595674825667457e-05, |
| "loss": 0.826, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.152542372881356, |
| "grad_norm": 2.2270989973834032, |
| "learning_rate": 7.589241571301091e-05, |
| "loss": 0.8309, |
| "step": 221 |
| }, |
| { |
| "epoch": 1.1577574967405475, |
| "grad_norm": 1.7934683365685906, |
| "learning_rate": 7.582760308712634e-05, |
| "loss": 0.8302, |
| "step": 222 |
| }, |
| { |
| "epoch": 1.1629726205997393, |
| "grad_norm": 2.0662257630292062, |
| "learning_rate": 7.5762311245928e-05, |
| "loss": 0.8277, |
| "step": 223 |
| }, |
| { |
| "epoch": 1.1681877444589308, |
| "grad_norm": 1.6790104250183842, |
| "learning_rate": 7.569654106273268e-05, |
| "loss": 0.8254, |
| "step": 224 |
| }, |
| { |
| "epoch": 1.1734028683181226, |
| "grad_norm": 2.11893755909352, |
| "learning_rate": 7.563029341725541e-05, |
| "loss": 0.8183, |
| "step": 225 |
| }, |
| { |
| "epoch": 1.1786179921773141, |
| "grad_norm": 1.5877658958807297, |
| "learning_rate": 7.55635691955975e-05, |
| "loss": 0.8232, |
| "step": 226 |
| }, |
| { |
| "epoch": 1.1838331160365059, |
| "grad_norm": 2.147915736111546, |
| "learning_rate": 7.549636929023471e-05, |
| "loss": 0.8185, |
| "step": 227 |
| }, |
| { |
| "epoch": 1.1890482398956976, |
| "grad_norm": 1.7400398545071791, |
| "learning_rate": 7.542869460000544e-05, |
| "loss": 0.8219, |
| "step": 228 |
| }, |
| { |
| "epoch": 1.1942633637548892, |
| "grad_norm": 1.8017174411766266, |
| "learning_rate": 7.536054603009856e-05, |
| "loss": 0.8099, |
| "step": 229 |
| }, |
| { |
| "epoch": 1.1994784876140807, |
| "grad_norm": 1.7295101992953383, |
| "learning_rate": 7.529192449204137e-05, |
| "loss": 0.8199, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.2046936114732725, |
| "grad_norm": 1.81480851347367, |
| "learning_rate": 7.522283090368739e-05, |
| "loss": 0.8163, |
| "step": 231 |
| }, |
| { |
| "epoch": 1.2099087353324642, |
| "grad_norm": 1.4532183826025098, |
| "learning_rate": 7.515326618920409e-05, |
| "loss": 0.8243, |
| "step": 232 |
| }, |
| { |
| "epoch": 1.2151238591916558, |
| "grad_norm": 2.025221417951849, |
| "learning_rate": 7.508323127906055e-05, |
| "loss": 0.8104, |
| "step": 233 |
| }, |
| { |
| "epoch": 1.2203389830508475, |
| "grad_norm": 2.839580655118928, |
| "learning_rate": 7.5012727110015e-05, |
| "loss": 0.8156, |
| "step": 234 |
| }, |
| { |
| "epoch": 1.225554106910039, |
| "grad_norm": 0.9612813608575648, |
| "learning_rate": 7.494175462510225e-05, |
| "loss": 0.8136, |
| "step": 235 |
| }, |
| { |
| "epoch": 1.2307692307692308, |
| "grad_norm": 4.580627208479269, |
| "learning_rate": 7.487031477362112e-05, |
| "loss": 0.8363, |
| "step": 236 |
| }, |
| { |
| "epoch": 1.2359843546284224, |
| "grad_norm": 3.3062212088133585, |
| "learning_rate": 7.479840851112175e-05, |
| "loss": 0.8457, |
| "step": 237 |
| }, |
| { |
| "epoch": 1.2411994784876141, |
| "grad_norm": 3.9085448168352577, |
| "learning_rate": 7.47260367993928e-05, |
| "loss": 0.8369, |
| "step": 238 |
| }, |
| { |
| "epoch": 1.2464146023468057, |
| "grad_norm": 3.2990078411550416, |
| "learning_rate": 7.465320060644857e-05, |
| "loss": 0.83, |
| "step": 239 |
| }, |
| { |
| "epoch": 1.2516297262059974, |
| "grad_norm": 3.171914889017843, |
| "learning_rate": 7.45799009065161e-05, |
| "loss": 0.8245, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.256844850065189, |
| "grad_norm": 2.6816905524834436, |
| "learning_rate": 7.450613868002208e-05, |
| "loss": 0.8363, |
| "step": 241 |
| }, |
| { |
| "epoch": 1.2620599739243807, |
| "grad_norm": 3.0890905030641256, |
| "learning_rate": 7.443191491357976e-05, |
| "loss": 0.823, |
| "step": 242 |
| }, |
| { |
| "epoch": 1.2672750977835723, |
| "grad_norm": 2.3490265350478956, |
| "learning_rate": 7.435723059997581e-05, |
| "loss": 0.8276, |
| "step": 243 |
| }, |
| { |
| "epoch": 1.272490221642764, |
| "grad_norm": 3.5496378180168957, |
| "learning_rate": 7.428208673815693e-05, |
| "loss": 0.8247, |
| "step": 244 |
| }, |
| { |
| "epoch": 1.2777053455019556, |
| "grad_norm": 3.053170271456407, |
| "learning_rate": 7.420648433321659e-05, |
| "loss": 0.8306, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.2829204693611473, |
| "grad_norm": 2.387063335590222, |
| "learning_rate": 7.41304243963815e-05, |
| "loss": 0.831, |
| "step": 246 |
| }, |
| { |
| "epoch": 1.288135593220339, |
| "grad_norm": 1.8497576227055157, |
| "learning_rate": 7.405390794499819e-05, |
| "loss": 0.8066, |
| "step": 247 |
| }, |
| { |
| "epoch": 1.2933507170795306, |
| "grad_norm": 2.9897041114580727, |
| "learning_rate": 7.397693600251929e-05, |
| "loss": 0.8134, |
| "step": 248 |
| }, |
| { |
| "epoch": 1.2985658409387222, |
| "grad_norm": 1.980452439525295, |
| "learning_rate": 7.389950959848992e-05, |
| "loss": 0.8252, |
| "step": 249 |
| }, |
| { |
| "epoch": 1.303780964797914, |
| "grad_norm": 3.4714237886205748, |
| "learning_rate": 7.382162976853387e-05, |
| "loss": 0.8294, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.3089960886571057, |
| "grad_norm": 3.2963021601322535, |
| "learning_rate": 7.37432975543398e-05, |
| "loss": 0.8232, |
| "step": 251 |
| }, |
| { |
| "epoch": 1.3142112125162972, |
| "grad_norm": 2.1372765339871806, |
| "learning_rate": 7.366451400364723e-05, |
| "loss": 0.8228, |
| "step": 252 |
| }, |
| { |
| "epoch": 1.3194263363754888, |
| "grad_norm": 1.61329278641058, |
| "learning_rate": 7.358528017023262e-05, |
| "loss": 0.8149, |
| "step": 253 |
| }, |
| { |
| "epoch": 1.3246414602346805, |
| "grad_norm": 2.149446564090082, |
| "learning_rate": 7.350559711389518e-05, |
| "loss": 0.803, |
| "step": 254 |
| }, |
| { |
| "epoch": 1.3298565840938723, |
| "grad_norm": 1.2080706850732208, |
| "learning_rate": 7.342546590044279e-05, |
| "loss": 0.8309, |
| "step": 255 |
| }, |
| { |
| "epoch": 1.3350717079530638, |
| "grad_norm": 2.060776426075165, |
| "learning_rate": 7.334488760167768e-05, |
| "loss": 0.8218, |
| "step": 256 |
| }, |
| { |
| "epoch": 1.3402868318122556, |
| "grad_norm": 1.4270579486920896, |
| "learning_rate": 7.326386329538207e-05, |
| "loss": 0.8482, |
| "step": 257 |
| }, |
| { |
| "epoch": 1.3455019556714471, |
| "grad_norm": 2.320654966360905, |
| "learning_rate": 7.318239406530386e-05, |
| "loss": 0.8284, |
| "step": 258 |
| }, |
| { |
| "epoch": 1.350717079530639, |
| "grad_norm": 2.040787679253322, |
| "learning_rate": 7.3100481001142e-05, |
| "loss": 0.8436, |
| "step": 259 |
| }, |
| { |
| "epoch": 1.3559322033898304, |
| "grad_norm": 1.450733455499567, |
| "learning_rate": 7.301812519853203e-05, |
| "loss": 0.8067, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.3611473272490222, |
| "grad_norm": 1.4473957425879223, |
| "learning_rate": 7.293532775903137e-05, |
| "loss": 0.8172, |
| "step": 261 |
| }, |
| { |
| "epoch": 1.3663624511082137, |
| "grad_norm": 1.4567295321246068, |
| "learning_rate": 7.285208979010458e-05, |
| "loss": 0.833, |
| "step": 262 |
| }, |
| { |
| "epoch": 1.3715775749674055, |
| "grad_norm": 2.2303402593920234, |
| "learning_rate": 7.276841240510858e-05, |
| "loss": 0.8241, |
| "step": 263 |
| }, |
| { |
| "epoch": 1.376792698826597, |
| "grad_norm": 1.3757747352916987, |
| "learning_rate": 7.26842967232777e-05, |
| "loss": 0.8106, |
| "step": 264 |
| }, |
| { |
| "epoch": 1.3820078226857888, |
| "grad_norm": 1.960651210566042, |
| "learning_rate": 7.25997438697088e-05, |
| "loss": 0.8258, |
| "step": 265 |
| }, |
| { |
| "epoch": 1.3872229465449806, |
| "grad_norm": 1.6973151493940888, |
| "learning_rate": 7.251475497534615e-05, |
| "loss": 0.8421, |
| "step": 266 |
| }, |
| { |
| "epoch": 1.3924380704041721, |
| "grad_norm": 1.4278961151807503, |
| "learning_rate": 7.242933117696628e-05, |
| "loss": 0.8107, |
| "step": 267 |
| }, |
| { |
| "epoch": 1.3976531942633637, |
| "grad_norm": 1.6765492543318952, |
| "learning_rate": 7.234347361716291e-05, |
| "loss": 0.8126, |
| "step": 268 |
| }, |
| { |
| "epoch": 1.4028683181225554, |
| "grad_norm": 1.7901534079828416, |
| "learning_rate": 7.225718344433149e-05, |
| "loss": 0.8209, |
| "step": 269 |
| }, |
| { |
| "epoch": 1.4080834419817472, |
| "grad_norm": 1.1879344799911462, |
| "learning_rate": 7.217046181265394e-05, |
| "loss": 0.8241, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.4132985658409387, |
| "grad_norm": 2.316592389288362, |
| "learning_rate": 7.208330988208324e-05, |
| "loss": 0.8233, |
| "step": 271 |
| }, |
| { |
| "epoch": 1.4185136897001303, |
| "grad_norm": 1.3009721963229537, |
| "learning_rate": 7.199572881832784e-05, |
| "loss": 0.8094, |
| "step": 272 |
| }, |
| { |
| "epoch": 1.423728813559322, |
| "grad_norm": 2.068602624965321, |
| "learning_rate": 7.190771979283608e-05, |
| "loss": 0.8221, |
| "step": 273 |
| }, |
| { |
| "epoch": 1.4289439374185138, |
| "grad_norm": 1.9940233989402643, |
| "learning_rate": 7.181928398278058e-05, |
| "loss": 0.8225, |
| "step": 274 |
| }, |
| { |
| "epoch": 1.4341590612777053, |
| "grad_norm": 1.7328843255622977, |
| "learning_rate": 7.173042257104243e-05, |
| "loss": 0.8142, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.439374185136897, |
| "grad_norm": 1.5984687241828228, |
| "learning_rate": 7.164113674619542e-05, |
| "loss": 0.8062, |
| "step": 276 |
| }, |
| { |
| "epoch": 1.4445893089960886, |
| "grad_norm": 1.9941450850887774, |
| "learning_rate": 7.155142770249008e-05, |
| "loss": 0.8156, |
| "step": 277 |
| }, |
| { |
| "epoch": 1.4498044328552804, |
| "grad_norm": 1.2629214490257985, |
| "learning_rate": 7.146129663983775e-05, |
| "loss": 0.8029, |
| "step": 278 |
| }, |
| { |
| "epoch": 1.455019556714472, |
| "grad_norm": 1.5341743253202347, |
| "learning_rate": 7.137074476379454e-05, |
| "loss": 0.8184, |
| "step": 279 |
| }, |
| { |
| "epoch": 1.4602346805736637, |
| "grad_norm": 1.205052243506182, |
| "learning_rate": 7.127977328554518e-05, |
| "loss": 0.8297, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.4654498044328552, |
| "grad_norm": 1.7584530348059961, |
| "learning_rate": 7.118838342188683e-05, |
| "loss": 0.8183, |
| "step": 281 |
| }, |
| { |
| "epoch": 1.470664928292047, |
| "grad_norm": 1.8223656535526023, |
| "learning_rate": 7.10965763952128e-05, |
| "loss": 0.8154, |
| "step": 282 |
| }, |
| { |
| "epoch": 1.4758800521512385, |
| "grad_norm": 1.3665387213451412, |
| "learning_rate": 7.100435343349617e-05, |
| "loss": 0.8114, |
| "step": 283 |
| }, |
| { |
| "epoch": 1.4810951760104303, |
| "grad_norm": 1.499553247270366, |
| "learning_rate": 7.091171577027344e-05, |
| "loss": 0.813, |
| "step": 284 |
| }, |
| { |
| "epoch": 1.4863102998696218, |
| "grad_norm": 2.1007419745906866, |
| "learning_rate": 7.081866464462798e-05, |
| "loss": 0.8216, |
| "step": 285 |
| }, |
| { |
| "epoch": 1.4915254237288136, |
| "grad_norm": 1.512999754469945, |
| "learning_rate": 7.072520130117344e-05, |
| "loss": 0.8182, |
| "step": 286 |
| }, |
| { |
| "epoch": 1.4967405475880051, |
| "grad_norm": 1.6975647047303697, |
| "learning_rate": 7.063132699003716e-05, |
| "loss": 0.8244, |
| "step": 287 |
| }, |
| { |
| "epoch": 1.5019556714471969, |
| "grad_norm": 1.7957380356125672, |
| "learning_rate": 7.053704296684337e-05, |
| "loss": 0.8162, |
| "step": 288 |
| }, |
| { |
| "epoch": 1.5071707953063886, |
| "grad_norm": 1.2813149409377043, |
| "learning_rate": 7.044235049269649e-05, |
| "loss": 0.8095, |
| "step": 289 |
| }, |
| { |
| "epoch": 1.5123859191655802, |
| "grad_norm": 1.56534754361993, |
| "learning_rate": 7.034725083416419e-05, |
| "loss": 0.8258, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.5176010430247717, |
| "grad_norm": 1.7975943131313281, |
| "learning_rate": 7.025174526326045e-05, |
| "loss": 0.8025, |
| "step": 291 |
| }, |
| { |
| "epoch": 1.5228161668839635, |
| "grad_norm": 1.1853911402078072, |
| "learning_rate": 7.015583505742857e-05, |
| "loss": 0.8249, |
| "step": 292 |
| }, |
| { |
| "epoch": 1.5280312907431552, |
| "grad_norm": 1.8494955443278849, |
| "learning_rate": 7.005952149952416e-05, |
| "loss": 0.8378, |
| "step": 293 |
| }, |
| { |
| "epoch": 1.5332464146023468, |
| "grad_norm": 1.3622849536120247, |
| "learning_rate": 6.996280587779778e-05, |
| "loss": 0.8354, |
| "step": 294 |
| }, |
| { |
| "epoch": 1.5384615384615383, |
| "grad_norm": 2.0112831346634708, |
| "learning_rate": 6.986568948587792e-05, |
| "loss": 0.83, |
| "step": 295 |
| }, |
| { |
| "epoch": 1.54367666232073, |
| "grad_norm": 1.5420042484474124, |
| "learning_rate": 6.976817362275357e-05, |
| "loss": 0.8109, |
| "step": 296 |
| }, |
| { |
| "epoch": 1.5488917861799218, |
| "grad_norm": 2.0929864944342804, |
| "learning_rate": 6.96702595927569e-05, |
| "loss": 0.8334, |
| "step": 297 |
| }, |
| { |
| "epoch": 1.5541069100391134, |
| "grad_norm": 1.8922406476791735, |
| "learning_rate": 6.957194870554578e-05, |
| "loss": 0.806, |
| "step": 298 |
| }, |
| { |
| "epoch": 1.559322033898305, |
| "grad_norm": 1.6880868554685944, |
| "learning_rate": 6.947324227608628e-05, |
| "loss": 0.8212, |
| "step": 299 |
| }, |
| { |
| "epoch": 1.5645371577574967, |
| "grad_norm": 1.6346451668072235, |
| "learning_rate": 6.937414162463509e-05, |
| "loss": 0.8014, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.5697522816166884, |
| "grad_norm": 1.577939054921778, |
| "learning_rate": 6.927464807672186e-05, |
| "loss": 0.8187, |
| "step": 301 |
| }, |
| { |
| "epoch": 1.57496740547588, |
| "grad_norm": 1.4316029714279948, |
| "learning_rate": 6.917476296313145e-05, |
| "loss": 0.8046, |
| "step": 302 |
| }, |
| { |
| "epoch": 1.5801825293350718, |
| "grad_norm": 1.5381955264209675, |
| "learning_rate": 6.907448761988612e-05, |
| "loss": 0.8077, |
| "step": 303 |
| }, |
| { |
| "epoch": 1.5853976531942635, |
| "grad_norm": 1.0983288794814963, |
| "learning_rate": 6.897382338822772e-05, |
| "loss": 0.804, |
| "step": 304 |
| }, |
| { |
| "epoch": 1.590612777053455, |
| "grad_norm": 1.2952124244327796, |
| "learning_rate": 6.88727716145997e-05, |
| "loss": 0.8014, |
| "step": 305 |
| }, |
| { |
| "epoch": 1.5958279009126466, |
| "grad_norm": 1.1248306945768556, |
| "learning_rate": 6.877133365062911e-05, |
| "loss": 0.8086, |
| "step": 306 |
| }, |
| { |
| "epoch": 1.6010430247718384, |
| "grad_norm": 1.7722044591752057, |
| "learning_rate": 6.86695108531085e-05, |
| "loss": 0.8061, |
| "step": 307 |
| }, |
| { |
| "epoch": 1.6062581486310301, |
| "grad_norm": 1.8332828396584284, |
| "learning_rate": 6.856730458397787e-05, |
| "loss": 0.8177, |
| "step": 308 |
| }, |
| { |
| "epoch": 1.6114732724902217, |
| "grad_norm": 1.3222660990882034, |
| "learning_rate": 6.846471621030626e-05, |
| "loss": 0.8012, |
| "step": 309 |
| }, |
| { |
| "epoch": 1.6166883963494132, |
| "grad_norm": 1.3548943859194524, |
| "learning_rate": 6.836174710427369e-05, |
| "loss": 0.8171, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.621903520208605, |
| "grad_norm": 1.2347347259073755, |
| "learning_rate": 6.825839864315264e-05, |
| "loss": 0.7839, |
| "step": 311 |
| }, |
| { |
| "epoch": 1.6271186440677967, |
| "grad_norm": 1.4912678731047493, |
| "learning_rate": 6.815467220928972e-05, |
| "loss": 0.8004, |
| "step": 312 |
| }, |
| { |
| "epoch": 1.6323337679269883, |
| "grad_norm": 1.0089833155389005, |
| "learning_rate": 6.805056919008714e-05, |
| "loss": 0.806, |
| "step": 313 |
| }, |
| { |
| "epoch": 1.6375488917861798, |
| "grad_norm": 1.8923057463186084, |
| "learning_rate": 6.794609097798414e-05, |
| "loss": 0.8149, |
| "step": 314 |
| }, |
| { |
| "epoch": 1.6427640156453716, |
| "grad_norm": 1.3911432602736478, |
| "learning_rate": 6.784123897043841e-05, |
| "loss": 0.8261, |
| "step": 315 |
| }, |
| { |
| "epoch": 1.6479791395045633, |
| "grad_norm": 1.2299562475949994, |
| "learning_rate": 6.773601456990739e-05, |
| "loss": 0.8025, |
| "step": 316 |
| }, |
| { |
| "epoch": 1.6531942633637549, |
| "grad_norm": 1.749738991866774, |
| "learning_rate": 6.763041918382945e-05, |
| "loss": 0.8087, |
| "step": 317 |
| }, |
| { |
| "epoch": 1.6584093872229464, |
| "grad_norm": 1.2129819476495074, |
| "learning_rate": 6.752445422460513e-05, |
| "loss": 0.8058, |
| "step": 318 |
| }, |
| { |
| "epoch": 1.6636245110821382, |
| "grad_norm": 1.9218816457335506, |
| "learning_rate": 6.741812110957823e-05, |
| "loss": 0.8199, |
| "step": 319 |
| }, |
| { |
| "epoch": 1.66883963494133, |
| "grad_norm": 1.5263095695339857, |
| "learning_rate": 6.731142126101688e-05, |
| "loss": 0.8098, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.6740547588005215, |
| "grad_norm": 1.7388018103606915, |
| "learning_rate": 6.720435610609443e-05, |
| "loss": 0.7964, |
| "step": 321 |
| }, |
| { |
| "epoch": 1.6792698826597132, |
| "grad_norm": 1.1143428753117783, |
| "learning_rate": 6.709692707687047e-05, |
| "loss": 0.8026, |
| "step": 322 |
| }, |
| { |
| "epoch": 1.684485006518905, |
| "grad_norm": 1.4518152074151873, |
| "learning_rate": 6.69891356102716e-05, |
| "loss": 0.8199, |
| "step": 323 |
| }, |
| { |
| "epoch": 1.6897001303780965, |
| "grad_norm": 1.2302866722544838, |
| "learning_rate": 6.688098314807221e-05, |
| "loss": 0.8116, |
| "step": 324 |
| }, |
| { |
| "epoch": 1.694915254237288, |
| "grad_norm": 1.355915097493558, |
| "learning_rate": 6.677247113687527e-05, |
| "loss": 0.8184, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.7001303780964798, |
| "grad_norm": 1.0973851180923702, |
| "learning_rate": 6.666360102809289e-05, |
| "loss": 0.8066, |
| "step": 326 |
| }, |
| { |
| "epoch": 1.7053455019556716, |
| "grad_norm": 2.034989183783714, |
| "learning_rate": 6.655437427792698e-05, |
| "loss": 0.8068, |
| "step": 327 |
| }, |
| { |
| "epoch": 1.7105606258148631, |
| "grad_norm": 1.352963702330486, |
| "learning_rate": 6.644479234734971e-05, |
| "loss": 0.8337, |
| "step": 328 |
| }, |
| { |
| "epoch": 1.7157757496740547, |
| "grad_norm": 1.486986240165971, |
| "learning_rate": 6.6334856702084e-05, |
| "loss": 0.8142, |
| "step": 329 |
| }, |
| { |
| "epoch": 1.7209908735332464, |
| "grad_norm": 1.2755542839313279, |
| "learning_rate": 6.622456881258392e-05, |
| "loss": 0.8224, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.7262059973924382, |
| "grad_norm": 1.4619956893213686, |
| "learning_rate": 6.6113930154015e-05, |
| "loss": 0.8077, |
| "step": 331 |
| }, |
| { |
| "epoch": 1.7314211212516297, |
| "grad_norm": 1.7136751212219052, |
| "learning_rate": 6.600294220623457e-05, |
| "loss": 0.8089, |
| "step": 332 |
| }, |
| { |
| "epoch": 1.7366362451108213, |
| "grad_norm": 1.1691508723533794, |
| "learning_rate": 6.589160645377181e-05, |
| "loss": 0.8192, |
| "step": 333 |
| }, |
| { |
| "epoch": 1.741851368970013, |
| "grad_norm": 2.2934106958814544, |
| "learning_rate": 6.57799243858081e-05, |
| "loss": 0.8123, |
| "step": 334 |
| }, |
| { |
| "epoch": 1.7470664928292048, |
| "grad_norm": 1.3479467101781495, |
| "learning_rate": 6.566789749615691e-05, |
| "loss": 0.8016, |
| "step": 335 |
| }, |
| { |
| "epoch": 1.7522816166883963, |
| "grad_norm": 1.4715014206564454, |
| "learning_rate": 6.555552728324394e-05, |
| "loss": 0.8135, |
| "step": 336 |
| }, |
| { |
| "epoch": 1.7574967405475879, |
| "grad_norm": 2.4679525667464155, |
| "learning_rate": 6.544281525008703e-05, |
| "loss": 0.8047, |
| "step": 337 |
| }, |
| { |
| "epoch": 1.7627118644067796, |
| "grad_norm": 1.2701148601075805, |
| "learning_rate": 6.532976290427611e-05, |
| "loss": 0.8155, |
| "step": 338 |
| }, |
| { |
| "epoch": 1.7679269882659714, |
| "grad_norm": 2.7926000172133967, |
| "learning_rate": 6.521637175795292e-05, |
| "loss": 0.8114, |
| "step": 339 |
| }, |
| { |
| "epoch": 1.773142112125163, |
| "grad_norm": 1.8634719780362607, |
| "learning_rate": 6.51026433277909e-05, |
| "loss": 0.8279, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.7783572359843545, |
| "grad_norm": 2.527061321023331, |
| "learning_rate": 6.498857913497485e-05, |
| "loss": 0.8308, |
| "step": 341 |
| }, |
| { |
| "epoch": 1.7835723598435462, |
| "grad_norm": 1.7861432214936908, |
| "learning_rate": 6.487418070518063e-05, |
| "loss": 0.8237, |
| "step": 342 |
| }, |
| { |
| "epoch": 1.788787483702738, |
| "grad_norm": 2.4741757997509812, |
| "learning_rate": 6.475944956855463e-05, |
| "loss": 0.8098, |
| "step": 343 |
| }, |
| { |
| "epoch": 1.7940026075619295, |
| "grad_norm": 2.5470632776776583, |
| "learning_rate": 6.464438725969348e-05, |
| "loss": 0.8153, |
| "step": 344 |
| }, |
| { |
| "epoch": 1.7992177314211213, |
| "grad_norm": 1.4497598146752289, |
| "learning_rate": 6.452899531762338e-05, |
| "loss": 0.809, |
| "step": 345 |
| }, |
| { |
| "epoch": 1.804432855280313, |
| "grad_norm": 1.6671814059233867, |
| "learning_rate": 6.44132752857796e-05, |
| "loss": 0.8206, |
| "step": 346 |
| }, |
| { |
| "epoch": 1.8096479791395046, |
| "grad_norm": 1.4862285893517284, |
| "learning_rate": 6.429722871198579e-05, |
| "loss": 0.8125, |
| "step": 347 |
| }, |
| { |
| "epoch": 1.8148631029986961, |
| "grad_norm": 1.4717475694489375, |
| "learning_rate": 6.418085714843328e-05, |
| "loss": 0.8028, |
| "step": 348 |
| }, |
| { |
| "epoch": 1.820078226857888, |
| "grad_norm": 1.2961734030931862, |
| "learning_rate": 6.406416215166035e-05, |
| "loss": 0.814, |
| "step": 349 |
| }, |
| { |
| "epoch": 1.8252933507170797, |
| "grad_norm": 1.375416590864907, |
| "learning_rate": 6.394714528253137e-05, |
| "loss": 0.8084, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.8305084745762712, |
| "grad_norm": 1.1939033449159728, |
| "learning_rate": 6.382980810621595e-05, |
| "loss": 0.8054, |
| "step": 351 |
| }, |
| { |
| "epoch": 1.8357235984354627, |
| "grad_norm": 1.905725265353936, |
| "learning_rate": 6.371215219216801e-05, |
| "loss": 0.799, |
| "step": 352 |
| }, |
| { |
| "epoch": 1.8409387222946545, |
| "grad_norm": 1.5504247264478312, |
| "learning_rate": 6.359417911410477e-05, |
| "loss": 0.8184, |
| "step": 353 |
| }, |
| { |
| "epoch": 1.8461538461538463, |
| "grad_norm": 1.034848478726046, |
| "learning_rate": 6.347589044998568e-05, |
| "loss": 0.8058, |
| "step": 354 |
| }, |
| { |
| "epoch": 1.8513689700130378, |
| "grad_norm": 2.3826958255372217, |
| "learning_rate": 6.335728778199139e-05, |
| "loss": 0.8066, |
| "step": 355 |
| }, |
| { |
| "epoch": 1.8565840938722293, |
| "grad_norm": 1.3959972285328377, |
| "learning_rate": 6.323837269650249e-05, |
| "loss": 0.8217, |
| "step": 356 |
| }, |
| { |
| "epoch": 1.861799217731421, |
| "grad_norm": 2.506085020659937, |
| "learning_rate": 6.311914678407837e-05, |
| "loss": 0.8216, |
| "step": 357 |
| }, |
| { |
| "epoch": 1.8670143415906129, |
| "grad_norm": 1.8279565139453056, |
| "learning_rate": 6.299961163943587e-05, |
| "loss": 0.8213, |
| "step": 358 |
| }, |
| { |
| "epoch": 1.8722294654498044, |
| "grad_norm": 2.0962030329324204, |
| "learning_rate": 6.287976886142806e-05, |
| "loss": 0.8206, |
| "step": 359 |
| }, |
| { |
| "epoch": 1.877444589308996, |
| "grad_norm": 1.70186031477118, |
| "learning_rate": 6.275962005302273e-05, |
| "loss": 0.8301, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.8826597131681877, |
| "grad_norm": 2.1788285828805365, |
| "learning_rate": 6.263916682128104e-05, |
| "loss": 0.817, |
| "step": 361 |
| }, |
| { |
| "epoch": 1.8878748370273795, |
| "grad_norm": 1.805067718405396, |
| "learning_rate": 6.251841077733595e-05, |
| "loss": 0.8158, |
| "step": 362 |
| }, |
| { |
| "epoch": 1.893089960886571, |
| "grad_norm": 2.1305468630587066, |
| "learning_rate": 6.239735353637076e-05, |
| "loss": 0.8086, |
| "step": 363 |
| }, |
| { |
| "epoch": 1.8983050847457628, |
| "grad_norm": 1.8535876445861497, |
| "learning_rate": 6.227599671759745e-05, |
| "loss": 0.8088, |
| "step": 364 |
| }, |
| { |
| "epoch": 1.9035202086049545, |
| "grad_norm": 2.0851098014276914, |
| "learning_rate": 6.215434194423499e-05, |
| "loss": 0.8053, |
| "step": 365 |
| }, |
| { |
| "epoch": 1.908735332464146, |
| "grad_norm": 1.6617436320103747, |
| "learning_rate": 6.203239084348772e-05, |
| "loss": 0.8272, |
| "step": 366 |
| }, |
| { |
| "epoch": 1.9139504563233376, |
| "grad_norm": 1.9519893634609065, |
| "learning_rate": 6.191014504652352e-05, |
| "loss": 0.8052, |
| "step": 367 |
| }, |
| { |
| "epoch": 1.9191655801825294, |
| "grad_norm": 1.8496089676435563, |
| "learning_rate": 6.178760618845194e-05, |
| "loss": 0.8152, |
| "step": 368 |
| }, |
| { |
| "epoch": 1.9243807040417211, |
| "grad_norm": 1.5391906529413488, |
| "learning_rate": 6.166477590830252e-05, |
| "loss": 0.8129, |
| "step": 369 |
| }, |
| { |
| "epoch": 1.9295958279009127, |
| "grad_norm": 1.5353016752039255, |
| "learning_rate": 6.154165584900263e-05, |
| "loss": 0.7994, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.9348109517601042, |
| "grad_norm": 1.4284130690464092, |
| "learning_rate": 6.141824765735567e-05, |
| "loss": 0.8169, |
| "step": 371 |
| }, |
| { |
| "epoch": 1.940026075619296, |
| "grad_norm": 1.3150434684077639, |
| "learning_rate": 6.129455298401894e-05, |
| "loss": 0.7936, |
| "step": 372 |
| }, |
| { |
| "epoch": 1.9452411994784877, |
| "grad_norm": 1.3747947345260623, |
| "learning_rate": 6.117057348348164e-05, |
| "loss": 0.8007, |
| "step": 373 |
| }, |
| { |
| "epoch": 1.9504563233376793, |
| "grad_norm": 1.0176646734905497, |
| "learning_rate": 6.104631081404269e-05, |
| "loss": 0.8085, |
| "step": 374 |
| }, |
| { |
| "epoch": 1.9556714471968708, |
| "grad_norm": 1.773293381624066, |
| "learning_rate": 6.092176663778851e-05, |
| "loss": 0.8075, |
| "step": 375 |
| }, |
| { |
| "epoch": 1.9608865710560626, |
| "grad_norm": 1.4739056885963973, |
| "learning_rate": 6.079694262057094e-05, |
| "loss": 0.8125, |
| "step": 376 |
| }, |
| { |
| "epoch": 1.9661016949152543, |
| "grad_norm": 1.6264911494594487, |
| "learning_rate": 6.067184043198476e-05, |
| "loss": 0.8093, |
| "step": 377 |
| }, |
| { |
| "epoch": 1.9713168187744459, |
| "grad_norm": 1.5269696160976443, |
| "learning_rate": 6.054646174534552e-05, |
| "loss": 0.8073, |
| "step": 378 |
| }, |
| { |
| "epoch": 1.9765319426336374, |
| "grad_norm": 1.4672155013368013, |
| "learning_rate": 6.0420808237667055e-05, |
| "loss": 0.7873, |
| "step": 379 |
| }, |
| { |
| "epoch": 1.9817470664928292, |
| "grad_norm": 1.6071190706840761, |
| "learning_rate": 6.029488158963912e-05, |
| "loss": 0.8185, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.986962190352021, |
| "grad_norm": 1.1200471714942668, |
| "learning_rate": 6.016868348560488e-05, |
| "loss": 0.7945, |
| "step": 381 |
| }, |
| { |
| "epoch": 1.9921773142112125, |
| "grad_norm": 1.2021517615584545, |
| "learning_rate": 6.004221561353838e-05, |
| "loss": 0.8019, |
| "step": 382 |
| }, |
| { |
| "epoch": 1.997392438070404, |
| "grad_norm": 1.1953523539998745, |
| "learning_rate": 5.991547966502195e-05, |
| "loss": 0.8847, |
| "step": 383 |
| }, |
| { |
| "epoch": 2.002607561929596, |
| "grad_norm": 1.2139914038103075, |
| "learning_rate": 5.978847733522363e-05, |
| "loss": 0.9192, |
| "step": 384 |
| }, |
| { |
| "epoch": 2.0078226857887875, |
| "grad_norm": 1.2663014964519022, |
| "learning_rate": 5.9661210322874456e-05, |
| "loss": 0.7934, |
| "step": 385 |
| }, |
| { |
| "epoch": 2.013037809647979, |
| "grad_norm": 1.7536553812874152, |
| "learning_rate": 5.953368033024576e-05, |
| "loss": 0.7716, |
| "step": 386 |
| }, |
| { |
| "epoch": 2.0182529335071706, |
| "grad_norm": 1.3505679779762105, |
| "learning_rate": 5.940588906312636e-05, |
| "loss": 0.79, |
| "step": 387 |
| }, |
| { |
| "epoch": 2.0234680573663626, |
| "grad_norm": 1.100812193500458, |
| "learning_rate": 5.9277838230799816e-05, |
| "loss": 0.7831, |
| "step": 388 |
| }, |
| { |
| "epoch": 2.028683181225554, |
| "grad_norm": 1.312836314335182, |
| "learning_rate": 5.9149529546021486e-05, |
| "loss": 0.7833, |
| "step": 389 |
| }, |
| { |
| "epoch": 2.0338983050847457, |
| "grad_norm": 1.9463142850045887, |
| "learning_rate": 5.902096472499569e-05, |
| "loss": 0.7829, |
| "step": 390 |
| }, |
| { |
| "epoch": 2.039113428943937, |
| "grad_norm": 0.7033565907846064, |
| "learning_rate": 5.889214548735269e-05, |
| "loss": 0.7842, |
| "step": 391 |
| }, |
| { |
| "epoch": 2.044328552803129, |
| "grad_norm": 1.743510351487419, |
| "learning_rate": 5.876307355612575e-05, |
| "loss": 0.7679, |
| "step": 392 |
| }, |
| { |
| "epoch": 2.0495436766623207, |
| "grad_norm": 1.2465470038883548, |
| "learning_rate": 5.8633750657728033e-05, |
| "loss": 0.7712, |
| "step": 393 |
| }, |
| { |
| "epoch": 2.0547588005215123, |
| "grad_norm": 2.068655430465104, |
| "learning_rate": 5.850417852192956e-05, |
| "loss": 0.7853, |
| "step": 394 |
| }, |
| { |
| "epoch": 2.0599739243807043, |
| "grad_norm": 0.9560005505258112, |
| "learning_rate": 5.837435888183403e-05, |
| "loss": 0.785, |
| "step": 395 |
| }, |
| { |
| "epoch": 2.065189048239896, |
| "grad_norm": 1.7062918904685223, |
| "learning_rate": 5.8244293473855664e-05, |
| "loss": 0.7849, |
| "step": 396 |
| }, |
| { |
| "epoch": 2.0704041720990873, |
| "grad_norm": 1.623721715677207, |
| "learning_rate": 5.8113984037695984e-05, |
| "loss": 0.7857, |
| "step": 397 |
| }, |
| { |
| "epoch": 2.075619295958279, |
| "grad_norm": 1.484064136635307, |
| "learning_rate": 5.798343231632053e-05, |
| "loss": 0.7779, |
| "step": 398 |
| }, |
| { |
| "epoch": 2.080834419817471, |
| "grad_norm": 1.34676834959028, |
| "learning_rate": 5.785264005593553e-05, |
| "loss": 0.7934, |
| "step": 399 |
| }, |
| { |
| "epoch": 2.0860495436766624, |
| "grad_norm": 1.6199775628522408, |
| "learning_rate": 5.772160900596456e-05, |
| "loss": 0.7735, |
| "step": 400 |
| }, |
| { |
| "epoch": 2.091264667535854, |
| "grad_norm": 1.5660844096047937, |
| "learning_rate": 5.7590340919025204e-05, |
| "loss": 0.7833, |
| "step": 401 |
| }, |
| { |
| "epoch": 2.0964797913950455, |
| "grad_norm": 1.0421277103166564, |
| "learning_rate": 5.7458837550905486e-05, |
| "loss": 0.7837, |
| "step": 402 |
| }, |
| { |
| "epoch": 2.1016949152542375, |
| "grad_norm": 1.7458400592594279, |
| "learning_rate": 5.73271006605405e-05, |
| "loss": 0.7855, |
| "step": 403 |
| }, |
| { |
| "epoch": 2.106910039113429, |
| "grad_norm": 1.5186591541100996, |
| "learning_rate": 5.7195132009988814e-05, |
| "loss": 0.7853, |
| "step": 404 |
| }, |
| { |
| "epoch": 2.1121251629726205, |
| "grad_norm": 0.8719695266664192, |
| "learning_rate": 5.706293336440894e-05, |
| "loss": 0.7896, |
| "step": 405 |
| }, |
| { |
| "epoch": 2.117340286831812, |
| "grad_norm": 1.0314918237610309, |
| "learning_rate": 5.693050649203568e-05, |
| "loss": 0.7807, |
| "step": 406 |
| }, |
| { |
| "epoch": 2.122555410691004, |
| "grad_norm": 0.7603723547082539, |
| "learning_rate": 5.679785316415654e-05, |
| "loss": 0.7763, |
| "step": 407 |
| }, |
| { |
| "epoch": 2.1277705345501956, |
| "grad_norm": 0.9173514911262751, |
| "learning_rate": 5.6664975155088004e-05, |
| "loss": 0.7919, |
| "step": 408 |
| }, |
| { |
| "epoch": 2.132985658409387, |
| "grad_norm": 1.2215202346945515, |
| "learning_rate": 5.6531874242151746e-05, |
| "loss": 0.772, |
| "step": 409 |
| }, |
| { |
| "epoch": 2.1382007822685787, |
| "grad_norm": 2.368630198683683, |
| "learning_rate": 5.639855220565098e-05, |
| "loss": 0.801, |
| "step": 410 |
| }, |
| { |
| "epoch": 2.1434159061277707, |
| "grad_norm": 1.2419888378957813, |
| "learning_rate": 5.626501082884654e-05, |
| "loss": 0.7948, |
| "step": 411 |
| }, |
| { |
| "epoch": 2.148631029986962, |
| "grad_norm": 3.0547619405266446, |
| "learning_rate": 5.613125189793305e-05, |
| "loss": 0.7761, |
| "step": 412 |
| }, |
| { |
| "epoch": 2.1538461538461537, |
| "grad_norm": 2.1394143129005987, |
| "learning_rate": 5.5997277202015085e-05, |
| "loss": 0.7815, |
| "step": 413 |
| }, |
| { |
| "epoch": 2.1590612777053453, |
| "grad_norm": 3.041921943681318, |
| "learning_rate": 5.586308853308319e-05, |
| "loss": 0.7889, |
| "step": 414 |
| }, |
| { |
| "epoch": 2.1642764015645373, |
| "grad_norm": 2.4712686274458853, |
| "learning_rate": 5.572868768598993e-05, |
| "loss": 0.7764, |
| "step": 415 |
| }, |
| { |
| "epoch": 2.169491525423729, |
| "grad_norm": 2.5501423533072605, |
| "learning_rate": 5.5594076458425864e-05, |
| "loss": 0.7995, |
| "step": 416 |
| }, |
| { |
| "epoch": 2.1747066492829203, |
| "grad_norm": 2.2877612282195416, |
| "learning_rate": 5.545925665089552e-05, |
| "loss": 0.7882, |
| "step": 417 |
| }, |
| { |
| "epoch": 2.1799217731421123, |
| "grad_norm": 2.1483606998937144, |
| "learning_rate": 5.532423006669332e-05, |
| "loss": 0.7873, |
| "step": 418 |
| }, |
| { |
| "epoch": 2.185136897001304, |
| "grad_norm": 1.7462471775177655, |
| "learning_rate": 5.518899851187942e-05, |
| "loss": 0.7964, |
| "step": 419 |
| }, |
| { |
| "epoch": 2.1903520208604954, |
| "grad_norm": 2.756935578457596, |
| "learning_rate": 5.505356379525559e-05, |
| "loss": 0.7891, |
| "step": 420 |
| }, |
| { |
| "epoch": 2.195567144719687, |
| "grad_norm": 2.4106523309041314, |
| "learning_rate": 5.491792772834103e-05, |
| "loss": 0.7804, |
| "step": 421 |
| }, |
| { |
| "epoch": 2.200782268578879, |
| "grad_norm": 1.8981144967918866, |
| "learning_rate": 5.478209212534809e-05, |
| "loss": 0.7796, |
| "step": 422 |
| }, |
| { |
| "epoch": 2.2059973924380705, |
| "grad_norm": 1.487529019552184, |
| "learning_rate": 5.464605880315803e-05, |
| "loss": 0.7773, |
| "step": 423 |
| }, |
| { |
| "epoch": 2.211212516297262, |
| "grad_norm": 2.44109555313169, |
| "learning_rate": 5.4509829581296774e-05, |
| "loss": 0.777, |
| "step": 424 |
| }, |
| { |
| "epoch": 2.2164276401564535, |
| "grad_norm": 1.9157281312835157, |
| "learning_rate": 5.4373406281910434e-05, |
| "loss": 0.7835, |
| "step": 425 |
| }, |
| { |
| "epoch": 2.2216427640156455, |
| "grad_norm": 2.6110440355014477, |
| "learning_rate": 5.423679072974109e-05, |
| "loss": 0.7792, |
| "step": 426 |
| }, |
| { |
| "epoch": 2.226857887874837, |
| "grad_norm": 2.3658010170065853, |
| "learning_rate": 5.4099984752102295e-05, |
| "loss": 0.7914, |
| "step": 427 |
| }, |
| { |
| "epoch": 2.2320730117340286, |
| "grad_norm": 1.768060696859615, |
| "learning_rate": 5.396299017885465e-05, |
| "loss": 0.7789, |
| "step": 428 |
| }, |
| { |
| "epoch": 2.23728813559322, |
| "grad_norm": 1.661248122612779, |
| "learning_rate": 5.3825808842381345e-05, |
| "loss": 0.7735, |
| "step": 429 |
| }, |
| { |
| "epoch": 2.242503259452412, |
| "grad_norm": 2.362109109009534, |
| "learning_rate": 5.3688442577563646e-05, |
| "loss": 0.7886, |
| "step": 430 |
| }, |
| { |
| "epoch": 2.2477183833116037, |
| "grad_norm": 2.0017993267041905, |
| "learning_rate": 5.355089322175629e-05, |
| "loss": 0.7735, |
| "step": 431 |
| }, |
| { |
| "epoch": 2.252933507170795, |
| "grad_norm": 1.985452281861886, |
| "learning_rate": 5.3413162614763043e-05, |
| "loss": 0.7904, |
| "step": 432 |
| }, |
| { |
| "epoch": 2.2581486310299868, |
| "grad_norm": 1.8249568105371123, |
| "learning_rate": 5.327525259881196e-05, |
| "loss": 0.7792, |
| "step": 433 |
| }, |
| { |
| "epoch": 2.2633637548891787, |
| "grad_norm": 1.987099316108934, |
| "learning_rate": 5.3137165018530805e-05, |
| "loss": 0.7718, |
| "step": 434 |
| }, |
| { |
| "epoch": 2.2685788787483703, |
| "grad_norm": 1.7309972017145459, |
| "learning_rate": 5.299890172092238e-05, |
| "loss": 0.7737, |
| "step": 435 |
| }, |
| { |
| "epoch": 2.273794002607562, |
| "grad_norm": 2.083040780695706, |
| "learning_rate": 5.286046455533981e-05, |
| "loss": 0.7861, |
| "step": 436 |
| }, |
| { |
| "epoch": 2.279009126466754, |
| "grad_norm": 1.7833638732253791, |
| "learning_rate": 5.27218553734618e-05, |
| "loss": 0.7688, |
| "step": 437 |
| }, |
| { |
| "epoch": 2.2842242503259453, |
| "grad_norm": 2.2540602964270997, |
| "learning_rate": 5.2583076029267864e-05, |
| "loss": 0.7868, |
| "step": 438 |
| }, |
| { |
| "epoch": 2.289439374185137, |
| "grad_norm": 2.1042743671125046, |
| "learning_rate": 5.2444128379013564e-05, |
| "loss": 0.7806, |
| "step": 439 |
| }, |
| { |
| "epoch": 2.2946544980443284, |
| "grad_norm": 1.638491731440727, |
| "learning_rate": 5.2305014281205634e-05, |
| "loss": 0.7778, |
| "step": 440 |
| }, |
| { |
| "epoch": 2.2998696219035204, |
| "grad_norm": 1.4490203056402047, |
| "learning_rate": 5.2165735596577146e-05, |
| "loss": 0.7776, |
| "step": 441 |
| }, |
| { |
| "epoch": 2.305084745762712, |
| "grad_norm": 2.285181885567823, |
| "learning_rate": 5.202629418806263e-05, |
| "loss": 0.7767, |
| "step": 442 |
| }, |
| { |
| "epoch": 2.3102998696219035, |
| "grad_norm": 1.9464604198183288, |
| "learning_rate": 5.1886691920773116e-05, |
| "loss": 0.7651, |
| "step": 443 |
| }, |
| { |
| "epoch": 2.315514993481095, |
| "grad_norm": 1.7631237894703213, |
| "learning_rate": 5.174693066197125e-05, |
| "loss": 0.7726, |
| "step": 444 |
| }, |
| { |
| "epoch": 2.320730117340287, |
| "grad_norm": 1.56838244742276, |
| "learning_rate": 5.160701228104626e-05, |
| "loss": 0.7819, |
| "step": 445 |
| }, |
| { |
| "epoch": 2.3259452411994785, |
| "grad_norm": 2.202901692990395, |
| "learning_rate": 5.146693864948898e-05, |
| "loss": 0.7856, |
| "step": 446 |
| }, |
| { |
| "epoch": 2.33116036505867, |
| "grad_norm": 1.92108099189494, |
| "learning_rate": 5.132671164086682e-05, |
| "loss": 0.775, |
| "step": 447 |
| }, |
| { |
| "epoch": 2.3363754889178616, |
| "grad_norm": 1.760952455824346, |
| "learning_rate": 5.118633313079869e-05, |
| "loss": 0.7807, |
| "step": 448 |
| }, |
| { |
| "epoch": 2.3415906127770536, |
| "grad_norm": 1.5869940859426193, |
| "learning_rate": 5.104580499692992e-05, |
| "loss": 0.7845, |
| "step": 449 |
| }, |
| { |
| "epoch": 2.346805736636245, |
| "grad_norm": 2.0260440621389364, |
| "learning_rate": 5.090512911890715e-05, |
| "loss": 0.7857, |
| "step": 450 |
| }, |
| { |
| "epoch": 2.3520208604954367, |
| "grad_norm": 1.7167308373113537, |
| "learning_rate": 5.076430737835318e-05, |
| "loss": 0.7815, |
| "step": 451 |
| }, |
| { |
| "epoch": 2.3572359843546282, |
| "grad_norm": 1.881887746447444, |
| "learning_rate": 5.062334165884182e-05, |
| "loss": 0.7811, |
| "step": 452 |
| }, |
| { |
| "epoch": 2.36245110821382, |
| "grad_norm": 1.6824543517537593, |
| "learning_rate": 5.0482233845872674e-05, |
| "loss": 0.776, |
| "step": 453 |
| }, |
| { |
| "epoch": 2.3676662320730117, |
| "grad_norm": 1.8755854127797766, |
| "learning_rate": 5.034098582684595e-05, |
| "loss": 0.7803, |
| "step": 454 |
| }, |
| { |
| "epoch": 2.3728813559322033, |
| "grad_norm": 1.6777197590484048, |
| "learning_rate": 5.019959949103715e-05, |
| "loss": 0.7848, |
| "step": 455 |
| }, |
| { |
| "epoch": 2.3780964797913953, |
| "grad_norm": 1.81809003546367, |
| "learning_rate": 5.005807672957188e-05, |
| "loss": 0.7858, |
| "step": 456 |
| }, |
| { |
| "epoch": 2.383311603650587, |
| "grad_norm": 1.6133734346606823, |
| "learning_rate": 4.9916419435400516e-05, |
| "loss": 0.7808, |
| "step": 457 |
| }, |
| { |
| "epoch": 2.3885267275097783, |
| "grad_norm": 1.8363482997949565, |
| "learning_rate": 4.9774629503272874e-05, |
| "loss": 0.7796, |
| "step": 458 |
| }, |
| { |
| "epoch": 2.39374185136897, |
| "grad_norm": 1.5541908015103185, |
| "learning_rate": 4.96327088297129e-05, |
| "loss": 0.7771, |
| "step": 459 |
| }, |
| { |
| "epoch": 2.3989569752281614, |
| "grad_norm": 1.754540259013617, |
| "learning_rate": 4.949065931299328e-05, |
| "loss": 0.7858, |
| "step": 460 |
| }, |
| { |
| "epoch": 2.4041720990873534, |
| "grad_norm": 1.539083403172259, |
| "learning_rate": 4.934848285311002e-05, |
| "loss": 0.778, |
| "step": 461 |
| }, |
| { |
| "epoch": 2.409387222946545, |
| "grad_norm": 1.8702666142803812, |
| "learning_rate": 4.920618135175712e-05, |
| "loss": 0.7815, |
| "step": 462 |
| }, |
| { |
| "epoch": 2.4146023468057365, |
| "grad_norm": 1.644430924102753, |
| "learning_rate": 4.9063756712301036e-05, |
| "loss": 0.773, |
| "step": 463 |
| }, |
| { |
| "epoch": 2.4198174706649285, |
| "grad_norm": 1.6756469842658643, |
| "learning_rate": 4.8921210839755304e-05, |
| "loss": 0.7737, |
| "step": 464 |
| }, |
| { |
| "epoch": 2.42503259452412, |
| "grad_norm": 1.4353920458126117, |
| "learning_rate": 4.877854564075499e-05, |
| "loss": 0.7861, |
| "step": 465 |
| }, |
| { |
| "epoch": 2.4302477183833116, |
| "grad_norm": 1.8834730757505287, |
| "learning_rate": 4.863576302353125e-05, |
| "loss": 0.7638, |
| "step": 466 |
| }, |
| { |
| "epoch": 2.435462842242503, |
| "grad_norm": 1.6285082262536497, |
| "learning_rate": 4.849286489788579e-05, |
| "loss": 0.7719, |
| "step": 467 |
| }, |
| { |
| "epoch": 2.440677966101695, |
| "grad_norm": 1.6841805396754381, |
| "learning_rate": 4.834985317516525e-05, |
| "loss": 0.7833, |
| "step": 468 |
| }, |
| { |
| "epoch": 2.4458930899608866, |
| "grad_norm": 1.5254872325101743, |
| "learning_rate": 4.8206729768235756e-05, |
| "loss": 0.7766, |
| "step": 469 |
| }, |
| { |
| "epoch": 2.451108213820078, |
| "grad_norm": 1.5946022371035584, |
| "learning_rate": 4.8063496591457256e-05, |
| "loss": 0.7688, |
| "step": 470 |
| }, |
| { |
| "epoch": 2.4563233376792697, |
| "grad_norm": 1.3044832158013293, |
| "learning_rate": 4.792015556065793e-05, |
| "loss": 0.7745, |
| "step": 471 |
| }, |
| { |
| "epoch": 2.4615384615384617, |
| "grad_norm": 1.822367193607616, |
| "learning_rate": 4.777670859310857e-05, |
| "loss": 0.7839, |
| "step": 472 |
| }, |
| { |
| "epoch": 2.466753585397653, |
| "grad_norm": 1.5933250932152585, |
| "learning_rate": 4.763315760749695e-05, |
| "loss": 0.7886, |
| "step": 473 |
| }, |
| { |
| "epoch": 2.4719687092568448, |
| "grad_norm": 1.588436894654484, |
| "learning_rate": 4.748950452390212e-05, |
| "loss": 0.7742, |
| "step": 474 |
| }, |
| { |
| "epoch": 2.4771838331160367, |
| "grad_norm": 1.4522634410247943, |
| "learning_rate": 4.734575126376876e-05, |
| "loss": 0.7823, |
| "step": 475 |
| }, |
| { |
| "epoch": 2.4823989569752283, |
| "grad_norm": 1.586237253805528, |
| "learning_rate": 4.7201899749881504e-05, |
| "loss": 0.7926, |
| "step": 476 |
| }, |
| { |
| "epoch": 2.48761408083442, |
| "grad_norm": 1.3312635236777999, |
| "learning_rate": 4.705795190633915e-05, |
| "loss": 0.786, |
| "step": 477 |
| }, |
| { |
| "epoch": 2.4928292046936114, |
| "grad_norm": 1.7378087684763845, |
| "learning_rate": 4.691390965852893e-05, |
| "loss": 0.7693, |
| "step": 478 |
| }, |
| { |
| "epoch": 2.498044328552803, |
| "grad_norm": 1.554126764803909, |
| "learning_rate": 4.676977493310088e-05, |
| "loss": 0.7911, |
| "step": 479 |
| }, |
| { |
| "epoch": 2.503259452411995, |
| "grad_norm": 1.4897288575191927, |
| "learning_rate": 4.662554965794192e-05, |
| "loss": 0.7809, |
| "step": 480 |
| }, |
| { |
| "epoch": 2.5084745762711864, |
| "grad_norm": 1.2742943025314304, |
| "learning_rate": 4.648123576215011e-05, |
| "loss": 0.7811, |
| "step": 481 |
| }, |
| { |
| "epoch": 2.513689700130378, |
| "grad_norm": 1.714780336353614, |
| "learning_rate": 4.633683517600891e-05, |
| "loss": 0.7707, |
| "step": 482 |
| }, |
| { |
| "epoch": 2.51890482398957, |
| "grad_norm": 1.514593536382248, |
| "learning_rate": 4.61923498309613e-05, |
| "loss": 0.7817, |
| "step": 483 |
| }, |
| { |
| "epoch": 2.5241199478487615, |
| "grad_norm": 1.4058404125363677, |
| "learning_rate": 4.604778165958392e-05, |
| "loss": 0.7691, |
| "step": 484 |
| }, |
| { |
| "epoch": 2.529335071707953, |
| "grad_norm": 1.1465871672288677, |
| "learning_rate": 4.590313259556132e-05, |
| "loss": 0.7635, |
| "step": 485 |
| }, |
| { |
| "epoch": 2.5345501955671446, |
| "grad_norm": 1.6402163570623103, |
| "learning_rate": 4.575840457366001e-05, |
| "loss": 0.7876, |
| "step": 486 |
| }, |
| { |
| "epoch": 2.539765319426336, |
| "grad_norm": 1.3178513882789409, |
| "learning_rate": 4.561359952970259e-05, |
| "loss": 0.7664, |
| "step": 487 |
| }, |
| { |
| "epoch": 2.544980443285528, |
| "grad_norm": 1.6022661486448093, |
| "learning_rate": 4.546871940054191e-05, |
| "loss": 0.7905, |
| "step": 488 |
| }, |
| { |
| "epoch": 2.5501955671447196, |
| "grad_norm": 1.4090335919146642, |
| "learning_rate": 4.5323766124035115e-05, |
| "loss": 0.7819, |
| "step": 489 |
| }, |
| { |
| "epoch": 2.555410691003911, |
| "grad_norm": 1.324862817997592, |
| "learning_rate": 4.5178741639017736e-05, |
| "loss": 0.7787, |
| "step": 490 |
| }, |
| { |
| "epoch": 2.560625814863103, |
| "grad_norm": 1.1347550661741326, |
| "learning_rate": 4.503364788527774e-05, |
| "loss": 0.7831, |
| "step": 491 |
| }, |
| { |
| "epoch": 2.5658409387222947, |
| "grad_norm": 1.5760247018602986, |
| "learning_rate": 4.488848680352965e-05, |
| "loss": 0.7796, |
| "step": 492 |
| }, |
| { |
| "epoch": 2.5710560625814862, |
| "grad_norm": 1.3090981394784642, |
| "learning_rate": 4.4743260335388516e-05, |
| "loss": 0.7724, |
| "step": 493 |
| }, |
| { |
| "epoch": 2.576271186440678, |
| "grad_norm": 1.3548998817619167, |
| "learning_rate": 4.459797042334395e-05, |
| "loss": 0.781, |
| "step": 494 |
| }, |
| { |
| "epoch": 2.5814863102998697, |
| "grad_norm": 1.2234336308095453, |
| "learning_rate": 4.445261901073419e-05, |
| "loss": 0.7743, |
| "step": 495 |
| }, |
| { |
| "epoch": 2.5867014341590613, |
| "grad_norm": 1.2594794059995964, |
| "learning_rate": 4.430720804172008e-05, |
| "loss": 0.7878, |
| "step": 496 |
| }, |
| { |
| "epoch": 2.591916558018253, |
| "grad_norm": 1.0475475763925866, |
| "learning_rate": 4.416173946125906e-05, |
| "loss": 0.7851, |
| "step": 497 |
| }, |
| { |
| "epoch": 2.5971316818774444, |
| "grad_norm": 1.3034377416715606, |
| "learning_rate": 4.401621521507914e-05, |
| "loss": 0.7661, |
| "step": 498 |
| }, |
| { |
| "epoch": 2.6023468057366363, |
| "grad_norm": 1.0178177147229814, |
| "learning_rate": 4.387063724965292e-05, |
| "loss": 0.768, |
| "step": 499 |
| }, |
| { |
| "epoch": 2.607561929595828, |
| "grad_norm": 1.4358534072422087, |
| "learning_rate": 4.372500751217153e-05, |
| "loss": 0.7886, |
| "step": 500 |
| }, |
| { |
| "epoch": 2.6127770534550194, |
| "grad_norm": 1.150853401699166, |
| "learning_rate": 4.357932795051852e-05, |
| "loss": 0.7934, |
| "step": 501 |
| }, |
| { |
| "epoch": 2.6179921773142114, |
| "grad_norm": 1.1810592463532203, |
| "learning_rate": 4.3433600513243965e-05, |
| "loss": 0.7563, |
| "step": 502 |
| }, |
| { |
| "epoch": 2.623207301173403, |
| "grad_norm": 1.0034045366911863, |
| "learning_rate": 4.328782714953823e-05, |
| "loss": 0.79, |
| "step": 503 |
| }, |
| { |
| "epoch": 2.6284224250325945, |
| "grad_norm": 1.0980643139376323, |
| "learning_rate": 4.3142009809205986e-05, |
| "loss": 0.7861, |
| "step": 504 |
| }, |
| { |
| "epoch": 2.633637548891786, |
| "grad_norm": 0.8881990742171157, |
| "learning_rate": 4.2996150442640163e-05, |
| "loss": 0.7758, |
| "step": 505 |
| }, |
| { |
| "epoch": 2.6388526727509776, |
| "grad_norm": 0.8675797674117819, |
| "learning_rate": 4.2850251000795735e-05, |
| "loss": 0.7801, |
| "step": 506 |
| }, |
| { |
| "epoch": 2.6440677966101696, |
| "grad_norm": 0.7883090264802906, |
| "learning_rate": 4.270431343516379e-05, |
| "loss": 0.7745, |
| "step": 507 |
| }, |
| { |
| "epoch": 2.649282920469361, |
| "grad_norm": 0.7583525803187019, |
| "learning_rate": 4.2558339697745297e-05, |
| "loss": 0.7827, |
| "step": 508 |
| }, |
| { |
| "epoch": 2.6544980443285526, |
| "grad_norm": 0.7762877342404452, |
| "learning_rate": 4.2412331741025045e-05, |
| "loss": 0.7967, |
| "step": 509 |
| }, |
| { |
| "epoch": 2.6597131681877446, |
| "grad_norm": 0.7014761796032362, |
| "learning_rate": 4.226629151794555e-05, |
| "loss": 0.7858, |
| "step": 510 |
| }, |
| { |
| "epoch": 2.664928292046936, |
| "grad_norm": 0.7396414732409832, |
| "learning_rate": 4.2120220981880875e-05, |
| "loss": 0.7794, |
| "step": 511 |
| }, |
| { |
| "epoch": 2.6701434159061277, |
| "grad_norm": 0.6998054830397707, |
| "learning_rate": 4.197412208661058e-05, |
| "loss": 0.7777, |
| "step": 512 |
| }, |
| { |
| "epoch": 2.6753585397653197, |
| "grad_norm": 0.6386491344974894, |
| "learning_rate": 4.182799678629351e-05, |
| "loss": 0.7823, |
| "step": 513 |
| }, |
| { |
| "epoch": 2.680573663624511, |
| "grad_norm": 0.5877328765456284, |
| "learning_rate": 4.168184703544171e-05, |
| "loss": 0.7744, |
| "step": 514 |
| }, |
| { |
| "epoch": 2.6857887874837028, |
| "grad_norm": 0.5920983354896419, |
| "learning_rate": 4.153567478889426e-05, |
| "loss": 0.7769, |
| "step": 515 |
| }, |
| { |
| "epoch": 2.6910039113428943, |
| "grad_norm": 0.4867319202645289, |
| "learning_rate": 4.138948200179115e-05, |
| "loss": 0.7807, |
| "step": 516 |
| }, |
| { |
| "epoch": 2.696219035202086, |
| "grad_norm": 0.5746181168372155, |
| "learning_rate": 4.124327062954707e-05, |
| "loss": 0.7708, |
| "step": 517 |
| }, |
| { |
| "epoch": 2.701434159061278, |
| "grad_norm": 0.35394623638423245, |
| "learning_rate": 4.1097042627825325e-05, |
| "loss": 0.7823, |
| "step": 518 |
| }, |
| { |
| "epoch": 2.7066492829204694, |
| "grad_norm": 0.6245548216599335, |
| "learning_rate": 4.095079995251168e-05, |
| "loss": 0.784, |
| "step": 519 |
| }, |
| { |
| "epoch": 2.711864406779661, |
| "grad_norm": 0.3951096697035697, |
| "learning_rate": 4.080454455968812e-05, |
| "loss": 0.7711, |
| "step": 520 |
| }, |
| { |
| "epoch": 2.717079530638853, |
| "grad_norm": 0.5759028178578062, |
| "learning_rate": 4.065827840560674e-05, |
| "loss": 0.7773, |
| "step": 521 |
| }, |
| { |
| "epoch": 2.7222946544980444, |
| "grad_norm": 0.39680693057913397, |
| "learning_rate": 4.0512003446663576e-05, |
| "loss": 0.7736, |
| "step": 522 |
| }, |
| { |
| "epoch": 2.727509778357236, |
| "grad_norm": 0.49858076462377626, |
| "learning_rate": 4.0365721639372476e-05, |
| "loss": 0.7884, |
| "step": 523 |
| }, |
| { |
| "epoch": 2.7327249022164275, |
| "grad_norm": 0.4117350165519155, |
| "learning_rate": 4.021943494033882e-05, |
| "loss": 0.7707, |
| "step": 524 |
| }, |
| { |
| "epoch": 2.737940026075619, |
| "grad_norm": 0.4515042047115184, |
| "learning_rate": 4.007314530623348e-05, |
| "loss": 0.7778, |
| "step": 525 |
| }, |
| { |
| "epoch": 2.743155149934811, |
| "grad_norm": 0.42547404001436967, |
| "learning_rate": 3.9926854693766536e-05, |
| "loss": 0.7688, |
| "step": 526 |
| }, |
| { |
| "epoch": 2.7483702737940026, |
| "grad_norm": 0.35069455268010413, |
| "learning_rate": 3.978056505966119e-05, |
| "loss": 0.7744, |
| "step": 527 |
| }, |
| { |
| "epoch": 2.753585397653194, |
| "grad_norm": 0.4444789741021684, |
| "learning_rate": 3.963427836062753e-05, |
| "loss": 0.775, |
| "step": 528 |
| }, |
| { |
| "epoch": 2.758800521512386, |
| "grad_norm": 0.43329608911430956, |
| "learning_rate": 3.948799655333644e-05, |
| "loss": 0.7748, |
| "step": 529 |
| }, |
| { |
| "epoch": 2.7640156453715776, |
| "grad_norm": 0.4209440665741905, |
| "learning_rate": 3.9341721594393276e-05, |
| "loss": 0.7762, |
| "step": 530 |
| }, |
| { |
| "epoch": 2.769230769230769, |
| "grad_norm": 0.4322531684473173, |
| "learning_rate": 3.919545544031189e-05, |
| "loss": 0.7878, |
| "step": 531 |
| }, |
| { |
| "epoch": 2.774445893089961, |
| "grad_norm": 0.4720335982176997, |
| "learning_rate": 3.904920004748833e-05, |
| "loss": 0.777, |
| "step": 532 |
| }, |
| { |
| "epoch": 2.7796610169491527, |
| "grad_norm": 0.37675833736688985, |
| "learning_rate": 3.8902957372174675e-05, |
| "loss": 0.7728, |
| "step": 533 |
| }, |
| { |
| "epoch": 2.7848761408083442, |
| "grad_norm": 0.3961055004093581, |
| "learning_rate": 3.8756729370452936e-05, |
| "loss": 0.7745, |
| "step": 534 |
| }, |
| { |
| "epoch": 2.7900912646675358, |
| "grad_norm": 0.39787870628829797, |
| "learning_rate": 3.8610517998208866e-05, |
| "loss": 0.779, |
| "step": 535 |
| }, |
| { |
| "epoch": 2.7953063885267273, |
| "grad_norm": 0.366576904221954, |
| "learning_rate": 3.846432521110574e-05, |
| "loss": 0.7823, |
| "step": 536 |
| }, |
| { |
| "epoch": 2.8005215123859193, |
| "grad_norm": 0.39669396154921643, |
| "learning_rate": 3.831815296455829e-05, |
| "loss": 0.7776, |
| "step": 537 |
| }, |
| { |
| "epoch": 2.805736636245111, |
| "grad_norm": 0.5485537263803792, |
| "learning_rate": 3.817200321370649e-05, |
| "loss": 0.7821, |
| "step": 538 |
| }, |
| { |
| "epoch": 2.8109517601043024, |
| "grad_norm": 0.5046596554915849, |
| "learning_rate": 3.802587791338943e-05, |
| "loss": 0.757, |
| "step": 539 |
| }, |
| { |
| "epoch": 2.8161668839634943, |
| "grad_norm": 0.4297088800078496, |
| "learning_rate": 3.787977901811914e-05, |
| "loss": 0.7819, |
| "step": 540 |
| }, |
| { |
| "epoch": 2.821382007822686, |
| "grad_norm": 0.2792196466948258, |
| "learning_rate": 3.7733708482054476e-05, |
| "loss": 0.7908, |
| "step": 541 |
| }, |
| { |
| "epoch": 2.8265971316818774, |
| "grad_norm": 0.3389535808547031, |
| "learning_rate": 3.758766825897497e-05, |
| "loss": 0.7789, |
| "step": 542 |
| }, |
| { |
| "epoch": 2.831812255541069, |
| "grad_norm": 0.33504887483040935, |
| "learning_rate": 3.7441660302254724e-05, |
| "loss": 0.7641, |
| "step": 543 |
| }, |
| { |
| "epoch": 2.8370273794002605, |
| "grad_norm": 0.35811016749224023, |
| "learning_rate": 3.729568656483623e-05, |
| "loss": 0.7711, |
| "step": 544 |
| }, |
| { |
| "epoch": 2.8422425032594525, |
| "grad_norm": 0.3304033008238814, |
| "learning_rate": 3.714974899920428e-05, |
| "loss": 0.781, |
| "step": 545 |
| }, |
| { |
| "epoch": 2.847457627118644, |
| "grad_norm": 0.42274779776394655, |
| "learning_rate": 3.7003849557359863e-05, |
| "loss": 0.7738, |
| "step": 546 |
| }, |
| { |
| "epoch": 2.8526727509778356, |
| "grad_norm": 0.33608657379975826, |
| "learning_rate": 3.685799019079402e-05, |
| "loss": 0.7858, |
| "step": 547 |
| }, |
| { |
| "epoch": 2.8578878748370276, |
| "grad_norm": 0.4402897321734762, |
| "learning_rate": 3.6712172850461785e-05, |
| "loss": 0.7679, |
| "step": 548 |
| }, |
| { |
| "epoch": 2.863102998696219, |
| "grad_norm": 0.29934112633294296, |
| "learning_rate": 3.6566399486756055e-05, |
| "loss": 0.7889, |
| "step": 549 |
| }, |
| { |
| "epoch": 2.8683181225554106, |
| "grad_norm": 0.38015963993433893, |
| "learning_rate": 3.642067204948149e-05, |
| "loss": 0.7695, |
| "step": 550 |
| }, |
| { |
| "epoch": 2.8735332464146026, |
| "grad_norm": 0.38644710923105946, |
| "learning_rate": 3.627499248782849e-05, |
| "loss": 0.7745, |
| "step": 551 |
| }, |
| { |
| "epoch": 2.878748370273794, |
| "grad_norm": 0.44283035266374077, |
| "learning_rate": 3.612936275034709e-05, |
| "loss": 0.7829, |
| "step": 552 |
| }, |
| { |
| "epoch": 2.8839634941329857, |
| "grad_norm": 0.426691177545099, |
| "learning_rate": 3.598378478492087e-05, |
| "loss": 0.7775, |
| "step": 553 |
| }, |
| { |
| "epoch": 2.8891786179921772, |
| "grad_norm": 0.41202751066509746, |
| "learning_rate": 3.583826053874096e-05, |
| "loss": 0.7792, |
| "step": 554 |
| }, |
| { |
| "epoch": 2.8943937418513688, |
| "grad_norm": 0.40337608056255875, |
| "learning_rate": 3.5692791958279924e-05, |
| "loss": 0.7737, |
| "step": 555 |
| }, |
| { |
| "epoch": 2.8996088657105608, |
| "grad_norm": 0.42109772590116307, |
| "learning_rate": 3.554738098926582e-05, |
| "loss": 0.7757, |
| "step": 556 |
| }, |
| { |
| "epoch": 2.9048239895697523, |
| "grad_norm": 0.6458006850893471, |
| "learning_rate": 3.540202957665606e-05, |
| "loss": 0.7775, |
| "step": 557 |
| }, |
| { |
| "epoch": 2.910039113428944, |
| "grad_norm": 0.5129008593035667, |
| "learning_rate": 3.525673966461149e-05, |
| "loss": 0.7793, |
| "step": 558 |
| }, |
| { |
| "epoch": 2.915254237288136, |
| "grad_norm": 0.37986325064356524, |
| "learning_rate": 3.5111513196470356e-05, |
| "loss": 0.7706, |
| "step": 559 |
| }, |
| { |
| "epoch": 2.9204693611473274, |
| "grad_norm": 0.3127213742675476, |
| "learning_rate": 3.4966352114722264e-05, |
| "loss": 0.779, |
| "step": 560 |
| }, |
| { |
| "epoch": 2.925684485006519, |
| "grad_norm": 0.44150964872249004, |
| "learning_rate": 3.482125836098227e-05, |
| "loss": 0.7726, |
| "step": 561 |
| }, |
| { |
| "epoch": 2.9308996088657104, |
| "grad_norm": 0.37918325106659184, |
| "learning_rate": 3.46762338759649e-05, |
| "loss": 0.7763, |
| "step": 562 |
| }, |
| { |
| "epoch": 2.936114732724902, |
| "grad_norm": 0.34801331615309594, |
| "learning_rate": 3.4531280599458096e-05, |
| "loss": 0.7792, |
| "step": 563 |
| }, |
| { |
| "epoch": 2.941329856584094, |
| "grad_norm": 0.3417071688239163, |
| "learning_rate": 3.438640047029741e-05, |
| "loss": 0.783, |
| "step": 564 |
| }, |
| { |
| "epoch": 2.9465449804432855, |
| "grad_norm": 0.4027724323607999, |
| "learning_rate": 3.424159542634001e-05, |
| "loss": 0.7799, |
| "step": 565 |
| }, |
| { |
| "epoch": 2.951760104302477, |
| "grad_norm": 0.359846745274617, |
| "learning_rate": 3.409686740443869e-05, |
| "loss": 0.7874, |
| "step": 566 |
| }, |
| { |
| "epoch": 2.956975228161669, |
| "grad_norm": 0.39877444581949706, |
| "learning_rate": 3.395221834041608e-05, |
| "loss": 0.7796, |
| "step": 567 |
| }, |
| { |
| "epoch": 2.9621903520208606, |
| "grad_norm": 0.31627637641393286, |
| "learning_rate": 3.3807650169038714e-05, |
| "loss": 0.7828, |
| "step": 568 |
| }, |
| { |
| "epoch": 2.967405475880052, |
| "grad_norm": 0.2915329688447508, |
| "learning_rate": 3.366316482399111e-05, |
| "loss": 0.7611, |
| "step": 569 |
| }, |
| { |
| "epoch": 2.9726205997392436, |
| "grad_norm": 0.3003806981610255, |
| "learning_rate": 3.351876423784991e-05, |
| "loss": 0.7753, |
| "step": 570 |
| }, |
| { |
| "epoch": 2.9778357235984356, |
| "grad_norm": 0.32825206856441885, |
| "learning_rate": 3.33744503420581e-05, |
| "loss": 0.7805, |
| "step": 571 |
| }, |
| { |
| "epoch": 2.983050847457627, |
| "grad_norm": 0.28811623986201, |
| "learning_rate": 3.323022506689913e-05, |
| "loss": 0.7787, |
| "step": 572 |
| }, |
| { |
| "epoch": 2.9882659713168187, |
| "grad_norm": 0.3007386340259681, |
| "learning_rate": 3.308609034147109e-05, |
| "loss": 0.7693, |
| "step": 573 |
| }, |
| { |
| "epoch": 2.9934810951760102, |
| "grad_norm": 0.28601621207061495, |
| "learning_rate": 3.294204809366088e-05, |
| "loss": 0.7811, |
| "step": 574 |
| }, |
| { |
| "epoch": 2.9986962190352022, |
| "grad_norm": 0.38527843914278115, |
| "learning_rate": 3.27981002501185e-05, |
| "loss": 0.8962, |
| "step": 575 |
| }, |
| { |
| "epoch": 3.0039113428943938, |
| "grad_norm": 0.3891353205481252, |
| "learning_rate": 3.265424873623125e-05, |
| "loss": 0.8279, |
| "step": 576 |
| }, |
| { |
| "epoch": 3.0091264667535853, |
| "grad_norm": 0.34677276173333527, |
| "learning_rate": 3.251049547609789e-05, |
| "loss": 0.7617, |
| "step": 577 |
| }, |
| { |
| "epoch": 3.014341590612777, |
| "grad_norm": 0.3887463573130963, |
| "learning_rate": 3.2366842392503065e-05, |
| "loss": 0.7404, |
| "step": 578 |
| }, |
| { |
| "epoch": 3.019556714471969, |
| "grad_norm": 0.45850333580212727, |
| "learning_rate": 3.222329140689144e-05, |
| "loss": 0.7509, |
| "step": 579 |
| }, |
| { |
| "epoch": 3.0247718383311604, |
| "grad_norm": 0.42139454484056155, |
| "learning_rate": 3.207984443934208e-05, |
| "loss": 0.7474, |
| "step": 580 |
| }, |
| { |
| "epoch": 3.029986962190352, |
| "grad_norm": 0.32213156347235194, |
| "learning_rate": 3.193650340854275e-05, |
| "loss": 0.7449, |
| "step": 581 |
| }, |
| { |
| "epoch": 3.035202086049544, |
| "grad_norm": 0.2882489783740716, |
| "learning_rate": 3.179327023176426e-05, |
| "loss": 0.7565, |
| "step": 582 |
| }, |
| { |
| "epoch": 3.0404172099087354, |
| "grad_norm": 0.31398533972182546, |
| "learning_rate": 3.1650146824834765e-05, |
| "loss": 0.7663, |
| "step": 583 |
| }, |
| { |
| "epoch": 3.045632333767927, |
| "grad_norm": 0.42483377951795925, |
| "learning_rate": 3.1507135102114224e-05, |
| "loss": 0.7524, |
| "step": 584 |
| }, |
| { |
| "epoch": 3.0508474576271185, |
| "grad_norm": 0.4203137878607459, |
| "learning_rate": 3.1364236976468755e-05, |
| "loss": 0.7476, |
| "step": 585 |
| }, |
| { |
| "epoch": 3.0560625814863105, |
| "grad_norm": 0.3536224112892052, |
| "learning_rate": 3.122145435924502e-05, |
| "loss": 0.7479, |
| "step": 586 |
| }, |
| { |
| "epoch": 3.061277705345502, |
| "grad_norm": 0.3355710189906583, |
| "learning_rate": 3.107878916024471e-05, |
| "loss": 0.7475, |
| "step": 587 |
| }, |
| { |
| "epoch": 3.0664928292046936, |
| "grad_norm": 0.3578880245402906, |
| "learning_rate": 3.093624328769898e-05, |
| "loss": 0.7542, |
| "step": 588 |
| }, |
| { |
| "epoch": 3.071707953063885, |
| "grad_norm": 0.3807896298844658, |
| "learning_rate": 3.079381864824289e-05, |
| "loss": 0.7617, |
| "step": 589 |
| }, |
| { |
| "epoch": 3.076923076923077, |
| "grad_norm": 0.37153630367778145, |
| "learning_rate": 3.065151714688998e-05, |
| "loss": 0.7581, |
| "step": 590 |
| }, |
| { |
| "epoch": 3.0821382007822686, |
| "grad_norm": 0.41123425605078595, |
| "learning_rate": 3.0509340687006735e-05, |
| "loss": 0.7538, |
| "step": 591 |
| }, |
| { |
| "epoch": 3.08735332464146, |
| "grad_norm": 0.33981481049719536, |
| "learning_rate": 3.0367291170287102e-05, |
| "loss": 0.7524, |
| "step": 592 |
| }, |
| { |
| "epoch": 3.0925684485006517, |
| "grad_norm": 0.3651990332335353, |
| "learning_rate": 3.0225370496727125e-05, |
| "loss": 0.7584, |
| "step": 593 |
| }, |
| { |
| "epoch": 3.0977835723598437, |
| "grad_norm": 0.2918824973106232, |
| "learning_rate": 3.0083580564599484e-05, |
| "loss": 0.7563, |
| "step": 594 |
| }, |
| { |
| "epoch": 3.1029986962190352, |
| "grad_norm": 0.3423081259716029, |
| "learning_rate": 2.9941923270428125e-05, |
| "loss": 0.7471, |
| "step": 595 |
| }, |
| { |
| "epoch": 3.1082138200782268, |
| "grad_norm": 0.3604784090698713, |
| "learning_rate": 2.9800400508962854e-05, |
| "loss": 0.7458, |
| "step": 596 |
| }, |
| { |
| "epoch": 3.1134289439374183, |
| "grad_norm": 0.44107810877163156, |
| "learning_rate": 2.9659014173154053e-05, |
| "loss": 0.761, |
| "step": 597 |
| }, |
| { |
| "epoch": 3.1186440677966103, |
| "grad_norm": 0.36068201923449394, |
| "learning_rate": 2.9517766154127332e-05, |
| "loss": 0.7495, |
| "step": 598 |
| }, |
| { |
| "epoch": 3.123859191655802, |
| "grad_norm": 0.32775307660721575, |
| "learning_rate": 2.93766583411582e-05, |
| "loss": 0.7463, |
| "step": 599 |
| }, |
| { |
| "epoch": 3.1290743155149934, |
| "grad_norm": 0.4150847775843986, |
| "learning_rate": 2.923569262164684e-05, |
| "loss": 0.7459, |
| "step": 600 |
| }, |
| { |
| "epoch": 3.134289439374185, |
| "grad_norm": 0.3453247625813097, |
| "learning_rate": 2.909487088109287e-05, |
| "loss": 0.756, |
| "step": 601 |
| }, |
| { |
| "epoch": 3.139504563233377, |
| "grad_norm": 0.3884807540864722, |
| "learning_rate": 2.8954195003070106e-05, |
| "loss": 0.7469, |
| "step": 602 |
| }, |
| { |
| "epoch": 3.1447196870925684, |
| "grad_norm": 0.32406875922905615, |
| "learning_rate": 2.8813666869201323e-05, |
| "loss": 0.7492, |
| "step": 603 |
| }, |
| { |
| "epoch": 3.14993481095176, |
| "grad_norm": 0.3124660936977556, |
| "learning_rate": 2.867328835913319e-05, |
| "loss": 0.7412, |
| "step": 604 |
| }, |
| { |
| "epoch": 3.155149934810952, |
| "grad_norm": 0.402564278458608, |
| "learning_rate": 2.853306135051103e-05, |
| "loss": 0.7628, |
| "step": 605 |
| }, |
| { |
| "epoch": 3.1603650586701435, |
| "grad_norm": 0.3914753370029631, |
| "learning_rate": 2.8392987718953748e-05, |
| "loss": 0.7467, |
| "step": 606 |
| }, |
| { |
| "epoch": 3.165580182529335, |
| "grad_norm": 0.36094618715771837, |
| "learning_rate": 2.8253069338028756e-05, |
| "loss": 0.7528, |
| "step": 607 |
| }, |
| { |
| "epoch": 3.1707953063885266, |
| "grad_norm": 0.4160569360480524, |
| "learning_rate": 2.8113308079226897e-05, |
| "loss": 0.753, |
| "step": 608 |
| }, |
| { |
| "epoch": 3.1760104302477186, |
| "grad_norm": 0.36846901981179736, |
| "learning_rate": 2.7973705811937387e-05, |
| "loss": 0.762, |
| "step": 609 |
| }, |
| { |
| "epoch": 3.18122555410691, |
| "grad_norm": 0.3181951199229672, |
| "learning_rate": 2.7834264403422864e-05, |
| "loss": 0.7581, |
| "step": 610 |
| }, |
| { |
| "epoch": 3.1864406779661016, |
| "grad_norm": 0.28604043043751265, |
| "learning_rate": 2.769498571879438e-05, |
| "loss": 0.7591, |
| "step": 611 |
| }, |
| { |
| "epoch": 3.191655801825293, |
| "grad_norm": 0.2634604899316907, |
| "learning_rate": 2.7555871620986453e-05, |
| "loss": 0.7554, |
| "step": 612 |
| }, |
| { |
| "epoch": 3.196870925684485, |
| "grad_norm": 0.3039718480846403, |
| "learning_rate": 2.7416923970732146e-05, |
| "loss": 0.7588, |
| "step": 613 |
| }, |
| { |
| "epoch": 3.2020860495436767, |
| "grad_norm": 0.288103576580876, |
| "learning_rate": 2.7278144626538212e-05, |
| "loss": 0.7487, |
| "step": 614 |
| }, |
| { |
| "epoch": 3.2073011734028682, |
| "grad_norm": 0.32239905156935256, |
| "learning_rate": 2.71395354446602e-05, |
| "loss": 0.7459, |
| "step": 615 |
| }, |
| { |
| "epoch": 3.21251629726206, |
| "grad_norm": 0.29806440969996567, |
| "learning_rate": 2.7001098279077623e-05, |
| "loss": 0.7548, |
| "step": 616 |
| }, |
| { |
| "epoch": 3.2177314211212518, |
| "grad_norm": 0.25998375637060805, |
| "learning_rate": 2.6862834981469198e-05, |
| "loss": 0.7486, |
| "step": 617 |
| }, |
| { |
| "epoch": 3.2229465449804433, |
| "grad_norm": 0.39237992094856566, |
| "learning_rate": 2.6724747401188055e-05, |
| "loss": 0.7532, |
| "step": 618 |
| }, |
| { |
| "epoch": 3.228161668839635, |
| "grad_norm": 0.2548892235121485, |
| "learning_rate": 2.6586837385236966e-05, |
| "loss": 0.7484, |
| "step": 619 |
| }, |
| { |
| "epoch": 3.2333767926988264, |
| "grad_norm": 0.3225804950825967, |
| "learning_rate": 2.6449106778243708e-05, |
| "loss": 0.7529, |
| "step": 620 |
| }, |
| { |
| "epoch": 3.2385919165580184, |
| "grad_norm": 0.45928173129349054, |
| "learning_rate": 2.631155742243637e-05, |
| "loss": 0.7549, |
| "step": 621 |
| }, |
| { |
| "epoch": 3.24380704041721, |
| "grad_norm": 0.3308854647358001, |
| "learning_rate": 2.6174191157618652e-05, |
| "loss": 0.7554, |
| "step": 622 |
| }, |
| { |
| "epoch": 3.2490221642764014, |
| "grad_norm": 0.2795489427507208, |
| "learning_rate": 2.6037009821145346e-05, |
| "loss": 0.7593, |
| "step": 623 |
| }, |
| { |
| "epoch": 3.2542372881355934, |
| "grad_norm": 0.4430769575631773, |
| "learning_rate": 2.5900015247897714e-05, |
| "loss": 0.7545, |
| "step": 624 |
| }, |
| { |
| "epoch": 3.259452411994785, |
| "grad_norm": 0.33341725027604363, |
| "learning_rate": 2.576320927025892e-05, |
| "loss": 0.753, |
| "step": 625 |
| }, |
| { |
| "epoch": 3.2646675358539765, |
| "grad_norm": 0.33122719602795536, |
| "learning_rate": 2.562659371808959e-05, |
| "loss": 0.7436, |
| "step": 626 |
| }, |
| { |
| "epoch": 3.269882659713168, |
| "grad_norm": 0.3963689954079041, |
| "learning_rate": 2.549017041870325e-05, |
| "loss": 0.7676, |
| "step": 627 |
| }, |
| { |
| "epoch": 3.27509778357236, |
| "grad_norm": 0.3538575496536462, |
| "learning_rate": 2.535394119684197e-05, |
| "loss": 0.7548, |
| "step": 628 |
| }, |
| { |
| "epoch": 3.2803129074315516, |
| "grad_norm": 0.24662943382751912, |
| "learning_rate": 2.521790787465193e-05, |
| "loss": 0.7517, |
| "step": 629 |
| }, |
| { |
| "epoch": 3.285528031290743, |
| "grad_norm": 0.39858795017503024, |
| "learning_rate": 2.5082072271658982e-05, |
| "loss": 0.7482, |
| "step": 630 |
| }, |
| { |
| "epoch": 3.2907431551499347, |
| "grad_norm": 0.24293852218374687, |
| "learning_rate": 2.494643620474442e-05, |
| "loss": 0.7445, |
| "step": 631 |
| }, |
| { |
| "epoch": 3.2959582790091266, |
| "grad_norm": 0.3499814647894016, |
| "learning_rate": 2.4811001488120598e-05, |
| "loss": 0.7555, |
| "step": 632 |
| }, |
| { |
| "epoch": 3.301173402868318, |
| "grad_norm": 0.27161500960796325, |
| "learning_rate": 2.4675769933306696e-05, |
| "loss": 0.7568, |
| "step": 633 |
| }, |
| { |
| "epoch": 3.3063885267275097, |
| "grad_norm": 0.2881787773315299, |
| "learning_rate": 2.4540743349104485e-05, |
| "loss": 0.7442, |
| "step": 634 |
| }, |
| { |
| "epoch": 3.3116036505867013, |
| "grad_norm": 0.2740026111414901, |
| "learning_rate": 2.440592354157415e-05, |
| "loss": 0.7579, |
| "step": 635 |
| }, |
| { |
| "epoch": 3.3168187744458932, |
| "grad_norm": 0.2605377452426752, |
| "learning_rate": 2.427131231401008e-05, |
| "loss": 0.7518, |
| "step": 636 |
| }, |
| { |
| "epoch": 3.3220338983050848, |
| "grad_norm": 0.2482525542273476, |
| "learning_rate": 2.413691146691681e-05, |
| "loss": 0.7615, |
| "step": 637 |
| }, |
| { |
| "epoch": 3.3272490221642763, |
| "grad_norm": 0.33206747294953354, |
| "learning_rate": 2.400272279798492e-05, |
| "loss": 0.7498, |
| "step": 638 |
| }, |
| { |
| "epoch": 3.332464146023468, |
| "grad_norm": 0.29080523675344855, |
| "learning_rate": 2.3868748102066964e-05, |
| "loss": 0.7457, |
| "step": 639 |
| }, |
| { |
| "epoch": 3.33767926988266, |
| "grad_norm": 0.31636040314912883, |
| "learning_rate": 2.3734989171153475e-05, |
| "loss": 0.7605, |
| "step": 640 |
| }, |
| { |
| "epoch": 3.3428943937418514, |
| "grad_norm": 0.3319918116811582, |
| "learning_rate": 2.3601447794349035e-05, |
| "loss": 0.7601, |
| "step": 641 |
| }, |
| { |
| "epoch": 3.348109517601043, |
| "grad_norm": 0.24931361371615676, |
| "learning_rate": 2.346812575784826e-05, |
| "loss": 0.752, |
| "step": 642 |
| }, |
| { |
| "epoch": 3.353324641460235, |
| "grad_norm": 0.30718347134905344, |
| "learning_rate": 2.333502484491202e-05, |
| "loss": 0.7616, |
| "step": 643 |
| }, |
| { |
| "epoch": 3.3585397653194264, |
| "grad_norm": 0.282987877244827, |
| "learning_rate": 2.3202146835843458e-05, |
| "loss": 0.7462, |
| "step": 644 |
| }, |
| { |
| "epoch": 3.363754889178618, |
| "grad_norm": 0.24823892221787283, |
| "learning_rate": 2.3069493507964328e-05, |
| "loss": 0.7508, |
| "step": 645 |
| }, |
| { |
| "epoch": 3.3689700130378095, |
| "grad_norm": 0.2747764846839421, |
| "learning_rate": 2.2937066635591082e-05, |
| "loss": 0.757, |
| "step": 646 |
| }, |
| { |
| "epoch": 3.374185136897001, |
| "grad_norm": 0.2496756674179984, |
| "learning_rate": 2.2804867990011186e-05, |
| "loss": 0.7578, |
| "step": 647 |
| }, |
| { |
| "epoch": 3.379400260756193, |
| "grad_norm": 0.2395621072609135, |
| "learning_rate": 2.26728993394595e-05, |
| "loss": 0.765, |
| "step": 648 |
| }, |
| { |
| "epoch": 3.3846153846153846, |
| "grad_norm": 0.25006905141016617, |
| "learning_rate": 2.2541162449094517e-05, |
| "loss": 0.7558, |
| "step": 649 |
| }, |
| { |
| "epoch": 3.389830508474576, |
| "grad_norm": 0.22593196210127933, |
| "learning_rate": 2.2409659080974792e-05, |
| "loss": 0.7441, |
| "step": 650 |
| }, |
| { |
| "epoch": 3.395045632333768, |
| "grad_norm": 0.25905615560125267, |
| "learning_rate": 2.2278390994035437e-05, |
| "loss": 0.759, |
| "step": 651 |
| }, |
| { |
| "epoch": 3.4002607561929596, |
| "grad_norm": 0.21460137257291464, |
| "learning_rate": 2.214735994406449e-05, |
| "loss": 0.7356, |
| "step": 652 |
| }, |
| { |
| "epoch": 3.405475880052151, |
| "grad_norm": 0.2844041034563339, |
| "learning_rate": 2.2016567683679474e-05, |
| "loss": 0.7611, |
| "step": 653 |
| }, |
| { |
| "epoch": 3.4106910039113427, |
| "grad_norm": 0.2193094602636308, |
| "learning_rate": 2.188601596230402e-05, |
| "loss": 0.7628, |
| "step": 654 |
| }, |
| { |
| "epoch": 3.4159061277705347, |
| "grad_norm": 0.2688886990907944, |
| "learning_rate": 2.1755706526144346e-05, |
| "loss": 0.7555, |
| "step": 655 |
| }, |
| { |
| "epoch": 3.4211212516297262, |
| "grad_norm": 0.22979387277456514, |
| "learning_rate": 2.1625641118165993e-05, |
| "loss": 0.7506, |
| "step": 656 |
| }, |
| { |
| "epoch": 3.426336375488918, |
| "grad_norm": 0.2575836540395883, |
| "learning_rate": 2.1495821478070463e-05, |
| "loss": 0.7454, |
| "step": 657 |
| }, |
| { |
| "epoch": 3.4315514993481093, |
| "grad_norm": 0.22113251166352366, |
| "learning_rate": 2.1366249342271983e-05, |
| "loss": 0.7607, |
| "step": 658 |
| }, |
| { |
| "epoch": 3.4367666232073013, |
| "grad_norm": 0.2541994114755118, |
| "learning_rate": 2.123692644387427e-05, |
| "loss": 0.7529, |
| "step": 659 |
| }, |
| { |
| "epoch": 3.441981747066493, |
| "grad_norm": 0.17984867947865277, |
| "learning_rate": 2.110785451264733e-05, |
| "loss": 0.7584, |
| "step": 660 |
| }, |
| { |
| "epoch": 3.4471968709256844, |
| "grad_norm": 0.2623173957981238, |
| "learning_rate": 2.0979035275004326e-05, |
| "loss": 0.7589, |
| "step": 661 |
| }, |
| { |
| "epoch": 3.4524119947848764, |
| "grad_norm": 0.17573507602846267, |
| "learning_rate": 2.0850470453978527e-05, |
| "loss": 0.7575, |
| "step": 662 |
| }, |
| { |
| "epoch": 3.457627118644068, |
| "grad_norm": 0.23281049598480352, |
| "learning_rate": 2.07221617692002e-05, |
| "loss": 0.7495, |
| "step": 663 |
| }, |
| { |
| "epoch": 3.4628422425032594, |
| "grad_norm": 0.19792669337262203, |
| "learning_rate": 2.0594110936873646e-05, |
| "loss": 0.7554, |
| "step": 664 |
| }, |
| { |
| "epoch": 3.468057366362451, |
| "grad_norm": 0.2348813479171187, |
| "learning_rate": 2.0466319669754257e-05, |
| "loss": 0.7459, |
| "step": 665 |
| }, |
| { |
| "epoch": 3.4732724902216425, |
| "grad_norm": 0.17805434883842863, |
| "learning_rate": 2.033878967712556e-05, |
| "loss": 0.7482, |
| "step": 666 |
| }, |
| { |
| "epoch": 3.4784876140808345, |
| "grad_norm": 0.2324152396008409, |
| "learning_rate": 2.0211522664776378e-05, |
| "loss": 0.7627, |
| "step": 667 |
| }, |
| { |
| "epoch": 3.483702737940026, |
| "grad_norm": 0.21569790923016002, |
| "learning_rate": 2.0084520334978066e-05, |
| "loss": 0.7475, |
| "step": 668 |
| }, |
| { |
| "epoch": 3.4889178617992176, |
| "grad_norm": 0.22787366870500253, |
| "learning_rate": 1.9957784386461642e-05, |
| "loss": 0.7539, |
| "step": 669 |
| }, |
| { |
| "epoch": 3.4941329856584096, |
| "grad_norm": 0.19346838428327817, |
| "learning_rate": 1.9831316514395125e-05, |
| "loss": 0.7482, |
| "step": 670 |
| }, |
| { |
| "epoch": 3.499348109517601, |
| "grad_norm": 0.22386565166114242, |
| "learning_rate": 1.9705118410360888e-05, |
| "loss": 0.7516, |
| "step": 671 |
| }, |
| { |
| "epoch": 3.5045632333767927, |
| "grad_norm": 0.19045550096072955, |
| "learning_rate": 1.9579191762332962e-05, |
| "loss": 0.7524, |
| "step": 672 |
| }, |
| { |
| "epoch": 3.509778357235984, |
| "grad_norm": 0.2568792722114023, |
| "learning_rate": 1.9453538254654492e-05, |
| "loss": 0.7608, |
| "step": 673 |
| }, |
| { |
| "epoch": 3.514993481095176, |
| "grad_norm": 0.2309953251505408, |
| "learning_rate": 1.9328159568015253e-05, |
| "loss": 0.761, |
| "step": 674 |
| }, |
| { |
| "epoch": 3.5202086049543677, |
| "grad_norm": 0.20090449798296708, |
| "learning_rate": 1.920305737942908e-05, |
| "loss": 0.7498, |
| "step": 675 |
| }, |
| { |
| "epoch": 3.5254237288135593, |
| "grad_norm": 0.1995433496289568, |
| "learning_rate": 1.9078233362211488e-05, |
| "loss": 0.7635, |
| "step": 676 |
| }, |
| { |
| "epoch": 3.530638852672751, |
| "grad_norm": 0.20023193260506214, |
| "learning_rate": 1.895368918595733e-05, |
| "loss": 0.7497, |
| "step": 677 |
| }, |
| { |
| "epoch": 3.5358539765319428, |
| "grad_norm": 0.19621170906491942, |
| "learning_rate": 1.882942651651835e-05, |
| "loss": 0.7434, |
| "step": 678 |
| }, |
| { |
| "epoch": 3.5410691003911343, |
| "grad_norm": 0.20574042364789344, |
| "learning_rate": 1.8705447015981056e-05, |
| "loss": 0.7436, |
| "step": 679 |
| }, |
| { |
| "epoch": 3.546284224250326, |
| "grad_norm": 0.18782121755639544, |
| "learning_rate": 1.858175234264434e-05, |
| "loss": 0.7556, |
| "step": 680 |
| }, |
| { |
| "epoch": 3.551499348109518, |
| "grad_norm": 0.1936150072481402, |
| "learning_rate": 1.845834415099736e-05, |
| "loss": 0.757, |
| "step": 681 |
| }, |
| { |
| "epoch": 3.5567144719687094, |
| "grad_norm": 0.1926949621185055, |
| "learning_rate": 1.833522409169748e-05, |
| "loss": 0.756, |
| "step": 682 |
| }, |
| { |
| "epoch": 3.561929595827901, |
| "grad_norm": 0.20476384841537432, |
| "learning_rate": 1.8212393811548074e-05, |
| "loss": 0.7341, |
| "step": 683 |
| }, |
| { |
| "epoch": 3.5671447196870925, |
| "grad_norm": 0.1696138534253153, |
| "learning_rate": 1.8089854953476503e-05, |
| "loss": 0.7521, |
| "step": 684 |
| }, |
| { |
| "epoch": 3.572359843546284, |
| "grad_norm": 0.22686575356319796, |
| "learning_rate": 1.796760915651229e-05, |
| "loss": 0.7621, |
| "step": 685 |
| }, |
| { |
| "epoch": 3.577574967405476, |
| "grad_norm": 0.18860960425351292, |
| "learning_rate": 1.784565805576503e-05, |
| "loss": 0.7537, |
| "step": 686 |
| }, |
| { |
| "epoch": 3.5827900912646675, |
| "grad_norm": 0.19640065748239693, |
| "learning_rate": 1.7724003282402567e-05, |
| "loss": 0.7592, |
| "step": 687 |
| }, |
| { |
| "epoch": 3.588005215123859, |
| "grad_norm": 0.18973579313916158, |
| "learning_rate": 1.760264646362926e-05, |
| "loss": 0.7477, |
| "step": 688 |
| }, |
| { |
| "epoch": 3.593220338983051, |
| "grad_norm": 0.20298759538425024, |
| "learning_rate": 1.7481589222664076e-05, |
| "loss": 0.7486, |
| "step": 689 |
| }, |
| { |
| "epoch": 3.5984354628422426, |
| "grad_norm": 0.17790150008707342, |
| "learning_rate": 1.736083317871898e-05, |
| "loss": 0.7458, |
| "step": 690 |
| }, |
| { |
| "epoch": 3.603650586701434, |
| "grad_norm": 0.20575599062083585, |
| "learning_rate": 1.7240379946977283e-05, |
| "loss": 0.7544, |
| "step": 691 |
| }, |
| { |
| "epoch": 3.6088657105606257, |
| "grad_norm": 0.17855398739046716, |
| "learning_rate": 1.712023113857195e-05, |
| "loss": 0.756, |
| "step": 692 |
| }, |
| { |
| "epoch": 3.614080834419817, |
| "grad_norm": 0.1947379717329557, |
| "learning_rate": 1.700038836056413e-05, |
| "loss": 0.7547, |
| "step": 693 |
| }, |
| { |
| "epoch": 3.619295958279009, |
| "grad_norm": 0.17740265903389008, |
| "learning_rate": 1.6880853215921642e-05, |
| "loss": 0.7532, |
| "step": 694 |
| }, |
| { |
| "epoch": 3.6245110821382007, |
| "grad_norm": 0.18082866727173452, |
| "learning_rate": 1.676162730349751e-05, |
| "loss": 0.7466, |
| "step": 695 |
| }, |
| { |
| "epoch": 3.6297262059973923, |
| "grad_norm": 0.18963190902387492, |
| "learning_rate": 1.6642712218008613e-05, |
| "loss": 0.742, |
| "step": 696 |
| }, |
| { |
| "epoch": 3.6349413298565842, |
| "grad_norm": 0.18716923070682592, |
| "learning_rate": 1.6524109550014323e-05, |
| "loss": 0.7515, |
| "step": 697 |
| }, |
| { |
| "epoch": 3.640156453715776, |
| "grad_norm": 0.20784228257641144, |
| "learning_rate": 1.6405820885895242e-05, |
| "loss": 0.7578, |
| "step": 698 |
| }, |
| { |
| "epoch": 3.6453715775749673, |
| "grad_norm": 0.1920361900700138, |
| "learning_rate": 1.6287847807832e-05, |
| "loss": 0.7538, |
| "step": 699 |
| }, |
| { |
| "epoch": 3.6505867014341593, |
| "grad_norm": 0.1652320731622616, |
| "learning_rate": 1.617019189378407e-05, |
| "loss": 0.7511, |
| "step": 700 |
| }, |
| { |
| "epoch": 3.655801825293351, |
| "grad_norm": 0.19431451990268614, |
| "learning_rate": 1.6052854717468644e-05, |
| "loss": 0.7515, |
| "step": 701 |
| }, |
| { |
| "epoch": 3.6610169491525424, |
| "grad_norm": 0.19087250562535932, |
| "learning_rate": 1.5935837848339667e-05, |
| "loss": 0.7568, |
| "step": 702 |
| }, |
| { |
| "epoch": 3.666232073011734, |
| "grad_norm": 0.20029152301458664, |
| "learning_rate": 1.581914285156673e-05, |
| "loss": 0.7465, |
| "step": 703 |
| }, |
| { |
| "epoch": 3.6714471968709255, |
| "grad_norm": 0.18770482135472025, |
| "learning_rate": 1.5702771288014206e-05, |
| "loss": 0.7516, |
| "step": 704 |
| }, |
| { |
| "epoch": 3.6766623207301175, |
| "grad_norm": 0.17463919163255975, |
| "learning_rate": 1.5586724714220397e-05, |
| "loss": 0.7498, |
| "step": 705 |
| }, |
| { |
| "epoch": 3.681877444589309, |
| "grad_norm": 0.1942555556221587, |
| "learning_rate": 1.5471004682376626e-05, |
| "loss": 0.743, |
| "step": 706 |
| }, |
| { |
| "epoch": 3.6870925684485005, |
| "grad_norm": 0.1946562516538543, |
| "learning_rate": 1.535561274030652e-05, |
| "loss": 0.7569, |
| "step": 707 |
| }, |
| { |
| "epoch": 3.6923076923076925, |
| "grad_norm": 0.17582331062620302, |
| "learning_rate": 1.5240550431445376e-05, |
| "loss": 0.7519, |
| "step": 708 |
| }, |
| { |
| "epoch": 3.697522816166884, |
| "grad_norm": 0.1861426528732765, |
| "learning_rate": 1.5125819294819391e-05, |
| "loss": 0.7482, |
| "step": 709 |
| }, |
| { |
| "epoch": 3.7027379400260756, |
| "grad_norm": 0.18269562874012577, |
| "learning_rate": 1.5011420865025143e-05, |
| "loss": 0.7523, |
| "step": 710 |
| }, |
| { |
| "epoch": 3.707953063885267, |
| "grad_norm": 0.18506228544951506, |
| "learning_rate": 1.4897356672209112e-05, |
| "loss": 0.7547, |
| "step": 711 |
| }, |
| { |
| "epoch": 3.7131681877444587, |
| "grad_norm": 0.20416348855073826, |
| "learning_rate": 1.4783628242047097e-05, |
| "loss": 0.7502, |
| "step": 712 |
| }, |
| { |
| "epoch": 3.7183833116036507, |
| "grad_norm": 0.16997764410348623, |
| "learning_rate": 1.467023709572391e-05, |
| "loss": 0.7512, |
| "step": 713 |
| }, |
| { |
| "epoch": 3.723598435462842, |
| "grad_norm": 0.18897244039596858, |
| "learning_rate": 1.4557184749912981e-05, |
| "loss": 0.7473, |
| "step": 714 |
| }, |
| { |
| "epoch": 3.7288135593220337, |
| "grad_norm": 0.19172065409291622, |
| "learning_rate": 1.4444472716756074e-05, |
| "loss": 0.7497, |
| "step": 715 |
| }, |
| { |
| "epoch": 3.7340286831812257, |
| "grad_norm": 0.171487618996453, |
| "learning_rate": 1.433210250384311e-05, |
| "loss": 0.7526, |
| "step": 716 |
| }, |
| { |
| "epoch": 3.7392438070404173, |
| "grad_norm": 0.17442752265085087, |
| "learning_rate": 1.4220075614191924e-05, |
| "loss": 0.7515, |
| "step": 717 |
| }, |
| { |
| "epoch": 3.744458930899609, |
| "grad_norm": 0.17508720890471086, |
| "learning_rate": 1.4108393546228194e-05, |
| "loss": 0.7578, |
| "step": 718 |
| }, |
| { |
| "epoch": 3.749674054758801, |
| "grad_norm": 0.19876602903764187, |
| "learning_rate": 1.3997057793765452e-05, |
| "loss": 0.7483, |
| "step": 719 |
| }, |
| { |
| "epoch": 3.7548891786179923, |
| "grad_norm": 0.17214365448338131, |
| "learning_rate": 1.3886069845985013e-05, |
| "loss": 0.7568, |
| "step": 720 |
| }, |
| { |
| "epoch": 3.760104302477184, |
| "grad_norm": 0.1920697005442559, |
| "learning_rate": 1.3775431187416097e-05, |
| "loss": 0.7504, |
| "step": 721 |
| }, |
| { |
| "epoch": 3.7653194263363754, |
| "grad_norm": 0.18503507230287503, |
| "learning_rate": 1.3665143297916018e-05, |
| "loss": 0.7636, |
| "step": 722 |
| }, |
| { |
| "epoch": 3.770534550195567, |
| "grad_norm": 0.19252969937704967, |
| "learning_rate": 1.3555207652650308e-05, |
| "loss": 0.7526, |
| "step": 723 |
| }, |
| { |
| "epoch": 3.775749674054759, |
| "grad_norm": 0.1554286223570067, |
| "learning_rate": 1.3445625722073023e-05, |
| "loss": 0.7512, |
| "step": 724 |
| }, |
| { |
| "epoch": 3.7809647979139505, |
| "grad_norm": 0.16978176727895614, |
| "learning_rate": 1.3336398971907114e-05, |
| "loss": 0.7556, |
| "step": 725 |
| }, |
| { |
| "epoch": 3.786179921773142, |
| "grad_norm": 0.15542348945118697, |
| "learning_rate": 1.3227528863124745e-05, |
| "loss": 0.7512, |
| "step": 726 |
| }, |
| { |
| "epoch": 3.791395045632334, |
| "grad_norm": 0.16399605746487972, |
| "learning_rate": 1.3119016851927798e-05, |
| "loss": 0.756, |
| "step": 727 |
| }, |
| { |
| "epoch": 3.7966101694915255, |
| "grad_norm": 0.16707283374852486, |
| "learning_rate": 1.301086438972842e-05, |
| "loss": 0.7475, |
| "step": 728 |
| }, |
| { |
| "epoch": 3.801825293350717, |
| "grad_norm": 0.15983694259778197, |
| "learning_rate": 1.2903072923129547e-05, |
| "loss": 0.7474, |
| "step": 729 |
| }, |
| { |
| "epoch": 3.8070404172099086, |
| "grad_norm": 0.14876964520461528, |
| "learning_rate": 1.2795643893905575e-05, |
| "loss": 0.7471, |
| "step": 730 |
| }, |
| { |
| "epoch": 3.8122555410691, |
| "grad_norm": 0.19924804420654435, |
| "learning_rate": 1.2688578738983131e-05, |
| "loss": 0.7464, |
| "step": 731 |
| }, |
| { |
| "epoch": 3.817470664928292, |
| "grad_norm": 0.19191423801767207, |
| "learning_rate": 1.2581878890421777e-05, |
| "loss": 0.7562, |
| "step": 732 |
| }, |
| { |
| "epoch": 3.8226857887874837, |
| "grad_norm": 0.20765129848679256, |
| "learning_rate": 1.2475545775394879e-05, |
| "loss": 0.7565, |
| "step": 733 |
| }, |
| { |
| "epoch": 3.827900912646675, |
| "grad_norm": 0.1697957267489159, |
| "learning_rate": 1.2369580816170563e-05, |
| "loss": 0.7586, |
| "step": 734 |
| }, |
| { |
| "epoch": 3.833116036505867, |
| "grad_norm": 0.1716215315213691, |
| "learning_rate": 1.2263985430092618e-05, |
| "loss": 0.7524, |
| "step": 735 |
| }, |
| { |
| "epoch": 3.8383311603650587, |
| "grad_norm": 0.18944814607450408, |
| "learning_rate": 1.2158761029561582e-05, |
| "loss": 0.7547, |
| "step": 736 |
| }, |
| { |
| "epoch": 3.8435462842242503, |
| "grad_norm": 0.14989091598529683, |
| "learning_rate": 1.2053909022015863e-05, |
| "loss": 0.7446, |
| "step": 737 |
| }, |
| { |
| "epoch": 3.8487614080834422, |
| "grad_norm": 0.1858141017872855, |
| "learning_rate": 1.194943080991287e-05, |
| "loss": 0.7503, |
| "step": 738 |
| }, |
| { |
| "epoch": 3.853976531942634, |
| "grad_norm": 0.14994883200639647, |
| "learning_rate": 1.1845327790710276e-05, |
| "loss": 0.747, |
| "step": 739 |
| }, |
| { |
| "epoch": 3.8591916558018253, |
| "grad_norm": 0.14941004189650142, |
| "learning_rate": 1.1741601356847365e-05, |
| "loss": 0.7541, |
| "step": 740 |
| }, |
| { |
| "epoch": 3.864406779661017, |
| "grad_norm": 0.16071992101640972, |
| "learning_rate": 1.1638252895726327e-05, |
| "loss": 0.7496, |
| "step": 741 |
| }, |
| { |
| "epoch": 3.8696219035202084, |
| "grad_norm": 0.1374205892222837, |
| "learning_rate": 1.1535283789693756e-05, |
| "loss": 0.753, |
| "step": 742 |
| }, |
| { |
| "epoch": 3.8748370273794004, |
| "grad_norm": 0.1394433360371365, |
| "learning_rate": 1.1432695416022158e-05, |
| "loss": 0.7535, |
| "step": 743 |
| }, |
| { |
| "epoch": 3.880052151238592, |
| "grad_norm": 0.16866985633081483, |
| "learning_rate": 1.1330489146891494e-05, |
| "loss": 0.7517, |
| "step": 744 |
| }, |
| { |
| "epoch": 3.8852672750977835, |
| "grad_norm": 0.15730043046261316, |
| "learning_rate": 1.1228666349370897e-05, |
| "loss": 0.7474, |
| "step": 745 |
| }, |
| { |
| "epoch": 3.8904823989569755, |
| "grad_norm": 0.1848086766193644, |
| "learning_rate": 1.112722838540031e-05, |
| "loss": 0.7615, |
| "step": 746 |
| }, |
| { |
| "epoch": 3.895697522816167, |
| "grad_norm": 0.16625357168861077, |
| "learning_rate": 1.1026176611772286e-05, |
| "loss": 0.7488, |
| "step": 747 |
| }, |
| { |
| "epoch": 3.9009126466753585, |
| "grad_norm": 0.1629255000824365, |
| "learning_rate": 1.0925512380113892e-05, |
| "loss": 0.7577, |
| "step": 748 |
| }, |
| { |
| "epoch": 3.90612777053455, |
| "grad_norm": 0.18675770122521296, |
| "learning_rate": 1.0825237036868575e-05, |
| "loss": 0.7531, |
| "step": 749 |
| }, |
| { |
| "epoch": 3.9113428943937416, |
| "grad_norm": 0.15755029689979122, |
| "learning_rate": 1.0725351923278144e-05, |
| "loss": 0.7548, |
| "step": 750 |
| }, |
| { |
| "epoch": 3.9165580182529336, |
| "grad_norm": 0.18913533326276255, |
| "learning_rate": 1.0625858375364917e-05, |
| "loss": 0.754, |
| "step": 751 |
| }, |
| { |
| "epoch": 3.921773142112125, |
| "grad_norm": 0.18170716935082054, |
| "learning_rate": 1.0526757723913735e-05, |
| "loss": 0.7441, |
| "step": 752 |
| }, |
| { |
| "epoch": 3.9269882659713167, |
| "grad_norm": 0.13608488064261914, |
| "learning_rate": 1.042805129445423e-05, |
| "loss": 0.7508, |
| "step": 753 |
| }, |
| { |
| "epoch": 3.9322033898305087, |
| "grad_norm": 0.17632467806971025, |
| "learning_rate": 1.0329740407243105e-05, |
| "loss": 0.7747, |
| "step": 754 |
| }, |
| { |
| "epoch": 3.9374185136897, |
| "grad_norm": 0.17072961754308208, |
| "learning_rate": 1.0231826377246437e-05, |
| "loss": 0.7462, |
| "step": 755 |
| }, |
| { |
| "epoch": 3.9426336375488917, |
| "grad_norm": 0.16478645139827036, |
| "learning_rate": 1.0134310514122082e-05, |
| "loss": 0.7563, |
| "step": 756 |
| }, |
| { |
| "epoch": 3.9478487614080837, |
| "grad_norm": 0.17542376854887573, |
| "learning_rate": 1.0037194122202227e-05, |
| "loss": 0.7567, |
| "step": 757 |
| }, |
| { |
| "epoch": 3.9530638852672753, |
| "grad_norm": 0.17159268136488617, |
| "learning_rate": 9.940478500475858e-06, |
| "loss": 0.7503, |
| "step": 758 |
| }, |
| { |
| "epoch": 3.958279009126467, |
| "grad_norm": 0.15300026372913983, |
| "learning_rate": 9.844164942571424e-06, |
| "loss": 0.7523, |
| "step": 759 |
| }, |
| { |
| "epoch": 3.9634941329856583, |
| "grad_norm": 0.159680621058249, |
| "learning_rate": 9.748254736739571e-06, |
| "loss": 0.757, |
| "step": 760 |
| }, |
| { |
| "epoch": 3.96870925684485, |
| "grad_norm": 0.15768593487660648, |
| "learning_rate": 9.652749165835828e-06, |
| "loss": 0.7458, |
| "step": 761 |
| }, |
| { |
| "epoch": 3.973924380704042, |
| "grad_norm": 0.16943127341342135, |
| "learning_rate": 9.557649507303508e-06, |
| "loss": 0.7456, |
| "step": 762 |
| }, |
| { |
| "epoch": 3.9791395045632334, |
| "grad_norm": 0.14842747623637131, |
| "learning_rate": 9.462957033156632e-06, |
| "loss": 0.7497, |
| "step": 763 |
| }, |
| { |
| "epoch": 3.984354628422425, |
| "grad_norm": 0.1501263870664962, |
| "learning_rate": 9.368673009962852e-06, |
| "loss": 0.7514, |
| "step": 764 |
| }, |
| { |
| "epoch": 3.989569752281617, |
| "grad_norm": 0.17717065342340108, |
| "learning_rate": 9.274798698826557e-06, |
| "loss": 0.7531, |
| "step": 765 |
| }, |
| { |
| "epoch": 3.9947848761408085, |
| "grad_norm": 0.15866607398462376, |
| "learning_rate": 9.181335355372028e-06, |
| "loss": 0.7512, |
| "step": 766 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 0.17952277081095838, |
| "learning_rate": 9.088284229726572e-06, |
| "loss": 0.9412, |
| "step": 767 |
| }, |
| { |
| "epoch": 4.005215123859192, |
| "grad_norm": 0.23274451897753248, |
| "learning_rate": 8.995646566503838e-06, |
| "loss": 0.7375, |
| "step": 768 |
| }, |
| { |
| "epoch": 4.010430247718383, |
| "grad_norm": 0.17288112228894242, |
| "learning_rate": 8.90342360478723e-06, |
| "loss": 0.7373, |
| "step": 769 |
| }, |
| { |
| "epoch": 4.015645371577575, |
| "grad_norm": 0.20174270563988336, |
| "learning_rate": 8.81161657811318e-06, |
| "loss": 0.7453, |
| "step": 770 |
| }, |
| { |
| "epoch": 4.020860495436767, |
| "grad_norm": 0.1726715461295719, |
| "learning_rate": 8.720226714454831e-06, |
| "loss": 0.7421, |
| "step": 771 |
| }, |
| { |
| "epoch": 4.026075619295958, |
| "grad_norm": 0.20664612564398913, |
| "learning_rate": 8.629255236205475e-06, |
| "loss": 0.7393, |
| "step": 772 |
| }, |
| { |
| "epoch": 4.03129074315515, |
| "grad_norm": 0.183032821290067, |
| "learning_rate": 8.538703360162267e-06, |
| "loss": 0.7262, |
| "step": 773 |
| }, |
| { |
| "epoch": 4.036505867014341, |
| "grad_norm": 0.19162204178734604, |
| "learning_rate": 8.44857229750994e-06, |
| "loss": 0.7229, |
| "step": 774 |
| }, |
| { |
| "epoch": 4.041720990873533, |
| "grad_norm": 0.19714648419474126, |
| "learning_rate": 8.3588632538046e-06, |
| "loss": 0.7406, |
| "step": 775 |
| }, |
| { |
| "epoch": 4.046936114732725, |
| "grad_norm": 0.18562646027448648, |
| "learning_rate": 8.269577428957571e-06, |
| "loss": 0.7375, |
| "step": 776 |
| }, |
| { |
| "epoch": 4.052151238591916, |
| "grad_norm": 0.19319081667416735, |
| "learning_rate": 8.180716017219433e-06, |
| "loss": 0.7315, |
| "step": 777 |
| }, |
| { |
| "epoch": 4.057366362451108, |
| "grad_norm": 0.17933639021135425, |
| "learning_rate": 8.092280207163941e-06, |
| "loss": 0.7439, |
| "step": 778 |
| }, |
| { |
| "epoch": 4.0625814863103, |
| "grad_norm": 0.16796265908006924, |
| "learning_rate": 8.004271181672178e-06, |
| "loss": 0.7219, |
| "step": 779 |
| }, |
| { |
| "epoch": 4.067796610169491, |
| "grad_norm": 0.16506690334266957, |
| "learning_rate": 7.916690117916772e-06, |
| "loss": 0.7409, |
| "step": 780 |
| }, |
| { |
| "epoch": 4.073011734028683, |
| "grad_norm": 0.16443977378096947, |
| "learning_rate": 7.829538187346077e-06, |
| "loss": 0.7377, |
| "step": 781 |
| }, |
| { |
| "epoch": 4.078226857887874, |
| "grad_norm": 0.1997012117535286, |
| "learning_rate": 7.742816555668532e-06, |
| "loss": 0.7399, |
| "step": 782 |
| }, |
| { |
| "epoch": 4.083441981747066, |
| "grad_norm": 0.14695927923208477, |
| "learning_rate": 7.656526382837106e-06, |
| "loss": 0.7327, |
| "step": 783 |
| }, |
| { |
| "epoch": 4.088657105606258, |
| "grad_norm": 0.1799749838620325, |
| "learning_rate": 7.570668823033727e-06, |
| "loss": 0.7309, |
| "step": 784 |
| }, |
| { |
| "epoch": 4.0938722294654495, |
| "grad_norm": 0.17989297489241943, |
| "learning_rate": 7.4852450246538685e-06, |
| "loss": 0.7343, |
| "step": 785 |
| }, |
| { |
| "epoch": 4.0990873533246415, |
| "grad_norm": 0.15506025585209593, |
| "learning_rate": 7.40025613029121e-06, |
| "loss": 0.736, |
| "step": 786 |
| }, |
| { |
| "epoch": 4.1043024771838335, |
| "grad_norm": 0.18105126244534, |
| "learning_rate": 7.315703276722317e-06, |
| "loss": 0.7342, |
| "step": 787 |
| }, |
| { |
| "epoch": 4.1095176010430245, |
| "grad_norm": 0.19050717050961422, |
| "learning_rate": 7.231587594891438e-06, |
| "loss": 0.7458, |
| "step": 788 |
| }, |
| { |
| "epoch": 4.1147327249022165, |
| "grad_norm": 0.14216335950493533, |
| "learning_rate": 7.147910209895435e-06, |
| "loss": 0.7318, |
| "step": 789 |
| }, |
| { |
| "epoch": 4.1199478487614085, |
| "grad_norm": 0.1655844728014387, |
| "learning_rate": 7.064672240968638e-06, |
| "loss": 0.7442, |
| "step": 790 |
| }, |
| { |
| "epoch": 4.1251629726206, |
| "grad_norm": 0.14883381171151777, |
| "learning_rate": 6.9818748014679785e-06, |
| "loss": 0.7403, |
| "step": 791 |
| }, |
| { |
| "epoch": 4.130378096479792, |
| "grad_norm": 0.1602685057342838, |
| "learning_rate": 6.899518998858017e-06, |
| "loss": 0.7405, |
| "step": 792 |
| }, |
| { |
| "epoch": 4.135593220338983, |
| "grad_norm": 0.1649659601993552, |
| "learning_rate": 6.81760593469615e-06, |
| "loss": 0.7346, |
| "step": 793 |
| }, |
| { |
| "epoch": 4.140808344198175, |
| "grad_norm": 0.12921893028933829, |
| "learning_rate": 6.7361367046179325e-06, |
| "loss": 0.7303, |
| "step": 794 |
| }, |
| { |
| "epoch": 4.146023468057367, |
| "grad_norm": 0.12489877860585255, |
| "learning_rate": 6.655112398322332e-06, |
| "loss": 0.731, |
| "step": 795 |
| }, |
| { |
| "epoch": 4.151238591916558, |
| "grad_norm": 0.17428621687246298, |
| "learning_rate": 6.574534099557204e-06, |
| "loss": 0.7307, |
| "step": 796 |
| }, |
| { |
| "epoch": 4.15645371577575, |
| "grad_norm": 0.1251987161714777, |
| "learning_rate": 6.49440288610482e-06, |
| "loss": 0.7275, |
| "step": 797 |
| }, |
| { |
| "epoch": 4.161668839634942, |
| "grad_norm": 0.15533418379259104, |
| "learning_rate": 6.414719829767406e-06, |
| "loss": 0.7339, |
| "step": 798 |
| }, |
| { |
| "epoch": 4.166883963494133, |
| "grad_norm": 0.16236363314856642, |
| "learning_rate": 6.335485996352782e-06, |
| "loss": 0.7418, |
| "step": 799 |
| }, |
| { |
| "epoch": 4.172099087353325, |
| "grad_norm": 0.13801981121095086, |
| "learning_rate": 6.256702445660221e-06, |
| "loss": 0.7304, |
| "step": 800 |
| }, |
| { |
| "epoch": 4.177314211212516, |
| "grad_norm": 0.14142425026968236, |
| "learning_rate": 6.178370231466142e-06, |
| "loss": 0.743, |
| "step": 801 |
| }, |
| { |
| "epoch": 4.182529335071708, |
| "grad_norm": 0.16216801700272557, |
| "learning_rate": 6.100490401510089e-06, |
| "loss": 0.7437, |
| "step": 802 |
| }, |
| { |
| "epoch": 4.1877444589309, |
| "grad_norm": 0.14030637701048476, |
| "learning_rate": 6.0230639974807206e-06, |
| "loss": 0.7383, |
| "step": 803 |
| }, |
| { |
| "epoch": 4.192959582790091, |
| "grad_norm": 0.13754640300037912, |
| "learning_rate": 5.946092055001824e-06, |
| "loss": 0.7329, |
| "step": 804 |
| }, |
| { |
| "epoch": 4.198174706649283, |
| "grad_norm": 0.16139415971113086, |
| "learning_rate": 5.869575603618507e-06, |
| "loss": 0.7417, |
| "step": 805 |
| }, |
| { |
| "epoch": 4.203389830508475, |
| "grad_norm": 0.15626356140455866, |
| "learning_rate": 5.793515666783429e-06, |
| "loss": 0.7337, |
| "step": 806 |
| }, |
| { |
| "epoch": 4.208604954367666, |
| "grad_norm": 0.12177414306535114, |
| "learning_rate": 5.717913261843078e-06, |
| "loss": 0.733, |
| "step": 807 |
| }, |
| { |
| "epoch": 4.213820078226858, |
| "grad_norm": 0.12926025156980447, |
| "learning_rate": 5.642769400024199e-06, |
| "loss": 0.7422, |
| "step": 808 |
| }, |
| { |
| "epoch": 4.219035202086049, |
| "grad_norm": 0.13704261747895924, |
| "learning_rate": 5.568085086420247e-06, |
| "loss": 0.7404, |
| "step": 809 |
| }, |
| { |
| "epoch": 4.224250325945241, |
| "grad_norm": 0.13842188890961235, |
| "learning_rate": 5.4938613199779334e-06, |
| "loss": 0.7334, |
| "step": 810 |
| }, |
| { |
| "epoch": 4.229465449804433, |
| "grad_norm": 0.13979962035636995, |
| "learning_rate": 5.420099093483911e-06, |
| "loss": 0.7222, |
| "step": 811 |
| }, |
| { |
| "epoch": 4.234680573663624, |
| "grad_norm": 0.12565163857223183, |
| "learning_rate": 5.346799393551436e-06, |
| "loss": 0.7307, |
| "step": 812 |
| }, |
| { |
| "epoch": 4.239895697522816, |
| "grad_norm": 0.12679243834545414, |
| "learning_rate": 5.273963200607206e-06, |
| "loss": 0.7387, |
| "step": 813 |
| }, |
| { |
| "epoch": 4.245110821382008, |
| "grad_norm": 0.13813903814909753, |
| "learning_rate": 5.201591488878257e-06, |
| "loss": 0.7251, |
| "step": 814 |
| }, |
| { |
| "epoch": 4.250325945241199, |
| "grad_norm": 0.1285263356419264, |
| "learning_rate": 5.129685226378893e-06, |
| "loss": 0.7234, |
| "step": 815 |
| }, |
| { |
| "epoch": 4.255541069100391, |
| "grad_norm": 0.12820199717697753, |
| "learning_rate": 5.058245374897763e-06, |
| "loss": 0.7271, |
| "step": 816 |
| }, |
| { |
| "epoch": 4.260756192959583, |
| "grad_norm": 0.12495602397125814, |
| "learning_rate": 4.987272889985009e-06, |
| "loss": 0.7393, |
| "step": 817 |
| }, |
| { |
| "epoch": 4.265971316818774, |
| "grad_norm": 0.15151715317554978, |
| "learning_rate": 4.916768720939451e-06, |
| "loss": 0.7332, |
| "step": 818 |
| }, |
| { |
| "epoch": 4.271186440677966, |
| "grad_norm": 0.14747133461014475, |
| "learning_rate": 4.846733810795914e-06, |
| "loss": 0.7297, |
| "step": 819 |
| }, |
| { |
| "epoch": 4.276401564537157, |
| "grad_norm": 0.14100514701144612, |
| "learning_rate": 4.777169096312624e-06, |
| "loss": 0.7276, |
| "step": 820 |
| }, |
| { |
| "epoch": 4.281616688396349, |
| "grad_norm": 0.1646113420856527, |
| "learning_rate": 4.708075507958642e-06, |
| "loss": 0.7281, |
| "step": 821 |
| }, |
| { |
| "epoch": 4.286831812255541, |
| "grad_norm": 0.13983624281016735, |
| "learning_rate": 4.639453969901442e-06, |
| "loss": 0.7276, |
| "step": 822 |
| }, |
| { |
| "epoch": 4.292046936114732, |
| "grad_norm": 0.13237794200876413, |
| "learning_rate": 4.571305399994561e-06, |
| "loss": 0.7409, |
| "step": 823 |
| }, |
| { |
| "epoch": 4.297262059973924, |
| "grad_norm": 0.180777223722747, |
| "learning_rate": 4.503630709765294e-06, |
| "loss": 0.741, |
| "step": 824 |
| }, |
| { |
| "epoch": 4.302477183833116, |
| "grad_norm": 0.1367694718240071, |
| "learning_rate": 4.436430804402521e-06, |
| "loss": 0.7331, |
| "step": 825 |
| }, |
| { |
| "epoch": 4.3076923076923075, |
| "grad_norm": 0.15547588436169807, |
| "learning_rate": 4.3697065827446e-06, |
| "loss": 0.7376, |
| "step": 826 |
| }, |
| { |
| "epoch": 4.3129074315514995, |
| "grad_norm": 0.17342743882348932, |
| "learning_rate": 4.303458937267326e-06, |
| "loss": 0.7267, |
| "step": 827 |
| }, |
| { |
| "epoch": 4.318122555410691, |
| "grad_norm": 0.16682655867364918, |
| "learning_rate": 4.237688754072022e-06, |
| "loss": 0.7309, |
| "step": 828 |
| }, |
| { |
| "epoch": 4.3233376792698825, |
| "grad_norm": 0.16450623559503522, |
| "learning_rate": 4.172396912873659e-06, |
| "loss": 0.7342, |
| "step": 829 |
| }, |
| { |
| "epoch": 4.3285528031290745, |
| "grad_norm": 0.19380979384775532, |
| "learning_rate": 4.107584286989097e-06, |
| "loss": 0.7392, |
| "step": 830 |
| }, |
| { |
| "epoch": 4.333767926988266, |
| "grad_norm": 0.18721173933947008, |
| "learning_rate": 4.043251743325436e-06, |
| "loss": 0.7303, |
| "step": 831 |
| }, |
| { |
| "epoch": 4.338983050847458, |
| "grad_norm": 0.15835333882238062, |
| "learning_rate": 3.979400142368368e-06, |
| "loss": 0.7301, |
| "step": 832 |
| }, |
| { |
| "epoch": 4.34419817470665, |
| "grad_norm": 0.18385561342339599, |
| "learning_rate": 3.916030338170696e-06, |
| "loss": 0.7374, |
| "step": 833 |
| }, |
| { |
| "epoch": 4.349413298565841, |
| "grad_norm": 0.2022205872843893, |
| "learning_rate": 3.8531431783409165e-06, |
| "loss": 0.7387, |
| "step": 834 |
| }, |
| { |
| "epoch": 4.354628422425033, |
| "grad_norm": 0.15316197738525242, |
| "learning_rate": 3.7907395040318685e-06, |
| "loss": 0.7385, |
| "step": 835 |
| }, |
| { |
| "epoch": 4.359843546284225, |
| "grad_norm": 0.1500484412205804, |
| "learning_rate": 3.7288201499294797e-06, |
| "loss": 0.7268, |
| "step": 836 |
| }, |
| { |
| "epoch": 4.365058670143416, |
| "grad_norm": 0.14189535258541974, |
| "learning_rate": 3.667385944241626e-06, |
| "loss": 0.7453, |
| "step": 837 |
| }, |
| { |
| "epoch": 4.370273794002608, |
| "grad_norm": 0.15984630251802126, |
| "learning_rate": 3.606437708687023e-06, |
| "loss": 0.7395, |
| "step": 838 |
| }, |
| { |
| "epoch": 4.375488917861799, |
| "grad_norm": 0.1642822923840156, |
| "learning_rate": 3.545976258484247e-06, |
| "loss": 0.7354, |
| "step": 839 |
| }, |
| { |
| "epoch": 4.380704041720991, |
| "grad_norm": 0.13221533894356902, |
| "learning_rate": 3.4860024023408577e-06, |
| "loss": 0.7459, |
| "step": 840 |
| }, |
| { |
| "epoch": 4.385919165580183, |
| "grad_norm": 0.18318603173559903, |
| "learning_rate": 3.426516942442524e-06, |
| "loss": 0.7509, |
| "step": 841 |
| }, |
| { |
| "epoch": 4.391134289439374, |
| "grad_norm": 0.17040203525878148, |
| "learning_rate": 3.3675206744423394e-06, |
| "loss": 0.7375, |
| "step": 842 |
| }, |
| { |
| "epoch": 4.396349413298566, |
| "grad_norm": 0.157205271311182, |
| "learning_rate": 3.309014387450189e-06, |
| "loss": 0.7302, |
| "step": 843 |
| }, |
| { |
| "epoch": 4.401564537157758, |
| "grad_norm": 0.15260420463119376, |
| "learning_rate": 3.2509988640221414e-06, |
| "loss": 0.7407, |
| "step": 844 |
| }, |
| { |
| "epoch": 4.406779661016949, |
| "grad_norm": 0.16154440296062286, |
| "learning_rate": 3.1934748801500184e-06, |
| "loss": 0.7255, |
| "step": 845 |
| }, |
| { |
| "epoch": 4.411994784876141, |
| "grad_norm": 0.14388929124722757, |
| "learning_rate": 3.136443205251034e-06, |
| "loss": 0.7369, |
| "step": 846 |
| }, |
| { |
| "epoch": 4.417209908735332, |
| "grad_norm": 0.14793139417234433, |
| "learning_rate": 3.0799046021574574e-06, |
| "loss": 0.7337, |
| "step": 847 |
| }, |
| { |
| "epoch": 4.422425032594524, |
| "grad_norm": 0.14906886650426293, |
| "learning_rate": 3.0238598271064323e-06, |
| "loss": 0.7319, |
| "step": 848 |
| }, |
| { |
| "epoch": 4.427640156453716, |
| "grad_norm": 0.18885017044608088, |
| "learning_rate": 2.9683096297298753e-06, |
| "loss": 0.7226, |
| "step": 849 |
| }, |
| { |
| "epoch": 4.432855280312907, |
| "grad_norm": 0.16698890562230234, |
| "learning_rate": 2.9132547530444254e-06, |
| "loss": 0.74, |
| "step": 850 |
| }, |
| { |
| "epoch": 4.438070404172099, |
| "grad_norm": 0.12976275259498832, |
| "learning_rate": 2.8586959334415064e-06, |
| "loss": 0.7325, |
| "step": 851 |
| }, |
| { |
| "epoch": 4.443285528031291, |
| "grad_norm": 0.14807620276113978, |
| "learning_rate": 2.8046339006775115e-06, |
| "loss": 0.7197, |
| "step": 852 |
| }, |
| { |
| "epoch": 4.448500651890482, |
| "grad_norm": 0.15235231363332793, |
| "learning_rate": 2.751069377863993e-06, |
| "loss": 0.7302, |
| "step": 853 |
| }, |
| { |
| "epoch": 4.453715775749674, |
| "grad_norm": 0.15202060711926385, |
| "learning_rate": 2.698003081458018e-06, |
| "loss": 0.7289, |
| "step": 854 |
| }, |
| { |
| "epoch": 4.458930899608866, |
| "grad_norm": 0.12630616368200856, |
| "learning_rate": 2.6454357212526026e-06, |
| "loss": 0.7279, |
| "step": 855 |
| }, |
| { |
| "epoch": 4.464146023468057, |
| "grad_norm": 0.11739628579126643, |
| "learning_rate": 2.5933680003671625e-06, |
| "loss": 0.7335, |
| "step": 856 |
| }, |
| { |
| "epoch": 4.469361147327249, |
| "grad_norm": 0.1434952804888338, |
| "learning_rate": 2.541800615238175e-06, |
| "loss": 0.7266, |
| "step": 857 |
| }, |
| { |
| "epoch": 4.47457627118644, |
| "grad_norm": 0.176277045965935, |
| "learning_rate": 2.4907342556098125e-06, |
| "loss": 0.7269, |
| "step": 858 |
| }, |
| { |
| "epoch": 4.479791395045632, |
| "grad_norm": 0.18491464302255675, |
| "learning_rate": 2.440169604524738e-06, |
| "loss": 0.7453, |
| "step": 859 |
| }, |
| { |
| "epoch": 4.485006518904824, |
| "grad_norm": 0.12415001369259007, |
| "learning_rate": 2.390107338314982e-06, |
| "loss": 0.7319, |
| "step": 860 |
| }, |
| { |
| "epoch": 4.490221642764015, |
| "grad_norm": 0.10478588253479926, |
| "learning_rate": 2.340548126592861e-06, |
| "loss": 0.737, |
| "step": 861 |
| }, |
| { |
| "epoch": 4.495436766623207, |
| "grad_norm": 0.14165809120057415, |
| "learning_rate": 2.2914926322420515e-06, |
| "loss": 0.7343, |
| "step": 862 |
| }, |
| { |
| "epoch": 4.500651890482399, |
| "grad_norm": 0.11590106390711405, |
| "learning_rate": 2.242941511408714e-06, |
| "loss": 0.737, |
| "step": 863 |
| }, |
| { |
| "epoch": 4.50586701434159, |
| "grad_norm": 0.12634034846215603, |
| "learning_rate": 2.1948954134927103e-06, |
| "loss": 0.7377, |
| "step": 864 |
| }, |
| { |
| "epoch": 4.511082138200782, |
| "grad_norm": 0.11268712032715957, |
| "learning_rate": 2.147354981138925e-06, |
| "loss": 0.7257, |
| "step": 865 |
| }, |
| { |
| "epoch": 4.5162972620599735, |
| "grad_norm": 0.11916219994594991, |
| "learning_rate": 2.1003208502286744e-06, |
| "loss": 0.729, |
| "step": 866 |
| }, |
| { |
| "epoch": 4.5215123859191655, |
| "grad_norm": 0.13119722935564196, |
| "learning_rate": 2.0537936498711898e-06, |
| "loss": 0.7316, |
| "step": 867 |
| }, |
| { |
| "epoch": 4.5267275097783575, |
| "grad_norm": 0.12883622307786546, |
| "learning_rate": 2.007774002395193e-06, |
| "loss": 0.7337, |
| "step": 868 |
| }, |
| { |
| "epoch": 4.531942633637549, |
| "grad_norm": 0.10574304590110693, |
| "learning_rate": 1.9622625233406143e-06, |
| "loss": 0.7268, |
| "step": 869 |
| }, |
| { |
| "epoch": 4.5371577574967406, |
| "grad_norm": 0.11189460238334994, |
| "learning_rate": 1.9172598214503146e-06, |
| "loss": 0.736, |
| "step": 870 |
| }, |
| { |
| "epoch": 4.5423728813559325, |
| "grad_norm": 0.12169459450452168, |
| "learning_rate": 1.8727664986619576e-06, |
| "loss": 0.7359, |
| "step": 871 |
| }, |
| { |
| "epoch": 4.547588005215124, |
| "grad_norm": 0.11602787776789947, |
| "learning_rate": 1.8287831500999775e-06, |
| "loss": 0.733, |
| "step": 872 |
| }, |
| { |
| "epoch": 4.552803129074316, |
| "grad_norm": 0.11542784974890338, |
| "learning_rate": 1.7853103640675852e-06, |
| "loss": 0.7274, |
| "step": 873 |
| }, |
| { |
| "epoch": 4.558018252933508, |
| "grad_norm": 0.10042034550407807, |
| "learning_rate": 1.74234872203892e-06, |
| "loss": 0.7346, |
| "step": 874 |
| }, |
| { |
| "epoch": 4.563233376792699, |
| "grad_norm": 0.11267524138272358, |
| "learning_rate": 1.699898798651285e-06, |
| "loss": 0.7355, |
| "step": 875 |
| }, |
| { |
| "epoch": 4.568448500651891, |
| "grad_norm": 0.14020374405591798, |
| "learning_rate": 1.6579611616974212e-06, |
| "loss": 0.7323, |
| "step": 876 |
| }, |
| { |
| "epoch": 4.573663624511082, |
| "grad_norm": 0.1329010216092144, |
| "learning_rate": 1.6165363721179471e-06, |
| "loss": 0.7333, |
| "step": 877 |
| }, |
| { |
| "epoch": 4.578878748370274, |
| "grad_norm": 0.1111745646565014, |
| "learning_rate": 1.575624983993853e-06, |
| "loss": 0.7198, |
| "step": 878 |
| }, |
| { |
| "epoch": 4.584093872229466, |
| "grad_norm": 0.12252637821649354, |
| "learning_rate": 1.5352275445390752e-06, |
| "loss": 0.7265, |
| "step": 879 |
| }, |
| { |
| "epoch": 4.589308996088657, |
| "grad_norm": 0.10588650578235918, |
| "learning_rate": 1.4953445940931688e-06, |
| "loss": 0.7328, |
| "step": 880 |
| }, |
| { |
| "epoch": 4.594524119947849, |
| "grad_norm": 0.10242904609486453, |
| "learning_rate": 1.4559766661141317e-06, |
| "loss": 0.7305, |
| "step": 881 |
| }, |
| { |
| "epoch": 4.599739243807041, |
| "grad_norm": 0.11484087780107081, |
| "learning_rate": 1.4171242871711922e-06, |
| "loss": 0.7432, |
| "step": 882 |
| }, |
| { |
| "epoch": 4.604954367666232, |
| "grad_norm": 0.10614866491620119, |
| "learning_rate": 1.3787879769378277e-06, |
| "loss": 0.7317, |
| "step": 883 |
| }, |
| { |
| "epoch": 4.610169491525424, |
| "grad_norm": 0.11216621248162341, |
| "learning_rate": 1.3409682481848063e-06, |
| "loss": 0.7379, |
| "step": 884 |
| }, |
| { |
| "epoch": 4.615384615384615, |
| "grad_norm": 0.09993770692863328, |
| "learning_rate": 1.3036656067732679e-06, |
| "loss": 0.739, |
| "step": 885 |
| }, |
| { |
| "epoch": 4.620599739243807, |
| "grad_norm": 0.09930632699157292, |
| "learning_rate": 1.2668805516480577e-06, |
| "loss": 0.743, |
| "step": 886 |
| }, |
| { |
| "epoch": 4.625814863102999, |
| "grad_norm": 0.1008961163196706, |
| "learning_rate": 1.2306135748309633e-06, |
| "loss": 0.746, |
| "step": 887 |
| }, |
| { |
| "epoch": 4.63102998696219, |
| "grad_norm": 0.1255524490616452, |
| "learning_rate": 1.1948651614141783e-06, |
| "loss": 0.7317, |
| "step": 888 |
| }, |
| { |
| "epoch": 4.636245110821382, |
| "grad_norm": 0.10155914604030952, |
| "learning_rate": 1.1596357895538213e-06, |
| "loss": 0.7213, |
| "step": 889 |
| }, |
| { |
| "epoch": 4.641460234680574, |
| "grad_norm": 0.10501816031109035, |
| "learning_rate": 1.1249259304634986e-06, |
| "loss": 0.7398, |
| "step": 890 |
| }, |
| { |
| "epoch": 4.646675358539765, |
| "grad_norm": 0.13819236460373796, |
| "learning_rate": 1.0907360484080409e-06, |
| "loss": 0.7368, |
| "step": 891 |
| }, |
| { |
| "epoch": 4.651890482398957, |
| "grad_norm": 0.10830515113482954, |
| "learning_rate": 1.0570666006972875e-06, |
| "loss": 0.7543, |
| "step": 892 |
| }, |
| { |
| "epoch": 4.657105606258149, |
| "grad_norm": 0.10654993027327486, |
| "learning_rate": 1.023918037679943e-06, |
| "loss": 0.7243, |
| "step": 893 |
| }, |
| { |
| "epoch": 4.66232073011734, |
| "grad_norm": 0.10012139135715376, |
| "learning_rate": 9.912908027375745e-07, |
| "loss": 0.7373, |
| "step": 894 |
| }, |
| { |
| "epoch": 4.667535853976532, |
| "grad_norm": 0.0950421289561811, |
| "learning_rate": 9.591853322787003e-07, |
| "loss": 0.7412, |
| "step": 895 |
| }, |
| { |
| "epoch": 4.672750977835723, |
| "grad_norm": 0.09904664343265647, |
| "learning_rate": 9.276020557329101e-07, |
| "loss": 0.729, |
| "step": 896 |
| }, |
| { |
| "epoch": 4.677966101694915, |
| "grad_norm": 0.1051253462879674, |
| "learning_rate": 8.965413955451363e-07, |
| "loss": 0.7329, |
| "step": 897 |
| }, |
| { |
| "epoch": 4.683181225554107, |
| "grad_norm": 0.10272032718312339, |
| "learning_rate": 8.660037671700405e-07, |
| "loss": 0.7387, |
| "step": 898 |
| }, |
| { |
| "epoch": 4.688396349413298, |
| "grad_norm": 0.09813662944001211, |
| "learning_rate": 8.359895790663963e-07, |
| "loss": 0.7377, |
| "step": 899 |
| }, |
| { |
| "epoch": 4.69361147327249, |
| "grad_norm": 0.10536763446890161, |
| "learning_rate": 8.064992326916577e-07, |
| "loss": 0.7353, |
| "step": 900 |
| }, |
| { |
| "epoch": 4.698826597131681, |
| "grad_norm": 0.10236023154880665, |
| "learning_rate": 7.775331224966076e-07, |
| "loss": 0.7272, |
| "step": 901 |
| }, |
| { |
| "epoch": 4.704041720990873, |
| "grad_norm": 0.09414675460502064, |
| "learning_rate": 7.49091635920034e-07, |
| "loss": 0.7292, |
| "step": 902 |
| }, |
| { |
| "epoch": 4.709256844850065, |
| "grad_norm": 0.0943501127919075, |
| "learning_rate": 7.211751533835998e-07, |
| "loss": 0.7416, |
| "step": 903 |
| }, |
| { |
| "epoch": 4.7144719687092564, |
| "grad_norm": 0.1021023935601688, |
| "learning_rate": 6.937840482867142e-07, |
| "loss": 0.7337, |
| "step": 904 |
| }, |
| { |
| "epoch": 4.719687092568448, |
| "grad_norm": 0.09881423844437276, |
| "learning_rate": 6.669186870015454e-07, |
| "loss": 0.7324, |
| "step": 905 |
| }, |
| { |
| "epoch": 4.72490221642764, |
| "grad_norm": 0.09987028627197758, |
| "learning_rate": 6.405794288681577e-07, |
| "loss": 0.7374, |
| "step": 906 |
| }, |
| { |
| "epoch": 4.7301173402868315, |
| "grad_norm": 0.09574731953052935, |
| "learning_rate": 6.147666261896445e-07, |
| "loss": 0.7326, |
| "step": 907 |
| }, |
| { |
| "epoch": 4.7353324641460235, |
| "grad_norm": 0.09726177695264816, |
| "learning_rate": 5.894806242274565e-07, |
| "loss": 0.7375, |
| "step": 908 |
| }, |
| { |
| "epoch": 4.7405475880052155, |
| "grad_norm": 0.09663506260906218, |
| "learning_rate": 5.647217611967914e-07, |
| "loss": 0.7366, |
| "step": 909 |
| }, |
| { |
| "epoch": 4.745762711864407, |
| "grad_norm": 0.09898203274690626, |
| "learning_rate": 5.404903682620299e-07, |
| "loss": 0.7282, |
| "step": 910 |
| }, |
| { |
| "epoch": 4.7509778357235986, |
| "grad_norm": 0.09888749932985194, |
| "learning_rate": 5.167867695323426e-07, |
| "loss": 0.7234, |
| "step": 911 |
| }, |
| { |
| "epoch": 4.7561929595827905, |
| "grad_norm": 0.09677300838881348, |
| "learning_rate": 4.936112820573335e-07, |
| "loss": 0.734, |
| "step": 912 |
| }, |
| { |
| "epoch": 4.761408083441982, |
| "grad_norm": 0.09338358111347181, |
| "learning_rate": 4.709642158228134e-07, |
| "loss": 0.7319, |
| "step": 913 |
| }, |
| { |
| "epoch": 4.766623207301174, |
| "grad_norm": 0.09475677222529419, |
| "learning_rate": 4.4884587374663727e-07, |
| "loss": 0.7345, |
| "step": 914 |
| }, |
| { |
| "epoch": 4.771838331160365, |
| "grad_norm": 0.09660842094296682, |
| "learning_rate": 4.2725655167468626e-07, |
| "loss": 0.7328, |
| "step": 915 |
| }, |
| { |
| "epoch": 4.777053455019557, |
| "grad_norm": 0.09522844691697419, |
| "learning_rate": 4.061965383768529e-07, |
| "loss": 0.7395, |
| "step": 916 |
| }, |
| { |
| "epoch": 4.782268578878749, |
| "grad_norm": 0.09658310921334537, |
| "learning_rate": 3.856661155432484e-07, |
| "loss": 0.7282, |
| "step": 917 |
| }, |
| { |
| "epoch": 4.78748370273794, |
| "grad_norm": 0.09209868347782292, |
| "learning_rate": 3.656655577803703e-07, |
| "loss": 0.7321, |
| "step": 918 |
| }, |
| { |
| "epoch": 4.792698826597132, |
| "grad_norm": 0.09809162403374437, |
| "learning_rate": 3.4619513260746973e-07, |
| "loss": 0.721, |
| "step": 919 |
| }, |
| { |
| "epoch": 4.797913950456323, |
| "grad_norm": 0.09728296962426303, |
| "learning_rate": 3.2725510045297225e-07, |
| "loss": 0.7383, |
| "step": 920 |
| }, |
| { |
| "epoch": 4.803129074315515, |
| "grad_norm": 0.0982190410036559, |
| "learning_rate": 3.0884571465096936e-07, |
| "loss": 0.7356, |
| "step": 921 |
| }, |
| { |
| "epoch": 4.808344198174707, |
| "grad_norm": 0.09138351849890962, |
| "learning_rate": 2.909672214378434e-07, |
| "loss": 0.7375, |
| "step": 922 |
| }, |
| { |
| "epoch": 4.813559322033898, |
| "grad_norm": 0.08931700971308501, |
| "learning_rate": 2.736198599489903e-07, |
| "loss": 0.7399, |
| "step": 923 |
| }, |
| { |
| "epoch": 4.81877444589309, |
| "grad_norm": 0.09559406138911163, |
| "learning_rate": 2.568038622155955e-07, |
| "loss": 0.7509, |
| "step": 924 |
| }, |
| { |
| "epoch": 4.823989569752282, |
| "grad_norm": 0.09150057561208538, |
| "learning_rate": 2.4051945316153846e-07, |
| "loss": 0.7284, |
| "step": 925 |
| }, |
| { |
| "epoch": 4.829204693611473, |
| "grad_norm": 0.09349065890924081, |
| "learning_rate": 2.2476685060039527e-07, |
| "loss": 0.7299, |
| "step": 926 |
| }, |
| { |
| "epoch": 4.834419817470665, |
| "grad_norm": 0.09556536713909304, |
| "learning_rate": 2.0954626523251197e-07, |
| "loss": 0.7276, |
| "step": 927 |
| }, |
| { |
| "epoch": 4.839634941329857, |
| "grad_norm": 0.09471713249250732, |
| "learning_rate": 1.948579006421847e-07, |
| "loss": 0.7428, |
| "step": 928 |
| }, |
| { |
| "epoch": 4.844850065189048, |
| "grad_norm": 0.09109503005212934, |
| "learning_rate": 1.8070195329495944e-07, |
| "loss": 0.7401, |
| "step": 929 |
| }, |
| { |
| "epoch": 4.85006518904824, |
| "grad_norm": 0.09359860047319951, |
| "learning_rate": 1.6707861253497214e-07, |
| "loss": 0.7409, |
| "step": 930 |
| }, |
| { |
| "epoch": 4.855280312907432, |
| "grad_norm": 0.09252775154456866, |
| "learning_rate": 1.539880605824351e-07, |
| "loss": 0.7362, |
| "step": 931 |
| }, |
| { |
| "epoch": 4.860495436766623, |
| "grad_norm": 0.09910390005741515, |
| "learning_rate": 1.4143047253120322e-07, |
| "loss": 0.7332, |
| "step": 932 |
| }, |
| { |
| "epoch": 4.865710560625815, |
| "grad_norm": 0.0900655265782037, |
| "learning_rate": 1.2940601634642059e-07, |
| "loss": 0.7233, |
| "step": 933 |
| }, |
| { |
| "epoch": 4.870925684485006, |
| "grad_norm": 0.09409378931281938, |
| "learning_rate": 1.1791485286227311e-07, |
| "loss": 0.7395, |
| "step": 934 |
| }, |
| { |
| "epoch": 4.876140808344198, |
| "grad_norm": 0.10015850940263663, |
| "learning_rate": 1.0695713577984824e-07, |
| "loss": 0.7231, |
| "step": 935 |
| }, |
| { |
| "epoch": 4.88135593220339, |
| "grad_norm": 0.0903358412925811, |
| "learning_rate": 9.653301166507422e-08, |
| "loss": 0.7351, |
| "step": 936 |
| }, |
| { |
| "epoch": 4.886571056062581, |
| "grad_norm": 0.0902382557022594, |
| "learning_rate": 8.664261994675738e-08, |
| "loss": 0.7333, |
| "step": 937 |
| }, |
| { |
| "epoch": 4.891786179921773, |
| "grad_norm": 0.09428682569360029, |
| "learning_rate": 7.728609291471678e-08, |
| "loss": 0.7353, |
| "step": 938 |
| }, |
| { |
| "epoch": 4.897001303780964, |
| "grad_norm": 0.09406554806150552, |
| "learning_rate": 6.846355571801688e-08, |
| "loss": 0.7387, |
| "step": 939 |
| }, |
| { |
| "epoch": 4.902216427640156, |
| "grad_norm": 0.10257884244588195, |
| "learning_rate": 6.017512636329325e-08, |
| "loss": 0.7327, |
| "step": 940 |
| }, |
| { |
| "epoch": 4.907431551499348, |
| "grad_norm": 0.08993907897520984, |
| "learning_rate": 5.242091571318053e-08, |
| "loss": 0.7323, |
| "step": 941 |
| }, |
| { |
| "epoch": 4.912646675358539, |
| "grad_norm": 0.09752580757548222, |
| "learning_rate": 4.5201027484811365e-08, |
| "loss": 0.7369, |
| "step": 942 |
| }, |
| { |
| "epoch": 4.917861799217731, |
| "grad_norm": 0.09295306846482325, |
| "learning_rate": 3.851555824844866e-08, |
| "loss": 0.7269, |
| "step": 943 |
| }, |
| { |
| "epoch": 4.923076923076923, |
| "grad_norm": 0.09090791000944588, |
| "learning_rate": 3.23645974261888e-08, |
| "loss": 0.72, |
| "step": 944 |
| }, |
| { |
| "epoch": 4.9282920469361144, |
| "grad_norm": 0.0941176803792802, |
| "learning_rate": 2.674822729075377e-08, |
| "loss": 0.7302, |
| "step": 945 |
| }, |
| { |
| "epoch": 4.933507170795306, |
| "grad_norm": 0.09476591232745427, |
| "learning_rate": 2.166652296440752e-08, |
| "loss": 0.7389, |
| "step": 946 |
| }, |
| { |
| "epoch": 4.938722294654498, |
| "grad_norm": 0.09487039421752298, |
| "learning_rate": 1.7119552417943496e-08, |
| "loss": 0.7384, |
| "step": 947 |
| }, |
| { |
| "epoch": 4.9439374185136895, |
| "grad_norm": 0.09350946727423737, |
| "learning_rate": 1.3107376469769783e-08, |
| "loss": 0.7352, |
| "step": 948 |
| }, |
| { |
| "epoch": 4.9491525423728815, |
| "grad_norm": 0.09008974287657726, |
| "learning_rate": 9.630048785105318e-09, |
| "loss": 0.7202, |
| "step": 949 |
| }, |
| { |
| "epoch": 4.9543676662320735, |
| "grad_norm": 0.09424907250830968, |
| "learning_rate": 6.687615875264897e-09, |
| "loss": 0.7352, |
| "step": 950 |
| }, |
| { |
| "epoch": 4.959582790091265, |
| "grad_norm": 0.08996524126321254, |
| "learning_rate": 4.280117097015257e-09, |
| "loss": 0.7342, |
| "step": 951 |
| }, |
| { |
| "epoch": 4.9647979139504566, |
| "grad_norm": 0.09830845644976817, |
| "learning_rate": 2.407584652073247e-09, |
| "loss": 0.7357, |
| "step": 952 |
| }, |
| { |
| "epoch": 4.970013037809648, |
| "grad_norm": 0.0969409044778751, |
| "learning_rate": 1.0700435866706216e-09, |
| "loss": 0.7212, |
| "step": 953 |
| }, |
| { |
| "epoch": 4.97522816166884, |
| "grad_norm": 0.09285007352436725, |
| "learning_rate": 2.6751179119877124e-10, |
| "loss": 0.7353, |
| "step": 954 |
| }, |
| { |
| "epoch": 4.980443285528032, |
| "grad_norm": 0.09324700428805709, |
| "learning_rate": 0.0, |
| "loss": 0.7386, |
| "step": 955 |
| }, |
| { |
| "epoch": 4.980443285528032, |
| "step": 955, |
| "total_flos": 1.9663195796108476e+19, |
| "train_loss": 0.7990808349005215, |
| "train_runtime": 49935.3477, |
| "train_samples_per_second": 9.826, |
| "train_steps_per_second": 0.019 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 955, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.9663195796108476e+19, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |