diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,90846 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 12972, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 7.70891150169596e-05, + "grad_norm": 8.23936653137207, + "learning_rate": 2.5641025641025643e-08, + "loss": 1.5914, + "step": 1 + }, + { + "epoch": 0.0001541782300339192, + "grad_norm": 8.8346586227417, + "learning_rate": 5.1282051282051286e-08, + "loss": 1.5383, + "step": 2 + }, + { + "epoch": 0.00023126734505087883, + "grad_norm": 10.278841972351074, + "learning_rate": 7.692307692307694e-08, + "loss": 1.6608, + "step": 3 + }, + { + "epoch": 0.0003083564600678384, + "grad_norm": 11.908944129943848, + "learning_rate": 1.0256410256410257e-07, + "loss": 1.6491, + "step": 4 + }, + { + "epoch": 0.000385445575084798, + "grad_norm": 8.718999862670898, + "learning_rate": 1.282051282051282e-07, + "loss": 1.5346, + "step": 5 + }, + { + "epoch": 0.00046253469010175765, + "grad_norm": 8.059788703918457, + "learning_rate": 1.5384615384615387e-07, + "loss": 1.53, + "step": 6 + }, + { + "epoch": 0.0005396238051187173, + "grad_norm": 9.216470718383789, + "learning_rate": 1.7948717948717948e-07, + "loss": 1.5931, + "step": 7 + }, + { + "epoch": 0.0006167129201356768, + "grad_norm": 9.295415878295898, + "learning_rate": 2.0512820512820514e-07, + "loss": 1.5061, + "step": 8 + }, + { + "epoch": 0.0006938020351526364, + "grad_norm": 7.587814807891846, + "learning_rate": 2.307692307692308e-07, + "loss": 1.5034, + "step": 9 + }, + { + "epoch": 0.000770891150169596, + "grad_norm": 10.28200912475586, + "learning_rate": 2.564102564102564e-07, + "loss": 1.5683, + "step": 10 + }, + { + "epoch": 0.0008479802651865557, + "grad_norm": 10.186943054199219, + "learning_rate": 2.820512820512821e-07, + "loss": 1.612, + "step": 11 + }, + { + "epoch": 0.0009250693802035153, + "grad_norm": 8.96688175201416, + "learning_rate": 3.0769230769230774e-07, + "loss": 1.5478, + "step": 12 + }, + { + "epoch": 0.0010021584952204748, + "grad_norm": 8.598114013671875, + "learning_rate": 3.3333333333333335e-07, + "loss": 1.77, + "step": 13 + }, + { + "epoch": 0.0010792476102374346, + "grad_norm": 8.468049049377441, + "learning_rate": 3.5897435897435896e-07, + "loss": 1.5732, + "step": 14 + }, + { + "epoch": 0.001156336725254394, + "grad_norm": 8.353447914123535, + "learning_rate": 3.846153846153847e-07, + "loss": 1.6234, + "step": 15 + }, + { + "epoch": 0.0012334258402713536, + "grad_norm": 8.753851890563965, + "learning_rate": 4.102564102564103e-07, + "loss": 1.6226, + "step": 16 + }, + { + "epoch": 0.0013105149552883133, + "grad_norm": 7.985566139221191, + "learning_rate": 4.358974358974359e-07, + "loss": 1.5986, + "step": 17 + }, + { + "epoch": 0.0013876040703052729, + "grad_norm": 9.336605072021484, + "learning_rate": 4.615384615384616e-07, + "loss": 1.4641, + "step": 18 + }, + { + "epoch": 0.0014646931853222326, + "grad_norm": 8.00399398803711, + "learning_rate": 4.871794871794872e-07, + "loss": 1.5511, + "step": 19 + }, + { + "epoch": 0.001541782300339192, + "grad_norm": 8.389213562011719, + "learning_rate": 5.128205128205128e-07, + "loss": 1.5059, + "step": 20 + }, + { + "epoch": 0.0016188714153561516, + "grad_norm": 14.020890235900879, + "learning_rate": 5.384615384615386e-07, + "loss": 1.7145, + "step": 21 + }, + { + "epoch": 0.0016959605303731114, + "grad_norm": 6.704775810241699, + "learning_rate": 5.641025641025642e-07, + "loss": 1.5278, + "step": 22 + }, + { + "epoch": 0.0017730496453900709, + "grad_norm": 6.921572208404541, + "learning_rate": 5.897435897435898e-07, + "loss": 1.502, + "step": 23 + }, + { + "epoch": 0.0018501387604070306, + "grad_norm": 11.839808464050293, + "learning_rate": 6.153846153846155e-07, + "loss": 1.4523, + "step": 24 + }, + { + "epoch": 0.0019272278754239901, + "grad_norm": 7.50053071975708, + "learning_rate": 6.41025641025641e-07, + "loss": 1.4585, + "step": 25 + }, + { + "epoch": 0.0020043169904409497, + "grad_norm": 8.217403411865234, + "learning_rate": 6.666666666666667e-07, + "loss": 1.5492, + "step": 26 + }, + { + "epoch": 0.0020814061054579094, + "grad_norm": 7.217784881591797, + "learning_rate": 6.923076923076924e-07, + "loss": 1.424, + "step": 27 + }, + { + "epoch": 0.002158495220474869, + "grad_norm": 7.2611799240112305, + "learning_rate": 7.179487179487179e-07, + "loss": 1.4658, + "step": 28 + }, + { + "epoch": 0.0022355843354918284, + "grad_norm": 6.959589958190918, + "learning_rate": 7.435897435897436e-07, + "loss": 1.4955, + "step": 29 + }, + { + "epoch": 0.002312673450508788, + "grad_norm": 8.893731117248535, + "learning_rate": 7.692307692307694e-07, + "loss": 1.3793, + "step": 30 + }, + { + "epoch": 0.002389762565525748, + "grad_norm": 6.077394485473633, + "learning_rate": 7.948717948717949e-07, + "loss": 1.4154, + "step": 31 + }, + { + "epoch": 0.002466851680542707, + "grad_norm": 6.705117225646973, + "learning_rate": 8.205128205128206e-07, + "loss": 1.4288, + "step": 32 + }, + { + "epoch": 0.002543940795559667, + "grad_norm": 6.546511173248291, + "learning_rate": 8.461538461538463e-07, + "loss": 1.3884, + "step": 33 + }, + { + "epoch": 0.0026210299105766267, + "grad_norm": 6.519981384277344, + "learning_rate": 8.717948717948718e-07, + "loss": 1.3881, + "step": 34 + }, + { + "epoch": 0.002698119025593586, + "grad_norm": 7.000728130340576, + "learning_rate": 8.974358974358975e-07, + "loss": 1.4235, + "step": 35 + }, + { + "epoch": 0.0027752081406105457, + "grad_norm": 6.453318119049072, + "learning_rate": 9.230769230769232e-07, + "loss": 1.4284, + "step": 36 + }, + { + "epoch": 0.0028522972556275054, + "grad_norm": 5.742129802703857, + "learning_rate": 9.487179487179487e-07, + "loss": 1.4163, + "step": 37 + }, + { + "epoch": 0.002929386370644465, + "grad_norm": 5.8094916343688965, + "learning_rate": 9.743589743589745e-07, + "loss": 1.293, + "step": 38 + }, + { + "epoch": 0.0030064754856614245, + "grad_norm": 6.008228302001953, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.4182, + "step": 39 + }, + { + "epoch": 0.003083564600678384, + "grad_norm": 5.845043659210205, + "learning_rate": 1.0256410256410257e-06, + "loss": 1.444, + "step": 40 + }, + { + "epoch": 0.003160653715695344, + "grad_norm": 5.4758806228637695, + "learning_rate": 1.0512820512820514e-06, + "loss": 1.396, + "step": 41 + }, + { + "epoch": 0.0032377428307123032, + "grad_norm": 5.759223937988281, + "learning_rate": 1.076923076923077e-06, + "loss": 1.3574, + "step": 42 + }, + { + "epoch": 0.003314831945729263, + "grad_norm": 6.016756534576416, + "learning_rate": 1.1025641025641026e-06, + "loss": 1.3853, + "step": 43 + }, + { + "epoch": 0.0033919210607462227, + "grad_norm": 6.545979976654053, + "learning_rate": 1.1282051282051283e-06, + "loss": 1.492, + "step": 44 + }, + { + "epoch": 0.003469010175763182, + "grad_norm": 5.281697750091553, + "learning_rate": 1.153846153846154e-06, + "loss": 1.3644, + "step": 45 + }, + { + "epoch": 0.0035460992907801418, + "grad_norm": 5.430404186248779, + "learning_rate": 1.1794871794871795e-06, + "loss": 1.3613, + "step": 46 + }, + { + "epoch": 0.0036231884057971015, + "grad_norm": 5.730790138244629, + "learning_rate": 1.2051282051282053e-06, + "loss": 1.349, + "step": 47 + }, + { + "epoch": 0.0037002775208140612, + "grad_norm": 7.0034098625183105, + "learning_rate": 1.230769230769231e-06, + "loss": 1.4167, + "step": 48 + }, + { + "epoch": 0.0037773666358310205, + "grad_norm": 5.198720932006836, + "learning_rate": 1.2564102564102565e-06, + "loss": 1.2908, + "step": 49 + }, + { + "epoch": 0.0038544557508479803, + "grad_norm": 5.8838605880737305, + "learning_rate": 1.282051282051282e-06, + "loss": 1.4535, + "step": 50 + }, + { + "epoch": 0.00393154486586494, + "grad_norm": 5.737644672393799, + "learning_rate": 1.307692307692308e-06, + "loss": 1.2674, + "step": 51 + }, + { + "epoch": 0.004008633980881899, + "grad_norm": 5.439259052276611, + "learning_rate": 1.3333333333333334e-06, + "loss": 1.327, + "step": 52 + }, + { + "epoch": 0.0040857230958988595, + "grad_norm": 5.283633708953857, + "learning_rate": 1.358974358974359e-06, + "loss": 1.2557, + "step": 53 + }, + { + "epoch": 0.004162812210915819, + "grad_norm": 5.148982524871826, + "learning_rate": 1.3846153846153848e-06, + "loss": 1.3224, + "step": 54 + }, + { + "epoch": 0.004239901325932778, + "grad_norm": 5.781235694885254, + "learning_rate": 1.4102564102564104e-06, + "loss": 1.3519, + "step": 55 + }, + { + "epoch": 0.004316990440949738, + "grad_norm": 5.436124801635742, + "learning_rate": 1.4358974358974359e-06, + "loss": 1.2495, + "step": 56 + }, + { + "epoch": 0.0043940795559666975, + "grad_norm": 5.363502025604248, + "learning_rate": 1.4615384615384618e-06, + "loss": 1.304, + "step": 57 + }, + { + "epoch": 0.004471168670983657, + "grad_norm": 5.214340686798096, + "learning_rate": 1.4871794871794873e-06, + "loss": 1.1908, + "step": 58 + }, + { + "epoch": 0.004548257786000617, + "grad_norm": 5.095510959625244, + "learning_rate": 1.5128205128205128e-06, + "loss": 1.3102, + "step": 59 + }, + { + "epoch": 0.004625346901017576, + "grad_norm": 4.8631157875061035, + "learning_rate": 1.5384615384615387e-06, + "loss": 1.3529, + "step": 60 + }, + { + "epoch": 0.004702436016034536, + "grad_norm": 4.890917778015137, + "learning_rate": 1.5641025641025642e-06, + "loss": 1.1276, + "step": 61 + }, + { + "epoch": 0.004779525131051496, + "grad_norm": 5.280998706817627, + "learning_rate": 1.5897435897435897e-06, + "loss": 1.2194, + "step": 62 + }, + { + "epoch": 0.004856614246068455, + "grad_norm": 5.293563365936279, + "learning_rate": 1.6153846153846157e-06, + "loss": 1.319, + "step": 63 + }, + { + "epoch": 0.004933703361085414, + "grad_norm": 5.016965866088867, + "learning_rate": 1.6410256410256412e-06, + "loss": 1.3868, + "step": 64 + }, + { + "epoch": 0.0050107924761023746, + "grad_norm": 4.839664936065674, + "learning_rate": 1.6666666666666667e-06, + "loss": 1.2705, + "step": 65 + }, + { + "epoch": 0.005087881591119334, + "grad_norm": 4.863608360290527, + "learning_rate": 1.6923076923076926e-06, + "loss": 1.2731, + "step": 66 + }, + { + "epoch": 0.005164970706136293, + "grad_norm": 5.9279656410217285, + "learning_rate": 1.717948717948718e-06, + "loss": 1.324, + "step": 67 + }, + { + "epoch": 0.005242059821153253, + "grad_norm": 4.750378608703613, + "learning_rate": 1.7435897435897436e-06, + "loss": 1.2538, + "step": 68 + }, + { + "epoch": 0.005319148936170213, + "grad_norm": 5.045099258422852, + "learning_rate": 1.7692307692307695e-06, + "loss": 1.2533, + "step": 69 + }, + { + "epoch": 0.005396238051187172, + "grad_norm": 4.8463006019592285, + "learning_rate": 1.794871794871795e-06, + "loss": 1.3231, + "step": 70 + }, + { + "epoch": 0.005473327166204132, + "grad_norm": 4.522879600524902, + "learning_rate": 1.8205128205128205e-06, + "loss": 1.3319, + "step": 71 + }, + { + "epoch": 0.005550416281221091, + "grad_norm": 5.500392913818359, + "learning_rate": 1.8461538461538465e-06, + "loss": 1.2909, + "step": 72 + }, + { + "epoch": 0.005627505396238052, + "grad_norm": 4.754296779632568, + "learning_rate": 1.871794871794872e-06, + "loss": 1.2207, + "step": 73 + }, + { + "epoch": 0.005704594511255011, + "grad_norm": 4.711799144744873, + "learning_rate": 1.8974358974358975e-06, + "loss": 1.2839, + "step": 74 + }, + { + "epoch": 0.00578168362627197, + "grad_norm": 5.301671504974365, + "learning_rate": 1.9230769230769234e-06, + "loss": 1.3236, + "step": 75 + }, + { + "epoch": 0.00585877274128893, + "grad_norm": 4.835858345031738, + "learning_rate": 1.948717948717949e-06, + "loss": 1.2828, + "step": 76 + }, + { + "epoch": 0.00593586185630589, + "grad_norm": 5.167266368865967, + "learning_rate": 1.9743589743589744e-06, + "loss": 1.3363, + "step": 77 + }, + { + "epoch": 0.006012950971322849, + "grad_norm": 5.193398952484131, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.2662, + "step": 78 + }, + { + "epoch": 0.006090040086339809, + "grad_norm": 4.928783416748047, + "learning_rate": 2.025641025641026e-06, + "loss": 1.2551, + "step": 79 + }, + { + "epoch": 0.006167129201356768, + "grad_norm": 5.207440376281738, + "learning_rate": 2.0512820512820513e-06, + "loss": 1.4568, + "step": 80 + }, + { + "epoch": 0.006244218316373728, + "grad_norm": 5.189006805419922, + "learning_rate": 2.0769230769230773e-06, + "loss": 1.1864, + "step": 81 + }, + { + "epoch": 0.006321307431390688, + "grad_norm": 5.419140338897705, + "learning_rate": 2.1025641025641028e-06, + "loss": 1.3153, + "step": 82 + }, + { + "epoch": 0.006398396546407647, + "grad_norm": 4.961038589477539, + "learning_rate": 2.1282051282051283e-06, + "loss": 1.2585, + "step": 83 + }, + { + "epoch": 0.0064754856614246065, + "grad_norm": 4.714632034301758, + "learning_rate": 2.153846153846154e-06, + "loss": 1.2694, + "step": 84 + }, + { + "epoch": 0.006552574776441567, + "grad_norm": 5.331617832183838, + "learning_rate": 2.1794871794871797e-06, + "loss": 1.3765, + "step": 85 + }, + { + "epoch": 0.006629663891458526, + "grad_norm": 4.9930644035339355, + "learning_rate": 2.2051282051282052e-06, + "loss": 1.3456, + "step": 86 + }, + { + "epoch": 0.006706753006475485, + "grad_norm": 4.928962230682373, + "learning_rate": 2.230769230769231e-06, + "loss": 1.2938, + "step": 87 + }, + { + "epoch": 0.0067838421214924454, + "grad_norm": 5.059289932250977, + "learning_rate": 2.2564102564102566e-06, + "loss": 1.2987, + "step": 88 + }, + { + "epoch": 0.006860931236509405, + "grad_norm": 4.943025588989258, + "learning_rate": 2.282051282051282e-06, + "loss": 1.2336, + "step": 89 + }, + { + "epoch": 0.006938020351526364, + "grad_norm": 5.614814758300781, + "learning_rate": 2.307692307692308e-06, + "loss": 1.1704, + "step": 90 + }, + { + "epoch": 0.007015109466543324, + "grad_norm": 4.728719234466553, + "learning_rate": 2.3333333333333336e-06, + "loss": 1.301, + "step": 91 + }, + { + "epoch": 0.0070921985815602835, + "grad_norm": 5.109025478363037, + "learning_rate": 2.358974358974359e-06, + "loss": 1.3068, + "step": 92 + }, + { + "epoch": 0.007169287696577244, + "grad_norm": 5.220505237579346, + "learning_rate": 2.384615384615385e-06, + "loss": 1.3854, + "step": 93 + }, + { + "epoch": 0.007246376811594203, + "grad_norm": 4.759350776672363, + "learning_rate": 2.4102564102564105e-06, + "loss": 1.2805, + "step": 94 + }, + { + "epoch": 0.007323465926611162, + "grad_norm": 5.338301181793213, + "learning_rate": 2.435897435897436e-06, + "loss": 1.2815, + "step": 95 + }, + { + "epoch": 0.0074005550416281225, + "grad_norm": 5.214847087860107, + "learning_rate": 2.461538461538462e-06, + "loss": 1.1316, + "step": 96 + }, + { + "epoch": 0.007477644156645082, + "grad_norm": 4.639960289001465, + "learning_rate": 2.4871794871794875e-06, + "loss": 1.2396, + "step": 97 + }, + { + "epoch": 0.007554733271662041, + "grad_norm": 4.69920015335083, + "learning_rate": 2.512820512820513e-06, + "loss": 1.2721, + "step": 98 + }, + { + "epoch": 0.007631822386679001, + "grad_norm": 4.799991607666016, + "learning_rate": 2.5384615384615385e-06, + "loss": 1.3245, + "step": 99 + }, + { + "epoch": 0.0077089115016959605, + "grad_norm": 4.986448764801025, + "learning_rate": 2.564102564102564e-06, + "loss": 1.2549, + "step": 100 + }, + { + "epoch": 0.00778600061671292, + "grad_norm": 4.512473106384277, + "learning_rate": 2.5897435897435903e-06, + "loss": 1.2412, + "step": 101 + }, + { + "epoch": 0.00786308973172988, + "grad_norm": 5.038204669952393, + "learning_rate": 2.615384615384616e-06, + "loss": 1.3146, + "step": 102 + }, + { + "epoch": 0.00794017884674684, + "grad_norm": 5.079225540161133, + "learning_rate": 2.6410256410256413e-06, + "loss": 1.3502, + "step": 103 + }, + { + "epoch": 0.008017267961763799, + "grad_norm": 4.596055507659912, + "learning_rate": 2.666666666666667e-06, + "loss": 1.2562, + "step": 104 + }, + { + "epoch": 0.008094357076780759, + "grad_norm": 4.764355659484863, + "learning_rate": 2.6923076923076923e-06, + "loss": 1.2504, + "step": 105 + }, + { + "epoch": 0.008171446191797719, + "grad_norm": 4.672957897186279, + "learning_rate": 2.717948717948718e-06, + "loss": 1.3247, + "step": 106 + }, + { + "epoch": 0.008248535306814677, + "grad_norm": 5.352134704589844, + "learning_rate": 2.743589743589744e-06, + "loss": 1.2011, + "step": 107 + }, + { + "epoch": 0.008325624421831638, + "grad_norm": 5.508421897888184, + "learning_rate": 2.7692307692307697e-06, + "loss": 1.275, + "step": 108 + }, + { + "epoch": 0.008402713536848598, + "grad_norm": 4.892576694488525, + "learning_rate": 2.794871794871795e-06, + "loss": 1.2882, + "step": 109 + }, + { + "epoch": 0.008479802651865556, + "grad_norm": 5.162463665008545, + "learning_rate": 2.8205128205128207e-06, + "loss": 1.2715, + "step": 110 + }, + { + "epoch": 0.008556891766882516, + "grad_norm": 4.8386549949646, + "learning_rate": 2.846153846153846e-06, + "loss": 1.2729, + "step": 111 + }, + { + "epoch": 0.008633980881899476, + "grad_norm": 4.82133150100708, + "learning_rate": 2.8717948717948717e-06, + "loss": 1.2241, + "step": 112 + }, + { + "epoch": 0.008711069996916435, + "grad_norm": 4.835055828094482, + "learning_rate": 2.897435897435898e-06, + "loss": 1.1711, + "step": 113 + }, + { + "epoch": 0.008788159111933395, + "grad_norm": 5.417538642883301, + "learning_rate": 2.9230769230769236e-06, + "loss": 1.1772, + "step": 114 + }, + { + "epoch": 0.008865248226950355, + "grad_norm": 5.100035190582275, + "learning_rate": 2.948717948717949e-06, + "loss": 1.2341, + "step": 115 + }, + { + "epoch": 0.008942337341967314, + "grad_norm": 4.800838470458984, + "learning_rate": 2.9743589743589746e-06, + "loss": 1.2045, + "step": 116 + }, + { + "epoch": 0.009019426456984274, + "grad_norm": 4.5708184242248535, + "learning_rate": 3e-06, + "loss": 1.1864, + "step": 117 + }, + { + "epoch": 0.009096515572001234, + "grad_norm": 5.127934455871582, + "learning_rate": 3.0256410256410256e-06, + "loss": 1.293, + "step": 118 + }, + { + "epoch": 0.009173604687018192, + "grad_norm": 4.894773960113525, + "learning_rate": 3.051282051282052e-06, + "loss": 1.2704, + "step": 119 + }, + { + "epoch": 0.009250693802035153, + "grad_norm": 5.011133193969727, + "learning_rate": 3.0769230769230774e-06, + "loss": 1.3439, + "step": 120 + }, + { + "epoch": 0.009327782917052113, + "grad_norm": 4.952587127685547, + "learning_rate": 3.102564102564103e-06, + "loss": 1.2353, + "step": 121 + }, + { + "epoch": 0.009404872032069071, + "grad_norm": 4.612563610076904, + "learning_rate": 3.1282051282051284e-06, + "loss": 1.2477, + "step": 122 + }, + { + "epoch": 0.009481961147086031, + "grad_norm": 4.67457389831543, + "learning_rate": 3.153846153846154e-06, + "loss": 1.3014, + "step": 123 + }, + { + "epoch": 0.009559050262102992, + "grad_norm": 4.838489532470703, + "learning_rate": 3.1794871794871795e-06, + "loss": 1.2135, + "step": 124 + }, + { + "epoch": 0.00963613937711995, + "grad_norm": 5.4575042724609375, + "learning_rate": 3.205128205128206e-06, + "loss": 1.3339, + "step": 125 + }, + { + "epoch": 0.00971322849213691, + "grad_norm": 4.730915069580078, + "learning_rate": 3.2307692307692313e-06, + "loss": 1.2298, + "step": 126 + }, + { + "epoch": 0.00979031760715387, + "grad_norm": 5.235102653503418, + "learning_rate": 3.256410256410257e-06, + "loss": 1.195, + "step": 127 + }, + { + "epoch": 0.009867406722170829, + "grad_norm": 4.515792369842529, + "learning_rate": 3.2820512820512823e-06, + "loss": 1.2009, + "step": 128 + }, + { + "epoch": 0.009944495837187789, + "grad_norm": 4.882987022399902, + "learning_rate": 3.307692307692308e-06, + "loss": 1.3124, + "step": 129 + }, + { + "epoch": 0.010021584952204749, + "grad_norm": 4.703249931335449, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.2973, + "step": 130 + }, + { + "epoch": 0.010098674067221708, + "grad_norm": 5.116337776184082, + "learning_rate": 3.358974358974359e-06, + "loss": 1.3368, + "step": 131 + }, + { + "epoch": 0.010175763182238668, + "grad_norm": 4.505654335021973, + "learning_rate": 3.384615384615385e-06, + "loss": 1.12, + "step": 132 + }, + { + "epoch": 0.010252852297255628, + "grad_norm": 4.808957576751709, + "learning_rate": 3.4102564102564107e-06, + "loss": 1.3359, + "step": 133 + }, + { + "epoch": 0.010329941412272586, + "grad_norm": 5.045513153076172, + "learning_rate": 3.435897435897436e-06, + "loss": 1.2173, + "step": 134 + }, + { + "epoch": 0.010407030527289547, + "grad_norm": 4.800675868988037, + "learning_rate": 3.4615384615384617e-06, + "loss": 1.3637, + "step": 135 + }, + { + "epoch": 0.010484119642306507, + "grad_norm": 4.922788619995117, + "learning_rate": 3.487179487179487e-06, + "loss": 1.2737, + "step": 136 + }, + { + "epoch": 0.010561208757323465, + "grad_norm": 5.645283222198486, + "learning_rate": 3.5128205128205127e-06, + "loss": 1.3652, + "step": 137 + }, + { + "epoch": 0.010638297872340425, + "grad_norm": 5.270390510559082, + "learning_rate": 3.538461538461539e-06, + "loss": 1.2676, + "step": 138 + }, + { + "epoch": 0.010715386987357385, + "grad_norm": 5.082615375518799, + "learning_rate": 3.5641025641025646e-06, + "loss": 1.2817, + "step": 139 + }, + { + "epoch": 0.010792476102374344, + "grad_norm": 4.738155841827393, + "learning_rate": 3.58974358974359e-06, + "loss": 1.2964, + "step": 140 + }, + { + "epoch": 0.010869565217391304, + "grad_norm": 4.902276992797852, + "learning_rate": 3.6153846153846156e-06, + "loss": 1.143, + "step": 141 + }, + { + "epoch": 0.010946654332408264, + "grad_norm": 4.572447776794434, + "learning_rate": 3.641025641025641e-06, + "loss": 1.2367, + "step": 142 + }, + { + "epoch": 0.011023743447425224, + "grad_norm": 5.859162330627441, + "learning_rate": 3.6666666666666666e-06, + "loss": 1.2027, + "step": 143 + }, + { + "epoch": 0.011100832562442183, + "grad_norm": 5.429625034332275, + "learning_rate": 3.692307692307693e-06, + "loss": 1.3681, + "step": 144 + }, + { + "epoch": 0.011177921677459143, + "grad_norm": 4.913431167602539, + "learning_rate": 3.7179487179487184e-06, + "loss": 1.1966, + "step": 145 + }, + { + "epoch": 0.011255010792476103, + "grad_norm": 5.548055648803711, + "learning_rate": 3.743589743589744e-06, + "loss": 1.1941, + "step": 146 + }, + { + "epoch": 0.011332099907493062, + "grad_norm": 4.827651500701904, + "learning_rate": 3.7692307692307694e-06, + "loss": 1.27, + "step": 147 + }, + { + "epoch": 0.011409189022510022, + "grad_norm": 4.984038829803467, + "learning_rate": 3.794871794871795e-06, + "loss": 1.2527, + "step": 148 + }, + { + "epoch": 0.011486278137526982, + "grad_norm": 4.7654032707214355, + "learning_rate": 3.8205128205128204e-06, + "loss": 1.1242, + "step": 149 + }, + { + "epoch": 0.01156336725254394, + "grad_norm": 4.45728874206543, + "learning_rate": 3.846153846153847e-06, + "loss": 1.2219, + "step": 150 + }, + { + "epoch": 0.0116404563675609, + "grad_norm": 4.937605381011963, + "learning_rate": 3.871794871794872e-06, + "loss": 1.2036, + "step": 151 + }, + { + "epoch": 0.01171754548257786, + "grad_norm": 5.271725654602051, + "learning_rate": 3.897435897435898e-06, + "loss": 1.3552, + "step": 152 + }, + { + "epoch": 0.01179463459759482, + "grad_norm": 4.775473117828369, + "learning_rate": 3.923076923076923e-06, + "loss": 1.1872, + "step": 153 + }, + { + "epoch": 0.01187172371261178, + "grad_norm": 5.3986358642578125, + "learning_rate": 3.948717948717949e-06, + "loss": 1.3389, + "step": 154 + }, + { + "epoch": 0.01194881282762874, + "grad_norm": 4.677184581756592, + "learning_rate": 3.974358974358974e-06, + "loss": 1.2231, + "step": 155 + }, + { + "epoch": 0.012025901942645698, + "grad_norm": 5.348373889923096, + "learning_rate": 4.000000000000001e-06, + "loss": 1.2328, + "step": 156 + }, + { + "epoch": 0.012102991057662658, + "grad_norm": 4.47370719909668, + "learning_rate": 4.025641025641026e-06, + "loss": 1.215, + "step": 157 + }, + { + "epoch": 0.012180080172679618, + "grad_norm": 5.143750190734863, + "learning_rate": 4.051282051282052e-06, + "loss": 1.2161, + "step": 158 + }, + { + "epoch": 0.012257169287696577, + "grad_norm": 5.946848392486572, + "learning_rate": 4.076923076923077e-06, + "loss": 1.1238, + "step": 159 + }, + { + "epoch": 0.012334258402713537, + "grad_norm": 4.650530815124512, + "learning_rate": 4.102564102564103e-06, + "loss": 1.2754, + "step": 160 + }, + { + "epoch": 0.012411347517730497, + "grad_norm": 4.534331798553467, + "learning_rate": 4.128205128205128e-06, + "loss": 1.2325, + "step": 161 + }, + { + "epoch": 0.012488436632747455, + "grad_norm": 5.198877334594727, + "learning_rate": 4.1538461538461545e-06, + "loss": 1.2318, + "step": 162 + }, + { + "epoch": 0.012565525747764416, + "grad_norm": 4.763823986053467, + "learning_rate": 4.17948717948718e-06, + "loss": 1.2049, + "step": 163 + }, + { + "epoch": 0.012642614862781376, + "grad_norm": 5.088624000549316, + "learning_rate": 4.2051282051282055e-06, + "loss": 1.2649, + "step": 164 + }, + { + "epoch": 0.012719703977798334, + "grad_norm": 5.234319686889648, + "learning_rate": 4.230769230769231e-06, + "loss": 1.308, + "step": 165 + }, + { + "epoch": 0.012796793092815294, + "grad_norm": 4.7018537521362305, + "learning_rate": 4.2564102564102566e-06, + "loss": 1.2356, + "step": 166 + }, + { + "epoch": 0.012873882207832255, + "grad_norm": 5.618417263031006, + "learning_rate": 4.282051282051282e-06, + "loss": 1.1963, + "step": 167 + }, + { + "epoch": 0.012950971322849213, + "grad_norm": 5.07379674911499, + "learning_rate": 4.307692307692308e-06, + "loss": 1.2074, + "step": 168 + }, + { + "epoch": 0.013028060437866173, + "grad_norm": 4.746245861053467, + "learning_rate": 4.333333333333334e-06, + "loss": 1.302, + "step": 169 + }, + { + "epoch": 0.013105149552883133, + "grad_norm": 4.660124778747559, + "learning_rate": 4.358974358974359e-06, + "loss": 1.2835, + "step": 170 + }, + { + "epoch": 0.013182238667900092, + "grad_norm": 4.880448818206787, + "learning_rate": 4.384615384615385e-06, + "loss": 1.3333, + "step": 171 + }, + { + "epoch": 0.013259327782917052, + "grad_norm": 5.0500359535217285, + "learning_rate": 4.4102564102564104e-06, + "loss": 1.2318, + "step": 172 + }, + { + "epoch": 0.013336416897934012, + "grad_norm": 4.640452861785889, + "learning_rate": 4.435897435897436e-06, + "loss": 1.1425, + "step": 173 + }, + { + "epoch": 0.01341350601295097, + "grad_norm": 4.400023460388184, + "learning_rate": 4.461538461538462e-06, + "loss": 1.319, + "step": 174 + }, + { + "epoch": 0.01349059512796793, + "grad_norm": 4.9543046951293945, + "learning_rate": 4.487179487179488e-06, + "loss": 1.1423, + "step": 175 + }, + { + "epoch": 0.013567684242984891, + "grad_norm": 5.030699729919434, + "learning_rate": 4.512820512820513e-06, + "loss": 1.2126, + "step": 176 + }, + { + "epoch": 0.01364477335800185, + "grad_norm": 5.696242809295654, + "learning_rate": 4.538461538461539e-06, + "loss": 1.4041, + "step": 177 + }, + { + "epoch": 0.01372186247301881, + "grad_norm": 5.10263729095459, + "learning_rate": 4.564102564102564e-06, + "loss": 1.2331, + "step": 178 + }, + { + "epoch": 0.01379895158803577, + "grad_norm": 4.404088497161865, + "learning_rate": 4.58974358974359e-06, + "loss": 1.2334, + "step": 179 + }, + { + "epoch": 0.013876040703052728, + "grad_norm": 4.599002361297607, + "learning_rate": 4.615384615384616e-06, + "loss": 1.0995, + "step": 180 + }, + { + "epoch": 0.013953129818069688, + "grad_norm": 5.312249660491943, + "learning_rate": 4.641025641025642e-06, + "loss": 1.3024, + "step": 181 + }, + { + "epoch": 0.014030218933086648, + "grad_norm": 4.873595714569092, + "learning_rate": 4.666666666666667e-06, + "loss": 1.3503, + "step": 182 + }, + { + "epoch": 0.014107308048103609, + "grad_norm": 4.9252400398254395, + "learning_rate": 4.692307692307693e-06, + "loss": 1.2383, + "step": 183 + }, + { + "epoch": 0.014184397163120567, + "grad_norm": 5.004896640777588, + "learning_rate": 4.717948717948718e-06, + "loss": 1.1963, + "step": 184 + }, + { + "epoch": 0.014261486278137527, + "grad_norm": 5.2026801109313965, + "learning_rate": 4.743589743589744e-06, + "loss": 1.4058, + "step": 185 + }, + { + "epoch": 0.014338575393154487, + "grad_norm": 4.740382194519043, + "learning_rate": 4.76923076923077e-06, + "loss": 1.2259, + "step": 186 + }, + { + "epoch": 0.014415664508171446, + "grad_norm": 4.338659763336182, + "learning_rate": 4.7948717948717955e-06, + "loss": 1.0815, + "step": 187 + }, + { + "epoch": 0.014492753623188406, + "grad_norm": 5.065215110778809, + "learning_rate": 4.820512820512821e-06, + "loss": 1.2062, + "step": 188 + }, + { + "epoch": 0.014569842738205366, + "grad_norm": 4.5627264976501465, + "learning_rate": 4.8461538461538465e-06, + "loss": 1.2032, + "step": 189 + }, + { + "epoch": 0.014646931853222325, + "grad_norm": 5.128222465515137, + "learning_rate": 4.871794871794872e-06, + "loss": 1.3514, + "step": 190 + }, + { + "epoch": 0.014724020968239285, + "grad_norm": 4.774002552032471, + "learning_rate": 4.8974358974358975e-06, + "loss": 1.2028, + "step": 191 + }, + { + "epoch": 0.014801110083256245, + "grad_norm": 5.117179870605469, + "learning_rate": 4.923076923076924e-06, + "loss": 1.19, + "step": 192 + }, + { + "epoch": 0.014878199198273203, + "grad_norm": 4.972648620605469, + "learning_rate": 4.948717948717949e-06, + "loss": 1.2141, + "step": 193 + }, + { + "epoch": 0.014955288313290164, + "grad_norm": 4.6805219650268555, + "learning_rate": 4.974358974358975e-06, + "loss": 1.1642, + "step": 194 + }, + { + "epoch": 0.015032377428307124, + "grad_norm": 4.577054023742676, + "learning_rate": 5e-06, + "loss": 1.28, + "step": 195 + }, + { + "epoch": 0.015109466543324082, + "grad_norm": 4.717634677886963, + "learning_rate": 5.025641025641026e-06, + "loss": 1.2222, + "step": 196 + }, + { + "epoch": 0.015186555658341042, + "grad_norm": 5.113860607147217, + "learning_rate": 5.051282051282051e-06, + "loss": 1.3246, + "step": 197 + }, + { + "epoch": 0.015263644773358002, + "grad_norm": 5.178426742553711, + "learning_rate": 5.076923076923077e-06, + "loss": 1.2102, + "step": 198 + }, + { + "epoch": 0.015340733888374961, + "grad_norm": 5.071588039398193, + "learning_rate": 5.1025641025641024e-06, + "loss": 1.2794, + "step": 199 + }, + { + "epoch": 0.015417823003391921, + "grad_norm": 5.163846015930176, + "learning_rate": 5.128205128205128e-06, + "loss": 1.2664, + "step": 200 + }, + { + "epoch": 0.015494912118408881, + "grad_norm": 4.933408260345459, + "learning_rate": 5.1538461538461534e-06, + "loss": 1.3126, + "step": 201 + }, + { + "epoch": 0.01557200123342584, + "grad_norm": 5.551726341247559, + "learning_rate": 5.179487179487181e-06, + "loss": 1.4489, + "step": 202 + }, + { + "epoch": 0.0156490903484428, + "grad_norm": 4.847557067871094, + "learning_rate": 5.205128205128206e-06, + "loss": 1.2887, + "step": 203 + }, + { + "epoch": 0.01572617946345976, + "grad_norm": 5.323663234710693, + "learning_rate": 5.230769230769232e-06, + "loss": 1.271, + "step": 204 + }, + { + "epoch": 0.01580326857847672, + "grad_norm": 4.9948201179504395, + "learning_rate": 5.256410256410257e-06, + "loss": 1.1802, + "step": 205 + }, + { + "epoch": 0.01588035769349368, + "grad_norm": 5.172011375427246, + "learning_rate": 5.282051282051283e-06, + "loss": 1.1817, + "step": 206 + }, + { + "epoch": 0.015957446808510637, + "grad_norm": 5.043161869049072, + "learning_rate": 5.307692307692308e-06, + "loss": 1.2308, + "step": 207 + }, + { + "epoch": 0.016034535923527597, + "grad_norm": 4.964840888977051, + "learning_rate": 5.333333333333334e-06, + "loss": 1.314, + "step": 208 + }, + { + "epoch": 0.016111625038544557, + "grad_norm": 5.201618194580078, + "learning_rate": 5.358974358974359e-06, + "loss": 1.2482, + "step": 209 + }, + { + "epoch": 0.016188714153561518, + "grad_norm": 5.381608963012695, + "learning_rate": 5.384615384615385e-06, + "loss": 1.2502, + "step": 210 + }, + { + "epoch": 0.016265803268578478, + "grad_norm": 5.2073187828063965, + "learning_rate": 5.41025641025641e-06, + "loss": 1.3514, + "step": 211 + }, + { + "epoch": 0.016342892383595438, + "grad_norm": 5.278835773468018, + "learning_rate": 5.435897435897436e-06, + "loss": 1.313, + "step": 212 + }, + { + "epoch": 0.016419981498612395, + "grad_norm": 5.034635543823242, + "learning_rate": 5.461538461538461e-06, + "loss": 1.1963, + "step": 213 + }, + { + "epoch": 0.016497070613629355, + "grad_norm": 4.991930961608887, + "learning_rate": 5.487179487179488e-06, + "loss": 1.3247, + "step": 214 + }, + { + "epoch": 0.016574159728646315, + "grad_norm": 5.150703430175781, + "learning_rate": 5.512820512820514e-06, + "loss": 1.2084, + "step": 215 + }, + { + "epoch": 0.016651248843663275, + "grad_norm": 4.838485240936279, + "learning_rate": 5.538461538461539e-06, + "loss": 1.248, + "step": 216 + }, + { + "epoch": 0.016728337958680235, + "grad_norm": 4.620452404022217, + "learning_rate": 5.564102564102565e-06, + "loss": 1.1368, + "step": 217 + }, + { + "epoch": 0.016805427073697195, + "grad_norm": 4.712934494018555, + "learning_rate": 5.58974358974359e-06, + "loss": 1.2305, + "step": 218 + }, + { + "epoch": 0.016882516188714152, + "grad_norm": 4.66661262512207, + "learning_rate": 5.615384615384616e-06, + "loss": 1.1054, + "step": 219 + }, + { + "epoch": 0.016959605303731112, + "grad_norm": 5.062489986419678, + "learning_rate": 5.641025641025641e-06, + "loss": 1.2248, + "step": 220 + }, + { + "epoch": 0.017036694418748072, + "grad_norm": 5.344682216644287, + "learning_rate": 5.666666666666667e-06, + "loss": 1.3622, + "step": 221 + }, + { + "epoch": 0.017113783533765033, + "grad_norm": 5.120663166046143, + "learning_rate": 5.692307692307692e-06, + "loss": 1.1208, + "step": 222 + }, + { + "epoch": 0.017190872648781993, + "grad_norm": 4.629307746887207, + "learning_rate": 5.717948717948718e-06, + "loss": 1.204, + "step": 223 + }, + { + "epoch": 0.017267961763798953, + "grad_norm": 5.405148029327393, + "learning_rate": 5.743589743589743e-06, + "loss": 1.2831, + "step": 224 + }, + { + "epoch": 0.01734505087881591, + "grad_norm": 4.422632217407227, + "learning_rate": 5.769230769230769e-06, + "loss": 1.2251, + "step": 225 + }, + { + "epoch": 0.01742213999383287, + "grad_norm": 4.856597900390625, + "learning_rate": 5.794871794871796e-06, + "loss": 1.3051, + "step": 226 + }, + { + "epoch": 0.01749922910884983, + "grad_norm": 4.129710674285889, + "learning_rate": 5.820512820512822e-06, + "loss": 1.0233, + "step": 227 + }, + { + "epoch": 0.01757631822386679, + "grad_norm": 4.654583930969238, + "learning_rate": 5.846153846153847e-06, + "loss": 1.2445, + "step": 228 + }, + { + "epoch": 0.01765340733888375, + "grad_norm": 4.510683536529541, + "learning_rate": 5.871794871794873e-06, + "loss": 1.1158, + "step": 229 + }, + { + "epoch": 0.01773049645390071, + "grad_norm": 4.56011438369751, + "learning_rate": 5.897435897435898e-06, + "loss": 1.2008, + "step": 230 + }, + { + "epoch": 0.017807585568917667, + "grad_norm": 4.707639694213867, + "learning_rate": 5.923076923076924e-06, + "loss": 1.2206, + "step": 231 + }, + { + "epoch": 0.017884674683934627, + "grad_norm": 4.402471542358398, + "learning_rate": 5.948717948717949e-06, + "loss": 1.193, + "step": 232 + }, + { + "epoch": 0.017961763798951588, + "grad_norm": 4.763029098510742, + "learning_rate": 5.974358974358975e-06, + "loss": 1.2553, + "step": 233 + }, + { + "epoch": 0.018038852913968548, + "grad_norm": 4.92691707611084, + "learning_rate": 6e-06, + "loss": 1.2258, + "step": 234 + }, + { + "epoch": 0.018115942028985508, + "grad_norm": 4.550500392913818, + "learning_rate": 6.025641025641026e-06, + "loss": 1.2513, + "step": 235 + }, + { + "epoch": 0.018193031144002468, + "grad_norm": 4.7478251457214355, + "learning_rate": 6.051282051282051e-06, + "loss": 1.239, + "step": 236 + }, + { + "epoch": 0.018270120259019425, + "grad_norm": 4.82733678817749, + "learning_rate": 6.076923076923077e-06, + "loss": 1.2731, + "step": 237 + }, + { + "epoch": 0.018347209374036385, + "grad_norm": 4.546608924865723, + "learning_rate": 6.102564102564104e-06, + "loss": 1.1815, + "step": 238 + }, + { + "epoch": 0.018424298489053345, + "grad_norm": 5.261005878448486, + "learning_rate": 6.128205128205129e-06, + "loss": 1.2074, + "step": 239 + }, + { + "epoch": 0.018501387604070305, + "grad_norm": 4.9964213371276855, + "learning_rate": 6.153846153846155e-06, + "loss": 1.3077, + "step": 240 + }, + { + "epoch": 0.018578476719087265, + "grad_norm": 4.793504238128662, + "learning_rate": 6.17948717948718e-06, + "loss": 1.1757, + "step": 241 + }, + { + "epoch": 0.018655565834104226, + "grad_norm": 5.3314642906188965, + "learning_rate": 6.205128205128206e-06, + "loss": 1.3881, + "step": 242 + }, + { + "epoch": 0.018732654949121186, + "grad_norm": 4.648246765136719, + "learning_rate": 6.230769230769231e-06, + "loss": 1.2445, + "step": 243 + }, + { + "epoch": 0.018809744064138142, + "grad_norm": 4.99802827835083, + "learning_rate": 6.256410256410257e-06, + "loss": 1.27, + "step": 244 + }, + { + "epoch": 0.018886833179155103, + "grad_norm": 4.553841590881348, + "learning_rate": 6.282051282051282e-06, + "loss": 1.1916, + "step": 245 + }, + { + "epoch": 0.018963922294172063, + "grad_norm": 4.673896312713623, + "learning_rate": 6.307692307692308e-06, + "loss": 1.2046, + "step": 246 + }, + { + "epoch": 0.019041011409189023, + "grad_norm": 5.352630615234375, + "learning_rate": 6.333333333333333e-06, + "loss": 1.3785, + "step": 247 + }, + { + "epoch": 0.019118100524205983, + "grad_norm": 4.59555721282959, + "learning_rate": 6.358974358974359e-06, + "loss": 1.0682, + "step": 248 + }, + { + "epoch": 0.019195189639222943, + "grad_norm": 5.211584568023682, + "learning_rate": 6.384615384615384e-06, + "loss": 1.3154, + "step": 249 + }, + { + "epoch": 0.0192722787542399, + "grad_norm": 4.7507243156433105, + "learning_rate": 6.410256410256412e-06, + "loss": 1.1679, + "step": 250 + }, + { + "epoch": 0.01934936786925686, + "grad_norm": 4.9127516746521, + "learning_rate": 6.435897435897437e-06, + "loss": 1.1803, + "step": 251 + }, + { + "epoch": 0.01942645698427382, + "grad_norm": 5.001187801361084, + "learning_rate": 6.461538461538463e-06, + "loss": 1.2204, + "step": 252 + }, + { + "epoch": 0.01950354609929078, + "grad_norm": 5.021228313446045, + "learning_rate": 6.487179487179488e-06, + "loss": 1.2126, + "step": 253 + }, + { + "epoch": 0.01958063521430774, + "grad_norm": 4.73190975189209, + "learning_rate": 6.512820512820514e-06, + "loss": 1.1656, + "step": 254 + }, + { + "epoch": 0.0196577243293247, + "grad_norm": 4.766290187835693, + "learning_rate": 6.538461538461539e-06, + "loss": 1.2573, + "step": 255 + }, + { + "epoch": 0.019734813444341658, + "grad_norm": 4.61043643951416, + "learning_rate": 6.564102564102565e-06, + "loss": 1.2092, + "step": 256 + }, + { + "epoch": 0.019811902559358618, + "grad_norm": 4.742149353027344, + "learning_rate": 6.58974358974359e-06, + "loss": 1.2197, + "step": 257 + }, + { + "epoch": 0.019888991674375578, + "grad_norm": 4.5568389892578125, + "learning_rate": 6.615384615384616e-06, + "loss": 1.2677, + "step": 258 + }, + { + "epoch": 0.019966080789392538, + "grad_norm": 4.9256367683410645, + "learning_rate": 6.641025641025641e-06, + "loss": 1.1879, + "step": 259 + }, + { + "epoch": 0.020043169904409498, + "grad_norm": 5.019280433654785, + "learning_rate": 6.666666666666667e-06, + "loss": 1.1391, + "step": 260 + }, + { + "epoch": 0.02012025901942646, + "grad_norm": 5.337982654571533, + "learning_rate": 6.692307692307692e-06, + "loss": 1.1885, + "step": 261 + }, + { + "epoch": 0.020197348134443415, + "grad_norm": 5.246166229248047, + "learning_rate": 6.717948717948718e-06, + "loss": 1.2816, + "step": 262 + }, + { + "epoch": 0.020274437249460375, + "grad_norm": 5.1329731941223145, + "learning_rate": 6.743589743589745e-06, + "loss": 1.0985, + "step": 263 + }, + { + "epoch": 0.020351526364477335, + "grad_norm": 5.237858772277832, + "learning_rate": 6.76923076923077e-06, + "loss": 1.3882, + "step": 264 + }, + { + "epoch": 0.020428615479494296, + "grad_norm": 5.131351470947266, + "learning_rate": 6.794871794871796e-06, + "loss": 1.1944, + "step": 265 + }, + { + "epoch": 0.020505704594511256, + "grad_norm": 5.077263832092285, + "learning_rate": 6.820512820512821e-06, + "loss": 1.2041, + "step": 266 + }, + { + "epoch": 0.020582793709528216, + "grad_norm": 5.028573036193848, + "learning_rate": 6.846153846153847e-06, + "loss": 1.248, + "step": 267 + }, + { + "epoch": 0.020659882824545173, + "grad_norm": 5.382236480712891, + "learning_rate": 6.871794871794872e-06, + "loss": 1.3058, + "step": 268 + }, + { + "epoch": 0.020736971939562133, + "grad_norm": 4.82020902633667, + "learning_rate": 6.897435897435898e-06, + "loss": 1.2581, + "step": 269 + }, + { + "epoch": 0.020814061054579093, + "grad_norm": 4.485026836395264, + "learning_rate": 6.923076923076923e-06, + "loss": 1.1932, + "step": 270 + }, + { + "epoch": 0.020891150169596053, + "grad_norm": 4.891682147979736, + "learning_rate": 6.948717948717949e-06, + "loss": 1.2035, + "step": 271 + }, + { + "epoch": 0.020968239284613013, + "grad_norm": 4.9292683601379395, + "learning_rate": 6.974358974358974e-06, + "loss": 1.2209, + "step": 272 + }, + { + "epoch": 0.021045328399629974, + "grad_norm": 4.799893379211426, + "learning_rate": 7e-06, + "loss": 1.3824, + "step": 273 + }, + { + "epoch": 0.02112241751464693, + "grad_norm": 4.358574390411377, + "learning_rate": 7.025641025641025e-06, + "loss": 1.1656, + "step": 274 + }, + { + "epoch": 0.02119950662966389, + "grad_norm": 4.752989768981934, + "learning_rate": 7.051282051282053e-06, + "loss": 1.1773, + "step": 275 + }, + { + "epoch": 0.02127659574468085, + "grad_norm": 4.714949607849121, + "learning_rate": 7.076923076923078e-06, + "loss": 1.1622, + "step": 276 + }, + { + "epoch": 0.02135368485969781, + "grad_norm": 4.488297939300537, + "learning_rate": 7.102564102564104e-06, + "loss": 1.1412, + "step": 277 + }, + { + "epoch": 0.02143077397471477, + "grad_norm": 4.944022178649902, + "learning_rate": 7.128205128205129e-06, + "loss": 1.1468, + "step": 278 + }, + { + "epoch": 0.02150786308973173, + "grad_norm": 4.799287796020508, + "learning_rate": 7.153846153846155e-06, + "loss": 1.1958, + "step": 279 + }, + { + "epoch": 0.021584952204748688, + "grad_norm": 4.447544574737549, + "learning_rate": 7.17948717948718e-06, + "loss": 1.3105, + "step": 280 + }, + { + "epoch": 0.021662041319765648, + "grad_norm": 4.781645774841309, + "learning_rate": 7.205128205128206e-06, + "loss": 1.3335, + "step": 281 + }, + { + "epoch": 0.021739130434782608, + "grad_norm": 5.0389018058776855, + "learning_rate": 7.230769230769231e-06, + "loss": 1.2225, + "step": 282 + }, + { + "epoch": 0.021816219549799568, + "grad_norm": 5.132277011871338, + "learning_rate": 7.256410256410257e-06, + "loss": 1.2914, + "step": 283 + }, + { + "epoch": 0.02189330866481653, + "grad_norm": 5.015186309814453, + "learning_rate": 7.282051282051282e-06, + "loss": 1.1996, + "step": 284 + }, + { + "epoch": 0.02197039777983349, + "grad_norm": 5.795614242553711, + "learning_rate": 7.307692307692308e-06, + "loss": 1.251, + "step": 285 + }, + { + "epoch": 0.02204748689485045, + "grad_norm": 4.816180229187012, + "learning_rate": 7.333333333333333e-06, + "loss": 1.2001, + "step": 286 + }, + { + "epoch": 0.022124576009867405, + "grad_norm": 4.811607837677002, + "learning_rate": 7.35897435897436e-06, + "loss": 1.2148, + "step": 287 + }, + { + "epoch": 0.022201665124884366, + "grad_norm": 4.706379413604736, + "learning_rate": 7.384615384615386e-06, + "loss": 1.249, + "step": 288 + }, + { + "epoch": 0.022278754239901326, + "grad_norm": 5.7372894287109375, + "learning_rate": 7.410256410256411e-06, + "loss": 1.3184, + "step": 289 + }, + { + "epoch": 0.022355843354918286, + "grad_norm": 4.672527313232422, + "learning_rate": 7.435897435897437e-06, + "loss": 1.1606, + "step": 290 + }, + { + "epoch": 0.022432932469935246, + "grad_norm": 4.6642560958862305, + "learning_rate": 7.461538461538462e-06, + "loss": 1.2362, + "step": 291 + }, + { + "epoch": 0.022510021584952206, + "grad_norm": 4.640800952911377, + "learning_rate": 7.487179487179488e-06, + "loss": 1.237, + "step": 292 + }, + { + "epoch": 0.022587110699969163, + "grad_norm": 5.178220272064209, + "learning_rate": 7.512820512820513e-06, + "loss": 1.1799, + "step": 293 + }, + { + "epoch": 0.022664199814986123, + "grad_norm": 4.580568790435791, + "learning_rate": 7.538461538461539e-06, + "loss": 1.2442, + "step": 294 + }, + { + "epoch": 0.022741288930003083, + "grad_norm": 4.631171226501465, + "learning_rate": 7.564102564102564e-06, + "loss": 1.2399, + "step": 295 + }, + { + "epoch": 0.022818378045020044, + "grad_norm": 4.3523101806640625, + "learning_rate": 7.58974358974359e-06, + "loss": 1.1657, + "step": 296 + }, + { + "epoch": 0.022895467160037004, + "grad_norm": 5.684840202331543, + "learning_rate": 7.615384615384615e-06, + "loss": 1.2356, + "step": 297 + }, + { + "epoch": 0.022972556275053964, + "grad_norm": 4.739688873291016, + "learning_rate": 7.641025641025641e-06, + "loss": 1.2422, + "step": 298 + }, + { + "epoch": 0.02304964539007092, + "grad_norm": 4.905622482299805, + "learning_rate": 7.666666666666667e-06, + "loss": 1.3017, + "step": 299 + }, + { + "epoch": 0.02312673450508788, + "grad_norm": 4.725374221801758, + "learning_rate": 7.692307692307694e-06, + "loss": 1.3432, + "step": 300 + }, + { + "epoch": 0.02320382362010484, + "grad_norm": 4.97802209854126, + "learning_rate": 7.717948717948718e-06, + "loss": 1.2064, + "step": 301 + }, + { + "epoch": 0.0232809127351218, + "grad_norm": 4.46768856048584, + "learning_rate": 7.743589743589745e-06, + "loss": 1.1462, + "step": 302 + }, + { + "epoch": 0.02335800185013876, + "grad_norm": 4.416783809661865, + "learning_rate": 7.76923076923077e-06, + "loss": 1.2325, + "step": 303 + }, + { + "epoch": 0.02343509096515572, + "grad_norm": 5.031586647033691, + "learning_rate": 7.794871794871796e-06, + "loss": 1.2013, + "step": 304 + }, + { + "epoch": 0.023512180080172678, + "grad_norm": 5.045072555541992, + "learning_rate": 7.820512820512822e-06, + "loss": 1.1312, + "step": 305 + }, + { + "epoch": 0.02358926919518964, + "grad_norm": 4.986116886138916, + "learning_rate": 7.846153846153847e-06, + "loss": 1.1599, + "step": 306 + }, + { + "epoch": 0.0236663583102066, + "grad_norm": 5.043438911437988, + "learning_rate": 7.871794871794873e-06, + "loss": 1.2043, + "step": 307 + }, + { + "epoch": 0.02374344742522356, + "grad_norm": 4.92976188659668, + "learning_rate": 7.897435897435898e-06, + "loss": 1.2303, + "step": 308 + }, + { + "epoch": 0.02382053654024052, + "grad_norm": 5.960122585296631, + "learning_rate": 7.923076923076924e-06, + "loss": 1.203, + "step": 309 + }, + { + "epoch": 0.02389762565525748, + "grad_norm": 5.050799369812012, + "learning_rate": 7.948717948717949e-06, + "loss": 1.3918, + "step": 310 + }, + { + "epoch": 0.023974714770274436, + "grad_norm": 5.031795978546143, + "learning_rate": 7.974358974358975e-06, + "loss": 1.2533, + "step": 311 + }, + { + "epoch": 0.024051803885291396, + "grad_norm": 4.596309185028076, + "learning_rate": 8.000000000000001e-06, + "loss": 1.1893, + "step": 312 + }, + { + "epoch": 0.024128893000308356, + "grad_norm": 4.877758979797363, + "learning_rate": 8.025641025641026e-06, + "loss": 1.2398, + "step": 313 + }, + { + "epoch": 0.024205982115325316, + "grad_norm": 4.588964462280273, + "learning_rate": 8.051282051282052e-06, + "loss": 1.1656, + "step": 314 + }, + { + "epoch": 0.024283071230342276, + "grad_norm": 4.996614456176758, + "learning_rate": 8.076923076923077e-06, + "loss": 1.2567, + "step": 315 + }, + { + "epoch": 0.024360160345359236, + "grad_norm": 4.830722332000732, + "learning_rate": 8.102564102564103e-06, + "loss": 1.2854, + "step": 316 + }, + { + "epoch": 0.024437249460376193, + "grad_norm": 5.052509784698486, + "learning_rate": 8.12820512820513e-06, + "loss": 1.1399, + "step": 317 + }, + { + "epoch": 0.024514338575393153, + "grad_norm": 5.15505838394165, + "learning_rate": 8.153846153846154e-06, + "loss": 1.1843, + "step": 318 + }, + { + "epoch": 0.024591427690410114, + "grad_norm": 4.3718671798706055, + "learning_rate": 8.17948717948718e-06, + "loss": 1.2297, + "step": 319 + }, + { + "epoch": 0.024668516805427074, + "grad_norm": 5.402135372161865, + "learning_rate": 8.205128205128205e-06, + "loss": 1.1474, + "step": 320 + }, + { + "epoch": 0.024745605920444034, + "grad_norm": 4.940793991088867, + "learning_rate": 8.230769230769232e-06, + "loss": 1.2823, + "step": 321 + }, + { + "epoch": 0.024822695035460994, + "grad_norm": 4.902515888214111, + "learning_rate": 8.256410256410256e-06, + "loss": 1.3237, + "step": 322 + }, + { + "epoch": 0.024899784150477954, + "grad_norm": 4.852400779724121, + "learning_rate": 8.282051282051283e-06, + "loss": 1.1324, + "step": 323 + }, + { + "epoch": 0.02497687326549491, + "grad_norm": 4.758856296539307, + "learning_rate": 8.307692307692309e-06, + "loss": 1.2108, + "step": 324 + }, + { + "epoch": 0.02505396238051187, + "grad_norm": 4.814808368682861, + "learning_rate": 8.333333333333334e-06, + "loss": 1.2657, + "step": 325 + }, + { + "epoch": 0.02513105149552883, + "grad_norm": 4.526566028594971, + "learning_rate": 8.35897435897436e-06, + "loss": 1.2241, + "step": 326 + }, + { + "epoch": 0.02520814061054579, + "grad_norm": 5.531708717346191, + "learning_rate": 8.384615384615385e-06, + "loss": 1.3275, + "step": 327 + }, + { + "epoch": 0.02528522972556275, + "grad_norm": 4.945657253265381, + "learning_rate": 8.410256410256411e-06, + "loss": 1.274, + "step": 328 + }, + { + "epoch": 0.025362318840579712, + "grad_norm": 4.540581703186035, + "learning_rate": 8.435897435897436e-06, + "loss": 1.266, + "step": 329 + }, + { + "epoch": 0.02543940795559667, + "grad_norm": 4.201421737670898, + "learning_rate": 8.461538461538462e-06, + "loss": 1.2355, + "step": 330 + }, + { + "epoch": 0.02551649707061363, + "grad_norm": 4.603540420532227, + "learning_rate": 8.487179487179488e-06, + "loss": 1.2529, + "step": 331 + }, + { + "epoch": 0.02559358618563059, + "grad_norm": 4.829327583312988, + "learning_rate": 8.512820512820513e-06, + "loss": 1.3069, + "step": 332 + }, + { + "epoch": 0.02567067530064755, + "grad_norm": 4.978155612945557, + "learning_rate": 8.53846153846154e-06, + "loss": 1.1573, + "step": 333 + }, + { + "epoch": 0.02574776441566451, + "grad_norm": 4.506424427032471, + "learning_rate": 8.564102564102564e-06, + "loss": 1.1211, + "step": 334 + }, + { + "epoch": 0.02582485353068147, + "grad_norm": 5.001166343688965, + "learning_rate": 8.58974358974359e-06, + "loss": 1.1474, + "step": 335 + }, + { + "epoch": 0.025901942645698426, + "grad_norm": 4.721814155578613, + "learning_rate": 8.615384615384617e-06, + "loss": 1.3049, + "step": 336 + }, + { + "epoch": 0.025979031760715386, + "grad_norm": 4.9357008934021, + "learning_rate": 8.641025641025641e-06, + "loss": 1.2877, + "step": 337 + }, + { + "epoch": 0.026056120875732346, + "grad_norm": 4.828952789306641, + "learning_rate": 8.666666666666668e-06, + "loss": 1.1093, + "step": 338 + }, + { + "epoch": 0.026133209990749307, + "grad_norm": 4.813793659210205, + "learning_rate": 8.692307692307692e-06, + "loss": 1.2452, + "step": 339 + }, + { + "epoch": 0.026210299105766267, + "grad_norm": 4.303406715393066, + "learning_rate": 8.717948717948719e-06, + "loss": 1.0951, + "step": 340 + }, + { + "epoch": 0.026287388220783227, + "grad_norm": 4.791975498199463, + "learning_rate": 8.743589743589743e-06, + "loss": 1.2548, + "step": 341 + }, + { + "epoch": 0.026364477335800184, + "grad_norm": 4.471169948577881, + "learning_rate": 8.76923076923077e-06, + "loss": 1.112, + "step": 342 + }, + { + "epoch": 0.026441566450817144, + "grad_norm": 4.681336879730225, + "learning_rate": 8.794871794871796e-06, + "loss": 1.2183, + "step": 343 + }, + { + "epoch": 0.026518655565834104, + "grad_norm": 5.055361270904541, + "learning_rate": 8.820512820512821e-06, + "loss": 1.1957, + "step": 344 + }, + { + "epoch": 0.026595744680851064, + "grad_norm": 4.525293827056885, + "learning_rate": 8.846153846153847e-06, + "loss": 1.1316, + "step": 345 + }, + { + "epoch": 0.026672833795868024, + "grad_norm": 4.73473596572876, + "learning_rate": 8.871794871794872e-06, + "loss": 1.2229, + "step": 346 + }, + { + "epoch": 0.026749922910884984, + "grad_norm": 4.338306427001953, + "learning_rate": 8.897435897435898e-06, + "loss": 1.2271, + "step": 347 + }, + { + "epoch": 0.02682701202590194, + "grad_norm": 4.502996444702148, + "learning_rate": 8.923076923076925e-06, + "loss": 1.2868, + "step": 348 + }, + { + "epoch": 0.0269041011409189, + "grad_norm": 4.5346198081970215, + "learning_rate": 8.94871794871795e-06, + "loss": 1.0935, + "step": 349 + }, + { + "epoch": 0.02698119025593586, + "grad_norm": 4.440368175506592, + "learning_rate": 8.974358974358976e-06, + "loss": 1.1267, + "step": 350 + }, + { + "epoch": 0.02705827937095282, + "grad_norm": 4.624384880065918, + "learning_rate": 9e-06, + "loss": 1.215, + "step": 351 + }, + { + "epoch": 0.027135368485969782, + "grad_norm": 4.52625846862793, + "learning_rate": 9.025641025641027e-06, + "loss": 1.2398, + "step": 352 + }, + { + "epoch": 0.027212457600986742, + "grad_norm": 4.4209818840026855, + "learning_rate": 9.051282051282051e-06, + "loss": 1.1526, + "step": 353 + }, + { + "epoch": 0.0272895467160037, + "grad_norm": 4.647578239440918, + "learning_rate": 9.076923076923078e-06, + "loss": 1.2131, + "step": 354 + }, + { + "epoch": 0.02736663583102066, + "grad_norm": 3.973903179168701, + "learning_rate": 9.102564102564104e-06, + "loss": 1.0613, + "step": 355 + }, + { + "epoch": 0.02744372494603762, + "grad_norm": 5.1517252922058105, + "learning_rate": 9.128205128205129e-06, + "loss": 1.1746, + "step": 356 + }, + { + "epoch": 0.02752081406105458, + "grad_norm": 4.37183952331543, + "learning_rate": 9.153846153846155e-06, + "loss": 1.2774, + "step": 357 + }, + { + "epoch": 0.02759790317607154, + "grad_norm": 4.5695295333862305, + "learning_rate": 9.17948717948718e-06, + "loss": 1.1766, + "step": 358 + }, + { + "epoch": 0.0276749922910885, + "grad_norm": 4.399728775024414, + "learning_rate": 9.205128205128206e-06, + "loss": 1.2096, + "step": 359 + }, + { + "epoch": 0.027752081406105456, + "grad_norm": 4.5362043380737305, + "learning_rate": 9.230769230769232e-06, + "loss": 1.1005, + "step": 360 + }, + { + "epoch": 0.027829170521122416, + "grad_norm": 4.581974983215332, + "learning_rate": 9.256410256410257e-06, + "loss": 1.1755, + "step": 361 + }, + { + "epoch": 0.027906259636139377, + "grad_norm": 5.120265960693359, + "learning_rate": 9.282051282051283e-06, + "loss": 1.1285, + "step": 362 + }, + { + "epoch": 0.027983348751156337, + "grad_norm": 4.422695636749268, + "learning_rate": 9.307692307692308e-06, + "loss": 1.2343, + "step": 363 + }, + { + "epoch": 0.028060437866173297, + "grad_norm": 4.639062881469727, + "learning_rate": 9.333333333333334e-06, + "loss": 1.2733, + "step": 364 + }, + { + "epoch": 0.028137526981190257, + "grad_norm": 5.054222106933594, + "learning_rate": 9.358974358974359e-06, + "loss": 1.1177, + "step": 365 + }, + { + "epoch": 0.028214616096207217, + "grad_norm": 5.064762592315674, + "learning_rate": 9.384615384615385e-06, + "loss": 1.1147, + "step": 366 + }, + { + "epoch": 0.028291705211224174, + "grad_norm": 4.730871200561523, + "learning_rate": 9.410256410256412e-06, + "loss": 1.3437, + "step": 367 + }, + { + "epoch": 0.028368794326241134, + "grad_norm": 4.744129180908203, + "learning_rate": 9.435897435897436e-06, + "loss": 1.3698, + "step": 368 + }, + { + "epoch": 0.028445883441258094, + "grad_norm": 4.724253177642822, + "learning_rate": 9.461538461538463e-06, + "loss": 1.1685, + "step": 369 + }, + { + "epoch": 0.028522972556275054, + "grad_norm": 4.356321334838867, + "learning_rate": 9.487179487179487e-06, + "loss": 1.154, + "step": 370 + }, + { + "epoch": 0.028600061671292015, + "grad_norm": 4.506900310516357, + "learning_rate": 9.512820512820514e-06, + "loss": 1.233, + "step": 371 + }, + { + "epoch": 0.028677150786308975, + "grad_norm": 4.505119800567627, + "learning_rate": 9.53846153846154e-06, + "loss": 1.3049, + "step": 372 + }, + { + "epoch": 0.02875423990132593, + "grad_norm": 4.503683090209961, + "learning_rate": 9.564102564102565e-06, + "loss": 1.1666, + "step": 373 + }, + { + "epoch": 0.02883132901634289, + "grad_norm": 4.283431529998779, + "learning_rate": 9.589743589743591e-06, + "loss": 1.1497, + "step": 374 + }, + { + "epoch": 0.028908418131359852, + "grad_norm": 5.00119686126709, + "learning_rate": 9.615384615384616e-06, + "loss": 1.2709, + "step": 375 + }, + { + "epoch": 0.028985507246376812, + "grad_norm": 4.850398540496826, + "learning_rate": 9.641025641025642e-06, + "loss": 1.2135, + "step": 376 + }, + { + "epoch": 0.029062596361393772, + "grad_norm": 5.15523624420166, + "learning_rate": 9.666666666666667e-06, + "loss": 1.1788, + "step": 377 + }, + { + "epoch": 0.029139685476410732, + "grad_norm": 4.818391799926758, + "learning_rate": 9.692307692307693e-06, + "loss": 1.1626, + "step": 378 + }, + { + "epoch": 0.02921677459142769, + "grad_norm": 5.061548233032227, + "learning_rate": 9.71794871794872e-06, + "loss": 1.2423, + "step": 379 + }, + { + "epoch": 0.02929386370644465, + "grad_norm": 4.615644931793213, + "learning_rate": 9.743589743589744e-06, + "loss": 1.1704, + "step": 380 + }, + { + "epoch": 0.02937095282146161, + "grad_norm": 4.75278902053833, + "learning_rate": 9.76923076923077e-06, + "loss": 1.1854, + "step": 381 + }, + { + "epoch": 0.02944804193647857, + "grad_norm": 4.710444450378418, + "learning_rate": 9.794871794871795e-06, + "loss": 1.2158, + "step": 382 + }, + { + "epoch": 0.02952513105149553, + "grad_norm": 4.5607709884643555, + "learning_rate": 9.820512820512821e-06, + "loss": 1.1627, + "step": 383 + }, + { + "epoch": 0.02960222016651249, + "grad_norm": 4.754578113555908, + "learning_rate": 9.846153846153848e-06, + "loss": 1.3299, + "step": 384 + }, + { + "epoch": 0.029679309281529447, + "grad_norm": 4.9476542472839355, + "learning_rate": 9.871794871794872e-06, + "loss": 1.1626, + "step": 385 + }, + { + "epoch": 0.029756398396546407, + "grad_norm": 4.63112211227417, + "learning_rate": 9.897435897435899e-06, + "loss": 1.2714, + "step": 386 + }, + { + "epoch": 0.029833487511563367, + "grad_norm": 4.136993885040283, + "learning_rate": 9.923076923076923e-06, + "loss": 1.1761, + "step": 387 + }, + { + "epoch": 0.029910576626580327, + "grad_norm": 4.531764984130859, + "learning_rate": 9.94871794871795e-06, + "loss": 1.1452, + "step": 388 + }, + { + "epoch": 0.029987665741597287, + "grad_norm": 5.1062774658203125, + "learning_rate": 9.974358974358974e-06, + "loss": 1.2642, + "step": 389 + }, + { + "epoch": 0.030064754856614247, + "grad_norm": 4.920534610748291, + "learning_rate": 1e-05, + "loss": 1.2648, + "step": 390 + }, + { + "epoch": 0.030141843971631204, + "grad_norm": 5.01254415512085, + "learning_rate": 9.99999984413795e-06, + "loss": 1.1768, + "step": 391 + }, + { + "epoch": 0.030218933086648164, + "grad_norm": 4.516940116882324, + "learning_rate": 9.999999376551802e-06, + "loss": 1.303, + "step": 392 + }, + { + "epoch": 0.030296022201665124, + "grad_norm": 4.425420761108398, + "learning_rate": 9.99999859724159e-06, + "loss": 1.3071, + "step": 393 + }, + { + "epoch": 0.030373111316682085, + "grad_norm": 5.187442302703857, + "learning_rate": 9.999997506207361e-06, + "loss": 1.3249, + "step": 394 + }, + { + "epoch": 0.030450200431699045, + "grad_norm": 4.589512348175049, + "learning_rate": 9.999996103449184e-06, + "loss": 1.2048, + "step": 395 + }, + { + "epoch": 0.030527289546716005, + "grad_norm": 4.254446983337402, + "learning_rate": 9.999994388967143e-06, + "loss": 1.2693, + "step": 396 + }, + { + "epoch": 0.03060437866173296, + "grad_norm": 4.697052478790283, + "learning_rate": 9.999992362761349e-06, + "loss": 1.2305, + "step": 397 + }, + { + "epoch": 0.030681467776749922, + "grad_norm": 4.343505382537842, + "learning_rate": 9.999990024831926e-06, + "loss": 1.2191, + "step": 398 + }, + { + "epoch": 0.030758556891766882, + "grad_norm": 4.3586015701293945, + "learning_rate": 9.999987375179023e-06, + "loss": 1.197, + "step": 399 + }, + { + "epoch": 0.030835646006783842, + "grad_norm": 4.417754173278809, + "learning_rate": 9.999984413802802e-06, + "loss": 1.2121, + "step": 400 + }, + { + "epoch": 0.030912735121800802, + "grad_norm": 4.340965747833252, + "learning_rate": 9.999981140703447e-06, + "loss": 1.1745, + "step": 401 + }, + { + "epoch": 0.030989824236817762, + "grad_norm": 4.833497047424316, + "learning_rate": 9.999977555881163e-06, + "loss": 1.2646, + "step": 402 + }, + { + "epoch": 0.031066913351834723, + "grad_norm": 4.495340824127197, + "learning_rate": 9.999973659336176e-06, + "loss": 1.1996, + "step": 403 + }, + { + "epoch": 0.03114400246685168, + "grad_norm": 4.7980475425720215, + "learning_rate": 9.999969451068725e-06, + "loss": 1.2813, + "step": 404 + }, + { + "epoch": 0.03122109158186864, + "grad_norm": 4.268666744232178, + "learning_rate": 9.999964931079076e-06, + "loss": 1.2938, + "step": 405 + }, + { + "epoch": 0.0312981806968856, + "grad_norm": 4.3926777839660645, + "learning_rate": 9.999960099367507e-06, + "loss": 1.236, + "step": 406 + }, + { + "epoch": 0.031375269811902556, + "grad_norm": 4.350759029388428, + "learning_rate": 9.999954955934323e-06, + "loss": 1.1252, + "step": 407 + }, + { + "epoch": 0.03145235892691952, + "grad_norm": 4.4295220375061035, + "learning_rate": 9.999949500779842e-06, + "loss": 1.2229, + "step": 408 + }, + { + "epoch": 0.03152944804193648, + "grad_norm": 4.337591171264648, + "learning_rate": 9.999943733904404e-06, + "loss": 1.1971, + "step": 409 + }, + { + "epoch": 0.03160653715695344, + "grad_norm": 4.42830753326416, + "learning_rate": 9.999937655308373e-06, + "loss": 1.1197, + "step": 410 + }, + { + "epoch": 0.0316836262719704, + "grad_norm": 5.042124271392822, + "learning_rate": 9.999931264992122e-06, + "loss": 1.1953, + "step": 411 + }, + { + "epoch": 0.03176071538698736, + "grad_norm": 4.555182933807373, + "learning_rate": 9.999924562956052e-06, + "loss": 1.2661, + "step": 412 + }, + { + "epoch": 0.03183780450200432, + "grad_norm": 4.939544677734375, + "learning_rate": 9.999917549200581e-06, + "loss": 1.2496, + "step": 413 + }, + { + "epoch": 0.031914893617021274, + "grad_norm": 4.4207539558410645, + "learning_rate": 9.999910223726146e-06, + "loss": 1.2442, + "step": 414 + }, + { + "epoch": 0.03199198273203824, + "grad_norm": 4.292929649353027, + "learning_rate": 9.999902586533205e-06, + "loss": 1.1871, + "step": 415 + }, + { + "epoch": 0.032069071847055194, + "grad_norm": 4.947376251220703, + "learning_rate": 9.999894637622231e-06, + "loss": 1.1849, + "step": 416 + }, + { + "epoch": 0.03214616096207216, + "grad_norm": 4.807240009307861, + "learning_rate": 9.999886376993723e-06, + "loss": 1.3515, + "step": 417 + }, + { + "epoch": 0.032223250077089115, + "grad_norm": 4.25323486328125, + "learning_rate": 9.999877804648194e-06, + "loss": 1.1788, + "step": 418 + }, + { + "epoch": 0.03230033919210607, + "grad_norm": 4.476768970489502, + "learning_rate": 9.999868920586178e-06, + "loss": 1.1857, + "step": 419 + }, + { + "epoch": 0.032377428307123035, + "grad_norm": 4.515340805053711, + "learning_rate": 9.999859724808231e-06, + "loss": 1.1074, + "step": 420 + }, + { + "epoch": 0.03245451742213999, + "grad_norm": 4.648842811584473, + "learning_rate": 9.999850217314924e-06, + "loss": 1.2364, + "step": 421 + }, + { + "epoch": 0.032531606537156955, + "grad_norm": 4.573676109313965, + "learning_rate": 9.999840398106852e-06, + "loss": 1.3302, + "step": 422 + }, + { + "epoch": 0.03260869565217391, + "grad_norm": 4.733384132385254, + "learning_rate": 9.999830267184625e-06, + "loss": 1.2797, + "step": 423 + }, + { + "epoch": 0.032685784767190876, + "grad_norm": 4.6709303855896, + "learning_rate": 9.999819824548875e-06, + "loss": 1.2243, + "step": 424 + }, + { + "epoch": 0.03276287388220783, + "grad_norm": 4.113539695739746, + "learning_rate": 9.999809070200256e-06, + "loss": 1.0434, + "step": 425 + }, + { + "epoch": 0.03283996299722479, + "grad_norm": 4.282526969909668, + "learning_rate": 9.999798004139435e-06, + "loss": 1.2439, + "step": 426 + }, + { + "epoch": 0.03291705211224175, + "grad_norm": 4.742609977722168, + "learning_rate": 9.999786626367102e-06, + "loss": 1.3161, + "step": 427 + }, + { + "epoch": 0.03299414122725871, + "grad_norm": 4.848706245422363, + "learning_rate": 9.999774936883969e-06, + "loss": 1.2556, + "step": 428 + }, + { + "epoch": 0.03307123034227567, + "grad_norm": 5.125500202178955, + "learning_rate": 9.999762935690762e-06, + "loss": 1.2509, + "step": 429 + }, + { + "epoch": 0.03314831945729263, + "grad_norm": 4.103264808654785, + "learning_rate": 9.999750622788232e-06, + "loss": 1.3005, + "step": 430 + }, + { + "epoch": 0.03322540857230959, + "grad_norm": 4.143542289733887, + "learning_rate": 9.999737998177144e-06, + "loss": 1.1695, + "step": 431 + }, + { + "epoch": 0.03330249768732655, + "grad_norm": 4.100920677185059, + "learning_rate": 9.999725061858286e-06, + "loss": 1.23, + "step": 432 + }, + { + "epoch": 0.03337958680234351, + "grad_norm": 4.6039299964904785, + "learning_rate": 9.999711813832465e-06, + "loss": 1.1966, + "step": 433 + }, + { + "epoch": 0.03345667591736047, + "grad_norm": 5.09745979309082, + "learning_rate": 9.999698254100506e-06, + "loss": 1.197, + "step": 434 + }, + { + "epoch": 0.03353376503237743, + "grad_norm": 4.1532769203186035, + "learning_rate": 9.999684382663254e-06, + "loss": 1.0883, + "step": 435 + }, + { + "epoch": 0.03361085414739439, + "grad_norm": 4.5222697257995605, + "learning_rate": 9.999670199521577e-06, + "loss": 1.2341, + "step": 436 + }, + { + "epoch": 0.03368794326241135, + "grad_norm": 4.2074294090271, + "learning_rate": 9.999655704676357e-06, + "loss": 1.3028, + "step": 437 + }, + { + "epoch": 0.033765032377428304, + "grad_norm": 4.485882759094238, + "learning_rate": 9.999640898128495e-06, + "loss": 1.2003, + "step": 438 + }, + { + "epoch": 0.03384212149244527, + "grad_norm": 4.506419658660889, + "learning_rate": 9.999625779878918e-06, + "loss": 1.2515, + "step": 439 + }, + { + "epoch": 0.033919210607462225, + "grad_norm": 4.3678436279296875, + "learning_rate": 9.99961034992857e-06, + "loss": 1.2829, + "step": 440 + }, + { + "epoch": 0.03399629972247919, + "grad_norm": 4.121408462524414, + "learning_rate": 9.999594608278407e-06, + "loss": 1.1691, + "step": 441 + }, + { + "epoch": 0.034073388837496145, + "grad_norm": 4.504795074462891, + "learning_rate": 9.999578554929415e-06, + "loss": 1.1358, + "step": 442 + }, + { + "epoch": 0.03415047795251311, + "grad_norm": 4.258967876434326, + "learning_rate": 9.999562189882594e-06, + "loss": 1.1833, + "step": 443 + }, + { + "epoch": 0.034227567067530065, + "grad_norm": 4.424483776092529, + "learning_rate": 9.999545513138964e-06, + "loss": 1.2847, + "step": 444 + }, + { + "epoch": 0.03430465618254702, + "grad_norm": 4.2883687019348145, + "learning_rate": 9.999528524699563e-06, + "loss": 1.1569, + "step": 445 + }, + { + "epoch": 0.034381745297563986, + "grad_norm": 4.397137641906738, + "learning_rate": 9.999511224565453e-06, + "loss": 1.1686, + "step": 446 + }, + { + "epoch": 0.03445883441258094, + "grad_norm": 4.637729644775391, + "learning_rate": 9.999493612737712e-06, + "loss": 1.2365, + "step": 447 + }, + { + "epoch": 0.034535923527597906, + "grad_norm": 4.562082290649414, + "learning_rate": 9.999475689217434e-06, + "loss": 1.1553, + "step": 448 + }, + { + "epoch": 0.03461301264261486, + "grad_norm": 4.052862644195557, + "learning_rate": 9.999457454005743e-06, + "loss": 1.1823, + "step": 449 + }, + { + "epoch": 0.03469010175763182, + "grad_norm": 4.691180229187012, + "learning_rate": 9.999438907103772e-06, + "loss": 1.2576, + "step": 450 + }, + { + "epoch": 0.03476719087264878, + "grad_norm": 4.515274524688721, + "learning_rate": 9.999420048512678e-06, + "loss": 1.1499, + "step": 451 + }, + { + "epoch": 0.03484427998766574, + "grad_norm": 4.499693393707275, + "learning_rate": 9.999400878233636e-06, + "loss": 1.1628, + "step": 452 + }, + { + "epoch": 0.0349213691026827, + "grad_norm": 4.56060266494751, + "learning_rate": 9.999381396267841e-06, + "loss": 1.2407, + "step": 453 + }, + { + "epoch": 0.03499845821769966, + "grad_norm": 4.329947471618652, + "learning_rate": 9.99936160261651e-06, + "loss": 1.1197, + "step": 454 + }, + { + "epoch": 0.035075547332716624, + "grad_norm": 4.710033416748047, + "learning_rate": 9.999341497280875e-06, + "loss": 1.2235, + "step": 455 + }, + { + "epoch": 0.03515263644773358, + "grad_norm": 4.5141167640686035, + "learning_rate": 9.99932108026219e-06, + "loss": 1.2179, + "step": 456 + }, + { + "epoch": 0.03522972556275054, + "grad_norm": 4.853623390197754, + "learning_rate": 9.999300351561727e-06, + "loss": 1.2093, + "step": 457 + }, + { + "epoch": 0.0353068146777675, + "grad_norm": 4.331170558929443, + "learning_rate": 9.999279311180779e-06, + "loss": 1.3841, + "step": 458 + }, + { + "epoch": 0.03538390379278446, + "grad_norm": 4.377639293670654, + "learning_rate": 9.999257959120658e-06, + "loss": 1.2103, + "step": 459 + }, + { + "epoch": 0.03546099290780142, + "grad_norm": 4.253114700317383, + "learning_rate": 9.999236295382695e-06, + "loss": 1.1429, + "step": 460 + }, + { + "epoch": 0.03553808202281838, + "grad_norm": 4.613873481750488, + "learning_rate": 9.999214319968242e-06, + "loss": 1.2951, + "step": 461 + }, + { + "epoch": 0.035615171137835334, + "grad_norm": 4.3931498527526855, + "learning_rate": 9.999192032878667e-06, + "loss": 1.223, + "step": 462 + }, + { + "epoch": 0.0356922602528523, + "grad_norm": 4.192042827606201, + "learning_rate": 9.99916943411536e-06, + "loss": 1.1564, + "step": 463 + }, + { + "epoch": 0.035769349367869255, + "grad_norm": 4.241518974304199, + "learning_rate": 9.99914652367973e-06, + "loss": 1.1852, + "step": 464 + }, + { + "epoch": 0.03584643848288622, + "grad_norm": 4.183101177215576, + "learning_rate": 9.999123301573208e-06, + "loss": 1.2052, + "step": 465 + }, + { + "epoch": 0.035923527597903175, + "grad_norm": 4.752673149108887, + "learning_rate": 9.999099767797236e-06, + "loss": 1.1693, + "step": 466 + }, + { + "epoch": 0.03600061671292014, + "grad_norm": 4.796976089477539, + "learning_rate": 9.999075922353286e-06, + "loss": 1.3416, + "step": 467 + }, + { + "epoch": 0.036077705827937095, + "grad_norm": 4.705587863922119, + "learning_rate": 9.999051765242843e-06, + "loss": 1.2008, + "step": 468 + }, + { + "epoch": 0.03615479494295405, + "grad_norm": 4.491495609283447, + "learning_rate": 9.999027296467412e-06, + "loss": 1.1668, + "step": 469 + }, + { + "epoch": 0.036231884057971016, + "grad_norm": 4.496662139892578, + "learning_rate": 9.99900251602852e-06, + "loss": 1.2627, + "step": 470 + }, + { + "epoch": 0.03630897317298797, + "grad_norm": 4.168951511383057, + "learning_rate": 9.998977423927714e-06, + "loss": 1.1625, + "step": 471 + }, + { + "epoch": 0.036386062288004936, + "grad_norm": 4.014923095703125, + "learning_rate": 9.998952020166554e-06, + "loss": 1.1659, + "step": 472 + }, + { + "epoch": 0.03646315140302189, + "grad_norm": 4.847691059112549, + "learning_rate": 9.998926304746626e-06, + "loss": 1.1743, + "step": 473 + }, + { + "epoch": 0.03654024051803885, + "grad_norm": 4.13588809967041, + "learning_rate": 9.998900277669531e-06, + "loss": 1.1655, + "step": 474 + }, + { + "epoch": 0.03661732963305581, + "grad_norm": 4.349898815155029, + "learning_rate": 9.998873938936897e-06, + "loss": 1.2163, + "step": 475 + }, + { + "epoch": 0.03669441874807277, + "grad_norm": 4.191585540771484, + "learning_rate": 9.998847288550363e-06, + "loss": 1.1745, + "step": 476 + }, + { + "epoch": 0.036771507863089734, + "grad_norm": 4.346571922302246, + "learning_rate": 9.998820326511587e-06, + "loss": 1.1258, + "step": 477 + }, + { + "epoch": 0.03684859697810669, + "grad_norm": 4.2652459144592285, + "learning_rate": 9.998793052822256e-06, + "loss": 1.2362, + "step": 478 + }, + { + "epoch": 0.036925686093123654, + "grad_norm": 4.184598445892334, + "learning_rate": 9.998765467484066e-06, + "loss": 1.1744, + "step": 479 + }, + { + "epoch": 0.03700277520814061, + "grad_norm": 4.4486260414123535, + "learning_rate": 9.998737570498737e-06, + "loss": 1.1812, + "step": 480 + }, + { + "epoch": 0.03707986432315757, + "grad_norm": 4.7114410400390625, + "learning_rate": 9.998709361868012e-06, + "loss": 1.2171, + "step": 481 + }, + { + "epoch": 0.03715695343817453, + "grad_norm": 4.759378910064697, + "learning_rate": 9.998680841593647e-06, + "loss": 1.2212, + "step": 482 + }, + { + "epoch": 0.03723404255319149, + "grad_norm": 4.3762922286987305, + "learning_rate": 9.998652009677421e-06, + "loss": 1.1983, + "step": 483 + }, + { + "epoch": 0.03731113166820845, + "grad_norm": 4.322701454162598, + "learning_rate": 9.998622866121129e-06, + "loss": 1.2557, + "step": 484 + }, + { + "epoch": 0.03738822078322541, + "grad_norm": 4.573087692260742, + "learning_rate": 9.998593410926591e-06, + "loss": 1.2969, + "step": 485 + }, + { + "epoch": 0.03746530989824237, + "grad_norm": 4.229235649108887, + "learning_rate": 9.998563644095642e-06, + "loss": 1.2178, + "step": 486 + }, + { + "epoch": 0.03754239901325933, + "grad_norm": 4.301479339599609, + "learning_rate": 9.998533565630138e-06, + "loss": 1.2294, + "step": 487 + }, + { + "epoch": 0.037619488128276285, + "grad_norm": 4.079572677612305, + "learning_rate": 9.998503175531955e-06, + "loss": 1.2311, + "step": 488 + }, + { + "epoch": 0.03769657724329325, + "grad_norm": 5.241011619567871, + "learning_rate": 9.998472473802984e-06, + "loss": 1.2433, + "step": 489 + }, + { + "epoch": 0.037773666358310205, + "grad_norm": 4.18917989730835, + "learning_rate": 9.998441460445145e-06, + "loss": 1.3185, + "step": 490 + }, + { + "epoch": 0.03785075547332717, + "grad_norm": 4.833195209503174, + "learning_rate": 9.998410135460368e-06, + "loss": 1.2041, + "step": 491 + }, + { + "epoch": 0.037927844588344126, + "grad_norm": 4.499256610870361, + "learning_rate": 9.998378498850605e-06, + "loss": 1.2148, + "step": 492 + }, + { + "epoch": 0.03800493370336108, + "grad_norm": 4.2141947746276855, + "learning_rate": 9.998346550617833e-06, + "loss": 1.1423, + "step": 493 + }, + { + "epoch": 0.038082022818378046, + "grad_norm": 4.486852169036865, + "learning_rate": 9.998314290764038e-06, + "loss": 1.2306, + "step": 494 + }, + { + "epoch": 0.038159111933395, + "grad_norm": 4.096106052398682, + "learning_rate": 9.998281719291234e-06, + "loss": 1.113, + "step": 495 + }, + { + "epoch": 0.038236201048411966, + "grad_norm": 4.3826985359191895, + "learning_rate": 9.998248836201452e-06, + "loss": 1.1976, + "step": 496 + }, + { + "epoch": 0.03831329016342892, + "grad_norm": 4.423654079437256, + "learning_rate": 9.998215641496743e-06, + "loss": 1.1227, + "step": 497 + }, + { + "epoch": 0.03839037927844589, + "grad_norm": 3.936204671859741, + "learning_rate": 9.998182135179173e-06, + "loss": 1.1393, + "step": 498 + }, + { + "epoch": 0.03846746839346284, + "grad_norm": 4.782196521759033, + "learning_rate": 9.998148317250835e-06, + "loss": 1.1804, + "step": 499 + }, + { + "epoch": 0.0385445575084798, + "grad_norm": 4.143787384033203, + "learning_rate": 9.998114187713834e-06, + "loss": 1.1194, + "step": 500 + }, + { + "epoch": 0.038621646623496764, + "grad_norm": 4.448570728302002, + "learning_rate": 9.998079746570299e-06, + "loss": 1.2344, + "step": 501 + }, + { + "epoch": 0.03869873573851372, + "grad_norm": 4.438679218292236, + "learning_rate": 9.99804499382238e-06, + "loss": 1.3456, + "step": 502 + }, + { + "epoch": 0.038775824853530684, + "grad_norm": 4.5678606033325195, + "learning_rate": 9.99800992947224e-06, + "loss": 1.196, + "step": 503 + }, + { + "epoch": 0.03885291396854764, + "grad_norm": 4.333302021026611, + "learning_rate": 9.997974553522066e-06, + "loss": 1.2677, + "step": 504 + }, + { + "epoch": 0.0389300030835646, + "grad_norm": 4.986594200134277, + "learning_rate": 9.997938865974063e-06, + "loss": 1.2973, + "step": 505 + }, + { + "epoch": 0.03900709219858156, + "grad_norm": 4.153099536895752, + "learning_rate": 9.997902866830459e-06, + "loss": 1.1358, + "step": 506 + }, + { + "epoch": 0.03908418131359852, + "grad_norm": 4.446223735809326, + "learning_rate": 9.997866556093491e-06, + "loss": 1.2034, + "step": 507 + }, + { + "epoch": 0.03916127042861548, + "grad_norm": 4.689149379730225, + "learning_rate": 9.997829933765433e-06, + "loss": 1.242, + "step": 508 + }, + { + "epoch": 0.03923835954363244, + "grad_norm": 4.689703941345215, + "learning_rate": 9.997792999848561e-06, + "loss": 1.2383, + "step": 509 + }, + { + "epoch": 0.0393154486586494, + "grad_norm": 4.7472429275512695, + "learning_rate": 9.997755754345179e-06, + "loss": 1.3211, + "step": 510 + }, + { + "epoch": 0.03939253777366636, + "grad_norm": 4.735124111175537, + "learning_rate": 9.997718197257612e-06, + "loss": 1.2133, + "step": 511 + }, + { + "epoch": 0.039469626888683315, + "grad_norm": 4.425952911376953, + "learning_rate": 9.997680328588198e-06, + "loss": 1.1441, + "step": 512 + }, + { + "epoch": 0.03954671600370028, + "grad_norm": 4.210488319396973, + "learning_rate": 9.9976421483393e-06, + "loss": 1.2533, + "step": 513 + }, + { + "epoch": 0.039623805118717235, + "grad_norm": 4.078352451324463, + "learning_rate": 9.997603656513295e-06, + "loss": 1.179, + "step": 514 + }, + { + "epoch": 0.0397008942337342, + "grad_norm": 4.232064247131348, + "learning_rate": 9.997564853112586e-06, + "loss": 1.2968, + "step": 515 + }, + { + "epoch": 0.039777983348751156, + "grad_norm": 4.38081169128418, + "learning_rate": 9.997525738139595e-06, + "loss": 1.1834, + "step": 516 + }, + { + "epoch": 0.03985507246376811, + "grad_norm": 4.6666388511657715, + "learning_rate": 9.997486311596754e-06, + "loss": 1.16, + "step": 517 + }, + { + "epoch": 0.039932161578785076, + "grad_norm": 4.461724758148193, + "learning_rate": 9.997446573486524e-06, + "loss": 1.2314, + "step": 518 + }, + { + "epoch": 0.04000925069380203, + "grad_norm": 4.663398265838623, + "learning_rate": 9.997406523811383e-06, + "loss": 1.1742, + "step": 519 + }, + { + "epoch": 0.040086339808818996, + "grad_norm": 4.144864559173584, + "learning_rate": 9.997366162573828e-06, + "loss": 1.1362, + "step": 520 + }, + { + "epoch": 0.04016342892383595, + "grad_norm": 4.398372650146484, + "learning_rate": 9.997325489776375e-06, + "loss": 1.1925, + "step": 521 + }, + { + "epoch": 0.04024051803885292, + "grad_norm": 4.672181606292725, + "learning_rate": 9.997284505421559e-06, + "loss": 1.2667, + "step": 522 + }, + { + "epoch": 0.040317607153869874, + "grad_norm": 4.4285569190979, + "learning_rate": 9.997243209511935e-06, + "loss": 1.3158, + "step": 523 + }, + { + "epoch": 0.04039469626888683, + "grad_norm": 4.436081886291504, + "learning_rate": 9.99720160205008e-06, + "loss": 1.1041, + "step": 524 + }, + { + "epoch": 0.040471785383903794, + "grad_norm": 4.347367286682129, + "learning_rate": 9.997159683038584e-06, + "loss": 1.1595, + "step": 525 + }, + { + "epoch": 0.04054887449892075, + "grad_norm": 4.27985143661499, + "learning_rate": 9.997117452480063e-06, + "loss": 1.283, + "step": 526 + }, + { + "epoch": 0.040625963613937714, + "grad_norm": 4.468918800354004, + "learning_rate": 9.997074910377151e-06, + "loss": 1.1987, + "step": 527 + }, + { + "epoch": 0.04070305272895467, + "grad_norm": 4.384061813354492, + "learning_rate": 9.997032056732498e-06, + "loss": 1.1361, + "step": 528 + }, + { + "epoch": 0.040780141843971635, + "grad_norm": 4.80853271484375, + "learning_rate": 9.996988891548777e-06, + "loss": 1.2749, + "step": 529 + }, + { + "epoch": 0.04085723095898859, + "grad_norm": 4.914130210876465, + "learning_rate": 9.99694541482868e-06, + "loss": 1.4303, + "step": 530 + }, + { + "epoch": 0.04093432007400555, + "grad_norm": 4.093295097351074, + "learning_rate": 9.996901626574915e-06, + "loss": 1.155, + "step": 531 + }, + { + "epoch": 0.04101140918902251, + "grad_norm": 4.31951379776001, + "learning_rate": 9.996857526790212e-06, + "loss": 1.206, + "step": 532 + }, + { + "epoch": 0.04108849830403947, + "grad_norm": 6.6202521324157715, + "learning_rate": 9.996813115477323e-06, + "loss": 1.1679, + "step": 533 + }, + { + "epoch": 0.04116558741905643, + "grad_norm": 4.6812334060668945, + "learning_rate": 9.996768392639015e-06, + "loss": 1.205, + "step": 534 + }, + { + "epoch": 0.04124267653407339, + "grad_norm": 4.417919158935547, + "learning_rate": 9.996723358278077e-06, + "loss": 1.2173, + "step": 535 + }, + { + "epoch": 0.041319765649090345, + "grad_norm": 4.809331893920898, + "learning_rate": 9.996678012397317e-06, + "loss": 1.2465, + "step": 536 + }, + { + "epoch": 0.04139685476410731, + "grad_norm": 4.317751884460449, + "learning_rate": 9.99663235499956e-06, + "loss": 1.1406, + "step": 537 + }, + { + "epoch": 0.041473943879124266, + "grad_norm": 4.429257392883301, + "learning_rate": 9.996586386087653e-06, + "loss": 1.2509, + "step": 538 + }, + { + "epoch": 0.04155103299414123, + "grad_norm": 4.199450969696045, + "learning_rate": 9.996540105664464e-06, + "loss": 1.1988, + "step": 539 + }, + { + "epoch": 0.041628122109158186, + "grad_norm": 4.068055152893066, + "learning_rate": 9.996493513732878e-06, + "loss": 1.212, + "step": 540 + }, + { + "epoch": 0.04170521122417515, + "grad_norm": 4.471511363983154, + "learning_rate": 9.996446610295797e-06, + "loss": 1.2122, + "step": 541 + }, + { + "epoch": 0.041782300339192106, + "grad_norm": 4.128568172454834, + "learning_rate": 9.996399395356149e-06, + "loss": 1.2008, + "step": 542 + }, + { + "epoch": 0.04185938945420906, + "grad_norm": 4.495331764221191, + "learning_rate": 9.996351868916874e-06, + "loss": 1.2308, + "step": 543 + }, + { + "epoch": 0.04193647856922603, + "grad_norm": 4.763047218322754, + "learning_rate": 9.996304030980939e-06, + "loss": 1.2247, + "step": 544 + }, + { + "epoch": 0.04201356768424298, + "grad_norm": 4.249602317810059, + "learning_rate": 9.996255881551322e-06, + "loss": 1.2335, + "step": 545 + }, + { + "epoch": 0.04209065679925995, + "grad_norm": 4.013154983520508, + "learning_rate": 9.996207420631029e-06, + "loss": 1.0859, + "step": 546 + }, + { + "epoch": 0.042167745914276904, + "grad_norm": 4.579375743865967, + "learning_rate": 9.996158648223077e-06, + "loss": 1.1609, + "step": 547 + }, + { + "epoch": 0.04224483502929386, + "grad_norm": 4.4072747230529785, + "learning_rate": 9.996109564330513e-06, + "loss": 1.2142, + "step": 548 + }, + { + "epoch": 0.042321924144310824, + "grad_norm": 5.151735782623291, + "learning_rate": 9.99606016895639e-06, + "loss": 1.2514, + "step": 549 + }, + { + "epoch": 0.04239901325932778, + "grad_norm": 4.274787902832031, + "learning_rate": 9.996010462103791e-06, + "loss": 1.087, + "step": 550 + }, + { + "epoch": 0.042476102374344744, + "grad_norm": 4.291683197021484, + "learning_rate": 9.995960443775814e-06, + "loss": 1.0762, + "step": 551 + }, + { + "epoch": 0.0425531914893617, + "grad_norm": 4.732009410858154, + "learning_rate": 9.99591011397558e-06, + "loss": 1.2677, + "step": 552 + }, + { + "epoch": 0.042630280604378665, + "grad_norm": 4.424540996551514, + "learning_rate": 9.995859472706223e-06, + "loss": 1.3005, + "step": 553 + }, + { + "epoch": 0.04270736971939562, + "grad_norm": 4.752233505249023, + "learning_rate": 9.995808519970902e-06, + "loss": 1.278, + "step": 554 + }, + { + "epoch": 0.04278445883441258, + "grad_norm": 3.9832558631896973, + "learning_rate": 9.995757255772795e-06, + "loss": 1.1136, + "step": 555 + }, + { + "epoch": 0.04286154794942954, + "grad_norm": 4.465639114379883, + "learning_rate": 9.995705680115098e-06, + "loss": 1.1282, + "step": 556 + }, + { + "epoch": 0.0429386370644465, + "grad_norm": 4.6259446144104, + "learning_rate": 9.995653793001023e-06, + "loss": 1.2864, + "step": 557 + }, + { + "epoch": 0.04301572617946346, + "grad_norm": 3.95756459236145, + "learning_rate": 9.995601594433808e-06, + "loss": 1.1661, + "step": 558 + }, + { + "epoch": 0.04309281529448042, + "grad_norm": 4.663955211639404, + "learning_rate": 9.995549084416706e-06, + "loss": 1.1438, + "step": 559 + }, + { + "epoch": 0.043169904409497376, + "grad_norm": 4.4860615730285645, + "learning_rate": 9.995496262952993e-06, + "loss": 1.3304, + "step": 560 + }, + { + "epoch": 0.04324699352451434, + "grad_norm": 4.189501762390137, + "learning_rate": 9.995443130045958e-06, + "loss": 1.1138, + "step": 561 + }, + { + "epoch": 0.043324082639531296, + "grad_norm": 4.266587734222412, + "learning_rate": 9.995389685698918e-06, + "loss": 1.1725, + "step": 562 + }, + { + "epoch": 0.04340117175454826, + "grad_norm": 4.076508045196533, + "learning_rate": 9.995335929915202e-06, + "loss": 1.2068, + "step": 563 + }, + { + "epoch": 0.043478260869565216, + "grad_norm": 4.271369934082031, + "learning_rate": 9.995281862698161e-06, + "loss": 1.2824, + "step": 564 + }, + { + "epoch": 0.04355534998458218, + "grad_norm": 4.289413928985596, + "learning_rate": 9.995227484051168e-06, + "loss": 1.1504, + "step": 565 + }, + { + "epoch": 0.043632439099599137, + "grad_norm": 4.292287826538086, + "learning_rate": 9.995172793977613e-06, + "loss": 1.1755, + "step": 566 + }, + { + "epoch": 0.04370952821461609, + "grad_norm": 4.293979167938232, + "learning_rate": 9.995117792480903e-06, + "loss": 1.1852, + "step": 567 + }, + { + "epoch": 0.04378661732963306, + "grad_norm": 4.438719272613525, + "learning_rate": 9.995062479564472e-06, + "loss": 1.2362, + "step": 568 + }, + { + "epoch": 0.043863706444650014, + "grad_norm": 4.045646667480469, + "learning_rate": 9.995006855231763e-06, + "loss": 1.1894, + "step": 569 + }, + { + "epoch": 0.04394079555966698, + "grad_norm": 4.521982669830322, + "learning_rate": 9.994950919486248e-06, + "loss": 1.2075, + "step": 570 + }, + { + "epoch": 0.044017884674683934, + "grad_norm": 4.346794605255127, + "learning_rate": 9.994894672331413e-06, + "loss": 1.0808, + "step": 571 + }, + { + "epoch": 0.0440949737897009, + "grad_norm": 3.817455291748047, + "learning_rate": 9.994838113770762e-06, + "loss": 1.1479, + "step": 572 + }, + { + "epoch": 0.044172062904717854, + "grad_norm": 4.4810566902160645, + "learning_rate": 9.994781243807825e-06, + "loss": 1.1589, + "step": 573 + }, + { + "epoch": 0.04424915201973481, + "grad_norm": 4.639770030975342, + "learning_rate": 9.994724062446145e-06, + "loss": 1.2405, + "step": 574 + }, + { + "epoch": 0.044326241134751775, + "grad_norm": 3.9520983695983887, + "learning_rate": 9.994666569689291e-06, + "loss": 1.106, + "step": 575 + }, + { + "epoch": 0.04440333024976873, + "grad_norm": 4.4173407554626465, + "learning_rate": 9.994608765540842e-06, + "loss": 1.1598, + "step": 576 + }, + { + "epoch": 0.044480419364785695, + "grad_norm": 3.9496123790740967, + "learning_rate": 9.994550650004404e-06, + "loss": 1.2553, + "step": 577 + }, + { + "epoch": 0.04455750847980265, + "grad_norm": 4.36904239654541, + "learning_rate": 9.994492223083603e-06, + "loss": 1.2274, + "step": 578 + }, + { + "epoch": 0.04463459759481961, + "grad_norm": 4.925714015960693, + "learning_rate": 9.994433484782076e-06, + "loss": 1.3003, + "step": 579 + }, + { + "epoch": 0.04471168670983657, + "grad_norm": 4.471517086029053, + "learning_rate": 9.994374435103489e-06, + "loss": 1.3164, + "step": 580 + }, + { + "epoch": 0.04478877582485353, + "grad_norm": 4.279041290283203, + "learning_rate": 9.994315074051525e-06, + "loss": 1.1954, + "step": 581 + }, + { + "epoch": 0.04486586493987049, + "grad_norm": 4.745578289031982, + "learning_rate": 9.994255401629878e-06, + "loss": 1.2623, + "step": 582 + }, + { + "epoch": 0.04494295405488745, + "grad_norm": 4.342554569244385, + "learning_rate": 9.994195417842276e-06, + "loss": 1.3354, + "step": 583 + }, + { + "epoch": 0.04502004316990441, + "grad_norm": 4.38990592956543, + "learning_rate": 9.994135122692454e-06, + "loss": 1.2993, + "step": 584 + }, + { + "epoch": 0.04509713228492137, + "grad_norm": 4.334774971008301, + "learning_rate": 9.99407451618417e-06, + "loss": 1.2366, + "step": 585 + }, + { + "epoch": 0.045174221399938326, + "grad_norm": 4.226545333862305, + "learning_rate": 9.994013598321208e-06, + "loss": 1.2286, + "step": 586 + }, + { + "epoch": 0.04525131051495529, + "grad_norm": 4.3245391845703125, + "learning_rate": 9.993952369107363e-06, + "loss": 1.2417, + "step": 587 + }, + { + "epoch": 0.045328399629972246, + "grad_norm": 4.576898097991943, + "learning_rate": 9.99389082854645e-06, + "loss": 1.3929, + "step": 588 + }, + { + "epoch": 0.04540548874498921, + "grad_norm": 4.47149658203125, + "learning_rate": 9.993828976642311e-06, + "loss": 1.2802, + "step": 589 + }, + { + "epoch": 0.04548257786000617, + "grad_norm": 4.014610290527344, + "learning_rate": 9.993766813398796e-06, + "loss": 1.1964, + "step": 590 + }, + { + "epoch": 0.04555966697502312, + "grad_norm": 4.17547607421875, + "learning_rate": 9.993704338819786e-06, + "loss": 1.2604, + "step": 591 + }, + { + "epoch": 0.04563675609004009, + "grad_norm": 4.3998823165893555, + "learning_rate": 9.993641552909172e-06, + "loss": 1.2443, + "step": 592 + }, + { + "epoch": 0.045713845205057044, + "grad_norm": 4.851663589477539, + "learning_rate": 9.99357845567087e-06, + "loss": 1.3904, + "step": 593 + }, + { + "epoch": 0.04579093432007401, + "grad_norm": 4.441982746124268, + "learning_rate": 9.993515047108813e-06, + "loss": 1.2615, + "step": 594 + }, + { + "epoch": 0.045868023435090964, + "grad_norm": 4.005769729614258, + "learning_rate": 9.993451327226955e-06, + "loss": 1.1988, + "step": 595 + }, + { + "epoch": 0.04594511255010793, + "grad_norm": 4.6932454109191895, + "learning_rate": 9.99338729602927e-06, + "loss": 1.1599, + "step": 596 + }, + { + "epoch": 0.046022201665124884, + "grad_norm": 4.53994083404541, + "learning_rate": 9.993322953519749e-06, + "loss": 1.1581, + "step": 597 + }, + { + "epoch": 0.04609929078014184, + "grad_norm": 4.505131721496582, + "learning_rate": 9.9932582997024e-06, + "loss": 1.2383, + "step": 598 + }, + { + "epoch": 0.046176379895158805, + "grad_norm": 4.228984355926514, + "learning_rate": 9.993193334581259e-06, + "loss": 1.3035, + "step": 599 + }, + { + "epoch": 0.04625346901017576, + "grad_norm": 4.068345546722412, + "learning_rate": 9.993128058160373e-06, + "loss": 1.1668, + "step": 600 + }, + { + "epoch": 0.046330558125192725, + "grad_norm": 4.136154651641846, + "learning_rate": 9.993062470443814e-06, + "loss": 1.2611, + "step": 601 + }, + { + "epoch": 0.04640764724020968, + "grad_norm": 3.893854856491089, + "learning_rate": 9.992996571435668e-06, + "loss": 1.0874, + "step": 602 + }, + { + "epoch": 0.046484736355226645, + "grad_norm": 3.9865612983703613, + "learning_rate": 9.992930361140045e-06, + "loss": 1.1812, + "step": 603 + }, + { + "epoch": 0.0465618254702436, + "grad_norm": 4.678125381469727, + "learning_rate": 9.992863839561076e-06, + "loss": 1.2949, + "step": 604 + }, + { + "epoch": 0.04663891458526056, + "grad_norm": 4.224588871002197, + "learning_rate": 9.992797006702902e-06, + "loss": 1.1542, + "step": 605 + }, + { + "epoch": 0.04671600370027752, + "grad_norm": 4.713046073913574, + "learning_rate": 9.992729862569694e-06, + "loss": 1.2428, + "step": 606 + }, + { + "epoch": 0.04679309281529448, + "grad_norm": 4.193148612976074, + "learning_rate": 9.992662407165637e-06, + "loss": 1.2829, + "step": 607 + }, + { + "epoch": 0.04687018193031144, + "grad_norm": 4.588696479797363, + "learning_rate": 9.992594640494937e-06, + "loss": 1.3168, + "step": 608 + }, + { + "epoch": 0.0469472710453284, + "grad_norm": 3.988802671432495, + "learning_rate": 9.99252656256182e-06, + "loss": 1.1058, + "step": 609 + }, + { + "epoch": 0.047024360160345356, + "grad_norm": 4.400096893310547, + "learning_rate": 9.992458173370525e-06, + "loss": 1.3421, + "step": 610 + }, + { + "epoch": 0.04710144927536232, + "grad_norm": 4.6701483726501465, + "learning_rate": 9.992389472925323e-06, + "loss": 1.2688, + "step": 611 + }, + { + "epoch": 0.04717853839037928, + "grad_norm": 4.002415180206299, + "learning_rate": 9.99232046123049e-06, + "loss": 1.163, + "step": 612 + }, + { + "epoch": 0.04725562750539624, + "grad_norm": 4.165858268737793, + "learning_rate": 9.992251138290336e-06, + "loss": 1.157, + "step": 613 + }, + { + "epoch": 0.0473327166204132, + "grad_norm": 4.390151023864746, + "learning_rate": 9.992181504109177e-06, + "loss": 1.147, + "step": 614 + }, + { + "epoch": 0.04740980573543016, + "grad_norm": 4.475891590118408, + "learning_rate": 9.992111558691357e-06, + "loss": 1.1861, + "step": 615 + }, + { + "epoch": 0.04748689485044712, + "grad_norm": 4.247600555419922, + "learning_rate": 9.992041302041238e-06, + "loss": 1.0589, + "step": 616 + }, + { + "epoch": 0.047563983965464074, + "grad_norm": 4.422875881195068, + "learning_rate": 9.991970734163195e-06, + "loss": 1.1923, + "step": 617 + }, + { + "epoch": 0.04764107308048104, + "grad_norm": 4.901252746582031, + "learning_rate": 9.991899855061633e-06, + "loss": 1.1754, + "step": 618 + }, + { + "epoch": 0.047718162195497994, + "grad_norm": 4.342195510864258, + "learning_rate": 9.991828664740969e-06, + "loss": 1.1996, + "step": 619 + }, + { + "epoch": 0.04779525131051496, + "grad_norm": 4.116771697998047, + "learning_rate": 9.991757163205638e-06, + "loss": 1.1596, + "step": 620 + }, + { + "epoch": 0.047872340425531915, + "grad_norm": 4.246543884277344, + "learning_rate": 9.991685350460102e-06, + "loss": 1.2953, + "step": 621 + }, + { + "epoch": 0.04794942954054887, + "grad_norm": 4.528024673461914, + "learning_rate": 9.991613226508838e-06, + "loss": 1.1836, + "step": 622 + }, + { + "epoch": 0.048026518655565835, + "grad_norm": 4.674347877502441, + "learning_rate": 9.991540791356342e-06, + "loss": 1.2399, + "step": 623 + }, + { + "epoch": 0.04810360777058279, + "grad_norm": 3.99076247215271, + "learning_rate": 9.99146804500713e-06, + "loss": 1.1949, + "step": 624 + }, + { + "epoch": 0.048180696885599755, + "grad_norm": 4.242091178894043, + "learning_rate": 9.991394987465734e-06, + "loss": 1.2284, + "step": 625 + }, + { + "epoch": 0.04825778600061671, + "grad_norm": 4.19851016998291, + "learning_rate": 9.991321618736713e-06, + "loss": 1.2451, + "step": 626 + }, + { + "epoch": 0.048334875115633676, + "grad_norm": 4.685157299041748, + "learning_rate": 9.991247938824641e-06, + "loss": 1.1317, + "step": 627 + }, + { + "epoch": 0.04841196423065063, + "grad_norm": 3.7727530002593994, + "learning_rate": 9.991173947734109e-06, + "loss": 1.1234, + "step": 628 + }, + { + "epoch": 0.04848905334566759, + "grad_norm": 4.488282203674316, + "learning_rate": 9.991099645469733e-06, + "loss": 1.1491, + "step": 629 + }, + { + "epoch": 0.04856614246068455, + "grad_norm": 4.348680019378662, + "learning_rate": 9.991025032036141e-06, + "loss": 1.273, + "step": 630 + }, + { + "epoch": 0.04864323157570151, + "grad_norm": 4.021731853485107, + "learning_rate": 9.990950107437989e-06, + "loss": 1.092, + "step": 631 + }, + { + "epoch": 0.04872032069071847, + "grad_norm": 3.9224579334259033, + "learning_rate": 9.990874871679948e-06, + "loss": 1.1298, + "step": 632 + }, + { + "epoch": 0.04879740980573543, + "grad_norm": 4.388615608215332, + "learning_rate": 9.990799324766704e-06, + "loss": 1.3351, + "step": 633 + }, + { + "epoch": 0.048874498920752386, + "grad_norm": 4.323206901550293, + "learning_rate": 9.990723466702972e-06, + "loss": 1.3192, + "step": 634 + }, + { + "epoch": 0.04895158803576935, + "grad_norm": 4.259064674377441, + "learning_rate": 9.99064729749348e-06, + "loss": 1.2824, + "step": 635 + }, + { + "epoch": 0.04902867715078631, + "grad_norm": 4.554323673248291, + "learning_rate": 9.990570817142974e-06, + "loss": 1.302, + "step": 636 + }, + { + "epoch": 0.04910576626580327, + "grad_norm": 4.136668682098389, + "learning_rate": 9.990494025656227e-06, + "loss": 1.2416, + "step": 637 + }, + { + "epoch": 0.04918285538082023, + "grad_norm": 4.030932426452637, + "learning_rate": 9.990416923038022e-06, + "loss": 1.2393, + "step": 638 + }, + { + "epoch": 0.04925994449583719, + "grad_norm": 4.34704065322876, + "learning_rate": 9.990339509293169e-06, + "loss": 1.175, + "step": 639 + }, + { + "epoch": 0.04933703361085415, + "grad_norm": 4.10532808303833, + "learning_rate": 9.990261784426494e-06, + "loss": 1.0991, + "step": 640 + }, + { + "epoch": 0.049414122725871104, + "grad_norm": 4.257048606872559, + "learning_rate": 9.99018374844284e-06, + "loss": 1.2731, + "step": 641 + }, + { + "epoch": 0.04949121184088807, + "grad_norm": 4.365378379821777, + "learning_rate": 9.990105401347075e-06, + "loss": 1.1691, + "step": 642 + }, + { + "epoch": 0.049568300955905024, + "grad_norm": 5.146248817443848, + "learning_rate": 9.990026743144084e-06, + "loss": 1.1753, + "step": 643 + }, + { + "epoch": 0.04964539007092199, + "grad_norm": 4.770695686340332, + "learning_rate": 9.989947773838766e-06, + "loss": 1.3062, + "step": 644 + }, + { + "epoch": 0.049722479185938945, + "grad_norm": 4.065369606018066, + "learning_rate": 9.989868493436052e-06, + "loss": 1.1505, + "step": 645 + }, + { + "epoch": 0.04979956830095591, + "grad_norm": 3.940561294555664, + "learning_rate": 9.989788901940878e-06, + "loss": 1.1703, + "step": 646 + }, + { + "epoch": 0.049876657415972865, + "grad_norm": 4.689897537231445, + "learning_rate": 9.98970899935821e-06, + "loss": 1.2645, + "step": 647 + }, + { + "epoch": 0.04995374653098982, + "grad_norm": 3.9842276573181152, + "learning_rate": 9.989628785693027e-06, + "loss": 1.2856, + "step": 648 + }, + { + "epoch": 0.050030835646006785, + "grad_norm": 4.0935773849487305, + "learning_rate": 9.989548260950332e-06, + "loss": 1.2049, + "step": 649 + }, + { + "epoch": 0.05010792476102374, + "grad_norm": 4.526483058929443, + "learning_rate": 9.989467425135145e-06, + "loss": 1.2788, + "step": 650 + }, + { + "epoch": 0.050185013876040706, + "grad_norm": 4.23220682144165, + "learning_rate": 9.989386278252505e-06, + "loss": 1.2215, + "step": 651 + }, + { + "epoch": 0.05026210299105766, + "grad_norm": 4.511578559875488, + "learning_rate": 9.989304820307469e-06, + "loss": 1.2454, + "step": 652 + }, + { + "epoch": 0.05033919210607462, + "grad_norm": 3.9522716999053955, + "learning_rate": 9.98922305130512e-06, + "loss": 1.1437, + "step": 653 + }, + { + "epoch": 0.05041628122109158, + "grad_norm": 4.068902492523193, + "learning_rate": 9.989140971250553e-06, + "loss": 1.1963, + "step": 654 + }, + { + "epoch": 0.05049337033610854, + "grad_norm": 4.930382251739502, + "learning_rate": 9.989058580148885e-06, + "loss": 1.2457, + "step": 655 + }, + { + "epoch": 0.0505704594511255, + "grad_norm": 3.9273664951324463, + "learning_rate": 9.988975878005256e-06, + "loss": 1.1408, + "step": 656 + }, + { + "epoch": 0.05064754856614246, + "grad_norm": 4.123290061950684, + "learning_rate": 9.988892864824817e-06, + "loss": 1.2368, + "step": 657 + }, + { + "epoch": 0.050724637681159424, + "grad_norm": 3.8407251834869385, + "learning_rate": 9.988809540612747e-06, + "loss": 1.2847, + "step": 658 + }, + { + "epoch": 0.05080172679617638, + "grad_norm": 4.222978115081787, + "learning_rate": 9.988725905374242e-06, + "loss": 1.1422, + "step": 659 + }, + { + "epoch": 0.05087881591119334, + "grad_norm": 4.184953689575195, + "learning_rate": 9.988641959114512e-06, + "loss": 1.1177, + "step": 660 + }, + { + "epoch": 0.0509559050262103, + "grad_norm": 4.532317161560059, + "learning_rate": 9.988557701838791e-06, + "loss": 1.2804, + "step": 661 + }, + { + "epoch": 0.05103299414122726, + "grad_norm": 4.091014385223389, + "learning_rate": 9.988473133552336e-06, + "loss": 1.1743, + "step": 662 + }, + { + "epoch": 0.05111008325624422, + "grad_norm": 4.535579204559326, + "learning_rate": 9.988388254260417e-06, + "loss": 1.161, + "step": 663 + }, + { + "epoch": 0.05118717237126118, + "grad_norm": 4.38757848739624, + "learning_rate": 9.988303063968325e-06, + "loss": 1.2166, + "step": 664 + }, + { + "epoch": 0.051264261486278134, + "grad_norm": 4.727634906768799, + "learning_rate": 9.988217562681373e-06, + "loss": 1.1831, + "step": 665 + }, + { + "epoch": 0.0513413506012951, + "grad_norm": 4.283156871795654, + "learning_rate": 9.988131750404889e-06, + "loss": 1.2051, + "step": 666 + }, + { + "epoch": 0.051418439716312055, + "grad_norm": 4.735593318939209, + "learning_rate": 9.988045627144226e-06, + "loss": 1.2458, + "step": 667 + }, + { + "epoch": 0.05149552883132902, + "grad_norm": 3.9615490436553955, + "learning_rate": 9.98795919290475e-06, + "loss": 1.3039, + "step": 668 + }, + { + "epoch": 0.051572617946345975, + "grad_norm": 4.121702671051025, + "learning_rate": 9.987872447691853e-06, + "loss": 1.1451, + "step": 669 + }, + { + "epoch": 0.05164970706136294, + "grad_norm": 4.138181209564209, + "learning_rate": 9.987785391510943e-06, + "loss": 1.2745, + "step": 670 + }, + { + "epoch": 0.051726796176379895, + "grad_norm": 4.431112289428711, + "learning_rate": 9.987698024367444e-06, + "loss": 1.2789, + "step": 671 + }, + { + "epoch": 0.05180388529139685, + "grad_norm": 4.268251895904541, + "learning_rate": 9.987610346266808e-06, + "loss": 1.128, + "step": 672 + }, + { + "epoch": 0.051880974406413816, + "grad_norm": 4.024346351623535, + "learning_rate": 9.987522357214496e-06, + "loss": 1.1851, + "step": 673 + }, + { + "epoch": 0.05195806352143077, + "grad_norm": 4.691232681274414, + "learning_rate": 9.987434057215996e-06, + "loss": 1.1774, + "step": 674 + }, + { + "epoch": 0.052035152636447736, + "grad_norm": 4.2596635818481445, + "learning_rate": 9.987345446276815e-06, + "loss": 1.1782, + "step": 675 + }, + { + "epoch": 0.05211224175146469, + "grad_norm": 4.4914231300354, + "learning_rate": 9.987256524402473e-06, + "loss": 1.3086, + "step": 676 + }, + { + "epoch": 0.05218933086648165, + "grad_norm": 4.257656097412109, + "learning_rate": 9.987167291598518e-06, + "loss": 1.2465, + "step": 677 + }, + { + "epoch": 0.05226641998149861, + "grad_norm": 4.148374080657959, + "learning_rate": 9.987077747870512e-06, + "loss": 1.2106, + "step": 678 + }, + { + "epoch": 0.05234350909651557, + "grad_norm": 4.15828275680542, + "learning_rate": 9.986987893224036e-06, + "loss": 1.2535, + "step": 679 + }, + { + "epoch": 0.05242059821153253, + "grad_norm": 4.436988353729248, + "learning_rate": 9.986897727664693e-06, + "loss": 1.2644, + "step": 680 + }, + { + "epoch": 0.05249768732654949, + "grad_norm": 4.664417266845703, + "learning_rate": 9.986807251198105e-06, + "loss": 1.3725, + "step": 681 + }, + { + "epoch": 0.052574776441566454, + "grad_norm": 4.027350425720215, + "learning_rate": 9.986716463829913e-06, + "loss": 1.2051, + "step": 682 + }, + { + "epoch": 0.05265186555658341, + "grad_norm": 4.5459818840026855, + "learning_rate": 9.986625365565776e-06, + "loss": 1.3047, + "step": 683 + }, + { + "epoch": 0.05272895467160037, + "grad_norm": 4.167656898498535, + "learning_rate": 9.986533956411373e-06, + "loss": 1.1959, + "step": 684 + }, + { + "epoch": 0.05280604378661733, + "grad_norm": 4.064361095428467, + "learning_rate": 9.986442236372404e-06, + "loss": 1.1627, + "step": 685 + }, + { + "epoch": 0.05288313290163429, + "grad_norm": 4.358853816986084, + "learning_rate": 9.986350205454587e-06, + "loss": 1.1343, + "step": 686 + }, + { + "epoch": 0.05296022201665125, + "grad_norm": 3.9868597984313965, + "learning_rate": 9.98625786366366e-06, + "loss": 1.1268, + "step": 687 + }, + { + "epoch": 0.05303731113166821, + "grad_norm": 3.7771265506744385, + "learning_rate": 9.98616521100538e-06, + "loss": 1.1284, + "step": 688 + }, + { + "epoch": 0.05311440024668517, + "grad_norm": 4.3471574783325195, + "learning_rate": 9.986072247485523e-06, + "loss": 1.2269, + "step": 689 + }, + { + "epoch": 0.05319148936170213, + "grad_norm": 3.956973075866699, + "learning_rate": 9.985978973109885e-06, + "loss": 1.2157, + "step": 690 + }, + { + "epoch": 0.053268578476719085, + "grad_norm": 4.183494567871094, + "learning_rate": 9.98588538788428e-06, + "loss": 1.2887, + "step": 691 + }, + { + "epoch": 0.05334566759173605, + "grad_norm": 4.123586654663086, + "learning_rate": 9.985791491814545e-06, + "loss": 1.1641, + "step": 692 + }, + { + "epoch": 0.053422756706753005, + "grad_norm": 4.9690680503845215, + "learning_rate": 9.985697284906532e-06, + "loss": 1.221, + "step": 693 + }, + { + "epoch": 0.05349984582176997, + "grad_norm": 3.8592402935028076, + "learning_rate": 9.985602767166114e-06, + "loss": 1.2379, + "step": 694 + }, + { + "epoch": 0.053576934936786925, + "grad_norm": 4.504653453826904, + "learning_rate": 9.985507938599186e-06, + "loss": 1.2235, + "step": 695 + }, + { + "epoch": 0.05365402405180388, + "grad_norm": 4.732753753662109, + "learning_rate": 9.985412799211658e-06, + "loss": 1.2707, + "step": 696 + }, + { + "epoch": 0.053731113166820846, + "grad_norm": 3.8671162128448486, + "learning_rate": 9.98531734900946e-06, + "loss": 1.282, + "step": 697 + }, + { + "epoch": 0.0538082022818378, + "grad_norm": 4.004355430603027, + "learning_rate": 9.985221587998549e-06, + "loss": 1.1439, + "step": 698 + }, + { + "epoch": 0.053885291396854766, + "grad_norm": 4.045041084289551, + "learning_rate": 9.985125516184889e-06, + "loss": 1.1859, + "step": 699 + }, + { + "epoch": 0.05396238051187172, + "grad_norm": 4.078817367553711, + "learning_rate": 9.985029133574471e-06, + "loss": 1.231, + "step": 700 + }, + { + "epoch": 0.054039469626888686, + "grad_norm": 4.425305366516113, + "learning_rate": 9.984932440173304e-06, + "loss": 1.2619, + "step": 701 + }, + { + "epoch": 0.05411655874190564, + "grad_norm": 4.202983856201172, + "learning_rate": 9.98483543598742e-06, + "loss": 1.1445, + "step": 702 + }, + { + "epoch": 0.0541936478569226, + "grad_norm": 4.361485958099365, + "learning_rate": 9.984738121022861e-06, + "loss": 1.2118, + "step": 703 + }, + { + "epoch": 0.054270736971939564, + "grad_norm": 4.436924934387207, + "learning_rate": 9.9846404952857e-06, + "loss": 1.3065, + "step": 704 + }, + { + "epoch": 0.05434782608695652, + "grad_norm": 4.535494804382324, + "learning_rate": 9.984542558782018e-06, + "loss": 1.2161, + "step": 705 + }, + { + "epoch": 0.054424915201973484, + "grad_norm": 4.041503429412842, + "learning_rate": 9.984444311517923e-06, + "loss": 1.2332, + "step": 706 + }, + { + "epoch": 0.05450200431699044, + "grad_norm": 4.140787601470947, + "learning_rate": 9.98434575349954e-06, + "loss": 1.2384, + "step": 707 + }, + { + "epoch": 0.0545790934320074, + "grad_norm": 4.458608627319336, + "learning_rate": 9.984246884733014e-06, + "loss": 1.1531, + "step": 708 + }, + { + "epoch": 0.05465618254702436, + "grad_norm": 4.457377910614014, + "learning_rate": 9.984147705224509e-06, + "loss": 1.1315, + "step": 709 + }, + { + "epoch": 0.05473327166204132, + "grad_norm": 4.368088722229004, + "learning_rate": 9.984048214980209e-06, + "loss": 1.2896, + "step": 710 + }, + { + "epoch": 0.05481036077705828, + "grad_norm": 4.405154228210449, + "learning_rate": 9.983948414006315e-06, + "loss": 1.2627, + "step": 711 + }, + { + "epoch": 0.05488744989207524, + "grad_norm": 4.391892433166504, + "learning_rate": 9.98384830230905e-06, + "loss": 1.2523, + "step": 712 + }, + { + "epoch": 0.0549645390070922, + "grad_norm": 4.066841125488281, + "learning_rate": 9.983747879894655e-06, + "loss": 1.3027, + "step": 713 + }, + { + "epoch": 0.05504162812210916, + "grad_norm": 3.8345208168029785, + "learning_rate": 9.98364714676939e-06, + "loss": 1.2225, + "step": 714 + }, + { + "epoch": 0.055118717237126115, + "grad_norm": 3.9913508892059326, + "learning_rate": 9.983546102939539e-06, + "loss": 1.2512, + "step": 715 + }, + { + "epoch": 0.05519580635214308, + "grad_norm": 4.175078868865967, + "learning_rate": 9.983444748411399e-06, + "loss": 1.092, + "step": 716 + }, + { + "epoch": 0.055272895467160035, + "grad_norm": 4.498879909515381, + "learning_rate": 9.983343083191287e-06, + "loss": 1.2141, + "step": 717 + }, + { + "epoch": 0.055349984582177, + "grad_norm": 4.295947551727295, + "learning_rate": 9.983241107285544e-06, + "loss": 1.2172, + "step": 718 + }, + { + "epoch": 0.055427073697193956, + "grad_norm": 4.796656131744385, + "learning_rate": 9.983138820700526e-06, + "loss": 1.0474, + "step": 719 + }, + { + "epoch": 0.05550416281221091, + "grad_norm": 4.575197219848633, + "learning_rate": 9.983036223442612e-06, + "loss": 1.2008, + "step": 720 + }, + { + "epoch": 0.055581251927227876, + "grad_norm": 4.701834201812744, + "learning_rate": 9.982933315518197e-06, + "loss": 1.3672, + "step": 721 + }, + { + "epoch": 0.05565834104224483, + "grad_norm": 4.1152753829956055, + "learning_rate": 9.982830096933697e-06, + "loss": 1.1963, + "step": 722 + }, + { + "epoch": 0.055735430157261796, + "grad_norm": 3.986194372177124, + "learning_rate": 9.982726567695547e-06, + "loss": 1.2208, + "step": 723 + }, + { + "epoch": 0.05581251927227875, + "grad_norm": 4.519706726074219, + "learning_rate": 9.982622727810202e-06, + "loss": 1.2464, + "step": 724 + }, + { + "epoch": 0.05588960838729572, + "grad_norm": 4.6598992347717285, + "learning_rate": 9.982518577284135e-06, + "loss": 1.2623, + "step": 725 + }, + { + "epoch": 0.05596669750231267, + "grad_norm": 4.343133926391602, + "learning_rate": 9.98241411612384e-06, + "loss": 1.302, + "step": 726 + }, + { + "epoch": 0.05604378661732963, + "grad_norm": 4.320357322692871, + "learning_rate": 9.98230934433583e-06, + "loss": 1.2826, + "step": 727 + }, + { + "epoch": 0.056120875732346594, + "grad_norm": 3.914271116256714, + "learning_rate": 9.982204261926636e-06, + "loss": 1.0489, + "step": 728 + }, + { + "epoch": 0.05619796484736355, + "grad_norm": 4.293032169342041, + "learning_rate": 9.98209886890281e-06, + "loss": 1.0239, + "step": 729 + }, + { + "epoch": 0.056275053962380514, + "grad_norm": 4.971100807189941, + "learning_rate": 9.981993165270922e-06, + "loss": 1.3301, + "step": 730 + }, + { + "epoch": 0.05635214307739747, + "grad_norm": 4.393875598907471, + "learning_rate": 9.981887151037563e-06, + "loss": 1.1637, + "step": 731 + }, + { + "epoch": 0.056429232192414434, + "grad_norm": 4.458146095275879, + "learning_rate": 9.981780826209342e-06, + "loss": 1.2018, + "step": 732 + }, + { + "epoch": 0.05650632130743139, + "grad_norm": 4.671881198883057, + "learning_rate": 9.981674190792887e-06, + "loss": 1.2186, + "step": 733 + }, + { + "epoch": 0.05658341042244835, + "grad_norm": 4.1135029792785645, + "learning_rate": 9.98156724479485e-06, + "loss": 1.2273, + "step": 734 + }, + { + "epoch": 0.05666049953746531, + "grad_norm": 4.136716842651367, + "learning_rate": 9.981459988221891e-06, + "loss": 1.2256, + "step": 735 + }, + { + "epoch": 0.05673758865248227, + "grad_norm": 4.991126537322998, + "learning_rate": 9.981352421080704e-06, + "loss": 1.1702, + "step": 736 + }, + { + "epoch": 0.05681467776749923, + "grad_norm": 3.9331271648406982, + "learning_rate": 9.981244543377992e-06, + "loss": 1.0802, + "step": 737 + }, + { + "epoch": 0.05689176688251619, + "grad_norm": 4.251241207122803, + "learning_rate": 9.981136355120483e-06, + "loss": 1.2407, + "step": 738 + }, + { + "epoch": 0.056968855997533145, + "grad_norm": 4.525993347167969, + "learning_rate": 9.981027856314918e-06, + "loss": 1.1908, + "step": 739 + }, + { + "epoch": 0.05704594511255011, + "grad_norm": 4.329502105712891, + "learning_rate": 9.980919046968063e-06, + "loss": 1.3154, + "step": 740 + }, + { + "epoch": 0.057123034227567066, + "grad_norm": 4.49136209487915, + "learning_rate": 9.980809927086704e-06, + "loss": 1.3037, + "step": 741 + }, + { + "epoch": 0.05720012334258403, + "grad_norm": 4.0258941650390625, + "learning_rate": 9.980700496677643e-06, + "loss": 1.1127, + "step": 742 + }, + { + "epoch": 0.057277212457600986, + "grad_norm": 4.217193126678467, + "learning_rate": 9.980590755747698e-06, + "loss": 1.1301, + "step": 743 + }, + { + "epoch": 0.05735430157261795, + "grad_norm": 4.262319087982178, + "learning_rate": 9.980480704303719e-06, + "loss": 1.2313, + "step": 744 + }, + { + "epoch": 0.057431390687634906, + "grad_norm": 4.561182975769043, + "learning_rate": 9.980370342352562e-06, + "loss": 1.2237, + "step": 745 + }, + { + "epoch": 0.05750847980265186, + "grad_norm": 4.379204750061035, + "learning_rate": 9.980259669901105e-06, + "loss": 1.2885, + "step": 746 + }, + { + "epoch": 0.057585568917668827, + "grad_norm": 4.246951103210449, + "learning_rate": 9.980148686956252e-06, + "loss": 1.2487, + "step": 747 + }, + { + "epoch": 0.05766265803268578, + "grad_norm": 4.429943084716797, + "learning_rate": 9.980037393524925e-06, + "loss": 1.2902, + "step": 748 + }, + { + "epoch": 0.05773974714770275, + "grad_norm": 4.189937114715576, + "learning_rate": 9.979925789614053e-06, + "loss": 1.1855, + "step": 749 + }, + { + "epoch": 0.057816836262719704, + "grad_norm": 4.239128589630127, + "learning_rate": 9.979813875230604e-06, + "loss": 1.3016, + "step": 750 + }, + { + "epoch": 0.05789392537773666, + "grad_norm": 4.345081329345703, + "learning_rate": 9.97970165038155e-06, + "loss": 1.3184, + "step": 751 + }, + { + "epoch": 0.057971014492753624, + "grad_norm": 4.280864238739014, + "learning_rate": 9.979589115073888e-06, + "loss": 1.3088, + "step": 752 + }, + { + "epoch": 0.05804810360777058, + "grad_norm": 4.612633228302002, + "learning_rate": 9.979476269314635e-06, + "loss": 1.125, + "step": 753 + }, + { + "epoch": 0.058125192722787544, + "grad_norm": 4.349769115447998, + "learning_rate": 9.979363113110825e-06, + "loss": 1.2171, + "step": 754 + }, + { + "epoch": 0.0582022818378045, + "grad_norm": 4.421850681304932, + "learning_rate": 9.979249646469516e-06, + "loss": 1.239, + "step": 755 + }, + { + "epoch": 0.058279370952821465, + "grad_norm": 3.841928482055664, + "learning_rate": 9.979135869397776e-06, + "loss": 1.1576, + "step": 756 + }, + { + "epoch": 0.05835646006783842, + "grad_norm": 4.535758018493652, + "learning_rate": 9.979021781902705e-06, + "loss": 1.2456, + "step": 757 + }, + { + "epoch": 0.05843354918285538, + "grad_norm": 4.633460998535156, + "learning_rate": 9.978907383991412e-06, + "loss": 1.1629, + "step": 758 + }, + { + "epoch": 0.05851063829787234, + "grad_norm": 4.196053504943848, + "learning_rate": 9.978792675671031e-06, + "loss": 1.1678, + "step": 759 + }, + { + "epoch": 0.0585877274128893, + "grad_norm": 4.235719203948975, + "learning_rate": 9.978677656948712e-06, + "loss": 1.135, + "step": 760 + }, + { + "epoch": 0.05866481652790626, + "grad_norm": 3.9202029705047607, + "learning_rate": 9.978562327831626e-06, + "loss": 1.1177, + "step": 761 + }, + { + "epoch": 0.05874190564292322, + "grad_norm": 4.615420341491699, + "learning_rate": 9.978446688326964e-06, + "loss": 1.2012, + "step": 762 + }, + { + "epoch": 0.05881899475794018, + "grad_norm": 4.768507480621338, + "learning_rate": 9.978330738441935e-06, + "loss": 1.3307, + "step": 763 + }, + { + "epoch": 0.05889608387295714, + "grad_norm": 4.350339412689209, + "learning_rate": 9.978214478183767e-06, + "loss": 1.127, + "step": 764 + }, + { + "epoch": 0.058973172987974096, + "grad_norm": 4.4393134117126465, + "learning_rate": 9.97809790755971e-06, + "loss": 1.2217, + "step": 765 + }, + { + "epoch": 0.05905026210299106, + "grad_norm": 4.164086818695068, + "learning_rate": 9.97798102657703e-06, + "loss": 1.1628, + "step": 766 + }, + { + "epoch": 0.059127351218008016, + "grad_norm": 4.689718723297119, + "learning_rate": 9.977863835243016e-06, + "loss": 1.198, + "step": 767 + }, + { + "epoch": 0.05920444033302498, + "grad_norm": 4.145715236663818, + "learning_rate": 9.97774633356497e-06, + "loss": 1.2088, + "step": 768 + }, + { + "epoch": 0.059281529448041936, + "grad_norm": 4.122262477874756, + "learning_rate": 9.977628521550223e-06, + "loss": 1.1543, + "step": 769 + }, + { + "epoch": 0.05935861856305889, + "grad_norm": 4.569461822509766, + "learning_rate": 9.977510399206118e-06, + "loss": 1.1734, + "step": 770 + }, + { + "epoch": 0.05943570767807586, + "grad_norm": 4.216443061828613, + "learning_rate": 9.977391966540017e-06, + "loss": 1.2426, + "step": 771 + }, + { + "epoch": 0.05951279679309281, + "grad_norm": 4.43640661239624, + "learning_rate": 9.977273223559306e-06, + "loss": 1.1938, + "step": 772 + }, + { + "epoch": 0.05958988590810978, + "grad_norm": 3.9541783332824707, + "learning_rate": 9.977154170271389e-06, + "loss": 1.149, + "step": 773 + }, + { + "epoch": 0.059666975023126734, + "grad_norm": 4.332737445831299, + "learning_rate": 9.977034806683685e-06, + "loss": 1.2612, + "step": 774 + }, + { + "epoch": 0.0597440641381437, + "grad_norm": 4.155104637145996, + "learning_rate": 9.976915132803638e-06, + "loss": 1.0815, + "step": 775 + }, + { + "epoch": 0.059821153253160654, + "grad_norm": 4.293869495391846, + "learning_rate": 9.976795148638707e-06, + "loss": 1.2542, + "step": 776 + }, + { + "epoch": 0.05989824236817761, + "grad_norm": 4.076062202453613, + "learning_rate": 9.976674854196377e-06, + "loss": 1.1219, + "step": 777 + }, + { + "epoch": 0.059975331483194574, + "grad_norm": 4.0625152587890625, + "learning_rate": 9.976554249484144e-06, + "loss": 1.1951, + "step": 778 + }, + { + "epoch": 0.06005242059821153, + "grad_norm": 4.384305477142334, + "learning_rate": 9.976433334509525e-06, + "loss": 1.1677, + "step": 779 + }, + { + "epoch": 0.060129509713228495, + "grad_norm": 4.4182448387146, + "learning_rate": 9.976312109280063e-06, + "loss": 1.2527, + "step": 780 + }, + { + "epoch": 0.06020659882824545, + "grad_norm": 3.99910044670105, + "learning_rate": 9.976190573803314e-06, + "loss": 1.1124, + "step": 781 + }, + { + "epoch": 0.06028368794326241, + "grad_norm": 4.698251247406006, + "learning_rate": 9.976068728086857e-06, + "loss": 1.2172, + "step": 782 + }, + { + "epoch": 0.06036077705827937, + "grad_norm": 4.009255409240723, + "learning_rate": 9.975946572138284e-06, + "loss": 1.1481, + "step": 783 + }, + { + "epoch": 0.06043786617329633, + "grad_norm": 4.688887596130371, + "learning_rate": 9.975824105965215e-06, + "loss": 1.1883, + "step": 784 + }, + { + "epoch": 0.06051495528831329, + "grad_norm": 4.354454517364502, + "learning_rate": 9.975701329575283e-06, + "loss": 1.1541, + "step": 785 + }, + { + "epoch": 0.06059204440333025, + "grad_norm": 4.520771026611328, + "learning_rate": 9.97557824297614e-06, + "loss": 1.2542, + "step": 786 + }, + { + "epoch": 0.06066913351834721, + "grad_norm": 4.378393650054932, + "learning_rate": 9.975454846175466e-06, + "loss": 1.2588, + "step": 787 + }, + { + "epoch": 0.06074622263336417, + "grad_norm": 3.970188856124878, + "learning_rate": 9.975331139180951e-06, + "loss": 1.1532, + "step": 788 + }, + { + "epoch": 0.060823311748381126, + "grad_norm": 4.00541877746582, + "learning_rate": 9.975207122000305e-06, + "loss": 1.2816, + "step": 789 + }, + { + "epoch": 0.06090040086339809, + "grad_norm": 4.243439674377441, + "learning_rate": 9.975082794641264e-06, + "loss": 1.1577, + "step": 790 + }, + { + "epoch": 0.060977489978415046, + "grad_norm": 4.396401405334473, + "learning_rate": 9.974958157111578e-06, + "loss": 1.2457, + "step": 791 + }, + { + "epoch": 0.06105457909343201, + "grad_norm": 4.4037017822265625, + "learning_rate": 9.974833209419016e-06, + "loss": 1.2012, + "step": 792 + }, + { + "epoch": 0.06113166820844897, + "grad_norm": 4.3235907554626465, + "learning_rate": 9.974707951571369e-06, + "loss": 1.0912, + "step": 793 + }, + { + "epoch": 0.06120875732346592, + "grad_norm": 4.980129718780518, + "learning_rate": 9.974582383576446e-06, + "loss": 1.1828, + "step": 794 + }, + { + "epoch": 0.06128584643848289, + "grad_norm": 4.494161128997803, + "learning_rate": 9.974456505442073e-06, + "loss": 1.3294, + "step": 795 + }, + { + "epoch": 0.061362935553499844, + "grad_norm": 3.907623767852783, + "learning_rate": 9.974330317176103e-06, + "loss": 1.1584, + "step": 796 + }, + { + "epoch": 0.06144002466851681, + "grad_norm": 4.022689342498779, + "learning_rate": 9.9742038187864e-06, + "loss": 1.1379, + "step": 797 + }, + { + "epoch": 0.061517113783533764, + "grad_norm": 4.5095977783203125, + "learning_rate": 9.974077010280851e-06, + "loss": 1.2845, + "step": 798 + }, + { + "epoch": 0.06159420289855073, + "grad_norm": 4.331615447998047, + "learning_rate": 9.97394989166736e-06, + "loss": 1.1422, + "step": 799 + }, + { + "epoch": 0.061671292013567684, + "grad_norm": 4.0961527824401855, + "learning_rate": 9.973822462953856e-06, + "loss": 1.248, + "step": 800 + }, + { + "epoch": 0.06174838112858464, + "grad_norm": 4.432220935821533, + "learning_rate": 9.973694724148281e-06, + "loss": 1.1686, + "step": 801 + }, + { + "epoch": 0.061825470243601605, + "grad_norm": 4.118536949157715, + "learning_rate": 9.973566675258598e-06, + "loss": 1.1243, + "step": 802 + }, + { + "epoch": 0.06190255935861856, + "grad_norm": 3.9236130714416504, + "learning_rate": 9.973438316292794e-06, + "loss": 1.1906, + "step": 803 + }, + { + "epoch": 0.061979648473635525, + "grad_norm": 4.201266288757324, + "learning_rate": 9.97330964725887e-06, + "loss": 1.109, + "step": 804 + }, + { + "epoch": 0.06205673758865248, + "grad_norm": 4.015562534332275, + "learning_rate": 9.973180668164844e-06, + "loss": 1.1616, + "step": 805 + }, + { + "epoch": 0.062133826703669445, + "grad_norm": 5.223426818847656, + "learning_rate": 9.97305137901876e-06, + "loss": 1.382, + "step": 806 + }, + { + "epoch": 0.0622109158186864, + "grad_norm": 4.341038227081299, + "learning_rate": 9.972921779828679e-06, + "loss": 1.1932, + "step": 807 + }, + { + "epoch": 0.06228800493370336, + "grad_norm": 4.124241352081299, + "learning_rate": 9.972791870602682e-06, + "loss": 1.2678, + "step": 808 + }, + { + "epoch": 0.06236509404872032, + "grad_norm": 4.108554363250732, + "learning_rate": 9.972661651348865e-06, + "loss": 1.2885, + "step": 809 + }, + { + "epoch": 0.06244218316373728, + "grad_norm": 4.48992919921875, + "learning_rate": 9.972531122075349e-06, + "loss": 1.2948, + "step": 810 + }, + { + "epoch": 0.06251927227875424, + "grad_norm": 4.17953634262085, + "learning_rate": 9.97240028279027e-06, + "loss": 1.0968, + "step": 811 + }, + { + "epoch": 0.0625963613937712, + "grad_norm": 4.093837738037109, + "learning_rate": 9.972269133501787e-06, + "loss": 1.1572, + "step": 812 + }, + { + "epoch": 0.06267345050878816, + "grad_norm": 4.524899482727051, + "learning_rate": 9.972137674218077e-06, + "loss": 1.2688, + "step": 813 + }, + { + "epoch": 0.06275053962380511, + "grad_norm": 4.406780242919922, + "learning_rate": 9.972005904947332e-06, + "loss": 1.3075, + "step": 814 + }, + { + "epoch": 0.06282762873882208, + "grad_norm": 4.174956798553467, + "learning_rate": 9.97187382569777e-06, + "loss": 1.1822, + "step": 815 + }, + { + "epoch": 0.06290471785383904, + "grad_norm": 3.8772571086883545, + "learning_rate": 9.971741436477625e-06, + "loss": 1.1562, + "step": 816 + }, + { + "epoch": 0.062981806968856, + "grad_norm": 3.9671881198883057, + "learning_rate": 9.971608737295151e-06, + "loss": 1.2294, + "step": 817 + }, + { + "epoch": 0.06305889608387295, + "grad_norm": 4.8057098388671875, + "learning_rate": 9.971475728158622e-06, + "loss": 1.2987, + "step": 818 + }, + { + "epoch": 0.06313598519888991, + "grad_norm": 4.444809913635254, + "learning_rate": 9.971342409076328e-06, + "loss": 1.2099, + "step": 819 + }, + { + "epoch": 0.06321307431390688, + "grad_norm": 4.493791103363037, + "learning_rate": 9.971208780056582e-06, + "loss": 1.1324, + "step": 820 + }, + { + "epoch": 0.06329016342892384, + "grad_norm": 4.225183963775635, + "learning_rate": 9.971074841107715e-06, + "loss": 1.1474, + "step": 821 + }, + { + "epoch": 0.0633672525439408, + "grad_norm": 4.375453948974609, + "learning_rate": 9.970940592238077e-06, + "loss": 1.0902, + "step": 822 + }, + { + "epoch": 0.06344434165895775, + "grad_norm": 4.046267509460449, + "learning_rate": 9.97080603345604e-06, + "loss": 1.3007, + "step": 823 + }, + { + "epoch": 0.06352143077397472, + "grad_norm": 4.696292400360107, + "learning_rate": 9.970671164769989e-06, + "loss": 1.0967, + "step": 824 + }, + { + "epoch": 0.06359851988899168, + "grad_norm": 3.9629440307617188, + "learning_rate": 9.970535986188337e-06, + "loss": 1.0908, + "step": 825 + }, + { + "epoch": 0.06367560900400863, + "grad_norm": 4.310845851898193, + "learning_rate": 9.970400497719508e-06, + "loss": 1.1777, + "step": 826 + }, + { + "epoch": 0.06375269811902559, + "grad_norm": 4.32405948638916, + "learning_rate": 9.97026469937195e-06, + "loss": 1.2285, + "step": 827 + }, + { + "epoch": 0.06382978723404255, + "grad_norm": 4.7154130935668945, + "learning_rate": 9.970128591154131e-06, + "loss": 1.3087, + "step": 828 + }, + { + "epoch": 0.06390687634905952, + "grad_norm": 4.456521034240723, + "learning_rate": 9.969992173074534e-06, + "loss": 1.2192, + "step": 829 + }, + { + "epoch": 0.06398396546407648, + "grad_norm": 3.9812963008880615, + "learning_rate": 9.969855445141666e-06, + "loss": 1.1708, + "step": 830 + }, + { + "epoch": 0.06406105457909343, + "grad_norm": 4.128241539001465, + "learning_rate": 9.969718407364051e-06, + "loss": 1.0954, + "step": 831 + }, + { + "epoch": 0.06413814369411039, + "grad_norm": 4.4097089767456055, + "learning_rate": 9.969581059750231e-06, + "loss": 1.1696, + "step": 832 + }, + { + "epoch": 0.06421523280912735, + "grad_norm": 4.542466163635254, + "learning_rate": 9.96944340230877e-06, + "loss": 1.3143, + "step": 833 + }, + { + "epoch": 0.06429232192414432, + "grad_norm": 3.86757755279541, + "learning_rate": 9.969305435048251e-06, + "loss": 1.2229, + "step": 834 + }, + { + "epoch": 0.06436941103916127, + "grad_norm": 4.428218841552734, + "learning_rate": 9.969167157977275e-06, + "loss": 1.1942, + "step": 835 + }, + { + "epoch": 0.06444650015417823, + "grad_norm": 4.047502040863037, + "learning_rate": 9.969028571104462e-06, + "loss": 1.1897, + "step": 836 + }, + { + "epoch": 0.06452358926919519, + "grad_norm": 4.053440570831299, + "learning_rate": 9.968889674438453e-06, + "loss": 1.1565, + "step": 837 + }, + { + "epoch": 0.06460067838421214, + "grad_norm": 4.556980609893799, + "learning_rate": 9.968750467987907e-06, + "loss": 1.2299, + "step": 838 + }, + { + "epoch": 0.06467776749922911, + "grad_norm": 4.244764804840088, + "learning_rate": 9.968610951761504e-06, + "loss": 1.2834, + "step": 839 + }, + { + "epoch": 0.06475485661424607, + "grad_norm": 4.483509540557861, + "learning_rate": 9.968471125767942e-06, + "loss": 1.2776, + "step": 840 + }, + { + "epoch": 0.06483194572926303, + "grad_norm": 4.05586051940918, + "learning_rate": 9.968330990015935e-06, + "loss": 1.3225, + "step": 841 + }, + { + "epoch": 0.06490903484427998, + "grad_norm": 4.234732627868652, + "learning_rate": 9.968190544514225e-06, + "loss": 1.3191, + "step": 842 + }, + { + "epoch": 0.06498612395929695, + "grad_norm": 3.936809778213501, + "learning_rate": 9.968049789271564e-06, + "loss": 1.1848, + "step": 843 + }, + { + "epoch": 0.06506321307431391, + "grad_norm": 4.090641498565674, + "learning_rate": 9.96790872429673e-06, + "loss": 1.1973, + "step": 844 + }, + { + "epoch": 0.06514030218933087, + "grad_norm": 4.085000038146973, + "learning_rate": 9.967767349598517e-06, + "loss": 1.1307, + "step": 845 + }, + { + "epoch": 0.06521739130434782, + "grad_norm": 4.048746585845947, + "learning_rate": 9.967625665185737e-06, + "loss": 1.2981, + "step": 846 + }, + { + "epoch": 0.06529448041936478, + "grad_norm": 4.098531246185303, + "learning_rate": 9.967483671067224e-06, + "loss": 1.2129, + "step": 847 + }, + { + "epoch": 0.06537156953438175, + "grad_norm": 4.320443630218506, + "learning_rate": 9.967341367251833e-06, + "loss": 1.1298, + "step": 848 + }, + { + "epoch": 0.06544865864939871, + "grad_norm": 4.260039806365967, + "learning_rate": 9.967198753748432e-06, + "loss": 1.2355, + "step": 849 + }, + { + "epoch": 0.06552574776441566, + "grad_norm": 4.163205623626709, + "learning_rate": 9.967055830565917e-06, + "loss": 1.2541, + "step": 850 + }, + { + "epoch": 0.06560283687943262, + "grad_norm": 3.9784324169158936, + "learning_rate": 9.966912597713196e-06, + "loss": 1.1646, + "step": 851 + }, + { + "epoch": 0.06567992599444958, + "grad_norm": 4.5116682052612305, + "learning_rate": 9.966769055199197e-06, + "loss": 1.2124, + "step": 852 + }, + { + "epoch": 0.06575701510946655, + "grad_norm": 3.7753348350524902, + "learning_rate": 9.966625203032871e-06, + "loss": 1.1988, + "step": 853 + }, + { + "epoch": 0.0658341042244835, + "grad_norm": 4.4301323890686035, + "learning_rate": 9.966481041223188e-06, + "loss": 1.146, + "step": 854 + }, + { + "epoch": 0.06591119333950046, + "grad_norm": 4.574663162231445, + "learning_rate": 9.966336569779133e-06, + "loss": 1.1577, + "step": 855 + }, + { + "epoch": 0.06598828245451742, + "grad_norm": 3.872276544570923, + "learning_rate": 9.966191788709716e-06, + "loss": 1.0697, + "step": 856 + }, + { + "epoch": 0.06606537156953438, + "grad_norm": 3.989187240600586, + "learning_rate": 9.96604669802396e-06, + "loss": 1.1815, + "step": 857 + }, + { + "epoch": 0.06614246068455135, + "grad_norm": 4.288700580596924, + "learning_rate": 9.965901297730914e-06, + "loss": 1.0518, + "step": 858 + }, + { + "epoch": 0.0662195497995683, + "grad_norm": 4.061006546020508, + "learning_rate": 9.965755587839638e-06, + "loss": 1.2016, + "step": 859 + }, + { + "epoch": 0.06629663891458526, + "grad_norm": 4.13923454284668, + "learning_rate": 9.96560956835922e-06, + "loss": 1.1651, + "step": 860 + }, + { + "epoch": 0.06637372802960222, + "grad_norm": 4.172252655029297, + "learning_rate": 9.965463239298764e-06, + "loss": 1.1611, + "step": 861 + }, + { + "epoch": 0.06645081714461917, + "grad_norm": 4.616154670715332, + "learning_rate": 9.965316600667394e-06, + "loss": 1.2395, + "step": 862 + }, + { + "epoch": 0.06652790625963614, + "grad_norm": 4.157519340515137, + "learning_rate": 9.965169652474247e-06, + "loss": 1.271, + "step": 863 + }, + { + "epoch": 0.0666049953746531, + "grad_norm": 4.29591178894043, + "learning_rate": 9.965022394728487e-06, + "loss": 1.252, + "step": 864 + }, + { + "epoch": 0.06668208448967006, + "grad_norm": 4.058745861053467, + "learning_rate": 9.964874827439297e-06, + "loss": 1.2369, + "step": 865 + }, + { + "epoch": 0.06675917360468701, + "grad_norm": 4.279184818267822, + "learning_rate": 9.964726950615875e-06, + "loss": 1.3584, + "step": 866 + }, + { + "epoch": 0.06683626271970398, + "grad_norm": 4.2356672286987305, + "learning_rate": 9.964578764267441e-06, + "loss": 1.1432, + "step": 867 + }, + { + "epoch": 0.06691335183472094, + "grad_norm": 5.137538909912109, + "learning_rate": 9.96443026840323e-06, + "loss": 1.1755, + "step": 868 + }, + { + "epoch": 0.0669904409497379, + "grad_norm": 4.083669185638428, + "learning_rate": 9.964281463032507e-06, + "loss": 1.285, + "step": 869 + }, + { + "epoch": 0.06706753006475485, + "grad_norm": 4.169551372528076, + "learning_rate": 9.964132348164544e-06, + "loss": 1.1206, + "step": 870 + }, + { + "epoch": 0.06714461917977181, + "grad_norm": 4.000573635101318, + "learning_rate": 9.96398292380864e-06, + "loss": 1.1801, + "step": 871 + }, + { + "epoch": 0.06722170829478878, + "grad_norm": 4.286387920379639, + "learning_rate": 9.96383318997411e-06, + "loss": 1.0842, + "step": 872 + }, + { + "epoch": 0.06729879740980574, + "grad_norm": 3.734882354736328, + "learning_rate": 9.963683146670286e-06, + "loss": 1.1005, + "step": 873 + }, + { + "epoch": 0.0673758865248227, + "grad_norm": 4.232124328613281, + "learning_rate": 9.963532793906529e-06, + "loss": 1.2437, + "step": 874 + }, + { + "epoch": 0.06745297563983965, + "grad_norm": 4.824527740478516, + "learning_rate": 9.963382131692208e-06, + "loss": 1.2967, + "step": 875 + }, + { + "epoch": 0.06753006475485661, + "grad_norm": 4.362327575683594, + "learning_rate": 9.963231160036716e-06, + "loss": 1.23, + "step": 876 + }, + { + "epoch": 0.06760715386987358, + "grad_norm": 4.00999641418457, + "learning_rate": 9.963079878949466e-06, + "loss": 1.2246, + "step": 877 + }, + { + "epoch": 0.06768424298489054, + "grad_norm": 4.2067131996154785, + "learning_rate": 9.962928288439891e-06, + "loss": 1.2255, + "step": 878 + }, + { + "epoch": 0.06776133209990749, + "grad_norm": 4.093700408935547, + "learning_rate": 9.962776388517441e-06, + "loss": 1.199, + "step": 879 + }, + { + "epoch": 0.06783842121492445, + "grad_norm": 4.058950424194336, + "learning_rate": 9.962624179191586e-06, + "loss": 1.1883, + "step": 880 + }, + { + "epoch": 0.0679155103299414, + "grad_norm": 4.307193756103516, + "learning_rate": 9.962471660471815e-06, + "loss": 1.1255, + "step": 881 + }, + { + "epoch": 0.06799259944495838, + "grad_norm": 3.914050579071045, + "learning_rate": 9.96231883236764e-06, + "loss": 1.1739, + "step": 882 + }, + { + "epoch": 0.06806968855997533, + "grad_norm": 3.9826557636260986, + "learning_rate": 9.962165694888583e-06, + "loss": 1.1386, + "step": 883 + }, + { + "epoch": 0.06814677767499229, + "grad_norm": 3.8433432579040527, + "learning_rate": 9.962012248044195e-06, + "loss": 1.0846, + "step": 884 + }, + { + "epoch": 0.06822386679000925, + "grad_norm": 3.975701093673706, + "learning_rate": 9.961858491844044e-06, + "loss": 1.1565, + "step": 885 + }, + { + "epoch": 0.06830095590502622, + "grad_norm": 4.125971794128418, + "learning_rate": 9.961704426297712e-06, + "loss": 1.1605, + "step": 886 + }, + { + "epoch": 0.06837804502004317, + "grad_norm": 4.062132358551025, + "learning_rate": 9.961550051414808e-06, + "loss": 1.2098, + "step": 887 + }, + { + "epoch": 0.06845513413506013, + "grad_norm": 4.446691036224365, + "learning_rate": 9.961395367204953e-06, + "loss": 1.1216, + "step": 888 + }, + { + "epoch": 0.06853222325007709, + "grad_norm": 4.357199668884277, + "learning_rate": 9.961240373677793e-06, + "loss": 1.0838, + "step": 889 + }, + { + "epoch": 0.06860931236509404, + "grad_norm": 4.313502311706543, + "learning_rate": 9.961085070842992e-06, + "loss": 1.1937, + "step": 890 + }, + { + "epoch": 0.06868640148011101, + "grad_norm": 3.93827223777771, + "learning_rate": 9.960929458710231e-06, + "loss": 1.1371, + "step": 891 + }, + { + "epoch": 0.06876349059512797, + "grad_norm": 3.7555925846099854, + "learning_rate": 9.960773537289213e-06, + "loss": 1.1783, + "step": 892 + }, + { + "epoch": 0.06884057971014493, + "grad_norm": 4.327067852020264, + "learning_rate": 9.960617306589654e-06, + "loss": 1.2371, + "step": 893 + }, + { + "epoch": 0.06891766882516188, + "grad_norm": 3.748533248901367, + "learning_rate": 9.960460766621299e-06, + "loss": 1.1646, + "step": 894 + }, + { + "epoch": 0.06899475794017884, + "grad_norm": 4.586325168609619, + "learning_rate": 9.960303917393906e-06, + "loss": 1.2139, + "step": 895 + }, + { + "epoch": 0.06907184705519581, + "grad_norm": 4.419130802154541, + "learning_rate": 9.960146758917254e-06, + "loss": 1.4065, + "step": 896 + }, + { + "epoch": 0.06914893617021277, + "grad_norm": 4.031462669372559, + "learning_rate": 9.959989291201141e-06, + "loss": 1.2925, + "step": 897 + }, + { + "epoch": 0.06922602528522973, + "grad_norm": 4.59142541885376, + "learning_rate": 9.959831514255383e-06, + "loss": 1.2006, + "step": 898 + }, + { + "epoch": 0.06930311440024668, + "grad_norm": 3.835858106613159, + "learning_rate": 9.95967342808982e-06, + "loss": 1.2036, + "step": 899 + }, + { + "epoch": 0.06938020351526364, + "grad_norm": 4.229325771331787, + "learning_rate": 9.959515032714304e-06, + "loss": 1.1446, + "step": 900 + }, + { + "epoch": 0.06945729263028061, + "grad_norm": 4.047120094299316, + "learning_rate": 9.959356328138712e-06, + "loss": 1.1789, + "step": 901 + }, + { + "epoch": 0.06953438174529757, + "grad_norm": 3.6105780601501465, + "learning_rate": 9.959197314372937e-06, + "loss": 1.192, + "step": 902 + }, + { + "epoch": 0.06961147086031452, + "grad_norm": 4.577813625335693, + "learning_rate": 9.959037991426896e-06, + "loss": 1.2333, + "step": 903 + }, + { + "epoch": 0.06968855997533148, + "grad_norm": 3.890207052230835, + "learning_rate": 9.958878359310518e-06, + "loss": 1.1167, + "step": 904 + }, + { + "epoch": 0.06976564909034844, + "grad_norm": 4.432231426239014, + "learning_rate": 9.958718418033757e-06, + "loss": 1.196, + "step": 905 + }, + { + "epoch": 0.0698427382053654, + "grad_norm": 3.846874475479126, + "learning_rate": 9.958558167606585e-06, + "loss": 1.1761, + "step": 906 + }, + { + "epoch": 0.06991982732038236, + "grad_norm": 4.190489292144775, + "learning_rate": 9.958397608038994e-06, + "loss": 1.129, + "step": 907 + }, + { + "epoch": 0.06999691643539932, + "grad_norm": 3.783397912979126, + "learning_rate": 9.95823673934099e-06, + "loss": 1.2228, + "step": 908 + }, + { + "epoch": 0.07007400555041628, + "grad_norm": 3.776473045349121, + "learning_rate": 9.958075561522605e-06, + "loss": 1.1348, + "step": 909 + }, + { + "epoch": 0.07015109466543325, + "grad_norm": 4.117181777954102, + "learning_rate": 9.957914074593889e-06, + "loss": 1.1504, + "step": 910 + }, + { + "epoch": 0.0702281837804502, + "grad_norm": 4.3804545402526855, + "learning_rate": 9.957752278564905e-06, + "loss": 1.2356, + "step": 911 + }, + { + "epoch": 0.07030527289546716, + "grad_norm": 4.283797264099121, + "learning_rate": 9.957590173445746e-06, + "loss": 1.2674, + "step": 912 + }, + { + "epoch": 0.07038236201048412, + "grad_norm": 4.3605852127075195, + "learning_rate": 9.957427759246515e-06, + "loss": 1.2091, + "step": 913 + }, + { + "epoch": 0.07045945112550107, + "grad_norm": 4.470867156982422, + "learning_rate": 9.957265035977338e-06, + "loss": 1.2004, + "step": 914 + }, + { + "epoch": 0.07053654024051804, + "grad_norm": 4.192192077636719, + "learning_rate": 9.95710200364836e-06, + "loss": 1.2206, + "step": 915 + }, + { + "epoch": 0.070613629355535, + "grad_norm": 4.35572624206543, + "learning_rate": 9.956938662269745e-06, + "loss": 1.2407, + "step": 916 + }, + { + "epoch": 0.07069071847055196, + "grad_norm": 3.7318854331970215, + "learning_rate": 9.956775011851679e-06, + "loss": 1.1075, + "step": 917 + }, + { + "epoch": 0.07076780758556891, + "grad_norm": 4.309021949768066, + "learning_rate": 9.956611052404362e-06, + "loss": 1.1874, + "step": 918 + }, + { + "epoch": 0.07084489670058587, + "grad_norm": 4.061330318450928, + "learning_rate": 9.956446783938016e-06, + "loss": 1.2302, + "step": 919 + }, + { + "epoch": 0.07092198581560284, + "grad_norm": 4.194582462310791, + "learning_rate": 9.956282206462886e-06, + "loss": 1.2344, + "step": 920 + }, + { + "epoch": 0.0709990749306198, + "grad_norm": 3.940514326095581, + "learning_rate": 9.956117319989226e-06, + "loss": 1.1443, + "step": 921 + }, + { + "epoch": 0.07107616404563676, + "grad_norm": 4.2155046463012695, + "learning_rate": 9.95595212452732e-06, + "loss": 1.1655, + "step": 922 + }, + { + "epoch": 0.07115325316065371, + "grad_norm": 4.344760417938232, + "learning_rate": 9.95578662008747e-06, + "loss": 1.3106, + "step": 923 + }, + { + "epoch": 0.07123034227567067, + "grad_norm": 3.8140082359313965, + "learning_rate": 9.955620806679987e-06, + "loss": 1.188, + "step": 924 + }, + { + "epoch": 0.07130743139068764, + "grad_norm": 3.9605846405029297, + "learning_rate": 9.955454684315215e-06, + "loss": 1.21, + "step": 925 + }, + { + "epoch": 0.0713845205057046, + "grad_norm": 4.253360271453857, + "learning_rate": 9.955288253003508e-06, + "loss": 1.283, + "step": 926 + }, + { + "epoch": 0.07146160962072155, + "grad_norm": 4.852066516876221, + "learning_rate": 9.955121512755242e-06, + "loss": 1.2646, + "step": 927 + }, + { + "epoch": 0.07153869873573851, + "grad_norm": 4.225571155548096, + "learning_rate": 9.954954463580813e-06, + "loss": 1.1779, + "step": 928 + }, + { + "epoch": 0.07161578785075548, + "grad_norm": 4.433430194854736, + "learning_rate": 9.954787105490635e-06, + "loss": 1.3087, + "step": 929 + }, + { + "epoch": 0.07169287696577244, + "grad_norm": 4.311334609985352, + "learning_rate": 9.954619438495142e-06, + "loss": 1.1685, + "step": 930 + }, + { + "epoch": 0.0717699660807894, + "grad_norm": 4.312464237213135, + "learning_rate": 9.954451462604788e-06, + "loss": 1.2989, + "step": 931 + }, + { + "epoch": 0.07184705519580635, + "grad_norm": 4.1521477699279785, + "learning_rate": 9.954283177830047e-06, + "loss": 1.2642, + "step": 932 + }, + { + "epoch": 0.07192414431082331, + "grad_norm": 4.0158820152282715, + "learning_rate": 9.954114584181407e-06, + "loss": 1.1011, + "step": 933 + }, + { + "epoch": 0.07200123342584028, + "grad_norm": 3.9613866806030273, + "learning_rate": 9.953945681669381e-06, + "loss": 1.174, + "step": 934 + }, + { + "epoch": 0.07207832254085723, + "grad_norm": 4.178623199462891, + "learning_rate": 9.953776470304499e-06, + "loss": 1.174, + "step": 935 + }, + { + "epoch": 0.07215541165587419, + "grad_norm": 4.1417131423950195, + "learning_rate": 9.95360695009731e-06, + "loss": 1.2549, + "step": 936 + }, + { + "epoch": 0.07223250077089115, + "grad_norm": 4.326009750366211, + "learning_rate": 9.953437121058384e-06, + "loss": 1.1613, + "step": 937 + }, + { + "epoch": 0.0723095898859081, + "grad_norm": 3.9384677410125732, + "learning_rate": 9.953266983198307e-06, + "loss": 1.1953, + "step": 938 + }, + { + "epoch": 0.07238667900092507, + "grad_norm": 4.292700290679932, + "learning_rate": 9.953096536527688e-06, + "loss": 1.2971, + "step": 939 + }, + { + "epoch": 0.07246376811594203, + "grad_norm": 3.9370415210723877, + "learning_rate": 9.952925781057152e-06, + "loss": 1.2132, + "step": 940 + }, + { + "epoch": 0.07254085723095899, + "grad_norm": 4.184019088745117, + "learning_rate": 9.952754716797345e-06, + "loss": 1.1628, + "step": 941 + }, + { + "epoch": 0.07261794634597594, + "grad_norm": 3.967825412750244, + "learning_rate": 9.952583343758934e-06, + "loss": 1.242, + "step": 942 + }, + { + "epoch": 0.0726950354609929, + "grad_norm": 4.628615379333496, + "learning_rate": 9.9524116619526e-06, + "loss": 1.2041, + "step": 943 + }, + { + "epoch": 0.07277212457600987, + "grad_norm": 3.8717031478881836, + "learning_rate": 9.952239671389049e-06, + "loss": 1.0256, + "step": 944 + }, + { + "epoch": 0.07284921369102683, + "grad_norm": 3.9724347591400146, + "learning_rate": 9.952067372079003e-06, + "loss": 1.1419, + "step": 945 + }, + { + "epoch": 0.07292630280604379, + "grad_norm": 4.037529468536377, + "learning_rate": 9.951894764033202e-06, + "loss": 1.2491, + "step": 946 + }, + { + "epoch": 0.07300339192106074, + "grad_norm": 4.202883720397949, + "learning_rate": 9.951721847262413e-06, + "loss": 1.2303, + "step": 947 + }, + { + "epoch": 0.0730804810360777, + "grad_norm": 4.007006645202637, + "learning_rate": 9.951548621777409e-06, + "loss": 1.1551, + "step": 948 + }, + { + "epoch": 0.07315757015109467, + "grad_norm": 4.146481513977051, + "learning_rate": 9.951375087588993e-06, + "loss": 1.2194, + "step": 949 + }, + { + "epoch": 0.07323465926611163, + "grad_norm": 4.004715919494629, + "learning_rate": 9.951201244707986e-06, + "loss": 1.1161, + "step": 950 + }, + { + "epoch": 0.07331174838112858, + "grad_norm": 4.251381874084473, + "learning_rate": 9.951027093145222e-06, + "loss": 1.2399, + "step": 951 + }, + { + "epoch": 0.07338883749614554, + "grad_norm": 4.143471717834473, + "learning_rate": 9.950852632911563e-06, + "loss": 1.1627, + "step": 952 + }, + { + "epoch": 0.07346592661116251, + "grad_norm": 3.974905252456665, + "learning_rate": 9.950677864017882e-06, + "loss": 1.1123, + "step": 953 + }, + { + "epoch": 0.07354301572617947, + "grad_norm": 4.213186740875244, + "learning_rate": 9.950502786475078e-06, + "loss": 1.224, + "step": 954 + }, + { + "epoch": 0.07362010484119642, + "grad_norm": 4.501376628875732, + "learning_rate": 9.950327400294063e-06, + "loss": 1.254, + "step": 955 + }, + { + "epoch": 0.07369719395621338, + "grad_norm": 3.8753960132598877, + "learning_rate": 9.950151705485774e-06, + "loss": 1.1435, + "step": 956 + }, + { + "epoch": 0.07377428307123034, + "grad_norm": 4.606151580810547, + "learning_rate": 9.949975702061162e-06, + "loss": 1.2086, + "step": 957 + }, + { + "epoch": 0.07385137218624731, + "grad_norm": 3.9693284034729004, + "learning_rate": 9.949799390031203e-06, + "loss": 1.2506, + "step": 958 + }, + { + "epoch": 0.07392846130126426, + "grad_norm": 4.064651966094971, + "learning_rate": 9.949622769406888e-06, + "loss": 1.1616, + "step": 959 + }, + { + "epoch": 0.07400555041628122, + "grad_norm": 4.274019241333008, + "learning_rate": 9.949445840199227e-06, + "loss": 1.2061, + "step": 960 + }, + { + "epoch": 0.07408263953129818, + "grad_norm": 4.298183917999268, + "learning_rate": 9.949268602419253e-06, + "loss": 1.2899, + "step": 961 + }, + { + "epoch": 0.07415972864631513, + "grad_norm": 4.174452781677246, + "learning_rate": 9.949091056078012e-06, + "loss": 1.2072, + "step": 962 + }, + { + "epoch": 0.0742368177613321, + "grad_norm": 4.208117485046387, + "learning_rate": 9.948913201186579e-06, + "loss": 1.2707, + "step": 963 + }, + { + "epoch": 0.07431390687634906, + "grad_norm": 4.637174606323242, + "learning_rate": 9.948735037756037e-06, + "loss": 1.2792, + "step": 964 + }, + { + "epoch": 0.07439099599136602, + "grad_norm": 3.6787731647491455, + "learning_rate": 9.948556565797497e-06, + "loss": 1.1813, + "step": 965 + }, + { + "epoch": 0.07446808510638298, + "grad_norm": 4.200822830200195, + "learning_rate": 9.948377785322082e-06, + "loss": 1.2527, + "step": 966 + }, + { + "epoch": 0.07454517422139993, + "grad_norm": 3.7479918003082275, + "learning_rate": 9.948198696340943e-06, + "loss": 1.0949, + "step": 967 + }, + { + "epoch": 0.0746222633364169, + "grad_norm": 3.8182127475738525, + "learning_rate": 9.94801929886524e-06, + "loss": 1.1945, + "step": 968 + }, + { + "epoch": 0.07469935245143386, + "grad_norm": 4.381171703338623, + "learning_rate": 9.947839592906163e-06, + "loss": 1.302, + "step": 969 + }, + { + "epoch": 0.07477644156645082, + "grad_norm": 4.537683963775635, + "learning_rate": 9.947659578474911e-06, + "loss": 1.3543, + "step": 970 + }, + { + "epoch": 0.07485353068146777, + "grad_norm": 4.541914939880371, + "learning_rate": 9.94747925558271e-06, + "loss": 1.213, + "step": 971 + }, + { + "epoch": 0.07493061979648474, + "grad_norm": 4.564115524291992, + "learning_rate": 9.9472986242408e-06, + "loss": 1.2574, + "step": 972 + }, + { + "epoch": 0.0750077089115017, + "grad_norm": 4.06321907043457, + "learning_rate": 9.947117684460443e-06, + "loss": 1.227, + "step": 973 + }, + { + "epoch": 0.07508479802651866, + "grad_norm": 4.070244312286377, + "learning_rate": 9.946936436252923e-06, + "loss": 1.2062, + "step": 974 + }, + { + "epoch": 0.07516188714153561, + "grad_norm": 3.8233132362365723, + "learning_rate": 9.946754879629535e-06, + "loss": 1.1082, + "step": 975 + }, + { + "epoch": 0.07523897625655257, + "grad_norm": 4.254786014556885, + "learning_rate": 9.9465730146016e-06, + "loss": 1.2617, + "step": 976 + }, + { + "epoch": 0.07531606537156954, + "grad_norm": 5.448151111602783, + "learning_rate": 9.946390841180457e-06, + "loss": 1.3154, + "step": 977 + }, + { + "epoch": 0.0753931544865865, + "grad_norm": 3.762033462524414, + "learning_rate": 9.946208359377463e-06, + "loss": 1.1022, + "step": 978 + }, + { + "epoch": 0.07547024360160345, + "grad_norm": 4.151965618133545, + "learning_rate": 9.946025569203994e-06, + "loss": 1.1938, + "step": 979 + }, + { + "epoch": 0.07554733271662041, + "grad_norm": 3.941340684890747, + "learning_rate": 9.945842470671447e-06, + "loss": 1.2629, + "step": 980 + }, + { + "epoch": 0.07562442183163737, + "grad_norm": 3.871173620223999, + "learning_rate": 9.945659063791239e-06, + "loss": 1.0924, + "step": 981 + }, + { + "epoch": 0.07570151094665434, + "grad_norm": 3.7768266201019287, + "learning_rate": 9.9454753485748e-06, + "loss": 1.0993, + "step": 982 + }, + { + "epoch": 0.0757786000616713, + "grad_norm": 3.877859354019165, + "learning_rate": 9.945291325033587e-06, + "loss": 1.1623, + "step": 983 + }, + { + "epoch": 0.07585568917668825, + "grad_norm": 3.9639294147491455, + "learning_rate": 9.945106993179074e-06, + "loss": 1.0856, + "step": 984 + }, + { + "epoch": 0.07593277829170521, + "grad_norm": 4.09315824508667, + "learning_rate": 9.94492235302275e-06, + "loss": 1.2126, + "step": 985 + }, + { + "epoch": 0.07600986740672216, + "grad_norm": 4.234272480010986, + "learning_rate": 9.944737404576129e-06, + "loss": 1.1984, + "step": 986 + }, + { + "epoch": 0.07608695652173914, + "grad_norm": 3.7921226024627686, + "learning_rate": 9.94455214785074e-06, + "loss": 1.158, + "step": 987 + }, + { + "epoch": 0.07616404563675609, + "grad_norm": 4.407901763916016, + "learning_rate": 9.944366582858131e-06, + "loss": 1.1847, + "step": 988 + }, + { + "epoch": 0.07624113475177305, + "grad_norm": 4.234422206878662, + "learning_rate": 9.944180709609874e-06, + "loss": 1.1911, + "step": 989 + }, + { + "epoch": 0.07631822386679, + "grad_norm": 3.9739015102386475, + "learning_rate": 9.943994528117557e-06, + "loss": 1.1974, + "step": 990 + }, + { + "epoch": 0.07639531298180696, + "grad_norm": 4.158312797546387, + "learning_rate": 9.943808038392786e-06, + "loss": 1.1854, + "step": 991 + }, + { + "epoch": 0.07647240209682393, + "grad_norm": 3.7783594131469727, + "learning_rate": 9.943621240447188e-06, + "loss": 1.1394, + "step": 992 + }, + { + "epoch": 0.07654949121184089, + "grad_norm": 4.1893157958984375, + "learning_rate": 9.943434134292412e-06, + "loss": 1.2535, + "step": 993 + }, + { + "epoch": 0.07662658032685785, + "grad_norm": 4.0416388511657715, + "learning_rate": 9.943246719940118e-06, + "loss": 1.14, + "step": 994 + }, + { + "epoch": 0.0767036694418748, + "grad_norm": 5.405656337738037, + "learning_rate": 9.943058997401993e-06, + "loss": 1.2001, + "step": 995 + }, + { + "epoch": 0.07678075855689177, + "grad_norm": 3.7127041816711426, + "learning_rate": 9.942870966689742e-06, + "loss": 1.1386, + "step": 996 + }, + { + "epoch": 0.07685784767190873, + "grad_norm": 4.761865615844727, + "learning_rate": 9.942682627815084e-06, + "loss": 1.1629, + "step": 997 + }, + { + "epoch": 0.07693493678692569, + "grad_norm": 4.404749393463135, + "learning_rate": 9.942493980789762e-06, + "loss": 1.1531, + "step": 998 + }, + { + "epoch": 0.07701202590194264, + "grad_norm": 4.3769402503967285, + "learning_rate": 9.94230502562554e-06, + "loss": 1.2155, + "step": 999 + }, + { + "epoch": 0.0770891150169596, + "grad_norm": 4.384872913360596, + "learning_rate": 9.942115762334196e-06, + "loss": 1.287, + "step": 1000 + }, + { + "epoch": 0.07716620413197657, + "grad_norm": 3.899238109588623, + "learning_rate": 9.941926190927532e-06, + "loss": 1.1246, + "step": 1001 + }, + { + "epoch": 0.07724329324699353, + "grad_norm": 3.7073872089385986, + "learning_rate": 9.941736311417362e-06, + "loss": 1.2122, + "step": 1002 + }, + { + "epoch": 0.07732038236201048, + "grad_norm": 3.755033254623413, + "learning_rate": 9.94154612381553e-06, + "loss": 1.1603, + "step": 1003 + }, + { + "epoch": 0.07739747147702744, + "grad_norm": 4.051525115966797, + "learning_rate": 9.941355628133887e-06, + "loss": 1.121, + "step": 1004 + }, + { + "epoch": 0.0774745605920444, + "grad_norm": 3.882408380508423, + "learning_rate": 9.941164824384313e-06, + "loss": 1.2453, + "step": 1005 + }, + { + "epoch": 0.07755164970706137, + "grad_norm": 4.029034614562988, + "learning_rate": 9.940973712578706e-06, + "loss": 1.1899, + "step": 1006 + }, + { + "epoch": 0.07762873882207832, + "grad_norm": 4.212346076965332, + "learning_rate": 9.940782292728975e-06, + "loss": 1.2007, + "step": 1007 + }, + { + "epoch": 0.07770582793709528, + "grad_norm": 4.014603137969971, + "learning_rate": 9.940590564847059e-06, + "loss": 1.2438, + "step": 1008 + }, + { + "epoch": 0.07778291705211224, + "grad_norm": 4.43209171295166, + "learning_rate": 9.940398528944906e-06, + "loss": 1.2154, + "step": 1009 + }, + { + "epoch": 0.0778600061671292, + "grad_norm": 4.110109329223633, + "learning_rate": 9.940206185034496e-06, + "loss": 1.0843, + "step": 1010 + }, + { + "epoch": 0.07793709528214617, + "grad_norm": 4.09305477142334, + "learning_rate": 9.940013533127813e-06, + "loss": 1.1252, + "step": 1011 + }, + { + "epoch": 0.07801418439716312, + "grad_norm": 3.9109747409820557, + "learning_rate": 9.939820573236873e-06, + "loss": 1.3243, + "step": 1012 + }, + { + "epoch": 0.07809127351218008, + "grad_norm": 4.246638774871826, + "learning_rate": 9.939627305373703e-06, + "loss": 1.0503, + "step": 1013 + }, + { + "epoch": 0.07816836262719704, + "grad_norm": 4.346315383911133, + "learning_rate": 9.939433729550354e-06, + "loss": 1.2939, + "step": 1014 + }, + { + "epoch": 0.078245451742214, + "grad_norm": 4.1435322761535645, + "learning_rate": 9.939239845778894e-06, + "loss": 1.2417, + "step": 1015 + }, + { + "epoch": 0.07832254085723096, + "grad_norm": 4.223635673522949, + "learning_rate": 9.93904565407141e-06, + "loss": 1.2014, + "step": 1016 + }, + { + "epoch": 0.07839962997224792, + "grad_norm": 3.9585742950439453, + "learning_rate": 9.938851154440012e-06, + "loss": 1.0748, + "step": 1017 + }, + { + "epoch": 0.07847671908726488, + "grad_norm": 4.183574199676514, + "learning_rate": 9.93865634689682e-06, + "loss": 1.1727, + "step": 1018 + }, + { + "epoch": 0.07855380820228183, + "grad_norm": 3.769174098968506, + "learning_rate": 9.938461231453985e-06, + "loss": 1.2277, + "step": 1019 + }, + { + "epoch": 0.0786308973172988, + "grad_norm": 3.8185031414031982, + "learning_rate": 9.938265808123667e-06, + "loss": 1.1434, + "step": 1020 + }, + { + "epoch": 0.07870798643231576, + "grad_norm": 4.209797382354736, + "learning_rate": 9.938070076918056e-06, + "loss": 1.1186, + "step": 1021 + }, + { + "epoch": 0.07878507554733272, + "grad_norm": 4.088057518005371, + "learning_rate": 9.937874037849346e-06, + "loss": 1.2485, + "step": 1022 + }, + { + "epoch": 0.07886216466234967, + "grad_norm": 4.1362504959106445, + "learning_rate": 9.937677690929766e-06, + "loss": 1.1779, + "step": 1023 + }, + { + "epoch": 0.07893925377736663, + "grad_norm": 4.136228084564209, + "learning_rate": 9.937481036171555e-06, + "loss": 1.1269, + "step": 1024 + }, + { + "epoch": 0.0790163428923836, + "grad_norm": 4.334633827209473, + "learning_rate": 9.937284073586972e-06, + "loss": 1.191, + "step": 1025 + }, + { + "epoch": 0.07909343200740056, + "grad_norm": 4.252748012542725, + "learning_rate": 9.9370868031883e-06, + "loss": 1.2082, + "step": 1026 + }, + { + "epoch": 0.07917052112241751, + "grad_norm": 4.121849060058594, + "learning_rate": 9.936889224987834e-06, + "loss": 1.2799, + "step": 1027 + }, + { + "epoch": 0.07924761023743447, + "grad_norm": 4.112194061279297, + "learning_rate": 9.936691338997894e-06, + "loss": 1.2097, + "step": 1028 + }, + { + "epoch": 0.07932469935245143, + "grad_norm": 3.915144920349121, + "learning_rate": 9.936493145230817e-06, + "loss": 1.0446, + "step": 1029 + }, + { + "epoch": 0.0794017884674684, + "grad_norm": 4.162792682647705, + "learning_rate": 9.936294643698958e-06, + "loss": 1.1855, + "step": 1030 + }, + { + "epoch": 0.07947887758248535, + "grad_norm": 4.035287380218506, + "learning_rate": 9.936095834414693e-06, + "loss": 1.2207, + "step": 1031 + }, + { + "epoch": 0.07955596669750231, + "grad_norm": 4.192042827606201, + "learning_rate": 9.935896717390421e-06, + "loss": 1.2242, + "step": 1032 + }, + { + "epoch": 0.07963305581251927, + "grad_norm": 4.014597415924072, + "learning_rate": 9.93569729263855e-06, + "loss": 1.1331, + "step": 1033 + }, + { + "epoch": 0.07971014492753623, + "grad_norm": 3.836013078689575, + "learning_rate": 9.935497560171516e-06, + "loss": 1.1603, + "step": 1034 + }, + { + "epoch": 0.0797872340425532, + "grad_norm": 4.218697547912598, + "learning_rate": 9.93529752000177e-06, + "loss": 1.2531, + "step": 1035 + }, + { + "epoch": 0.07986432315757015, + "grad_norm": 3.811758518218994, + "learning_rate": 9.935097172141785e-06, + "loss": 1.1419, + "step": 1036 + }, + { + "epoch": 0.07994141227258711, + "grad_norm": 4.130055904388428, + "learning_rate": 9.93489651660405e-06, + "loss": 1.1776, + "step": 1037 + }, + { + "epoch": 0.08001850138760407, + "grad_norm": 4.011743068695068, + "learning_rate": 9.934695553401076e-06, + "loss": 1.2096, + "step": 1038 + }, + { + "epoch": 0.08009559050262104, + "grad_norm": 4.1407928466796875, + "learning_rate": 9.934494282545393e-06, + "loss": 1.1341, + "step": 1039 + }, + { + "epoch": 0.08017267961763799, + "grad_norm": 3.8697617053985596, + "learning_rate": 9.934292704049546e-06, + "loss": 1.1262, + "step": 1040 + }, + { + "epoch": 0.08024976873265495, + "grad_norm": 5.077935695648193, + "learning_rate": 9.934090817926105e-06, + "loss": 1.1413, + "step": 1041 + }, + { + "epoch": 0.0803268578476719, + "grad_norm": 4.142470359802246, + "learning_rate": 9.933888624187656e-06, + "loss": 1.3075, + "step": 1042 + }, + { + "epoch": 0.08040394696268886, + "grad_norm": 4.138065814971924, + "learning_rate": 9.933686122846804e-06, + "loss": 1.1422, + "step": 1043 + }, + { + "epoch": 0.08048103607770583, + "grad_norm": 4.101687908172607, + "learning_rate": 9.933483313916174e-06, + "loss": 1.2461, + "step": 1044 + }, + { + "epoch": 0.08055812519272279, + "grad_norm": 4.085910320281982, + "learning_rate": 9.933280197408413e-06, + "loss": 1.11, + "step": 1045 + }, + { + "epoch": 0.08063521430773975, + "grad_norm": 3.8039958477020264, + "learning_rate": 9.933076773336179e-06, + "loss": 1.1203, + "step": 1046 + }, + { + "epoch": 0.0807123034227567, + "grad_norm": 4.0165886878967285, + "learning_rate": 9.932873041712158e-06, + "loss": 1.1276, + "step": 1047 + }, + { + "epoch": 0.08078939253777366, + "grad_norm": 3.982520818710327, + "learning_rate": 9.932669002549052e-06, + "loss": 1.1715, + "step": 1048 + }, + { + "epoch": 0.08086648165279063, + "grad_norm": 4.542549133300781, + "learning_rate": 9.93246465585958e-06, + "loss": 1.2555, + "step": 1049 + }, + { + "epoch": 0.08094357076780759, + "grad_norm": 3.6177902221679688, + "learning_rate": 9.932260001656482e-06, + "loss": 0.995, + "step": 1050 + }, + { + "epoch": 0.08102065988282454, + "grad_norm": 4.135311603546143, + "learning_rate": 9.932055039952518e-06, + "loss": 1.2257, + "step": 1051 + }, + { + "epoch": 0.0810977489978415, + "grad_norm": 4.676593780517578, + "learning_rate": 9.931849770760467e-06, + "loss": 1.214, + "step": 1052 + }, + { + "epoch": 0.08117483811285846, + "grad_norm": 3.9388997554779053, + "learning_rate": 9.931644194093124e-06, + "loss": 1.1846, + "step": 1053 + }, + { + "epoch": 0.08125192722787543, + "grad_norm": 4.0811896324157715, + "learning_rate": 9.931438309963308e-06, + "loss": 1.1154, + "step": 1054 + }, + { + "epoch": 0.08132901634289239, + "grad_norm": 4.8191118240356445, + "learning_rate": 9.931232118383854e-06, + "loss": 1.2899, + "step": 1055 + }, + { + "epoch": 0.08140610545790934, + "grad_norm": 4.420556545257568, + "learning_rate": 9.931025619367617e-06, + "loss": 1.2523, + "step": 1056 + }, + { + "epoch": 0.0814831945729263, + "grad_norm": 4.228475093841553, + "learning_rate": 9.930818812927471e-06, + "loss": 1.2077, + "step": 1057 + }, + { + "epoch": 0.08156028368794327, + "grad_norm": 4.304680347442627, + "learning_rate": 9.93061169907631e-06, + "loss": 1.1991, + "step": 1058 + }, + { + "epoch": 0.08163737280296023, + "grad_norm": 4.341451168060303, + "learning_rate": 9.930404277827044e-06, + "loss": 1.1833, + "step": 1059 + }, + { + "epoch": 0.08171446191797718, + "grad_norm": 3.8857333660125732, + "learning_rate": 9.930196549192608e-06, + "loss": 1.1814, + "step": 1060 + }, + { + "epoch": 0.08179155103299414, + "grad_norm": 3.722275972366333, + "learning_rate": 9.929988513185952e-06, + "loss": 1.199, + "step": 1061 + }, + { + "epoch": 0.0818686401480111, + "grad_norm": 3.972513437271118, + "learning_rate": 9.929780169820043e-06, + "loss": 1.0716, + "step": 1062 + }, + { + "epoch": 0.08194572926302807, + "grad_norm": 3.907376289367676, + "learning_rate": 9.929571519107873e-06, + "loss": 1.0642, + "step": 1063 + }, + { + "epoch": 0.08202281837804502, + "grad_norm": 3.913541793823242, + "learning_rate": 9.92936256106245e-06, + "loss": 1.1794, + "step": 1064 + }, + { + "epoch": 0.08209990749306198, + "grad_norm": 4.19666862487793, + "learning_rate": 9.929153295696803e-06, + "loss": 1.2815, + "step": 1065 + }, + { + "epoch": 0.08217699660807894, + "grad_norm": 4.4634175300598145, + "learning_rate": 9.928943723023973e-06, + "loss": 1.2451, + "step": 1066 + }, + { + "epoch": 0.0822540857230959, + "grad_norm": 3.9684886932373047, + "learning_rate": 9.928733843057033e-06, + "loss": 1.1221, + "step": 1067 + }, + { + "epoch": 0.08233117483811286, + "grad_norm": 3.8272829055786133, + "learning_rate": 9.928523655809062e-06, + "loss": 1.1193, + "step": 1068 + }, + { + "epoch": 0.08240826395312982, + "grad_norm": 4.0728325843811035, + "learning_rate": 9.928313161293168e-06, + "loss": 1.1795, + "step": 1069 + }, + { + "epoch": 0.08248535306814678, + "grad_norm": 3.951913595199585, + "learning_rate": 9.928102359522473e-06, + "loss": 1.1846, + "step": 1070 + }, + { + "epoch": 0.08256244218316373, + "grad_norm": 3.7784245014190674, + "learning_rate": 9.927891250510118e-06, + "loss": 1.1188, + "step": 1071 + }, + { + "epoch": 0.08263953129818069, + "grad_norm": 4.366214752197266, + "learning_rate": 9.927679834269266e-06, + "loss": 1.1808, + "step": 1072 + }, + { + "epoch": 0.08271662041319766, + "grad_norm": 4.112607002258301, + "learning_rate": 9.927468110813099e-06, + "loss": 1.1893, + "step": 1073 + }, + { + "epoch": 0.08279370952821462, + "grad_norm": 3.9396910667419434, + "learning_rate": 9.927256080154813e-06, + "loss": 1.2093, + "step": 1074 + }, + { + "epoch": 0.08287079864323157, + "grad_norm": 4.674061298370361, + "learning_rate": 9.927043742307631e-06, + "loss": 1.2488, + "step": 1075 + }, + { + "epoch": 0.08294788775824853, + "grad_norm": 5.152288436889648, + "learning_rate": 9.926831097284788e-06, + "loss": 1.2201, + "step": 1076 + }, + { + "epoch": 0.08302497687326549, + "grad_norm": 3.9294207096099854, + "learning_rate": 9.926618145099544e-06, + "loss": 1.2437, + "step": 1077 + }, + { + "epoch": 0.08310206598828246, + "grad_norm": 4.059928894042969, + "learning_rate": 9.926404885765175e-06, + "loss": 1.1745, + "step": 1078 + }, + { + "epoch": 0.08317915510329942, + "grad_norm": 4.130738735198975, + "learning_rate": 9.926191319294974e-06, + "loss": 1.2372, + "step": 1079 + }, + { + "epoch": 0.08325624421831637, + "grad_norm": 4.510885238647461, + "learning_rate": 9.92597744570226e-06, + "loss": 1.3444, + "step": 1080 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 3.76959490776062, + "learning_rate": 9.92576326500036e-06, + "loss": 1.0194, + "step": 1081 + }, + { + "epoch": 0.0834104224483503, + "grad_norm": 4.379603862762451, + "learning_rate": 9.925548777202636e-06, + "loss": 1.2125, + "step": 1082 + }, + { + "epoch": 0.08348751156336726, + "grad_norm": 4.4609375, + "learning_rate": 9.925333982322456e-06, + "loss": 1.2694, + "step": 1083 + }, + { + "epoch": 0.08356460067838421, + "grad_norm": 4.134787082672119, + "learning_rate": 9.925118880373208e-06, + "loss": 1.188, + "step": 1084 + }, + { + "epoch": 0.08364168979340117, + "grad_norm": 3.97082781791687, + "learning_rate": 9.924903471368308e-06, + "loss": 1.0874, + "step": 1085 + }, + { + "epoch": 0.08371877890841813, + "grad_norm": 3.723400592803955, + "learning_rate": 9.924687755321183e-06, + "loss": 1.2485, + "step": 1086 + }, + { + "epoch": 0.0837958680234351, + "grad_norm": 4.063663005828857, + "learning_rate": 9.92447173224528e-06, + "loss": 1.1585, + "step": 1087 + }, + { + "epoch": 0.08387295713845205, + "grad_norm": 4.590901851654053, + "learning_rate": 9.924255402154072e-06, + "loss": 1.2752, + "step": 1088 + }, + { + "epoch": 0.08395004625346901, + "grad_norm": 3.818544626235962, + "learning_rate": 9.924038765061042e-06, + "loss": 1.0903, + "step": 1089 + }, + { + "epoch": 0.08402713536848597, + "grad_norm": 4.31704044342041, + "learning_rate": 9.923821820979695e-06, + "loss": 1.2734, + "step": 1090 + }, + { + "epoch": 0.08410422448350292, + "grad_norm": 4.051934719085693, + "learning_rate": 9.923604569923562e-06, + "loss": 1.304, + "step": 1091 + }, + { + "epoch": 0.0841813135985199, + "grad_norm": 3.77911376953125, + "learning_rate": 9.923387011906183e-06, + "loss": 1.1781, + "step": 1092 + }, + { + "epoch": 0.08425840271353685, + "grad_norm": 4.27567720413208, + "learning_rate": 9.923169146941121e-06, + "loss": 1.2181, + "step": 1093 + }, + { + "epoch": 0.08433549182855381, + "grad_norm": 4.199497699737549, + "learning_rate": 9.922950975041963e-06, + "loss": 1.2001, + "step": 1094 + }, + { + "epoch": 0.08441258094357076, + "grad_norm": 4.348501682281494, + "learning_rate": 9.922732496222306e-06, + "loss": 1.3706, + "step": 1095 + }, + { + "epoch": 0.08448967005858772, + "grad_norm": 3.888430118560791, + "learning_rate": 9.922513710495774e-06, + "loss": 1.2277, + "step": 1096 + }, + { + "epoch": 0.08456675917360469, + "grad_norm": 4.729867935180664, + "learning_rate": 9.922294617876007e-06, + "loss": 1.3851, + "step": 1097 + }, + { + "epoch": 0.08464384828862165, + "grad_norm": 4.120565891265869, + "learning_rate": 9.922075218376664e-06, + "loss": 1.1222, + "step": 1098 + }, + { + "epoch": 0.0847209374036386, + "grad_norm": 4.8957839012146, + "learning_rate": 9.921855512011422e-06, + "loss": 1.2947, + "step": 1099 + }, + { + "epoch": 0.08479802651865556, + "grad_norm": 4.164621829986572, + "learning_rate": 9.921635498793983e-06, + "loss": 1.0555, + "step": 1100 + }, + { + "epoch": 0.08487511563367253, + "grad_norm": 4.292223930358887, + "learning_rate": 9.921415178738056e-06, + "loss": 1.2633, + "step": 1101 + }, + { + "epoch": 0.08495220474868949, + "grad_norm": 4.393791198730469, + "learning_rate": 9.921194551857384e-06, + "loss": 1.2356, + "step": 1102 + }, + { + "epoch": 0.08502929386370645, + "grad_norm": 4.085275650024414, + "learning_rate": 9.920973618165719e-06, + "loss": 1.072, + "step": 1103 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 4.277721881866455, + "learning_rate": 9.920752377676835e-06, + "loss": 1.1315, + "step": 1104 + }, + { + "epoch": 0.08518347209374036, + "grad_norm": 4.160389423370361, + "learning_rate": 9.920530830404525e-06, + "loss": 1.2079, + "step": 1105 + }, + { + "epoch": 0.08526056120875733, + "grad_norm": 4.0644917488098145, + "learning_rate": 9.920308976362602e-06, + "loss": 1.1534, + "step": 1106 + }, + { + "epoch": 0.08533765032377429, + "grad_norm": 4.176297187805176, + "learning_rate": 9.920086815564898e-06, + "loss": 1.3915, + "step": 1107 + }, + { + "epoch": 0.08541473943879124, + "grad_norm": 4.304412841796875, + "learning_rate": 9.919864348025261e-06, + "loss": 1.275, + "step": 1108 + }, + { + "epoch": 0.0854918285538082, + "grad_norm": 4.072107791900635, + "learning_rate": 9.919641573757563e-06, + "loss": 1.1173, + "step": 1109 + }, + { + "epoch": 0.08556891766882516, + "grad_norm": 4.24763822555542, + "learning_rate": 9.919418492775694e-06, + "loss": 1.1268, + "step": 1110 + }, + { + "epoch": 0.08564600678384213, + "grad_norm": 4.156879901885986, + "learning_rate": 9.91919510509356e-06, + "loss": 1.1882, + "step": 1111 + }, + { + "epoch": 0.08572309589885908, + "grad_norm": 4.036397933959961, + "learning_rate": 9.918971410725089e-06, + "loss": 1.2513, + "step": 1112 + }, + { + "epoch": 0.08580018501387604, + "grad_norm": 4.336670398712158, + "learning_rate": 9.918747409684225e-06, + "loss": 1.0309, + "step": 1113 + }, + { + "epoch": 0.085877274128893, + "grad_norm": 4.337256908416748, + "learning_rate": 9.918523101984933e-06, + "loss": 1.1959, + "step": 1114 + }, + { + "epoch": 0.08595436324390995, + "grad_norm": 3.9590811729431152, + "learning_rate": 9.918298487641202e-06, + "loss": 1.15, + "step": 1115 + }, + { + "epoch": 0.08603145235892692, + "grad_norm": 4.2955522537231445, + "learning_rate": 9.918073566667033e-06, + "loss": 1.1446, + "step": 1116 + }, + { + "epoch": 0.08610854147394388, + "grad_norm": 4.317296028137207, + "learning_rate": 9.917848339076448e-06, + "loss": 1.2451, + "step": 1117 + }, + { + "epoch": 0.08618563058896084, + "grad_norm": 4.0374755859375, + "learning_rate": 9.917622804883488e-06, + "loss": 1.1549, + "step": 1118 + }, + { + "epoch": 0.0862627197039778, + "grad_norm": 3.970003366470337, + "learning_rate": 9.917396964102218e-06, + "loss": 1.123, + "step": 1119 + }, + { + "epoch": 0.08633980881899475, + "grad_norm": 3.7095046043395996, + "learning_rate": 9.917170816746713e-06, + "loss": 1.1372, + "step": 1120 + }, + { + "epoch": 0.08641689793401172, + "grad_norm": 4.35107946395874, + "learning_rate": 9.916944362831073e-06, + "loss": 1.2134, + "step": 1121 + }, + { + "epoch": 0.08649398704902868, + "grad_norm": 4.03241491317749, + "learning_rate": 9.91671760236942e-06, + "loss": 1.119, + "step": 1122 + }, + { + "epoch": 0.08657107616404563, + "grad_norm": 3.650089979171753, + "learning_rate": 9.916490535375889e-06, + "loss": 1.1689, + "step": 1123 + }, + { + "epoch": 0.08664816527906259, + "grad_norm": 4.5127787590026855, + "learning_rate": 9.916263161864634e-06, + "loss": 1.2254, + "step": 1124 + }, + { + "epoch": 0.08672525439407956, + "grad_norm": 4.502647399902344, + "learning_rate": 9.916035481849834e-06, + "loss": 1.3046, + "step": 1125 + }, + { + "epoch": 0.08680234350909652, + "grad_norm": 4.246219635009766, + "learning_rate": 9.915807495345682e-06, + "loss": 1.1592, + "step": 1126 + }, + { + "epoch": 0.08687943262411348, + "grad_norm": 3.7990305423736572, + "learning_rate": 9.915579202366393e-06, + "loss": 1.2351, + "step": 1127 + }, + { + "epoch": 0.08695652173913043, + "grad_norm": 3.929375648498535, + "learning_rate": 9.915350602926198e-06, + "loss": 1.1652, + "step": 1128 + }, + { + "epoch": 0.08703361085414739, + "grad_norm": 3.9628517627716064, + "learning_rate": 9.915121697039352e-06, + "loss": 1.0718, + "step": 1129 + }, + { + "epoch": 0.08711069996916436, + "grad_norm": 4.691619396209717, + "learning_rate": 9.914892484720124e-06, + "loss": 1.2074, + "step": 1130 + }, + { + "epoch": 0.08718778908418132, + "grad_norm": 3.8248648643493652, + "learning_rate": 9.914662965982803e-06, + "loss": 1.0582, + "step": 1131 + }, + { + "epoch": 0.08726487819919827, + "grad_norm": 4.224147319793701, + "learning_rate": 9.914433140841702e-06, + "loss": 1.1567, + "step": 1132 + }, + { + "epoch": 0.08734196731421523, + "grad_norm": 4.157731533050537, + "learning_rate": 9.914203009311146e-06, + "loss": 1.1007, + "step": 1133 + }, + { + "epoch": 0.08741905642923219, + "grad_norm": 4.48982048034668, + "learning_rate": 9.913972571405482e-06, + "loss": 1.3227, + "step": 1134 + }, + { + "epoch": 0.08749614554424916, + "grad_norm": 3.823336362838745, + "learning_rate": 9.913741827139081e-06, + "loss": 1.2045, + "step": 1135 + }, + { + "epoch": 0.08757323465926611, + "grad_norm": 4.46881103515625, + "learning_rate": 9.913510776526324e-06, + "loss": 1.2047, + "step": 1136 + }, + { + "epoch": 0.08765032377428307, + "grad_norm": 4.13620662689209, + "learning_rate": 9.913279419581619e-06, + "loss": 1.2527, + "step": 1137 + }, + { + "epoch": 0.08772741288930003, + "grad_norm": 3.984325408935547, + "learning_rate": 9.913047756319388e-06, + "loss": 1.1912, + "step": 1138 + }, + { + "epoch": 0.08780450200431698, + "grad_norm": 4.111431121826172, + "learning_rate": 9.912815786754075e-06, + "loss": 1.2131, + "step": 1139 + }, + { + "epoch": 0.08788159111933395, + "grad_norm": 3.925600290298462, + "learning_rate": 9.912583510900142e-06, + "loss": 1.0794, + "step": 1140 + }, + { + "epoch": 0.08795868023435091, + "grad_norm": 3.7339839935302734, + "learning_rate": 9.91235092877207e-06, + "loss": 1.1662, + "step": 1141 + }, + { + "epoch": 0.08803576934936787, + "grad_norm": 3.6147632598876953, + "learning_rate": 9.912118040384358e-06, + "loss": 1.0365, + "step": 1142 + }, + { + "epoch": 0.08811285846438482, + "grad_norm": 4.1122355461120605, + "learning_rate": 9.911884845751529e-06, + "loss": 1.2303, + "step": 1143 + }, + { + "epoch": 0.0881899475794018, + "grad_norm": 4.269367218017578, + "learning_rate": 9.911651344888117e-06, + "loss": 1.217, + "step": 1144 + }, + { + "epoch": 0.08826703669441875, + "grad_norm": 3.916430711746216, + "learning_rate": 9.911417537808684e-06, + "loss": 1.1381, + "step": 1145 + }, + { + "epoch": 0.08834412580943571, + "grad_norm": 3.4542276859283447, + "learning_rate": 9.911183424527802e-06, + "loss": 1.0333, + "step": 1146 + }, + { + "epoch": 0.08842121492445267, + "grad_norm": 3.716357707977295, + "learning_rate": 9.91094900506007e-06, + "loss": 1.228, + "step": 1147 + }, + { + "epoch": 0.08849830403946962, + "grad_norm": 4.024401664733887, + "learning_rate": 9.910714279420103e-06, + "loss": 1.1965, + "step": 1148 + }, + { + "epoch": 0.08857539315448659, + "grad_norm": 3.9577510356903076, + "learning_rate": 9.910479247622534e-06, + "loss": 1.1488, + "step": 1149 + }, + { + "epoch": 0.08865248226950355, + "grad_norm": 4.212368011474609, + "learning_rate": 9.910243909682014e-06, + "loss": 1.2137, + "step": 1150 + }, + { + "epoch": 0.0887295713845205, + "grad_norm": 3.935518503189087, + "learning_rate": 9.910008265613219e-06, + "loss": 1.0983, + "step": 1151 + }, + { + "epoch": 0.08880666049953746, + "grad_norm": 4.040730953216553, + "learning_rate": 9.909772315430837e-06, + "loss": 1.1936, + "step": 1152 + }, + { + "epoch": 0.08888374961455442, + "grad_norm": 3.9304912090301514, + "learning_rate": 9.90953605914958e-06, + "loss": 1.2163, + "step": 1153 + }, + { + "epoch": 0.08896083872957139, + "grad_norm": 3.890301465988159, + "learning_rate": 9.909299496784177e-06, + "loss": 1.2424, + "step": 1154 + }, + { + "epoch": 0.08903792784458835, + "grad_norm": 3.953073263168335, + "learning_rate": 9.909062628349375e-06, + "loss": 1.2468, + "step": 1155 + }, + { + "epoch": 0.0891150169596053, + "grad_norm": 4.048395156860352, + "learning_rate": 9.908825453859944e-06, + "loss": 1.2238, + "step": 1156 + }, + { + "epoch": 0.08919210607462226, + "grad_norm": 3.6528916358947754, + "learning_rate": 9.90858797333067e-06, + "loss": 1.1587, + "step": 1157 + }, + { + "epoch": 0.08926919518963922, + "grad_norm": 3.6531436443328857, + "learning_rate": 9.908350186776357e-06, + "loss": 1.1732, + "step": 1158 + }, + { + "epoch": 0.08934628430465619, + "grad_norm": 4.267482280731201, + "learning_rate": 9.908112094211831e-06, + "loss": 1.259, + "step": 1159 + }, + { + "epoch": 0.08942337341967314, + "grad_norm": 4.037517547607422, + "learning_rate": 9.907873695651935e-06, + "loss": 1.1787, + "step": 1160 + }, + { + "epoch": 0.0895004625346901, + "grad_norm": 3.696422815322876, + "learning_rate": 9.907634991111535e-06, + "loss": 1.0961, + "step": 1161 + }, + { + "epoch": 0.08957755164970706, + "grad_norm": 4.135885238647461, + "learning_rate": 9.907395980605509e-06, + "loss": 1.2558, + "step": 1162 + }, + { + "epoch": 0.08965464076472403, + "grad_norm": 4.450069904327393, + "learning_rate": 9.907156664148761e-06, + "loss": 1.2009, + "step": 1163 + }, + { + "epoch": 0.08973172987974098, + "grad_norm": 3.922731876373291, + "learning_rate": 9.906917041756208e-06, + "loss": 1.1875, + "step": 1164 + }, + { + "epoch": 0.08980881899475794, + "grad_norm": 4.55860710144043, + "learning_rate": 9.90667711344279e-06, + "loss": 1.2364, + "step": 1165 + }, + { + "epoch": 0.0898859081097749, + "grad_norm": 4.525522232055664, + "learning_rate": 9.90643687922347e-06, + "loss": 1.2342, + "step": 1166 + }, + { + "epoch": 0.08996299722479185, + "grad_norm": 4.051623344421387, + "learning_rate": 9.90619633911322e-06, + "loss": 1.2236, + "step": 1167 + }, + { + "epoch": 0.09004008633980883, + "grad_norm": 3.817911148071289, + "learning_rate": 9.905955493127037e-06, + "loss": 1.0846, + "step": 1168 + }, + { + "epoch": 0.09011717545482578, + "grad_norm": 3.949352979660034, + "learning_rate": 9.905714341279938e-06, + "loss": 1.1237, + "step": 1169 + }, + { + "epoch": 0.09019426456984274, + "grad_norm": 3.8742692470550537, + "learning_rate": 9.905472883586958e-06, + "loss": 1.0737, + "step": 1170 + }, + { + "epoch": 0.0902713536848597, + "grad_norm": 4.435111999511719, + "learning_rate": 9.905231120063149e-06, + "loss": 1.3749, + "step": 1171 + }, + { + "epoch": 0.09034844279987665, + "grad_norm": 3.793482780456543, + "learning_rate": 9.904989050723583e-06, + "loss": 1.1484, + "step": 1172 + }, + { + "epoch": 0.09042553191489362, + "grad_norm": 4.095993995666504, + "learning_rate": 9.904746675583356e-06, + "loss": 1.1579, + "step": 1173 + }, + { + "epoch": 0.09050262102991058, + "grad_norm": 4.051393032073975, + "learning_rate": 9.904503994657574e-06, + "loss": 1.1387, + "step": 1174 + }, + { + "epoch": 0.09057971014492754, + "grad_norm": 3.8025002479553223, + "learning_rate": 9.90426100796137e-06, + "loss": 1.0579, + "step": 1175 + }, + { + "epoch": 0.09065679925994449, + "grad_norm": 4.1807708740234375, + "learning_rate": 9.904017715509893e-06, + "loss": 1.2583, + "step": 1176 + }, + { + "epoch": 0.09073388837496145, + "grad_norm": 3.890801191329956, + "learning_rate": 9.90377411731831e-06, + "loss": 1.1381, + "step": 1177 + }, + { + "epoch": 0.09081097748997842, + "grad_norm": 3.741750955581665, + "learning_rate": 9.903530213401806e-06, + "loss": 1.1599, + "step": 1178 + }, + { + "epoch": 0.09088806660499538, + "grad_norm": 4.385606288909912, + "learning_rate": 9.903286003775592e-06, + "loss": 1.2269, + "step": 1179 + }, + { + "epoch": 0.09096515572001233, + "grad_norm": 4.160192012786865, + "learning_rate": 9.903041488454888e-06, + "loss": 1.0707, + "step": 1180 + }, + { + "epoch": 0.09104224483502929, + "grad_norm": 4.092873573303223, + "learning_rate": 9.90279666745494e-06, + "loss": 1.2221, + "step": 1181 + }, + { + "epoch": 0.09111933395004625, + "grad_norm": 4.053955078125, + "learning_rate": 9.902551540791016e-06, + "loss": 1.1687, + "step": 1182 + }, + { + "epoch": 0.09119642306506322, + "grad_norm": 4.5065789222717285, + "learning_rate": 9.902306108478393e-06, + "loss": 1.004, + "step": 1183 + }, + { + "epoch": 0.09127351218008017, + "grad_norm": 4.21845006942749, + "learning_rate": 9.902060370532371e-06, + "loss": 1.2317, + "step": 1184 + }, + { + "epoch": 0.09135060129509713, + "grad_norm": 3.860323190689087, + "learning_rate": 9.901814326968276e-06, + "loss": 1.1142, + "step": 1185 + }, + { + "epoch": 0.09142769041011409, + "grad_norm": 4.450876712799072, + "learning_rate": 9.901567977801444e-06, + "loss": 1.1097, + "step": 1186 + }, + { + "epoch": 0.09150477952513106, + "grad_norm": 3.953404664993286, + "learning_rate": 9.901321323047235e-06, + "loss": 1.0921, + "step": 1187 + }, + { + "epoch": 0.09158186864014801, + "grad_norm": 3.6998918056488037, + "learning_rate": 9.901074362721024e-06, + "loss": 1.2101, + "step": 1188 + }, + { + "epoch": 0.09165895775516497, + "grad_norm": 3.8323581218719482, + "learning_rate": 9.900827096838213e-06, + "loss": 1.1592, + "step": 1189 + }, + { + "epoch": 0.09173604687018193, + "grad_norm": 3.705106496810913, + "learning_rate": 9.900579525414213e-06, + "loss": 1.1281, + "step": 1190 + }, + { + "epoch": 0.09181313598519888, + "grad_norm": 3.9382426738739014, + "learning_rate": 9.900331648464459e-06, + "loss": 1.1911, + "step": 1191 + }, + { + "epoch": 0.09189022510021586, + "grad_norm": 4.3277740478515625, + "learning_rate": 9.900083466004409e-06, + "loss": 1.2685, + "step": 1192 + }, + { + "epoch": 0.09196731421523281, + "grad_norm": 3.8461456298828125, + "learning_rate": 9.899834978049531e-06, + "loss": 1.2131, + "step": 1193 + }, + { + "epoch": 0.09204440333024977, + "grad_norm": 4.075659275054932, + "learning_rate": 9.89958618461532e-06, + "loss": 1.171, + "step": 1194 + }, + { + "epoch": 0.09212149244526673, + "grad_norm": 3.7878012657165527, + "learning_rate": 9.899337085717284e-06, + "loss": 1.1431, + "step": 1195 + }, + { + "epoch": 0.09219858156028368, + "grad_norm": 3.931734800338745, + "learning_rate": 9.899087681370958e-06, + "loss": 1.2347, + "step": 1196 + }, + { + "epoch": 0.09227567067530065, + "grad_norm": 4.06639289855957, + "learning_rate": 9.898837971591885e-06, + "loss": 1.2163, + "step": 1197 + }, + { + "epoch": 0.09235275979031761, + "grad_norm": 3.754260301589966, + "learning_rate": 9.89858795639564e-06, + "loss": 1.1521, + "step": 1198 + }, + { + "epoch": 0.09242984890533457, + "grad_norm": 4.012813568115234, + "learning_rate": 9.898337635797803e-06, + "loss": 1.1976, + "step": 1199 + }, + { + "epoch": 0.09250693802035152, + "grad_norm": 3.8789308071136475, + "learning_rate": 9.898087009813985e-06, + "loss": 1.1611, + "step": 1200 + }, + { + "epoch": 0.09258402713536848, + "grad_norm": 3.8602101802825928, + "learning_rate": 9.89783607845981e-06, + "loss": 1.2561, + "step": 1201 + }, + { + "epoch": 0.09266111625038545, + "grad_norm": 3.982560157775879, + "learning_rate": 9.897584841750922e-06, + "loss": 1.2404, + "step": 1202 + }, + { + "epoch": 0.0927382053654024, + "grad_norm": 4.090329170227051, + "learning_rate": 9.897333299702982e-06, + "loss": 1.1678, + "step": 1203 + }, + { + "epoch": 0.09281529448041936, + "grad_norm": 4.530256271362305, + "learning_rate": 9.897081452331677e-06, + "loss": 1.2437, + "step": 1204 + }, + { + "epoch": 0.09289238359543632, + "grad_norm": 4.283809185028076, + "learning_rate": 9.896829299652705e-06, + "loss": 1.1977, + "step": 1205 + }, + { + "epoch": 0.09296947271045329, + "grad_norm": 3.9148154258728027, + "learning_rate": 9.896576841681792e-06, + "loss": 1.2863, + "step": 1206 + }, + { + "epoch": 0.09304656182547025, + "grad_norm": 3.6620705127716064, + "learning_rate": 9.896324078434668e-06, + "loss": 1.1627, + "step": 1207 + }, + { + "epoch": 0.0931236509404872, + "grad_norm": 4.204239368438721, + "learning_rate": 9.896071009927098e-06, + "loss": 1.166, + "step": 1208 + }, + { + "epoch": 0.09320074005550416, + "grad_norm": 4.137142658233643, + "learning_rate": 9.895817636174857e-06, + "loss": 1.2225, + "step": 1209 + }, + { + "epoch": 0.09327782917052112, + "grad_norm": 4.27466344833374, + "learning_rate": 9.895563957193744e-06, + "loss": 1.1555, + "step": 1210 + }, + { + "epoch": 0.09335491828553809, + "grad_norm": 4.381038188934326, + "learning_rate": 9.89530997299957e-06, + "loss": 1.2687, + "step": 1211 + }, + { + "epoch": 0.09343200740055504, + "grad_norm": 3.764836549758911, + "learning_rate": 9.895055683608175e-06, + "loss": 1.0278, + "step": 1212 + }, + { + "epoch": 0.093509096515572, + "grad_norm": 4.309133529663086, + "learning_rate": 9.89480108903541e-06, + "loss": 1.217, + "step": 1213 + }, + { + "epoch": 0.09358618563058896, + "grad_norm": 4.008470058441162, + "learning_rate": 9.894546189297148e-06, + "loss": 1.1429, + "step": 1214 + }, + { + "epoch": 0.09366327474560592, + "grad_norm": 3.7469325065612793, + "learning_rate": 9.894290984409281e-06, + "loss": 1.1331, + "step": 1215 + }, + { + "epoch": 0.09374036386062289, + "grad_norm": 4.368958473205566, + "learning_rate": 9.894035474387719e-06, + "loss": 1.2149, + "step": 1216 + }, + { + "epoch": 0.09381745297563984, + "grad_norm": 4.350530624389648, + "learning_rate": 9.893779659248393e-06, + "loss": 1.2107, + "step": 1217 + }, + { + "epoch": 0.0938945420906568, + "grad_norm": 3.9404361248016357, + "learning_rate": 9.893523539007248e-06, + "loss": 1.1363, + "step": 1218 + }, + { + "epoch": 0.09397163120567376, + "grad_norm": 4.558434009552002, + "learning_rate": 9.893267113680257e-06, + "loss": 1.1958, + "step": 1219 + }, + { + "epoch": 0.09404872032069071, + "grad_norm": 4.263239860534668, + "learning_rate": 9.893010383283404e-06, + "loss": 1.2427, + "step": 1220 + }, + { + "epoch": 0.09412580943570768, + "grad_norm": 4.081235408782959, + "learning_rate": 9.892753347832695e-06, + "loss": 1.1318, + "step": 1221 + }, + { + "epoch": 0.09420289855072464, + "grad_norm": 3.902259111404419, + "learning_rate": 9.892496007344155e-06, + "loss": 1.225, + "step": 1222 + }, + { + "epoch": 0.0942799876657416, + "grad_norm": 4.030986785888672, + "learning_rate": 9.892238361833826e-06, + "loss": 1.1857, + "step": 1223 + }, + { + "epoch": 0.09435707678075855, + "grad_norm": 4.28933048248291, + "learning_rate": 9.891980411317774e-06, + "loss": 1.2686, + "step": 1224 + }, + { + "epoch": 0.09443416589577551, + "grad_norm": 3.7828292846679688, + "learning_rate": 9.89172215581208e-06, + "loss": 1.1151, + "step": 1225 + }, + { + "epoch": 0.09451125501079248, + "grad_norm": 3.7171690464019775, + "learning_rate": 9.891463595332844e-06, + "loss": 1.18, + "step": 1226 + }, + { + "epoch": 0.09458834412580944, + "grad_norm": 4.4071478843688965, + "learning_rate": 9.891204729896187e-06, + "loss": 1.1709, + "step": 1227 + }, + { + "epoch": 0.0946654332408264, + "grad_norm": 3.7841458320617676, + "learning_rate": 9.890945559518247e-06, + "loss": 1.234, + "step": 1228 + }, + { + "epoch": 0.09474252235584335, + "grad_norm": 4.163311004638672, + "learning_rate": 9.890686084215182e-06, + "loss": 1.1784, + "step": 1229 + }, + { + "epoch": 0.09481961147086032, + "grad_norm": 4.381045341491699, + "learning_rate": 9.89042630400317e-06, + "loss": 1.1428, + "step": 1230 + }, + { + "epoch": 0.09489670058587728, + "grad_norm": 4.111490726470947, + "learning_rate": 9.890166218898405e-06, + "loss": 1.2638, + "step": 1231 + }, + { + "epoch": 0.09497378970089423, + "grad_norm": 4.06538200378418, + "learning_rate": 9.889905828917103e-06, + "loss": 1.2292, + "step": 1232 + }, + { + "epoch": 0.09505087881591119, + "grad_norm": 3.760375499725342, + "learning_rate": 9.8896451340755e-06, + "loss": 1.1367, + "step": 1233 + }, + { + "epoch": 0.09512796793092815, + "grad_norm": 3.769625186920166, + "learning_rate": 9.889384134389844e-06, + "loss": 1.2137, + "step": 1234 + }, + { + "epoch": 0.09520505704594512, + "grad_norm": 4.986996650695801, + "learning_rate": 9.889122829876412e-06, + "loss": 1.2135, + "step": 1235 + }, + { + "epoch": 0.09528214616096208, + "grad_norm": 3.983733654022217, + "learning_rate": 9.888861220551494e-06, + "loss": 1.1738, + "step": 1236 + }, + { + "epoch": 0.09535923527597903, + "grad_norm": 3.9600632190704346, + "learning_rate": 9.888599306431397e-06, + "loss": 1.0924, + "step": 1237 + }, + { + "epoch": 0.09543632439099599, + "grad_norm": 3.82342791557312, + "learning_rate": 9.888337087532452e-06, + "loss": 1.2129, + "step": 1238 + }, + { + "epoch": 0.09551341350601295, + "grad_norm": 4.06931734085083, + "learning_rate": 9.888074563871007e-06, + "loss": 1.0888, + "step": 1239 + }, + { + "epoch": 0.09559050262102992, + "grad_norm": 3.582798719406128, + "learning_rate": 9.88781173546343e-06, + "loss": 1.0759, + "step": 1240 + }, + { + "epoch": 0.09566759173604687, + "grad_norm": 4.151041507720947, + "learning_rate": 9.887548602326104e-06, + "loss": 1.09, + "step": 1241 + }, + { + "epoch": 0.09574468085106383, + "grad_norm": 4.339078426361084, + "learning_rate": 9.887285164475438e-06, + "loss": 1.1669, + "step": 1242 + }, + { + "epoch": 0.09582176996608079, + "grad_norm": 4.104028701782227, + "learning_rate": 9.887021421927853e-06, + "loss": 1.1578, + "step": 1243 + }, + { + "epoch": 0.09589885908109774, + "grad_norm": 4.224393844604492, + "learning_rate": 9.886757374699792e-06, + "loss": 1.1235, + "step": 1244 + }, + { + "epoch": 0.09597594819611471, + "grad_norm": 3.995770215988159, + "learning_rate": 9.88649302280772e-06, + "loss": 1.1039, + "step": 1245 + }, + { + "epoch": 0.09605303731113167, + "grad_norm": 4.252878665924072, + "learning_rate": 9.886228366268114e-06, + "loss": 1.1712, + "step": 1246 + }, + { + "epoch": 0.09613012642614863, + "grad_norm": 4.067622661590576, + "learning_rate": 9.885963405097477e-06, + "loss": 1.2701, + "step": 1247 + }, + { + "epoch": 0.09620721554116558, + "grad_norm": 4.1838765144348145, + "learning_rate": 9.885698139312326e-06, + "loss": 1.189, + "step": 1248 + }, + { + "epoch": 0.09628430465618255, + "grad_norm": 3.9121227264404297, + "learning_rate": 9.8854325689292e-06, + "loss": 1.1588, + "step": 1249 + }, + { + "epoch": 0.09636139377119951, + "grad_norm": 4.973738193511963, + "learning_rate": 9.885166693964654e-06, + "loss": 1.3419, + "step": 1250 + }, + { + "epoch": 0.09643848288621647, + "grad_norm": 3.7892560958862305, + "learning_rate": 9.884900514435266e-06, + "loss": 1.1613, + "step": 1251 + }, + { + "epoch": 0.09651557200123342, + "grad_norm": 3.8883039951324463, + "learning_rate": 9.884634030357634e-06, + "loss": 1.0897, + "step": 1252 + }, + { + "epoch": 0.09659266111625038, + "grad_norm": 3.8611433506011963, + "learning_rate": 9.884367241748364e-06, + "loss": 1.1545, + "step": 1253 + }, + { + "epoch": 0.09666975023126735, + "grad_norm": 3.8657636642456055, + "learning_rate": 9.884100148624096e-06, + "loss": 1.2268, + "step": 1254 + }, + { + "epoch": 0.09674683934628431, + "grad_norm": 3.610783576965332, + "learning_rate": 9.883832751001479e-06, + "loss": 1.0718, + "step": 1255 + }, + { + "epoch": 0.09682392846130126, + "grad_norm": 3.7750887870788574, + "learning_rate": 9.883565048897183e-06, + "loss": 1.1832, + "step": 1256 + }, + { + "epoch": 0.09690101757631822, + "grad_norm": 4.035364627838135, + "learning_rate": 9.883297042327899e-06, + "loss": 1.2015, + "step": 1257 + }, + { + "epoch": 0.09697810669133518, + "grad_norm": 3.7171030044555664, + "learning_rate": 9.883028731310335e-06, + "loss": 1.1283, + "step": 1258 + }, + { + "epoch": 0.09705519580635215, + "grad_norm": 3.9022812843322754, + "learning_rate": 9.88276011586122e-06, + "loss": 1.18, + "step": 1259 + }, + { + "epoch": 0.0971322849213691, + "grad_norm": 4.099169731140137, + "learning_rate": 9.882491195997301e-06, + "loss": 1.1186, + "step": 1260 + }, + { + "epoch": 0.09720937403638606, + "grad_norm": 4.0367326736450195, + "learning_rate": 9.882221971735343e-06, + "loss": 1.1546, + "step": 1261 + }, + { + "epoch": 0.09728646315140302, + "grad_norm": 3.8437790870666504, + "learning_rate": 9.88195244309213e-06, + "loss": 1.1853, + "step": 1262 + }, + { + "epoch": 0.09736355226641998, + "grad_norm": 3.9312708377838135, + "learning_rate": 9.881682610084467e-06, + "loss": 1.2333, + "step": 1263 + }, + { + "epoch": 0.09744064138143695, + "grad_norm": 4.007068157196045, + "learning_rate": 9.881412472729175e-06, + "loss": 1.1373, + "step": 1264 + }, + { + "epoch": 0.0975177304964539, + "grad_norm": 4.4400763511657715, + "learning_rate": 9.881142031043098e-06, + "loss": 1.3182, + "step": 1265 + }, + { + "epoch": 0.09759481961147086, + "grad_norm": 4.121272563934326, + "learning_rate": 9.880871285043095e-06, + "loss": 1.301, + "step": 1266 + }, + { + "epoch": 0.09767190872648782, + "grad_norm": 3.7698986530303955, + "learning_rate": 9.880600234746047e-06, + "loss": 1.132, + "step": 1267 + }, + { + "epoch": 0.09774899784150477, + "grad_norm": 4.014803886413574, + "learning_rate": 9.88032888016885e-06, + "loss": 1.1003, + "step": 1268 + }, + { + "epoch": 0.09782608695652174, + "grad_norm": 3.873784303665161, + "learning_rate": 9.880057221328425e-06, + "loss": 1.1009, + "step": 1269 + }, + { + "epoch": 0.0979031760715387, + "grad_norm": 4.115602970123291, + "learning_rate": 9.879785258241705e-06, + "loss": 1.3411, + "step": 1270 + }, + { + "epoch": 0.09798026518655566, + "grad_norm": 3.987370491027832, + "learning_rate": 9.879512990925648e-06, + "loss": 1.1694, + "step": 1271 + }, + { + "epoch": 0.09805735430157261, + "grad_norm": 3.914703607559204, + "learning_rate": 9.879240419397227e-06, + "loss": 1.2862, + "step": 1272 + }, + { + "epoch": 0.09813444341658958, + "grad_norm": 4.038669586181641, + "learning_rate": 9.878967543673436e-06, + "loss": 1.2318, + "step": 1273 + }, + { + "epoch": 0.09821153253160654, + "grad_norm": 3.7776105403900146, + "learning_rate": 9.878694363771289e-06, + "loss": 1.0864, + "step": 1274 + }, + { + "epoch": 0.0982886216466235, + "grad_norm": 3.926527976989746, + "learning_rate": 9.878420879707816e-06, + "loss": 1.1557, + "step": 1275 + }, + { + "epoch": 0.09836571076164045, + "grad_norm": 4.13185977935791, + "learning_rate": 9.878147091500065e-06, + "loss": 1.2331, + "step": 1276 + }, + { + "epoch": 0.09844279987665741, + "grad_norm": 4.489853858947754, + "learning_rate": 9.877872999165109e-06, + "loss": 1.2494, + "step": 1277 + }, + { + "epoch": 0.09851988899167438, + "grad_norm": 4.077756404876709, + "learning_rate": 9.877598602720034e-06, + "loss": 1.1705, + "step": 1278 + }, + { + "epoch": 0.09859697810669134, + "grad_norm": 4.43337345123291, + "learning_rate": 9.877323902181949e-06, + "loss": 1.1507, + "step": 1279 + }, + { + "epoch": 0.0986740672217083, + "grad_norm": 4.278952598571777, + "learning_rate": 9.877048897567977e-06, + "loss": 1.1005, + "step": 1280 + }, + { + "epoch": 0.09875115633672525, + "grad_norm": 3.6082239151000977, + "learning_rate": 9.876773588895265e-06, + "loss": 1.1287, + "step": 1281 + }, + { + "epoch": 0.09882824545174221, + "grad_norm": 3.5894458293914795, + "learning_rate": 9.876497976180978e-06, + "loss": 1.1771, + "step": 1282 + }, + { + "epoch": 0.09890533456675918, + "grad_norm": 4.010733127593994, + "learning_rate": 9.8762220594423e-06, + "loss": 1.0699, + "step": 1283 + }, + { + "epoch": 0.09898242368177614, + "grad_norm": 3.891744613647461, + "learning_rate": 9.87594583869643e-06, + "loss": 1.1908, + "step": 1284 + }, + { + "epoch": 0.09905951279679309, + "grad_norm": 4.345311164855957, + "learning_rate": 9.875669313960588e-06, + "loss": 1.2161, + "step": 1285 + }, + { + "epoch": 0.09913660191181005, + "grad_norm": 3.8743138313293457, + "learning_rate": 9.87539248525202e-06, + "loss": 1.2969, + "step": 1286 + }, + { + "epoch": 0.099213691026827, + "grad_norm": 4.233188152313232, + "learning_rate": 9.875115352587977e-06, + "loss": 1.1773, + "step": 1287 + }, + { + "epoch": 0.09929078014184398, + "grad_norm": 3.835361957550049, + "learning_rate": 9.874837915985743e-06, + "loss": 1.2376, + "step": 1288 + }, + { + "epoch": 0.09936786925686093, + "grad_norm": 3.9066238403320312, + "learning_rate": 9.874560175462612e-06, + "loss": 1.2814, + "step": 1289 + }, + { + "epoch": 0.09944495837187789, + "grad_norm": 3.9215028285980225, + "learning_rate": 9.874282131035899e-06, + "loss": 1.1192, + "step": 1290 + }, + { + "epoch": 0.09952204748689485, + "grad_norm": 3.68405818939209, + "learning_rate": 9.87400378272294e-06, + "loss": 1.2173, + "step": 1291 + }, + { + "epoch": 0.09959913660191182, + "grad_norm": 3.8745553493499756, + "learning_rate": 9.87372513054109e-06, + "loss": 1.1877, + "step": 1292 + }, + { + "epoch": 0.09967622571692877, + "grad_norm": 4.057330131530762, + "learning_rate": 9.873446174507719e-06, + "loss": 1.2889, + "step": 1293 + }, + { + "epoch": 0.09975331483194573, + "grad_norm": 4.208259105682373, + "learning_rate": 9.873166914640218e-06, + "loss": 1.1993, + "step": 1294 + }, + { + "epoch": 0.09983040394696269, + "grad_norm": 3.965298652648926, + "learning_rate": 9.872887350956e-06, + "loss": 1.1844, + "step": 1295 + }, + { + "epoch": 0.09990749306197964, + "grad_norm": 4.23174524307251, + "learning_rate": 9.872607483472491e-06, + "loss": 1.1607, + "step": 1296 + }, + { + "epoch": 0.09998458217699661, + "grad_norm": 4.803770542144775, + "learning_rate": 9.872327312207145e-06, + "loss": 1.2843, + "step": 1297 + }, + { + "epoch": 0.10006167129201357, + "grad_norm": 4.673795223236084, + "learning_rate": 9.872046837177421e-06, + "loss": 1.2134, + "step": 1298 + }, + { + "epoch": 0.10013876040703053, + "grad_norm": 3.8976125717163086, + "learning_rate": 9.871766058400812e-06, + "loss": 1.1635, + "step": 1299 + }, + { + "epoch": 0.10021584952204748, + "grad_norm": 4.471933841705322, + "learning_rate": 9.87148497589482e-06, + "loss": 1.1844, + "step": 1300 + }, + { + "epoch": 0.10029293863706444, + "grad_norm": 4.1576457023620605, + "learning_rate": 9.871203589676971e-06, + "loss": 1.1461, + "step": 1301 + }, + { + "epoch": 0.10037002775208141, + "grad_norm": 3.978757381439209, + "learning_rate": 9.870921899764807e-06, + "loss": 1.2684, + "step": 1302 + }, + { + "epoch": 0.10044711686709837, + "grad_norm": 3.5319221019744873, + "learning_rate": 9.87063990617589e-06, + "loss": 1.1454, + "step": 1303 + }, + { + "epoch": 0.10052420598211532, + "grad_norm": 4.05499267578125, + "learning_rate": 9.870357608927798e-06, + "loss": 1.1962, + "step": 1304 + }, + { + "epoch": 0.10060129509713228, + "grad_norm": 4.118753433227539, + "learning_rate": 9.870075008038137e-06, + "loss": 1.1875, + "step": 1305 + }, + { + "epoch": 0.10067838421214924, + "grad_norm": 3.8087265491485596, + "learning_rate": 9.869792103524517e-06, + "loss": 1.1726, + "step": 1306 + }, + { + "epoch": 0.10075547332716621, + "grad_norm": 4.719503879547119, + "learning_rate": 9.869508895404584e-06, + "loss": 1.1398, + "step": 1307 + }, + { + "epoch": 0.10083256244218317, + "grad_norm": 4.060819625854492, + "learning_rate": 9.86922538369599e-06, + "loss": 1.2485, + "step": 1308 + }, + { + "epoch": 0.10090965155720012, + "grad_norm": 4.301494121551514, + "learning_rate": 9.868941568416413e-06, + "loss": 1.1975, + "step": 1309 + }, + { + "epoch": 0.10098674067221708, + "grad_norm": 3.7967967987060547, + "learning_rate": 9.868657449583547e-06, + "loss": 1.1155, + "step": 1310 + }, + { + "epoch": 0.10106382978723404, + "grad_norm": 5.051084995269775, + "learning_rate": 9.8683730272151e-06, + "loss": 1.2833, + "step": 1311 + }, + { + "epoch": 0.101140918902251, + "grad_norm": 5.0358381271362305, + "learning_rate": 9.868088301328813e-06, + "loss": 1.358, + "step": 1312 + }, + { + "epoch": 0.10121800801726796, + "grad_norm": 4.280052185058594, + "learning_rate": 9.867803271942432e-06, + "loss": 1.2046, + "step": 1313 + }, + { + "epoch": 0.10129509713228492, + "grad_norm": 3.971712350845337, + "learning_rate": 9.867517939073727e-06, + "loss": 1.1826, + "step": 1314 + }, + { + "epoch": 0.10137218624730188, + "grad_norm": 4.396337985992432, + "learning_rate": 9.867232302740489e-06, + "loss": 1.2291, + "step": 1315 + }, + { + "epoch": 0.10144927536231885, + "grad_norm": 3.7291693687438965, + "learning_rate": 9.866946362960526e-06, + "loss": 1.2395, + "step": 1316 + }, + { + "epoch": 0.1015263644773358, + "grad_norm": 3.65598726272583, + "learning_rate": 9.86666011975166e-06, + "loss": 1.2252, + "step": 1317 + }, + { + "epoch": 0.10160345359235276, + "grad_norm": 3.8488965034484863, + "learning_rate": 9.866373573131744e-06, + "loss": 1.132, + "step": 1318 + }, + { + "epoch": 0.10168054270736972, + "grad_norm": 4.217064380645752, + "learning_rate": 9.86608672311864e-06, + "loss": 1.2116, + "step": 1319 + }, + { + "epoch": 0.10175763182238667, + "grad_norm": 3.518402576446533, + "learning_rate": 9.865799569730227e-06, + "loss": 1.0713, + "step": 1320 + }, + { + "epoch": 0.10183472093740364, + "grad_norm": 4.360300064086914, + "learning_rate": 9.865512112984414e-06, + "loss": 1.2738, + "step": 1321 + }, + { + "epoch": 0.1019118100524206, + "grad_norm": 4.2740068435668945, + "learning_rate": 9.86522435289912e-06, + "loss": 1.3994, + "step": 1322 + }, + { + "epoch": 0.10198889916743756, + "grad_norm": 4.366950511932373, + "learning_rate": 9.864936289492285e-06, + "loss": 1.2464, + "step": 1323 + }, + { + "epoch": 0.10206598828245451, + "grad_norm": 4.6967997550964355, + "learning_rate": 9.864647922781868e-06, + "loss": 1.2328, + "step": 1324 + }, + { + "epoch": 0.10214307739747147, + "grad_norm": 3.914935827255249, + "learning_rate": 9.864359252785847e-06, + "loss": 1.2629, + "step": 1325 + }, + { + "epoch": 0.10222016651248844, + "grad_norm": 3.91147780418396, + "learning_rate": 9.864070279522222e-06, + "loss": 1.1347, + "step": 1326 + }, + { + "epoch": 0.1022972556275054, + "grad_norm": 3.9424116611480713, + "learning_rate": 9.863781003009005e-06, + "loss": 1.1586, + "step": 1327 + }, + { + "epoch": 0.10237434474252236, + "grad_norm": 4.317541122436523, + "learning_rate": 9.863491423264232e-06, + "loss": 1.2135, + "step": 1328 + }, + { + "epoch": 0.10245143385753931, + "grad_norm": 3.8650455474853516, + "learning_rate": 9.86320154030596e-06, + "loss": 1.1562, + "step": 1329 + }, + { + "epoch": 0.10252852297255627, + "grad_norm": 4.095459461212158, + "learning_rate": 9.862911354152258e-06, + "loss": 1.0685, + "step": 1330 + }, + { + "epoch": 0.10260561208757324, + "grad_norm": 4.160573482513428, + "learning_rate": 9.862620864821218e-06, + "loss": 1.2398, + "step": 1331 + }, + { + "epoch": 0.1026827012025902, + "grad_norm": 4.309733867645264, + "learning_rate": 9.862330072330953e-06, + "loss": 1.1783, + "step": 1332 + }, + { + "epoch": 0.10275979031760715, + "grad_norm": 4.311089038848877, + "learning_rate": 9.862038976699589e-06, + "loss": 1.2346, + "step": 1333 + }, + { + "epoch": 0.10283687943262411, + "grad_norm": 4.145449638366699, + "learning_rate": 9.861747577945275e-06, + "loss": 1.241, + "step": 1334 + }, + { + "epoch": 0.10291396854764108, + "grad_norm": 4.2260236740112305, + "learning_rate": 9.861455876086181e-06, + "loss": 1.2695, + "step": 1335 + }, + { + "epoch": 0.10299105766265804, + "grad_norm": 4.495214462280273, + "learning_rate": 9.86116387114049e-06, + "loss": 1.2767, + "step": 1336 + }, + { + "epoch": 0.103068146777675, + "grad_norm": 4.408441066741943, + "learning_rate": 9.860871563126409e-06, + "loss": 1.1875, + "step": 1337 + }, + { + "epoch": 0.10314523589269195, + "grad_norm": 3.792508363723755, + "learning_rate": 9.860578952062161e-06, + "loss": 1.1621, + "step": 1338 + }, + { + "epoch": 0.1032223250077089, + "grad_norm": 4.099066734313965, + "learning_rate": 9.86028603796599e-06, + "loss": 1.244, + "step": 1339 + }, + { + "epoch": 0.10329941412272588, + "grad_norm": 3.596609354019165, + "learning_rate": 9.859992820856155e-06, + "loss": 1.151, + "step": 1340 + }, + { + "epoch": 0.10337650323774283, + "grad_norm": 4.01649808883667, + "learning_rate": 9.859699300750937e-06, + "loss": 1.1552, + "step": 1341 + }, + { + "epoch": 0.10345359235275979, + "grad_norm": 3.891861915588379, + "learning_rate": 9.85940547766864e-06, + "loss": 1.1511, + "step": 1342 + }, + { + "epoch": 0.10353068146777675, + "grad_norm": 3.784027576446533, + "learning_rate": 9.859111351627576e-06, + "loss": 1.0498, + "step": 1343 + }, + { + "epoch": 0.1036077705827937, + "grad_norm": 3.873615026473999, + "learning_rate": 9.858816922646088e-06, + "loss": 1.1498, + "step": 1344 + }, + { + "epoch": 0.10368485969781067, + "grad_norm": 4.082927227020264, + "learning_rate": 9.858522190742529e-06, + "loss": 1.2154, + "step": 1345 + }, + { + "epoch": 0.10376194881282763, + "grad_norm": 3.9876515865325928, + "learning_rate": 9.858227155935271e-06, + "loss": 1.1871, + "step": 1346 + }, + { + "epoch": 0.10383903792784459, + "grad_norm": 3.943120241165161, + "learning_rate": 9.857931818242715e-06, + "loss": 1.0969, + "step": 1347 + }, + { + "epoch": 0.10391612704286154, + "grad_norm": 4.118403434753418, + "learning_rate": 9.857636177683267e-06, + "loss": 1.0838, + "step": 1348 + }, + { + "epoch": 0.1039932161578785, + "grad_norm": 3.8626832962036133, + "learning_rate": 9.857340234275363e-06, + "loss": 1.1041, + "step": 1349 + }, + { + "epoch": 0.10407030527289547, + "grad_norm": 3.960684299468994, + "learning_rate": 9.857043988037453e-06, + "loss": 1.1214, + "step": 1350 + }, + { + "epoch": 0.10414739438791243, + "grad_norm": 3.8273346424102783, + "learning_rate": 9.856747438988005e-06, + "loss": 1.1456, + "step": 1351 + }, + { + "epoch": 0.10422448350292939, + "grad_norm": 4.401994705200195, + "learning_rate": 9.856450587145507e-06, + "loss": 1.1389, + "step": 1352 + }, + { + "epoch": 0.10430157261794634, + "grad_norm": 4.069916248321533, + "learning_rate": 9.856153432528467e-06, + "loss": 1.2821, + "step": 1353 + }, + { + "epoch": 0.1043786617329633, + "grad_norm": 3.9717392921447754, + "learning_rate": 9.855855975155414e-06, + "loss": 1.1286, + "step": 1354 + }, + { + "epoch": 0.10445575084798027, + "grad_norm": 3.949397087097168, + "learning_rate": 9.855558215044887e-06, + "loss": 1.1939, + "step": 1355 + }, + { + "epoch": 0.10453283996299723, + "grad_norm": 3.813549757003784, + "learning_rate": 9.855260152215455e-06, + "loss": 1.069, + "step": 1356 + }, + { + "epoch": 0.10460992907801418, + "grad_norm": 4.000561237335205, + "learning_rate": 9.854961786685697e-06, + "loss": 1.1664, + "step": 1357 + }, + { + "epoch": 0.10468701819303114, + "grad_norm": 4.400665283203125, + "learning_rate": 9.854663118474217e-06, + "loss": 1.1951, + "step": 1358 + }, + { + "epoch": 0.10476410730804811, + "grad_norm": 4.009283542633057, + "learning_rate": 9.854364147599635e-06, + "loss": 1.1911, + "step": 1359 + }, + { + "epoch": 0.10484119642306507, + "grad_norm": 4.075754642486572, + "learning_rate": 9.854064874080589e-06, + "loss": 1.2182, + "step": 1360 + }, + { + "epoch": 0.10491828553808202, + "grad_norm": 3.848576784133911, + "learning_rate": 9.853765297935738e-06, + "loss": 1.1876, + "step": 1361 + }, + { + "epoch": 0.10499537465309898, + "grad_norm": 4.087264060974121, + "learning_rate": 9.853465419183759e-06, + "loss": 1.2082, + "step": 1362 + }, + { + "epoch": 0.10507246376811594, + "grad_norm": 4.3004913330078125, + "learning_rate": 9.853165237843347e-06, + "loss": 1.1629, + "step": 1363 + }, + { + "epoch": 0.10514955288313291, + "grad_norm": 3.889831781387329, + "learning_rate": 9.852864753933218e-06, + "loss": 1.1833, + "step": 1364 + }, + { + "epoch": 0.10522664199814986, + "grad_norm": 3.902057409286499, + "learning_rate": 9.852563967472106e-06, + "loss": 1.2362, + "step": 1365 + }, + { + "epoch": 0.10530373111316682, + "grad_norm": 4.216920852661133, + "learning_rate": 9.852262878478762e-06, + "loss": 1.2058, + "step": 1366 + }, + { + "epoch": 0.10538082022818378, + "grad_norm": 3.6932597160339355, + "learning_rate": 9.851961486971959e-06, + "loss": 1.133, + "step": 1367 + }, + { + "epoch": 0.10545790934320073, + "grad_norm": 3.947735071182251, + "learning_rate": 9.851659792970485e-06, + "loss": 1.2308, + "step": 1368 + }, + { + "epoch": 0.1055349984582177, + "grad_norm": 3.9944536685943604, + "learning_rate": 9.85135779649315e-06, + "loss": 1.1327, + "step": 1369 + }, + { + "epoch": 0.10561208757323466, + "grad_norm": 3.8380393981933594, + "learning_rate": 9.851055497558783e-06, + "loss": 1.2182, + "step": 1370 + }, + { + "epoch": 0.10568917668825162, + "grad_norm": 3.927506446838379, + "learning_rate": 9.850752896186231e-06, + "loss": 1.1246, + "step": 1371 + }, + { + "epoch": 0.10576626580326857, + "grad_norm": 3.877490520477295, + "learning_rate": 9.850449992394357e-06, + "loss": 1.1259, + "step": 1372 + }, + { + "epoch": 0.10584335491828553, + "grad_norm": 3.9563748836517334, + "learning_rate": 9.850146786202048e-06, + "loss": 1.2217, + "step": 1373 + }, + { + "epoch": 0.1059204440333025, + "grad_norm": 3.84112811088562, + "learning_rate": 9.849843277628206e-06, + "loss": 1.1055, + "step": 1374 + }, + { + "epoch": 0.10599753314831946, + "grad_norm": 4.415043830871582, + "learning_rate": 9.849539466691755e-06, + "loss": 1.1178, + "step": 1375 + }, + { + "epoch": 0.10607462226333642, + "grad_norm": 3.7989046573638916, + "learning_rate": 9.849235353411632e-06, + "loss": 1.2218, + "step": 1376 + }, + { + "epoch": 0.10615171137835337, + "grad_norm": 4.221613883972168, + "learning_rate": 9.848930937806802e-06, + "loss": 1.3153, + "step": 1377 + }, + { + "epoch": 0.10622880049337034, + "grad_norm": 3.621276378631592, + "learning_rate": 9.84862621989624e-06, + "loss": 1.18, + "step": 1378 + }, + { + "epoch": 0.1063058896083873, + "grad_norm": 4.181283473968506, + "learning_rate": 9.848321199698945e-06, + "loss": 1.1736, + "step": 1379 + }, + { + "epoch": 0.10638297872340426, + "grad_norm": 4.222801685333252, + "learning_rate": 9.848015877233935e-06, + "loss": 1.1519, + "step": 1380 + }, + { + "epoch": 0.10646006783842121, + "grad_norm": 4.010768890380859, + "learning_rate": 9.847710252520242e-06, + "loss": 1.1414, + "step": 1381 + }, + { + "epoch": 0.10653715695343817, + "grad_norm": 4.021345138549805, + "learning_rate": 9.847404325576921e-06, + "loss": 1.2202, + "step": 1382 + }, + { + "epoch": 0.10661424606845514, + "grad_norm": 4.335690498352051, + "learning_rate": 9.847098096423046e-06, + "loss": 1.1836, + "step": 1383 + }, + { + "epoch": 0.1066913351834721, + "grad_norm": 4.25598669052124, + "learning_rate": 9.84679156507771e-06, + "loss": 1.2209, + "step": 1384 + }, + { + "epoch": 0.10676842429848905, + "grad_norm": 3.9418535232543945, + "learning_rate": 9.84648473156002e-06, + "loss": 1.2035, + "step": 1385 + }, + { + "epoch": 0.10684551341350601, + "grad_norm": 4.38209867477417, + "learning_rate": 9.84617759588911e-06, + "loss": 1.2745, + "step": 1386 + }, + { + "epoch": 0.10692260252852297, + "grad_norm": 3.9577906131744385, + "learning_rate": 9.845870158084123e-06, + "loss": 1.2592, + "step": 1387 + }, + { + "epoch": 0.10699969164353994, + "grad_norm": 3.4693827629089355, + "learning_rate": 9.845562418164232e-06, + "loss": 1.1585, + "step": 1388 + }, + { + "epoch": 0.1070767807585569, + "grad_norm": 3.7138640880584717, + "learning_rate": 9.845254376148617e-06, + "loss": 1.1364, + "step": 1389 + }, + { + "epoch": 0.10715386987357385, + "grad_norm": 3.9818434715270996, + "learning_rate": 9.844946032056487e-06, + "loss": 1.0948, + "step": 1390 + }, + { + "epoch": 0.10723095898859081, + "grad_norm": 3.7591545581817627, + "learning_rate": 9.844637385907066e-06, + "loss": 1.0582, + "step": 1391 + }, + { + "epoch": 0.10730804810360776, + "grad_norm": 3.9569385051727295, + "learning_rate": 9.844328437719595e-06, + "loss": 1.1992, + "step": 1392 + }, + { + "epoch": 0.10738513721862473, + "grad_norm": 3.6534078121185303, + "learning_rate": 9.844019187513335e-06, + "loss": 1.1582, + "step": 1393 + }, + { + "epoch": 0.10746222633364169, + "grad_norm": 3.8868963718414307, + "learning_rate": 9.843709635307563e-06, + "loss": 1.2037, + "step": 1394 + }, + { + "epoch": 0.10753931544865865, + "grad_norm": 4.207004070281982, + "learning_rate": 9.843399781121585e-06, + "loss": 1.2641, + "step": 1395 + }, + { + "epoch": 0.1076164045636756, + "grad_norm": 3.9298391342163086, + "learning_rate": 9.843089624974716e-06, + "loss": 1.1404, + "step": 1396 + }, + { + "epoch": 0.10769349367869256, + "grad_norm": 3.9258861541748047, + "learning_rate": 9.84277916688629e-06, + "loss": 1.184, + "step": 1397 + }, + { + "epoch": 0.10777058279370953, + "grad_norm": 3.8614659309387207, + "learning_rate": 9.842468406875665e-06, + "loss": 1.2738, + "step": 1398 + }, + { + "epoch": 0.10784767190872649, + "grad_norm": 4.084595680236816, + "learning_rate": 9.842157344962214e-06, + "loss": 1.2203, + "step": 1399 + }, + { + "epoch": 0.10792476102374345, + "grad_norm": 3.8629918098449707, + "learning_rate": 9.84184598116533e-06, + "loss": 1.1434, + "step": 1400 + }, + { + "epoch": 0.1080018501387604, + "grad_norm": 3.8280506134033203, + "learning_rate": 9.841534315504427e-06, + "loss": 1.1438, + "step": 1401 + }, + { + "epoch": 0.10807893925377737, + "grad_norm": 4.113556861877441, + "learning_rate": 9.841222347998933e-06, + "loss": 1.2824, + "step": 1402 + }, + { + "epoch": 0.10815602836879433, + "grad_norm": 4.399749279022217, + "learning_rate": 9.8409100786683e-06, + "loss": 1.178, + "step": 1403 + }, + { + "epoch": 0.10823311748381129, + "grad_norm": 4.358874320983887, + "learning_rate": 9.840597507531997e-06, + "loss": 1.2722, + "step": 1404 + }, + { + "epoch": 0.10831020659882824, + "grad_norm": 3.7543954849243164, + "learning_rate": 9.840284634609508e-06, + "loss": 1.1464, + "step": 1405 + }, + { + "epoch": 0.1083872957138452, + "grad_norm": 4.325103282928467, + "learning_rate": 9.839971459920338e-06, + "loss": 1.1934, + "step": 1406 + }, + { + "epoch": 0.10846438482886217, + "grad_norm": 4.050751209259033, + "learning_rate": 9.839657983484018e-06, + "loss": 1.3296, + "step": 1407 + }, + { + "epoch": 0.10854147394387913, + "grad_norm": 4.21981954574585, + "learning_rate": 9.839344205320088e-06, + "loss": 1.2409, + "step": 1408 + }, + { + "epoch": 0.10861856305889608, + "grad_norm": 4.275995254516602, + "learning_rate": 9.839030125448108e-06, + "loss": 1.3003, + "step": 1409 + }, + { + "epoch": 0.10869565217391304, + "grad_norm": 4.038300514221191, + "learning_rate": 9.838715743887662e-06, + "loss": 1.1667, + "step": 1410 + }, + { + "epoch": 0.10877274128893, + "grad_norm": 3.92463755607605, + "learning_rate": 9.838401060658352e-06, + "loss": 1.1469, + "step": 1411 + }, + { + "epoch": 0.10884983040394697, + "grad_norm": 4.041322231292725, + "learning_rate": 9.83808607577979e-06, + "loss": 1.2825, + "step": 1412 + }, + { + "epoch": 0.10892691951896392, + "grad_norm": 4.066822528839111, + "learning_rate": 9.837770789271623e-06, + "loss": 1.1624, + "step": 1413 + }, + { + "epoch": 0.10900400863398088, + "grad_norm": 3.8004565238952637, + "learning_rate": 9.8374552011535e-06, + "loss": 1.1102, + "step": 1414 + }, + { + "epoch": 0.10908109774899784, + "grad_norm": 3.726020336151123, + "learning_rate": 9.837139311445102e-06, + "loss": 1.1266, + "step": 1415 + }, + { + "epoch": 0.1091581868640148, + "grad_norm": 4.26671028137207, + "learning_rate": 9.836823120166116e-06, + "loss": 1.2413, + "step": 1416 + }, + { + "epoch": 0.10923527597903177, + "grad_norm": 3.6223459243774414, + "learning_rate": 9.836506627336261e-06, + "loss": 1.1612, + "step": 1417 + }, + { + "epoch": 0.10931236509404872, + "grad_norm": 3.5984158515930176, + "learning_rate": 9.836189832975267e-06, + "loss": 1.0419, + "step": 1418 + }, + { + "epoch": 0.10938945420906568, + "grad_norm": 3.670827627182007, + "learning_rate": 9.835872737102886e-06, + "loss": 1.2345, + "step": 1419 + }, + { + "epoch": 0.10946654332408264, + "grad_norm": 3.62565541267395, + "learning_rate": 9.835555339738882e-06, + "loss": 1.0729, + "step": 1420 + }, + { + "epoch": 0.1095436324390996, + "grad_norm": 4.1612958908081055, + "learning_rate": 9.83523764090305e-06, + "loss": 1.2735, + "step": 1421 + }, + { + "epoch": 0.10962072155411656, + "grad_norm": 3.878818988800049, + "learning_rate": 9.83491964061519e-06, + "loss": 1.2431, + "step": 1422 + }, + { + "epoch": 0.10969781066913352, + "grad_norm": 3.963610887527466, + "learning_rate": 9.834601338895133e-06, + "loss": 1.2172, + "step": 1423 + }, + { + "epoch": 0.10977489978415048, + "grad_norm": 4.701980113983154, + "learning_rate": 9.83428273576272e-06, + "loss": 1.2959, + "step": 1424 + }, + { + "epoch": 0.10985198889916743, + "grad_norm": 4.165730953216553, + "learning_rate": 9.833963831237819e-06, + "loss": 1.1758, + "step": 1425 + }, + { + "epoch": 0.1099290780141844, + "grad_norm": 4.231865882873535, + "learning_rate": 9.833644625340305e-06, + "loss": 1.1345, + "step": 1426 + }, + { + "epoch": 0.11000616712920136, + "grad_norm": 3.8427863121032715, + "learning_rate": 9.833325118090086e-06, + "loss": 1.1748, + "step": 1427 + }, + { + "epoch": 0.11008325624421832, + "grad_norm": 4.339493274688721, + "learning_rate": 9.833005309507076e-06, + "loss": 1.2364, + "step": 1428 + }, + { + "epoch": 0.11016034535923527, + "grad_norm": 3.940251350402832, + "learning_rate": 9.832685199611217e-06, + "loss": 1.1587, + "step": 1429 + }, + { + "epoch": 0.11023743447425223, + "grad_norm": 4.076071739196777, + "learning_rate": 9.832364788422464e-06, + "loss": 1.1959, + "step": 1430 + }, + { + "epoch": 0.1103145235892692, + "grad_norm": 3.910961389541626, + "learning_rate": 9.832044075960795e-06, + "loss": 1.0885, + "step": 1431 + }, + { + "epoch": 0.11039161270428616, + "grad_norm": 4.00604772567749, + "learning_rate": 9.831723062246204e-06, + "loss": 1.2461, + "step": 1432 + }, + { + "epoch": 0.11046870181930311, + "grad_norm": 4.331727981567383, + "learning_rate": 9.831401747298702e-06, + "loss": 1.2664, + "step": 1433 + }, + { + "epoch": 0.11054579093432007, + "grad_norm": 4.105038642883301, + "learning_rate": 9.831080131138325e-06, + "loss": 1.1604, + "step": 1434 + }, + { + "epoch": 0.11062288004933703, + "grad_norm": 4.079939365386963, + "learning_rate": 9.830758213785123e-06, + "loss": 1.1103, + "step": 1435 + }, + { + "epoch": 0.110699969164354, + "grad_norm": 3.9948172569274902, + "learning_rate": 9.830435995259165e-06, + "loss": 1.1889, + "step": 1436 + }, + { + "epoch": 0.11077705827937095, + "grad_norm": 4.091009616851807, + "learning_rate": 9.830113475580541e-06, + "loss": 1.1374, + "step": 1437 + }, + { + "epoch": 0.11085414739438791, + "grad_norm": 4.397194862365723, + "learning_rate": 9.829790654769356e-06, + "loss": 1.1226, + "step": 1438 + }, + { + "epoch": 0.11093123650940487, + "grad_norm": 4.208889961242676, + "learning_rate": 9.829467532845738e-06, + "loss": 1.1658, + "step": 1439 + }, + { + "epoch": 0.11100832562442182, + "grad_norm": 4.262278079986572, + "learning_rate": 9.829144109829832e-06, + "loss": 1.3317, + "step": 1440 + }, + { + "epoch": 0.1110854147394388, + "grad_norm": 4.373659133911133, + "learning_rate": 9.828820385741802e-06, + "loss": 1.1994, + "step": 1441 + }, + { + "epoch": 0.11116250385445575, + "grad_norm": 3.869906425476074, + "learning_rate": 9.82849636060183e-06, + "loss": 1.2039, + "step": 1442 + }, + { + "epoch": 0.11123959296947271, + "grad_norm": 3.81036114692688, + "learning_rate": 9.828172034430118e-06, + "loss": 1.0779, + "step": 1443 + }, + { + "epoch": 0.11131668208448967, + "grad_norm": 4.158944129943848, + "learning_rate": 9.827847407246885e-06, + "loss": 1.1907, + "step": 1444 + }, + { + "epoch": 0.11139377119950664, + "grad_norm": 4.358891487121582, + "learning_rate": 9.827522479072369e-06, + "loss": 1.1776, + "step": 1445 + }, + { + "epoch": 0.11147086031452359, + "grad_norm": 4.275304794311523, + "learning_rate": 9.82719724992683e-06, + "loss": 1.2849, + "step": 1446 + }, + { + "epoch": 0.11154794942954055, + "grad_norm": 4.336565971374512, + "learning_rate": 9.826871719830542e-06, + "loss": 1.1965, + "step": 1447 + }, + { + "epoch": 0.1116250385445575, + "grad_norm": 4.239962577819824, + "learning_rate": 9.826545888803802e-06, + "loss": 1.229, + "step": 1448 + }, + { + "epoch": 0.11170212765957446, + "grad_norm": 4.397287368774414, + "learning_rate": 9.826219756866923e-06, + "loss": 1.2252, + "step": 1449 + }, + { + "epoch": 0.11177921677459143, + "grad_norm": 4.538805961608887, + "learning_rate": 9.82589332404024e-06, + "loss": 1.2506, + "step": 1450 + }, + { + "epoch": 0.11185630588960839, + "grad_norm": 3.814425468444824, + "learning_rate": 9.825566590344098e-06, + "loss": 1.1343, + "step": 1451 + }, + { + "epoch": 0.11193339500462535, + "grad_norm": 3.6596169471740723, + "learning_rate": 9.825239555798875e-06, + "loss": 1.2412, + "step": 1452 + }, + { + "epoch": 0.1120104841196423, + "grad_norm": 4.104322910308838, + "learning_rate": 9.824912220424953e-06, + "loss": 1.1035, + "step": 1453 + }, + { + "epoch": 0.11208757323465926, + "grad_norm": 3.8343546390533447, + "learning_rate": 9.824584584242746e-06, + "loss": 1.1575, + "step": 1454 + }, + { + "epoch": 0.11216466234967623, + "grad_norm": 3.6301510334014893, + "learning_rate": 9.824256647272676e-06, + "loss": 1.2006, + "step": 1455 + }, + { + "epoch": 0.11224175146469319, + "grad_norm": 4.543994903564453, + "learning_rate": 9.823928409535191e-06, + "loss": 1.2539, + "step": 1456 + }, + { + "epoch": 0.11231884057971014, + "grad_norm": 4.104208469390869, + "learning_rate": 9.82359987105075e-06, + "loss": 1.2179, + "step": 1457 + }, + { + "epoch": 0.1123959296947271, + "grad_norm": 4.09733772277832, + "learning_rate": 9.823271031839843e-06, + "loss": 1.1579, + "step": 1458 + }, + { + "epoch": 0.11247301880974406, + "grad_norm": 4.35043478012085, + "learning_rate": 9.822941891922965e-06, + "loss": 1.2598, + "step": 1459 + }, + { + "epoch": 0.11255010792476103, + "grad_norm": 4.023888111114502, + "learning_rate": 9.82261245132064e-06, + "loss": 1.1077, + "step": 1460 + }, + { + "epoch": 0.11262719703977798, + "grad_norm": 3.594707489013672, + "learning_rate": 9.822282710053403e-06, + "loss": 1.1578, + "step": 1461 + }, + { + "epoch": 0.11270428615479494, + "grad_norm": 4.166836261749268, + "learning_rate": 9.821952668141817e-06, + "loss": 1.1718, + "step": 1462 + }, + { + "epoch": 0.1127813752698119, + "grad_norm": 3.9749162197113037, + "learning_rate": 9.821622325606454e-06, + "loss": 1.1749, + "step": 1463 + }, + { + "epoch": 0.11285846438482887, + "grad_norm": 4.0859150886535645, + "learning_rate": 9.821291682467912e-06, + "loss": 1.1667, + "step": 1464 + }, + { + "epoch": 0.11293555349984583, + "grad_norm": 5.696316242218018, + "learning_rate": 9.820960738746805e-06, + "loss": 1.0963, + "step": 1465 + }, + { + "epoch": 0.11301264261486278, + "grad_norm": 4.3305559158325195, + "learning_rate": 9.820629494463763e-06, + "loss": 1.1439, + "step": 1466 + }, + { + "epoch": 0.11308973172987974, + "grad_norm": 4.143655300140381, + "learning_rate": 9.820297949639439e-06, + "loss": 1.0771, + "step": 1467 + }, + { + "epoch": 0.1131668208448967, + "grad_norm": 4.505462169647217, + "learning_rate": 9.8199661042945e-06, + "loss": 1.2824, + "step": 1468 + }, + { + "epoch": 0.11324390995991367, + "grad_norm": 3.8366472721099854, + "learning_rate": 9.819633958449642e-06, + "loss": 1.1783, + "step": 1469 + }, + { + "epoch": 0.11332099907493062, + "grad_norm": 4.8427910804748535, + "learning_rate": 9.819301512125565e-06, + "loss": 1.3027, + "step": 1470 + }, + { + "epoch": 0.11339808818994758, + "grad_norm": 4.196298599243164, + "learning_rate": 9.818968765343e-06, + "loss": 1.1545, + "step": 1471 + }, + { + "epoch": 0.11347517730496454, + "grad_norm": 4.095616817474365, + "learning_rate": 9.818635718122692e-06, + "loss": 1.1564, + "step": 1472 + }, + { + "epoch": 0.11355226641998149, + "grad_norm": 4.081837177276611, + "learning_rate": 9.8183023704854e-06, + "loss": 1.141, + "step": 1473 + }, + { + "epoch": 0.11362935553499846, + "grad_norm": 4.244540214538574, + "learning_rate": 9.817968722451911e-06, + "loss": 1.2587, + "step": 1474 + }, + { + "epoch": 0.11370644465001542, + "grad_norm": 4.162115097045898, + "learning_rate": 9.817634774043026e-06, + "loss": 1.2686, + "step": 1475 + }, + { + "epoch": 0.11378353376503238, + "grad_norm": 4.111065864562988, + "learning_rate": 9.817300525279562e-06, + "loss": 1.1118, + "step": 1476 + }, + { + "epoch": 0.11386062288004933, + "grad_norm": 4.0052714347839355, + "learning_rate": 9.816965976182362e-06, + "loss": 1.145, + "step": 1477 + }, + { + "epoch": 0.11393771199506629, + "grad_norm": 3.8451459407806396, + "learning_rate": 9.81663112677228e-06, + "loss": 1.0982, + "step": 1478 + }, + { + "epoch": 0.11401480111008326, + "grad_norm": 4.380990982055664, + "learning_rate": 9.816295977070193e-06, + "loss": 1.2695, + "step": 1479 + }, + { + "epoch": 0.11409189022510022, + "grad_norm": 4.092302322387695, + "learning_rate": 9.815960527096996e-06, + "loss": 1.2088, + "step": 1480 + }, + { + "epoch": 0.11416897934011717, + "grad_norm": 3.8186841011047363, + "learning_rate": 9.815624776873605e-06, + "loss": 1.1231, + "step": 1481 + }, + { + "epoch": 0.11424606845513413, + "grad_norm": 4.1122894287109375, + "learning_rate": 9.815288726420949e-06, + "loss": 1.1516, + "step": 1482 + }, + { + "epoch": 0.1143231575701511, + "grad_norm": 4.764173984527588, + "learning_rate": 9.814952375759979e-06, + "loss": 1.2084, + "step": 1483 + }, + { + "epoch": 0.11440024668516806, + "grad_norm": 4.217214584350586, + "learning_rate": 9.814615724911664e-06, + "loss": 1.0574, + "step": 1484 + }, + { + "epoch": 0.11447733580018501, + "grad_norm": 3.7358455657958984, + "learning_rate": 9.814278773896997e-06, + "loss": 1.1282, + "step": 1485 + }, + { + "epoch": 0.11455442491520197, + "grad_norm": 3.7960875034332275, + "learning_rate": 9.813941522736981e-06, + "loss": 1.1453, + "step": 1486 + }, + { + "epoch": 0.11463151403021893, + "grad_norm": 4.146248817443848, + "learning_rate": 9.813603971452643e-06, + "loss": 1.2265, + "step": 1487 + }, + { + "epoch": 0.1147086031452359, + "grad_norm": 4.388052463531494, + "learning_rate": 9.813266120065028e-06, + "loss": 1.1369, + "step": 1488 + }, + { + "epoch": 0.11478569226025286, + "grad_norm": 4.071401119232178, + "learning_rate": 9.812927968595199e-06, + "loss": 1.1689, + "step": 1489 + }, + { + "epoch": 0.11486278137526981, + "grad_norm": 4.517001628875732, + "learning_rate": 9.812589517064237e-06, + "loss": 1.1537, + "step": 1490 + }, + { + "epoch": 0.11493987049028677, + "grad_norm": 4.021905899047852, + "learning_rate": 9.812250765493243e-06, + "loss": 1.0266, + "step": 1491 + }, + { + "epoch": 0.11501695960530373, + "grad_norm": 4.433926105499268, + "learning_rate": 9.811911713903339e-06, + "loss": 1.108, + "step": 1492 + }, + { + "epoch": 0.1150940487203207, + "grad_norm": 4.291812419891357, + "learning_rate": 9.811572362315661e-06, + "loss": 1.2626, + "step": 1493 + }, + { + "epoch": 0.11517113783533765, + "grad_norm": 4.35188627243042, + "learning_rate": 9.811232710751366e-06, + "loss": 1.1318, + "step": 1494 + }, + { + "epoch": 0.11524822695035461, + "grad_norm": 4.012880325317383, + "learning_rate": 9.810892759231629e-06, + "loss": 1.1044, + "step": 1495 + }, + { + "epoch": 0.11532531606537157, + "grad_norm": 4.010427474975586, + "learning_rate": 9.810552507777643e-06, + "loss": 1.2086, + "step": 1496 + }, + { + "epoch": 0.11540240518038852, + "grad_norm": 3.74871563911438, + "learning_rate": 9.810211956410625e-06, + "loss": 1.1279, + "step": 1497 + }, + { + "epoch": 0.1154794942954055, + "grad_norm": 3.983808755874634, + "learning_rate": 9.809871105151805e-06, + "loss": 1.1586, + "step": 1498 + }, + { + "epoch": 0.11555658341042245, + "grad_norm": 4.453268051147461, + "learning_rate": 9.80952995402243e-06, + "loss": 1.156, + "step": 1499 + }, + { + "epoch": 0.11563367252543941, + "grad_norm": 4.205508708953857, + "learning_rate": 9.809188503043773e-06, + "loss": 1.1945, + "step": 1500 + }, + { + "epoch": 0.11571076164045636, + "grad_norm": 4.098243713378906, + "learning_rate": 9.80884675223712e-06, + "loss": 1.1781, + "step": 1501 + }, + { + "epoch": 0.11578785075547332, + "grad_norm": 4.6129937171936035, + "learning_rate": 9.808504701623777e-06, + "loss": 1.1673, + "step": 1502 + }, + { + "epoch": 0.11586493987049029, + "grad_norm": 4.1249494552612305, + "learning_rate": 9.808162351225073e-06, + "loss": 1.1907, + "step": 1503 + }, + { + "epoch": 0.11594202898550725, + "grad_norm": 3.684288740158081, + "learning_rate": 9.807819701062345e-06, + "loss": 1.1824, + "step": 1504 + }, + { + "epoch": 0.1160191181005242, + "grad_norm": 4.17221212387085, + "learning_rate": 9.80747675115696e-06, + "loss": 1.1458, + "step": 1505 + }, + { + "epoch": 0.11609620721554116, + "grad_norm": 3.946536064147949, + "learning_rate": 9.807133501530297e-06, + "loss": 1.1267, + "step": 1506 + }, + { + "epoch": 0.11617329633055813, + "grad_norm": 4.113000392913818, + "learning_rate": 9.806789952203759e-06, + "loss": 1.1736, + "step": 1507 + }, + { + "epoch": 0.11625038544557509, + "grad_norm": 4.112078666687012, + "learning_rate": 9.806446103198761e-06, + "loss": 1.2042, + "step": 1508 + }, + { + "epoch": 0.11632747456059205, + "grad_norm": 3.994236946105957, + "learning_rate": 9.806101954536741e-06, + "loss": 1.2522, + "step": 1509 + }, + { + "epoch": 0.116404563675609, + "grad_norm": 4.026162624359131, + "learning_rate": 9.805757506239157e-06, + "loss": 1.1353, + "step": 1510 + }, + { + "epoch": 0.11648165279062596, + "grad_norm": 4.128935813903809, + "learning_rate": 9.805412758327483e-06, + "loss": 1.2068, + "step": 1511 + }, + { + "epoch": 0.11655874190564293, + "grad_norm": 3.9604172706604004, + "learning_rate": 9.80506771082321e-06, + "loss": 1.1924, + "step": 1512 + }, + { + "epoch": 0.11663583102065989, + "grad_norm": 6.198178291320801, + "learning_rate": 9.804722363747852e-06, + "loss": 1.165, + "step": 1513 + }, + { + "epoch": 0.11671292013567684, + "grad_norm": 4.178301811218262, + "learning_rate": 9.804376717122938e-06, + "loss": 1.1974, + "step": 1514 + }, + { + "epoch": 0.1167900092506938, + "grad_norm": 4.095193386077881, + "learning_rate": 9.804030770970019e-06, + "loss": 1.1372, + "step": 1515 + }, + { + "epoch": 0.11686709836571076, + "grad_norm": 3.5960633754730225, + "learning_rate": 9.803684525310662e-06, + "loss": 1.1578, + "step": 1516 + }, + { + "epoch": 0.11694418748072773, + "grad_norm": 3.994582414627075, + "learning_rate": 9.803337980166455e-06, + "loss": 1.1687, + "step": 1517 + }, + { + "epoch": 0.11702127659574468, + "grad_norm": 4.156383991241455, + "learning_rate": 9.802991135558998e-06, + "loss": 1.1485, + "step": 1518 + }, + { + "epoch": 0.11709836571076164, + "grad_norm": 3.9778475761413574, + "learning_rate": 9.802643991509923e-06, + "loss": 1.2049, + "step": 1519 + }, + { + "epoch": 0.1171754548257786, + "grad_norm": 4.528823375701904, + "learning_rate": 9.802296548040868e-06, + "loss": 1.3013, + "step": 1520 + }, + { + "epoch": 0.11725254394079555, + "grad_norm": 4.259623050689697, + "learning_rate": 9.801948805173494e-06, + "loss": 1.1648, + "step": 1521 + }, + { + "epoch": 0.11732963305581252, + "grad_norm": 4.2941460609436035, + "learning_rate": 9.801600762929482e-06, + "loss": 1.218, + "step": 1522 + }, + { + "epoch": 0.11740672217082948, + "grad_norm": 4.13802433013916, + "learning_rate": 9.801252421330531e-06, + "loss": 1.2677, + "step": 1523 + }, + { + "epoch": 0.11748381128584644, + "grad_norm": 3.9115195274353027, + "learning_rate": 9.800903780398357e-06, + "loss": 1.0531, + "step": 1524 + }, + { + "epoch": 0.1175609004008634, + "grad_norm": 4.135284423828125, + "learning_rate": 9.800554840154697e-06, + "loss": 1.1867, + "step": 1525 + }, + { + "epoch": 0.11763798951588036, + "grad_norm": 3.7419018745422363, + "learning_rate": 9.800205600621307e-06, + "loss": 1.1606, + "step": 1526 + }, + { + "epoch": 0.11771507863089732, + "grad_norm": 4.145817279815674, + "learning_rate": 9.799856061819958e-06, + "loss": 1.1399, + "step": 1527 + }, + { + "epoch": 0.11779216774591428, + "grad_norm": 3.8120839595794678, + "learning_rate": 9.799506223772442e-06, + "loss": 1.1803, + "step": 1528 + }, + { + "epoch": 0.11786925686093123, + "grad_norm": 4.319042682647705, + "learning_rate": 9.79915608650057e-06, + "loss": 1.0973, + "step": 1529 + }, + { + "epoch": 0.11794634597594819, + "grad_norm": 4.340944766998291, + "learning_rate": 9.798805650026173e-06, + "loss": 1.3635, + "step": 1530 + }, + { + "epoch": 0.11802343509096516, + "grad_norm": 4.413815975189209, + "learning_rate": 9.798454914371096e-06, + "loss": 1.1947, + "step": 1531 + }, + { + "epoch": 0.11810052420598212, + "grad_norm": 3.837949514389038, + "learning_rate": 9.798103879557207e-06, + "loss": 1.216, + "step": 1532 + }, + { + "epoch": 0.11817761332099908, + "grad_norm": 4.424009323120117, + "learning_rate": 9.797752545606391e-06, + "loss": 1.2992, + "step": 1533 + }, + { + "epoch": 0.11825470243601603, + "grad_norm": 3.9235024452209473, + "learning_rate": 9.797400912540553e-06, + "loss": 1.1753, + "step": 1534 + }, + { + "epoch": 0.11833179155103299, + "grad_norm": 3.9856998920440674, + "learning_rate": 9.797048980381614e-06, + "loss": 1.2046, + "step": 1535 + }, + { + "epoch": 0.11840888066604996, + "grad_norm": 3.952277898788452, + "learning_rate": 9.796696749151515e-06, + "loss": 1.1669, + "step": 1536 + }, + { + "epoch": 0.11848596978106692, + "grad_norm": 3.826277732849121, + "learning_rate": 9.796344218872218e-06, + "loss": 1.2616, + "step": 1537 + }, + { + "epoch": 0.11856305889608387, + "grad_norm": 4.182483196258545, + "learning_rate": 9.795991389565698e-06, + "loss": 1.0979, + "step": 1538 + }, + { + "epoch": 0.11864014801110083, + "grad_norm": 4.177645206451416, + "learning_rate": 9.795638261253955e-06, + "loss": 1.2607, + "step": 1539 + }, + { + "epoch": 0.11871723712611779, + "grad_norm": 3.5428824424743652, + "learning_rate": 9.795284833959003e-06, + "loss": 1.011, + "step": 1540 + }, + { + "epoch": 0.11879432624113476, + "grad_norm": 4.085594654083252, + "learning_rate": 9.794931107702877e-06, + "loss": 1.1675, + "step": 1541 + }, + { + "epoch": 0.11887141535615171, + "grad_norm": 4.093392848968506, + "learning_rate": 9.794577082507631e-06, + "loss": 1.2673, + "step": 1542 + }, + { + "epoch": 0.11894850447116867, + "grad_norm": 4.299323081970215, + "learning_rate": 9.794222758395336e-06, + "loss": 1.2598, + "step": 1543 + }, + { + "epoch": 0.11902559358618563, + "grad_norm": 3.8711843490600586, + "learning_rate": 9.79386813538808e-06, + "loss": 1.1941, + "step": 1544 + }, + { + "epoch": 0.11910268270120258, + "grad_norm": 4.2434983253479, + "learning_rate": 9.793513213507974e-06, + "loss": 1.1717, + "step": 1545 + }, + { + "epoch": 0.11917977181621955, + "grad_norm": 4.333231449127197, + "learning_rate": 9.793157992777147e-06, + "loss": 1.2443, + "step": 1546 + }, + { + "epoch": 0.11925686093123651, + "grad_norm": 3.959233045578003, + "learning_rate": 9.792802473217742e-06, + "loss": 1.191, + "step": 1547 + }, + { + "epoch": 0.11933395004625347, + "grad_norm": 3.931117534637451, + "learning_rate": 9.792446654851928e-06, + "loss": 1.1582, + "step": 1548 + }, + { + "epoch": 0.11941103916127042, + "grad_norm": 4.2051777839660645, + "learning_rate": 9.792090537701883e-06, + "loss": 1.1121, + "step": 1549 + }, + { + "epoch": 0.1194881282762874, + "grad_norm": 4.133641242980957, + "learning_rate": 9.791734121789814e-06, + "loss": 1.214, + "step": 1550 + }, + { + "epoch": 0.11956521739130435, + "grad_norm": 3.797180652618408, + "learning_rate": 9.791377407137936e-06, + "loss": 1.0736, + "step": 1551 + }, + { + "epoch": 0.11964230650632131, + "grad_norm": 4.1591477394104, + "learning_rate": 9.791020393768495e-06, + "loss": 1.2181, + "step": 1552 + }, + { + "epoch": 0.11971939562133826, + "grad_norm": 4.558741569519043, + "learning_rate": 9.790663081703746e-06, + "loss": 1.1797, + "step": 1553 + }, + { + "epoch": 0.11979648473635522, + "grad_norm": 4.007091522216797, + "learning_rate": 9.790305470965964e-06, + "loss": 1.2147, + "step": 1554 + }, + { + "epoch": 0.11987357385137219, + "grad_norm": 4.392326831817627, + "learning_rate": 9.789947561577445e-06, + "loss": 1.2772, + "step": 1555 + }, + { + "epoch": 0.11995066296638915, + "grad_norm": 3.7429401874542236, + "learning_rate": 9.789589353560505e-06, + "loss": 1.2085, + "step": 1556 + }, + { + "epoch": 0.1200277520814061, + "grad_norm": 4.087551116943359, + "learning_rate": 9.789230846937473e-06, + "loss": 1.1084, + "step": 1557 + }, + { + "epoch": 0.12010484119642306, + "grad_norm": 4.172464370727539, + "learning_rate": 9.788872041730703e-06, + "loss": 1.2449, + "step": 1558 + }, + { + "epoch": 0.12018193031144002, + "grad_norm": 4.280933856964111, + "learning_rate": 9.788512937962562e-06, + "loss": 1.1364, + "step": 1559 + }, + { + "epoch": 0.12025901942645699, + "grad_norm": 3.9372894763946533, + "learning_rate": 9.788153535655442e-06, + "loss": 1.1644, + "step": 1560 + }, + { + "epoch": 0.12033610854147395, + "grad_norm": 3.8368165493011475, + "learning_rate": 9.787793834831745e-06, + "loss": 1.138, + "step": 1561 + }, + { + "epoch": 0.1204131976564909, + "grad_norm": 3.9816088676452637, + "learning_rate": 9.787433835513901e-06, + "loss": 1.0688, + "step": 1562 + }, + { + "epoch": 0.12049028677150786, + "grad_norm": 3.9207653999328613, + "learning_rate": 9.787073537724351e-06, + "loss": 1.1746, + "step": 1563 + }, + { + "epoch": 0.12056737588652482, + "grad_norm": 4.689396858215332, + "learning_rate": 9.78671294148556e-06, + "loss": 1.2326, + "step": 1564 + }, + { + "epoch": 0.12064446500154179, + "grad_norm": 4.082599639892578, + "learning_rate": 9.786352046820007e-06, + "loss": 1.1073, + "step": 1565 + }, + { + "epoch": 0.12072155411655874, + "grad_norm": 4.284193515777588, + "learning_rate": 9.785990853750193e-06, + "loss": 1.2251, + "step": 1566 + }, + { + "epoch": 0.1207986432315757, + "grad_norm": 3.9901416301727295, + "learning_rate": 9.785629362298637e-06, + "loss": 1.2174, + "step": 1567 + }, + { + "epoch": 0.12087573234659266, + "grad_norm": 4.175442695617676, + "learning_rate": 9.785267572487876e-06, + "loss": 1.1415, + "step": 1568 + }, + { + "epoch": 0.12095282146160963, + "grad_norm": 4.132479190826416, + "learning_rate": 9.784905484340463e-06, + "loss": 1.2834, + "step": 1569 + }, + { + "epoch": 0.12102991057662658, + "grad_norm": 3.794235944747925, + "learning_rate": 9.784543097878977e-06, + "loss": 1.0903, + "step": 1570 + }, + { + "epoch": 0.12110699969164354, + "grad_norm": 3.9358041286468506, + "learning_rate": 9.784180413126009e-06, + "loss": 1.0461, + "step": 1571 + }, + { + "epoch": 0.1211840888066605, + "grad_norm": 3.783879041671753, + "learning_rate": 9.78381743010417e-06, + "loss": 1.2404, + "step": 1572 + }, + { + "epoch": 0.12126117792167745, + "grad_norm": 4.348257064819336, + "learning_rate": 9.783454148836089e-06, + "loss": 1.2484, + "step": 1573 + }, + { + "epoch": 0.12133826703669442, + "grad_norm": 4.235533237457275, + "learning_rate": 9.783090569344418e-06, + "loss": 1.0986, + "step": 1574 + }, + { + "epoch": 0.12141535615171138, + "grad_norm": 3.815244197845459, + "learning_rate": 9.782726691651819e-06, + "loss": 0.9998, + "step": 1575 + }, + { + "epoch": 0.12149244526672834, + "grad_norm": 3.847841739654541, + "learning_rate": 9.782362515780983e-06, + "loss": 1.1051, + "step": 1576 + }, + { + "epoch": 0.1215695343817453, + "grad_norm": 4.120035648345947, + "learning_rate": 9.781998041754613e-06, + "loss": 1.1412, + "step": 1577 + }, + { + "epoch": 0.12164662349676225, + "grad_norm": 4.0854105949401855, + "learning_rate": 9.781633269595432e-06, + "loss": 1.1599, + "step": 1578 + }, + { + "epoch": 0.12172371261177922, + "grad_norm": 4.094209671020508, + "learning_rate": 9.78126819932618e-06, + "loss": 1.2077, + "step": 1579 + }, + { + "epoch": 0.12180080172679618, + "grad_norm": 4.1311516761779785, + "learning_rate": 9.78090283096962e-06, + "loss": 1.1408, + "step": 1580 + }, + { + "epoch": 0.12187789084181314, + "grad_norm": 3.8271164894104004, + "learning_rate": 9.780537164548529e-06, + "loss": 1.2063, + "step": 1581 + }, + { + "epoch": 0.12195497995683009, + "grad_norm": 3.9459309577941895, + "learning_rate": 9.780171200085703e-06, + "loss": 1.1439, + "step": 1582 + }, + { + "epoch": 0.12203206907184705, + "grad_norm": 4.513016223907471, + "learning_rate": 9.779804937603963e-06, + "loss": 1.2068, + "step": 1583 + }, + { + "epoch": 0.12210915818686402, + "grad_norm": 4.303330421447754, + "learning_rate": 9.779438377126138e-06, + "loss": 1.2965, + "step": 1584 + }, + { + "epoch": 0.12218624730188098, + "grad_norm": 4.000280380249023, + "learning_rate": 9.779071518675086e-06, + "loss": 1.1383, + "step": 1585 + }, + { + "epoch": 0.12226333641689793, + "grad_norm": 3.8880882263183594, + "learning_rate": 9.778704362273673e-06, + "loss": 1.0986, + "step": 1586 + }, + { + "epoch": 0.12234042553191489, + "grad_norm": 4.341008186340332, + "learning_rate": 9.778336907944793e-06, + "loss": 1.1779, + "step": 1587 + }, + { + "epoch": 0.12241751464693185, + "grad_norm": 4.198411464691162, + "learning_rate": 9.777969155711356e-06, + "loss": 1.1633, + "step": 1588 + }, + { + "epoch": 0.12249460376194882, + "grad_norm": 3.935933828353882, + "learning_rate": 9.777601105596288e-06, + "loss": 1.1661, + "step": 1589 + }, + { + "epoch": 0.12257169287696577, + "grad_norm": 3.575408935546875, + "learning_rate": 9.777232757622534e-06, + "loss": 1.037, + "step": 1590 + }, + { + "epoch": 0.12264878199198273, + "grad_norm": 4.11090087890625, + "learning_rate": 9.77686411181306e-06, + "loss": 1.2121, + "step": 1591 + }, + { + "epoch": 0.12272587110699969, + "grad_norm": 4.116640090942383, + "learning_rate": 9.77649516819085e-06, + "loss": 1.0669, + "step": 1592 + }, + { + "epoch": 0.12280296022201666, + "grad_norm": 3.7206943035125732, + "learning_rate": 9.776125926778902e-06, + "loss": 1.0977, + "step": 1593 + }, + { + "epoch": 0.12288004933703361, + "grad_norm": 3.60390305519104, + "learning_rate": 9.775756387600239e-06, + "loss": 1.0635, + "step": 1594 + }, + { + "epoch": 0.12295713845205057, + "grad_norm": 4.16797399520874, + "learning_rate": 9.7753865506779e-06, + "loss": 1.1748, + "step": 1595 + }, + { + "epoch": 0.12303422756706753, + "grad_norm": 3.8636176586151123, + "learning_rate": 9.77501641603494e-06, + "loss": 1.1328, + "step": 1596 + }, + { + "epoch": 0.12311131668208448, + "grad_norm": 4.086244583129883, + "learning_rate": 9.77464598369444e-06, + "loss": 1.2368, + "step": 1597 + }, + { + "epoch": 0.12318840579710146, + "grad_norm": 3.9947853088378906, + "learning_rate": 9.77427525367949e-06, + "loss": 1.0748, + "step": 1598 + }, + { + "epoch": 0.12326549491211841, + "grad_norm": 3.5423076152801514, + "learning_rate": 9.773904226013207e-06, + "loss": 1.1288, + "step": 1599 + }, + { + "epoch": 0.12334258402713537, + "grad_norm": 4.289074897766113, + "learning_rate": 9.773532900718717e-06, + "loss": 1.2039, + "step": 1600 + }, + { + "epoch": 0.12341967314215233, + "grad_norm": 4.612265586853027, + "learning_rate": 9.773161277819175e-06, + "loss": 1.2929, + "step": 1601 + }, + { + "epoch": 0.12349676225716928, + "grad_norm": 4.081154823303223, + "learning_rate": 9.772789357337746e-06, + "loss": 1.221, + "step": 1602 + }, + { + "epoch": 0.12357385137218625, + "grad_norm": 4.336784839630127, + "learning_rate": 9.772417139297622e-06, + "loss": 1.2045, + "step": 1603 + }, + { + "epoch": 0.12365094048720321, + "grad_norm": 4.129827499389648, + "learning_rate": 9.772044623722005e-06, + "loss": 1.1447, + "step": 1604 + }, + { + "epoch": 0.12372802960222017, + "grad_norm": 4.177978515625, + "learning_rate": 9.771671810634123e-06, + "loss": 1.1951, + "step": 1605 + }, + { + "epoch": 0.12380511871723712, + "grad_norm": 3.896273374557495, + "learning_rate": 9.771298700057214e-06, + "loss": 1.1928, + "step": 1606 + }, + { + "epoch": 0.12388220783225408, + "grad_norm": 3.946012258529663, + "learning_rate": 9.770925292014542e-06, + "loss": 1.1168, + "step": 1607 + }, + { + "epoch": 0.12395929694727105, + "grad_norm": 4.167520046234131, + "learning_rate": 9.77055158652939e-06, + "loss": 1.3042, + "step": 1608 + }, + { + "epoch": 0.124036386062288, + "grad_norm": 4.672422885894775, + "learning_rate": 9.77017758362505e-06, + "loss": 1.2575, + "step": 1609 + }, + { + "epoch": 0.12411347517730496, + "grad_norm": 4.5983123779296875, + "learning_rate": 9.769803283324847e-06, + "loss": 1.1992, + "step": 1610 + }, + { + "epoch": 0.12419056429232192, + "grad_norm": 4.06178092956543, + "learning_rate": 9.769428685652112e-06, + "loss": 1.2428, + "step": 1611 + }, + { + "epoch": 0.12426765340733889, + "grad_norm": 4.024518013000488, + "learning_rate": 9.769053790630198e-06, + "loss": 1.1464, + "step": 1612 + }, + { + "epoch": 0.12434474252235585, + "grad_norm": 4.302224636077881, + "learning_rate": 9.768678598282481e-06, + "loss": 1.0665, + "step": 1613 + }, + { + "epoch": 0.1244218316373728, + "grad_norm": 4.330562114715576, + "learning_rate": 9.76830310863235e-06, + "loss": 1.1544, + "step": 1614 + }, + { + "epoch": 0.12449892075238976, + "grad_norm": 4.35994291305542, + "learning_rate": 9.767927321703217e-06, + "loss": 1.1948, + "step": 1615 + }, + { + "epoch": 0.12457600986740672, + "grad_norm": 3.8011748790740967, + "learning_rate": 9.767551237518508e-06, + "loss": 1.155, + "step": 1616 + }, + { + "epoch": 0.12465309898242369, + "grad_norm": 3.746467113494873, + "learning_rate": 9.767174856101672e-06, + "loss": 1.1757, + "step": 1617 + }, + { + "epoch": 0.12473018809744064, + "grad_norm": 3.849261522293091, + "learning_rate": 9.766798177476175e-06, + "loss": 0.9983, + "step": 1618 + }, + { + "epoch": 0.1248072772124576, + "grad_norm": 4.078786849975586, + "learning_rate": 9.766421201665498e-06, + "loss": 1.2175, + "step": 1619 + }, + { + "epoch": 0.12488436632747456, + "grad_norm": 4.006483554840088, + "learning_rate": 9.766043928693146e-06, + "loss": 1.2068, + "step": 1620 + }, + { + "epoch": 0.12496145544249151, + "grad_norm": 4.358697891235352, + "learning_rate": 9.765666358582637e-06, + "loss": 1.1717, + "step": 1621 + }, + { + "epoch": 0.12503854455750849, + "grad_norm": 4.190584182739258, + "learning_rate": 9.765288491357514e-06, + "loss": 1.1927, + "step": 1622 + }, + { + "epoch": 0.12511563367252543, + "grad_norm": 4.142174243927002, + "learning_rate": 9.764910327041333e-06, + "loss": 1.1538, + "step": 1623 + }, + { + "epoch": 0.1251927227875424, + "grad_norm": 3.6901378631591797, + "learning_rate": 9.764531865657671e-06, + "loss": 1.1287, + "step": 1624 + }, + { + "epoch": 0.12526981190255937, + "grad_norm": 3.891272783279419, + "learning_rate": 9.764153107230124e-06, + "loss": 1.084, + "step": 1625 + }, + { + "epoch": 0.1253469010175763, + "grad_norm": 4.083333969116211, + "learning_rate": 9.763774051782306e-06, + "loss": 1.1683, + "step": 1626 + }, + { + "epoch": 0.12542399013259328, + "grad_norm": 3.742480516433716, + "learning_rate": 9.763394699337846e-06, + "loss": 1.1251, + "step": 1627 + }, + { + "epoch": 0.12550107924761023, + "grad_norm": 4.092676639556885, + "learning_rate": 9.763015049920397e-06, + "loss": 1.1928, + "step": 1628 + }, + { + "epoch": 0.1255781683626272, + "grad_norm": 3.6800365447998047, + "learning_rate": 9.76263510355363e-06, + "loss": 1.1753, + "step": 1629 + }, + { + "epoch": 0.12565525747764417, + "grad_norm": 4.610281944274902, + "learning_rate": 9.762254860261229e-06, + "loss": 1.2049, + "step": 1630 + }, + { + "epoch": 0.1257323465926611, + "grad_norm": 3.89145827293396, + "learning_rate": 9.761874320066903e-06, + "loss": 0.9942, + "step": 1631 + }, + { + "epoch": 0.12580943570767808, + "grad_norm": 3.8079841136932373, + "learning_rate": 9.761493482994374e-06, + "loss": 1.2347, + "step": 1632 + }, + { + "epoch": 0.12588652482269502, + "grad_norm": 4.090712547302246, + "learning_rate": 9.761112349067387e-06, + "loss": 1.1586, + "step": 1633 + }, + { + "epoch": 0.125963613937712, + "grad_norm": 3.8824753761291504, + "learning_rate": 9.760730918309702e-06, + "loss": 1.1221, + "step": 1634 + }, + { + "epoch": 0.12604070305272896, + "grad_norm": 3.8684356212615967, + "learning_rate": 9.760349190745104e-06, + "loss": 1.0676, + "step": 1635 + }, + { + "epoch": 0.1261177921677459, + "grad_norm": 4.076964378356934, + "learning_rate": 9.759967166397386e-06, + "loss": 1.1901, + "step": 1636 + }, + { + "epoch": 0.12619488128276288, + "grad_norm": 3.883690357208252, + "learning_rate": 9.759584845290368e-06, + "loss": 1.179, + "step": 1637 + }, + { + "epoch": 0.12627197039777982, + "grad_norm": 3.9495391845703125, + "learning_rate": 9.759202227447888e-06, + "loss": 1.2468, + "step": 1638 + }, + { + "epoch": 0.1263490595127968, + "grad_norm": 4.153524398803711, + "learning_rate": 9.758819312893795e-06, + "loss": 1.0873, + "step": 1639 + }, + { + "epoch": 0.12642614862781376, + "grad_norm": 3.8686201572418213, + "learning_rate": 9.758436101651965e-06, + "loss": 1.0967, + "step": 1640 + }, + { + "epoch": 0.1265032377428307, + "grad_norm": 4.673964977264404, + "learning_rate": 9.758052593746287e-06, + "loss": 1.2438, + "step": 1641 + }, + { + "epoch": 0.12658032685784767, + "grad_norm": 4.521989345550537, + "learning_rate": 9.757668789200676e-06, + "loss": 1.2298, + "step": 1642 + }, + { + "epoch": 0.12665741597286465, + "grad_norm": 4.26433801651001, + "learning_rate": 9.757284688039054e-06, + "loss": 1.1951, + "step": 1643 + }, + { + "epoch": 0.1267345050878816, + "grad_norm": 3.7774715423583984, + "learning_rate": 9.75690029028537e-06, + "loss": 1.2376, + "step": 1644 + }, + { + "epoch": 0.12681159420289856, + "grad_norm": 4.080611228942871, + "learning_rate": 9.756515595963591e-06, + "loss": 1.1003, + "step": 1645 + }, + { + "epoch": 0.1268886833179155, + "grad_norm": 4.495629787445068, + "learning_rate": 9.7561306050977e-06, + "loss": 1.1129, + "step": 1646 + }, + { + "epoch": 0.12696577243293247, + "grad_norm": 4.433581352233887, + "learning_rate": 9.755745317711696e-06, + "loss": 1.2694, + "step": 1647 + }, + { + "epoch": 0.12704286154794944, + "grad_norm": 4.003334045410156, + "learning_rate": 9.755359733829604e-06, + "loss": 1.1851, + "step": 1648 + }, + { + "epoch": 0.12711995066296639, + "grad_norm": 4.140914440155029, + "learning_rate": 9.75497385347546e-06, + "loss": 1.1482, + "step": 1649 + }, + { + "epoch": 0.12719703977798336, + "grad_norm": 4.328746795654297, + "learning_rate": 9.754587676673323e-06, + "loss": 1.2779, + "step": 1650 + }, + { + "epoch": 0.1272741288930003, + "grad_norm": 4.75211238861084, + "learning_rate": 9.754201203447268e-06, + "loss": 1.2699, + "step": 1651 + }, + { + "epoch": 0.12735121800801727, + "grad_norm": 3.97086763381958, + "learning_rate": 9.753814433821393e-06, + "loss": 1.189, + "step": 1652 + }, + { + "epoch": 0.12742830712303424, + "grad_norm": 4.095003604888916, + "learning_rate": 9.753427367819808e-06, + "loss": 1.1897, + "step": 1653 + }, + { + "epoch": 0.12750539623805118, + "grad_norm": 4.253332614898682, + "learning_rate": 9.753040005466645e-06, + "loss": 1.1711, + "step": 1654 + }, + { + "epoch": 0.12758248535306815, + "grad_norm": 3.9506523609161377, + "learning_rate": 9.752652346786054e-06, + "loss": 1.2071, + "step": 1655 + }, + { + "epoch": 0.1276595744680851, + "grad_norm": 4.006979465484619, + "learning_rate": 9.752264391802203e-06, + "loss": 1.1794, + "step": 1656 + }, + { + "epoch": 0.12773666358310207, + "grad_norm": 4.240119457244873, + "learning_rate": 9.75187614053928e-06, + "loss": 1.1955, + "step": 1657 + }, + { + "epoch": 0.12781375269811904, + "grad_norm": 4.19885778427124, + "learning_rate": 9.751487593021491e-06, + "loss": 1.1181, + "step": 1658 + }, + { + "epoch": 0.12789084181313598, + "grad_norm": 3.8711414337158203, + "learning_rate": 9.75109874927306e-06, + "loss": 1.0409, + "step": 1659 + }, + { + "epoch": 0.12796793092815295, + "grad_norm": 4.2193756103515625, + "learning_rate": 9.750709609318227e-06, + "loss": 1.255, + "step": 1660 + }, + { + "epoch": 0.1280450200431699, + "grad_norm": 4.001354694366455, + "learning_rate": 9.750320173181256e-06, + "loss": 1.0984, + "step": 1661 + }, + { + "epoch": 0.12812210915818686, + "grad_norm": 3.9252028465270996, + "learning_rate": 9.749930440886424e-06, + "loss": 1.0892, + "step": 1662 + }, + { + "epoch": 0.12819919827320383, + "grad_norm": 4.056916236877441, + "learning_rate": 9.749540412458028e-06, + "loss": 1.177, + "step": 1663 + }, + { + "epoch": 0.12827628738822078, + "grad_norm": 4.366199970245361, + "learning_rate": 9.749150087920386e-06, + "loss": 1.1389, + "step": 1664 + }, + { + "epoch": 0.12835337650323775, + "grad_norm": 3.8969857692718506, + "learning_rate": 9.748759467297835e-06, + "loss": 1.1479, + "step": 1665 + }, + { + "epoch": 0.1284304656182547, + "grad_norm": 4.55438756942749, + "learning_rate": 9.748368550614723e-06, + "loss": 1.3332, + "step": 1666 + }, + { + "epoch": 0.12850755473327166, + "grad_norm": 4.168787002563477, + "learning_rate": 9.747977337895426e-06, + "loss": 1.1065, + "step": 1667 + }, + { + "epoch": 0.12858464384828863, + "grad_norm": 4.430618762969971, + "learning_rate": 9.747585829164332e-06, + "loss": 1.176, + "step": 1668 + }, + { + "epoch": 0.12866173296330558, + "grad_norm": 4.035445690155029, + "learning_rate": 9.747194024445851e-06, + "loss": 1.2115, + "step": 1669 + }, + { + "epoch": 0.12873882207832255, + "grad_norm": 3.8563239574432373, + "learning_rate": 9.746801923764409e-06, + "loss": 1.1879, + "step": 1670 + }, + { + "epoch": 0.1288159111933395, + "grad_norm": 4.492901802062988, + "learning_rate": 9.74640952714445e-06, + "loss": 1.2354, + "step": 1671 + }, + { + "epoch": 0.12889300030835646, + "grad_norm": 3.8549299240112305, + "learning_rate": 9.746016834610438e-06, + "loss": 1.1373, + "step": 1672 + }, + { + "epoch": 0.12897008942337343, + "grad_norm": 3.6520705223083496, + "learning_rate": 9.745623846186858e-06, + "loss": 1.0816, + "step": 1673 + }, + { + "epoch": 0.12904717853839037, + "grad_norm": 4.377813816070557, + "learning_rate": 9.74523056189821e-06, + "loss": 1.1365, + "step": 1674 + }, + { + "epoch": 0.12912426765340734, + "grad_norm": 3.965122699737549, + "learning_rate": 9.744836981769013e-06, + "loss": 1.0827, + "step": 1675 + }, + { + "epoch": 0.12920135676842429, + "grad_norm": 4.009072303771973, + "learning_rate": 9.744443105823802e-06, + "loss": 1.1322, + "step": 1676 + }, + { + "epoch": 0.12927844588344126, + "grad_norm": 4.5139994621276855, + "learning_rate": 9.744048934087138e-06, + "loss": 1.2437, + "step": 1677 + }, + { + "epoch": 0.12935553499845823, + "grad_norm": 3.827956199645996, + "learning_rate": 9.743654466583591e-06, + "loss": 1.1468, + "step": 1678 + }, + { + "epoch": 0.12943262411347517, + "grad_norm": 3.9646661281585693, + "learning_rate": 9.743259703337758e-06, + "loss": 1.1022, + "step": 1679 + }, + { + "epoch": 0.12950971322849214, + "grad_norm": 3.9882500171661377, + "learning_rate": 9.742864644374248e-06, + "loss": 1.1333, + "step": 1680 + }, + { + "epoch": 0.12958680234350908, + "grad_norm": 3.8956985473632812, + "learning_rate": 9.74246928971769e-06, + "loss": 1.2208, + "step": 1681 + }, + { + "epoch": 0.12966389145852605, + "grad_norm": 4.2606000900268555, + "learning_rate": 9.742073639392735e-06, + "loss": 1.2494, + "step": 1682 + }, + { + "epoch": 0.12974098057354302, + "grad_norm": 4.1300530433654785, + "learning_rate": 9.741677693424048e-06, + "loss": 1.111, + "step": 1683 + }, + { + "epoch": 0.12981806968855997, + "grad_norm": 3.594024896621704, + "learning_rate": 9.741281451836313e-06, + "loss": 1.0742, + "step": 1684 + }, + { + "epoch": 0.12989515880357694, + "grad_norm": 4.158742427825928, + "learning_rate": 9.740884914654237e-06, + "loss": 1.0348, + "step": 1685 + }, + { + "epoch": 0.1299722479185939, + "grad_norm": 3.8110404014587402, + "learning_rate": 9.74048808190254e-06, + "loss": 1.1757, + "step": 1686 + }, + { + "epoch": 0.13004933703361085, + "grad_norm": 3.863544464111328, + "learning_rate": 9.740090953605963e-06, + "loss": 1.0151, + "step": 1687 + }, + { + "epoch": 0.13012642614862782, + "grad_norm": 4.125679016113281, + "learning_rate": 9.739693529789264e-06, + "loss": 1.1922, + "step": 1688 + }, + { + "epoch": 0.13020351526364476, + "grad_norm": 4.0904083251953125, + "learning_rate": 9.73929581047722e-06, + "loss": 1.0156, + "step": 1689 + }, + { + "epoch": 0.13028060437866174, + "grad_norm": 4.615362644195557, + "learning_rate": 9.73889779569463e-06, + "loss": 1.2547, + "step": 1690 + }, + { + "epoch": 0.1303576934936787, + "grad_norm": 3.9230780601501465, + "learning_rate": 9.738499485466304e-06, + "loss": 1.1324, + "step": 1691 + }, + { + "epoch": 0.13043478260869565, + "grad_norm": 4.175113201141357, + "learning_rate": 9.738100879817077e-06, + "loss": 1.1832, + "step": 1692 + }, + { + "epoch": 0.13051187172371262, + "grad_norm": 3.8793387413024902, + "learning_rate": 9.7377019787718e-06, + "loss": 1.0902, + "step": 1693 + }, + { + "epoch": 0.13058896083872956, + "grad_norm": 4.145679950714111, + "learning_rate": 9.73730278235534e-06, + "loss": 1.1542, + "step": 1694 + }, + { + "epoch": 0.13066604995374653, + "grad_norm": 4.0476179122924805, + "learning_rate": 9.736903290592589e-06, + "loss": 1.2334, + "step": 1695 + }, + { + "epoch": 0.1307431390687635, + "grad_norm": 3.9774255752563477, + "learning_rate": 9.73650350350845e-06, + "loss": 1.1498, + "step": 1696 + }, + { + "epoch": 0.13082022818378045, + "grad_norm": 3.953183174133301, + "learning_rate": 9.736103421127847e-06, + "loss": 1.1513, + "step": 1697 + }, + { + "epoch": 0.13089731729879742, + "grad_norm": 4.064455986022949, + "learning_rate": 9.735703043475727e-06, + "loss": 1.2054, + "step": 1698 + }, + { + "epoch": 0.13097440641381436, + "grad_norm": 4.00160026550293, + "learning_rate": 9.735302370577049e-06, + "loss": 1.13, + "step": 1699 + }, + { + "epoch": 0.13105149552883133, + "grad_norm": 4.219839096069336, + "learning_rate": 9.734901402456792e-06, + "loss": 1.1448, + "step": 1700 + }, + { + "epoch": 0.1311285846438483, + "grad_norm": 4.194035053253174, + "learning_rate": 9.734500139139952e-06, + "loss": 1.1548, + "step": 1701 + }, + { + "epoch": 0.13120567375886524, + "grad_norm": 3.898303985595703, + "learning_rate": 9.734098580651555e-06, + "loss": 1.119, + "step": 1702 + }, + { + "epoch": 0.13128276287388221, + "grad_norm": 4.194339752197266, + "learning_rate": 9.733696727016626e-06, + "loss": 1.1971, + "step": 1703 + }, + { + "epoch": 0.13135985198889916, + "grad_norm": 4.266892433166504, + "learning_rate": 9.733294578260224e-06, + "loss": 1.1156, + "step": 1704 + }, + { + "epoch": 0.13143694110391613, + "grad_norm": 3.6440038681030273, + "learning_rate": 9.73289213440742e-06, + "loss": 1.1899, + "step": 1705 + }, + { + "epoch": 0.1315140302189331, + "grad_norm": 4.273855686187744, + "learning_rate": 9.7324893954833e-06, + "loss": 1.1433, + "step": 1706 + }, + { + "epoch": 0.13159111933395004, + "grad_norm": 3.3972716331481934, + "learning_rate": 9.73208636151298e-06, + "loss": 1.0053, + "step": 1707 + }, + { + "epoch": 0.131668208448967, + "grad_norm": 3.7814152240753174, + "learning_rate": 9.731683032521583e-06, + "loss": 1.16, + "step": 1708 + }, + { + "epoch": 0.13174529756398395, + "grad_norm": 4.149326324462891, + "learning_rate": 9.731279408534255e-06, + "loss": 1.1135, + "step": 1709 + }, + { + "epoch": 0.13182238667900092, + "grad_norm": 4.2798051834106445, + "learning_rate": 9.73087548957616e-06, + "loss": 1.2493, + "step": 1710 + }, + { + "epoch": 0.1318994757940179, + "grad_norm": 3.8137776851654053, + "learning_rate": 9.730471275672478e-06, + "loss": 1.1796, + "step": 1711 + }, + { + "epoch": 0.13197656490903484, + "grad_norm": 4.015539646148682, + "learning_rate": 9.730066766848415e-06, + "loss": 1.1601, + "step": 1712 + }, + { + "epoch": 0.1320536540240518, + "grad_norm": 3.887502431869507, + "learning_rate": 9.729661963129183e-06, + "loss": 1.1712, + "step": 1713 + }, + { + "epoch": 0.13213074313906875, + "grad_norm": 4.104816436767578, + "learning_rate": 9.729256864540025e-06, + "loss": 1.1473, + "step": 1714 + }, + { + "epoch": 0.13220783225408572, + "grad_norm": 3.9987316131591797, + "learning_rate": 9.728851471106195e-06, + "loss": 1.223, + "step": 1715 + }, + { + "epoch": 0.1322849213691027, + "grad_norm": 4.665685176849365, + "learning_rate": 9.728445782852967e-06, + "loss": 1.1008, + "step": 1716 + }, + { + "epoch": 0.13236201048411964, + "grad_norm": 4.16191291809082, + "learning_rate": 9.728039799805635e-06, + "loss": 1.0905, + "step": 1717 + }, + { + "epoch": 0.1324390995991366, + "grad_norm": 4.354928016662598, + "learning_rate": 9.727633521989505e-06, + "loss": 1.1924, + "step": 1718 + }, + { + "epoch": 0.13251618871415355, + "grad_norm": 4.347275733947754, + "learning_rate": 9.727226949429913e-06, + "loss": 1.169, + "step": 1719 + }, + { + "epoch": 0.13259327782917052, + "grad_norm": 4.061772346496582, + "learning_rate": 9.726820082152204e-06, + "loss": 1.2223, + "step": 1720 + }, + { + "epoch": 0.1326703669441875, + "grad_norm": 4.027084827423096, + "learning_rate": 9.726412920181742e-06, + "loss": 1.1924, + "step": 1721 + }, + { + "epoch": 0.13274745605920443, + "grad_norm": 4.302231788635254, + "learning_rate": 9.726005463543913e-06, + "loss": 1.1269, + "step": 1722 + }, + { + "epoch": 0.1328245451742214, + "grad_norm": 3.7703239917755127, + "learning_rate": 9.725597712264123e-06, + "loss": 1.1173, + "step": 1723 + }, + { + "epoch": 0.13290163428923835, + "grad_norm": 3.777994155883789, + "learning_rate": 9.725189666367787e-06, + "loss": 1.0722, + "step": 1724 + }, + { + "epoch": 0.13297872340425532, + "grad_norm": 3.945777177810669, + "learning_rate": 9.724781325880348e-06, + "loss": 1.0914, + "step": 1725 + }, + { + "epoch": 0.1330558125192723, + "grad_norm": 4.009458065032959, + "learning_rate": 9.724372690827264e-06, + "loss": 1.0405, + "step": 1726 + }, + { + "epoch": 0.13313290163428923, + "grad_norm": 4.944936752319336, + "learning_rate": 9.723963761234013e-06, + "loss": 1.2138, + "step": 1727 + }, + { + "epoch": 0.1332099907493062, + "grad_norm": 3.801499605178833, + "learning_rate": 9.723554537126084e-06, + "loss": 1.1508, + "step": 1728 + }, + { + "epoch": 0.13328707986432317, + "grad_norm": 3.6166648864746094, + "learning_rate": 9.723145018528995e-06, + "loss": 1.1144, + "step": 1729 + }, + { + "epoch": 0.13336416897934011, + "grad_norm": 3.8566949367523193, + "learning_rate": 9.722735205468277e-06, + "loss": 1.0411, + "step": 1730 + }, + { + "epoch": 0.13344125809435708, + "grad_norm": 3.865793466567993, + "learning_rate": 9.722325097969477e-06, + "loss": 1.153, + "step": 1731 + }, + { + "epoch": 0.13351834720937403, + "grad_norm": 4.346949577331543, + "learning_rate": 9.721914696058165e-06, + "loss": 1.2115, + "step": 1732 + }, + { + "epoch": 0.133595436324391, + "grad_norm": 3.8929085731506348, + "learning_rate": 9.721503999759926e-06, + "loss": 1.1708, + "step": 1733 + }, + { + "epoch": 0.13367252543940797, + "grad_norm": 4.4507575035095215, + "learning_rate": 9.721093009100368e-06, + "loss": 1.2546, + "step": 1734 + }, + { + "epoch": 0.1337496145544249, + "grad_norm": 4.165073394775391, + "learning_rate": 9.720681724105112e-06, + "loss": 1.1604, + "step": 1735 + }, + { + "epoch": 0.13382670366944188, + "grad_norm": 3.6533870697021484, + "learning_rate": 9.7202701447998e-06, + "loss": 1.0856, + "step": 1736 + }, + { + "epoch": 0.13390379278445882, + "grad_norm": 3.7656784057617188, + "learning_rate": 9.71985827121009e-06, + "loss": 1.1357, + "step": 1737 + }, + { + "epoch": 0.1339808818994758, + "grad_norm": 4.213956356048584, + "learning_rate": 9.719446103361662e-06, + "loss": 1.1676, + "step": 1738 + }, + { + "epoch": 0.13405797101449277, + "grad_norm": 4.172744274139404, + "learning_rate": 9.719033641280211e-06, + "loss": 1.2312, + "step": 1739 + }, + { + "epoch": 0.1341350601295097, + "grad_norm": 4.124050140380859, + "learning_rate": 9.718620884991455e-06, + "loss": 1.1368, + "step": 1740 + }, + { + "epoch": 0.13421214924452668, + "grad_norm": 4.168154716491699, + "learning_rate": 9.718207834521124e-06, + "loss": 1.0489, + "step": 1741 + }, + { + "epoch": 0.13428923835954362, + "grad_norm": 4.139477252960205, + "learning_rate": 9.71779448989497e-06, + "loss": 1.0745, + "step": 1742 + }, + { + "epoch": 0.1343663274745606, + "grad_norm": 4.274909496307373, + "learning_rate": 9.717380851138765e-06, + "loss": 1.0202, + "step": 1743 + }, + { + "epoch": 0.13444341658957756, + "grad_norm": 3.696258306503296, + "learning_rate": 9.716966918278295e-06, + "loss": 1.1022, + "step": 1744 + }, + { + "epoch": 0.1345205057045945, + "grad_norm": 3.8992645740509033, + "learning_rate": 9.716552691339369e-06, + "loss": 1.0238, + "step": 1745 + }, + { + "epoch": 0.13459759481961148, + "grad_norm": 4.183124542236328, + "learning_rate": 9.716138170347808e-06, + "loss": 1.1277, + "step": 1746 + }, + { + "epoch": 0.13467468393462842, + "grad_norm": 3.98201584815979, + "learning_rate": 9.71572335532946e-06, + "loss": 1.2144, + "step": 1747 + }, + { + "epoch": 0.1347517730496454, + "grad_norm": 4.045650005340576, + "learning_rate": 9.715308246310181e-06, + "loss": 1.1166, + "step": 1748 + }, + { + "epoch": 0.13482886216466236, + "grad_norm": 4.4853973388671875, + "learning_rate": 9.714892843315857e-06, + "loss": 1.1546, + "step": 1749 + }, + { + "epoch": 0.1349059512796793, + "grad_norm": 4.153515815734863, + "learning_rate": 9.714477146372383e-06, + "loss": 1.0463, + "step": 1750 + }, + { + "epoch": 0.13498304039469627, + "grad_norm": 4.120546817779541, + "learning_rate": 9.714061155505673e-06, + "loss": 1.1675, + "step": 1751 + }, + { + "epoch": 0.13506012950971322, + "grad_norm": 4.298232555389404, + "learning_rate": 9.713644870741669e-06, + "loss": 1.1224, + "step": 1752 + }, + { + "epoch": 0.1351372186247302, + "grad_norm": 3.9219980239868164, + "learning_rate": 9.713228292106319e-06, + "loss": 1.1257, + "step": 1753 + }, + { + "epoch": 0.13521430773974716, + "grad_norm": 3.6702487468719482, + "learning_rate": 9.712811419625592e-06, + "loss": 1.1176, + "step": 1754 + }, + { + "epoch": 0.1352913968547641, + "grad_norm": 4.173069953918457, + "learning_rate": 9.712394253325483e-06, + "loss": 1.07, + "step": 1755 + }, + { + "epoch": 0.13536848596978107, + "grad_norm": 4.047333240509033, + "learning_rate": 9.711976793232e-06, + "loss": 1.1868, + "step": 1756 + }, + { + "epoch": 0.13544557508479801, + "grad_norm": 4.153733730316162, + "learning_rate": 9.711559039371165e-06, + "loss": 1.1111, + "step": 1757 + }, + { + "epoch": 0.13552266419981499, + "grad_norm": 4.564488410949707, + "learning_rate": 9.711140991769028e-06, + "loss": 1.1292, + "step": 1758 + }, + { + "epoch": 0.13559975331483196, + "grad_norm": 4.647412300109863, + "learning_rate": 9.710722650451649e-06, + "loss": 1.1204, + "step": 1759 + }, + { + "epoch": 0.1356768424298489, + "grad_norm": 4.287388324737549, + "learning_rate": 9.71030401544511e-06, + "loss": 1.1814, + "step": 1760 + }, + { + "epoch": 0.13575393154486587, + "grad_norm": 4.7093024253845215, + "learning_rate": 9.709885086775512e-06, + "loss": 1.2857, + "step": 1761 + }, + { + "epoch": 0.1358310206598828, + "grad_norm": 4.541007041931152, + "learning_rate": 9.709465864468971e-06, + "loss": 1.2471, + "step": 1762 + }, + { + "epoch": 0.13590810977489978, + "grad_norm": 4.468155860900879, + "learning_rate": 9.709046348551626e-06, + "loss": 1.1188, + "step": 1763 + }, + { + "epoch": 0.13598519888991675, + "grad_norm": 4.554619312286377, + "learning_rate": 9.708626539049628e-06, + "loss": 1.2205, + "step": 1764 + }, + { + "epoch": 0.1360622880049337, + "grad_norm": 3.8080332279205322, + "learning_rate": 9.708206435989152e-06, + "loss": 1.0702, + "step": 1765 + }, + { + "epoch": 0.13613937711995067, + "grad_norm": 4.381828308105469, + "learning_rate": 9.707786039396389e-06, + "loss": 0.9736, + "step": 1766 + }, + { + "epoch": 0.1362164662349676, + "grad_norm": 4.196829319000244, + "learning_rate": 9.70736534929755e-06, + "loss": 1.1242, + "step": 1767 + }, + { + "epoch": 0.13629355534998458, + "grad_norm": 4.237636566162109, + "learning_rate": 9.70694436571886e-06, + "loss": 1.1547, + "step": 1768 + }, + { + "epoch": 0.13637064446500155, + "grad_norm": 4.274960517883301, + "learning_rate": 9.706523088686568e-06, + "loss": 1.1723, + "step": 1769 + }, + { + "epoch": 0.1364477335800185, + "grad_norm": 4.287873268127441, + "learning_rate": 9.706101518226939e-06, + "loss": 1.1283, + "step": 1770 + }, + { + "epoch": 0.13652482269503546, + "grad_norm": 4.142662525177002, + "learning_rate": 9.705679654366249e-06, + "loss": 1.1434, + "step": 1771 + }, + { + "epoch": 0.13660191181005243, + "grad_norm": 4.222457408905029, + "learning_rate": 9.705257497130807e-06, + "loss": 1.2198, + "step": 1772 + }, + { + "epoch": 0.13667900092506938, + "grad_norm": 4.041723251342773, + "learning_rate": 9.704835046546928e-06, + "loss": 1.0801, + "step": 1773 + }, + { + "epoch": 0.13675609004008635, + "grad_norm": 3.751447916030884, + "learning_rate": 9.704412302640951e-06, + "loss": 1.1612, + "step": 1774 + }, + { + "epoch": 0.1368331791551033, + "grad_norm": 4.4663615226745605, + "learning_rate": 9.703989265439233e-06, + "loss": 1.0822, + "step": 1775 + }, + { + "epoch": 0.13691026827012026, + "grad_norm": 3.6068341732025146, + "learning_rate": 9.703565934968146e-06, + "loss": 1.1349, + "step": 1776 + }, + { + "epoch": 0.13698735738513723, + "grad_norm": 4.411525726318359, + "learning_rate": 9.703142311254083e-06, + "loss": 1.1509, + "step": 1777 + }, + { + "epoch": 0.13706444650015417, + "grad_norm": 4.782855033874512, + "learning_rate": 9.702718394323456e-06, + "loss": 1.1925, + "step": 1778 + }, + { + "epoch": 0.13714153561517115, + "grad_norm": 3.9642584323883057, + "learning_rate": 9.702294184202692e-06, + "loss": 1.1764, + "step": 1779 + }, + { + "epoch": 0.1372186247301881, + "grad_norm": 3.878993272781372, + "learning_rate": 9.70186968091824e-06, + "loss": 1.0982, + "step": 1780 + }, + { + "epoch": 0.13729571384520506, + "grad_norm": 4.065914630889893, + "learning_rate": 9.701444884496564e-06, + "loss": 1.0681, + "step": 1781 + }, + { + "epoch": 0.13737280296022203, + "grad_norm": 4.171785354614258, + "learning_rate": 9.701019794964151e-06, + "loss": 1.0463, + "step": 1782 + }, + { + "epoch": 0.13744989207523897, + "grad_norm": 4.044963359832764, + "learning_rate": 9.700594412347499e-06, + "loss": 1.1055, + "step": 1783 + }, + { + "epoch": 0.13752698119025594, + "grad_norm": 4.143056869506836, + "learning_rate": 9.700168736673133e-06, + "loss": 1.0994, + "step": 1784 + }, + { + "epoch": 0.13760407030527289, + "grad_norm": 4.355381965637207, + "learning_rate": 9.699742767967586e-06, + "loss": 1.2089, + "step": 1785 + }, + { + "epoch": 0.13768115942028986, + "grad_norm": 3.8300368785858154, + "learning_rate": 9.699316506257421e-06, + "loss": 1.0759, + "step": 1786 + }, + { + "epoch": 0.13775824853530683, + "grad_norm": 3.7723491191864014, + "learning_rate": 9.698889951569208e-06, + "loss": 1.0988, + "step": 1787 + }, + { + "epoch": 0.13783533765032377, + "grad_norm": 3.426234245300293, + "learning_rate": 9.698463103929542e-06, + "loss": 1.129, + "step": 1788 + }, + { + "epoch": 0.13791242676534074, + "grad_norm": 3.9809820652008057, + "learning_rate": 9.698035963365038e-06, + "loss": 1.0853, + "step": 1789 + }, + { + "epoch": 0.13798951588035768, + "grad_norm": 4.098197937011719, + "learning_rate": 9.697608529902321e-06, + "loss": 1.2014, + "step": 1790 + }, + { + "epoch": 0.13806660499537465, + "grad_norm": 4.355566501617432, + "learning_rate": 9.697180803568042e-06, + "loss": 1.091, + "step": 1791 + }, + { + "epoch": 0.13814369411039162, + "grad_norm": 3.870091199874878, + "learning_rate": 9.69675278438887e-06, + "loss": 1.0783, + "step": 1792 + }, + { + "epoch": 0.13822078322540857, + "grad_norm": 3.829244375228882, + "learning_rate": 9.696324472391486e-06, + "loss": 1.0885, + "step": 1793 + }, + { + "epoch": 0.13829787234042554, + "grad_norm": 3.7947609424591064, + "learning_rate": 9.695895867602591e-06, + "loss": 0.9867, + "step": 1794 + }, + { + "epoch": 0.13837496145544248, + "grad_norm": 4.241781234741211, + "learning_rate": 9.695466970048912e-06, + "loss": 1.1875, + "step": 1795 + }, + { + "epoch": 0.13845205057045945, + "grad_norm": 4.397819995880127, + "learning_rate": 9.695037779757185e-06, + "loss": 1.2111, + "step": 1796 + }, + { + "epoch": 0.13852913968547642, + "grad_norm": 4.159852504730225, + "learning_rate": 9.694608296754168e-06, + "loss": 1.1907, + "step": 1797 + }, + { + "epoch": 0.13860622880049336, + "grad_norm": 3.971301317214966, + "learning_rate": 9.69417852106664e-06, + "loss": 1.1413, + "step": 1798 + }, + { + "epoch": 0.13868331791551033, + "grad_norm": 4.702109336853027, + "learning_rate": 9.693748452721392e-06, + "loss": 1.046, + "step": 1799 + }, + { + "epoch": 0.13876040703052728, + "grad_norm": 4.870896339416504, + "learning_rate": 9.693318091745237e-06, + "loss": 1.2241, + "step": 1800 + }, + { + "epoch": 0.13883749614554425, + "grad_norm": 3.7580301761627197, + "learning_rate": 9.692887438165007e-06, + "loss": 1.1777, + "step": 1801 + }, + { + "epoch": 0.13891458526056122, + "grad_norm": 3.787914276123047, + "learning_rate": 9.692456492007548e-06, + "loss": 1.1804, + "step": 1802 + }, + { + "epoch": 0.13899167437557816, + "grad_norm": 3.9445436000823975, + "learning_rate": 9.692025253299732e-06, + "loss": 1.135, + "step": 1803 + }, + { + "epoch": 0.13906876349059513, + "grad_norm": 4.596111297607422, + "learning_rate": 9.691593722068442e-06, + "loss": 1.1305, + "step": 1804 + }, + { + "epoch": 0.13914585260561207, + "grad_norm": 4.179446220397949, + "learning_rate": 9.69116189834058e-06, + "loss": 1.1967, + "step": 1805 + }, + { + "epoch": 0.13922294172062905, + "grad_norm": 3.8127710819244385, + "learning_rate": 9.69072978214307e-06, + "loss": 1.1141, + "step": 1806 + }, + { + "epoch": 0.13930003083564602, + "grad_norm": 4.053515434265137, + "learning_rate": 9.690297373502855e-06, + "loss": 1.2279, + "step": 1807 + }, + { + "epoch": 0.13937711995066296, + "grad_norm": 3.7114830017089844, + "learning_rate": 9.689864672446887e-06, + "loss": 1.0809, + "step": 1808 + }, + { + "epoch": 0.13945420906567993, + "grad_norm": 3.6122548580169678, + "learning_rate": 9.689431679002148e-06, + "loss": 1.0588, + "step": 1809 + }, + { + "epoch": 0.13953129818069687, + "grad_norm": 4.3173604011535645, + "learning_rate": 9.68899839319563e-06, + "loss": 1.1858, + "step": 1810 + }, + { + "epoch": 0.13960838729571384, + "grad_norm": 3.7968335151672363, + "learning_rate": 9.688564815054349e-06, + "loss": 1.1411, + "step": 1811 + }, + { + "epoch": 0.1396854764107308, + "grad_norm": 4.016263961791992, + "learning_rate": 9.688130944605332e-06, + "loss": 1.0934, + "step": 1812 + }, + { + "epoch": 0.13976256552574776, + "grad_norm": 3.948855400085449, + "learning_rate": 9.687696781875634e-06, + "loss": 1.1486, + "step": 1813 + }, + { + "epoch": 0.13983965464076473, + "grad_norm": 4.118218898773193, + "learning_rate": 9.687262326892317e-06, + "loss": 1.1233, + "step": 1814 + }, + { + "epoch": 0.1399167437557817, + "grad_norm": 4.039265155792236, + "learning_rate": 9.68682757968247e-06, + "loss": 1.0337, + "step": 1815 + }, + { + "epoch": 0.13999383287079864, + "grad_norm": 3.6836297512054443, + "learning_rate": 9.686392540273198e-06, + "loss": 1.1196, + "step": 1816 + }, + { + "epoch": 0.1400709219858156, + "grad_norm": 4.069727420806885, + "learning_rate": 9.685957208691623e-06, + "loss": 1.1305, + "step": 1817 + }, + { + "epoch": 0.14014801110083255, + "grad_norm": 3.9679081439971924, + "learning_rate": 9.685521584964885e-06, + "loss": 1.1529, + "step": 1818 + }, + { + "epoch": 0.14022510021584952, + "grad_norm": 3.8460636138916016, + "learning_rate": 9.685085669120142e-06, + "loss": 1.0946, + "step": 1819 + }, + { + "epoch": 0.1403021893308665, + "grad_norm": 4.128662586212158, + "learning_rate": 9.684649461184574e-06, + "loss": 1.2238, + "step": 1820 + }, + { + "epoch": 0.14037927844588344, + "grad_norm": 4.167182922363281, + "learning_rate": 9.684212961185374e-06, + "loss": 0.9351, + "step": 1821 + }, + { + "epoch": 0.1404563675609004, + "grad_norm": 4.262301921844482, + "learning_rate": 9.683776169149755e-06, + "loss": 1.1101, + "step": 1822 + }, + { + "epoch": 0.14053345667591735, + "grad_norm": 4.976638317108154, + "learning_rate": 9.683339085104952e-06, + "loss": 1.1853, + "step": 1823 + }, + { + "epoch": 0.14061054579093432, + "grad_norm": 4.163305282592773, + "learning_rate": 9.68290170907821e-06, + "loss": 1.1477, + "step": 1824 + }, + { + "epoch": 0.1406876349059513, + "grad_norm": 4.624197483062744, + "learning_rate": 9.682464041096801e-06, + "loss": 1.2981, + "step": 1825 + }, + { + "epoch": 0.14076472402096823, + "grad_norm": 4.289773464202881, + "learning_rate": 9.682026081188009e-06, + "loss": 1.1704, + "step": 1826 + }, + { + "epoch": 0.1408418131359852, + "grad_norm": 5.031392574310303, + "learning_rate": 9.681587829379143e-06, + "loss": 1.3166, + "step": 1827 + }, + { + "epoch": 0.14091890225100215, + "grad_norm": 4.640775203704834, + "learning_rate": 9.68114928569752e-06, + "loss": 1.0848, + "step": 1828 + }, + { + "epoch": 0.14099599136601912, + "grad_norm": 3.823460817337036, + "learning_rate": 9.680710450170482e-06, + "loss": 1.1245, + "step": 1829 + }, + { + "epoch": 0.1410730804810361, + "grad_norm": 4.686859130859375, + "learning_rate": 9.680271322825392e-06, + "loss": 1.1221, + "step": 1830 + }, + { + "epoch": 0.14115016959605303, + "grad_norm": 4.298460960388184, + "learning_rate": 9.679831903689624e-06, + "loss": 1.1457, + "step": 1831 + }, + { + "epoch": 0.14122725871107, + "grad_norm": 4.24713659286499, + "learning_rate": 9.679392192790573e-06, + "loss": 1.1776, + "step": 1832 + }, + { + "epoch": 0.14130434782608695, + "grad_norm": 4.055549621582031, + "learning_rate": 9.678952190155655e-06, + "loss": 1.1615, + "step": 1833 + }, + { + "epoch": 0.14138143694110392, + "grad_norm": 4.140902042388916, + "learning_rate": 9.678511895812301e-06, + "loss": 1.0056, + "step": 1834 + }, + { + "epoch": 0.1414585260561209, + "grad_norm": 4.143174648284912, + "learning_rate": 9.678071309787962e-06, + "loss": 1.1289, + "step": 1835 + }, + { + "epoch": 0.14153561517113783, + "grad_norm": 3.9411823749542236, + "learning_rate": 9.677630432110103e-06, + "loss": 1.1361, + "step": 1836 + }, + { + "epoch": 0.1416127042861548, + "grad_norm": 4.089599132537842, + "learning_rate": 9.677189262806213e-06, + "loss": 1.1156, + "step": 1837 + }, + { + "epoch": 0.14168979340117174, + "grad_norm": 3.4913694858551025, + "learning_rate": 9.676747801903798e-06, + "loss": 1.0064, + "step": 1838 + }, + { + "epoch": 0.1417668825161887, + "grad_norm": 3.990382432937622, + "learning_rate": 9.676306049430377e-06, + "loss": 1.1155, + "step": 1839 + }, + { + "epoch": 0.14184397163120568, + "grad_norm": 4.364477634429932, + "learning_rate": 9.675864005413494e-06, + "loss": 1.1699, + "step": 1840 + }, + { + "epoch": 0.14192106074622263, + "grad_norm": 4.722569942474365, + "learning_rate": 9.675421669880707e-06, + "loss": 1.1372, + "step": 1841 + }, + { + "epoch": 0.1419981498612396, + "grad_norm": 4.089046478271484, + "learning_rate": 9.674979042859593e-06, + "loss": 1.1623, + "step": 1842 + }, + { + "epoch": 0.14207523897625654, + "grad_norm": 3.667163610458374, + "learning_rate": 9.67453612437775e-06, + "loss": 1.0287, + "step": 1843 + }, + { + "epoch": 0.1421523280912735, + "grad_norm": 4.879021167755127, + "learning_rate": 9.674092914462788e-06, + "loss": 1.2368, + "step": 1844 + }, + { + "epoch": 0.14222941720629048, + "grad_norm": 3.920139789581299, + "learning_rate": 9.67364941314234e-06, + "loss": 1.049, + "step": 1845 + }, + { + "epoch": 0.14230650632130742, + "grad_norm": 3.814567804336548, + "learning_rate": 9.673205620444057e-06, + "loss": 1.1007, + "step": 1846 + }, + { + "epoch": 0.1423835954363244, + "grad_norm": 3.913419008255005, + "learning_rate": 9.672761536395608e-06, + "loss": 1.1205, + "step": 1847 + }, + { + "epoch": 0.14246068455134134, + "grad_norm": 3.8160576820373535, + "learning_rate": 9.672317161024679e-06, + "loss": 1.155, + "step": 1848 + }, + { + "epoch": 0.1425377736663583, + "grad_norm": 3.9836699962615967, + "learning_rate": 9.67187249435897e-06, + "loss": 1.0829, + "step": 1849 + }, + { + "epoch": 0.14261486278137528, + "grad_norm": 3.8983733654022217, + "learning_rate": 9.67142753642621e-06, + "loss": 1.0859, + "step": 1850 + }, + { + "epoch": 0.14269195189639222, + "grad_norm": 4.181762218475342, + "learning_rate": 9.670982287254136e-06, + "loss": 1.0088, + "step": 1851 + }, + { + "epoch": 0.1427690410114092, + "grad_norm": 4.015240669250488, + "learning_rate": 9.670536746870507e-06, + "loss": 1.1885, + "step": 1852 + }, + { + "epoch": 0.14284613012642614, + "grad_norm": 4.08405876159668, + "learning_rate": 9.670090915303103e-06, + "loss": 1.0023, + "step": 1853 + }, + { + "epoch": 0.1429232192414431, + "grad_norm": 4.5213189125061035, + "learning_rate": 9.669644792579717e-06, + "loss": 1.2307, + "step": 1854 + }, + { + "epoch": 0.14300030835646008, + "grad_norm": 4.091536521911621, + "learning_rate": 9.669198378728162e-06, + "loss": 1.1163, + "step": 1855 + }, + { + "epoch": 0.14307739747147702, + "grad_norm": 4.354223728179932, + "learning_rate": 9.668751673776272e-06, + "loss": 1.0047, + "step": 1856 + }, + { + "epoch": 0.143154486586494, + "grad_norm": 3.7144696712493896, + "learning_rate": 9.668304677751894e-06, + "loss": 1.1268, + "step": 1857 + }, + { + "epoch": 0.14323157570151096, + "grad_norm": 4.206509590148926, + "learning_rate": 9.667857390682897e-06, + "loss": 1.0878, + "step": 1858 + }, + { + "epoch": 0.1433086648165279, + "grad_norm": 4.0310492515563965, + "learning_rate": 9.667409812597168e-06, + "loss": 1.135, + "step": 1859 + }, + { + "epoch": 0.14338575393154487, + "grad_norm": 4.384598731994629, + "learning_rate": 9.66696194352261e-06, + "loss": 1.0957, + "step": 1860 + }, + { + "epoch": 0.14346284304656182, + "grad_norm": 3.8433432579040527, + "learning_rate": 9.666513783487145e-06, + "loss": 0.9749, + "step": 1861 + }, + { + "epoch": 0.1435399321615788, + "grad_norm": 3.990506887435913, + "learning_rate": 9.666065332518714e-06, + "loss": 1.086, + "step": 1862 + }, + { + "epoch": 0.14361702127659576, + "grad_norm": 3.817073106765747, + "learning_rate": 9.665616590645278e-06, + "loss": 1.0804, + "step": 1863 + }, + { + "epoch": 0.1436941103916127, + "grad_norm": 3.8277182579040527, + "learning_rate": 9.665167557894808e-06, + "loss": 1.0075, + "step": 1864 + }, + { + "epoch": 0.14377119950662967, + "grad_norm": 3.6581485271453857, + "learning_rate": 9.664718234295303e-06, + "loss": 1.122, + "step": 1865 + }, + { + "epoch": 0.14384828862164661, + "grad_norm": 3.7937848567962646, + "learning_rate": 9.664268619874776e-06, + "loss": 1.0867, + "step": 1866 + }, + { + "epoch": 0.14392537773666358, + "grad_norm": 4.170339584350586, + "learning_rate": 9.663818714661259e-06, + "loss": 1.0888, + "step": 1867 + }, + { + "epoch": 0.14400246685168056, + "grad_norm": 3.6754279136657715, + "learning_rate": 9.6633685186828e-06, + "loss": 1.0507, + "step": 1868 + }, + { + "epoch": 0.1440795559666975, + "grad_norm": 3.820255994796753, + "learning_rate": 9.662918031967463e-06, + "loss": 1.01, + "step": 1869 + }, + { + "epoch": 0.14415664508171447, + "grad_norm": 4.000062942504883, + "learning_rate": 9.662467254543337e-06, + "loss": 1.1658, + "step": 1870 + }, + { + "epoch": 0.1442337341967314, + "grad_norm": 3.684593439102173, + "learning_rate": 9.662016186438527e-06, + "loss": 1.0732, + "step": 1871 + }, + { + "epoch": 0.14431082331174838, + "grad_norm": 4.005534648895264, + "learning_rate": 9.661564827681152e-06, + "loss": 1.0009, + "step": 1872 + }, + { + "epoch": 0.14438791242676535, + "grad_norm": 3.8382081985473633, + "learning_rate": 9.661113178299353e-06, + "loss": 1.1442, + "step": 1873 + }, + { + "epoch": 0.1444650015417823, + "grad_norm": 4.236837863922119, + "learning_rate": 9.660661238321289e-06, + "loss": 1.0885, + "step": 1874 + }, + { + "epoch": 0.14454209065679927, + "grad_norm": 3.7837772369384766, + "learning_rate": 9.660209007775133e-06, + "loss": 1.0377, + "step": 1875 + }, + { + "epoch": 0.1446191797718162, + "grad_norm": 3.9629247188568115, + "learning_rate": 9.659756486689082e-06, + "loss": 1.0251, + "step": 1876 + }, + { + "epoch": 0.14469626888683318, + "grad_norm": 3.9419615268707275, + "learning_rate": 9.659303675091348e-06, + "loss": 1.181, + "step": 1877 + }, + { + "epoch": 0.14477335800185015, + "grad_norm": 3.833371162414551, + "learning_rate": 9.658850573010162e-06, + "loss": 0.9706, + "step": 1878 + }, + { + "epoch": 0.1448504471168671, + "grad_norm": 4.696406364440918, + "learning_rate": 9.658397180473768e-06, + "loss": 1.1338, + "step": 1879 + }, + { + "epoch": 0.14492753623188406, + "grad_norm": 3.9524292945861816, + "learning_rate": 9.65794349751044e-06, + "loss": 1.032, + "step": 1880 + }, + { + "epoch": 0.145004625346901, + "grad_norm": 4.391104221343994, + "learning_rate": 9.657489524148459e-06, + "loss": 1.1298, + "step": 1881 + }, + { + "epoch": 0.14508171446191798, + "grad_norm": 3.786226749420166, + "learning_rate": 9.657035260416126e-06, + "loss": 1.0417, + "step": 1882 + }, + { + "epoch": 0.14515880357693495, + "grad_norm": 3.7811176776885986, + "learning_rate": 9.656580706341763e-06, + "loss": 1.1276, + "step": 1883 + }, + { + "epoch": 0.1452358926919519, + "grad_norm": 4.132617473602295, + "learning_rate": 9.656125861953711e-06, + "loss": 1.0482, + "step": 1884 + }, + { + "epoch": 0.14531298180696886, + "grad_norm": 4.651538848876953, + "learning_rate": 9.655670727280326e-06, + "loss": 1.2372, + "step": 1885 + }, + { + "epoch": 0.1453900709219858, + "grad_norm": 3.871948719024658, + "learning_rate": 9.655215302349986e-06, + "loss": 1.0957, + "step": 1886 + }, + { + "epoch": 0.14546716003700277, + "grad_norm": 5.455825328826904, + "learning_rate": 9.65475958719108e-06, + "loss": 1.0278, + "step": 1887 + }, + { + "epoch": 0.14554424915201974, + "grad_norm": 3.909919261932373, + "learning_rate": 9.65430358183202e-06, + "loss": 1.1265, + "step": 1888 + }, + { + "epoch": 0.1456213382670367, + "grad_norm": 4.387859344482422, + "learning_rate": 9.653847286301238e-06, + "loss": 1.1409, + "step": 1889 + }, + { + "epoch": 0.14569842738205366, + "grad_norm": 4.335275650024414, + "learning_rate": 9.65339070062718e-06, + "loss": 1.0492, + "step": 1890 + }, + { + "epoch": 0.1457755164970706, + "grad_norm": 3.7810895442962646, + "learning_rate": 9.652933824838315e-06, + "loss": 0.9817, + "step": 1891 + }, + { + "epoch": 0.14585260561208757, + "grad_norm": 4.186948299407959, + "learning_rate": 9.652476658963122e-06, + "loss": 1.1084, + "step": 1892 + }, + { + "epoch": 0.14592969472710454, + "grad_norm": 4.576048374176025, + "learning_rate": 9.652019203030105e-06, + "loss": 1.1037, + "step": 1893 + }, + { + "epoch": 0.14600678384212148, + "grad_norm": 3.583650827407837, + "learning_rate": 9.651561457067785e-06, + "loss": 1.045, + "step": 1894 + }, + { + "epoch": 0.14608387295713846, + "grad_norm": 3.9985146522521973, + "learning_rate": 9.6511034211047e-06, + "loss": 1.0088, + "step": 1895 + }, + { + "epoch": 0.1461609620721554, + "grad_norm": 4.34122371673584, + "learning_rate": 9.650645095169403e-06, + "loss": 1.0772, + "step": 1896 + }, + { + "epoch": 0.14623805118717237, + "grad_norm": 4.4444122314453125, + "learning_rate": 9.650186479290472e-06, + "loss": 1.1531, + "step": 1897 + }, + { + "epoch": 0.14631514030218934, + "grad_norm": 3.7410149574279785, + "learning_rate": 9.649727573496499e-06, + "loss": 0.9934, + "step": 1898 + }, + { + "epoch": 0.14639222941720628, + "grad_norm": 3.8100745677948, + "learning_rate": 9.649268377816092e-06, + "loss": 1.0822, + "step": 1899 + }, + { + "epoch": 0.14646931853222325, + "grad_norm": 4.091446876525879, + "learning_rate": 9.64880889227788e-06, + "loss": 1.1054, + "step": 1900 + }, + { + "epoch": 0.14654640764724022, + "grad_norm": 4.245041847229004, + "learning_rate": 9.64834911691051e-06, + "loss": 1.1384, + "step": 1901 + }, + { + "epoch": 0.14662349676225717, + "grad_norm": 4.033145427703857, + "learning_rate": 9.647889051742649e-06, + "loss": 1.0701, + "step": 1902 + }, + { + "epoch": 0.14670058587727414, + "grad_norm": 4.123029708862305, + "learning_rate": 9.647428696802979e-06, + "loss": 1.0922, + "step": 1903 + }, + { + "epoch": 0.14677767499229108, + "grad_norm": 4.149605751037598, + "learning_rate": 9.646968052120196e-06, + "loss": 1.074, + "step": 1904 + }, + { + "epoch": 0.14685476410730805, + "grad_norm": 3.9435675144195557, + "learning_rate": 9.646507117723023e-06, + "loss": 1.0313, + "step": 1905 + }, + { + "epoch": 0.14693185322232502, + "grad_norm": 4.518271446228027, + "learning_rate": 9.646045893640197e-06, + "loss": 1.1398, + "step": 1906 + }, + { + "epoch": 0.14700894233734196, + "grad_norm": 3.978560447692871, + "learning_rate": 9.645584379900473e-06, + "loss": 1.1128, + "step": 1907 + }, + { + "epoch": 0.14708603145235893, + "grad_norm": 4.344027042388916, + "learning_rate": 9.64512257653262e-06, + "loss": 1.0262, + "step": 1908 + }, + { + "epoch": 0.14716312056737588, + "grad_norm": 4.042947769165039, + "learning_rate": 9.644660483565434e-06, + "loss": 1.1174, + "step": 1909 + }, + { + "epoch": 0.14724020968239285, + "grad_norm": 4.025622844696045, + "learning_rate": 9.644198101027721e-06, + "loss": 1.0611, + "step": 1910 + }, + { + "epoch": 0.14731729879740982, + "grad_norm": 4.026158809661865, + "learning_rate": 9.64373542894831e-06, + "loss": 1.083, + "step": 1911 + }, + { + "epoch": 0.14739438791242676, + "grad_norm": 4.082542419433594, + "learning_rate": 9.643272467356048e-06, + "loss": 1.132, + "step": 1912 + }, + { + "epoch": 0.14747147702744373, + "grad_norm": 3.592789888381958, + "learning_rate": 9.642809216279793e-06, + "loss": 1.019, + "step": 1913 + }, + { + "epoch": 0.14754856614246067, + "grad_norm": 3.6364972591400146, + "learning_rate": 9.64234567574843e-06, + "loss": 1.0443, + "step": 1914 + }, + { + "epoch": 0.14762565525747764, + "grad_norm": 4.323209285736084, + "learning_rate": 9.641881845790858e-06, + "loss": 1.0813, + "step": 1915 + }, + { + "epoch": 0.14770274437249462, + "grad_norm": 3.8409078121185303, + "learning_rate": 9.641417726435991e-06, + "loss": 1.0383, + "step": 1916 + }, + { + "epoch": 0.14777983348751156, + "grad_norm": 3.611025810241699, + "learning_rate": 9.64095331771277e-06, + "loss": 1.0272, + "step": 1917 + }, + { + "epoch": 0.14785692260252853, + "grad_norm": 4.067416667938232, + "learning_rate": 9.640488619650145e-06, + "loss": 1.1394, + "step": 1918 + }, + { + "epoch": 0.14793401171754547, + "grad_norm": 3.7739274501800537, + "learning_rate": 9.640023632277088e-06, + "loss": 1.0281, + "step": 1919 + }, + { + "epoch": 0.14801110083256244, + "grad_norm": 4.0226263999938965, + "learning_rate": 9.639558355622589e-06, + "loss": 1.0501, + "step": 1920 + }, + { + "epoch": 0.1480881899475794, + "grad_norm": 4.34377908706665, + "learning_rate": 9.639092789715656e-06, + "loss": 1.0817, + "step": 1921 + }, + { + "epoch": 0.14816527906259636, + "grad_norm": 3.9317402839660645, + "learning_rate": 9.638626934585314e-06, + "loss": 1.0944, + "step": 1922 + }, + { + "epoch": 0.14824236817761333, + "grad_norm": 3.948791265487671, + "learning_rate": 9.638160790260606e-06, + "loss": 1.0707, + "step": 1923 + }, + { + "epoch": 0.14831945729263027, + "grad_norm": 4.429360389709473, + "learning_rate": 9.637694356770595e-06, + "loss": 1.1624, + "step": 1924 + }, + { + "epoch": 0.14839654640764724, + "grad_norm": 4.858312129974365, + "learning_rate": 9.637227634144359e-06, + "loss": 0.995, + "step": 1925 + }, + { + "epoch": 0.1484736355226642, + "grad_norm": 3.9074039459228516, + "learning_rate": 9.636760622410997e-06, + "loss": 1.1135, + "step": 1926 + }, + { + "epoch": 0.14855072463768115, + "grad_norm": 3.9309797286987305, + "learning_rate": 9.636293321599625e-06, + "loss": 1.0696, + "step": 1927 + }, + { + "epoch": 0.14862781375269812, + "grad_norm": 3.8177499771118164, + "learning_rate": 9.635825731739376e-06, + "loss": 1.0943, + "step": 1928 + }, + { + "epoch": 0.14870490286771507, + "grad_norm": 4.232987880706787, + "learning_rate": 9.635357852859404e-06, + "loss": 1.0883, + "step": 1929 + }, + { + "epoch": 0.14878199198273204, + "grad_norm": 4.08406400680542, + "learning_rate": 9.634889684988875e-06, + "loss": 1.1446, + "step": 1930 + }, + { + "epoch": 0.148859081097749, + "grad_norm": 4.512781620025635, + "learning_rate": 9.63442122815698e-06, + "loss": 1.1857, + "step": 1931 + }, + { + "epoch": 0.14893617021276595, + "grad_norm": 4.197699069976807, + "learning_rate": 9.633952482392924e-06, + "loss": 1.0965, + "step": 1932 + }, + { + "epoch": 0.14901325932778292, + "grad_norm": 3.925628185272217, + "learning_rate": 9.63348344772593e-06, + "loss": 1.1148, + "step": 1933 + }, + { + "epoch": 0.14909034844279986, + "grad_norm": 4.278095245361328, + "learning_rate": 9.63301412418524e-06, + "loss": 1.0734, + "step": 1934 + }, + { + "epoch": 0.14916743755781683, + "grad_norm": 4.0269269943237305, + "learning_rate": 9.632544511800114e-06, + "loss": 1.1376, + "step": 1935 + }, + { + "epoch": 0.1492445266728338, + "grad_norm": 3.731536626815796, + "learning_rate": 9.63207461059983e-06, + "loss": 1.0154, + "step": 1936 + }, + { + "epoch": 0.14932161578785075, + "grad_norm": 3.8483800888061523, + "learning_rate": 9.631604420613685e-06, + "loss": 1.0973, + "step": 1937 + }, + { + "epoch": 0.14939870490286772, + "grad_norm": 3.9684271812438965, + "learning_rate": 9.631133941870993e-06, + "loss": 1.0039, + "step": 1938 + }, + { + "epoch": 0.14947579401788466, + "grad_norm": 4.196876049041748, + "learning_rate": 9.630663174401085e-06, + "loss": 1.2552, + "step": 1939 + }, + { + "epoch": 0.14955288313290163, + "grad_norm": 4.365589141845703, + "learning_rate": 9.630192118233309e-06, + "loss": 1.0885, + "step": 1940 + }, + { + "epoch": 0.1496299722479186, + "grad_norm": 3.9205033779144287, + "learning_rate": 9.629720773397036e-06, + "loss": 1.0257, + "step": 1941 + }, + { + "epoch": 0.14970706136293555, + "grad_norm": 3.8212361335754395, + "learning_rate": 9.62924913992165e-06, + "loss": 1.0147, + "step": 1942 + }, + { + "epoch": 0.14978415047795252, + "grad_norm": 4.525760650634766, + "learning_rate": 9.628777217836558e-06, + "loss": 1.1392, + "step": 1943 + }, + { + "epoch": 0.1498612395929695, + "grad_norm": 4.177234172821045, + "learning_rate": 9.628305007171177e-06, + "loss": 1.2419, + "step": 1944 + }, + { + "epoch": 0.14993832870798643, + "grad_norm": 4.351520538330078, + "learning_rate": 9.627832507954949e-06, + "loss": 1.1474, + "step": 1945 + }, + { + "epoch": 0.1500154178230034, + "grad_norm": 4.03515625, + "learning_rate": 9.627359720217334e-06, + "loss": 1.0986, + "step": 1946 + }, + { + "epoch": 0.15009250693802034, + "grad_norm": 3.882413387298584, + "learning_rate": 9.626886643987806e-06, + "loss": 1.1141, + "step": 1947 + }, + { + "epoch": 0.1501695960530373, + "grad_norm": 4.117105007171631, + "learning_rate": 9.626413279295859e-06, + "loss": 1.1688, + "step": 1948 + }, + { + "epoch": 0.15024668516805428, + "grad_norm": 3.7657737731933594, + "learning_rate": 9.625939626171004e-06, + "loss": 1.0316, + "step": 1949 + }, + { + "epoch": 0.15032377428307123, + "grad_norm": 4.3290019035339355, + "learning_rate": 9.625465684642773e-06, + "loss": 1.1699, + "step": 1950 + }, + { + "epoch": 0.1504008633980882, + "grad_norm": 4.567865371704102, + "learning_rate": 9.624991454740709e-06, + "loss": 1.1349, + "step": 1951 + }, + { + "epoch": 0.15047795251310514, + "grad_norm": 3.6431381702423096, + "learning_rate": 9.624516936494385e-06, + "loss": 0.9967, + "step": 1952 + }, + { + "epoch": 0.1505550416281221, + "grad_norm": 4.145571231842041, + "learning_rate": 9.62404212993338e-06, + "loss": 1.1052, + "step": 1953 + }, + { + "epoch": 0.15063213074313908, + "grad_norm": 3.933396339416504, + "learning_rate": 9.623567035087295e-06, + "loss": 1.0349, + "step": 1954 + }, + { + "epoch": 0.15070921985815602, + "grad_norm": 4.341703414916992, + "learning_rate": 9.623091651985754e-06, + "loss": 1.164, + "step": 1955 + }, + { + "epoch": 0.150786308973173, + "grad_norm": 3.696288585662842, + "learning_rate": 9.622615980658391e-06, + "loss": 1.0052, + "step": 1956 + }, + { + "epoch": 0.15086339808818994, + "grad_norm": 4.249678611755371, + "learning_rate": 9.622140021134863e-06, + "loss": 1.1709, + "step": 1957 + }, + { + "epoch": 0.1509404872032069, + "grad_norm": 4.261125087738037, + "learning_rate": 9.621663773444843e-06, + "loss": 1.0577, + "step": 1958 + }, + { + "epoch": 0.15101757631822388, + "grad_norm": 3.759453296661377, + "learning_rate": 9.621187237618024e-06, + "loss": 1.0644, + "step": 1959 + }, + { + "epoch": 0.15109466543324082, + "grad_norm": 4.062593460083008, + "learning_rate": 9.620710413684112e-06, + "loss": 1.1486, + "step": 1960 + }, + { + "epoch": 0.1511717545482578, + "grad_norm": 4.127292156219482, + "learning_rate": 9.62023330167284e-06, + "loss": 1.0529, + "step": 1961 + }, + { + "epoch": 0.15124884366327473, + "grad_norm": 4.187131881713867, + "learning_rate": 9.619755901613947e-06, + "loss": 1.0464, + "step": 1962 + }, + { + "epoch": 0.1513259327782917, + "grad_norm": 3.996009111404419, + "learning_rate": 9.619278213537202e-06, + "loss": 1.1509, + "step": 1963 + }, + { + "epoch": 0.15140302189330868, + "grad_norm": 4.108388423919678, + "learning_rate": 9.618800237472385e-06, + "loss": 1.0326, + "step": 1964 + }, + { + "epoch": 0.15148011100832562, + "grad_norm": 3.868997573852539, + "learning_rate": 9.618321973449294e-06, + "loss": 1.0271, + "step": 1965 + }, + { + "epoch": 0.1515572001233426, + "grad_norm": 4.561322212219238, + "learning_rate": 9.617843421497746e-06, + "loss": 1.0995, + "step": 1966 + }, + { + "epoch": 0.15163428923835953, + "grad_norm": 3.980226755142212, + "learning_rate": 9.617364581647578e-06, + "loss": 0.983, + "step": 1967 + }, + { + "epoch": 0.1517113783533765, + "grad_norm": 4.084739685058594, + "learning_rate": 9.616885453928641e-06, + "loss": 1.257, + "step": 1968 + }, + { + "epoch": 0.15178846746839347, + "grad_norm": 4.914328575134277, + "learning_rate": 9.616406038370809e-06, + "loss": 1.1405, + "step": 1969 + }, + { + "epoch": 0.15186555658341042, + "grad_norm": 3.754298210144043, + "learning_rate": 9.615926335003968e-06, + "loss": 1.0765, + "step": 1970 + }, + { + "epoch": 0.1519426456984274, + "grad_norm": 3.9111487865448, + "learning_rate": 9.615446343858028e-06, + "loss": 1.0048, + "step": 1971 + }, + { + "epoch": 0.15201973481344433, + "grad_norm": 4.179695129394531, + "learning_rate": 9.614966064962911e-06, + "loss": 1.0866, + "step": 1972 + }, + { + "epoch": 0.1520968239284613, + "grad_norm": 3.932072877883911, + "learning_rate": 9.614485498348563e-06, + "loss": 1.0682, + "step": 1973 + }, + { + "epoch": 0.15217391304347827, + "grad_norm": 4.217230796813965, + "learning_rate": 9.614004644044943e-06, + "loss": 1.1172, + "step": 1974 + }, + { + "epoch": 0.1522510021584952, + "grad_norm": 4.254328727722168, + "learning_rate": 9.613523502082029e-06, + "loss": 1.1907, + "step": 1975 + }, + { + "epoch": 0.15232809127351218, + "grad_norm": 4.149200916290283, + "learning_rate": 9.613042072489819e-06, + "loss": 1.1092, + "step": 1976 + }, + { + "epoch": 0.15240518038852913, + "grad_norm": 4.106222152709961, + "learning_rate": 9.612560355298328e-06, + "loss": 1.103, + "step": 1977 + }, + { + "epoch": 0.1524822695035461, + "grad_norm": 3.983444929122925, + "learning_rate": 9.612078350537586e-06, + "loss": 1.0695, + "step": 1978 + }, + { + "epoch": 0.15255935861856307, + "grad_norm": 4.1216959953308105, + "learning_rate": 9.611596058237647e-06, + "loss": 1.1196, + "step": 1979 + }, + { + "epoch": 0.15263644773358, + "grad_norm": 3.9716129302978516, + "learning_rate": 9.611113478428577e-06, + "loss": 1.2409, + "step": 1980 + }, + { + "epoch": 0.15271353684859698, + "grad_norm": 3.9019081592559814, + "learning_rate": 9.610630611140464e-06, + "loss": 0.9612, + "step": 1981 + }, + { + "epoch": 0.15279062596361392, + "grad_norm": 4.146617889404297, + "learning_rate": 9.610147456403412e-06, + "loss": 1.0516, + "step": 1982 + }, + { + "epoch": 0.1528677150786309, + "grad_norm": 3.6473166942596436, + "learning_rate": 9.609664014247542e-06, + "loss": 1.0253, + "step": 1983 + }, + { + "epoch": 0.15294480419364787, + "grad_norm": 4.137308120727539, + "learning_rate": 9.609180284702994e-06, + "loss": 1.0064, + "step": 1984 + }, + { + "epoch": 0.1530218933086648, + "grad_norm": 3.682206869125366, + "learning_rate": 9.608696267799928e-06, + "loss": 1.1105, + "step": 1985 + }, + { + "epoch": 0.15309898242368178, + "grad_norm": 3.6087026596069336, + "learning_rate": 9.608211963568518e-06, + "loss": 0.973, + "step": 1986 + }, + { + "epoch": 0.15317607153869875, + "grad_norm": 4.12633752822876, + "learning_rate": 9.60772737203896e-06, + "loss": 1.1194, + "step": 1987 + }, + { + "epoch": 0.1532531606537157, + "grad_norm": 3.913084030151367, + "learning_rate": 9.607242493241463e-06, + "loss": 1.1383, + "step": 1988 + }, + { + "epoch": 0.15333024976873266, + "grad_norm": 4.2011003494262695, + "learning_rate": 9.606757327206258e-06, + "loss": 1.1338, + "step": 1989 + }, + { + "epoch": 0.1534073388837496, + "grad_norm": 4.860283851623535, + "learning_rate": 9.606271873963591e-06, + "loss": 0.9991, + "step": 1990 + }, + { + "epoch": 0.15348442799876658, + "grad_norm": 3.819664716720581, + "learning_rate": 9.605786133543732e-06, + "loss": 1.1397, + "step": 1991 + }, + { + "epoch": 0.15356151711378355, + "grad_norm": 4.044528961181641, + "learning_rate": 9.60530010597696e-06, + "loss": 1.0723, + "step": 1992 + }, + { + "epoch": 0.1536386062288005, + "grad_norm": 3.8798742294311523, + "learning_rate": 9.604813791293579e-06, + "loss": 1.0981, + "step": 1993 + }, + { + "epoch": 0.15371569534381746, + "grad_norm": 4.1378703117370605, + "learning_rate": 9.604327189523906e-06, + "loss": 1.1575, + "step": 1994 + }, + { + "epoch": 0.1537927844588344, + "grad_norm": 5.460372447967529, + "learning_rate": 9.60384030069828e-06, + "loss": 1.1, + "step": 1995 + }, + { + "epoch": 0.15386987357385137, + "grad_norm": 4.201817035675049, + "learning_rate": 9.603353124847054e-06, + "loss": 1.0258, + "step": 1996 + }, + { + "epoch": 0.15394696268886834, + "grad_norm": 3.7332353591918945, + "learning_rate": 9.602865662000604e-06, + "loss": 1.1201, + "step": 1997 + }, + { + "epoch": 0.1540240518038853, + "grad_norm": 3.7747762203216553, + "learning_rate": 9.602377912189319e-06, + "loss": 1.0083, + "step": 1998 + }, + { + "epoch": 0.15410114091890226, + "grad_norm": 4.299161434173584, + "learning_rate": 9.601889875443605e-06, + "loss": 0.969, + "step": 1999 + }, + { + "epoch": 0.1541782300339192, + "grad_norm": 4.0587263107299805, + "learning_rate": 9.601401551793891e-06, + "loss": 1.0928, + "step": 2000 + }, + { + "epoch": 0.15425531914893617, + "grad_norm": 3.8370776176452637, + "learning_rate": 9.600912941270624e-06, + "loss": 1.1446, + "step": 2001 + }, + { + "epoch": 0.15433240826395314, + "grad_norm": 3.871699333190918, + "learning_rate": 9.600424043904263e-06, + "loss": 1.1099, + "step": 2002 + }, + { + "epoch": 0.15440949737897008, + "grad_norm": 4.0479230880737305, + "learning_rate": 9.599934859725288e-06, + "loss": 1.1099, + "step": 2003 + }, + { + "epoch": 0.15448658649398705, + "grad_norm": 4.1467485427856445, + "learning_rate": 9.599445388764199e-06, + "loss": 1.0611, + "step": 2004 + }, + { + "epoch": 0.154563675609004, + "grad_norm": 3.658740997314453, + "learning_rate": 9.598955631051512e-06, + "loss": 1.0018, + "step": 2005 + }, + { + "epoch": 0.15464076472402097, + "grad_norm": 4.59723424911499, + "learning_rate": 9.598465586617757e-06, + "loss": 1.1884, + "step": 2006 + }, + { + "epoch": 0.15471785383903794, + "grad_norm": 3.7733352184295654, + "learning_rate": 9.597975255493492e-06, + "loss": 1.0512, + "step": 2007 + }, + { + "epoch": 0.15479494295405488, + "grad_norm": 4.303390026092529, + "learning_rate": 9.597484637709282e-06, + "loss": 1.0479, + "step": 2008 + }, + { + "epoch": 0.15487203206907185, + "grad_norm": 3.650953769683838, + "learning_rate": 9.596993733295717e-06, + "loss": 0.9842, + "step": 2009 + }, + { + "epoch": 0.1549491211840888, + "grad_norm": 3.5866963863372803, + "learning_rate": 9.596502542283399e-06, + "loss": 1.1231, + "step": 2010 + }, + { + "epoch": 0.15502621029910577, + "grad_norm": 3.8881676197052, + "learning_rate": 9.596011064702954e-06, + "loss": 1.1773, + "step": 2011 + }, + { + "epoch": 0.15510329941412274, + "grad_norm": 3.849320650100708, + "learning_rate": 9.595519300585024e-06, + "loss": 1.0562, + "step": 2012 + }, + { + "epoch": 0.15518038852913968, + "grad_norm": 3.8360087871551514, + "learning_rate": 9.595027249960264e-06, + "loss": 1.17, + "step": 2013 + }, + { + "epoch": 0.15525747764415665, + "grad_norm": 3.9184608459472656, + "learning_rate": 9.594534912859356e-06, + "loss": 1.0578, + "step": 2014 + }, + { + "epoch": 0.1553345667591736, + "grad_norm": 4.464005470275879, + "learning_rate": 9.59404228931299e-06, + "loss": 1.0775, + "step": 2015 + }, + { + "epoch": 0.15541165587419056, + "grad_norm": 3.8075225353240967, + "learning_rate": 9.59354937935188e-06, + "loss": 0.9951, + "step": 2016 + }, + { + "epoch": 0.15548874498920753, + "grad_norm": 3.766676902770996, + "learning_rate": 9.59305618300676e-06, + "loss": 1.1488, + "step": 2017 + }, + { + "epoch": 0.15556583410422448, + "grad_norm": 3.7311699390411377, + "learning_rate": 9.592562700308372e-06, + "loss": 1.1167, + "step": 2018 + }, + { + "epoch": 0.15564292321924145, + "grad_norm": 4.18145751953125, + "learning_rate": 9.592068931287486e-06, + "loss": 1.1209, + "step": 2019 + }, + { + "epoch": 0.1557200123342584, + "grad_norm": 3.6943624019622803, + "learning_rate": 9.591574875974884e-06, + "loss": 1.0483, + "step": 2020 + }, + { + "epoch": 0.15579710144927536, + "grad_norm": 3.553483009338379, + "learning_rate": 9.591080534401371e-06, + "loss": 1.0577, + "step": 2021 + }, + { + "epoch": 0.15587419056429233, + "grad_norm": 3.7024497985839844, + "learning_rate": 9.590585906597764e-06, + "loss": 1.0646, + "step": 2022 + }, + { + "epoch": 0.15595127967930927, + "grad_norm": 4.6477837562561035, + "learning_rate": 9.590090992594901e-06, + "loss": 1.0384, + "step": 2023 + }, + { + "epoch": 0.15602836879432624, + "grad_norm": 3.604308843612671, + "learning_rate": 9.589595792423636e-06, + "loss": 0.9234, + "step": 2024 + }, + { + "epoch": 0.1561054579093432, + "grad_norm": 4.46380090713501, + "learning_rate": 9.589100306114842e-06, + "loss": 1.1194, + "step": 2025 + }, + { + "epoch": 0.15618254702436016, + "grad_norm": 4.057259559631348, + "learning_rate": 9.588604533699415e-06, + "loss": 1.1086, + "step": 2026 + }, + { + "epoch": 0.15625963613937713, + "grad_norm": 3.9798672199249268, + "learning_rate": 9.58810847520826e-06, + "loss": 1.0903, + "step": 2027 + }, + { + "epoch": 0.15633672525439407, + "grad_norm": 4.816097259521484, + "learning_rate": 9.587612130672302e-06, + "loss": 1.107, + "step": 2028 + }, + { + "epoch": 0.15641381436941104, + "grad_norm": 4.069070339202881, + "learning_rate": 9.587115500122489e-06, + "loss": 1.0659, + "step": 2029 + }, + { + "epoch": 0.156490903484428, + "grad_norm": 4.3908796310424805, + "learning_rate": 9.58661858358978e-06, + "loss": 0.9479, + "step": 2030 + }, + { + "epoch": 0.15656799259944496, + "grad_norm": 3.6012563705444336, + "learning_rate": 9.586121381105158e-06, + "loss": 1.0304, + "step": 2031 + }, + { + "epoch": 0.15664508171446193, + "grad_norm": 4.042422771453857, + "learning_rate": 9.58562389269962e-06, + "loss": 1.08, + "step": 2032 + }, + { + "epoch": 0.15672217082947887, + "grad_norm": 4.079514980316162, + "learning_rate": 9.585126118404183e-06, + "loss": 1.1668, + "step": 2033 + }, + { + "epoch": 0.15679925994449584, + "grad_norm": 3.817542314529419, + "learning_rate": 9.584628058249878e-06, + "loss": 1.0957, + "step": 2034 + }, + { + "epoch": 0.1568763490595128, + "grad_norm": 4.3651604652404785, + "learning_rate": 9.584129712267759e-06, + "loss": 1.0863, + "step": 2035 + }, + { + "epoch": 0.15695343817452975, + "grad_norm": 4.226108074188232, + "learning_rate": 9.583631080488893e-06, + "loss": 1.0109, + "step": 2036 + }, + { + "epoch": 0.15703052728954672, + "grad_norm": 3.7006771564483643, + "learning_rate": 9.58313216294437e-06, + "loss": 0.9356, + "step": 2037 + }, + { + "epoch": 0.15710761640456367, + "grad_norm": 4.0176100730896, + "learning_rate": 9.582632959665293e-06, + "loss": 1.0112, + "step": 2038 + }, + { + "epoch": 0.15718470551958064, + "grad_norm": 4.495166301727295, + "learning_rate": 9.582133470682785e-06, + "loss": 1.149, + "step": 2039 + }, + { + "epoch": 0.1572617946345976, + "grad_norm": 4.30600643157959, + "learning_rate": 9.581633696027986e-06, + "loss": 1.0351, + "step": 2040 + }, + { + "epoch": 0.15733888374961455, + "grad_norm": 4.271859645843506, + "learning_rate": 9.581133635732053e-06, + "loss": 1.2107, + "step": 2041 + }, + { + "epoch": 0.15741597286463152, + "grad_norm": 4.35610818862915, + "learning_rate": 9.580633289826166e-06, + "loss": 1.2124, + "step": 2042 + }, + { + "epoch": 0.15749306197964846, + "grad_norm": 3.801640272140503, + "learning_rate": 9.580132658341519e-06, + "loss": 1.0812, + "step": 2043 + }, + { + "epoch": 0.15757015109466543, + "grad_norm": 3.8817687034606934, + "learning_rate": 9.579631741309319e-06, + "loss": 1.1933, + "step": 2044 + }, + { + "epoch": 0.1576472402096824, + "grad_norm": 3.9286177158355713, + "learning_rate": 9.5791305387608e-06, + "loss": 1.0202, + "step": 2045 + }, + { + "epoch": 0.15772432932469935, + "grad_norm": 4.174068450927734, + "learning_rate": 9.578629050727208e-06, + "loss": 1.1846, + "step": 2046 + }, + { + "epoch": 0.15780141843971632, + "grad_norm": 4.3177409172058105, + "learning_rate": 9.578127277239807e-06, + "loss": 1.0551, + "step": 2047 + }, + { + "epoch": 0.15787850755473326, + "grad_norm": 4.210177898406982, + "learning_rate": 9.577625218329882e-06, + "loss": 1.0957, + "step": 2048 + }, + { + "epoch": 0.15795559666975023, + "grad_norm": 4.341221332550049, + "learning_rate": 9.577122874028733e-06, + "loss": 1.133, + "step": 2049 + }, + { + "epoch": 0.1580326857847672, + "grad_norm": 4.481259822845459, + "learning_rate": 9.576620244367676e-06, + "loss": 1.1467, + "step": 2050 + }, + { + "epoch": 0.15810977489978414, + "grad_norm": 4.296179294586182, + "learning_rate": 9.576117329378051e-06, + "loss": 1.0854, + "step": 2051 + }, + { + "epoch": 0.15818686401480112, + "grad_norm": 4.161882400512695, + "learning_rate": 9.575614129091211e-06, + "loss": 1.099, + "step": 2052 + }, + { + "epoch": 0.15826395312981806, + "grad_norm": 4.072216033935547, + "learning_rate": 9.575110643538528e-06, + "loss": 1.1539, + "step": 2053 + }, + { + "epoch": 0.15834104224483503, + "grad_norm": 3.7138614654541016, + "learning_rate": 9.574606872751391e-06, + "loss": 1.1287, + "step": 2054 + }, + { + "epoch": 0.158418131359852, + "grad_norm": 4.442841529846191, + "learning_rate": 9.574102816761209e-06, + "loss": 1.12, + "step": 2055 + }, + { + "epoch": 0.15849522047486894, + "grad_norm": 4.6196393966674805, + "learning_rate": 9.573598475599405e-06, + "loss": 1.0678, + "step": 2056 + }, + { + "epoch": 0.1585723095898859, + "grad_norm": 3.877990961074829, + "learning_rate": 9.573093849297423e-06, + "loss": 0.9528, + "step": 2057 + }, + { + "epoch": 0.15864939870490286, + "grad_norm": 4.252166748046875, + "learning_rate": 9.572588937886727e-06, + "loss": 1.0336, + "step": 2058 + }, + { + "epoch": 0.15872648781991983, + "grad_norm": 3.8500077724456787, + "learning_rate": 9.57208374139879e-06, + "loss": 1.1263, + "step": 2059 + }, + { + "epoch": 0.1588035769349368, + "grad_norm": 3.8318703174591064, + "learning_rate": 9.571578259865112e-06, + "loss": 1.0754, + "step": 2060 + }, + { + "epoch": 0.15888066604995374, + "grad_norm": 4.030394554138184, + "learning_rate": 9.571072493317207e-06, + "loss": 1.0642, + "step": 2061 + }, + { + "epoch": 0.1589577551649707, + "grad_norm": 4.103557109832764, + "learning_rate": 9.570566441786605e-06, + "loss": 1.0528, + "step": 2062 + }, + { + "epoch": 0.15903484427998765, + "grad_norm": 3.8708770275115967, + "learning_rate": 9.570060105304856e-06, + "loss": 1.0407, + "step": 2063 + }, + { + "epoch": 0.15911193339500462, + "grad_norm": 4.086610794067383, + "learning_rate": 9.569553483903531e-06, + "loss": 1.1052, + "step": 2064 + }, + { + "epoch": 0.1591890225100216, + "grad_norm": 3.6814303398132324, + "learning_rate": 9.569046577614212e-06, + "loss": 1.0038, + "step": 2065 + }, + { + "epoch": 0.15926611162503854, + "grad_norm": 4.457508087158203, + "learning_rate": 9.568539386468501e-06, + "loss": 1.1407, + "step": 2066 + }, + { + "epoch": 0.1593432007400555, + "grad_norm": 3.961879253387451, + "learning_rate": 9.568031910498021e-06, + "loss": 1.18, + "step": 2067 + }, + { + "epoch": 0.15942028985507245, + "grad_norm": 4.2065253257751465, + "learning_rate": 9.56752414973441e-06, + "loss": 0.9892, + "step": 2068 + }, + { + "epoch": 0.15949737897008942, + "grad_norm": 3.6328790187835693, + "learning_rate": 9.567016104209326e-06, + "loss": 1.0919, + "step": 2069 + }, + { + "epoch": 0.1595744680851064, + "grad_norm": 3.8659377098083496, + "learning_rate": 9.56650777395444e-06, + "loss": 1.0576, + "step": 2070 + }, + { + "epoch": 0.15965155720012333, + "grad_norm": 4.030930995941162, + "learning_rate": 9.565999159001442e-06, + "loss": 0.9824, + "step": 2071 + }, + { + "epoch": 0.1597286463151403, + "grad_norm": 4.241566181182861, + "learning_rate": 9.565490259382047e-06, + "loss": 1.0389, + "step": 2072 + }, + { + "epoch": 0.15980573543015728, + "grad_norm": 4.0185089111328125, + "learning_rate": 9.564981075127979e-06, + "loss": 1.0236, + "step": 2073 + }, + { + "epoch": 0.15988282454517422, + "grad_norm": 4.9292826652526855, + "learning_rate": 9.564471606270985e-06, + "loss": 1.0604, + "step": 2074 + }, + { + "epoch": 0.1599599136601912, + "grad_norm": 4.441895008087158, + "learning_rate": 9.563961852842824e-06, + "loss": 1.0312, + "step": 2075 + }, + { + "epoch": 0.16003700277520813, + "grad_norm": 3.789350748062134, + "learning_rate": 9.56345181487528e-06, + "loss": 1.073, + "step": 2076 + }, + { + "epoch": 0.1601140918902251, + "grad_norm": 4.255967617034912, + "learning_rate": 9.562941492400149e-06, + "loss": 1.039, + "step": 2077 + }, + { + "epoch": 0.16019118100524207, + "grad_norm": 3.9734389781951904, + "learning_rate": 9.56243088544925e-06, + "loss": 1.0608, + "step": 2078 + }, + { + "epoch": 0.16026827012025902, + "grad_norm": 4.481680870056152, + "learning_rate": 9.561919994054414e-06, + "loss": 1.1822, + "step": 2079 + }, + { + "epoch": 0.16034535923527599, + "grad_norm": 4.583976745605469, + "learning_rate": 9.561408818247493e-06, + "loss": 1.1088, + "step": 2080 + }, + { + "epoch": 0.16042244835029293, + "grad_norm": 3.912966012954712, + "learning_rate": 9.560897358060355e-06, + "loss": 1.0697, + "step": 2081 + }, + { + "epoch": 0.1604995374653099, + "grad_norm": 3.9257588386535645, + "learning_rate": 9.56038561352489e-06, + "loss": 1.0764, + "step": 2082 + }, + { + "epoch": 0.16057662658032687, + "grad_norm": 3.7236075401306152, + "learning_rate": 9.559873584673e-06, + "loss": 1.0295, + "step": 2083 + }, + { + "epoch": 0.1606537156953438, + "grad_norm": 4.096950531005859, + "learning_rate": 9.55936127153661e-06, + "loss": 1.0696, + "step": 2084 + }, + { + "epoch": 0.16073080481036078, + "grad_norm": 3.90035080909729, + "learning_rate": 9.558848674147657e-06, + "loss": 1.1163, + "step": 2085 + }, + { + "epoch": 0.16080789392537773, + "grad_norm": 3.8980672359466553, + "learning_rate": 9.558335792538099e-06, + "loss": 1.0165, + "step": 2086 + }, + { + "epoch": 0.1608849830403947, + "grad_norm": 4.154537200927734, + "learning_rate": 9.557822626739912e-06, + "loss": 1.0332, + "step": 2087 + }, + { + "epoch": 0.16096207215541167, + "grad_norm": 3.6484971046447754, + "learning_rate": 9.557309176785092e-06, + "loss": 1.011, + "step": 2088 + }, + { + "epoch": 0.1610391612704286, + "grad_norm": 4.025908470153809, + "learning_rate": 9.556795442705647e-06, + "loss": 1.0423, + "step": 2089 + }, + { + "epoch": 0.16111625038544558, + "grad_norm": 4.023404598236084, + "learning_rate": 9.556281424533606e-06, + "loss": 1.0278, + "step": 2090 + }, + { + "epoch": 0.16119333950046252, + "grad_norm": 4.211247444152832, + "learning_rate": 9.555767122301016e-06, + "loss": 1.0354, + "step": 2091 + }, + { + "epoch": 0.1612704286154795, + "grad_norm": 4.087972640991211, + "learning_rate": 9.55525253603994e-06, + "loss": 1.0494, + "step": 2092 + }, + { + "epoch": 0.16134751773049646, + "grad_norm": 3.722517728805542, + "learning_rate": 9.554737665782464e-06, + "loss": 1.0458, + "step": 2093 + }, + { + "epoch": 0.1614246068455134, + "grad_norm": 3.7930004596710205, + "learning_rate": 9.55422251156068e-06, + "loss": 1.0477, + "step": 2094 + }, + { + "epoch": 0.16150169596053038, + "grad_norm": 3.9096274375915527, + "learning_rate": 9.55370707340671e-06, + "loss": 1.1077, + "step": 2095 + }, + { + "epoch": 0.16157878507554732, + "grad_norm": 3.9330289363861084, + "learning_rate": 9.553191351352691e-06, + "loss": 0.9962, + "step": 2096 + }, + { + "epoch": 0.1616558741905643, + "grad_norm": 4.029851913452148, + "learning_rate": 9.55267534543077e-06, + "loss": 1.138, + "step": 2097 + }, + { + "epoch": 0.16173296330558126, + "grad_norm": 4.2497334480285645, + "learning_rate": 9.552159055673122e-06, + "loss": 1.0651, + "step": 2098 + }, + { + "epoch": 0.1618100524205982, + "grad_norm": 3.8570566177368164, + "learning_rate": 9.551642482111931e-06, + "loss": 1.035, + "step": 2099 + }, + { + "epoch": 0.16188714153561518, + "grad_norm": 4.435809135437012, + "learning_rate": 9.551125624779407e-06, + "loss": 1.1124, + "step": 2100 + }, + { + "epoch": 0.16196423065063212, + "grad_norm": 4.071799278259277, + "learning_rate": 9.55060848370777e-06, + "loss": 1.1761, + "step": 2101 + }, + { + "epoch": 0.1620413197656491, + "grad_norm": 3.8368101119995117, + "learning_rate": 9.550091058929264e-06, + "loss": 1.1273, + "step": 2102 + }, + { + "epoch": 0.16211840888066606, + "grad_norm": 4.056320667266846, + "learning_rate": 9.549573350476142e-06, + "loss": 1.1069, + "step": 2103 + }, + { + "epoch": 0.162195497995683, + "grad_norm": 4.4123148918151855, + "learning_rate": 9.549055358380687e-06, + "loss": 1.0627, + "step": 2104 + }, + { + "epoch": 0.16227258711069997, + "grad_norm": 3.741130828857422, + "learning_rate": 9.548537082675191e-06, + "loss": 1.1405, + "step": 2105 + }, + { + "epoch": 0.16234967622571692, + "grad_norm": 3.8030614852905273, + "learning_rate": 9.548018523391965e-06, + "loss": 1.1287, + "step": 2106 + }, + { + "epoch": 0.1624267653407339, + "grad_norm": 3.879117727279663, + "learning_rate": 9.54749968056334e-06, + "loss": 1.035, + "step": 2107 + }, + { + "epoch": 0.16250385445575086, + "grad_norm": 3.7147738933563232, + "learning_rate": 9.546980554221659e-06, + "loss": 0.9949, + "step": 2108 + }, + { + "epoch": 0.1625809435707678, + "grad_norm": 4.359158515930176, + "learning_rate": 9.546461144399293e-06, + "loss": 1.069, + "step": 2109 + }, + { + "epoch": 0.16265803268578477, + "grad_norm": 3.803569793701172, + "learning_rate": 9.54594145112862e-06, + "loss": 1.0991, + "step": 2110 + }, + { + "epoch": 0.1627351218008017, + "grad_norm": 3.9304962158203125, + "learning_rate": 9.545421474442042e-06, + "loss": 1.0607, + "step": 2111 + }, + { + "epoch": 0.16281221091581868, + "grad_norm": 4.038872718811035, + "learning_rate": 9.544901214371976e-06, + "loss": 1.0551, + "step": 2112 + }, + { + "epoch": 0.16288930003083565, + "grad_norm": 4.135643482208252, + "learning_rate": 9.544380670950857e-06, + "loss": 1.0892, + "step": 2113 + }, + { + "epoch": 0.1629663891458526, + "grad_norm": 4.121045112609863, + "learning_rate": 9.543859844211141e-06, + "loss": 1.0992, + "step": 2114 + }, + { + "epoch": 0.16304347826086957, + "grad_norm": 3.6090307235717773, + "learning_rate": 9.543338734185295e-06, + "loss": 1.0014, + "step": 2115 + }, + { + "epoch": 0.16312056737588654, + "grad_norm": 3.578199625015259, + "learning_rate": 9.54281734090581e-06, + "loss": 1.0165, + "step": 2116 + }, + { + "epoch": 0.16319765649090348, + "grad_norm": 4.004699230194092, + "learning_rate": 9.542295664405193e-06, + "loss": 1.1468, + "step": 2117 + }, + { + "epoch": 0.16327474560592045, + "grad_norm": 3.9040887355804443, + "learning_rate": 9.541773704715966e-06, + "loss": 1.0745, + "step": 2118 + }, + { + "epoch": 0.1633518347209374, + "grad_norm": 3.8351752758026123, + "learning_rate": 9.541251461870672e-06, + "loss": 1.1056, + "step": 2119 + }, + { + "epoch": 0.16342892383595437, + "grad_norm": 3.704251527786255, + "learning_rate": 9.540728935901867e-06, + "loss": 0.9895, + "step": 2120 + }, + { + "epoch": 0.16350601295097134, + "grad_norm": 4.368956565856934, + "learning_rate": 9.540206126842129e-06, + "loss": 1.0922, + "step": 2121 + }, + { + "epoch": 0.16358310206598828, + "grad_norm": 3.9333720207214355, + "learning_rate": 9.539683034724054e-06, + "loss": 1.1807, + "step": 2122 + }, + { + "epoch": 0.16366019118100525, + "grad_norm": 3.7927298545837402, + "learning_rate": 9.539159659580254e-06, + "loss": 1.0409, + "step": 2123 + }, + { + "epoch": 0.1637372802960222, + "grad_norm": 4.061433792114258, + "learning_rate": 9.53863600144336e-06, + "loss": 1.1204, + "step": 2124 + }, + { + "epoch": 0.16381436941103916, + "grad_norm": 3.5982680320739746, + "learning_rate": 9.538112060346016e-06, + "loss": 1.157, + "step": 2125 + }, + { + "epoch": 0.16389145852605613, + "grad_norm": 4.211023330688477, + "learning_rate": 9.537587836320887e-06, + "loss": 1.0758, + "step": 2126 + }, + { + "epoch": 0.16396854764107308, + "grad_norm": 3.861417055130005, + "learning_rate": 9.53706332940066e-06, + "loss": 1.1376, + "step": 2127 + }, + { + "epoch": 0.16404563675609005, + "grad_norm": 3.5378191471099854, + "learning_rate": 9.536538539618031e-06, + "loss": 1.0481, + "step": 2128 + }, + { + "epoch": 0.164122725871107, + "grad_norm": 3.9547457695007324, + "learning_rate": 9.53601346700572e-06, + "loss": 1.0943, + "step": 2129 + }, + { + "epoch": 0.16419981498612396, + "grad_norm": 3.9615726470947266, + "learning_rate": 9.53548811159646e-06, + "loss": 1.0865, + "step": 2130 + }, + { + "epoch": 0.16427690410114093, + "grad_norm": 3.7515220642089844, + "learning_rate": 9.534962473423008e-06, + "loss": 1.1495, + "step": 2131 + }, + { + "epoch": 0.16435399321615787, + "grad_norm": 4.2690534591674805, + "learning_rate": 9.534436552518134e-06, + "loss": 1.0762, + "step": 2132 + }, + { + "epoch": 0.16443108233117484, + "grad_norm": 4.034238338470459, + "learning_rate": 9.533910348914624e-06, + "loss": 1.2103, + "step": 2133 + }, + { + "epoch": 0.1645081714461918, + "grad_norm": 3.70621395111084, + "learning_rate": 9.533383862645285e-06, + "loss": 1.0361, + "step": 2134 + }, + { + "epoch": 0.16458526056120876, + "grad_norm": 4.004809856414795, + "learning_rate": 9.532857093742941e-06, + "loss": 1.1349, + "step": 2135 + }, + { + "epoch": 0.16466234967622573, + "grad_norm": 4.048299312591553, + "learning_rate": 9.532330042240434e-06, + "loss": 1.1853, + "step": 2136 + }, + { + "epoch": 0.16473943879124267, + "grad_norm": 4.68448543548584, + "learning_rate": 9.531802708170623e-06, + "loss": 1.0591, + "step": 2137 + }, + { + "epoch": 0.16481652790625964, + "grad_norm": 3.5135207176208496, + "learning_rate": 9.531275091566384e-06, + "loss": 0.9884, + "step": 2138 + }, + { + "epoch": 0.16489361702127658, + "grad_norm": 4.260873317718506, + "learning_rate": 9.530747192460609e-06, + "loss": 1.1469, + "step": 2139 + }, + { + "epoch": 0.16497070613629355, + "grad_norm": 3.82432222366333, + "learning_rate": 9.530219010886214e-06, + "loss": 1.1344, + "step": 2140 + }, + { + "epoch": 0.16504779525131053, + "grad_norm": 4.045070171356201, + "learning_rate": 9.529690546876125e-06, + "loss": 1.0296, + "step": 2141 + }, + { + "epoch": 0.16512488436632747, + "grad_norm": 4.116434574127197, + "learning_rate": 9.529161800463291e-06, + "loss": 1.1479, + "step": 2142 + }, + { + "epoch": 0.16520197348134444, + "grad_norm": 3.7906370162963867, + "learning_rate": 9.528632771680677e-06, + "loss": 1.0819, + "step": 2143 + }, + { + "epoch": 0.16527906259636138, + "grad_norm": 4.043308734893799, + "learning_rate": 9.528103460561262e-06, + "loss": 1.1089, + "step": 2144 + }, + { + "epoch": 0.16535615171137835, + "grad_norm": 4.255034446716309, + "learning_rate": 9.52757386713805e-06, + "loss": 1.0368, + "step": 2145 + }, + { + "epoch": 0.16543324082639532, + "grad_norm": 3.905506134033203, + "learning_rate": 9.527043991444053e-06, + "loss": 0.9421, + "step": 2146 + }, + { + "epoch": 0.16551032994141227, + "grad_norm": 4.145698070526123, + "learning_rate": 9.526513833512312e-06, + "loss": 1.1444, + "step": 2147 + }, + { + "epoch": 0.16558741905642924, + "grad_norm": 3.9307868480682373, + "learning_rate": 9.525983393375877e-06, + "loss": 1.1171, + "step": 2148 + }, + { + "epoch": 0.16566450817144618, + "grad_norm": 3.6001784801483154, + "learning_rate": 9.525452671067816e-06, + "loss": 0.9414, + "step": 2149 + }, + { + "epoch": 0.16574159728646315, + "grad_norm": 4.438070774078369, + "learning_rate": 9.524921666621221e-06, + "loss": 1.1429, + "step": 2150 + }, + { + "epoch": 0.16581868640148012, + "grad_norm": 3.5969676971435547, + "learning_rate": 9.524390380069195e-06, + "loss": 0.97, + "step": 2151 + }, + { + "epoch": 0.16589577551649706, + "grad_norm": 4.561358451843262, + "learning_rate": 9.52385881144486e-06, + "loss": 1.0779, + "step": 2152 + }, + { + "epoch": 0.16597286463151403, + "grad_norm": 3.637179136276245, + "learning_rate": 9.52332696078136e-06, + "loss": 0.9986, + "step": 2153 + }, + { + "epoch": 0.16604995374653098, + "grad_norm": 3.7705628871917725, + "learning_rate": 9.522794828111849e-06, + "loss": 1.0871, + "step": 2154 + }, + { + "epoch": 0.16612704286154795, + "grad_norm": 3.748274803161621, + "learning_rate": 9.522262413469505e-06, + "loss": 1.0919, + "step": 2155 + }, + { + "epoch": 0.16620413197656492, + "grad_norm": 3.7525219917297363, + "learning_rate": 9.52172971688752e-06, + "loss": 1.043, + "step": 2156 + }, + { + "epoch": 0.16628122109158186, + "grad_norm": 4.160288333892822, + "learning_rate": 9.521196738399107e-06, + "loss": 1.0777, + "step": 2157 + }, + { + "epoch": 0.16635831020659883, + "grad_norm": 4.212968826293945, + "learning_rate": 9.520663478037493e-06, + "loss": 0.9929, + "step": 2158 + }, + { + "epoch": 0.1664353993216158, + "grad_norm": 4.7758588790893555, + "learning_rate": 9.520129935835924e-06, + "loss": 1.0723, + "step": 2159 + }, + { + "epoch": 0.16651248843663274, + "grad_norm": 4.326691150665283, + "learning_rate": 9.519596111827665e-06, + "loss": 0.9742, + "step": 2160 + }, + { + "epoch": 0.16658957755164971, + "grad_norm": 3.9504857063293457, + "learning_rate": 9.519062006045995e-06, + "loss": 1.0658, + "step": 2161 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 3.9129183292388916, + "learning_rate": 9.518527618524213e-06, + "loss": 1.1364, + "step": 2162 + }, + { + "epoch": 0.16674375578168363, + "grad_norm": 3.5338165760040283, + "learning_rate": 9.517992949295637e-06, + "loss": 1.0188, + "step": 2163 + }, + { + "epoch": 0.1668208448967006, + "grad_norm": 3.9790713787078857, + "learning_rate": 9.5174579983936e-06, + "loss": 1.0366, + "step": 2164 + }, + { + "epoch": 0.16689793401171754, + "grad_norm": 3.752887725830078, + "learning_rate": 9.516922765851453e-06, + "loss": 1.0697, + "step": 2165 + }, + { + "epoch": 0.1669750231267345, + "grad_norm": 4.03433895111084, + "learning_rate": 9.516387251702566e-06, + "loss": 1.1041, + "step": 2166 + }, + { + "epoch": 0.16705211224175145, + "grad_norm": 4.188197612762451, + "learning_rate": 9.515851455980325e-06, + "loss": 1.0372, + "step": 2167 + }, + { + "epoch": 0.16712920135676843, + "grad_norm": 3.5528385639190674, + "learning_rate": 9.515315378718132e-06, + "loss": 1.021, + "step": 2168 + }, + { + "epoch": 0.1672062904717854, + "grad_norm": 4.173407077789307, + "learning_rate": 9.514779019949411e-06, + "loss": 1.0054, + "step": 2169 + }, + { + "epoch": 0.16728337958680234, + "grad_norm": 3.9701037406921387, + "learning_rate": 9.514242379707604e-06, + "loss": 1.0418, + "step": 2170 + }, + { + "epoch": 0.1673604687018193, + "grad_norm": 4.371380805969238, + "learning_rate": 9.513705458026162e-06, + "loss": 1.1153, + "step": 2171 + }, + { + "epoch": 0.16743755781683625, + "grad_norm": 4.344913482666016, + "learning_rate": 9.51316825493856e-06, + "loss": 1.0405, + "step": 2172 + }, + { + "epoch": 0.16751464693185322, + "grad_norm": 4.681027412414551, + "learning_rate": 9.512630770478292e-06, + "loss": 1.0118, + "step": 2173 + }, + { + "epoch": 0.1675917360468702, + "grad_norm": 4.137608528137207, + "learning_rate": 9.51209300467887e-06, + "loss": 1.1043, + "step": 2174 + }, + { + "epoch": 0.16766882516188714, + "grad_norm": 3.731459379196167, + "learning_rate": 9.511554957573816e-06, + "loss": 0.901, + "step": 2175 + }, + { + "epoch": 0.1677459142769041, + "grad_norm": 3.322524070739746, + "learning_rate": 9.511016629196676e-06, + "loss": 0.9328, + "step": 2176 + }, + { + "epoch": 0.16782300339192105, + "grad_norm": 3.5919430255889893, + "learning_rate": 9.510478019581013e-06, + "loss": 0.9859, + "step": 2177 + }, + { + "epoch": 0.16790009250693802, + "grad_norm": 3.575169086456299, + "learning_rate": 9.509939128760406e-06, + "loss": 1.0423, + "step": 2178 + }, + { + "epoch": 0.167977181621955, + "grad_norm": 4.213058948516846, + "learning_rate": 9.509399956768452e-06, + "loss": 1.0613, + "step": 2179 + }, + { + "epoch": 0.16805427073697193, + "grad_norm": 3.759490966796875, + "learning_rate": 9.508860503638765e-06, + "loss": 1.0447, + "step": 2180 + }, + { + "epoch": 0.1681313598519889, + "grad_norm": 4.783963203430176, + "learning_rate": 9.508320769404978e-06, + "loss": 1.0838, + "step": 2181 + }, + { + "epoch": 0.16820844896700585, + "grad_norm": 3.997779369354248, + "learning_rate": 9.507780754100739e-06, + "loss": 1.156, + "step": 2182 + }, + { + "epoch": 0.16828553808202282, + "grad_norm": 3.6696715354919434, + "learning_rate": 9.507240457759717e-06, + "loss": 1.0282, + "step": 2183 + }, + { + "epoch": 0.1683626271970398, + "grad_norm": 3.8719160556793213, + "learning_rate": 9.506699880415597e-06, + "loss": 1.0963, + "step": 2184 + }, + { + "epoch": 0.16843971631205673, + "grad_norm": 3.7864603996276855, + "learning_rate": 9.50615902210208e-06, + "loss": 1.0538, + "step": 2185 + }, + { + "epoch": 0.1685168054270737, + "grad_norm": 4.357522964477539, + "learning_rate": 9.505617882852884e-06, + "loss": 1.0002, + "step": 2186 + }, + { + "epoch": 0.16859389454209064, + "grad_norm": 4.590842247009277, + "learning_rate": 9.505076462701752e-06, + "loss": 1.0613, + "step": 2187 + }, + { + "epoch": 0.16867098365710761, + "grad_norm": 5.570279121398926, + "learning_rate": 9.504534761682431e-06, + "loss": 1.0677, + "step": 2188 + }, + { + "epoch": 0.16874807277212459, + "grad_norm": 3.9890170097351074, + "learning_rate": 9.503992779828698e-06, + "loss": 0.9587, + "step": 2189 + }, + { + "epoch": 0.16882516188714153, + "grad_norm": 3.743421792984009, + "learning_rate": 9.503450517174344e-06, + "loss": 1.1186, + "step": 2190 + }, + { + "epoch": 0.1689022510021585, + "grad_norm": 4.146291255950928, + "learning_rate": 9.502907973753173e-06, + "loss": 1.1364, + "step": 2191 + }, + { + "epoch": 0.16897934011717544, + "grad_norm": 4.190448760986328, + "learning_rate": 9.50236514959901e-06, + "loss": 1.1372, + "step": 2192 + }, + { + "epoch": 0.1690564292321924, + "grad_norm": 4.3127641677856445, + "learning_rate": 9.501822044745701e-06, + "loss": 1.1888, + "step": 2193 + }, + { + "epoch": 0.16913351834720938, + "grad_norm": 3.6845946311950684, + "learning_rate": 9.501278659227101e-06, + "loss": 0.9711, + "step": 2194 + }, + { + "epoch": 0.16921060746222633, + "grad_norm": 3.9466843605041504, + "learning_rate": 9.500734993077089e-06, + "loss": 1.0328, + "step": 2195 + }, + { + "epoch": 0.1692876965772433, + "grad_norm": 4.372476577758789, + "learning_rate": 9.500191046329561e-06, + "loss": 1.0785, + "step": 2196 + }, + { + "epoch": 0.16936478569226024, + "grad_norm": 3.853376865386963, + "learning_rate": 9.499646819018429e-06, + "loss": 1.0748, + "step": 2197 + }, + { + "epoch": 0.1694418748072772, + "grad_norm": 3.7514140605926514, + "learning_rate": 9.499102311177622e-06, + "loss": 1.1136, + "step": 2198 + }, + { + "epoch": 0.16951896392229418, + "grad_norm": 3.941514730453491, + "learning_rate": 9.498557522841085e-06, + "loss": 1.2299, + "step": 2199 + }, + { + "epoch": 0.16959605303731112, + "grad_norm": 4.029524326324463, + "learning_rate": 9.498012454042786e-06, + "loss": 1.0973, + "step": 2200 + }, + { + "epoch": 0.1696731421523281, + "grad_norm": 4.247512340545654, + "learning_rate": 9.497467104816709e-06, + "loss": 1.1146, + "step": 2201 + }, + { + "epoch": 0.16975023126734506, + "grad_norm": 3.7475624084472656, + "learning_rate": 9.496921475196847e-06, + "loss": 1.0355, + "step": 2202 + }, + { + "epoch": 0.169827320382362, + "grad_norm": 3.81801438331604, + "learning_rate": 9.496375565217225e-06, + "loss": 1.1312, + "step": 2203 + }, + { + "epoch": 0.16990440949737898, + "grad_norm": 4.464087963104248, + "learning_rate": 9.49582937491187e-06, + "loss": 1.1643, + "step": 2204 + }, + { + "epoch": 0.16998149861239592, + "grad_norm": 3.895153522491455, + "learning_rate": 9.49528290431484e-06, + "loss": 1.1044, + "step": 2205 + }, + { + "epoch": 0.1700585877274129, + "grad_norm": 4.139657974243164, + "learning_rate": 9.494736153460204e-06, + "loss": 1.1443, + "step": 2206 + }, + { + "epoch": 0.17013567684242986, + "grad_norm": 3.7436201572418213, + "learning_rate": 9.494189122382046e-06, + "loss": 1.0922, + "step": 2207 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 4.397914409637451, + "learning_rate": 9.493641811114472e-06, + "loss": 1.1507, + "step": 2208 + }, + { + "epoch": 0.17028985507246377, + "grad_norm": 3.714843511581421, + "learning_rate": 9.493094219691606e-06, + "loss": 1.0862, + "step": 2209 + }, + { + "epoch": 0.17036694418748072, + "grad_norm": 3.666947603225708, + "learning_rate": 9.492546348147584e-06, + "loss": 1.0753, + "step": 2210 + }, + { + "epoch": 0.1704440333024977, + "grad_norm": 4.164958953857422, + "learning_rate": 9.491998196516564e-06, + "loss": 1.133, + "step": 2211 + }, + { + "epoch": 0.17052112241751466, + "grad_norm": 4.119793891906738, + "learning_rate": 9.491449764832722e-06, + "loss": 1.0534, + "step": 2212 + }, + { + "epoch": 0.1705982115325316, + "grad_norm": 3.694411277770996, + "learning_rate": 9.49090105313025e-06, + "loss": 1.0354, + "step": 2213 + }, + { + "epoch": 0.17067530064754857, + "grad_norm": 4.022437572479248, + "learning_rate": 9.490352061443355e-06, + "loss": 1.0719, + "step": 2214 + }, + { + "epoch": 0.17075238976256552, + "grad_norm": 4.365479946136475, + "learning_rate": 9.489802789806265e-06, + "loss": 1.2675, + "step": 2215 + }, + { + "epoch": 0.17082947887758249, + "grad_norm": 4.730959892272949, + "learning_rate": 9.489253238253225e-06, + "loss": 1.1868, + "step": 2216 + }, + { + "epoch": 0.17090656799259946, + "grad_norm": 3.6116209030151367, + "learning_rate": 9.488703406818496e-06, + "loss": 1.0207, + "step": 2217 + }, + { + "epoch": 0.1709836571076164, + "grad_norm": 4.13337516784668, + "learning_rate": 9.488153295536358e-06, + "loss": 1.2437, + "step": 2218 + }, + { + "epoch": 0.17106074622263337, + "grad_norm": 4.369235038757324, + "learning_rate": 9.487602904441106e-06, + "loss": 1.1526, + "step": 2219 + }, + { + "epoch": 0.1711378353376503, + "grad_norm": 3.9253830909729004, + "learning_rate": 9.487052233567055e-06, + "loss": 1.0983, + "step": 2220 + }, + { + "epoch": 0.17121492445266728, + "grad_norm": 3.9150092601776123, + "learning_rate": 9.486501282948535e-06, + "loss": 1.0501, + "step": 2221 + }, + { + "epoch": 0.17129201356768425, + "grad_norm": 4.1889424324035645, + "learning_rate": 9.485950052619898e-06, + "loss": 1.0532, + "step": 2222 + }, + { + "epoch": 0.1713691026827012, + "grad_norm": 4.013206481933594, + "learning_rate": 9.485398542615507e-06, + "loss": 1.0343, + "step": 2223 + }, + { + "epoch": 0.17144619179771817, + "grad_norm": 3.747413158416748, + "learning_rate": 9.484846752969747e-06, + "loss": 0.9813, + "step": 2224 + }, + { + "epoch": 0.1715232809127351, + "grad_norm": 4.145966053009033, + "learning_rate": 9.484294683717021e-06, + "loss": 1.047, + "step": 2225 + }, + { + "epoch": 0.17160037002775208, + "grad_norm": 3.959221124649048, + "learning_rate": 9.483742334891747e-06, + "loss": 0.988, + "step": 2226 + }, + { + "epoch": 0.17167745914276905, + "grad_norm": 4.141855239868164, + "learning_rate": 9.483189706528358e-06, + "loss": 1.0136, + "step": 2227 + }, + { + "epoch": 0.171754548257786, + "grad_norm": 4.037559509277344, + "learning_rate": 9.482636798661311e-06, + "loss": 1.003, + "step": 2228 + }, + { + "epoch": 0.17183163737280296, + "grad_norm": 3.8605642318725586, + "learning_rate": 9.482083611325076e-06, + "loss": 1.105, + "step": 2229 + }, + { + "epoch": 0.1719087264878199, + "grad_norm": 4.074838161468506, + "learning_rate": 9.48153014455414e-06, + "loss": 1.073, + "step": 2230 + }, + { + "epoch": 0.17198581560283688, + "grad_norm": 3.978645086288452, + "learning_rate": 9.480976398383013e-06, + "loss": 1.1489, + "step": 2231 + }, + { + "epoch": 0.17206290471785385, + "grad_norm": 4.13065242767334, + "learning_rate": 9.480422372846212e-06, + "loss": 1.1217, + "step": 2232 + }, + { + "epoch": 0.1721399938328708, + "grad_norm": 4.274265766143799, + "learning_rate": 9.479868067978282e-06, + "loss": 1.0354, + "step": 2233 + }, + { + "epoch": 0.17221708294788776, + "grad_norm": 3.908271312713623, + "learning_rate": 9.47931348381378e-06, + "loss": 1.0812, + "step": 2234 + }, + { + "epoch": 0.1722941720629047, + "grad_norm": 4.128848075866699, + "learning_rate": 9.478758620387281e-06, + "loss": 1.0114, + "step": 2235 + }, + { + "epoch": 0.17237126117792168, + "grad_norm": 4.402393341064453, + "learning_rate": 9.478203477733377e-06, + "loss": 1.1512, + "step": 2236 + }, + { + "epoch": 0.17244835029293865, + "grad_norm": 4.182422161102295, + "learning_rate": 9.477648055886682e-06, + "loss": 1.0967, + "step": 2237 + }, + { + "epoch": 0.1725254394079556, + "grad_norm": 3.684023857116699, + "learning_rate": 9.477092354881818e-06, + "loss": 1.0843, + "step": 2238 + }, + { + "epoch": 0.17260252852297256, + "grad_norm": 4.2031660079956055, + "learning_rate": 9.476536374753434e-06, + "loss": 1.1126, + "step": 2239 + }, + { + "epoch": 0.1726796176379895, + "grad_norm": 3.8642640113830566, + "learning_rate": 9.475980115536193e-06, + "loss": 1.0219, + "step": 2240 + }, + { + "epoch": 0.17275670675300647, + "grad_norm": 4.426207542419434, + "learning_rate": 9.475423577264772e-06, + "loss": 1.1832, + "step": 2241 + }, + { + "epoch": 0.17283379586802344, + "grad_norm": 3.9104762077331543, + "learning_rate": 9.474866759973871e-06, + "loss": 1.0344, + "step": 2242 + }, + { + "epoch": 0.17291088498304039, + "grad_norm": 4.044737815856934, + "learning_rate": 9.474309663698202e-06, + "loss": 1.0671, + "step": 2243 + }, + { + "epoch": 0.17298797409805736, + "grad_norm": 4.002004623413086, + "learning_rate": 9.473752288472499e-06, + "loss": 1.0373, + "step": 2244 + }, + { + "epoch": 0.17306506321307433, + "grad_norm": 3.7214102745056152, + "learning_rate": 9.473194634331512e-06, + "loss": 1.0278, + "step": 2245 + }, + { + "epoch": 0.17314215232809127, + "grad_norm": 3.870584487915039, + "learning_rate": 9.472636701310005e-06, + "loss": 0.9826, + "step": 2246 + }, + { + "epoch": 0.17321924144310824, + "grad_norm": 3.6483523845672607, + "learning_rate": 9.472078489442766e-06, + "loss": 1.086, + "step": 2247 + }, + { + "epoch": 0.17329633055812518, + "grad_norm": 3.894209384918213, + "learning_rate": 9.471519998764593e-06, + "loss": 1.0286, + "step": 2248 + }, + { + "epoch": 0.17337341967314215, + "grad_norm": 3.856490135192871, + "learning_rate": 9.470961229310307e-06, + "loss": 1.1207, + "step": 2249 + }, + { + "epoch": 0.17345050878815912, + "grad_norm": 5.136345386505127, + "learning_rate": 9.470402181114747e-06, + "loss": 1.0207, + "step": 2250 + }, + { + "epoch": 0.17352759790317607, + "grad_norm": 4.148111343383789, + "learning_rate": 9.46984285421276e-06, + "loss": 1.1127, + "step": 2251 + }, + { + "epoch": 0.17360468701819304, + "grad_norm": 3.8354268074035645, + "learning_rate": 9.469283248639223e-06, + "loss": 1.0683, + "step": 2252 + }, + { + "epoch": 0.17368177613320998, + "grad_norm": 3.8218307495117188, + "learning_rate": 9.468723364429021e-06, + "loss": 1.0874, + "step": 2253 + }, + { + "epoch": 0.17375886524822695, + "grad_norm": 4.011178016662598, + "learning_rate": 9.468163201617063e-06, + "loss": 1.0788, + "step": 2254 + }, + { + "epoch": 0.17383595436324392, + "grad_norm": 3.5386111736297607, + "learning_rate": 9.467602760238268e-06, + "loss": 0.8796, + "step": 2255 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 3.82024884223938, + "learning_rate": 9.467042040327582e-06, + "loss": 1.0407, + "step": 2256 + }, + { + "epoch": 0.17399013259327784, + "grad_norm": 4.042757987976074, + "learning_rate": 9.46648104191996e-06, + "loss": 1.1627, + "step": 2257 + }, + { + "epoch": 0.17406722170829478, + "grad_norm": 3.925922393798828, + "learning_rate": 9.465919765050375e-06, + "loss": 1.1055, + "step": 2258 + }, + { + "epoch": 0.17414431082331175, + "grad_norm": 3.940410852432251, + "learning_rate": 9.465358209753824e-06, + "loss": 0.9985, + "step": 2259 + }, + { + "epoch": 0.17422139993832872, + "grad_norm": 3.904409646987915, + "learning_rate": 9.464796376065316e-06, + "loss": 1.0892, + "step": 2260 + }, + { + "epoch": 0.17429848905334566, + "grad_norm": 3.91276216506958, + "learning_rate": 9.464234264019876e-06, + "loss": 1.0863, + "step": 2261 + }, + { + "epoch": 0.17437557816836263, + "grad_norm": 3.8074424266815186, + "learning_rate": 9.463671873652551e-06, + "loss": 1.0709, + "step": 2262 + }, + { + "epoch": 0.17445266728337958, + "grad_norm": 4.295366287231445, + "learning_rate": 9.463109204998405e-06, + "loss": 1.0679, + "step": 2263 + }, + { + "epoch": 0.17452975639839655, + "grad_norm": 3.8966548442840576, + "learning_rate": 9.462546258092512e-06, + "loss": 1.0652, + "step": 2264 + }, + { + "epoch": 0.17460684551341352, + "grad_norm": 3.9106335639953613, + "learning_rate": 9.461983032969972e-06, + "loss": 1.0031, + "step": 2265 + }, + { + "epoch": 0.17468393462843046, + "grad_norm": 4.223884582519531, + "learning_rate": 9.4614195296659e-06, + "loss": 1.1193, + "step": 2266 + }, + { + "epoch": 0.17476102374344743, + "grad_norm": 3.972520351409912, + "learning_rate": 9.460855748215427e-06, + "loss": 1.1208, + "step": 2267 + }, + { + "epoch": 0.17483811285846437, + "grad_norm": 3.772442102432251, + "learning_rate": 9.460291688653702e-06, + "loss": 1.0707, + "step": 2268 + }, + { + "epoch": 0.17491520197348134, + "grad_norm": 4.243513584136963, + "learning_rate": 9.45972735101589e-06, + "loss": 1.0806, + "step": 2269 + }, + { + "epoch": 0.17499229108849831, + "grad_norm": 3.9305193424224854, + "learning_rate": 9.459162735337176e-06, + "loss": 1.0473, + "step": 2270 + }, + { + "epoch": 0.17506938020351526, + "grad_norm": 3.5326061248779297, + "learning_rate": 9.458597841652758e-06, + "loss": 1.0894, + "step": 2271 + }, + { + "epoch": 0.17514646931853223, + "grad_norm": 4.2558746337890625, + "learning_rate": 9.458032669997858e-06, + "loss": 1.1245, + "step": 2272 + }, + { + "epoch": 0.17522355843354917, + "grad_norm": 3.9435577392578125, + "learning_rate": 9.457467220407707e-06, + "loss": 1.0961, + "step": 2273 + }, + { + "epoch": 0.17530064754856614, + "grad_norm": 4.034180641174316, + "learning_rate": 9.456901492917564e-06, + "loss": 1.0136, + "step": 2274 + }, + { + "epoch": 0.1753777366635831, + "grad_norm": 4.0210676193237305, + "learning_rate": 9.456335487562695e-06, + "loss": 1.058, + "step": 2275 + }, + { + "epoch": 0.17545482577860005, + "grad_norm": 3.803864002227783, + "learning_rate": 9.455769204378388e-06, + "loss": 0.9875, + "step": 2276 + }, + { + "epoch": 0.17553191489361702, + "grad_norm": 3.870218515396118, + "learning_rate": 9.455202643399948e-06, + "loss": 1.0243, + "step": 2277 + }, + { + "epoch": 0.17560900400863397, + "grad_norm": 3.751405954360962, + "learning_rate": 9.454635804662697e-06, + "loss": 1.0363, + "step": 2278 + }, + { + "epoch": 0.17568609312365094, + "grad_norm": 3.922459125518799, + "learning_rate": 9.454068688201975e-06, + "loss": 1.0977, + "step": 2279 + }, + { + "epoch": 0.1757631822386679, + "grad_norm": 4.521456241607666, + "learning_rate": 9.453501294053139e-06, + "loss": 1.1405, + "step": 2280 + }, + { + "epoch": 0.17584027135368485, + "grad_norm": 4.3583478927612305, + "learning_rate": 9.452933622251561e-06, + "loss": 1.1479, + "step": 2281 + }, + { + "epoch": 0.17591736046870182, + "grad_norm": 3.9341979026794434, + "learning_rate": 9.452365672832635e-06, + "loss": 1.0467, + "step": 2282 + }, + { + "epoch": 0.1759944495837188, + "grad_norm": 4.151937484741211, + "learning_rate": 9.45179744583177e-06, + "loss": 1.0061, + "step": 2283 + }, + { + "epoch": 0.17607153869873574, + "grad_norm": 3.7211530208587646, + "learning_rate": 9.451228941284389e-06, + "loss": 1.026, + "step": 2284 + }, + { + "epoch": 0.1761486278137527, + "grad_norm": 4.126251697540283, + "learning_rate": 9.450660159225938e-06, + "loss": 1.055, + "step": 2285 + }, + { + "epoch": 0.17622571692876965, + "grad_norm": 3.7324936389923096, + "learning_rate": 9.450091099691876e-06, + "loss": 1.0071, + "step": 2286 + }, + { + "epoch": 0.17630280604378662, + "grad_norm": 3.911670446395874, + "learning_rate": 9.449521762717682e-06, + "loss": 1.0975, + "step": 2287 + }, + { + "epoch": 0.1763798951588036, + "grad_norm": 3.6570870876312256, + "learning_rate": 9.44895214833885e-06, + "loss": 0.911, + "step": 2288 + }, + { + "epoch": 0.17645698427382053, + "grad_norm": 3.5680623054504395, + "learning_rate": 9.448382256590893e-06, + "loss": 0.9641, + "step": 2289 + }, + { + "epoch": 0.1765340733888375, + "grad_norm": 3.8727200031280518, + "learning_rate": 9.447812087509343e-06, + "loss": 1.1022, + "step": 2290 + }, + { + "epoch": 0.17661116250385445, + "grad_norm": 3.9284706115722656, + "learning_rate": 9.447241641129743e-06, + "loss": 1.1042, + "step": 2291 + }, + { + "epoch": 0.17668825161887142, + "grad_norm": 4.463204383850098, + "learning_rate": 9.446670917487662e-06, + "loss": 1.1087, + "step": 2292 + }, + { + "epoch": 0.1767653407338884, + "grad_norm": 3.742551803588867, + "learning_rate": 9.446099916618676e-06, + "loss": 1.143, + "step": 2293 + }, + { + "epoch": 0.17684242984890533, + "grad_norm": 3.764596462249756, + "learning_rate": 9.445528638558389e-06, + "loss": 1.0978, + "step": 2294 + }, + { + "epoch": 0.1769195189639223, + "grad_norm": 3.68927001953125, + "learning_rate": 9.444957083342415e-06, + "loss": 0.8989, + "step": 2295 + }, + { + "epoch": 0.17699660807893924, + "grad_norm": 3.6712417602539062, + "learning_rate": 9.444385251006389e-06, + "loss": 0.9342, + "step": 2296 + }, + { + "epoch": 0.17707369719395621, + "grad_norm": 4.3771467208862305, + "learning_rate": 9.44381314158596e-06, + "loss": 1.1587, + "step": 2297 + }, + { + "epoch": 0.17715078630897318, + "grad_norm": 3.71481990814209, + "learning_rate": 9.443240755116797e-06, + "loss": 1.0263, + "step": 2298 + }, + { + "epoch": 0.17722787542399013, + "grad_norm": 3.5364553928375244, + "learning_rate": 9.442668091634583e-06, + "loss": 0.9791, + "step": 2299 + }, + { + "epoch": 0.1773049645390071, + "grad_norm": 4.014560699462891, + "learning_rate": 9.442095151175024e-06, + "loss": 0.9919, + "step": 2300 + }, + { + "epoch": 0.17738205365402404, + "grad_norm": 4.145420074462891, + "learning_rate": 9.441521933773839e-06, + "loss": 1.047, + "step": 2301 + }, + { + "epoch": 0.177459142769041, + "grad_norm": 4.094470977783203, + "learning_rate": 9.440948439466764e-06, + "loss": 1.095, + "step": 2302 + }, + { + "epoch": 0.17753623188405798, + "grad_norm": 3.7132604122161865, + "learning_rate": 9.440374668289553e-06, + "loss": 1.0473, + "step": 2303 + }, + { + "epoch": 0.17761332099907493, + "grad_norm": 3.507946491241455, + "learning_rate": 9.439800620277981e-06, + "loss": 1.0164, + "step": 2304 + }, + { + "epoch": 0.1776904101140919, + "grad_norm": 3.750944137573242, + "learning_rate": 9.439226295467834e-06, + "loss": 1.048, + "step": 2305 + }, + { + "epoch": 0.17776749922910884, + "grad_norm": 3.6753499507904053, + "learning_rate": 9.438651693894919e-06, + "loss": 1.0394, + "step": 2306 + }, + { + "epoch": 0.1778445883441258, + "grad_norm": 4.113043785095215, + "learning_rate": 9.438076815595058e-06, + "loss": 0.9472, + "step": 2307 + }, + { + "epoch": 0.17792167745914278, + "grad_norm": 4.260312080383301, + "learning_rate": 9.437501660604094e-06, + "loss": 1.1437, + "step": 2308 + }, + { + "epoch": 0.17799876657415972, + "grad_norm": 3.7821967601776123, + "learning_rate": 9.436926228957883e-06, + "loss": 1.0623, + "step": 2309 + }, + { + "epoch": 0.1780758556891767, + "grad_norm": 3.920973062515259, + "learning_rate": 9.436350520692303e-06, + "loss": 1.0917, + "step": 2310 + }, + { + "epoch": 0.17815294480419364, + "grad_norm": 3.868009567260742, + "learning_rate": 9.435774535843243e-06, + "loss": 1.0319, + "step": 2311 + }, + { + "epoch": 0.1782300339192106, + "grad_norm": 3.780529022216797, + "learning_rate": 9.435198274446614e-06, + "loss": 0.9861, + "step": 2312 + }, + { + "epoch": 0.17830712303422758, + "grad_norm": 3.820247173309326, + "learning_rate": 9.434621736538343e-06, + "loss": 1.1113, + "step": 2313 + }, + { + "epoch": 0.17838421214924452, + "grad_norm": 4.4522705078125, + "learning_rate": 9.434044922154375e-06, + "loss": 1.1872, + "step": 2314 + }, + { + "epoch": 0.1784613012642615, + "grad_norm": 4.132708549499512, + "learning_rate": 9.433467831330671e-06, + "loss": 1.0276, + "step": 2315 + }, + { + "epoch": 0.17853839037927843, + "grad_norm": 3.8560283184051514, + "learning_rate": 9.432890464103208e-06, + "loss": 1.077, + "step": 2316 + }, + { + "epoch": 0.1786154794942954, + "grad_norm": 3.8112268447875977, + "learning_rate": 9.432312820507982e-06, + "loss": 1.059, + "step": 2317 + }, + { + "epoch": 0.17869256860931237, + "grad_norm": 4.3246259689331055, + "learning_rate": 9.431734900581011e-06, + "loss": 1.2361, + "step": 2318 + }, + { + "epoch": 0.17876965772432932, + "grad_norm": 4.359192371368408, + "learning_rate": 9.431156704358318e-06, + "loss": 1.0469, + "step": 2319 + }, + { + "epoch": 0.1788467468393463, + "grad_norm": 3.730458974838257, + "learning_rate": 9.430578231875956e-06, + "loss": 0.9637, + "step": 2320 + }, + { + "epoch": 0.17892383595436323, + "grad_norm": 4.066617965698242, + "learning_rate": 9.429999483169987e-06, + "loss": 0.956, + "step": 2321 + }, + { + "epoch": 0.1790009250693802, + "grad_norm": 3.660848379135132, + "learning_rate": 9.429420458276495e-06, + "loss": 1.0173, + "step": 2322 + }, + { + "epoch": 0.17907801418439717, + "grad_norm": 4.498156547546387, + "learning_rate": 9.428841157231576e-06, + "loss": 1.0781, + "step": 2323 + }, + { + "epoch": 0.17915510329941411, + "grad_norm": 3.933738946914673, + "learning_rate": 9.42826158007135e-06, + "loss": 1.1102, + "step": 2324 + }, + { + "epoch": 0.17923219241443109, + "grad_norm": 3.9478960037231445, + "learning_rate": 9.427681726831948e-06, + "loss": 1.0797, + "step": 2325 + }, + { + "epoch": 0.17930928152944806, + "grad_norm": 4.165237903594971, + "learning_rate": 9.427101597549522e-06, + "loss": 1.0487, + "step": 2326 + }, + { + "epoch": 0.179386370644465, + "grad_norm": 3.704395294189453, + "learning_rate": 9.426521192260239e-06, + "loss": 1.0585, + "step": 2327 + }, + { + "epoch": 0.17946345975948197, + "grad_norm": 4.005001544952393, + "learning_rate": 9.425940511000286e-06, + "loss": 1.0563, + "step": 2328 + }, + { + "epoch": 0.1795405488744989, + "grad_norm": 3.9118220806121826, + "learning_rate": 9.425359553805866e-06, + "loss": 1.1235, + "step": 2329 + }, + { + "epoch": 0.17961763798951588, + "grad_norm": 4.479408264160156, + "learning_rate": 9.424778320713196e-06, + "loss": 1.1773, + "step": 2330 + }, + { + "epoch": 0.17969472710453285, + "grad_norm": 3.661409854888916, + "learning_rate": 9.424196811758515e-06, + "loss": 0.9807, + "step": 2331 + }, + { + "epoch": 0.1797718162195498, + "grad_norm": 3.719728946685791, + "learning_rate": 9.423615026978076e-06, + "loss": 1.1761, + "step": 2332 + }, + { + "epoch": 0.17984890533456677, + "grad_norm": 4.085165023803711, + "learning_rate": 9.42303296640815e-06, + "loss": 1.0771, + "step": 2333 + }, + { + "epoch": 0.1799259944495837, + "grad_norm": 3.7032485008239746, + "learning_rate": 9.422450630085026e-06, + "loss": 1.1194, + "step": 2334 + }, + { + "epoch": 0.18000308356460068, + "grad_norm": 3.613755941390991, + "learning_rate": 9.42186801804501e-06, + "loss": 1.009, + "step": 2335 + }, + { + "epoch": 0.18008017267961765, + "grad_norm": 3.7317869663238525, + "learning_rate": 9.421285130324425e-06, + "loss": 1.0114, + "step": 2336 + }, + { + "epoch": 0.1801572617946346, + "grad_norm": 3.6703238487243652, + "learning_rate": 9.42070196695961e-06, + "loss": 0.9959, + "step": 2337 + }, + { + "epoch": 0.18023435090965156, + "grad_norm": 3.9614789485931396, + "learning_rate": 9.420118527986923e-06, + "loss": 0.935, + "step": 2338 + }, + { + "epoch": 0.1803114400246685, + "grad_norm": 4.142663478851318, + "learning_rate": 9.41953481344274e-06, + "loss": 1.1967, + "step": 2339 + }, + { + "epoch": 0.18038852913968548, + "grad_norm": 3.7918946743011475, + "learning_rate": 9.418950823363446e-06, + "loss": 1.0058, + "step": 2340 + }, + { + "epoch": 0.18046561825470245, + "grad_norm": 4.344407558441162, + "learning_rate": 9.41836655778546e-06, + "loss": 0.9986, + "step": 2341 + }, + { + "epoch": 0.1805427073697194, + "grad_norm": 3.8436927795410156, + "learning_rate": 9.417782016745198e-06, + "loss": 1.1001, + "step": 2342 + }, + { + "epoch": 0.18061979648473636, + "grad_norm": 4.281997203826904, + "learning_rate": 9.41719720027911e-06, + "loss": 1.1118, + "step": 2343 + }, + { + "epoch": 0.1806968855997533, + "grad_norm": 4.253276824951172, + "learning_rate": 9.416612108423653e-06, + "loss": 1.0024, + "step": 2344 + }, + { + "epoch": 0.18077397471477027, + "grad_norm": 3.944852590560913, + "learning_rate": 9.416026741215305e-06, + "loss": 1.0365, + "step": 2345 + }, + { + "epoch": 0.18085106382978725, + "grad_norm": 3.773387908935547, + "learning_rate": 9.415441098690562e-06, + "loss": 0.9963, + "step": 2346 + }, + { + "epoch": 0.1809281529448042, + "grad_norm": 3.8287198543548584, + "learning_rate": 9.414855180885933e-06, + "loss": 1.0586, + "step": 2347 + }, + { + "epoch": 0.18100524205982116, + "grad_norm": 4.154395580291748, + "learning_rate": 9.41426898783795e-06, + "loss": 0.9962, + "step": 2348 + }, + { + "epoch": 0.1810823311748381, + "grad_norm": 4.041691780090332, + "learning_rate": 9.413682519583156e-06, + "loss": 1.1823, + "step": 2349 + }, + { + "epoch": 0.18115942028985507, + "grad_norm": 4.263957977294922, + "learning_rate": 9.413095776158117e-06, + "loss": 0.9897, + "step": 2350 + }, + { + "epoch": 0.18123650940487204, + "grad_norm": 3.6769165992736816, + "learning_rate": 9.412508757599413e-06, + "loss": 1.0331, + "step": 2351 + }, + { + "epoch": 0.18131359851988899, + "grad_norm": 3.703239679336548, + "learning_rate": 9.411921463943641e-06, + "loss": 0.9509, + "step": 2352 + }, + { + "epoch": 0.18139068763490596, + "grad_norm": 3.7787482738494873, + "learning_rate": 9.411333895227415e-06, + "loss": 0.9942, + "step": 2353 + }, + { + "epoch": 0.1814677767499229, + "grad_norm": 3.9517102241516113, + "learning_rate": 9.410746051487367e-06, + "loss": 1.0088, + "step": 2354 + }, + { + "epoch": 0.18154486586493987, + "grad_norm": 3.5368359088897705, + "learning_rate": 9.410157932760148e-06, + "loss": 1.0106, + "step": 2355 + }, + { + "epoch": 0.18162195497995684, + "grad_norm": 3.456052780151367, + "learning_rate": 9.40956953908242e-06, + "loss": 1.0061, + "step": 2356 + }, + { + "epoch": 0.18169904409497378, + "grad_norm": 3.8283514976501465, + "learning_rate": 9.408980870490872e-06, + "loss": 1.0836, + "step": 2357 + }, + { + "epoch": 0.18177613320999075, + "grad_norm": 3.708845853805542, + "learning_rate": 9.4083919270222e-06, + "loss": 0.9974, + "step": 2358 + }, + { + "epoch": 0.1818532223250077, + "grad_norm": 3.6090118885040283, + "learning_rate": 9.407802708713123e-06, + "loss": 0.906, + "step": 2359 + }, + { + "epoch": 0.18193031144002467, + "grad_norm": 4.260048866271973, + "learning_rate": 9.407213215600377e-06, + "loss": 1.1036, + "step": 2360 + }, + { + "epoch": 0.18200740055504164, + "grad_norm": 4.0702056884765625, + "learning_rate": 9.406623447720711e-06, + "loss": 1.072, + "step": 2361 + }, + { + "epoch": 0.18208448967005858, + "grad_norm": 4.009487152099609, + "learning_rate": 9.406033405110896e-06, + "loss": 1.0236, + "step": 2362 + }, + { + "epoch": 0.18216157878507555, + "grad_norm": 4.346181869506836, + "learning_rate": 9.405443087807717e-06, + "loss": 0.978, + "step": 2363 + }, + { + "epoch": 0.1822386679000925, + "grad_norm": 4.057311534881592, + "learning_rate": 9.404852495847979e-06, + "loss": 1.13, + "step": 2364 + }, + { + "epoch": 0.18231575701510946, + "grad_norm": 4.186524391174316, + "learning_rate": 9.4042616292685e-06, + "loss": 1.0866, + "step": 2365 + }, + { + "epoch": 0.18239284613012643, + "grad_norm": 3.631807565689087, + "learning_rate": 9.40367048810612e-06, + "loss": 0.9123, + "step": 2366 + }, + { + "epoch": 0.18246993524514338, + "grad_norm": 3.841275930404663, + "learning_rate": 9.403079072397692e-06, + "loss": 1.19, + "step": 2367 + }, + { + "epoch": 0.18254702436016035, + "grad_norm": 3.969886541366577, + "learning_rate": 9.402487382180088e-06, + "loss": 0.9525, + "step": 2368 + }, + { + "epoch": 0.18262411347517732, + "grad_norm": 3.8512344360351562, + "learning_rate": 9.401895417490197e-06, + "loss": 0.9227, + "step": 2369 + }, + { + "epoch": 0.18270120259019426, + "grad_norm": 3.7888450622558594, + "learning_rate": 9.401303178364923e-06, + "loss": 1.0298, + "step": 2370 + }, + { + "epoch": 0.18277829170521123, + "grad_norm": 3.4874343872070312, + "learning_rate": 9.400710664841192e-06, + "loss": 0.997, + "step": 2371 + }, + { + "epoch": 0.18285538082022817, + "grad_norm": 3.7443392276763916, + "learning_rate": 9.400117876955943e-06, + "loss": 1.0684, + "step": 2372 + }, + { + "epoch": 0.18293246993524515, + "grad_norm": 4.328792572021484, + "learning_rate": 9.399524814746133e-06, + "loss": 1.1274, + "step": 2373 + }, + { + "epoch": 0.18300955905026212, + "grad_norm": 3.9465620517730713, + "learning_rate": 9.398931478248736e-06, + "loss": 1.1876, + "step": 2374 + }, + { + "epoch": 0.18308664816527906, + "grad_norm": 3.4830267429351807, + "learning_rate": 9.398337867500744e-06, + "loss": 0.9418, + "step": 2375 + }, + { + "epoch": 0.18316373728029603, + "grad_norm": 3.6330952644348145, + "learning_rate": 9.397743982539166e-06, + "loss": 1.0558, + "step": 2376 + }, + { + "epoch": 0.18324082639531297, + "grad_norm": 4.2686076164245605, + "learning_rate": 9.397149823401029e-06, + "loss": 1.0845, + "step": 2377 + }, + { + "epoch": 0.18331791551032994, + "grad_norm": 3.876206636428833, + "learning_rate": 9.396555390123371e-06, + "loss": 1.0598, + "step": 2378 + }, + { + "epoch": 0.1833950046253469, + "grad_norm": 4.2616472244262695, + "learning_rate": 9.395960682743255e-06, + "loss": 1.1524, + "step": 2379 + }, + { + "epoch": 0.18347209374036386, + "grad_norm": 4.4118475914001465, + "learning_rate": 9.39536570129776e-06, + "loss": 1.2057, + "step": 2380 + }, + { + "epoch": 0.18354918285538083, + "grad_norm": 3.960028648376465, + "learning_rate": 9.394770445823976e-06, + "loss": 1.0039, + "step": 2381 + }, + { + "epoch": 0.18362627197039777, + "grad_norm": 3.7380380630493164, + "learning_rate": 9.394174916359016e-06, + "loss": 1.0101, + "step": 2382 + }, + { + "epoch": 0.18370336108541474, + "grad_norm": 4.420310020446777, + "learning_rate": 9.393579112940007e-06, + "loss": 0.9947, + "step": 2383 + }, + { + "epoch": 0.1837804502004317, + "grad_norm": 3.7579898834228516, + "learning_rate": 9.392983035604098e-06, + "loss": 0.9812, + "step": 2384 + }, + { + "epoch": 0.18385753931544865, + "grad_norm": 4.156567573547363, + "learning_rate": 9.392386684388446e-06, + "loss": 1.1538, + "step": 2385 + }, + { + "epoch": 0.18393462843046562, + "grad_norm": 3.7567379474639893, + "learning_rate": 9.391790059330234e-06, + "loss": 1.0734, + "step": 2386 + }, + { + "epoch": 0.18401171754548257, + "grad_norm": 4.441596031188965, + "learning_rate": 9.391193160466658e-06, + "loss": 1.009, + "step": 2387 + }, + { + "epoch": 0.18408880666049954, + "grad_norm": 4.164433479309082, + "learning_rate": 9.39059598783493e-06, + "loss": 1.1177, + "step": 2388 + }, + { + "epoch": 0.1841658957755165, + "grad_norm": 4.159829139709473, + "learning_rate": 9.389998541472282e-06, + "loss": 1.0572, + "step": 2389 + }, + { + "epoch": 0.18424298489053345, + "grad_norm": 4.0909743309021, + "learning_rate": 9.389400821415962e-06, + "loss": 1.0564, + "step": 2390 + }, + { + "epoch": 0.18432007400555042, + "grad_norm": 4.253016471862793, + "learning_rate": 9.388802827703231e-06, + "loss": 1.162, + "step": 2391 + }, + { + "epoch": 0.18439716312056736, + "grad_norm": 4.08793306350708, + "learning_rate": 9.388204560371377e-06, + "loss": 1.1314, + "step": 2392 + }, + { + "epoch": 0.18447425223558434, + "grad_norm": 3.5922398567199707, + "learning_rate": 9.387606019457696e-06, + "loss": 1.0718, + "step": 2393 + }, + { + "epoch": 0.1845513413506013, + "grad_norm": 3.838916063308716, + "learning_rate": 9.387007204999503e-06, + "loss": 1.1038, + "step": 2394 + }, + { + "epoch": 0.18462843046561825, + "grad_norm": 3.8841969966888428, + "learning_rate": 9.386408117034131e-06, + "loss": 0.9366, + "step": 2395 + }, + { + "epoch": 0.18470551958063522, + "grad_norm": 4.599263668060303, + "learning_rate": 9.385808755598932e-06, + "loss": 1.0969, + "step": 2396 + }, + { + "epoch": 0.18478260869565216, + "grad_norm": 3.7928740978240967, + "learning_rate": 9.38520912073127e-06, + "loss": 1.0684, + "step": 2397 + }, + { + "epoch": 0.18485969781066913, + "grad_norm": 4.084198474884033, + "learning_rate": 9.384609212468531e-06, + "loss": 1.1246, + "step": 2398 + }, + { + "epoch": 0.1849367869256861, + "grad_norm": 4.296706199645996, + "learning_rate": 9.384009030848118e-06, + "loss": 1.0905, + "step": 2399 + }, + { + "epoch": 0.18501387604070305, + "grad_norm": 4.311187267303467, + "learning_rate": 9.383408575907446e-06, + "loss": 1.1047, + "step": 2400 + }, + { + "epoch": 0.18509096515572002, + "grad_norm": 3.62276554107666, + "learning_rate": 9.382807847683952e-06, + "loss": 1.1747, + "step": 2401 + }, + { + "epoch": 0.18516805427073696, + "grad_norm": 3.8191843032836914, + "learning_rate": 9.382206846215087e-06, + "loss": 1.052, + "step": 2402 + }, + { + "epoch": 0.18524514338575393, + "grad_norm": 4.004209041595459, + "learning_rate": 9.381605571538321e-06, + "loss": 0.9742, + "step": 2403 + }, + { + "epoch": 0.1853222325007709, + "grad_norm": 3.64969539642334, + "learning_rate": 9.381004023691142e-06, + "loss": 1.0338, + "step": 2404 + }, + { + "epoch": 0.18539932161578784, + "grad_norm": 4.0959978103637695, + "learning_rate": 9.38040220271105e-06, + "loss": 1.0634, + "step": 2405 + }, + { + "epoch": 0.1854764107308048, + "grad_norm": 4.057779788970947, + "learning_rate": 9.37980010863557e-06, + "loss": 1.0364, + "step": 2406 + }, + { + "epoch": 0.18555349984582176, + "grad_norm": 3.817060708999634, + "learning_rate": 9.379197741502236e-06, + "loss": 1.0794, + "step": 2407 + }, + { + "epoch": 0.18563058896083873, + "grad_norm": 4.1733880043029785, + "learning_rate": 9.378595101348602e-06, + "loss": 1.0477, + "step": 2408 + }, + { + "epoch": 0.1857076780758557, + "grad_norm": 3.8468310832977295, + "learning_rate": 9.377992188212241e-06, + "loss": 0.9913, + "step": 2409 + }, + { + "epoch": 0.18578476719087264, + "grad_norm": 4.041358470916748, + "learning_rate": 9.377389002130741e-06, + "loss": 1.1645, + "step": 2410 + }, + { + "epoch": 0.1858618563058896, + "grad_norm": 3.796046733856201, + "learning_rate": 9.376785543141709e-06, + "loss": 1.1405, + "step": 2411 + }, + { + "epoch": 0.18593894542090658, + "grad_norm": 3.9358856678009033, + "learning_rate": 9.376181811282764e-06, + "loss": 1.073, + "step": 2412 + }, + { + "epoch": 0.18601603453592352, + "grad_norm": 4.8310956954956055, + "learning_rate": 9.375577806591552e-06, + "loss": 1.1654, + "step": 2413 + }, + { + "epoch": 0.1860931236509405, + "grad_norm": 4.1156792640686035, + "learning_rate": 9.374973529105722e-06, + "loss": 0.9819, + "step": 2414 + }, + { + "epoch": 0.18617021276595744, + "grad_norm": 4.034856796264648, + "learning_rate": 9.374368978862952e-06, + "loss": 1.0477, + "step": 2415 + }, + { + "epoch": 0.1862473018809744, + "grad_norm": 4.216608047485352, + "learning_rate": 9.373764155900932e-06, + "loss": 1.1058, + "step": 2416 + }, + { + "epoch": 0.18632439099599138, + "grad_norm": 3.979484796524048, + "learning_rate": 9.373159060257368e-06, + "loss": 0.9858, + "step": 2417 + }, + { + "epoch": 0.18640148011100832, + "grad_norm": 3.889035940170288, + "learning_rate": 9.372553691969987e-06, + "loss": 0.9886, + "step": 2418 + }, + { + "epoch": 0.1864785692260253, + "grad_norm": 3.900179862976074, + "learning_rate": 9.37194805107653e-06, + "loss": 0.919, + "step": 2419 + }, + { + "epoch": 0.18655565834104224, + "grad_norm": 4.21008825302124, + "learning_rate": 9.371342137614754e-06, + "loss": 1.086, + "step": 2420 + }, + { + "epoch": 0.1866327474560592, + "grad_norm": 3.7535738945007324, + "learning_rate": 9.370735951622435e-06, + "loss": 0.9342, + "step": 2421 + }, + { + "epoch": 0.18670983657107618, + "grad_norm": 4.046445369720459, + "learning_rate": 9.370129493137367e-06, + "loss": 1.1538, + "step": 2422 + }, + { + "epoch": 0.18678692568609312, + "grad_norm": 3.904557228088379, + "learning_rate": 9.369522762197357e-06, + "loss": 1.0304, + "step": 2423 + }, + { + "epoch": 0.1868640148011101, + "grad_norm": 3.9920108318328857, + "learning_rate": 9.368915758840235e-06, + "loss": 1.0235, + "step": 2424 + }, + { + "epoch": 0.18694110391612703, + "grad_norm": 4.578022480010986, + "learning_rate": 9.36830848310384e-06, + "loss": 1.0139, + "step": 2425 + }, + { + "epoch": 0.187018193031144, + "grad_norm": 4.43154764175415, + "learning_rate": 9.367700935026038e-06, + "loss": 1.1063, + "step": 2426 + }, + { + "epoch": 0.18709528214616097, + "grad_norm": 3.994178533554077, + "learning_rate": 9.367093114644703e-06, + "loss": 1.0885, + "step": 2427 + }, + { + "epoch": 0.18717237126117792, + "grad_norm": 4.0110015869140625, + "learning_rate": 9.366485021997728e-06, + "loss": 1.05, + "step": 2428 + }, + { + "epoch": 0.1872494603761949, + "grad_norm": 3.9679675102233887, + "learning_rate": 9.36587665712303e-06, + "loss": 1.0291, + "step": 2429 + }, + { + "epoch": 0.18732654949121183, + "grad_norm": 4.2142558097839355, + "learning_rate": 9.36526802005853e-06, + "loss": 1.0634, + "step": 2430 + }, + { + "epoch": 0.1874036386062288, + "grad_norm": 3.840346574783325, + "learning_rate": 9.36465911084218e-06, + "loss": 1.0937, + "step": 2431 + }, + { + "epoch": 0.18748072772124577, + "grad_norm": 3.9532737731933594, + "learning_rate": 9.36404992951194e-06, + "loss": 1.1248, + "step": 2432 + }, + { + "epoch": 0.18755781683626271, + "grad_norm": 3.815847873687744, + "learning_rate": 9.363440476105787e-06, + "loss": 0.9521, + "step": 2433 + }, + { + "epoch": 0.18763490595127968, + "grad_norm": 4.12700080871582, + "learning_rate": 9.36283075066172e-06, + "loss": 1.0596, + "step": 2434 + }, + { + "epoch": 0.18771199506629663, + "grad_norm": 4.737533092498779, + "learning_rate": 9.36222075321775e-06, + "loss": 1.1245, + "step": 2435 + }, + { + "epoch": 0.1877890841813136, + "grad_norm": 3.9905295372009277, + "learning_rate": 9.36161048381191e-06, + "loss": 1.1027, + "step": 2436 + }, + { + "epoch": 0.18786617329633057, + "grad_norm": 4.189011096954346, + "learning_rate": 9.360999942482247e-06, + "loss": 1.0894, + "step": 2437 + }, + { + "epoch": 0.1879432624113475, + "grad_norm": 3.8845021724700928, + "learning_rate": 9.360389129266822e-06, + "loss": 1.0176, + "step": 2438 + }, + { + "epoch": 0.18802035152636448, + "grad_norm": 4.049330234527588, + "learning_rate": 9.359778044203718e-06, + "loss": 0.8773, + "step": 2439 + }, + { + "epoch": 0.18809744064138142, + "grad_norm": 3.8811943531036377, + "learning_rate": 9.359166687331032e-06, + "loss": 1.0447, + "step": 2440 + }, + { + "epoch": 0.1881745297563984, + "grad_norm": 3.700079917907715, + "learning_rate": 9.35855505868688e-06, + "loss": 0.9359, + "step": 2441 + }, + { + "epoch": 0.18825161887141537, + "grad_norm": 4.782196521759033, + "learning_rate": 9.357943158309396e-06, + "loss": 1.1038, + "step": 2442 + }, + { + "epoch": 0.1883287079864323, + "grad_norm": 3.7085964679718018, + "learning_rate": 9.357330986236723e-06, + "loss": 1.0557, + "step": 2443 + }, + { + "epoch": 0.18840579710144928, + "grad_norm": 4.303460597991943, + "learning_rate": 9.356718542507032e-06, + "loss": 1.0125, + "step": 2444 + }, + { + "epoch": 0.18848288621646622, + "grad_norm": 3.9245944023132324, + "learning_rate": 9.356105827158505e-06, + "loss": 0.9875, + "step": 2445 + }, + { + "epoch": 0.1885599753314832, + "grad_norm": 4.061418533325195, + "learning_rate": 9.355492840229338e-06, + "loss": 1.0002, + "step": 2446 + }, + { + "epoch": 0.18863706444650016, + "grad_norm": 4.799796104431152, + "learning_rate": 9.354879581757753e-06, + "loss": 1.0555, + "step": 2447 + }, + { + "epoch": 0.1887141535615171, + "grad_norm": 3.832697629928589, + "learning_rate": 9.354266051781978e-06, + "loss": 1.1255, + "step": 2448 + }, + { + "epoch": 0.18879124267653408, + "grad_norm": 4.201719284057617, + "learning_rate": 9.353652250340268e-06, + "loss": 0.9662, + "step": 2449 + }, + { + "epoch": 0.18886833179155102, + "grad_norm": 3.8492846488952637, + "learning_rate": 9.353038177470886e-06, + "loss": 1.0201, + "step": 2450 + }, + { + "epoch": 0.188945420906568, + "grad_norm": 4.532588005065918, + "learning_rate": 9.35242383321212e-06, + "loss": 1.0201, + "step": 2451 + }, + { + "epoch": 0.18902251002158496, + "grad_norm": 4.5830206871032715, + "learning_rate": 9.35180921760227e-06, + "loss": 1.0741, + "step": 2452 + }, + { + "epoch": 0.1890995991366019, + "grad_norm": 4.1921868324279785, + "learning_rate": 9.351194330679653e-06, + "loss": 1.0388, + "step": 2453 + }, + { + "epoch": 0.18917668825161887, + "grad_norm": 3.923304796218872, + "learning_rate": 9.350579172482607e-06, + "loss": 1.0602, + "step": 2454 + }, + { + "epoch": 0.18925377736663584, + "grad_norm": 4.064269065856934, + "learning_rate": 9.349963743049479e-06, + "loss": 1.1611, + "step": 2455 + }, + { + "epoch": 0.1893308664816528, + "grad_norm": 4.29310417175293, + "learning_rate": 9.349348042418643e-06, + "loss": 0.9577, + "step": 2456 + }, + { + "epoch": 0.18940795559666976, + "grad_norm": 3.83111572265625, + "learning_rate": 9.348732070628482e-06, + "loss": 1.0647, + "step": 2457 + }, + { + "epoch": 0.1894850447116867, + "grad_norm": 4.015953063964844, + "learning_rate": 9.348115827717398e-06, + "loss": 1.0998, + "step": 2458 + }, + { + "epoch": 0.18956213382670367, + "grad_norm": 5.061834335327148, + "learning_rate": 9.347499313723811e-06, + "loss": 1.0665, + "step": 2459 + }, + { + "epoch": 0.18963922294172064, + "grad_norm": 3.875699043273926, + "learning_rate": 9.346882528686159e-06, + "loss": 1.0707, + "step": 2460 + }, + { + "epoch": 0.18971631205673758, + "grad_norm": 3.674454927444458, + "learning_rate": 9.346265472642895e-06, + "loss": 1.031, + "step": 2461 + }, + { + "epoch": 0.18979340117175456, + "grad_norm": 3.886066198348999, + "learning_rate": 9.34564814563249e-06, + "loss": 1.0683, + "step": 2462 + }, + { + "epoch": 0.1898704902867715, + "grad_norm": 3.8345463275909424, + "learning_rate": 9.345030547693426e-06, + "loss": 1.0606, + "step": 2463 + }, + { + "epoch": 0.18994757940178847, + "grad_norm": 4.000880718231201, + "learning_rate": 9.344412678864214e-06, + "loss": 1.1364, + "step": 2464 + }, + { + "epoch": 0.19002466851680544, + "grad_norm": 4.1079607009887695, + "learning_rate": 9.34379453918337e-06, + "loss": 1.1008, + "step": 2465 + }, + { + "epoch": 0.19010175763182238, + "grad_norm": 3.764862537384033, + "learning_rate": 9.343176128689434e-06, + "loss": 0.9816, + "step": 2466 + }, + { + "epoch": 0.19017884674683935, + "grad_norm": 4.217895984649658, + "learning_rate": 9.342557447420961e-06, + "loss": 0.9649, + "step": 2467 + }, + { + "epoch": 0.1902559358618563, + "grad_norm": 4.198402404785156, + "learning_rate": 9.34193849541652e-06, + "loss": 1.0241, + "step": 2468 + }, + { + "epoch": 0.19033302497687327, + "grad_norm": 3.5659427642822266, + "learning_rate": 9.341319272714704e-06, + "loss": 1.0963, + "step": 2469 + }, + { + "epoch": 0.19041011409189024, + "grad_norm": 4.269014358520508, + "learning_rate": 9.340699779354114e-06, + "loss": 1.1575, + "step": 2470 + }, + { + "epoch": 0.19048720320690718, + "grad_norm": 3.8241798877716064, + "learning_rate": 9.340080015373374e-06, + "loss": 1.0775, + "step": 2471 + }, + { + "epoch": 0.19056429232192415, + "grad_norm": 4.083940505981445, + "learning_rate": 9.339459980811122e-06, + "loss": 1.0798, + "step": 2472 + }, + { + "epoch": 0.1906413814369411, + "grad_norm": 3.9632468223571777, + "learning_rate": 9.338839675706017e-06, + "loss": 1.0339, + "step": 2473 + }, + { + "epoch": 0.19071847055195806, + "grad_norm": 3.6100375652313232, + "learning_rate": 9.338219100096728e-06, + "loss": 1.0676, + "step": 2474 + }, + { + "epoch": 0.19079555966697503, + "grad_norm": 3.7622032165527344, + "learning_rate": 9.337598254021947e-06, + "loss": 1.0779, + "step": 2475 + }, + { + "epoch": 0.19087264878199198, + "grad_norm": 3.8564085960388184, + "learning_rate": 9.33697713752038e-06, + "loss": 1.0088, + "step": 2476 + }, + { + "epoch": 0.19094973789700895, + "grad_norm": 4.029922962188721, + "learning_rate": 9.33635575063075e-06, + "loss": 1.1112, + "step": 2477 + }, + { + "epoch": 0.1910268270120259, + "grad_norm": 4.0595502853393555, + "learning_rate": 9.335734093391797e-06, + "loss": 0.9914, + "step": 2478 + }, + { + "epoch": 0.19110391612704286, + "grad_norm": 3.9760942459106445, + "learning_rate": 9.335112165842277e-06, + "loss": 1.0767, + "step": 2479 + }, + { + "epoch": 0.19118100524205983, + "grad_norm": 4.477503776550293, + "learning_rate": 9.334489968020968e-06, + "loss": 1.0201, + "step": 2480 + }, + { + "epoch": 0.19125809435707677, + "grad_norm": 4.229430198669434, + "learning_rate": 9.333867499966659e-06, + "loss": 1.0431, + "step": 2481 + }, + { + "epoch": 0.19133518347209374, + "grad_norm": 3.821248769760132, + "learning_rate": 9.333244761718157e-06, + "loss": 1.0613, + "step": 2482 + }, + { + "epoch": 0.1914122725871107, + "grad_norm": 3.8744101524353027, + "learning_rate": 9.332621753314285e-06, + "loss": 0.8911, + "step": 2483 + }, + { + "epoch": 0.19148936170212766, + "grad_norm": 3.607072114944458, + "learning_rate": 9.331998474793886e-06, + "loss": 0.9713, + "step": 2484 + }, + { + "epoch": 0.19156645081714463, + "grad_norm": 4.2475810050964355, + "learning_rate": 9.331374926195819e-06, + "loss": 1.076, + "step": 2485 + }, + { + "epoch": 0.19164353993216157, + "grad_norm": 4.018882751464844, + "learning_rate": 9.330751107558957e-06, + "loss": 1.0054, + "step": 2486 + }, + { + "epoch": 0.19172062904717854, + "grad_norm": 3.6863954067230225, + "learning_rate": 9.330127018922195e-06, + "loss": 0.9955, + "step": 2487 + }, + { + "epoch": 0.19179771816219549, + "grad_norm": 3.837610960006714, + "learning_rate": 9.329502660324437e-06, + "loss": 1.0766, + "step": 2488 + }, + { + "epoch": 0.19187480727721246, + "grad_norm": 3.6833620071411133, + "learning_rate": 9.328878031804613e-06, + "loss": 1.0074, + "step": 2489 + }, + { + "epoch": 0.19195189639222943, + "grad_norm": 3.849368095397949, + "learning_rate": 9.328253133401663e-06, + "loss": 1.0552, + "step": 2490 + }, + { + "epoch": 0.19202898550724637, + "grad_norm": 4.261739730834961, + "learning_rate": 9.327627965154547e-06, + "loss": 1.1149, + "step": 2491 + }, + { + "epoch": 0.19210607462226334, + "grad_norm": 5.0863938331604, + "learning_rate": 9.32700252710224e-06, + "loss": 1.1051, + "step": 2492 + }, + { + "epoch": 0.19218316373728028, + "grad_norm": 3.6857082843780518, + "learning_rate": 9.326376819283737e-06, + "loss": 1.0603, + "step": 2493 + }, + { + "epoch": 0.19226025285229725, + "grad_norm": 4.171791076660156, + "learning_rate": 9.325750841738046e-06, + "loss": 0.9976, + "step": 2494 + }, + { + "epoch": 0.19233734196731422, + "grad_norm": 3.735818386077881, + "learning_rate": 9.325124594504191e-06, + "loss": 1.0204, + "step": 2495 + }, + { + "epoch": 0.19241443108233117, + "grad_norm": 3.711371660232544, + "learning_rate": 9.32449807762122e-06, + "loss": 1.0821, + "step": 2496 + }, + { + "epoch": 0.19249152019734814, + "grad_norm": 3.6877596378326416, + "learning_rate": 9.323871291128192e-06, + "loss": 0.9228, + "step": 2497 + }, + { + "epoch": 0.1925686093123651, + "grad_norm": 4.088908672332764, + "learning_rate": 9.323244235064182e-06, + "loss": 1.0792, + "step": 2498 + }, + { + "epoch": 0.19264569842738205, + "grad_norm": 3.9644157886505127, + "learning_rate": 9.322616909468284e-06, + "loss": 1.0787, + "step": 2499 + }, + { + "epoch": 0.19272278754239902, + "grad_norm": 3.950051784515381, + "learning_rate": 9.321989314379609e-06, + "loss": 1.1276, + "step": 2500 + }, + { + "epoch": 0.19279987665741596, + "grad_norm": 4.185604095458984, + "learning_rate": 9.321361449837286e-06, + "loss": 1.0614, + "step": 2501 + }, + { + "epoch": 0.19287696577243293, + "grad_norm": 4.12734842300415, + "learning_rate": 9.320733315880455e-06, + "loss": 1.0167, + "step": 2502 + }, + { + "epoch": 0.1929540548874499, + "grad_norm": 3.2868363857269287, + "learning_rate": 9.320104912548281e-06, + "loss": 0.9841, + "step": 2503 + }, + { + "epoch": 0.19303114400246685, + "grad_norm": 3.8465094566345215, + "learning_rate": 9.319476239879941e-06, + "loss": 0.949, + "step": 2504 + }, + { + "epoch": 0.19310823311748382, + "grad_norm": 3.994518995285034, + "learning_rate": 9.318847297914627e-06, + "loss": 1.0177, + "step": 2505 + }, + { + "epoch": 0.19318532223250076, + "grad_norm": 4.2856268882751465, + "learning_rate": 9.318218086691553e-06, + "loss": 1.1933, + "step": 2506 + }, + { + "epoch": 0.19326241134751773, + "grad_norm": 3.8407466411590576, + "learning_rate": 9.317588606249945e-06, + "loss": 1.0947, + "step": 2507 + }, + { + "epoch": 0.1933395004625347, + "grad_norm": 3.695981979370117, + "learning_rate": 9.316958856629049e-06, + "loss": 1.0531, + "step": 2508 + }, + { + "epoch": 0.19341658957755165, + "grad_norm": 4.270875453948975, + "learning_rate": 9.316328837868126e-06, + "loss": 0.8991, + "step": 2509 + }, + { + "epoch": 0.19349367869256862, + "grad_norm": 4.000987529754639, + "learning_rate": 9.315698550006456e-06, + "loss": 1.0195, + "step": 2510 + }, + { + "epoch": 0.19357076780758556, + "grad_norm": 3.681183099746704, + "learning_rate": 9.315067993083331e-06, + "loss": 1.0773, + "step": 2511 + }, + { + "epoch": 0.19364785692260253, + "grad_norm": 5.025501728057861, + "learning_rate": 9.314437167138065e-06, + "loss": 1.1425, + "step": 2512 + }, + { + "epoch": 0.1937249460376195, + "grad_norm": 3.879424810409546, + "learning_rate": 9.31380607220999e-06, + "loss": 1.0285, + "step": 2513 + }, + { + "epoch": 0.19380203515263644, + "grad_norm": 4.076712608337402, + "learning_rate": 9.313174708338446e-06, + "loss": 1.0066, + "step": 2514 + }, + { + "epoch": 0.1938791242676534, + "grad_norm": 3.7475578784942627, + "learning_rate": 9.312543075562797e-06, + "loss": 1.019, + "step": 2515 + }, + { + "epoch": 0.19395621338267036, + "grad_norm": 3.9618778228759766, + "learning_rate": 9.311911173922423e-06, + "loss": 1.0797, + "step": 2516 + }, + { + "epoch": 0.19403330249768733, + "grad_norm": 4.08378267288208, + "learning_rate": 9.311279003456719e-06, + "loss": 1.0427, + "step": 2517 + }, + { + "epoch": 0.1941103916127043, + "grad_norm": 3.681499719619751, + "learning_rate": 9.3106465642051e-06, + "loss": 1.0812, + "step": 2518 + }, + { + "epoch": 0.19418748072772124, + "grad_norm": 3.9023890495300293, + "learning_rate": 9.31001385620699e-06, + "loss": 1.0172, + "step": 2519 + }, + { + "epoch": 0.1942645698427382, + "grad_norm": 3.5285422801971436, + "learning_rate": 9.309380879501839e-06, + "loss": 1.0496, + "step": 2520 + }, + { + "epoch": 0.19434165895775515, + "grad_norm": 3.923809051513672, + "learning_rate": 9.30874763412911e-06, + "loss": 1.0218, + "step": 2521 + }, + { + "epoch": 0.19441874807277212, + "grad_norm": 3.9072353839874268, + "learning_rate": 9.30811412012828e-06, + "loss": 1.05, + "step": 2522 + }, + { + "epoch": 0.1944958371877891, + "grad_norm": 3.4843831062316895, + "learning_rate": 9.307480337538847e-06, + "loss": 1.0038, + "step": 2523 + }, + { + "epoch": 0.19457292630280604, + "grad_norm": 3.578922748565674, + "learning_rate": 9.306846286400326e-06, + "loss": 1.0023, + "step": 2524 + }, + { + "epoch": 0.194650015417823, + "grad_norm": 3.625309705734253, + "learning_rate": 9.306211966752243e-06, + "loss": 1.0317, + "step": 2525 + }, + { + "epoch": 0.19472710453283995, + "grad_norm": 3.810683012008667, + "learning_rate": 9.305577378634148e-06, + "loss": 1.0276, + "step": 2526 + }, + { + "epoch": 0.19480419364785692, + "grad_norm": 4.362732887268066, + "learning_rate": 9.3049425220856e-06, + "loss": 1.0026, + "step": 2527 + }, + { + "epoch": 0.1948812827628739, + "grad_norm": 3.8706610202789307, + "learning_rate": 9.304307397146184e-06, + "loss": 0.9928, + "step": 2528 + }, + { + "epoch": 0.19495837187789083, + "grad_norm": 3.865088701248169, + "learning_rate": 9.303672003855495e-06, + "loss": 1.0645, + "step": 2529 + }, + { + "epoch": 0.1950354609929078, + "grad_norm": 3.7114272117614746, + "learning_rate": 9.303036342253143e-06, + "loss": 1.003, + "step": 2530 + }, + { + "epoch": 0.19511255010792475, + "grad_norm": 3.746708631515503, + "learning_rate": 9.302400412378762e-06, + "loss": 0.9651, + "step": 2531 + }, + { + "epoch": 0.19518963922294172, + "grad_norm": 4.280477046966553, + "learning_rate": 9.301764214272e-06, + "loss": 0.9485, + "step": 2532 + }, + { + "epoch": 0.1952667283379587, + "grad_norm": 3.7413394451141357, + "learning_rate": 9.301127747972516e-06, + "loss": 1.0762, + "step": 2533 + }, + { + "epoch": 0.19534381745297563, + "grad_norm": 4.122095584869385, + "learning_rate": 9.300491013519995e-06, + "loss": 1.0784, + "step": 2534 + }, + { + "epoch": 0.1954209065679926, + "grad_norm": 3.7376046180725098, + "learning_rate": 9.29985401095413e-06, + "loss": 1.0385, + "step": 2535 + }, + { + "epoch": 0.19549799568300955, + "grad_norm": 4.303701877593994, + "learning_rate": 9.299216740314639e-06, + "loss": 1.0439, + "step": 2536 + }, + { + "epoch": 0.19557508479802652, + "grad_norm": 4.109951019287109, + "learning_rate": 9.298579201641248e-06, + "loss": 1.0927, + "step": 2537 + }, + { + "epoch": 0.1956521739130435, + "grad_norm": 4.093001365661621, + "learning_rate": 9.29794139497371e-06, + "loss": 1.0977, + "step": 2538 + }, + { + "epoch": 0.19572926302806043, + "grad_norm": 3.4555115699768066, + "learning_rate": 9.297303320351784e-06, + "loss": 0.9951, + "step": 2539 + }, + { + "epoch": 0.1958063521430774, + "grad_norm": 3.6072354316711426, + "learning_rate": 9.296664977815251e-06, + "loss": 0.9743, + "step": 2540 + }, + { + "epoch": 0.19588344125809437, + "grad_norm": 3.506932020187378, + "learning_rate": 9.296026367403912e-06, + "loss": 0.9706, + "step": 2541 + }, + { + "epoch": 0.1959605303731113, + "grad_norm": 4.099440097808838, + "learning_rate": 9.295387489157577e-06, + "loss": 1.0866, + "step": 2542 + }, + { + "epoch": 0.19603761948812828, + "grad_norm": 3.744800090789795, + "learning_rate": 9.29474834311608e-06, + "loss": 0.9554, + "step": 2543 + }, + { + "epoch": 0.19611470860314523, + "grad_norm": 3.71545672416687, + "learning_rate": 9.294108929319266e-06, + "loss": 1.0344, + "step": 2544 + }, + { + "epoch": 0.1961917977181622, + "grad_norm": 4.267796516418457, + "learning_rate": 9.293469247806999e-06, + "loss": 1.0125, + "step": 2545 + }, + { + "epoch": 0.19626888683317917, + "grad_norm": 3.7793099880218506, + "learning_rate": 9.292829298619161e-06, + "loss": 1.0101, + "step": 2546 + }, + { + "epoch": 0.1963459759481961, + "grad_norm": 3.366549015045166, + "learning_rate": 9.292189081795651e-06, + "loss": 0.9683, + "step": 2547 + }, + { + "epoch": 0.19642306506321308, + "grad_norm": 3.89920973777771, + "learning_rate": 9.291548597376382e-06, + "loss": 1.0093, + "step": 2548 + }, + { + "epoch": 0.19650015417823002, + "grad_norm": 3.8635799884796143, + "learning_rate": 9.290907845401283e-06, + "loss": 1.0173, + "step": 2549 + }, + { + "epoch": 0.196577243293247, + "grad_norm": 3.8625593185424805, + "learning_rate": 9.2902668259103e-06, + "loss": 1.0594, + "step": 2550 + }, + { + "epoch": 0.19665433240826397, + "grad_norm": 3.8342230319976807, + "learning_rate": 9.289625538943405e-06, + "loss": 0.9789, + "step": 2551 + }, + { + "epoch": 0.1967314215232809, + "grad_norm": 3.684582233428955, + "learning_rate": 9.288983984540573e-06, + "loss": 0.8711, + "step": 2552 + }, + { + "epoch": 0.19680851063829788, + "grad_norm": 4.02480936050415, + "learning_rate": 9.288342162741803e-06, + "loss": 0.9812, + "step": 2553 + }, + { + "epoch": 0.19688559975331482, + "grad_norm": 3.966688871383667, + "learning_rate": 9.287700073587106e-06, + "loss": 0.9101, + "step": 2554 + }, + { + "epoch": 0.1969626888683318, + "grad_norm": 3.733137607574463, + "learning_rate": 9.28705771711652e-06, + "loss": 1.0469, + "step": 2555 + }, + { + "epoch": 0.19703977798334876, + "grad_norm": 4.436910629272461, + "learning_rate": 9.286415093370087e-06, + "loss": 1.0353, + "step": 2556 + }, + { + "epoch": 0.1971168670983657, + "grad_norm": 4.32921028137207, + "learning_rate": 9.285772202387872e-06, + "loss": 1.2546, + "step": 2557 + }, + { + "epoch": 0.19719395621338268, + "grad_norm": 4.157625675201416, + "learning_rate": 9.285129044209958e-06, + "loss": 1.144, + "step": 2558 + }, + { + "epoch": 0.19727104532839962, + "grad_norm": 4.597263813018799, + "learning_rate": 9.284485618876442e-06, + "loss": 1.1378, + "step": 2559 + }, + { + "epoch": 0.1973481344434166, + "grad_norm": 3.756883382797241, + "learning_rate": 9.283841926427435e-06, + "loss": 0.993, + "step": 2560 + }, + { + "epoch": 0.19742522355843356, + "grad_norm": 3.802865982055664, + "learning_rate": 9.283197966903073e-06, + "loss": 1.0976, + "step": 2561 + }, + { + "epoch": 0.1975023126734505, + "grad_norm": 3.97096848487854, + "learning_rate": 9.282553740343501e-06, + "loss": 1.0863, + "step": 2562 + }, + { + "epoch": 0.19757940178846747, + "grad_norm": 3.7261781692504883, + "learning_rate": 9.28190924678888e-06, + "loss": 1.1109, + "step": 2563 + }, + { + "epoch": 0.19765649090348442, + "grad_norm": 3.8818700313568115, + "learning_rate": 9.281264486279398e-06, + "loss": 1.1454, + "step": 2564 + }, + { + "epoch": 0.1977335800185014, + "grad_norm": 3.9310739040374756, + "learning_rate": 9.280619458855246e-06, + "loss": 1.0731, + "step": 2565 + }, + { + "epoch": 0.19781066913351836, + "grad_norm": 4.061331272125244, + "learning_rate": 9.279974164556643e-06, + "loss": 0.9986, + "step": 2566 + }, + { + "epoch": 0.1978877582485353, + "grad_norm": 3.422016143798828, + "learning_rate": 9.279328603423815e-06, + "loss": 1.0443, + "step": 2567 + }, + { + "epoch": 0.19796484736355227, + "grad_norm": 4.034252643585205, + "learning_rate": 9.278682775497012e-06, + "loss": 1.0879, + "step": 2568 + }, + { + "epoch": 0.1980419364785692, + "grad_norm": 4.055642127990723, + "learning_rate": 9.278036680816497e-06, + "loss": 1.1014, + "step": 2569 + }, + { + "epoch": 0.19811902559358618, + "grad_norm": 4.593109607696533, + "learning_rate": 9.277390319422555e-06, + "loss": 1.0524, + "step": 2570 + }, + { + "epoch": 0.19819611470860315, + "grad_norm": 4.03125524520874, + "learning_rate": 9.276743691355476e-06, + "loss": 1.0198, + "step": 2571 + }, + { + "epoch": 0.1982732038236201, + "grad_norm": 3.723781108856201, + "learning_rate": 9.27609679665558e-06, + "loss": 0.9915, + "step": 2572 + }, + { + "epoch": 0.19835029293863707, + "grad_norm": 3.6599931716918945, + "learning_rate": 9.275449635363195e-06, + "loss": 1.0793, + "step": 2573 + }, + { + "epoch": 0.198427382053654, + "grad_norm": 3.8307621479034424, + "learning_rate": 9.274802207518668e-06, + "loss": 1.0124, + "step": 2574 + }, + { + "epoch": 0.19850447116867098, + "grad_norm": 3.984130620956421, + "learning_rate": 9.274154513162364e-06, + "loss": 0.9356, + "step": 2575 + }, + { + "epoch": 0.19858156028368795, + "grad_norm": 3.825328826904297, + "learning_rate": 9.27350655233466e-06, + "loss": 1.0427, + "step": 2576 + }, + { + "epoch": 0.1986586493987049, + "grad_norm": 4.283047199249268, + "learning_rate": 9.272858325075957e-06, + "loss": 1.0344, + "step": 2577 + }, + { + "epoch": 0.19873573851372187, + "grad_norm": 3.8551671504974365, + "learning_rate": 9.272209831426667e-06, + "loss": 1.022, + "step": 2578 + }, + { + "epoch": 0.1988128276287388, + "grad_norm": 3.6993603706359863, + "learning_rate": 9.271561071427222e-06, + "loss": 0.9627, + "step": 2579 + }, + { + "epoch": 0.19888991674375578, + "grad_norm": 3.686549186706543, + "learning_rate": 9.270912045118065e-06, + "loss": 1.1547, + "step": 2580 + }, + { + "epoch": 0.19896700585877275, + "grad_norm": 3.775413990020752, + "learning_rate": 9.270262752539662e-06, + "loss": 1.1395, + "step": 2581 + }, + { + "epoch": 0.1990440949737897, + "grad_norm": 3.9551045894622803, + "learning_rate": 9.269613193732492e-06, + "loss": 1.05, + "step": 2582 + }, + { + "epoch": 0.19912118408880666, + "grad_norm": 3.7017507553100586, + "learning_rate": 9.268963368737053e-06, + "loss": 1.0631, + "step": 2583 + }, + { + "epoch": 0.19919827320382363, + "grad_norm": 4.082302093505859, + "learning_rate": 9.268313277593859e-06, + "loss": 1.177, + "step": 2584 + }, + { + "epoch": 0.19927536231884058, + "grad_norm": 3.7110402584075928, + "learning_rate": 9.267662920343436e-06, + "loss": 1.0281, + "step": 2585 + }, + { + "epoch": 0.19935245143385755, + "grad_norm": 4.061951160430908, + "learning_rate": 9.267012297026334e-06, + "loss": 1.0825, + "step": 2586 + }, + { + "epoch": 0.1994295405488745, + "grad_norm": 3.88759708404541, + "learning_rate": 9.266361407683115e-06, + "loss": 1.0694, + "step": 2587 + }, + { + "epoch": 0.19950662966389146, + "grad_norm": 4.016077518463135, + "learning_rate": 9.265710252354358e-06, + "loss": 1.0718, + "step": 2588 + }, + { + "epoch": 0.19958371877890843, + "grad_norm": 3.7587389945983887, + "learning_rate": 9.26505883108066e-06, + "loss": 1.0342, + "step": 2589 + }, + { + "epoch": 0.19966080789392537, + "grad_norm": 3.6690351963043213, + "learning_rate": 9.264407143902632e-06, + "loss": 1.057, + "step": 2590 + }, + { + "epoch": 0.19973789700894234, + "grad_norm": 3.946889638900757, + "learning_rate": 9.263755190860904e-06, + "loss": 1.0364, + "step": 2591 + }, + { + "epoch": 0.1998149861239593, + "grad_norm": 3.688429832458496, + "learning_rate": 9.263102971996123e-06, + "loss": 1.1029, + "step": 2592 + }, + { + "epoch": 0.19989207523897626, + "grad_norm": 3.6999905109405518, + "learning_rate": 9.262450487348952e-06, + "loss": 1.0277, + "step": 2593 + }, + { + "epoch": 0.19996916435399323, + "grad_norm": 3.4279160499572754, + "learning_rate": 9.261797736960067e-06, + "loss": 0.9504, + "step": 2594 + }, + { + "epoch": 0.20004625346901017, + "grad_norm": 3.8050403594970703, + "learning_rate": 9.261144720870167e-06, + "loss": 0.9475, + "step": 2595 + }, + { + "epoch": 0.20012334258402714, + "grad_norm": 4.1190009117126465, + "learning_rate": 9.260491439119963e-06, + "loss": 1.1579, + "step": 2596 + }, + { + "epoch": 0.20020043169904408, + "grad_norm": 3.654827117919922, + "learning_rate": 9.259837891750183e-06, + "loss": 0.9909, + "step": 2597 + }, + { + "epoch": 0.20027752081406106, + "grad_norm": 3.997933864593506, + "learning_rate": 9.259184078801571e-06, + "loss": 1.1038, + "step": 2598 + }, + { + "epoch": 0.20035460992907803, + "grad_norm": 3.865818738937378, + "learning_rate": 9.258530000314893e-06, + "loss": 0.9885, + "step": 2599 + }, + { + "epoch": 0.20043169904409497, + "grad_norm": 3.9201407432556152, + "learning_rate": 9.257875656330923e-06, + "loss": 1.1399, + "step": 2600 + }, + { + "epoch": 0.20050878815911194, + "grad_norm": 3.548663377761841, + "learning_rate": 9.257221046890459e-06, + "loss": 1.0436, + "step": 2601 + }, + { + "epoch": 0.20058587727412888, + "grad_norm": 4.565245628356934, + "learning_rate": 9.256566172034312e-06, + "loss": 1.0835, + "step": 2602 + }, + { + "epoch": 0.20066296638914585, + "grad_norm": 3.7430996894836426, + "learning_rate": 9.255911031803308e-06, + "loss": 1.1998, + "step": 2603 + }, + { + "epoch": 0.20074005550416282, + "grad_norm": 3.844200611114502, + "learning_rate": 9.255255626238295e-06, + "loss": 1.0305, + "step": 2604 + }, + { + "epoch": 0.20081714461917977, + "grad_norm": 3.844482898712158, + "learning_rate": 9.25459995538013e-06, + "loss": 1.1108, + "step": 2605 + }, + { + "epoch": 0.20089423373419674, + "grad_norm": 4.113783836364746, + "learning_rate": 9.253944019269695e-06, + "loss": 0.9527, + "step": 2606 + }, + { + "epoch": 0.20097132284921368, + "grad_norm": 3.4817748069763184, + "learning_rate": 9.25328781794788e-06, + "loss": 0.9548, + "step": 2607 + }, + { + "epoch": 0.20104841196423065, + "grad_norm": 3.778887987136841, + "learning_rate": 9.2526313514556e-06, + "loss": 1.0612, + "step": 2608 + }, + { + "epoch": 0.20112550107924762, + "grad_norm": 4.771206855773926, + "learning_rate": 9.251974619833779e-06, + "loss": 1.1353, + "step": 2609 + }, + { + "epoch": 0.20120259019426456, + "grad_norm": 3.8691132068634033, + "learning_rate": 9.251317623123363e-06, + "loss": 1.0599, + "step": 2610 + }, + { + "epoch": 0.20127967930928153, + "grad_norm": 4.0417070388793945, + "learning_rate": 9.25066036136531e-06, + "loss": 1.0021, + "step": 2611 + }, + { + "epoch": 0.20135676842429848, + "grad_norm": 4.274367809295654, + "learning_rate": 9.2500028346006e-06, + "loss": 1.0906, + "step": 2612 + }, + { + "epoch": 0.20143385753931545, + "grad_norm": 3.9163665771484375, + "learning_rate": 9.249345042870222e-06, + "loss": 0.9961, + "step": 2613 + }, + { + "epoch": 0.20151094665433242, + "grad_norm": 4.183355808258057, + "learning_rate": 9.248686986215189e-06, + "loss": 1.1059, + "step": 2614 + }, + { + "epoch": 0.20158803576934936, + "grad_norm": 3.863225221633911, + "learning_rate": 9.248028664676529e-06, + "loss": 1.0913, + "step": 2615 + }, + { + "epoch": 0.20166512488436633, + "grad_norm": 4.189373016357422, + "learning_rate": 9.24737007829528e-06, + "loss": 1.094, + "step": 2616 + }, + { + "epoch": 0.20174221399938327, + "grad_norm": 3.7555601596832275, + "learning_rate": 9.246711227112509e-06, + "loss": 1.0754, + "step": 2617 + }, + { + "epoch": 0.20181930311440024, + "grad_norm": 3.700612783432007, + "learning_rate": 9.246052111169283e-06, + "loss": 0.989, + "step": 2618 + }, + { + "epoch": 0.20189639222941722, + "grad_norm": 4.023221015930176, + "learning_rate": 9.2453927305067e-06, + "loss": 1.0806, + "step": 2619 + }, + { + "epoch": 0.20197348134443416, + "grad_norm": 3.949838161468506, + "learning_rate": 9.244733085165868e-06, + "loss": 1.0044, + "step": 2620 + }, + { + "epoch": 0.20205057045945113, + "grad_norm": 4.30818510055542, + "learning_rate": 9.244073175187912e-06, + "loss": 1.0475, + "step": 2621 + }, + { + "epoch": 0.20212765957446807, + "grad_norm": 4.544628620147705, + "learning_rate": 9.243413000613974e-06, + "loss": 1.0492, + "step": 2622 + }, + { + "epoch": 0.20220474868948504, + "grad_norm": 3.771693706512451, + "learning_rate": 9.242752561485213e-06, + "loss": 1.0835, + "step": 2623 + }, + { + "epoch": 0.202281837804502, + "grad_norm": 3.973714590072632, + "learning_rate": 9.242091857842803e-06, + "loss": 1.0409, + "step": 2624 + }, + { + "epoch": 0.20235892691951896, + "grad_norm": 4.235926628112793, + "learning_rate": 9.241430889727936e-06, + "loss": 1.1188, + "step": 2625 + }, + { + "epoch": 0.20243601603453593, + "grad_norm": 4.404877185821533, + "learning_rate": 9.24076965718182e-06, + "loss": 0.9393, + "step": 2626 + }, + { + "epoch": 0.2025131051495529, + "grad_norm": 3.5260396003723145, + "learning_rate": 9.24010816024568e-06, + "loss": 0.9963, + "step": 2627 + }, + { + "epoch": 0.20259019426456984, + "grad_norm": 3.725531816482544, + "learning_rate": 9.239446398960756e-06, + "loss": 0.9743, + "step": 2628 + }, + { + "epoch": 0.2026672833795868, + "grad_norm": 3.668250799179077, + "learning_rate": 9.238784373368306e-06, + "loss": 1.0292, + "step": 2629 + }, + { + "epoch": 0.20274437249460375, + "grad_norm": 4.116795539855957, + "learning_rate": 9.238122083509602e-06, + "loss": 1.1924, + "step": 2630 + }, + { + "epoch": 0.20282146160962072, + "grad_norm": 3.9489383697509766, + "learning_rate": 9.237459529425938e-06, + "loss": 1.0364, + "step": 2631 + }, + { + "epoch": 0.2028985507246377, + "grad_norm": 3.3951849937438965, + "learning_rate": 9.236796711158617e-06, + "loss": 0.9725, + "step": 2632 + }, + { + "epoch": 0.20297563983965464, + "grad_norm": 3.6997361183166504, + "learning_rate": 9.236133628748965e-06, + "loss": 1.0287, + "step": 2633 + }, + { + "epoch": 0.2030527289546716, + "grad_norm": 3.9400417804718018, + "learning_rate": 9.235470282238322e-06, + "loss": 1.1658, + "step": 2634 + }, + { + "epoch": 0.20312981806968855, + "grad_norm": 4.31356954574585, + "learning_rate": 9.23480667166804e-06, + "loss": 1.0993, + "step": 2635 + }, + { + "epoch": 0.20320690718470552, + "grad_norm": 3.9322855472564697, + "learning_rate": 9.234142797079496e-06, + "loss": 0.9019, + "step": 2636 + }, + { + "epoch": 0.2032839962997225, + "grad_norm": 4.056495666503906, + "learning_rate": 9.233478658514078e-06, + "loss": 1.0006, + "step": 2637 + }, + { + "epoch": 0.20336108541473943, + "grad_norm": 3.7447257041931152, + "learning_rate": 9.232814256013192e-06, + "loss": 1.009, + "step": 2638 + }, + { + "epoch": 0.2034381745297564, + "grad_norm": 3.8734138011932373, + "learning_rate": 9.232149589618257e-06, + "loss": 0.9408, + "step": 2639 + }, + { + "epoch": 0.20351526364477335, + "grad_norm": 3.8473434448242188, + "learning_rate": 9.231484659370717e-06, + "loss": 1.0765, + "step": 2640 + }, + { + "epoch": 0.20359235275979032, + "grad_norm": 3.7293701171875, + "learning_rate": 9.230819465312022e-06, + "loss": 1.1238, + "step": 2641 + }, + { + "epoch": 0.2036694418748073, + "grad_norm": 3.6699728965759277, + "learning_rate": 9.230154007483646e-06, + "loss": 0.968, + "step": 2642 + }, + { + "epoch": 0.20374653098982423, + "grad_norm": 3.8388984203338623, + "learning_rate": 9.229488285927077e-06, + "loss": 0.9983, + "step": 2643 + }, + { + "epoch": 0.2038236201048412, + "grad_norm": 4.062560558319092, + "learning_rate": 9.228822300683817e-06, + "loss": 1.0297, + "step": 2644 + }, + { + "epoch": 0.20390070921985815, + "grad_norm": 3.8813700675964355, + "learning_rate": 9.228156051795388e-06, + "loss": 1.1485, + "step": 2645 + }, + { + "epoch": 0.20397779833487512, + "grad_norm": 3.9767556190490723, + "learning_rate": 9.227489539303329e-06, + "loss": 0.9742, + "step": 2646 + }, + { + "epoch": 0.2040548874498921, + "grad_norm": 4.134043216705322, + "learning_rate": 9.22682276324919e-06, + "loss": 1.0414, + "step": 2647 + }, + { + "epoch": 0.20413197656490903, + "grad_norm": 3.7752439975738525, + "learning_rate": 9.226155723674543e-06, + "loss": 0.9868, + "step": 2648 + }, + { + "epoch": 0.204209065679926, + "grad_norm": 3.694692373275757, + "learning_rate": 9.225488420620977e-06, + "loss": 1.116, + "step": 2649 + }, + { + "epoch": 0.20428615479494294, + "grad_norm": 3.801795721054077, + "learning_rate": 9.22482085413009e-06, + "loss": 1.0275, + "step": 2650 + }, + { + "epoch": 0.2043632439099599, + "grad_norm": 3.7427256107330322, + "learning_rate": 9.224153024243505e-06, + "loss": 1.1208, + "step": 2651 + }, + { + "epoch": 0.20444033302497688, + "grad_norm": 4.123345375061035, + "learning_rate": 9.223484931002856e-06, + "loss": 1.0278, + "step": 2652 + }, + { + "epoch": 0.20451742213999383, + "grad_norm": 3.854743003845215, + "learning_rate": 9.222816574449796e-06, + "loss": 1.1139, + "step": 2653 + }, + { + "epoch": 0.2045945112550108, + "grad_norm": 3.620626449584961, + "learning_rate": 9.222147954625992e-06, + "loss": 1.03, + "step": 2654 + }, + { + "epoch": 0.20467160037002774, + "grad_norm": 3.9541471004486084, + "learning_rate": 9.22147907157313e-06, + "loss": 1.0944, + "step": 2655 + }, + { + "epoch": 0.2047486894850447, + "grad_norm": 4.048750400543213, + "learning_rate": 9.220809925332911e-06, + "loss": 1.08, + "step": 2656 + }, + { + "epoch": 0.20482577860006168, + "grad_norm": 3.7369632720947266, + "learning_rate": 9.220140515947056e-06, + "loss": 0.989, + "step": 2657 + }, + { + "epoch": 0.20490286771507862, + "grad_norm": 4.2197489738464355, + "learning_rate": 9.219470843457294e-06, + "loss": 1.1959, + "step": 2658 + }, + { + "epoch": 0.2049799568300956, + "grad_norm": 3.825822591781616, + "learning_rate": 9.218800907905379e-06, + "loss": 1.026, + "step": 2659 + }, + { + "epoch": 0.20505704594511254, + "grad_norm": 3.7242703437805176, + "learning_rate": 9.218130709333077e-06, + "loss": 1.0186, + "step": 2660 + }, + { + "epoch": 0.2051341350601295, + "grad_norm": 3.669600248336792, + "learning_rate": 9.217460247782174e-06, + "loss": 1.0269, + "step": 2661 + }, + { + "epoch": 0.20521122417514648, + "grad_norm": 4.220530986785889, + "learning_rate": 9.216789523294462e-06, + "loss": 1.1483, + "step": 2662 + }, + { + "epoch": 0.20528831329016342, + "grad_norm": 3.4208221435546875, + "learning_rate": 9.216118535911766e-06, + "loss": 0.8748, + "step": 2663 + }, + { + "epoch": 0.2053654024051804, + "grad_norm": 4.243244647979736, + "learning_rate": 9.215447285675916e-06, + "loss": 1.18, + "step": 2664 + }, + { + "epoch": 0.20544249152019733, + "grad_norm": 3.644279718399048, + "learning_rate": 9.214775772628759e-06, + "loss": 1.042, + "step": 2665 + }, + { + "epoch": 0.2055195806352143, + "grad_norm": 4.032573223114014, + "learning_rate": 9.21410399681216e-06, + "loss": 1.1426, + "step": 2666 + }, + { + "epoch": 0.20559666975023128, + "grad_norm": 3.6652822494506836, + "learning_rate": 9.213431958268004e-06, + "loss": 0.9583, + "step": 2667 + }, + { + "epoch": 0.20567375886524822, + "grad_norm": 3.9682374000549316, + "learning_rate": 9.212759657038186e-06, + "loss": 0.9783, + "step": 2668 + }, + { + "epoch": 0.2057508479802652, + "grad_norm": 3.6445670127868652, + "learning_rate": 9.212087093164624e-06, + "loss": 0.9417, + "step": 2669 + }, + { + "epoch": 0.20582793709528216, + "grad_norm": 3.758375406265259, + "learning_rate": 9.211414266689245e-06, + "loss": 1.0221, + "step": 2670 + }, + { + "epoch": 0.2059050262102991, + "grad_norm": 4.022796154022217, + "learning_rate": 9.210741177653997e-06, + "loss": 1.107, + "step": 2671 + }, + { + "epoch": 0.20598211532531607, + "grad_norm": 3.7436466217041016, + "learning_rate": 9.210067826100845e-06, + "loss": 1.0894, + "step": 2672 + }, + { + "epoch": 0.20605920444033302, + "grad_norm": 4.940155982971191, + "learning_rate": 9.20939421207177e-06, + "loss": 1.0682, + "step": 2673 + }, + { + "epoch": 0.20613629355535, + "grad_norm": 3.8470208644866943, + "learning_rate": 9.208720335608767e-06, + "loss": 1.0735, + "step": 2674 + }, + { + "epoch": 0.20621338267036696, + "grad_norm": 3.703685998916626, + "learning_rate": 9.208046196753848e-06, + "loss": 1.0533, + "step": 2675 + }, + { + "epoch": 0.2062904717853839, + "grad_norm": 4.3979291915893555, + "learning_rate": 9.207371795549043e-06, + "loss": 1.2048, + "step": 2676 + }, + { + "epoch": 0.20636756090040087, + "grad_norm": 3.9959652423858643, + "learning_rate": 9.206697132036395e-06, + "loss": 1.01, + "step": 2677 + }, + { + "epoch": 0.2064446500154178, + "grad_norm": 3.741591453552246, + "learning_rate": 9.206022206257969e-06, + "loss": 0.971, + "step": 2678 + }, + { + "epoch": 0.20652173913043478, + "grad_norm": 3.790801763534546, + "learning_rate": 9.205347018255844e-06, + "loss": 1.2011, + "step": 2679 + }, + { + "epoch": 0.20659882824545175, + "grad_norm": 4.108438491821289, + "learning_rate": 9.20467156807211e-06, + "loss": 1.0247, + "step": 2680 + }, + { + "epoch": 0.2066759173604687, + "grad_norm": 3.6132218837738037, + "learning_rate": 9.203995855748882e-06, + "loss": 1.0329, + "step": 2681 + }, + { + "epoch": 0.20675300647548567, + "grad_norm": 3.6847808361053467, + "learning_rate": 9.203319881328284e-06, + "loss": 1.0839, + "step": 2682 + }, + { + "epoch": 0.2068300955905026, + "grad_norm": 3.7520806789398193, + "learning_rate": 9.202643644852461e-06, + "loss": 1.0236, + "step": 2683 + }, + { + "epoch": 0.20690718470551958, + "grad_norm": 3.8004398345947266, + "learning_rate": 9.201967146363572e-06, + "loss": 1.0043, + "step": 2684 + }, + { + "epoch": 0.20698427382053655, + "grad_norm": 3.612555503845215, + "learning_rate": 9.201290385903796e-06, + "loss": 1.0641, + "step": 2685 + }, + { + "epoch": 0.2070613629355535, + "grad_norm": 3.6692962646484375, + "learning_rate": 9.200613363515325e-06, + "loss": 0.9243, + "step": 2686 + }, + { + "epoch": 0.20713845205057047, + "grad_norm": 3.6443002223968506, + "learning_rate": 9.199936079240364e-06, + "loss": 1.0226, + "step": 2687 + }, + { + "epoch": 0.2072155411655874, + "grad_norm": 4.001389503479004, + "learning_rate": 9.19925853312114e-06, + "loss": 1.092, + "step": 2688 + }, + { + "epoch": 0.20729263028060438, + "grad_norm": 4.218007564544678, + "learning_rate": 9.198580725199894e-06, + "loss": 1.0681, + "step": 2689 + }, + { + "epoch": 0.20736971939562135, + "grad_norm": 4.421291351318359, + "learning_rate": 9.197902655518887e-06, + "loss": 1.092, + "step": 2690 + }, + { + "epoch": 0.2074468085106383, + "grad_norm": 3.6270670890808105, + "learning_rate": 9.197224324120391e-06, + "loss": 0.9676, + "step": 2691 + }, + { + "epoch": 0.20752389762565526, + "grad_norm": 3.7732226848602295, + "learning_rate": 9.196545731046695e-06, + "loss": 0.9676, + "step": 2692 + }, + { + "epoch": 0.2076009867406722, + "grad_norm": 4.031553268432617, + "learning_rate": 9.195866876340107e-06, + "loss": 1.0286, + "step": 2693 + }, + { + "epoch": 0.20767807585568918, + "grad_norm": 3.7905490398406982, + "learning_rate": 9.195187760042952e-06, + "loss": 1.0421, + "step": 2694 + }, + { + "epoch": 0.20775516497070615, + "grad_norm": 3.822108745574951, + "learning_rate": 9.194508382197566e-06, + "loss": 1.0981, + "step": 2695 + }, + { + "epoch": 0.2078322540857231, + "grad_norm": 3.774367332458496, + "learning_rate": 9.193828742846307e-06, + "loss": 1.0733, + "step": 2696 + }, + { + "epoch": 0.20790934320074006, + "grad_norm": 4.311622619628906, + "learning_rate": 9.193148842031548e-06, + "loss": 0.9735, + "step": 2697 + }, + { + "epoch": 0.207986432315757, + "grad_norm": 3.389493465423584, + "learning_rate": 9.192468679795671e-06, + "loss": 0.9759, + "step": 2698 + }, + { + "epoch": 0.20806352143077397, + "grad_norm": 4.131453990936279, + "learning_rate": 9.19178825618109e-06, + "loss": 1.011, + "step": 2699 + }, + { + "epoch": 0.20814061054579094, + "grad_norm": 3.75632643699646, + "learning_rate": 9.191107571230217e-06, + "loss": 0.9215, + "step": 2700 + }, + { + "epoch": 0.2082176996608079, + "grad_norm": 3.789395332336426, + "learning_rate": 9.190426624985497e-06, + "loss": 1.1335, + "step": 2701 + }, + { + "epoch": 0.20829478877582486, + "grad_norm": 4.15241003036499, + "learning_rate": 9.189745417489378e-06, + "loss": 1.1109, + "step": 2702 + }, + { + "epoch": 0.2083718778908418, + "grad_norm": 3.8731484413146973, + "learning_rate": 9.189063948784332e-06, + "loss": 0.9326, + "step": 2703 + }, + { + "epoch": 0.20844896700585877, + "grad_norm": 3.67352294921875, + "learning_rate": 9.188382218912844e-06, + "loss": 1.058, + "step": 2704 + }, + { + "epoch": 0.20852605612087574, + "grad_norm": 3.72332501411438, + "learning_rate": 9.187700227917415e-06, + "loss": 1.0898, + "step": 2705 + }, + { + "epoch": 0.20860314523589268, + "grad_norm": 3.6140379905700684, + "learning_rate": 9.187017975840568e-06, + "loss": 1.0351, + "step": 2706 + }, + { + "epoch": 0.20868023435090965, + "grad_norm": 3.6925249099731445, + "learning_rate": 9.186335462724834e-06, + "loss": 1.0802, + "step": 2707 + }, + { + "epoch": 0.2087573234659266, + "grad_norm": 3.8140275478363037, + "learning_rate": 9.185652688612766e-06, + "loss": 1.0149, + "step": 2708 + }, + { + "epoch": 0.20883441258094357, + "grad_norm": 3.920686960220337, + "learning_rate": 9.184969653546932e-06, + "loss": 1.1498, + "step": 2709 + }, + { + "epoch": 0.20891150169596054, + "grad_norm": 3.6589865684509277, + "learning_rate": 9.184286357569913e-06, + "loss": 0.9951, + "step": 2710 + }, + { + "epoch": 0.20898859081097748, + "grad_norm": 3.9114480018615723, + "learning_rate": 9.183602800724311e-06, + "loss": 1.1772, + "step": 2711 + }, + { + "epoch": 0.20906567992599445, + "grad_norm": 3.792837381362915, + "learning_rate": 9.182918983052743e-06, + "loss": 0.9794, + "step": 2712 + }, + { + "epoch": 0.20914276904101142, + "grad_norm": 3.940286636352539, + "learning_rate": 9.182234904597838e-06, + "loss": 1.0731, + "step": 2713 + }, + { + "epoch": 0.20921985815602837, + "grad_norm": 3.8480448722839355, + "learning_rate": 9.181550565402248e-06, + "loss": 1.1191, + "step": 2714 + }, + { + "epoch": 0.20929694727104534, + "grad_norm": 3.821777820587158, + "learning_rate": 9.180865965508638e-06, + "loss": 0.9818, + "step": 2715 + }, + { + "epoch": 0.20937403638606228, + "grad_norm": 4.0068511962890625, + "learning_rate": 9.180181104959686e-06, + "loss": 1.0953, + "step": 2716 + }, + { + "epoch": 0.20945112550107925, + "grad_norm": 3.940507650375366, + "learning_rate": 9.179495983798094e-06, + "loss": 1.07, + "step": 2717 + }, + { + "epoch": 0.20952821461609622, + "grad_norm": 4.213906288146973, + "learning_rate": 9.178810602066575e-06, + "loss": 1.0311, + "step": 2718 + }, + { + "epoch": 0.20960530373111316, + "grad_norm": 3.6800248622894287, + "learning_rate": 9.178124959807854e-06, + "loss": 0.9295, + "step": 2719 + }, + { + "epoch": 0.20968239284613013, + "grad_norm": 3.5638949871063232, + "learning_rate": 9.177439057064684e-06, + "loss": 0.9917, + "step": 2720 + }, + { + "epoch": 0.20975948196114708, + "grad_norm": 4.166075229644775, + "learning_rate": 9.17675289387982e-06, + "loss": 1.0279, + "step": 2721 + }, + { + "epoch": 0.20983657107616405, + "grad_norm": 3.7445201873779297, + "learning_rate": 9.17606647029605e-06, + "loss": 1.0408, + "step": 2722 + }, + { + "epoch": 0.20991366019118102, + "grad_norm": 3.632784128189087, + "learning_rate": 9.175379786356162e-06, + "loss": 1.0679, + "step": 2723 + }, + { + "epoch": 0.20999074930619796, + "grad_norm": 3.969790458679199, + "learning_rate": 9.174692842102968e-06, + "loss": 1.1247, + "step": 2724 + }, + { + "epoch": 0.21006783842121493, + "grad_norm": 3.7024993896484375, + "learning_rate": 9.174005637579297e-06, + "loss": 1.0884, + "step": 2725 + }, + { + "epoch": 0.21014492753623187, + "grad_norm": 4.002373695373535, + "learning_rate": 9.173318172827994e-06, + "loss": 1.1234, + "step": 2726 + }, + { + "epoch": 0.21022201665124884, + "grad_norm": 3.7659733295440674, + "learning_rate": 9.172630447891915e-06, + "loss": 1.0724, + "step": 2727 + }, + { + "epoch": 0.21029910576626581, + "grad_norm": 4.004013538360596, + "learning_rate": 9.171942462813938e-06, + "loss": 1.0574, + "step": 2728 + }, + { + "epoch": 0.21037619488128276, + "grad_norm": 3.9406075477600098, + "learning_rate": 9.171254217636958e-06, + "loss": 1.1462, + "step": 2729 + }, + { + "epoch": 0.21045328399629973, + "grad_norm": 4.120555877685547, + "learning_rate": 9.17056571240388e-06, + "loss": 1.0335, + "step": 2730 + }, + { + "epoch": 0.21053037311131667, + "grad_norm": 3.955146312713623, + "learning_rate": 9.169876947157628e-06, + "loss": 1.0698, + "step": 2731 + }, + { + "epoch": 0.21060746222633364, + "grad_norm": 4.344998836517334, + "learning_rate": 9.169187921941148e-06, + "loss": 1.1751, + "step": 2732 + }, + { + "epoch": 0.2106845513413506, + "grad_norm": 4.642431259155273, + "learning_rate": 9.16849863679739e-06, + "loss": 1.1332, + "step": 2733 + }, + { + "epoch": 0.21076164045636755, + "grad_norm": 3.788193464279175, + "learning_rate": 9.167809091769334e-06, + "loss": 1.0431, + "step": 2734 + }, + { + "epoch": 0.21083872957138453, + "grad_norm": 3.8150315284729004, + "learning_rate": 9.167119286899965e-06, + "loss": 1.1172, + "step": 2735 + }, + { + "epoch": 0.21091581868640147, + "grad_norm": 4.12971305847168, + "learning_rate": 9.16642922223229e-06, + "loss": 1.0524, + "step": 2736 + }, + { + "epoch": 0.21099290780141844, + "grad_norm": 3.7752163410186768, + "learning_rate": 9.165738897809335e-06, + "loss": 1.0584, + "step": 2737 + }, + { + "epoch": 0.2110699969164354, + "grad_norm": 4.05067253112793, + "learning_rate": 9.165048313674131e-06, + "loss": 1.1037, + "step": 2738 + }, + { + "epoch": 0.21114708603145235, + "grad_norm": 4.372398376464844, + "learning_rate": 9.164357469869737e-06, + "loss": 1.0896, + "step": 2739 + }, + { + "epoch": 0.21122417514646932, + "grad_norm": 3.3885154724121094, + "learning_rate": 9.163666366439223e-06, + "loss": 0.8929, + "step": 2740 + }, + { + "epoch": 0.21130126426148627, + "grad_norm": 3.9432756900787354, + "learning_rate": 9.162975003425676e-06, + "loss": 1.0253, + "step": 2741 + }, + { + "epoch": 0.21137835337650324, + "grad_norm": 3.852996587753296, + "learning_rate": 9.162283380872197e-06, + "loss": 0.9896, + "step": 2742 + }, + { + "epoch": 0.2114554424915202, + "grad_norm": 3.6689140796661377, + "learning_rate": 9.161591498821907e-06, + "loss": 1.1041, + "step": 2743 + }, + { + "epoch": 0.21153253160653715, + "grad_norm": 4.0420403480529785, + "learning_rate": 9.16089935731794e-06, + "loss": 1.0409, + "step": 2744 + }, + { + "epoch": 0.21160962072155412, + "grad_norm": 3.9778621196746826, + "learning_rate": 9.160206956403448e-06, + "loss": 1.0178, + "step": 2745 + }, + { + "epoch": 0.21168670983657106, + "grad_norm": 4.711827278137207, + "learning_rate": 9.1595142961216e-06, + "loss": 0.9995, + "step": 2746 + }, + { + "epoch": 0.21176379895158803, + "grad_norm": 3.6355984210968018, + "learning_rate": 9.158821376515574e-06, + "loss": 1.0407, + "step": 2747 + }, + { + "epoch": 0.211840888066605, + "grad_norm": 4.1716203689575195, + "learning_rate": 9.158128197628578e-06, + "loss": 0.9243, + "step": 2748 + }, + { + "epoch": 0.21191797718162195, + "grad_norm": 3.8837966918945312, + "learning_rate": 9.157434759503823e-06, + "loss": 1.0692, + "step": 2749 + }, + { + "epoch": 0.21199506629663892, + "grad_norm": 4.1938700675964355, + "learning_rate": 9.156741062184543e-06, + "loss": 1.0286, + "step": 2750 + }, + { + "epoch": 0.21207215541165586, + "grad_norm": 3.5614426136016846, + "learning_rate": 9.156047105713986e-06, + "loss": 1.0052, + "step": 2751 + }, + { + "epoch": 0.21214924452667283, + "grad_norm": 3.653048276901245, + "learning_rate": 9.155352890135417e-06, + "loss": 1.0898, + "step": 2752 + }, + { + "epoch": 0.2122263336416898, + "grad_norm": 3.959932327270508, + "learning_rate": 9.154658415492116e-06, + "loss": 1.1145, + "step": 2753 + }, + { + "epoch": 0.21230342275670674, + "grad_norm": 3.750103235244751, + "learning_rate": 9.153963681827382e-06, + "loss": 1.018, + "step": 2754 + }, + { + "epoch": 0.21238051187172372, + "grad_norm": 4.643415451049805, + "learning_rate": 9.153268689184524e-06, + "loss": 1.0096, + "step": 2755 + }, + { + "epoch": 0.21245760098674069, + "grad_norm": 4.112432956695557, + "learning_rate": 9.152573437606874e-06, + "loss": 1.1197, + "step": 2756 + }, + { + "epoch": 0.21253469010175763, + "grad_norm": 3.4916539192199707, + "learning_rate": 9.151877927137778e-06, + "loss": 1.0737, + "step": 2757 + }, + { + "epoch": 0.2126117792167746, + "grad_norm": 3.6449830532073975, + "learning_rate": 9.151182157820595e-06, + "loss": 1.031, + "step": 2758 + }, + { + "epoch": 0.21268886833179154, + "grad_norm": 3.4803693294525146, + "learning_rate": 9.150486129698706e-06, + "loss": 1.0208, + "step": 2759 + }, + { + "epoch": 0.2127659574468085, + "grad_norm": 4.427255153656006, + "learning_rate": 9.1497898428155e-06, + "loss": 1.1607, + "step": 2760 + }, + { + "epoch": 0.21284304656182548, + "grad_norm": 3.903338670730591, + "learning_rate": 9.149093297214392e-06, + "loss": 1.0678, + "step": 2761 + }, + { + "epoch": 0.21292013567684243, + "grad_norm": 3.8415660858154297, + "learning_rate": 9.148396492938806e-06, + "loss": 1.0445, + "step": 2762 + }, + { + "epoch": 0.2129972247918594, + "grad_norm": 4.833527565002441, + "learning_rate": 9.14769943003218e-06, + "loss": 1.2442, + "step": 2763 + }, + { + "epoch": 0.21307431390687634, + "grad_norm": 3.9709668159484863, + "learning_rate": 9.14700210853798e-06, + "loss": 1.0287, + "step": 2764 + }, + { + "epoch": 0.2131514030218933, + "grad_norm": 4.086493015289307, + "learning_rate": 9.146304528499674e-06, + "loss": 0.9555, + "step": 2765 + }, + { + "epoch": 0.21322849213691028, + "grad_norm": 3.9784085750579834, + "learning_rate": 9.145606689960756e-06, + "loss": 1.0339, + "step": 2766 + }, + { + "epoch": 0.21330558125192722, + "grad_norm": 4.03312873840332, + "learning_rate": 9.144908592964732e-06, + "loss": 1.0068, + "step": 2767 + }, + { + "epoch": 0.2133826703669442, + "grad_norm": 4.233692646026611, + "learning_rate": 9.144210237555124e-06, + "loss": 1.1303, + "step": 2768 + }, + { + "epoch": 0.21345975948196114, + "grad_norm": 4.059010982513428, + "learning_rate": 9.143511623775469e-06, + "loss": 0.9926, + "step": 2769 + }, + { + "epoch": 0.2135368485969781, + "grad_norm": 3.532331705093384, + "learning_rate": 9.142812751669327e-06, + "loss": 0.9392, + "step": 2770 + }, + { + "epoch": 0.21361393771199508, + "grad_norm": 3.3265960216522217, + "learning_rate": 9.142113621280265e-06, + "loss": 0.9405, + "step": 2771 + }, + { + "epoch": 0.21369102682701202, + "grad_norm": 3.408034324645996, + "learning_rate": 9.14141423265187e-06, + "loss": 0.9525, + "step": 2772 + }, + { + "epoch": 0.213768115942029, + "grad_norm": 3.924441337585449, + "learning_rate": 9.140714585827748e-06, + "loss": 1.0772, + "step": 2773 + }, + { + "epoch": 0.21384520505704593, + "grad_norm": 3.8719756603240967, + "learning_rate": 9.140014680851516e-06, + "loss": 0.9904, + "step": 2774 + }, + { + "epoch": 0.2139222941720629, + "grad_norm": 3.7581069469451904, + "learning_rate": 9.139314517766811e-06, + "loss": 1.0082, + "step": 2775 + }, + { + "epoch": 0.21399938328707988, + "grad_norm": 3.611703872680664, + "learning_rate": 9.138614096617285e-06, + "loss": 0.9756, + "step": 2776 + }, + { + "epoch": 0.21407647240209682, + "grad_norm": 3.581341505050659, + "learning_rate": 9.137913417446603e-06, + "loss": 1.0092, + "step": 2777 + }, + { + "epoch": 0.2141535615171138, + "grad_norm": 3.8926100730895996, + "learning_rate": 9.137212480298451e-06, + "loss": 1.0584, + "step": 2778 + }, + { + "epoch": 0.21423065063213073, + "grad_norm": 3.6705634593963623, + "learning_rate": 9.136511285216527e-06, + "loss": 1.0492, + "step": 2779 + }, + { + "epoch": 0.2143077397471477, + "grad_norm": 3.6244938373565674, + "learning_rate": 9.13580983224455e-06, + "loss": 0.9325, + "step": 2780 + }, + { + "epoch": 0.21438482886216467, + "grad_norm": 3.5282609462738037, + "learning_rate": 9.135108121426247e-06, + "loss": 0.9749, + "step": 2781 + }, + { + "epoch": 0.21446191797718162, + "grad_norm": 4.567509651184082, + "learning_rate": 9.13440615280537e-06, + "loss": 1.0813, + "step": 2782 + }, + { + "epoch": 0.21453900709219859, + "grad_norm": 3.8714842796325684, + "learning_rate": 9.133703926425683e-06, + "loss": 1.0343, + "step": 2783 + }, + { + "epoch": 0.21461609620721553, + "grad_norm": 3.845766305923462, + "learning_rate": 9.133001442330964e-06, + "loss": 1.0322, + "step": 2784 + }, + { + "epoch": 0.2146931853222325, + "grad_norm": 3.7517282962799072, + "learning_rate": 9.132298700565011e-06, + "loss": 1.0577, + "step": 2785 + }, + { + "epoch": 0.21477027443724947, + "grad_norm": 3.8736910820007324, + "learning_rate": 9.131595701171636e-06, + "loss": 1.0193, + "step": 2786 + }, + { + "epoch": 0.2148473635522664, + "grad_norm": 3.8819291591644287, + "learning_rate": 9.130892444194666e-06, + "loss": 1.0163, + "step": 2787 + }, + { + "epoch": 0.21492445266728338, + "grad_norm": 3.802039623260498, + "learning_rate": 9.130188929677948e-06, + "loss": 1.0257, + "step": 2788 + }, + { + "epoch": 0.21500154178230033, + "grad_norm": 4.080732822418213, + "learning_rate": 9.129485157665339e-06, + "loss": 0.9556, + "step": 2789 + }, + { + "epoch": 0.2150786308973173, + "grad_norm": 3.748866081237793, + "learning_rate": 9.12878112820072e-06, + "loss": 0.9364, + "step": 2790 + }, + { + "epoch": 0.21515572001233427, + "grad_norm": 3.8058340549468994, + "learning_rate": 9.12807684132798e-06, + "loss": 0.9971, + "step": 2791 + }, + { + "epoch": 0.2152328091273512, + "grad_norm": 4.344945430755615, + "learning_rate": 9.127372297091028e-06, + "loss": 1.1081, + "step": 2792 + }, + { + "epoch": 0.21530989824236818, + "grad_norm": 4.168917655944824, + "learning_rate": 9.126667495533791e-06, + "loss": 1.0641, + "step": 2793 + }, + { + "epoch": 0.21538698735738512, + "grad_norm": 3.46258282661438, + "learning_rate": 9.125962436700207e-06, + "loss": 0.9893, + "step": 2794 + }, + { + "epoch": 0.2154640764724021, + "grad_norm": 3.5184786319732666, + "learning_rate": 9.125257120634234e-06, + "loss": 1.0043, + "step": 2795 + }, + { + "epoch": 0.21554116558741906, + "grad_norm": 3.7548696994781494, + "learning_rate": 9.124551547379846e-06, + "loss": 0.9768, + "step": 2796 + }, + { + "epoch": 0.215618254702436, + "grad_norm": 3.801421642303467, + "learning_rate": 9.12384571698103e-06, + "loss": 0.9912, + "step": 2797 + }, + { + "epoch": 0.21569534381745298, + "grad_norm": 4.047616004943848, + "learning_rate": 9.123139629481792e-06, + "loss": 1.0559, + "step": 2798 + }, + { + "epoch": 0.21577243293246995, + "grad_norm": 3.8471899032592773, + "learning_rate": 9.12243328492615e-06, + "loss": 1.099, + "step": 2799 + }, + { + "epoch": 0.2158495220474869, + "grad_norm": 3.7033841609954834, + "learning_rate": 9.121726683358146e-06, + "loss": 1.0111, + "step": 2800 + }, + { + "epoch": 0.21592661116250386, + "grad_norm": 4.303826808929443, + "learning_rate": 9.12101982482183e-06, + "loss": 1.025, + "step": 2801 + }, + { + "epoch": 0.2160037002775208, + "grad_norm": 3.404069662094116, + "learning_rate": 9.120312709361271e-06, + "loss": 0.9853, + "step": 2802 + }, + { + "epoch": 0.21608078939253778, + "grad_norm": 3.566990613937378, + "learning_rate": 9.119605337020554e-06, + "loss": 1.0534, + "step": 2803 + }, + { + "epoch": 0.21615787850755475, + "grad_norm": 4.058454990386963, + "learning_rate": 9.118897707843779e-06, + "loss": 0.9761, + "step": 2804 + }, + { + "epoch": 0.2162349676225717, + "grad_norm": 3.7274980545043945, + "learning_rate": 9.118189821875066e-06, + "loss": 0.9726, + "step": 2805 + }, + { + "epoch": 0.21631205673758866, + "grad_norm": 3.776718854904175, + "learning_rate": 9.117481679158546e-06, + "loss": 1.0489, + "step": 2806 + }, + { + "epoch": 0.2163891458526056, + "grad_norm": 3.99607253074646, + "learning_rate": 9.116773279738367e-06, + "loss": 0.9758, + "step": 2807 + }, + { + "epoch": 0.21646623496762257, + "grad_norm": 3.691988945007324, + "learning_rate": 9.116064623658696e-06, + "loss": 1.0635, + "step": 2808 + }, + { + "epoch": 0.21654332408263954, + "grad_norm": 4.369837284088135, + "learning_rate": 9.115355710963714e-06, + "loss": 1.1056, + "step": 2809 + }, + { + "epoch": 0.2166204131976565, + "grad_norm": 3.6378722190856934, + "learning_rate": 9.114646541697617e-06, + "loss": 0.8665, + "step": 2810 + }, + { + "epoch": 0.21669750231267346, + "grad_norm": 3.6210851669311523, + "learning_rate": 9.113937115904618e-06, + "loss": 0.9098, + "step": 2811 + }, + { + "epoch": 0.2167745914276904, + "grad_norm": 3.999722480773926, + "learning_rate": 9.113227433628948e-06, + "loss": 0.9969, + "step": 2812 + }, + { + "epoch": 0.21685168054270737, + "grad_norm": 3.984334707260132, + "learning_rate": 9.112517494914847e-06, + "loss": 1.0185, + "step": 2813 + }, + { + "epoch": 0.21692876965772434, + "grad_norm": 3.8469057083129883, + "learning_rate": 9.111807299806584e-06, + "loss": 1.0389, + "step": 2814 + }, + { + "epoch": 0.21700585877274128, + "grad_norm": 4.206716537475586, + "learning_rate": 9.111096848348429e-06, + "loss": 1.0314, + "step": 2815 + }, + { + "epoch": 0.21708294788775825, + "grad_norm": 4.628408908843994, + "learning_rate": 9.110386140584677e-06, + "loss": 1.0994, + "step": 2816 + }, + { + "epoch": 0.2171600370027752, + "grad_norm": 3.524751663208008, + "learning_rate": 9.109675176559639e-06, + "loss": 0.9583, + "step": 2817 + }, + { + "epoch": 0.21723712611779217, + "grad_norm": 3.7516028881073, + "learning_rate": 9.108963956317635e-06, + "loss": 1.0813, + "step": 2818 + }, + { + "epoch": 0.21731421523280914, + "grad_norm": 4.167080879211426, + "learning_rate": 9.108252479903012e-06, + "loss": 1.0686, + "step": 2819 + }, + { + "epoch": 0.21739130434782608, + "grad_norm": 4.301987648010254, + "learning_rate": 9.107540747360124e-06, + "loss": 1.0793, + "step": 2820 + }, + { + "epoch": 0.21746839346284305, + "grad_norm": 4.356029987335205, + "learning_rate": 9.106828758733344e-06, + "loss": 1.0524, + "step": 2821 + }, + { + "epoch": 0.21754548257786, + "grad_norm": 3.900385618209839, + "learning_rate": 9.10611651406706e-06, + "loss": 1.0539, + "step": 2822 + }, + { + "epoch": 0.21762257169287696, + "grad_norm": 3.5353188514709473, + "learning_rate": 9.105404013405677e-06, + "loss": 1.0595, + "step": 2823 + }, + { + "epoch": 0.21769966080789394, + "grad_norm": 3.6671149730682373, + "learning_rate": 9.104691256793618e-06, + "loss": 1.1179, + "step": 2824 + }, + { + "epoch": 0.21777674992291088, + "grad_norm": 3.817493200302124, + "learning_rate": 9.103978244275316e-06, + "loss": 0.9962, + "step": 2825 + }, + { + "epoch": 0.21785383903792785, + "grad_norm": 3.781205654144287, + "learning_rate": 9.103264975895227e-06, + "loss": 1.0873, + "step": 2826 + }, + { + "epoch": 0.2179309281529448, + "grad_norm": 3.8384957313537598, + "learning_rate": 9.102551451697816e-06, + "loss": 1.0985, + "step": 2827 + }, + { + "epoch": 0.21800801726796176, + "grad_norm": 3.9920754432678223, + "learning_rate": 9.10183767172757e-06, + "loss": 1.1307, + "step": 2828 + }, + { + "epoch": 0.21808510638297873, + "grad_norm": 3.888017416000366, + "learning_rate": 9.101123636028993e-06, + "loss": 1.0162, + "step": 2829 + }, + { + "epoch": 0.21816219549799568, + "grad_norm": 3.8230643272399902, + "learning_rate": 9.100409344646593e-06, + "loss": 1.0684, + "step": 2830 + }, + { + "epoch": 0.21823928461301265, + "grad_norm": 3.7765209674835205, + "learning_rate": 9.09969479762491e-06, + "loss": 0.9862, + "step": 2831 + }, + { + "epoch": 0.2183163737280296, + "grad_norm": 3.8649239540100098, + "learning_rate": 9.098979995008486e-06, + "loss": 1.058, + "step": 2832 + }, + { + "epoch": 0.21839346284304656, + "grad_norm": 3.9235241413116455, + "learning_rate": 9.098264936841891e-06, + "loss": 1.0378, + "step": 2833 + }, + { + "epoch": 0.21847055195806353, + "grad_norm": 4.208407878875732, + "learning_rate": 9.097549623169701e-06, + "loss": 1.0499, + "step": 2834 + }, + { + "epoch": 0.21854764107308047, + "grad_norm": 3.320283889770508, + "learning_rate": 9.096834054036516e-06, + "loss": 0.9359, + "step": 2835 + }, + { + "epoch": 0.21862473018809744, + "grad_norm": 4.07008695602417, + "learning_rate": 9.096118229486945e-06, + "loss": 1.193, + "step": 2836 + }, + { + "epoch": 0.2187018193031144, + "grad_norm": 3.860478162765503, + "learning_rate": 9.095402149565615e-06, + "loss": 0.9158, + "step": 2837 + }, + { + "epoch": 0.21877890841813136, + "grad_norm": 4.054520606994629, + "learning_rate": 9.094685814317174e-06, + "loss": 1.0763, + "step": 2838 + }, + { + "epoch": 0.21885599753314833, + "grad_norm": 3.9474873542785645, + "learning_rate": 9.093969223786277e-06, + "loss": 1.0329, + "step": 2839 + }, + { + "epoch": 0.21893308664816527, + "grad_norm": 3.73187255859375, + "learning_rate": 9.093252378017604e-06, + "loss": 1.0298, + "step": 2840 + }, + { + "epoch": 0.21901017576318224, + "grad_norm": 4.097843647003174, + "learning_rate": 9.092535277055845e-06, + "loss": 1.1312, + "step": 2841 + }, + { + "epoch": 0.2190872648781992, + "grad_norm": 3.9080405235290527, + "learning_rate": 9.091817920945704e-06, + "loss": 0.9503, + "step": 2842 + }, + { + "epoch": 0.21916435399321615, + "grad_norm": 3.772008180618286, + "learning_rate": 9.09110030973191e-06, + "loss": 0.9951, + "step": 2843 + }, + { + "epoch": 0.21924144310823312, + "grad_norm": 3.8021061420440674, + "learning_rate": 9.090382443459201e-06, + "loss": 1.068, + "step": 2844 + }, + { + "epoch": 0.21931853222325007, + "grad_norm": 3.884033203125, + "learning_rate": 9.089664322172331e-06, + "loss": 0.9915, + "step": 2845 + }, + { + "epoch": 0.21939562133826704, + "grad_norm": 3.7119226455688477, + "learning_rate": 9.08894594591607e-06, + "loss": 0.9613, + "step": 2846 + }, + { + "epoch": 0.219472710453284, + "grad_norm": 3.494401454925537, + "learning_rate": 9.088227314735208e-06, + "loss": 0.912, + "step": 2847 + }, + { + "epoch": 0.21954979956830095, + "grad_norm": 4.098977088928223, + "learning_rate": 9.087508428674546e-06, + "loss": 0.9818, + "step": 2848 + }, + { + "epoch": 0.21962688868331792, + "grad_norm": 3.923501491546631, + "learning_rate": 9.086789287778902e-06, + "loss": 0.9673, + "step": 2849 + }, + { + "epoch": 0.21970397779833487, + "grad_norm": 3.787045955657959, + "learning_rate": 9.086069892093114e-06, + "loss": 1.0714, + "step": 2850 + }, + { + "epoch": 0.21978106691335184, + "grad_norm": 4.722983360290527, + "learning_rate": 9.085350241662028e-06, + "loss": 1.0963, + "step": 2851 + }, + { + "epoch": 0.2198581560283688, + "grad_norm": 3.732903242111206, + "learning_rate": 9.084630336530515e-06, + "loss": 1.0903, + "step": 2852 + }, + { + "epoch": 0.21993524514338575, + "grad_norm": 4.350831508636475, + "learning_rate": 9.083910176743455e-06, + "loss": 1.1121, + "step": 2853 + }, + { + "epoch": 0.22001233425840272, + "grad_norm": 3.895369291305542, + "learning_rate": 9.083189762345746e-06, + "loss": 0.9028, + "step": 2854 + }, + { + "epoch": 0.22008942337341966, + "grad_norm": 4.156214237213135, + "learning_rate": 9.082469093382303e-06, + "loss": 1.0989, + "step": 2855 + }, + { + "epoch": 0.22016651248843663, + "grad_norm": 3.565054416656494, + "learning_rate": 9.081748169898054e-06, + "loss": 1.0293, + "step": 2856 + }, + { + "epoch": 0.2202436016034536, + "grad_norm": 4.331825256347656, + "learning_rate": 9.08102699193795e-06, + "loss": 1.1025, + "step": 2857 + }, + { + "epoch": 0.22032069071847055, + "grad_norm": 5.1282734870910645, + "learning_rate": 9.080305559546947e-06, + "loss": 0.9675, + "step": 2858 + }, + { + "epoch": 0.22039777983348752, + "grad_norm": 4.076694011688232, + "learning_rate": 9.079583872770025e-06, + "loss": 1.0403, + "step": 2859 + }, + { + "epoch": 0.22047486894850446, + "grad_norm": 3.9768247604370117, + "learning_rate": 9.078861931652178e-06, + "loss": 1.0653, + "step": 2860 + }, + { + "epoch": 0.22055195806352143, + "grad_norm": 3.875684976577759, + "learning_rate": 9.078139736238414e-06, + "loss": 0.8698, + "step": 2861 + }, + { + "epoch": 0.2206290471785384, + "grad_norm": 3.808002471923828, + "learning_rate": 9.077417286573759e-06, + "loss": 1.0765, + "step": 2862 + }, + { + "epoch": 0.22070613629355534, + "grad_norm": 3.7562291622161865, + "learning_rate": 9.076694582703252e-06, + "loss": 0.9728, + "step": 2863 + }, + { + "epoch": 0.22078322540857231, + "grad_norm": 3.816787004470825, + "learning_rate": 9.075971624671953e-06, + "loss": 1.1007, + "step": 2864 + }, + { + "epoch": 0.22086031452358926, + "grad_norm": 3.6228833198547363, + "learning_rate": 9.075248412524932e-06, + "loss": 0.9763, + "step": 2865 + }, + { + "epoch": 0.22093740363860623, + "grad_norm": 3.993579864501953, + "learning_rate": 9.074524946307282e-06, + "loss": 0.9982, + "step": 2866 + }, + { + "epoch": 0.2210144927536232, + "grad_norm": 3.9125168323516846, + "learning_rate": 9.0738012260641e-06, + "loss": 1.015, + "step": 2867 + }, + { + "epoch": 0.22109158186864014, + "grad_norm": 3.7729082107543945, + "learning_rate": 9.073077251840514e-06, + "loss": 0.8864, + "step": 2868 + }, + { + "epoch": 0.2211686709836571, + "grad_norm": 3.5397660732269287, + "learning_rate": 9.072353023681653e-06, + "loss": 0.9667, + "step": 2869 + }, + { + "epoch": 0.22124576009867405, + "grad_norm": 4.714186668395996, + "learning_rate": 9.071628541632675e-06, + "loss": 0.8968, + "step": 2870 + }, + { + "epoch": 0.22132284921369103, + "grad_norm": 4.182320594787598, + "learning_rate": 9.070903805738744e-06, + "loss": 1.1579, + "step": 2871 + }, + { + "epoch": 0.221399938328708, + "grad_norm": 3.612896203994751, + "learning_rate": 9.070178816045043e-06, + "loss": 1.0278, + "step": 2872 + }, + { + "epoch": 0.22147702744372494, + "grad_norm": 3.8569085597991943, + "learning_rate": 9.069453572596774e-06, + "loss": 1.0517, + "step": 2873 + }, + { + "epoch": 0.2215541165587419, + "grad_norm": 3.7985715866088867, + "learning_rate": 9.068728075439153e-06, + "loss": 1.112, + "step": 2874 + }, + { + "epoch": 0.22163120567375885, + "grad_norm": 3.7914130687713623, + "learning_rate": 9.068002324617407e-06, + "loss": 1.1309, + "step": 2875 + }, + { + "epoch": 0.22170829478877582, + "grad_norm": 3.6983115673065186, + "learning_rate": 9.067276320176783e-06, + "loss": 1.0458, + "step": 2876 + }, + { + "epoch": 0.2217853839037928, + "grad_norm": 3.6084189414978027, + "learning_rate": 9.066550062162547e-06, + "loss": 0.9881, + "step": 2877 + }, + { + "epoch": 0.22186247301880974, + "grad_norm": 3.6229233741760254, + "learning_rate": 9.065823550619976e-06, + "loss": 0.9464, + "step": 2878 + }, + { + "epoch": 0.2219395621338267, + "grad_norm": 3.949673652648926, + "learning_rate": 9.065096785594364e-06, + "loss": 1.0257, + "step": 2879 + }, + { + "epoch": 0.22201665124884365, + "grad_norm": 3.5906119346618652, + "learning_rate": 9.064369767131021e-06, + "loss": 0.9943, + "step": 2880 + }, + { + "epoch": 0.22209374036386062, + "grad_norm": 3.8634934425354004, + "learning_rate": 9.063642495275273e-06, + "loss": 1.015, + "step": 2881 + }, + { + "epoch": 0.2221708294788776, + "grad_norm": 4.381479740142822, + "learning_rate": 9.062914970072463e-06, + "loss": 1.1069, + "step": 2882 + }, + { + "epoch": 0.22224791859389453, + "grad_norm": 3.475149393081665, + "learning_rate": 9.062187191567947e-06, + "loss": 1.0361, + "step": 2883 + }, + { + "epoch": 0.2223250077089115, + "grad_norm": 3.652864694595337, + "learning_rate": 9.061459159807096e-06, + "loss": 0.9466, + "step": 2884 + }, + { + "epoch": 0.22240209682392847, + "grad_norm": 3.7307932376861572, + "learning_rate": 9.060730874835301e-06, + "loss": 1.0076, + "step": 2885 + }, + { + "epoch": 0.22247918593894542, + "grad_norm": 4.080811977386475, + "learning_rate": 9.060002336697968e-06, + "loss": 1.0577, + "step": 2886 + }, + { + "epoch": 0.2225562750539624, + "grad_norm": 3.7193477153778076, + "learning_rate": 9.059273545440516e-06, + "loss": 1.1106, + "step": 2887 + }, + { + "epoch": 0.22263336416897933, + "grad_norm": 3.971679925918579, + "learning_rate": 9.058544501108383e-06, + "loss": 1.0908, + "step": 2888 + }, + { + "epoch": 0.2227104532839963, + "grad_norm": 3.5091354846954346, + "learning_rate": 9.05781520374702e-06, + "loss": 0.8429, + "step": 2889 + }, + { + "epoch": 0.22278754239901327, + "grad_norm": 3.8452577590942383, + "learning_rate": 9.057085653401896e-06, + "loss": 1.0097, + "step": 2890 + }, + { + "epoch": 0.22286463151403021, + "grad_norm": 3.8790271282196045, + "learning_rate": 9.056355850118492e-06, + "loss": 1.1066, + "step": 2891 + }, + { + "epoch": 0.22294172062904719, + "grad_norm": 4.002335071563721, + "learning_rate": 9.055625793942308e-06, + "loss": 1.0349, + "step": 2892 + }, + { + "epoch": 0.22301880974406413, + "grad_norm": 3.5299620628356934, + "learning_rate": 9.054895484918863e-06, + "loss": 0.9838, + "step": 2893 + }, + { + "epoch": 0.2230958988590811, + "grad_norm": 4.129887104034424, + "learning_rate": 9.054164923093684e-06, + "loss": 1.1289, + "step": 2894 + }, + { + "epoch": 0.22317298797409807, + "grad_norm": 3.8371896743774414, + "learning_rate": 9.05343410851232e-06, + "loss": 0.9709, + "step": 2895 + }, + { + "epoch": 0.223250077089115, + "grad_norm": 3.9022574424743652, + "learning_rate": 9.052703041220332e-06, + "loss": 1.043, + "step": 2896 + }, + { + "epoch": 0.22332716620413198, + "grad_norm": 3.414318323135376, + "learning_rate": 9.0519717212633e-06, + "loss": 1.0308, + "step": 2897 + }, + { + "epoch": 0.22340425531914893, + "grad_norm": 3.9684841632843018, + "learning_rate": 9.051240148686815e-06, + "loss": 0.9831, + "step": 2898 + }, + { + "epoch": 0.2234813444341659, + "grad_norm": 3.743062973022461, + "learning_rate": 9.050508323536488e-06, + "loss": 1.1446, + "step": 2899 + }, + { + "epoch": 0.22355843354918287, + "grad_norm": 4.060634613037109, + "learning_rate": 9.049776245857947e-06, + "loss": 1.1005, + "step": 2900 + }, + { + "epoch": 0.2236355226641998, + "grad_norm": 3.9923877716064453, + "learning_rate": 9.049043915696831e-06, + "loss": 0.9606, + "step": 2901 + }, + { + "epoch": 0.22371261177921678, + "grad_norm": 3.815098285675049, + "learning_rate": 9.048311333098798e-06, + "loss": 1.1327, + "step": 2902 + }, + { + "epoch": 0.22378970089423372, + "grad_norm": 3.9265224933624268, + "learning_rate": 9.04757849810952e-06, + "loss": 1.0604, + "step": 2903 + }, + { + "epoch": 0.2238667900092507, + "grad_norm": 3.8014273643493652, + "learning_rate": 9.046845410774685e-06, + "loss": 0.9963, + "step": 2904 + }, + { + "epoch": 0.22394387912426766, + "grad_norm": 4.092421531677246, + "learning_rate": 9.04611207114e-06, + "loss": 1.1604, + "step": 2905 + }, + { + "epoch": 0.2240209682392846, + "grad_norm": 4.311123847961426, + "learning_rate": 9.045378479251179e-06, + "loss": 1.1204, + "step": 2906 + }, + { + "epoch": 0.22409805735430158, + "grad_norm": 3.77968430519104, + "learning_rate": 9.044644635153963e-06, + "loss": 1.0621, + "step": 2907 + }, + { + "epoch": 0.22417514646931852, + "grad_norm": 3.6823370456695557, + "learning_rate": 9.0439105388941e-06, + "loss": 0.9396, + "step": 2908 + }, + { + "epoch": 0.2242522355843355, + "grad_norm": 3.8284711837768555, + "learning_rate": 9.043176190517362e-06, + "loss": 1.1167, + "step": 2909 + }, + { + "epoch": 0.22432932469935246, + "grad_norm": 3.750458002090454, + "learning_rate": 9.042441590069526e-06, + "loss": 0.9483, + "step": 2910 + }, + { + "epoch": 0.2244064138143694, + "grad_norm": 3.8216614723205566, + "learning_rate": 9.041706737596397e-06, + "loss": 1.0972, + "step": 2911 + }, + { + "epoch": 0.22448350292938637, + "grad_norm": 4.093216896057129, + "learning_rate": 9.040971633143782e-06, + "loss": 1.2209, + "step": 2912 + }, + { + "epoch": 0.22456059204440332, + "grad_norm": 3.4413156509399414, + "learning_rate": 9.040236276757514e-06, + "loss": 0.8431, + "step": 2913 + }, + { + "epoch": 0.2246376811594203, + "grad_norm": 3.6368567943573, + "learning_rate": 9.03950066848344e-06, + "loss": 1.0675, + "step": 2914 + }, + { + "epoch": 0.22471477027443726, + "grad_norm": 3.8164429664611816, + "learning_rate": 9.038764808367422e-06, + "loss": 1.1033, + "step": 2915 + }, + { + "epoch": 0.2247918593894542, + "grad_norm": 3.761277198791504, + "learning_rate": 9.038028696455335e-06, + "loss": 1.0389, + "step": 2916 + }, + { + "epoch": 0.22486894850447117, + "grad_norm": 3.7666642665863037, + "learning_rate": 9.03729233279307e-06, + "loss": 1.0024, + "step": 2917 + }, + { + "epoch": 0.22494603761948812, + "grad_norm": 3.848273992538452, + "learning_rate": 9.03655571742654e-06, + "loss": 1.0972, + "step": 2918 + }, + { + "epoch": 0.22502312673450509, + "grad_norm": 3.224196672439575, + "learning_rate": 9.035818850401667e-06, + "loss": 0.9211, + "step": 2919 + }, + { + "epoch": 0.22510021584952206, + "grad_norm": 3.469214916229248, + "learning_rate": 9.03508173176439e-06, + "loss": 1.0197, + "step": 2920 + }, + { + "epoch": 0.225177304964539, + "grad_norm": 3.8008034229278564, + "learning_rate": 9.034344361560663e-06, + "loss": 1.0002, + "step": 2921 + }, + { + "epoch": 0.22525439407955597, + "grad_norm": 3.6283414363861084, + "learning_rate": 9.033606739836463e-06, + "loss": 1.0574, + "step": 2922 + }, + { + "epoch": 0.22533148319457294, + "grad_norm": 4.280021667480469, + "learning_rate": 9.032868866637772e-06, + "loss": 1.0326, + "step": 2923 + }, + { + "epoch": 0.22540857230958988, + "grad_norm": 3.6488280296325684, + "learning_rate": 9.032130742010594e-06, + "loss": 1.0502, + "step": 2924 + }, + { + "epoch": 0.22548566142460685, + "grad_norm": 3.933264970779419, + "learning_rate": 9.031392366000945e-06, + "loss": 1.1104, + "step": 2925 + }, + { + "epoch": 0.2255627505396238, + "grad_norm": 3.7967422008514404, + "learning_rate": 9.030653738654864e-06, + "loss": 1.0144, + "step": 2926 + }, + { + "epoch": 0.22563983965464077, + "grad_norm": 3.846534490585327, + "learning_rate": 9.029914860018399e-06, + "loss": 1.0428, + "step": 2927 + }, + { + "epoch": 0.22571692876965774, + "grad_norm": 3.627790689468384, + "learning_rate": 9.029175730137611e-06, + "loss": 0.996, + "step": 2928 + }, + { + "epoch": 0.22579401788467468, + "grad_norm": 3.8956503868103027, + "learning_rate": 9.028436349058584e-06, + "loss": 1.1075, + "step": 2929 + }, + { + "epoch": 0.22587110699969165, + "grad_norm": 4.157092571258545, + "learning_rate": 9.027696716827416e-06, + "loss": 1.0656, + "step": 2930 + }, + { + "epoch": 0.2259481961147086, + "grad_norm": 3.8785336017608643, + "learning_rate": 9.026956833490217e-06, + "loss": 0.9546, + "step": 2931 + }, + { + "epoch": 0.22602528522972556, + "grad_norm": 3.8939132690429688, + "learning_rate": 9.026216699093114e-06, + "loss": 1.0963, + "step": 2932 + }, + { + "epoch": 0.22610237434474253, + "grad_norm": 3.9576644897460938, + "learning_rate": 9.025476313682253e-06, + "loss": 1.0504, + "step": 2933 + }, + { + "epoch": 0.22617946345975948, + "grad_norm": 3.753843069076538, + "learning_rate": 9.024735677303793e-06, + "loss": 0.9969, + "step": 2934 + }, + { + "epoch": 0.22625655257477645, + "grad_norm": 3.669050693511963, + "learning_rate": 9.023994790003908e-06, + "loss": 0.9765, + "step": 2935 + }, + { + "epoch": 0.2263336416897934, + "grad_norm": 3.700514078140259, + "learning_rate": 9.023253651828789e-06, + "loss": 1.1164, + "step": 2936 + }, + { + "epoch": 0.22641073080481036, + "grad_norm": 3.998882532119751, + "learning_rate": 9.022512262824642e-06, + "loss": 1.0073, + "step": 2937 + }, + { + "epoch": 0.22648781991982733, + "grad_norm": 4.040007591247559, + "learning_rate": 9.021770623037688e-06, + "loss": 1.0537, + "step": 2938 + }, + { + "epoch": 0.22656490903484428, + "grad_norm": 4.1160173416137695, + "learning_rate": 9.021028732514166e-06, + "loss": 1.1404, + "step": 2939 + }, + { + "epoch": 0.22664199814986125, + "grad_norm": 3.754762887954712, + "learning_rate": 9.020286591300325e-06, + "loss": 1.045, + "step": 2940 + }, + { + "epoch": 0.2267190872648782, + "grad_norm": 3.404191017150879, + "learning_rate": 9.019544199442438e-06, + "loss": 0.9894, + "step": 2941 + }, + { + "epoch": 0.22679617637989516, + "grad_norm": 3.9192099571228027, + "learning_rate": 9.01880155698679e-06, + "loss": 0.9169, + "step": 2942 + }, + { + "epoch": 0.22687326549491213, + "grad_norm": 3.7557027339935303, + "learning_rate": 9.018058663979676e-06, + "loss": 1.0611, + "step": 2943 + }, + { + "epoch": 0.22695035460992907, + "grad_norm": 3.523135185241699, + "learning_rate": 9.017315520467416e-06, + "loss": 1.019, + "step": 2944 + }, + { + "epoch": 0.22702744372494604, + "grad_norm": 3.6484501361846924, + "learning_rate": 9.016572126496339e-06, + "loss": 0.9376, + "step": 2945 + }, + { + "epoch": 0.22710453283996299, + "grad_norm": 4.25970983505249, + "learning_rate": 9.015828482112793e-06, + "loss": 0.9697, + "step": 2946 + }, + { + "epoch": 0.22718162195497996, + "grad_norm": 3.3932673931121826, + "learning_rate": 9.01508458736314e-06, + "loss": 0.9625, + "step": 2947 + }, + { + "epoch": 0.22725871106999693, + "grad_norm": 3.8393819332122803, + "learning_rate": 9.014340442293755e-06, + "loss": 1.0045, + "step": 2948 + }, + { + "epoch": 0.22733580018501387, + "grad_norm": 3.575310230255127, + "learning_rate": 9.013596046951037e-06, + "loss": 1.0233, + "step": 2949 + }, + { + "epoch": 0.22741288930003084, + "grad_norm": 3.7742843627929688, + "learning_rate": 9.012851401381391e-06, + "loss": 0.992, + "step": 2950 + }, + { + "epoch": 0.22748997841504778, + "grad_norm": 4.013235092163086, + "learning_rate": 9.012106505631244e-06, + "loss": 0.931, + "step": 2951 + }, + { + "epoch": 0.22756706753006475, + "grad_norm": 3.5595037937164307, + "learning_rate": 9.011361359747034e-06, + "loss": 1.0418, + "step": 2952 + }, + { + "epoch": 0.22764415664508172, + "grad_norm": 4.25527811050415, + "learning_rate": 9.01061596377522e-06, + "loss": 1.0631, + "step": 2953 + }, + { + "epoch": 0.22772124576009867, + "grad_norm": 4.163780689239502, + "learning_rate": 9.009870317762273e-06, + "loss": 1.0558, + "step": 2954 + }, + { + "epoch": 0.22779833487511564, + "grad_norm": 3.863081216812134, + "learning_rate": 9.009124421754676e-06, + "loss": 0.8664, + "step": 2955 + }, + { + "epoch": 0.22787542399013258, + "grad_norm": 3.8147034645080566, + "learning_rate": 9.008378275798939e-06, + "loss": 1.0287, + "step": 2956 + }, + { + "epoch": 0.22795251310514955, + "grad_norm": 4.305243968963623, + "learning_rate": 9.007631879941575e-06, + "loss": 1.1372, + "step": 2957 + }, + { + "epoch": 0.22802960222016652, + "grad_norm": 3.846221685409546, + "learning_rate": 9.006885234229119e-06, + "loss": 0.9999, + "step": 2958 + }, + { + "epoch": 0.22810669133518346, + "grad_norm": 3.8825647830963135, + "learning_rate": 9.00613833870812e-06, + "loss": 1.0099, + "step": 2959 + }, + { + "epoch": 0.22818378045020044, + "grad_norm": 3.7892098426818848, + "learning_rate": 9.005391193425144e-06, + "loss": 0.9672, + "step": 2960 + }, + { + "epoch": 0.22826086956521738, + "grad_norm": 4.183690547943115, + "learning_rate": 9.004643798426772e-06, + "loss": 0.9231, + "step": 2961 + }, + { + "epoch": 0.22833795868023435, + "grad_norm": 3.8365590572357178, + "learning_rate": 9.003896153759602e-06, + "loss": 1.1283, + "step": 2962 + }, + { + "epoch": 0.22841504779525132, + "grad_norm": 3.8340327739715576, + "learning_rate": 9.00314825947024e-06, + "loss": 0.957, + "step": 2963 + }, + { + "epoch": 0.22849213691026826, + "grad_norm": 4.034872531890869, + "learning_rate": 9.002400115605319e-06, + "loss": 1.0195, + "step": 2964 + }, + { + "epoch": 0.22856922602528523, + "grad_norm": 3.9130313396453857, + "learning_rate": 9.00165172221148e-06, + "loss": 1.1405, + "step": 2965 + }, + { + "epoch": 0.2286463151403022, + "grad_norm": 3.705179214477539, + "learning_rate": 9.00090307933538e-06, + "loss": 1.0191, + "step": 2966 + }, + { + "epoch": 0.22872340425531915, + "grad_norm": 3.985041618347168, + "learning_rate": 9.000154187023696e-06, + "loss": 1.0271, + "step": 2967 + }, + { + "epoch": 0.22880049337033612, + "grad_norm": 3.9260165691375732, + "learning_rate": 8.999405045323113e-06, + "loss": 1.0308, + "step": 2968 + }, + { + "epoch": 0.22887758248535306, + "grad_norm": 3.8115878105163574, + "learning_rate": 8.99865565428034e-06, + "loss": 0.9711, + "step": 2969 + }, + { + "epoch": 0.22895467160037003, + "grad_norm": 3.6064395904541016, + "learning_rate": 8.997906013942097e-06, + "loss": 1.0217, + "step": 2970 + }, + { + "epoch": 0.229031760715387, + "grad_norm": 3.8164539337158203, + "learning_rate": 8.99715612435512e-06, + "loss": 1.0237, + "step": 2971 + }, + { + "epoch": 0.22910884983040394, + "grad_norm": 11.672921180725098, + "learning_rate": 8.99640598556616e-06, + "loss": 1.0373, + "step": 2972 + }, + { + "epoch": 0.22918593894542091, + "grad_norm": 3.9608118534088135, + "learning_rate": 8.995655597621985e-06, + "loss": 1.0344, + "step": 2973 + }, + { + "epoch": 0.22926302806043786, + "grad_norm": 3.9548451900482178, + "learning_rate": 8.994904960569378e-06, + "loss": 1.0988, + "step": 2974 + }, + { + "epoch": 0.22934011717545483, + "grad_norm": 3.6159064769744873, + "learning_rate": 8.994154074455135e-06, + "loss": 1.0542, + "step": 2975 + }, + { + "epoch": 0.2294172062904718, + "grad_norm": 3.7155611515045166, + "learning_rate": 8.993402939326072e-06, + "loss": 1.0449, + "step": 2976 + }, + { + "epoch": 0.22949429540548874, + "grad_norm": 3.6567227840423584, + "learning_rate": 8.99265155522902e-06, + "loss": 0.9836, + "step": 2977 + }, + { + "epoch": 0.2295713845205057, + "grad_norm": 3.778001546859741, + "learning_rate": 8.991899922210821e-06, + "loss": 1.0483, + "step": 2978 + }, + { + "epoch": 0.22964847363552265, + "grad_norm": 4.267853260040283, + "learning_rate": 8.991148040318335e-06, + "loss": 0.9866, + "step": 2979 + }, + { + "epoch": 0.22972556275053962, + "grad_norm": 3.8425397872924805, + "learning_rate": 8.990395909598442e-06, + "loss": 1.0059, + "step": 2980 + }, + { + "epoch": 0.2298026518655566, + "grad_norm": 4.225255489349365, + "learning_rate": 8.989643530098028e-06, + "loss": 1.0807, + "step": 2981 + }, + { + "epoch": 0.22987974098057354, + "grad_norm": 3.793910026550293, + "learning_rate": 8.988890901864006e-06, + "loss": 1.0763, + "step": 2982 + }, + { + "epoch": 0.2299568300955905, + "grad_norm": 4.9717020988464355, + "learning_rate": 8.988138024943294e-06, + "loss": 1.0412, + "step": 2983 + }, + { + "epoch": 0.23003391921060745, + "grad_norm": 3.8835487365722656, + "learning_rate": 8.98738489938283e-06, + "loss": 1.0469, + "step": 2984 + }, + { + "epoch": 0.23011100832562442, + "grad_norm": 3.7455644607543945, + "learning_rate": 8.98663152522957e-06, + "loss": 0.8876, + "step": 2985 + }, + { + "epoch": 0.2301880974406414, + "grad_norm": 4.213279724121094, + "learning_rate": 8.985877902530482e-06, + "loss": 1.0481, + "step": 2986 + }, + { + "epoch": 0.23026518655565834, + "grad_norm": 3.730454921722412, + "learning_rate": 8.985124031332549e-06, + "loss": 1.1566, + "step": 2987 + }, + { + "epoch": 0.2303422756706753, + "grad_norm": 3.9859375953674316, + "learning_rate": 8.984369911682773e-06, + "loss": 1.0923, + "step": 2988 + }, + { + "epoch": 0.23041936478569225, + "grad_norm": 3.603957176208496, + "learning_rate": 8.983615543628167e-06, + "loss": 1.0712, + "step": 2989 + }, + { + "epoch": 0.23049645390070922, + "grad_norm": 4.084186553955078, + "learning_rate": 8.982860927215765e-06, + "loss": 1.0741, + "step": 2990 + }, + { + "epoch": 0.2305735430157262, + "grad_norm": 3.7829389572143555, + "learning_rate": 8.982106062492613e-06, + "loss": 1.1044, + "step": 2991 + }, + { + "epoch": 0.23065063213074313, + "grad_norm": 4.031806468963623, + "learning_rate": 8.98135094950577e-06, + "loss": 1.0782, + "step": 2992 + }, + { + "epoch": 0.2307277212457601, + "grad_norm": 4.211963653564453, + "learning_rate": 8.980595588302315e-06, + "loss": 1.0644, + "step": 2993 + }, + { + "epoch": 0.23080481036077705, + "grad_norm": 3.739516258239746, + "learning_rate": 8.979839978929342e-06, + "loss": 1.0677, + "step": 2994 + }, + { + "epoch": 0.23088189947579402, + "grad_norm": 3.528867483139038, + "learning_rate": 8.979084121433958e-06, + "loss": 0.9519, + "step": 2995 + }, + { + "epoch": 0.230958988590811, + "grad_norm": 3.507763147354126, + "learning_rate": 8.978328015863287e-06, + "loss": 0.9619, + "step": 2996 + }, + { + "epoch": 0.23103607770582793, + "grad_norm": 4.500532150268555, + "learning_rate": 8.977571662264471e-06, + "loss": 1.2321, + "step": 2997 + }, + { + "epoch": 0.2311131668208449, + "grad_norm": 3.3748843669891357, + "learning_rate": 8.976815060684659e-06, + "loss": 0.999, + "step": 2998 + }, + { + "epoch": 0.23119025593586184, + "grad_norm": 3.790412425994873, + "learning_rate": 8.976058211171027e-06, + "loss": 1.1291, + "step": 2999 + }, + { + "epoch": 0.23126734505087881, + "grad_norm": 3.6807661056518555, + "learning_rate": 8.975301113770756e-06, + "loss": 0.9742, + "step": 3000 + }, + { + "epoch": 0.23134443416589578, + "grad_norm": 3.902876138687134, + "learning_rate": 8.97454376853105e-06, + "loss": 1.1042, + "step": 3001 + }, + { + "epoch": 0.23142152328091273, + "grad_norm": 3.766279935836792, + "learning_rate": 8.973786175499123e-06, + "loss": 1.112, + "step": 3002 + }, + { + "epoch": 0.2314986123959297, + "grad_norm": 3.676820993423462, + "learning_rate": 8.973028334722212e-06, + "loss": 0.9986, + "step": 3003 + }, + { + "epoch": 0.23157570151094664, + "grad_norm": 3.577911138534546, + "learning_rate": 8.972270246247558e-06, + "loss": 0.9868, + "step": 3004 + }, + { + "epoch": 0.2316527906259636, + "grad_norm": 3.7404353618621826, + "learning_rate": 8.97151191012243e-06, + "loss": 1.1051, + "step": 3005 + }, + { + "epoch": 0.23172987974098058, + "grad_norm": 3.7032761573791504, + "learning_rate": 8.9707533263941e-06, + "loss": 1.0054, + "step": 3006 + }, + { + "epoch": 0.23180696885599752, + "grad_norm": 3.8245747089385986, + "learning_rate": 8.969994495109868e-06, + "loss": 1.0588, + "step": 3007 + }, + { + "epoch": 0.2318840579710145, + "grad_norm": 3.9548802375793457, + "learning_rate": 8.96923541631704e-06, + "loss": 1.1028, + "step": 3008 + }, + { + "epoch": 0.23196114708603147, + "grad_norm": 3.6920297145843506, + "learning_rate": 8.968476090062941e-06, + "loss": 0.9935, + "step": 3009 + }, + { + "epoch": 0.2320382362010484, + "grad_norm": 3.5321335792541504, + "learning_rate": 8.967716516394911e-06, + "loss": 0.9922, + "step": 3010 + }, + { + "epoch": 0.23211532531606538, + "grad_norm": 3.3093225955963135, + "learning_rate": 8.966956695360304e-06, + "loss": 0.9767, + "step": 3011 + }, + { + "epoch": 0.23219241443108232, + "grad_norm": 3.7494637966156006, + "learning_rate": 8.966196627006493e-06, + "loss": 1.068, + "step": 3012 + }, + { + "epoch": 0.2322695035460993, + "grad_norm": 3.80411696434021, + "learning_rate": 8.965436311380866e-06, + "loss": 1.0047, + "step": 3013 + }, + { + "epoch": 0.23234659266111626, + "grad_norm": 3.954469919204712, + "learning_rate": 8.96467574853082e-06, + "loss": 1.0272, + "step": 3014 + }, + { + "epoch": 0.2324236817761332, + "grad_norm": 4.207308292388916, + "learning_rate": 8.963914938503777e-06, + "loss": 1.0145, + "step": 3015 + }, + { + "epoch": 0.23250077089115018, + "grad_norm": 3.938061237335205, + "learning_rate": 8.963153881347164e-06, + "loss": 0.9881, + "step": 3016 + }, + { + "epoch": 0.23257786000616712, + "grad_norm": 3.8759334087371826, + "learning_rate": 8.962392577108433e-06, + "loss": 1.0464, + "step": 3017 + }, + { + "epoch": 0.2326549491211841, + "grad_norm": 4.220122814178467, + "learning_rate": 8.96163102583505e-06, + "loss": 0.9949, + "step": 3018 + }, + { + "epoch": 0.23273203823620106, + "grad_norm": 3.3575594425201416, + "learning_rate": 8.960869227574486e-06, + "loss": 1.0295, + "step": 3019 + }, + { + "epoch": 0.232809127351218, + "grad_norm": 3.665529489517212, + "learning_rate": 8.960107182374242e-06, + "loss": 1.1279, + "step": 3020 + }, + { + "epoch": 0.23288621646623497, + "grad_norm": 4.267427921295166, + "learning_rate": 8.959344890281826e-06, + "loss": 1.0118, + "step": 3021 + }, + { + "epoch": 0.23296330558125192, + "grad_norm": 4.257380962371826, + "learning_rate": 8.958582351344759e-06, + "loss": 1.0997, + "step": 3022 + }, + { + "epoch": 0.2330403946962689, + "grad_norm": 4.008502006530762, + "learning_rate": 8.957819565610585e-06, + "loss": 1.0658, + "step": 3023 + }, + { + "epoch": 0.23311748381128586, + "grad_norm": 3.9444122314453125, + "learning_rate": 8.957056533126861e-06, + "loss": 1.0731, + "step": 3024 + }, + { + "epoch": 0.2331945729263028, + "grad_norm": 4.073714256286621, + "learning_rate": 8.956293253941155e-06, + "loss": 1.0287, + "step": 3025 + }, + { + "epoch": 0.23327166204131977, + "grad_norm": 4.01447057723999, + "learning_rate": 8.955529728101055e-06, + "loss": 1.1299, + "step": 3026 + }, + { + "epoch": 0.23334875115633671, + "grad_norm": 4.0629706382751465, + "learning_rate": 8.954765955654164e-06, + "loss": 1.0278, + "step": 3027 + }, + { + "epoch": 0.23342584027135369, + "grad_norm": 3.837249279022217, + "learning_rate": 8.954001936648095e-06, + "loss": 1.0456, + "step": 3028 + }, + { + "epoch": 0.23350292938637066, + "grad_norm": 3.798231363296509, + "learning_rate": 8.953237671130486e-06, + "loss": 0.9937, + "step": 3029 + }, + { + "epoch": 0.2335800185013876, + "grad_norm": 3.616384267807007, + "learning_rate": 8.952473159148982e-06, + "loss": 0.9586, + "step": 3030 + }, + { + "epoch": 0.23365710761640457, + "grad_norm": 3.4887986183166504, + "learning_rate": 8.951708400751246e-06, + "loss": 0.997, + "step": 3031 + }, + { + "epoch": 0.2337341967314215, + "grad_norm": 3.35778546333313, + "learning_rate": 8.950943395984959e-06, + "loss": 0.922, + "step": 3032 + }, + { + "epoch": 0.23381128584643848, + "grad_norm": 4.111606121063232, + "learning_rate": 8.950178144897814e-06, + "loss": 1.1236, + "step": 3033 + }, + { + "epoch": 0.23388837496145545, + "grad_norm": 3.4538538455963135, + "learning_rate": 8.949412647537519e-06, + "loss": 0.8232, + "step": 3034 + }, + { + "epoch": 0.2339654640764724, + "grad_norm": 3.7339892387390137, + "learning_rate": 8.948646903951801e-06, + "loss": 0.9423, + "step": 3035 + }, + { + "epoch": 0.23404255319148937, + "grad_norm": 4.029750347137451, + "learning_rate": 8.947880914188397e-06, + "loss": 1.1581, + "step": 3036 + }, + { + "epoch": 0.2341196423065063, + "grad_norm": 3.9381988048553467, + "learning_rate": 8.947114678295066e-06, + "loss": 1.0831, + "step": 3037 + }, + { + "epoch": 0.23419673142152328, + "grad_norm": 3.86995530128479, + "learning_rate": 8.946348196319578e-06, + "loss": 1.0168, + "step": 3038 + }, + { + "epoch": 0.23427382053654025, + "grad_norm": 3.5996012687683105, + "learning_rate": 8.945581468309718e-06, + "loss": 0.9506, + "step": 3039 + }, + { + "epoch": 0.2343509096515572, + "grad_norm": 4.0110368728637695, + "learning_rate": 8.94481449431329e-06, + "loss": 1.0381, + "step": 3040 + }, + { + "epoch": 0.23442799876657416, + "grad_norm": 4.450444221496582, + "learning_rate": 8.944047274378105e-06, + "loss": 1.0377, + "step": 3041 + }, + { + "epoch": 0.2345050878815911, + "grad_norm": 4.031339645385742, + "learning_rate": 8.943279808552e-06, + "loss": 1.059, + "step": 3042 + }, + { + "epoch": 0.23458217699660808, + "grad_norm": 3.6864678859710693, + "learning_rate": 8.942512096882825e-06, + "loss": 1.034, + "step": 3043 + }, + { + "epoch": 0.23465926611162505, + "grad_norm": 4.179306507110596, + "learning_rate": 8.941744139418438e-06, + "loss": 1.007, + "step": 3044 + }, + { + "epoch": 0.234736355226642, + "grad_norm": 3.602816104888916, + "learning_rate": 8.940975936206717e-06, + "loss": 0.9409, + "step": 3045 + }, + { + "epoch": 0.23481344434165896, + "grad_norm": 3.6022086143493652, + "learning_rate": 8.94020748729556e-06, + "loss": 1.0962, + "step": 3046 + }, + { + "epoch": 0.2348905334566759, + "grad_norm": 3.6227774620056152, + "learning_rate": 8.93943879273287e-06, + "loss": 1.0189, + "step": 3047 + }, + { + "epoch": 0.23496762257169287, + "grad_norm": 3.8395297527313232, + "learning_rate": 8.938669852566578e-06, + "loss": 0.9859, + "step": 3048 + }, + { + "epoch": 0.23504471168670985, + "grad_norm": 3.932426929473877, + "learning_rate": 8.937900666844616e-06, + "loss": 1.0921, + "step": 3049 + }, + { + "epoch": 0.2351218008017268, + "grad_norm": 3.409926414489746, + "learning_rate": 8.937131235614945e-06, + "loss": 0.9903, + "step": 3050 + }, + { + "epoch": 0.23519888991674376, + "grad_norm": 3.986931800842285, + "learning_rate": 8.93636155892553e-06, + "loss": 1.0701, + "step": 3051 + }, + { + "epoch": 0.23527597903176073, + "grad_norm": 3.698172092437744, + "learning_rate": 8.935591636824359e-06, + "loss": 1.0048, + "step": 3052 + }, + { + "epoch": 0.23535306814677767, + "grad_norm": 3.6447818279266357, + "learning_rate": 8.934821469359434e-06, + "loss": 1.0963, + "step": 3053 + }, + { + "epoch": 0.23543015726179464, + "grad_norm": 3.609434127807617, + "learning_rate": 8.934051056578768e-06, + "loss": 0.9999, + "step": 3054 + }, + { + "epoch": 0.23550724637681159, + "grad_norm": 3.5842690467834473, + "learning_rate": 8.933280398530394e-06, + "loss": 1.1416, + "step": 3055 + }, + { + "epoch": 0.23558433549182856, + "grad_norm": 3.902055501937866, + "learning_rate": 8.932509495262357e-06, + "loss": 0.9909, + "step": 3056 + }, + { + "epoch": 0.23566142460684553, + "grad_norm": 4.051197528839111, + "learning_rate": 8.931738346822723e-06, + "loss": 1.0003, + "step": 3057 + }, + { + "epoch": 0.23573851372186247, + "grad_norm": 3.764064311981201, + "learning_rate": 8.930966953259563e-06, + "loss": 1.0474, + "step": 3058 + }, + { + "epoch": 0.23581560283687944, + "grad_norm": 3.6911094188690186, + "learning_rate": 8.930195314620975e-06, + "loss": 1.0557, + "step": 3059 + }, + { + "epoch": 0.23589269195189638, + "grad_norm": 3.983623743057251, + "learning_rate": 8.929423430955062e-06, + "loss": 1.0087, + "step": 3060 + }, + { + "epoch": 0.23596978106691335, + "grad_norm": 3.7596616744995117, + "learning_rate": 8.92865130230995e-06, + "loss": 1.0966, + "step": 3061 + }, + { + "epoch": 0.23604687018193032, + "grad_norm": 3.731916904449463, + "learning_rate": 8.927878928733777e-06, + "loss": 1.0216, + "step": 3062 + }, + { + "epoch": 0.23612395929694727, + "grad_norm": 3.540071487426758, + "learning_rate": 8.927106310274695e-06, + "loss": 0.9634, + "step": 3063 + }, + { + "epoch": 0.23620104841196424, + "grad_norm": 3.379556655883789, + "learning_rate": 8.926333446980873e-06, + "loss": 0.9989, + "step": 3064 + }, + { + "epoch": 0.23627813752698118, + "grad_norm": 3.504272937774658, + "learning_rate": 8.925560338900496e-06, + "loss": 0.9902, + "step": 3065 + }, + { + "epoch": 0.23635522664199815, + "grad_norm": 3.81402325630188, + "learning_rate": 8.924786986081764e-06, + "loss": 1.0649, + "step": 3066 + }, + { + "epoch": 0.23643231575701512, + "grad_norm": 3.7274930477142334, + "learning_rate": 8.92401338857289e-06, + "loss": 1.0921, + "step": 3067 + }, + { + "epoch": 0.23650940487203206, + "grad_norm": 3.597395658493042, + "learning_rate": 8.923239546422104e-06, + "loss": 0.9825, + "step": 3068 + }, + { + "epoch": 0.23658649398704903, + "grad_norm": 3.9207396507263184, + "learning_rate": 8.922465459677649e-06, + "loss": 0.974, + "step": 3069 + }, + { + "epoch": 0.23666358310206598, + "grad_norm": 3.931241035461426, + "learning_rate": 8.92169112838779e-06, + "loss": 1.0103, + "step": 3070 + }, + { + "epoch": 0.23674067221708295, + "grad_norm": 4.226929187774658, + "learning_rate": 8.9209165526008e-06, + "loss": 1.0043, + "step": 3071 + }, + { + "epoch": 0.23681776133209992, + "grad_norm": 3.8953144550323486, + "learning_rate": 8.920141732364971e-06, + "loss": 1.0027, + "step": 3072 + }, + { + "epoch": 0.23689485044711686, + "grad_norm": 3.6181445121765137, + "learning_rate": 8.919366667728607e-06, + "loss": 1.0596, + "step": 3073 + }, + { + "epoch": 0.23697193956213383, + "grad_norm": 4.111177921295166, + "learning_rate": 8.918591358740027e-06, + "loss": 1.0282, + "step": 3074 + }, + { + "epoch": 0.23704902867715077, + "grad_norm": 3.385059118270874, + "learning_rate": 8.917815805447574e-06, + "loss": 0.9483, + "step": 3075 + }, + { + "epoch": 0.23712611779216775, + "grad_norm": 3.6011502742767334, + "learning_rate": 8.917040007899596e-06, + "loss": 1.0622, + "step": 3076 + }, + { + "epoch": 0.23720320690718472, + "grad_norm": 3.875084161758423, + "learning_rate": 8.91626396614446e-06, + "loss": 0.9209, + "step": 3077 + }, + { + "epoch": 0.23728029602220166, + "grad_norm": 3.934360980987549, + "learning_rate": 8.91548768023055e-06, + "loss": 1.0676, + "step": 3078 + }, + { + "epoch": 0.23735738513721863, + "grad_norm": 3.5961403846740723, + "learning_rate": 8.91471115020626e-06, + "loss": 0.9208, + "step": 3079 + }, + { + "epoch": 0.23743447425223557, + "grad_norm": 3.771339178085327, + "learning_rate": 8.913934376120006e-06, + "loss": 1.0828, + "step": 3080 + }, + { + "epoch": 0.23751156336725254, + "grad_norm": 3.8882675170898438, + "learning_rate": 8.913157358020212e-06, + "loss": 1.0836, + "step": 3081 + }, + { + "epoch": 0.2375886524822695, + "grad_norm": 4.413726329803467, + "learning_rate": 8.912380095955326e-06, + "loss": 1.0002, + "step": 3082 + }, + { + "epoch": 0.23766574159728646, + "grad_norm": 4.3546953201293945, + "learning_rate": 8.911602589973803e-06, + "loss": 1.0706, + "step": 3083 + }, + { + "epoch": 0.23774283071230343, + "grad_norm": 4.231818199157715, + "learning_rate": 8.910824840124116e-06, + "loss": 1.104, + "step": 3084 + }, + { + "epoch": 0.23781991982732037, + "grad_norm": 3.49495267868042, + "learning_rate": 8.910046846454756e-06, + "loss": 0.9728, + "step": 3085 + }, + { + "epoch": 0.23789700894233734, + "grad_norm": 3.724919080734253, + "learning_rate": 8.909268609014228e-06, + "loss": 1.0903, + "step": 3086 + }, + { + "epoch": 0.2379740980573543, + "grad_norm": 4.132431983947754, + "learning_rate": 8.908490127851047e-06, + "loss": 1.0781, + "step": 3087 + }, + { + "epoch": 0.23805118717237125, + "grad_norm": 3.652280330657959, + "learning_rate": 8.907711403013748e-06, + "loss": 1.0157, + "step": 3088 + }, + { + "epoch": 0.23812827628738822, + "grad_norm": 3.8804502487182617, + "learning_rate": 8.90693243455088e-06, + "loss": 1.0803, + "step": 3089 + }, + { + "epoch": 0.23820536540240517, + "grad_norm": 3.9575884342193604, + "learning_rate": 8.906153222511014e-06, + "loss": 1.0079, + "step": 3090 + }, + { + "epoch": 0.23828245451742214, + "grad_norm": 3.4292240142822266, + "learning_rate": 8.905373766942722e-06, + "loss": 1.0367, + "step": 3091 + }, + { + "epoch": 0.2383595436324391, + "grad_norm": 3.641467332839966, + "learning_rate": 8.904594067894603e-06, + "loss": 1.0492, + "step": 3092 + }, + { + "epoch": 0.23843663274745605, + "grad_norm": 3.656708240509033, + "learning_rate": 8.903814125415267e-06, + "loss": 1.0476, + "step": 3093 + }, + { + "epoch": 0.23851372186247302, + "grad_norm": 3.6211674213409424, + "learning_rate": 8.903033939553336e-06, + "loss": 1.0771, + "step": 3094 + }, + { + "epoch": 0.23859081097749, + "grad_norm": 3.7082033157348633, + "learning_rate": 8.902253510357455e-06, + "loss": 1.0533, + "step": 3095 + }, + { + "epoch": 0.23866790009250693, + "grad_norm": 3.7945261001586914, + "learning_rate": 8.90147283787628e-06, + "loss": 0.9977, + "step": 3096 + }, + { + "epoch": 0.2387449892075239, + "grad_norm": 3.3629956245422363, + "learning_rate": 8.900691922158476e-06, + "loss": 0.8274, + "step": 3097 + }, + { + "epoch": 0.23882207832254085, + "grad_norm": 3.971607208251953, + "learning_rate": 8.899910763252735e-06, + "loss": 1.1203, + "step": 3098 + }, + { + "epoch": 0.23889916743755782, + "grad_norm": 3.6717000007629395, + "learning_rate": 8.899129361207754e-06, + "loss": 0.9675, + "step": 3099 + }, + { + "epoch": 0.2389762565525748, + "grad_norm": 3.761371374130249, + "learning_rate": 8.898347716072254e-06, + "loss": 0.9512, + "step": 3100 + }, + { + "epoch": 0.23905334566759173, + "grad_norm": 4.109459400177002, + "learning_rate": 8.897565827894963e-06, + "loss": 1.1068, + "step": 3101 + }, + { + "epoch": 0.2391304347826087, + "grad_norm": 3.7636146545410156, + "learning_rate": 8.89678369672463e-06, + "loss": 0.9598, + "step": 3102 + }, + { + "epoch": 0.23920752389762565, + "grad_norm": 3.6051504611968994, + "learning_rate": 8.896001322610013e-06, + "loss": 1.0457, + "step": 3103 + }, + { + "epoch": 0.23928461301264262, + "grad_norm": 3.4127960205078125, + "learning_rate": 8.895218705599894e-06, + "loss": 0.8803, + "step": 3104 + }, + { + "epoch": 0.2393617021276596, + "grad_norm": 3.451889991760254, + "learning_rate": 8.89443584574306e-06, + "loss": 0.8208, + "step": 3105 + }, + { + "epoch": 0.23943879124267653, + "grad_norm": 3.950897455215454, + "learning_rate": 8.893652743088321e-06, + "loss": 1.0732, + "step": 3106 + }, + { + "epoch": 0.2395158803576935, + "grad_norm": 3.8806371688842773, + "learning_rate": 8.8928693976845e-06, + "loss": 1.0772, + "step": 3107 + }, + { + "epoch": 0.23959296947271044, + "grad_norm": 3.3952512741088867, + "learning_rate": 8.892085809580435e-06, + "loss": 0.998, + "step": 3108 + }, + { + "epoch": 0.2396700585877274, + "grad_norm": 4.141806602478027, + "learning_rate": 8.891301978824975e-06, + "loss": 1.0228, + "step": 3109 + }, + { + "epoch": 0.23974714770274438, + "grad_norm": 4.028316974639893, + "learning_rate": 8.890517905466991e-06, + "loss": 0.9701, + "step": 3110 + }, + { + "epoch": 0.23982423681776133, + "grad_norm": 3.646756172180176, + "learning_rate": 8.889733589555368e-06, + "loss": 1.0763, + "step": 3111 + }, + { + "epoch": 0.2399013259327783, + "grad_norm": 3.909493923187256, + "learning_rate": 8.888949031138997e-06, + "loss": 1.0528, + "step": 3112 + }, + { + "epoch": 0.23997841504779524, + "grad_norm": 3.6650424003601074, + "learning_rate": 8.888164230266798e-06, + "loss": 1.015, + "step": 3113 + }, + { + "epoch": 0.2400555041628122, + "grad_norm": 3.6798250675201416, + "learning_rate": 8.887379186987695e-06, + "loss": 0.9954, + "step": 3114 + }, + { + "epoch": 0.24013259327782918, + "grad_norm": 4.122318744659424, + "learning_rate": 8.886593901350632e-06, + "loss": 1.1222, + "step": 3115 + }, + { + "epoch": 0.24020968239284612, + "grad_norm": 3.966783046722412, + "learning_rate": 8.885808373404572e-06, + "loss": 0.9446, + "step": 3116 + }, + { + "epoch": 0.2402867715078631, + "grad_norm": 3.8579766750335693, + "learning_rate": 8.885022603198481e-06, + "loss": 1.0659, + "step": 3117 + }, + { + "epoch": 0.24036386062288004, + "grad_norm": 3.9546401500701904, + "learning_rate": 8.884236590781354e-06, + "loss": 1.054, + "step": 3118 + }, + { + "epoch": 0.240440949737897, + "grad_norm": 3.887936592102051, + "learning_rate": 8.883450336202194e-06, + "loss": 1.0296, + "step": 3119 + }, + { + "epoch": 0.24051803885291398, + "grad_norm": 3.620002031326294, + "learning_rate": 8.882663839510017e-06, + "loss": 0.8848, + "step": 3120 + }, + { + "epoch": 0.24059512796793092, + "grad_norm": 4.082600116729736, + "learning_rate": 8.881877100753857e-06, + "loss": 1.0739, + "step": 3121 + }, + { + "epoch": 0.2406722170829479, + "grad_norm": 4.3715500831604, + "learning_rate": 8.881090119982766e-06, + "loss": 1.1382, + "step": 3122 + }, + { + "epoch": 0.24074930619796484, + "grad_norm": 3.6317408084869385, + "learning_rate": 8.880302897245805e-06, + "loss": 1.0448, + "step": 3123 + }, + { + "epoch": 0.2408263953129818, + "grad_norm": 3.779918909072876, + "learning_rate": 8.879515432592057e-06, + "loss": 1.0362, + "step": 3124 + }, + { + "epoch": 0.24090348442799878, + "grad_norm": 3.709547519683838, + "learning_rate": 8.878727726070614e-06, + "loss": 0.8987, + "step": 3125 + }, + { + "epoch": 0.24098057354301572, + "grad_norm": 3.8204429149627686, + "learning_rate": 8.877939777730585e-06, + "loss": 1.0463, + "step": 3126 + }, + { + "epoch": 0.2410576626580327, + "grad_norm": 3.505138874053955, + "learning_rate": 8.877151587621096e-06, + "loss": 1.0174, + "step": 3127 + }, + { + "epoch": 0.24113475177304963, + "grad_norm": 3.879589557647705, + "learning_rate": 8.876363155791286e-06, + "loss": 1.0325, + "step": 3128 + }, + { + "epoch": 0.2412118408880666, + "grad_norm": 3.7704927921295166, + "learning_rate": 8.875574482290308e-06, + "loss": 1.0305, + "step": 3129 + }, + { + "epoch": 0.24128893000308357, + "grad_norm": 3.636652946472168, + "learning_rate": 8.874785567167335e-06, + "loss": 0.9645, + "step": 3130 + }, + { + "epoch": 0.24136601911810052, + "grad_norm": 3.62375545501709, + "learning_rate": 8.873996410471549e-06, + "loss": 1.071, + "step": 3131 + }, + { + "epoch": 0.2414431082331175, + "grad_norm": 3.3911542892456055, + "learning_rate": 8.87320701225215e-06, + "loss": 1.0456, + "step": 3132 + }, + { + "epoch": 0.24152019734813443, + "grad_norm": 3.846412181854248, + "learning_rate": 8.872417372558353e-06, + "loss": 0.9716, + "step": 3133 + }, + { + "epoch": 0.2415972864631514, + "grad_norm": 3.8464903831481934, + "learning_rate": 8.871627491439389e-06, + "loss": 0.9607, + "step": 3134 + }, + { + "epoch": 0.24167437557816837, + "grad_norm": 3.595994234085083, + "learning_rate": 8.870837368944505e-06, + "loss": 1.0328, + "step": 3135 + }, + { + "epoch": 0.24175146469318531, + "grad_norm": 3.919416904449463, + "learning_rate": 8.870047005122957e-06, + "loss": 1.1371, + "step": 3136 + }, + { + "epoch": 0.24182855380820228, + "grad_norm": 4.039884567260742, + "learning_rate": 8.869256400024023e-06, + "loss": 1.0813, + "step": 3137 + }, + { + "epoch": 0.24190564292321926, + "grad_norm": 3.456634044647217, + "learning_rate": 8.86846555369699e-06, + "loss": 1.0678, + "step": 3138 + }, + { + "epoch": 0.2419827320382362, + "grad_norm": 3.9659950733184814, + "learning_rate": 8.867674466191164e-06, + "loss": 1.0229, + "step": 3139 + }, + { + "epoch": 0.24205982115325317, + "grad_norm": 3.6779067516326904, + "learning_rate": 8.866883137555869e-06, + "loss": 0.9999, + "step": 3140 + }, + { + "epoch": 0.2421369102682701, + "grad_norm": 3.332307815551758, + "learning_rate": 8.866091567840436e-06, + "loss": 1.0288, + "step": 3141 + }, + { + "epoch": 0.24221399938328708, + "grad_norm": 3.756145477294922, + "learning_rate": 8.865299757094217e-06, + "loss": 1.0806, + "step": 3142 + }, + { + "epoch": 0.24229108849830405, + "grad_norm": 3.585340976715088, + "learning_rate": 8.864507705366577e-06, + "loss": 0.9627, + "step": 3143 + }, + { + "epoch": 0.242368177613321, + "grad_norm": 3.905465841293335, + "learning_rate": 8.863715412706897e-06, + "loss": 1.0693, + "step": 3144 + }, + { + "epoch": 0.24244526672833797, + "grad_norm": 4.029237270355225, + "learning_rate": 8.862922879164571e-06, + "loss": 1.0433, + "step": 3145 + }, + { + "epoch": 0.2425223558433549, + "grad_norm": 3.680603265762329, + "learning_rate": 8.86213010478901e-06, + "loss": 0.9543, + "step": 3146 + }, + { + "epoch": 0.24259944495837188, + "grad_norm": 3.9981207847595215, + "learning_rate": 8.86133708962964e-06, + "loss": 0.9039, + "step": 3147 + }, + { + "epoch": 0.24267653407338885, + "grad_norm": 4.111728191375732, + "learning_rate": 8.860543833735902e-06, + "loss": 0.9712, + "step": 3148 + }, + { + "epoch": 0.2427536231884058, + "grad_norm": 4.376039028167725, + "learning_rate": 8.859750337157247e-06, + "loss": 0.9934, + "step": 3149 + }, + { + "epoch": 0.24283071230342276, + "grad_norm": 3.6740853786468506, + "learning_rate": 8.858956599943151e-06, + "loss": 1.117, + "step": 3150 + }, + { + "epoch": 0.2429078014184397, + "grad_norm": 3.9488065242767334, + "learning_rate": 8.858162622143096e-06, + "loss": 0.9975, + "step": 3151 + }, + { + "epoch": 0.24298489053345668, + "grad_norm": 3.6119964122772217, + "learning_rate": 8.857368403806586e-06, + "loss": 1.0049, + "step": 3152 + }, + { + "epoch": 0.24306197964847365, + "grad_norm": 3.641690731048584, + "learning_rate": 8.856573944983131e-06, + "loss": 1.03, + "step": 3153 + }, + { + "epoch": 0.2431390687634906, + "grad_norm": 4.013689041137695, + "learning_rate": 8.855779245722264e-06, + "loss": 1.0664, + "step": 3154 + }, + { + "epoch": 0.24321615787850756, + "grad_norm": 3.818681478500366, + "learning_rate": 8.854984306073533e-06, + "loss": 0.9579, + "step": 3155 + }, + { + "epoch": 0.2432932469935245, + "grad_norm": 4.519876003265381, + "learning_rate": 8.854189126086494e-06, + "loss": 1.01, + "step": 3156 + }, + { + "epoch": 0.24337033610854147, + "grad_norm": 4.186661243438721, + "learning_rate": 8.853393705810725e-06, + "loss": 0.9834, + "step": 3157 + }, + { + "epoch": 0.24344742522355844, + "grad_norm": 3.6581454277038574, + "learning_rate": 8.852598045295815e-06, + "loss": 1.0094, + "step": 3158 + }, + { + "epoch": 0.2435245143385754, + "grad_norm": 3.646156072616577, + "learning_rate": 8.851802144591371e-06, + "loss": 1.016, + "step": 3159 + }, + { + "epoch": 0.24360160345359236, + "grad_norm": 3.637533187866211, + "learning_rate": 8.851006003747013e-06, + "loss": 1.0252, + "step": 3160 + }, + { + "epoch": 0.2436786925686093, + "grad_norm": 3.3992221355438232, + "learning_rate": 8.850209622812374e-06, + "loss": 0.9184, + "step": 3161 + }, + { + "epoch": 0.24375578168362627, + "grad_norm": 3.7890117168426514, + "learning_rate": 8.849413001837105e-06, + "loss": 1.0386, + "step": 3162 + }, + { + "epoch": 0.24383287079864324, + "grad_norm": 3.5062248706817627, + "learning_rate": 8.848616140870875e-06, + "loss": 1.0384, + "step": 3163 + }, + { + "epoch": 0.24390995991366018, + "grad_norm": 3.830472469329834, + "learning_rate": 8.847819039963359e-06, + "loss": 1.0629, + "step": 3164 + }, + { + "epoch": 0.24398704902867716, + "grad_norm": 3.8684401512145996, + "learning_rate": 8.847021699164255e-06, + "loss": 1.0184, + "step": 3165 + }, + { + "epoch": 0.2440641381436941, + "grad_norm": 3.6413917541503906, + "learning_rate": 8.846224118523271e-06, + "loss": 1.0626, + "step": 3166 + }, + { + "epoch": 0.24414122725871107, + "grad_norm": 3.6060991287231445, + "learning_rate": 8.845426298090133e-06, + "loss": 1.1064, + "step": 3167 + }, + { + "epoch": 0.24421831637372804, + "grad_norm": 3.7429842948913574, + "learning_rate": 8.844628237914584e-06, + "loss": 0.9361, + "step": 3168 + }, + { + "epoch": 0.24429540548874498, + "grad_norm": 3.62557053565979, + "learning_rate": 8.843829938046376e-06, + "loss": 0.9631, + "step": 3169 + }, + { + "epoch": 0.24437249460376195, + "grad_norm": 3.561077833175659, + "learning_rate": 8.843031398535276e-06, + "loss": 1.0517, + "step": 3170 + }, + { + "epoch": 0.2444495837187789, + "grad_norm": 3.694340467453003, + "learning_rate": 8.842232619431075e-06, + "loss": 0.9603, + "step": 3171 + }, + { + "epoch": 0.24452667283379587, + "grad_norm": 3.8666601181030273, + "learning_rate": 8.841433600783568e-06, + "loss": 1.0499, + "step": 3172 + }, + { + "epoch": 0.24460376194881284, + "grad_norm": 3.9767086505889893, + "learning_rate": 8.840634342642572e-06, + "loss": 1.0151, + "step": 3173 + }, + { + "epoch": 0.24468085106382978, + "grad_norm": 3.6622021198272705, + "learning_rate": 8.839834845057915e-06, + "loss": 0.9359, + "step": 3174 + }, + { + "epoch": 0.24475794017884675, + "grad_norm": 4.173803329467773, + "learning_rate": 8.839035108079443e-06, + "loss": 1.0276, + "step": 3175 + }, + { + "epoch": 0.2448350292938637, + "grad_norm": 3.558023691177368, + "learning_rate": 8.838235131757014e-06, + "loss": 0.9519, + "step": 3176 + }, + { + "epoch": 0.24491211840888066, + "grad_norm": 3.633117914199829, + "learning_rate": 8.837434916140504e-06, + "loss": 0.9188, + "step": 3177 + }, + { + "epoch": 0.24498920752389763, + "grad_norm": 3.513375997543335, + "learning_rate": 8.8366344612798e-06, + "loss": 1.0165, + "step": 3178 + }, + { + "epoch": 0.24506629663891458, + "grad_norm": 3.9629836082458496, + "learning_rate": 8.835833767224811e-06, + "loss": 1.0958, + "step": 3179 + }, + { + "epoch": 0.24514338575393155, + "grad_norm": 3.611243724822998, + "learning_rate": 8.83503283402545e-06, + "loss": 1.1016, + "step": 3180 + }, + { + "epoch": 0.24522047486894852, + "grad_norm": 3.9855403900146484, + "learning_rate": 8.834231661731656e-06, + "loss": 1.1526, + "step": 3181 + }, + { + "epoch": 0.24529756398396546, + "grad_norm": 3.8271896839141846, + "learning_rate": 8.833430250393376e-06, + "loss": 0.8604, + "step": 3182 + }, + { + "epoch": 0.24537465309898243, + "grad_norm": 4.085109710693359, + "learning_rate": 8.832628600060572e-06, + "loss": 1.0036, + "step": 3183 + }, + { + "epoch": 0.24545174221399937, + "grad_norm": 3.691535711288452, + "learning_rate": 8.831826710783226e-06, + "loss": 1.1316, + "step": 3184 + }, + { + "epoch": 0.24552883132901634, + "grad_norm": 3.610386371612549, + "learning_rate": 8.83102458261133e-06, + "loss": 1.0648, + "step": 3185 + }, + { + "epoch": 0.24560592044403332, + "grad_norm": 3.5450527667999268, + "learning_rate": 8.83022221559489e-06, + "loss": 1.0361, + "step": 3186 + }, + { + "epoch": 0.24568300955905026, + "grad_norm": 3.520220994949341, + "learning_rate": 8.829419609783936e-06, + "loss": 0.8683, + "step": 3187 + }, + { + "epoch": 0.24576009867406723, + "grad_norm": 3.9032230377197266, + "learning_rate": 8.8286167652285e-06, + "loss": 1.0631, + "step": 3188 + }, + { + "epoch": 0.24583718778908417, + "grad_norm": 3.8549489974975586, + "learning_rate": 8.82781368197864e-06, + "loss": 1.0082, + "step": 3189 + }, + { + "epoch": 0.24591427690410114, + "grad_norm": 3.810291051864624, + "learning_rate": 8.827010360084419e-06, + "loss": 0.9334, + "step": 3190 + }, + { + "epoch": 0.2459913660191181, + "grad_norm": 3.9244349002838135, + "learning_rate": 8.826206799595922e-06, + "loss": 1.0369, + "step": 3191 + }, + { + "epoch": 0.24606845513413506, + "grad_norm": 3.6987247467041016, + "learning_rate": 8.825403000563248e-06, + "loss": 0.963, + "step": 3192 + }, + { + "epoch": 0.24614554424915203, + "grad_norm": 3.6557397842407227, + "learning_rate": 8.824598963036511e-06, + "loss": 0.9969, + "step": 3193 + }, + { + "epoch": 0.24622263336416897, + "grad_norm": 3.776524066925049, + "learning_rate": 8.823794687065836e-06, + "loss": 1.0117, + "step": 3194 + }, + { + "epoch": 0.24629972247918594, + "grad_norm": 3.713355779647827, + "learning_rate": 8.822990172701367e-06, + "loss": 0.9675, + "step": 3195 + }, + { + "epoch": 0.2463768115942029, + "grad_norm": 3.96952748298645, + "learning_rate": 8.82218541999326e-06, + "loss": 0.9109, + "step": 3196 + }, + { + "epoch": 0.24645390070921985, + "grad_norm": 3.8969883918762207, + "learning_rate": 8.821380428991686e-06, + "loss": 1.07, + "step": 3197 + }, + { + "epoch": 0.24653098982423682, + "grad_norm": 3.5697784423828125, + "learning_rate": 8.820575199746835e-06, + "loss": 0.9745, + "step": 3198 + }, + { + "epoch": 0.24660807893925377, + "grad_norm": 3.8984580039978027, + "learning_rate": 8.819769732308908e-06, + "loss": 1.0264, + "step": 3199 + }, + { + "epoch": 0.24668516805427074, + "grad_norm": 4.095597267150879, + "learning_rate": 8.818964026728121e-06, + "loss": 1.1447, + "step": 3200 + }, + { + "epoch": 0.2467622571692877, + "grad_norm": 3.7202227115631104, + "learning_rate": 8.818158083054706e-06, + "loss": 0.9318, + "step": 3201 + }, + { + "epoch": 0.24683934628430465, + "grad_norm": 3.9623115062713623, + "learning_rate": 8.817351901338908e-06, + "loss": 1.1674, + "step": 3202 + }, + { + "epoch": 0.24691643539932162, + "grad_norm": 4.0379862785339355, + "learning_rate": 8.816545481630992e-06, + "loss": 0.9857, + "step": 3203 + }, + { + "epoch": 0.24699352451433856, + "grad_norm": 3.6850240230560303, + "learning_rate": 8.815738823981228e-06, + "loss": 0.9235, + "step": 3204 + }, + { + "epoch": 0.24707061362935553, + "grad_norm": 3.994870185852051, + "learning_rate": 8.814931928439914e-06, + "loss": 1.0827, + "step": 3205 + }, + { + "epoch": 0.2471477027443725, + "grad_norm": 3.5296335220336914, + "learning_rate": 8.81412479505735e-06, + "loss": 1.06, + "step": 3206 + }, + { + "epoch": 0.24722479185938945, + "grad_norm": 3.7254252433776855, + "learning_rate": 8.81331742388386e-06, + "loss": 1.0456, + "step": 3207 + }, + { + "epoch": 0.24730188097440642, + "grad_norm": 4.138040542602539, + "learning_rate": 8.81250981496978e-06, + "loss": 1.0802, + "step": 3208 + }, + { + "epoch": 0.24737897008942336, + "grad_norm": 4.0760884284973145, + "learning_rate": 8.811701968365455e-06, + "loss": 1.0602, + "step": 3209 + }, + { + "epoch": 0.24745605920444033, + "grad_norm": 3.428788185119629, + "learning_rate": 8.810893884121255e-06, + "loss": 0.9812, + "step": 3210 + }, + { + "epoch": 0.2475331483194573, + "grad_norm": 3.9063422679901123, + "learning_rate": 8.81008556228756e-06, + "loss": 0.9296, + "step": 3211 + }, + { + "epoch": 0.24761023743447425, + "grad_norm": 3.583909749984741, + "learning_rate": 8.809277002914762e-06, + "loss": 1.0541, + "step": 3212 + }, + { + "epoch": 0.24768732654949122, + "grad_norm": 4.865204811096191, + "learning_rate": 8.808468206053271e-06, + "loss": 1.006, + "step": 3213 + }, + { + "epoch": 0.24776441566450816, + "grad_norm": 3.553562641143799, + "learning_rate": 8.807659171753513e-06, + "loss": 0.9913, + "step": 3214 + }, + { + "epoch": 0.24784150477952513, + "grad_norm": 3.982586145401001, + "learning_rate": 8.806849900065925e-06, + "loss": 1.0069, + "step": 3215 + }, + { + "epoch": 0.2479185938945421, + "grad_norm": 4.025597095489502, + "learning_rate": 8.806040391040962e-06, + "loss": 0.9886, + "step": 3216 + }, + { + "epoch": 0.24799568300955904, + "grad_norm": 3.7594804763793945, + "learning_rate": 8.805230644729093e-06, + "loss": 0.9497, + "step": 3217 + }, + { + "epoch": 0.248072772124576, + "grad_norm": 3.5578064918518066, + "learning_rate": 8.804420661180801e-06, + "loss": 0.9121, + "step": 3218 + }, + { + "epoch": 0.24814986123959296, + "grad_norm": 3.8382201194763184, + "learning_rate": 8.803610440446584e-06, + "loss": 1.0003, + "step": 3219 + }, + { + "epoch": 0.24822695035460993, + "grad_norm": 4.217833995819092, + "learning_rate": 8.802799982576956e-06, + "loss": 1.1424, + "step": 3220 + }, + { + "epoch": 0.2483040394696269, + "grad_norm": 3.7423839569091797, + "learning_rate": 8.801989287622443e-06, + "loss": 1.0629, + "step": 3221 + }, + { + "epoch": 0.24838112858464384, + "grad_norm": 3.7119791507720947, + "learning_rate": 8.801178355633591e-06, + "loss": 1.1239, + "step": 3222 + }, + { + "epoch": 0.2484582176996608, + "grad_norm": 3.545254707336426, + "learning_rate": 8.800367186660955e-06, + "loss": 1.0791, + "step": 3223 + }, + { + "epoch": 0.24853530681467778, + "grad_norm": 3.572874069213867, + "learning_rate": 8.799555780755108e-06, + "loss": 1.0352, + "step": 3224 + }, + { + "epoch": 0.24861239592969472, + "grad_norm": 4.043218612670898, + "learning_rate": 8.798744137966634e-06, + "loss": 1.0846, + "step": 3225 + }, + { + "epoch": 0.2486894850447117, + "grad_norm": 3.8153271675109863, + "learning_rate": 8.797932258346138e-06, + "loss": 1.0246, + "step": 3226 + }, + { + "epoch": 0.24876657415972864, + "grad_norm": 3.6751506328582764, + "learning_rate": 8.797120141944237e-06, + "loss": 0.9848, + "step": 3227 + }, + { + "epoch": 0.2488436632747456, + "grad_norm": 3.716115713119507, + "learning_rate": 8.79630778881156e-06, + "loss": 0.899, + "step": 3228 + }, + { + "epoch": 0.24892075238976258, + "grad_norm": 3.8847246170043945, + "learning_rate": 8.795495198998753e-06, + "loss": 1.0613, + "step": 3229 + }, + { + "epoch": 0.24899784150477952, + "grad_norm": 3.672245502471924, + "learning_rate": 8.79468237255648e-06, + "loss": 1.0189, + "step": 3230 + }, + { + "epoch": 0.2490749306197965, + "grad_norm": 4.07139253616333, + "learning_rate": 8.793869309535411e-06, + "loss": 1.1237, + "step": 3231 + }, + { + "epoch": 0.24915201973481343, + "grad_norm": 3.6188600063323975, + "learning_rate": 8.793056009986243e-06, + "loss": 0.9832, + "step": 3232 + }, + { + "epoch": 0.2492291088498304, + "grad_norm": 3.553046703338623, + "learning_rate": 8.792242473959676e-06, + "loss": 1.0778, + "step": 3233 + }, + { + "epoch": 0.24930619796484738, + "grad_norm": 3.8086423873901367, + "learning_rate": 8.791428701506433e-06, + "loss": 1.0775, + "step": 3234 + }, + { + "epoch": 0.24938328707986432, + "grad_norm": 3.5978622436523438, + "learning_rate": 8.790614692677244e-06, + "loss": 1.0341, + "step": 3235 + }, + { + "epoch": 0.2494603761948813, + "grad_norm": 3.933943271636963, + "learning_rate": 8.789800447522863e-06, + "loss": 1.0398, + "step": 3236 + }, + { + "epoch": 0.24953746530989823, + "grad_norm": 3.6028807163238525, + "learning_rate": 8.78898596609405e-06, + "loss": 0.9908, + "step": 3237 + }, + { + "epoch": 0.2496145544249152, + "grad_norm": 3.7010693550109863, + "learning_rate": 8.788171248441588e-06, + "loss": 0.9977, + "step": 3238 + }, + { + "epoch": 0.24969164353993217, + "grad_norm": 3.41683292388916, + "learning_rate": 8.787356294616266e-06, + "loss": 0.9295, + "step": 3239 + }, + { + "epoch": 0.24976873265494912, + "grad_norm": 3.393082618713379, + "learning_rate": 8.786541104668895e-06, + "loss": 0.9624, + "step": 3240 + }, + { + "epoch": 0.2498458217699661, + "grad_norm": 3.504213333129883, + "learning_rate": 8.785725678650298e-06, + "loss": 1.0289, + "step": 3241 + }, + { + "epoch": 0.24992291088498303, + "grad_norm": 3.6844801902770996, + "learning_rate": 8.78491001661131e-06, + "loss": 0.9146, + "step": 3242 + }, + { + "epoch": 0.25, + "grad_norm": 3.710338830947876, + "learning_rate": 8.784094118602788e-06, + "loss": 1.0341, + "step": 3243 + }, + { + "epoch": 0.25007708911501697, + "grad_norm": 4.066130638122559, + "learning_rate": 8.783277984675593e-06, + "loss": 1.0393, + "step": 3244 + }, + { + "epoch": 0.25015417823003394, + "grad_norm": 3.7111411094665527, + "learning_rate": 8.782461614880611e-06, + "loss": 1.0909, + "step": 3245 + }, + { + "epoch": 0.25023126734505086, + "grad_norm": 3.810129404067993, + "learning_rate": 8.781645009268738e-06, + "loss": 1.0154, + "step": 3246 + }, + { + "epoch": 0.2503083564600678, + "grad_norm": 3.654636859893799, + "learning_rate": 8.780828167890882e-06, + "loss": 0.9366, + "step": 3247 + }, + { + "epoch": 0.2503854455750848, + "grad_norm": 3.8680331707000732, + "learning_rate": 8.780011090797974e-06, + "loss": 1.0582, + "step": 3248 + }, + { + "epoch": 0.25046253469010177, + "grad_norm": 3.812549591064453, + "learning_rate": 8.779193778040948e-06, + "loss": 0.9273, + "step": 3249 + }, + { + "epoch": 0.25053962380511874, + "grad_norm": 3.7380712032318115, + "learning_rate": 8.778376229670766e-06, + "loss": 0.9623, + "step": 3250 + }, + { + "epoch": 0.25061671292013565, + "grad_norm": 4.139057159423828, + "learning_rate": 8.777558445738394e-06, + "loss": 1.1061, + "step": 3251 + }, + { + "epoch": 0.2506938020351526, + "grad_norm": 3.806485891342163, + "learning_rate": 8.776740426294818e-06, + "loss": 1.0968, + "step": 3252 + }, + { + "epoch": 0.2507708911501696, + "grad_norm": 3.924656867980957, + "learning_rate": 8.775922171391035e-06, + "loss": 1.0496, + "step": 3253 + }, + { + "epoch": 0.25084798026518657, + "grad_norm": 3.7743098735809326, + "learning_rate": 8.775103681078063e-06, + "loss": 1.1391, + "step": 3254 + }, + { + "epoch": 0.25092506938020354, + "grad_norm": 3.758241653442383, + "learning_rate": 8.774284955406925e-06, + "loss": 0.9986, + "step": 3255 + }, + { + "epoch": 0.25100215849522045, + "grad_norm": 3.812593460083008, + "learning_rate": 8.77346599442867e-06, + "loss": 0.9571, + "step": 3256 + }, + { + "epoch": 0.2510792476102374, + "grad_norm": 3.9296791553497314, + "learning_rate": 8.772646798194353e-06, + "loss": 1.0788, + "step": 3257 + }, + { + "epoch": 0.2511563367252544, + "grad_norm": 3.9175896644592285, + "learning_rate": 8.771827366755046e-06, + "loss": 1.1493, + "step": 3258 + }, + { + "epoch": 0.25123342584027136, + "grad_norm": 3.4963676929473877, + "learning_rate": 8.771007700161839e-06, + "loss": 1.0635, + "step": 3259 + }, + { + "epoch": 0.25131051495528833, + "grad_norm": 3.776111602783203, + "learning_rate": 8.770187798465832e-06, + "loss": 1.0474, + "step": 3260 + }, + { + "epoch": 0.25138760407030525, + "grad_norm": 3.916110038757324, + "learning_rate": 8.769367661718143e-06, + "loss": 1.0697, + "step": 3261 + }, + { + "epoch": 0.2514646931853222, + "grad_norm": 3.6865622997283936, + "learning_rate": 8.7685472899699e-06, + "loss": 0.9469, + "step": 3262 + }, + { + "epoch": 0.2515417823003392, + "grad_norm": 4.2629594802856445, + "learning_rate": 8.767726683272253e-06, + "loss": 1.1568, + "step": 3263 + }, + { + "epoch": 0.25161887141535616, + "grad_norm": 3.736191511154175, + "learning_rate": 8.766905841676361e-06, + "loss": 1.0092, + "step": 3264 + }, + { + "epoch": 0.25169596053037313, + "grad_norm": 3.6174426078796387, + "learning_rate": 8.766084765233399e-06, + "loss": 0.9476, + "step": 3265 + }, + { + "epoch": 0.25177304964539005, + "grad_norm": 3.8853113651275635, + "learning_rate": 8.765263453994556e-06, + "loss": 0.9572, + "step": 3266 + }, + { + "epoch": 0.251850138760407, + "grad_norm": 5.05413293838501, + "learning_rate": 8.764441908011038e-06, + "loss": 1.0719, + "step": 3267 + }, + { + "epoch": 0.251927227875424, + "grad_norm": 3.739021062850952, + "learning_rate": 8.763620127334063e-06, + "loss": 0.9814, + "step": 3268 + }, + { + "epoch": 0.25200431699044096, + "grad_norm": 3.654526948928833, + "learning_rate": 8.762798112014867e-06, + "loss": 1.0566, + "step": 3269 + }, + { + "epoch": 0.25208140610545793, + "grad_norm": 4.029781818389893, + "learning_rate": 8.761975862104694e-06, + "loss": 1.1511, + "step": 3270 + }, + { + "epoch": 0.25215849522047484, + "grad_norm": 4.01661491394043, + "learning_rate": 8.761153377654811e-06, + "loss": 1.0468, + "step": 3271 + }, + { + "epoch": 0.2522355843354918, + "grad_norm": 3.568437337875366, + "learning_rate": 8.760330658716497e-06, + "loss": 0.9401, + "step": 3272 + }, + { + "epoch": 0.2523126734505088, + "grad_norm": 3.7211058139801025, + "learning_rate": 8.75950770534104e-06, + "loss": 1.0863, + "step": 3273 + }, + { + "epoch": 0.25238976256552575, + "grad_norm": 3.9265799522399902, + "learning_rate": 8.758684517579746e-06, + "loss": 1.0554, + "step": 3274 + }, + { + "epoch": 0.2524668516805427, + "grad_norm": 3.979198694229126, + "learning_rate": 8.757861095483942e-06, + "loss": 1.0998, + "step": 3275 + }, + { + "epoch": 0.25254394079555964, + "grad_norm": 3.5346436500549316, + "learning_rate": 8.75703743910496e-06, + "loss": 0.9852, + "step": 3276 + }, + { + "epoch": 0.2526210299105766, + "grad_norm": 3.676072835922241, + "learning_rate": 8.756213548494152e-06, + "loss": 0.9912, + "step": 3277 + }, + { + "epoch": 0.2526981190255936, + "grad_norm": 3.5911636352539062, + "learning_rate": 8.755389423702884e-06, + "loss": 0.9803, + "step": 3278 + }, + { + "epoch": 0.25277520814061055, + "grad_norm": 3.92370343208313, + "learning_rate": 8.754565064782533e-06, + "loss": 0.9514, + "step": 3279 + }, + { + "epoch": 0.2528522972556275, + "grad_norm": 3.5190653800964355, + "learning_rate": 8.753740471784497e-06, + "loss": 1.0362, + "step": 3280 + }, + { + "epoch": 0.25292938637064444, + "grad_norm": 4.347536087036133, + "learning_rate": 8.752915644760185e-06, + "loss": 1.0841, + "step": 3281 + }, + { + "epoch": 0.2530064754856614, + "grad_norm": 3.641761064529419, + "learning_rate": 8.752090583761017e-06, + "loss": 1.0113, + "step": 3282 + }, + { + "epoch": 0.2530835646006784, + "grad_norm": 3.5918169021606445, + "learning_rate": 8.751265288838435e-06, + "loss": 0.9651, + "step": 3283 + }, + { + "epoch": 0.25316065371569535, + "grad_norm": 3.788519859313965, + "learning_rate": 8.750439760043892e-06, + "loss": 1.0641, + "step": 3284 + }, + { + "epoch": 0.2532377428307123, + "grad_norm": 3.539388656616211, + "learning_rate": 8.749613997428852e-06, + "loss": 1.1261, + "step": 3285 + }, + { + "epoch": 0.2533148319457293, + "grad_norm": 4.3505167961120605, + "learning_rate": 8.748788001044799e-06, + "loss": 1.1336, + "step": 3286 + }, + { + "epoch": 0.2533919210607462, + "grad_norm": 3.823928117752075, + "learning_rate": 8.74796177094323e-06, + "loss": 1.0041, + "step": 3287 + }, + { + "epoch": 0.2534690101757632, + "grad_norm": 3.804438829421997, + "learning_rate": 8.747135307175657e-06, + "loss": 1.0816, + "step": 3288 + }, + { + "epoch": 0.25354609929078015, + "grad_norm": 3.7979674339294434, + "learning_rate": 8.746308609793601e-06, + "loss": 1.0633, + "step": 3289 + }, + { + "epoch": 0.2536231884057971, + "grad_norm": 3.7804579734802246, + "learning_rate": 8.745481678848609e-06, + "loss": 1.09, + "step": 3290 + }, + { + "epoch": 0.2537002775208141, + "grad_norm": 4.1087470054626465, + "learning_rate": 8.744654514392232e-06, + "loss": 1.1016, + "step": 3291 + }, + { + "epoch": 0.253777366635831, + "grad_norm": 3.6745924949645996, + "learning_rate": 8.743827116476039e-06, + "loss": 1.0273, + "step": 3292 + }, + { + "epoch": 0.253854455750848, + "grad_norm": 3.827929735183716, + "learning_rate": 8.742999485151617e-06, + "loss": 1.1064, + "step": 3293 + }, + { + "epoch": 0.25393154486586494, + "grad_norm": 3.5309934616088867, + "learning_rate": 8.742171620470561e-06, + "loss": 0.9478, + "step": 3294 + }, + { + "epoch": 0.2540086339808819, + "grad_norm": 3.4990084171295166, + "learning_rate": 8.741343522484486e-06, + "loss": 0.9285, + "step": 3295 + }, + { + "epoch": 0.2540857230958989, + "grad_norm": 3.40860915184021, + "learning_rate": 8.74051519124502e-06, + "loss": 0.8972, + "step": 3296 + }, + { + "epoch": 0.2541628122109158, + "grad_norm": 3.403424024581909, + "learning_rate": 8.739686626803802e-06, + "loss": 1.059, + "step": 3297 + }, + { + "epoch": 0.25423990132593277, + "grad_norm": 4.193357944488525, + "learning_rate": 8.738857829212495e-06, + "loss": 0.9641, + "step": 3298 + }, + { + "epoch": 0.25431699044094974, + "grad_norm": 4.222657203674316, + "learning_rate": 8.738028798522762e-06, + "loss": 1.0695, + "step": 3299 + }, + { + "epoch": 0.2543940795559667, + "grad_norm": 3.387126922607422, + "learning_rate": 8.737199534786297e-06, + "loss": 1.1003, + "step": 3300 + }, + { + "epoch": 0.2544711686709837, + "grad_norm": 3.9024600982666016, + "learning_rate": 8.736370038054796e-06, + "loss": 1.0785, + "step": 3301 + }, + { + "epoch": 0.2545482577860006, + "grad_norm": 3.78934383392334, + "learning_rate": 8.735540308379974e-06, + "loss": 1.0066, + "step": 3302 + }, + { + "epoch": 0.25462534690101757, + "grad_norm": 3.661975383758545, + "learning_rate": 8.73471034581356e-06, + "loss": 1.0309, + "step": 3303 + }, + { + "epoch": 0.25470243601603454, + "grad_norm": 3.687753915786743, + "learning_rate": 8.733880150407296e-06, + "loss": 1.0173, + "step": 3304 + }, + { + "epoch": 0.2547795251310515, + "grad_norm": 3.935116767883301, + "learning_rate": 8.733049722212946e-06, + "loss": 1.1019, + "step": 3305 + }, + { + "epoch": 0.2548566142460685, + "grad_norm": 3.9937877655029297, + "learning_rate": 8.73221906128228e-06, + "loss": 1.1622, + "step": 3306 + }, + { + "epoch": 0.2549337033610854, + "grad_norm": 3.838475227355957, + "learning_rate": 8.731388167667083e-06, + "loss": 0.977, + "step": 3307 + }, + { + "epoch": 0.25501079247610237, + "grad_norm": 3.6274874210357666, + "learning_rate": 8.73055704141916e-06, + "loss": 1.098, + "step": 3308 + }, + { + "epoch": 0.25508788159111934, + "grad_norm": 3.570650577545166, + "learning_rate": 8.729725682590329e-06, + "loss": 0.9655, + "step": 3309 + }, + { + "epoch": 0.2551649707061363, + "grad_norm": 3.7613351345062256, + "learning_rate": 8.728894091232417e-06, + "loss": 0.9714, + "step": 3310 + }, + { + "epoch": 0.2552420598211533, + "grad_norm": 3.4735591411590576, + "learning_rate": 8.728062267397268e-06, + "loss": 0.8959, + "step": 3311 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 3.8954594135284424, + "learning_rate": 8.727230211136747e-06, + "loss": 1.0228, + "step": 3312 + }, + { + "epoch": 0.25539623805118716, + "grad_norm": 4.0239949226379395, + "learning_rate": 8.726397922502727e-06, + "loss": 1.0255, + "step": 3313 + }, + { + "epoch": 0.25547332716620413, + "grad_norm": 3.6205320358276367, + "learning_rate": 8.725565401547096e-06, + "loss": 0.8827, + "step": 3314 + }, + { + "epoch": 0.2555504162812211, + "grad_norm": 3.7505264282226562, + "learning_rate": 8.724732648321756e-06, + "loss": 1.0148, + "step": 3315 + }, + { + "epoch": 0.2556275053962381, + "grad_norm": 3.5387229919433594, + "learning_rate": 8.723899662878627e-06, + "loss": 0.9492, + "step": 3316 + }, + { + "epoch": 0.255704594511255, + "grad_norm": 3.82112193107605, + "learning_rate": 8.72306644526964e-06, + "loss": 0.9834, + "step": 3317 + }, + { + "epoch": 0.25578168362627196, + "grad_norm": 3.7389678955078125, + "learning_rate": 8.722232995546742e-06, + "loss": 0.9932, + "step": 3318 + }, + { + "epoch": 0.25585877274128893, + "grad_norm": 3.6923506259918213, + "learning_rate": 8.721399313761896e-06, + "loss": 0.9353, + "step": 3319 + }, + { + "epoch": 0.2559358618563059, + "grad_norm": 3.9490480422973633, + "learning_rate": 8.720565399967076e-06, + "loss": 1.0178, + "step": 3320 + }, + { + "epoch": 0.2560129509713229, + "grad_norm": 3.4159951210021973, + "learning_rate": 8.719731254214271e-06, + "loss": 0.967, + "step": 3321 + }, + { + "epoch": 0.2560900400863398, + "grad_norm": 3.7180635929107666, + "learning_rate": 8.71889687655549e-06, + "loss": 0.902, + "step": 3322 + }, + { + "epoch": 0.25616712920135676, + "grad_norm": 4.072498321533203, + "learning_rate": 8.718062267042749e-06, + "loss": 1.0151, + "step": 3323 + }, + { + "epoch": 0.25624421831637373, + "grad_norm": 4.77862548828125, + "learning_rate": 8.71722742572808e-06, + "loss": 1.0888, + "step": 3324 + }, + { + "epoch": 0.2563213074313907, + "grad_norm": 3.858112096786499, + "learning_rate": 8.716392352663535e-06, + "loss": 1.015, + "step": 3325 + }, + { + "epoch": 0.25639839654640767, + "grad_norm": 3.6426641941070557, + "learning_rate": 8.715557047901174e-06, + "loss": 1.0053, + "step": 3326 + }, + { + "epoch": 0.2564754856614246, + "grad_norm": 3.9015743732452393, + "learning_rate": 8.714721511493074e-06, + "loss": 1.2689, + "step": 3327 + }, + { + "epoch": 0.25655257477644156, + "grad_norm": 4.314653396606445, + "learning_rate": 8.713885743491327e-06, + "loss": 1.0598, + "step": 3328 + }, + { + "epoch": 0.2566296638914585, + "grad_norm": 4.168483257293701, + "learning_rate": 8.713049743948038e-06, + "loss": 1.1855, + "step": 3329 + }, + { + "epoch": 0.2567067530064755, + "grad_norm": 4.1147356033325195, + "learning_rate": 8.71221351291533e-06, + "loss": 1.0772, + "step": 3330 + }, + { + "epoch": 0.25678384212149247, + "grad_norm": 4.284569263458252, + "learning_rate": 8.711377050445333e-06, + "loss": 0.9965, + "step": 3331 + }, + { + "epoch": 0.2568609312365094, + "grad_norm": 4.178953647613525, + "learning_rate": 8.710540356590198e-06, + "loss": 1.056, + "step": 3332 + }, + { + "epoch": 0.25693802035152635, + "grad_norm": 3.9649317264556885, + "learning_rate": 8.709703431402092e-06, + "loss": 0.9488, + "step": 3333 + }, + { + "epoch": 0.2570151094665433, + "grad_norm": 3.9904897212982178, + "learning_rate": 8.70886627493319e-06, + "loss": 1.033, + "step": 3334 + }, + { + "epoch": 0.2570921985815603, + "grad_norm": 3.681057929992676, + "learning_rate": 8.708028887235682e-06, + "loss": 0.9747, + "step": 3335 + }, + { + "epoch": 0.25716928769657726, + "grad_norm": 3.687772750854492, + "learning_rate": 8.707191268361778e-06, + "loss": 0.9872, + "step": 3336 + }, + { + "epoch": 0.2572463768115942, + "grad_norm": 3.913752794265747, + "learning_rate": 8.7063534183637e-06, + "loss": 1.085, + "step": 3337 + }, + { + "epoch": 0.25732346592661115, + "grad_norm": 4.11881685256958, + "learning_rate": 8.705515337293682e-06, + "loss": 1.0011, + "step": 3338 + }, + { + "epoch": 0.2574005550416281, + "grad_norm": 3.4855170249938965, + "learning_rate": 8.704677025203972e-06, + "loss": 0.9699, + "step": 3339 + }, + { + "epoch": 0.2574776441566451, + "grad_norm": 3.873643636703491, + "learning_rate": 8.703838482146837e-06, + "loss": 1.0217, + "step": 3340 + }, + { + "epoch": 0.25755473327166206, + "grad_norm": 3.451205015182495, + "learning_rate": 8.702999708174557e-06, + "loss": 0.9768, + "step": 3341 + }, + { + "epoch": 0.257631822386679, + "grad_norm": 3.673360586166382, + "learning_rate": 8.702160703339422e-06, + "loss": 1.0151, + "step": 3342 + }, + { + "epoch": 0.25770891150169595, + "grad_norm": 3.864973306655884, + "learning_rate": 8.701321467693741e-06, + "loss": 1.0275, + "step": 3343 + }, + { + "epoch": 0.2577860006167129, + "grad_norm": 3.8561880588531494, + "learning_rate": 8.700482001289838e-06, + "loss": 1.0697, + "step": 3344 + }, + { + "epoch": 0.2578630897317299, + "grad_norm": 4.72223424911499, + "learning_rate": 8.699642304180045e-06, + "loss": 1.1577, + "step": 3345 + }, + { + "epoch": 0.25794017884674686, + "grad_norm": 3.793133020401001, + "learning_rate": 8.698802376416718e-06, + "loss": 0.9992, + "step": 3346 + }, + { + "epoch": 0.2580172679617638, + "grad_norm": 3.360287666320801, + "learning_rate": 8.697962218052217e-06, + "loss": 0.9239, + "step": 3347 + }, + { + "epoch": 0.25809435707678074, + "grad_norm": 3.907996416091919, + "learning_rate": 8.697121829138925e-06, + "loss": 0.9704, + "step": 3348 + }, + { + "epoch": 0.2581714461917977, + "grad_norm": 3.603130578994751, + "learning_rate": 8.696281209729234e-06, + "loss": 0.9822, + "step": 3349 + }, + { + "epoch": 0.2582485353068147, + "grad_norm": 3.7168142795562744, + "learning_rate": 8.695440359875554e-06, + "loss": 1.0306, + "step": 3350 + }, + { + "epoch": 0.25832562442183166, + "grad_norm": 3.7132720947265625, + "learning_rate": 8.694599279630306e-06, + "loss": 1.0643, + "step": 3351 + }, + { + "epoch": 0.25840271353684857, + "grad_norm": 3.9718921184539795, + "learning_rate": 8.693757969045928e-06, + "loss": 0.9594, + "step": 3352 + }, + { + "epoch": 0.25847980265186554, + "grad_norm": 3.7555110454559326, + "learning_rate": 8.692916428174872e-06, + "loss": 1.0797, + "step": 3353 + }, + { + "epoch": 0.2585568917668825, + "grad_norm": 3.6487925052642822, + "learning_rate": 8.692074657069602e-06, + "loss": 0.9086, + "step": 3354 + }, + { + "epoch": 0.2586339808818995, + "grad_norm": 3.890230178833008, + "learning_rate": 8.6912326557826e-06, + "loss": 1.0957, + "step": 3355 + }, + { + "epoch": 0.25871106999691645, + "grad_norm": 3.4611878395080566, + "learning_rate": 8.690390424366358e-06, + "loss": 0.9665, + "step": 3356 + }, + { + "epoch": 0.25878815911193337, + "grad_norm": 3.949563980102539, + "learning_rate": 8.689547962873386e-06, + "loss": 0.9853, + "step": 3357 + }, + { + "epoch": 0.25886524822695034, + "grad_norm": 3.7639212608337402, + "learning_rate": 8.688705271356208e-06, + "loss": 1.0879, + "step": 3358 + }, + { + "epoch": 0.2589423373419673, + "grad_norm": 3.696392297744751, + "learning_rate": 8.68786234986736e-06, + "loss": 0.9405, + "step": 3359 + }, + { + "epoch": 0.2590194264569843, + "grad_norm": 4.061792850494385, + "learning_rate": 8.687019198459395e-06, + "loss": 1.0346, + "step": 3360 + }, + { + "epoch": 0.25909651557200125, + "grad_norm": 3.8702781200408936, + "learning_rate": 8.686175817184878e-06, + "loss": 1.0629, + "step": 3361 + }, + { + "epoch": 0.25917360468701817, + "grad_norm": 3.8583784103393555, + "learning_rate": 8.685332206096391e-06, + "loss": 1.0525, + "step": 3362 + }, + { + "epoch": 0.25925069380203514, + "grad_norm": 3.554065465927124, + "learning_rate": 8.684488365246526e-06, + "loss": 0.9169, + "step": 3363 + }, + { + "epoch": 0.2593277829170521, + "grad_norm": 3.972473382949829, + "learning_rate": 8.683644294687893e-06, + "loss": 1.012, + "step": 3364 + }, + { + "epoch": 0.2594048720320691, + "grad_norm": 4.070230484008789, + "learning_rate": 8.68279999447312e-06, + "loss": 0.9605, + "step": 3365 + }, + { + "epoch": 0.25948196114708605, + "grad_norm": 3.5376861095428467, + "learning_rate": 8.681955464654839e-06, + "loss": 0.8759, + "step": 3366 + }, + { + "epoch": 0.25955905026210296, + "grad_norm": 3.8391125202178955, + "learning_rate": 8.681110705285705e-06, + "loss": 0.989, + "step": 3367 + }, + { + "epoch": 0.25963613937711993, + "grad_norm": 3.567213296890259, + "learning_rate": 8.680265716418381e-06, + "loss": 1.0166, + "step": 3368 + }, + { + "epoch": 0.2597132284921369, + "grad_norm": 3.484469413757324, + "learning_rate": 8.679420498105553e-06, + "loss": 0.9996, + "step": 3369 + }, + { + "epoch": 0.2597903176071539, + "grad_norm": 3.5866034030914307, + "learning_rate": 8.678575050399912e-06, + "loss": 0.9088, + "step": 3370 + }, + { + "epoch": 0.25986740672217085, + "grad_norm": 3.686366319656372, + "learning_rate": 8.677729373354169e-06, + "loss": 1.0459, + "step": 3371 + }, + { + "epoch": 0.2599444958371878, + "grad_norm": 3.622049570083618, + "learning_rate": 8.676883467021046e-06, + "loss": 1.1149, + "step": 3372 + }, + { + "epoch": 0.26002158495220473, + "grad_norm": 3.5664947032928467, + "learning_rate": 8.676037331453283e-06, + "loss": 1.0691, + "step": 3373 + }, + { + "epoch": 0.2600986740672217, + "grad_norm": 3.5930142402648926, + "learning_rate": 8.675190966703631e-06, + "loss": 1.0553, + "step": 3374 + }, + { + "epoch": 0.2601757631822387, + "grad_norm": 3.7880308628082275, + "learning_rate": 8.674344372824855e-06, + "loss": 1.024, + "step": 3375 + }, + { + "epoch": 0.26025285229725564, + "grad_norm": 4.235076427459717, + "learning_rate": 8.673497549869738e-06, + "loss": 1.0325, + "step": 3376 + }, + { + "epoch": 0.2603299414122726, + "grad_norm": 3.9409871101379395, + "learning_rate": 8.672650497891075e-06, + "loss": 1.0099, + "step": 3377 + }, + { + "epoch": 0.26040703052728953, + "grad_norm": 3.7281947135925293, + "learning_rate": 8.671803216941674e-06, + "loss": 1.0297, + "step": 3378 + }, + { + "epoch": 0.2604841196423065, + "grad_norm": 4.101794719696045, + "learning_rate": 8.67095570707436e-06, + "loss": 1.0915, + "step": 3379 + }, + { + "epoch": 0.26056120875732347, + "grad_norm": 3.782160997390747, + "learning_rate": 8.67010796834197e-06, + "loss": 1.0381, + "step": 3380 + }, + { + "epoch": 0.26063829787234044, + "grad_norm": 3.6193161010742188, + "learning_rate": 8.669260000797355e-06, + "loss": 0.9774, + "step": 3381 + }, + { + "epoch": 0.2607153869873574, + "grad_norm": 3.8134310245513916, + "learning_rate": 8.668411804493384e-06, + "loss": 1.0437, + "step": 3382 + }, + { + "epoch": 0.2607924761023743, + "grad_norm": 3.7592546939849854, + "learning_rate": 8.667563379482934e-06, + "loss": 0.922, + "step": 3383 + }, + { + "epoch": 0.2608695652173913, + "grad_norm": 3.698714256286621, + "learning_rate": 8.666714725818903e-06, + "loss": 0.9594, + "step": 3384 + }, + { + "epoch": 0.26094665433240827, + "grad_norm": 3.6574625968933105, + "learning_rate": 8.6658658435542e-06, + "loss": 1.0836, + "step": 3385 + }, + { + "epoch": 0.26102374344742524, + "grad_norm": 3.509253740310669, + "learning_rate": 8.665016732741748e-06, + "loss": 1.0096, + "step": 3386 + }, + { + "epoch": 0.2611008325624422, + "grad_norm": 3.7333405017852783, + "learning_rate": 8.664167393434484e-06, + "loss": 1.1459, + "step": 3387 + }, + { + "epoch": 0.2611779216774591, + "grad_norm": 3.782824993133545, + "learning_rate": 8.66331782568536e-06, + "loss": 1.0318, + "step": 3388 + }, + { + "epoch": 0.2612550107924761, + "grad_norm": 3.6097073554992676, + "learning_rate": 8.662468029547341e-06, + "loss": 0.8997, + "step": 3389 + }, + { + "epoch": 0.26133209990749307, + "grad_norm": 3.526141405105591, + "learning_rate": 8.661618005073412e-06, + "loss": 0.9869, + "step": 3390 + }, + { + "epoch": 0.26140918902251004, + "grad_norm": 3.5635085105895996, + "learning_rate": 8.66076775231656e-06, + "loss": 0.9146, + "step": 3391 + }, + { + "epoch": 0.261486278137527, + "grad_norm": 3.9402894973754883, + "learning_rate": 8.659917271329801e-06, + "loss": 1.1514, + "step": 3392 + }, + { + "epoch": 0.2615633672525439, + "grad_norm": 4.007856845855713, + "learning_rate": 8.659066562166157e-06, + "loss": 1.0842, + "step": 3393 + }, + { + "epoch": 0.2616404563675609, + "grad_norm": 3.9499897956848145, + "learning_rate": 8.65821562487866e-06, + "loss": 1.0736, + "step": 3394 + }, + { + "epoch": 0.26171754548257786, + "grad_norm": 4.446572303771973, + "learning_rate": 8.657364459520367e-06, + "loss": 1.0391, + "step": 3395 + }, + { + "epoch": 0.26179463459759483, + "grad_norm": 3.946072816848755, + "learning_rate": 8.656513066144342e-06, + "loss": 1.1018, + "step": 3396 + }, + { + "epoch": 0.2618717237126118, + "grad_norm": 3.971186637878418, + "learning_rate": 8.655661444803664e-06, + "loss": 1.0861, + "step": 3397 + }, + { + "epoch": 0.2619488128276287, + "grad_norm": 3.788757801055908, + "learning_rate": 8.654809595551429e-06, + "loss": 1.1189, + "step": 3398 + }, + { + "epoch": 0.2620259019426457, + "grad_norm": 3.536916732788086, + "learning_rate": 8.653957518440743e-06, + "loss": 0.9544, + "step": 3399 + }, + { + "epoch": 0.26210299105766266, + "grad_norm": 3.843003749847412, + "learning_rate": 8.653105213524733e-06, + "loss": 0.9992, + "step": 3400 + }, + { + "epoch": 0.26218008017267963, + "grad_norm": 3.6438448429107666, + "learning_rate": 8.65225268085653e-06, + "loss": 1.0176, + "step": 3401 + }, + { + "epoch": 0.2622571692876966, + "grad_norm": 4.137069225311279, + "learning_rate": 8.65139992048929e-06, + "loss": 1.0994, + "step": 3402 + }, + { + "epoch": 0.2623342584027135, + "grad_norm": 3.9459714889526367, + "learning_rate": 8.650546932476173e-06, + "loss": 1.0303, + "step": 3403 + }, + { + "epoch": 0.2624113475177305, + "grad_norm": 3.64534068107605, + "learning_rate": 8.649693716870364e-06, + "loss": 0.9538, + "step": 3404 + }, + { + "epoch": 0.26248843663274746, + "grad_norm": 3.5429468154907227, + "learning_rate": 8.648840273725055e-06, + "loss": 0.9813, + "step": 3405 + }, + { + "epoch": 0.26256552574776443, + "grad_norm": 3.9356422424316406, + "learning_rate": 8.64798660309345e-06, + "loss": 1.0192, + "step": 3406 + }, + { + "epoch": 0.2626426148627814, + "grad_norm": 3.7008917331695557, + "learning_rate": 8.647132705028776e-06, + "loss": 1.0213, + "step": 3407 + }, + { + "epoch": 0.2627197039777983, + "grad_norm": 3.5144479274749756, + "learning_rate": 8.646278579584265e-06, + "loss": 0.9657, + "step": 3408 + }, + { + "epoch": 0.2627967930928153, + "grad_norm": 3.8584909439086914, + "learning_rate": 8.64542422681317e-06, + "loss": 1.1071, + "step": 3409 + }, + { + "epoch": 0.26287388220783225, + "grad_norm": 3.93780779838562, + "learning_rate": 8.644569646768755e-06, + "loss": 1.0209, + "step": 3410 + }, + { + "epoch": 0.2629509713228492, + "grad_norm": 3.6556358337402344, + "learning_rate": 8.643714839504296e-06, + "loss": 1.0174, + "step": 3411 + }, + { + "epoch": 0.2630280604378662, + "grad_norm": 4.030715465545654, + "learning_rate": 8.64285980507309e-06, + "loss": 1.0071, + "step": 3412 + }, + { + "epoch": 0.2631051495528831, + "grad_norm": 3.5671634674072266, + "learning_rate": 8.642004543528442e-06, + "loss": 1.0613, + "step": 3413 + }, + { + "epoch": 0.2631822386679001, + "grad_norm": 3.487668991088867, + "learning_rate": 8.641149054923673e-06, + "loss": 0.9665, + "step": 3414 + }, + { + "epoch": 0.26325932778291705, + "grad_norm": 3.8126344680786133, + "learning_rate": 8.640293339312119e-06, + "loss": 0.9724, + "step": 3415 + }, + { + "epoch": 0.263336416897934, + "grad_norm": 3.6022071838378906, + "learning_rate": 8.639437396747127e-06, + "loss": 1.0247, + "step": 3416 + }, + { + "epoch": 0.263413506012951, + "grad_norm": 3.6988253593444824, + "learning_rate": 8.638581227282064e-06, + "loss": 1.0596, + "step": 3417 + }, + { + "epoch": 0.2634905951279679, + "grad_norm": 3.569603204727173, + "learning_rate": 8.637724830970307e-06, + "loss": 1.0096, + "step": 3418 + }, + { + "epoch": 0.2635676842429849, + "grad_norm": 3.479275941848755, + "learning_rate": 8.636868207865244e-06, + "loss": 1.0446, + "step": 3419 + }, + { + "epoch": 0.26364477335800185, + "grad_norm": 3.994907855987549, + "learning_rate": 8.636011358020286e-06, + "loss": 0.9947, + "step": 3420 + }, + { + "epoch": 0.2637218624730188, + "grad_norm": 3.892772912979126, + "learning_rate": 8.635154281488851e-06, + "loss": 1.1448, + "step": 3421 + }, + { + "epoch": 0.2637989515880358, + "grad_norm": 3.7717888355255127, + "learning_rate": 8.634296978324374e-06, + "loss": 1.0227, + "step": 3422 + }, + { + "epoch": 0.2638760407030527, + "grad_norm": 3.6846694946289062, + "learning_rate": 8.633439448580302e-06, + "loss": 0.9988, + "step": 3423 + }, + { + "epoch": 0.2639531298180697, + "grad_norm": 3.5329177379608154, + "learning_rate": 8.6325816923101e-06, + "loss": 0.9628, + "step": 3424 + }, + { + "epoch": 0.26403021893308665, + "grad_norm": 3.6999776363372803, + "learning_rate": 8.631723709567242e-06, + "loss": 1.0493, + "step": 3425 + }, + { + "epoch": 0.2641073080481036, + "grad_norm": 3.73701810836792, + "learning_rate": 8.630865500405218e-06, + "loss": 1.0119, + "step": 3426 + }, + { + "epoch": 0.2641843971631206, + "grad_norm": 3.6966707706451416, + "learning_rate": 8.630007064877538e-06, + "loss": 1.0202, + "step": 3427 + }, + { + "epoch": 0.2642614862781375, + "grad_norm": 3.5695645809173584, + "learning_rate": 8.629148403037715e-06, + "loss": 0.9439, + "step": 3428 + }, + { + "epoch": 0.2643385753931545, + "grad_norm": 3.6183085441589355, + "learning_rate": 8.628289514939287e-06, + "loss": 1.0232, + "step": 3429 + }, + { + "epoch": 0.26441566450817144, + "grad_norm": 3.7045369148254395, + "learning_rate": 8.6274304006358e-06, + "loss": 0.932, + "step": 3430 + }, + { + "epoch": 0.2644927536231884, + "grad_norm": 3.988922119140625, + "learning_rate": 8.626571060180812e-06, + "loss": 1.024, + "step": 3431 + }, + { + "epoch": 0.2645698427382054, + "grad_norm": 3.669790029525757, + "learning_rate": 8.625711493627902e-06, + "loss": 1.0481, + "step": 3432 + }, + { + "epoch": 0.2646469318532223, + "grad_norm": 3.9124808311462402, + "learning_rate": 8.62485170103066e-06, + "loss": 1.0879, + "step": 3433 + }, + { + "epoch": 0.26472402096823927, + "grad_norm": 3.7043819427490234, + "learning_rate": 8.623991682442685e-06, + "loss": 1.0116, + "step": 3434 + }, + { + "epoch": 0.26480111008325624, + "grad_norm": 3.8101680278778076, + "learning_rate": 8.623131437917598e-06, + "loss": 1.0168, + "step": 3435 + }, + { + "epoch": 0.2648781991982732, + "grad_norm": 3.652815103530884, + "learning_rate": 8.622270967509032e-06, + "loss": 1.1259, + "step": 3436 + }, + { + "epoch": 0.2649552883132902, + "grad_norm": 3.873318910598755, + "learning_rate": 8.621410271270632e-06, + "loss": 1.1589, + "step": 3437 + }, + { + "epoch": 0.2650323774283071, + "grad_norm": 3.489452362060547, + "learning_rate": 8.620549349256056e-06, + "loss": 0.9174, + "step": 3438 + }, + { + "epoch": 0.26510946654332407, + "grad_norm": 3.7744300365448, + "learning_rate": 8.61968820151898e-06, + "loss": 1.0291, + "step": 3439 + }, + { + "epoch": 0.26518655565834104, + "grad_norm": 3.8608028888702393, + "learning_rate": 8.61882682811309e-06, + "loss": 1.0441, + "step": 3440 + }, + { + "epoch": 0.265263644773358, + "grad_norm": 3.826913595199585, + "learning_rate": 8.61796522909209e-06, + "loss": 1.058, + "step": 3441 + }, + { + "epoch": 0.265340733888375, + "grad_norm": 4.063562393188477, + "learning_rate": 8.617103404509699e-06, + "loss": 0.9656, + "step": 3442 + }, + { + "epoch": 0.2654178230033919, + "grad_norm": 4.010993957519531, + "learning_rate": 8.616241354419642e-06, + "loss": 1.1192, + "step": 3443 + }, + { + "epoch": 0.26549491211840887, + "grad_norm": 4.178236961364746, + "learning_rate": 8.615379078875664e-06, + "loss": 0.9984, + "step": 3444 + }, + { + "epoch": 0.26557200123342584, + "grad_norm": 4.172964572906494, + "learning_rate": 8.614516577931526e-06, + "loss": 1.0439, + "step": 3445 + }, + { + "epoch": 0.2656490903484428, + "grad_norm": 3.271082878112793, + "learning_rate": 8.613653851641001e-06, + "loss": 0.9694, + "step": 3446 + }, + { + "epoch": 0.2657261794634598, + "grad_norm": 3.594104766845703, + "learning_rate": 8.612790900057873e-06, + "loss": 1.0078, + "step": 3447 + }, + { + "epoch": 0.2658032685784767, + "grad_norm": 3.8845951557159424, + "learning_rate": 8.611927723235943e-06, + "loss": 1.1074, + "step": 3448 + }, + { + "epoch": 0.26588035769349366, + "grad_norm": 3.545318365097046, + "learning_rate": 8.611064321229027e-06, + "loss": 1.0609, + "step": 3449 + }, + { + "epoch": 0.26595744680851063, + "grad_norm": 3.784323215484619, + "learning_rate": 8.610200694090951e-06, + "loss": 0.9997, + "step": 3450 + }, + { + "epoch": 0.2660345359235276, + "grad_norm": 3.6356170177459717, + "learning_rate": 8.609336841875561e-06, + "loss": 0.9148, + "step": 3451 + }, + { + "epoch": 0.2661116250385446, + "grad_norm": 4.235599040985107, + "learning_rate": 8.608472764636714e-06, + "loss": 1.1123, + "step": 3452 + }, + { + "epoch": 0.2661887141535615, + "grad_norm": 3.789675712585449, + "learning_rate": 8.607608462428273e-06, + "loss": 1.0678, + "step": 3453 + }, + { + "epoch": 0.26626580326857846, + "grad_norm": 3.547661542892456, + "learning_rate": 8.606743935304134e-06, + "loss": 1.0874, + "step": 3454 + }, + { + "epoch": 0.26634289238359543, + "grad_norm": 3.812304735183716, + "learning_rate": 8.605879183318188e-06, + "loss": 0.9825, + "step": 3455 + }, + { + "epoch": 0.2664199814986124, + "grad_norm": 3.6646666526794434, + "learning_rate": 8.605014206524352e-06, + "loss": 1.097, + "step": 3456 + }, + { + "epoch": 0.26649707061362937, + "grad_norm": 3.943058490753174, + "learning_rate": 8.60414900497655e-06, + "loss": 0.9482, + "step": 3457 + }, + { + "epoch": 0.26657415972864634, + "grad_norm": 3.732710838317871, + "learning_rate": 8.603283578728723e-06, + "loss": 1.0008, + "step": 3458 + }, + { + "epoch": 0.26665124884366326, + "grad_norm": 3.5974366664886475, + "learning_rate": 8.60241792783483e-06, + "loss": 1.0185, + "step": 3459 + }, + { + "epoch": 0.26672833795868023, + "grad_norm": 3.4493045806884766, + "learning_rate": 8.601552052348833e-06, + "loss": 1.0396, + "step": 3460 + }, + { + "epoch": 0.2668054270736972, + "grad_norm": 3.9309725761413574, + "learning_rate": 8.60068595232472e-06, + "loss": 0.9397, + "step": 3461 + }, + { + "epoch": 0.26688251618871417, + "grad_norm": 3.614811420440674, + "learning_rate": 8.599819627816486e-06, + "loss": 1.0507, + "step": 3462 + }, + { + "epoch": 0.26695960530373114, + "grad_norm": 3.847226619720459, + "learning_rate": 8.598953078878142e-06, + "loss": 1.0411, + "step": 3463 + }, + { + "epoch": 0.26703669441874806, + "grad_norm": 4.487602233886719, + "learning_rate": 8.598086305563714e-06, + "loss": 1.1693, + "step": 3464 + }, + { + "epoch": 0.267113783533765, + "grad_norm": 3.617213487625122, + "learning_rate": 8.597219307927239e-06, + "loss": 1.0972, + "step": 3465 + }, + { + "epoch": 0.267190872648782, + "grad_norm": 4.097979545593262, + "learning_rate": 8.59635208602277e-06, + "loss": 1.0642, + "step": 3466 + }, + { + "epoch": 0.26726796176379897, + "grad_norm": 3.6943976879119873, + "learning_rate": 8.595484639904375e-06, + "loss": 1.0495, + "step": 3467 + }, + { + "epoch": 0.26734505087881594, + "grad_norm": 4.322422981262207, + "learning_rate": 8.594616969626134e-06, + "loss": 1.0781, + "step": 3468 + }, + { + "epoch": 0.26742213999383285, + "grad_norm": 3.314831495285034, + "learning_rate": 8.593749075242143e-06, + "loss": 0.8829, + "step": 3469 + }, + { + "epoch": 0.2674992291088498, + "grad_norm": 3.7989742755889893, + "learning_rate": 8.592880956806509e-06, + "loss": 0.9417, + "step": 3470 + }, + { + "epoch": 0.2675763182238668, + "grad_norm": 3.5251259803771973, + "learning_rate": 8.592012614373355e-06, + "loss": 0.9525, + "step": 3471 + }, + { + "epoch": 0.26765340733888376, + "grad_norm": 3.975076198577881, + "learning_rate": 8.591144047996817e-06, + "loss": 1.1792, + "step": 3472 + }, + { + "epoch": 0.26773049645390073, + "grad_norm": 3.9003539085388184, + "learning_rate": 8.59027525773105e-06, + "loss": 1.0923, + "step": 3473 + }, + { + "epoch": 0.26780758556891765, + "grad_norm": 3.6139657497406006, + "learning_rate": 8.589406243630212e-06, + "loss": 0.9957, + "step": 3474 + }, + { + "epoch": 0.2678846746839346, + "grad_norm": 4.016913414001465, + "learning_rate": 8.588537005748484e-06, + "loss": 1.1282, + "step": 3475 + }, + { + "epoch": 0.2679617637989516, + "grad_norm": 3.5826406478881836, + "learning_rate": 8.587667544140063e-06, + "loss": 1.0285, + "step": 3476 + }, + { + "epoch": 0.26803885291396856, + "grad_norm": 3.5834274291992188, + "learning_rate": 8.586797858859149e-06, + "loss": 0.941, + "step": 3477 + }, + { + "epoch": 0.26811594202898553, + "grad_norm": 3.8702480792999268, + "learning_rate": 8.585927949959965e-06, + "loss": 1.0519, + "step": 3478 + }, + { + "epoch": 0.26819303114400245, + "grad_norm": 3.2615435123443604, + "learning_rate": 8.585057817496747e-06, + "loss": 0.9371, + "step": 3479 + }, + { + "epoch": 0.2682701202590194, + "grad_norm": 3.683424949645996, + "learning_rate": 8.584187461523741e-06, + "loss": 0.9695, + "step": 3480 + }, + { + "epoch": 0.2683472093740364, + "grad_norm": 3.533581018447876, + "learning_rate": 8.583316882095209e-06, + "loss": 1.0256, + "step": 3481 + }, + { + "epoch": 0.26842429848905336, + "grad_norm": 3.866887331008911, + "learning_rate": 8.582446079265428e-06, + "loss": 1.115, + "step": 3482 + }, + { + "epoch": 0.26850138760407033, + "grad_norm": 3.444634437561035, + "learning_rate": 8.581575053088687e-06, + "loss": 0.9583, + "step": 3483 + }, + { + "epoch": 0.26857847671908724, + "grad_norm": 3.9610986709594727, + "learning_rate": 8.580703803619292e-06, + "loss": 0.9974, + "step": 3484 + }, + { + "epoch": 0.2686555658341042, + "grad_norm": 3.6639208793640137, + "learning_rate": 8.57983233091156e-06, + "loss": 1.0442, + "step": 3485 + }, + { + "epoch": 0.2687326549491212, + "grad_norm": 3.690537929534912, + "learning_rate": 8.578960635019822e-06, + "loss": 1.0666, + "step": 3486 + }, + { + "epoch": 0.26880974406413816, + "grad_norm": 3.4930202960968018, + "learning_rate": 8.578088715998425e-06, + "loss": 0.9999, + "step": 3487 + }, + { + "epoch": 0.2688868331791551, + "grad_norm": 3.7361273765563965, + "learning_rate": 8.577216573901727e-06, + "loss": 0.9615, + "step": 3488 + }, + { + "epoch": 0.26896392229417204, + "grad_norm": 3.7041215896606445, + "learning_rate": 8.576344208784104e-06, + "loss": 1.1125, + "step": 3489 + }, + { + "epoch": 0.269041011409189, + "grad_norm": 3.4133498668670654, + "learning_rate": 8.575471620699942e-06, + "loss": 0.9674, + "step": 3490 + }, + { + "epoch": 0.269118100524206, + "grad_norm": 3.8262181282043457, + "learning_rate": 8.57459880970364e-06, + "loss": 1.1, + "step": 3491 + }, + { + "epoch": 0.26919518963922295, + "grad_norm": 3.9435431957244873, + "learning_rate": 8.573725775849617e-06, + "loss": 1.1601, + "step": 3492 + }, + { + "epoch": 0.2692722787542399, + "grad_norm": 4.27089786529541, + "learning_rate": 8.5728525191923e-06, + "loss": 1.085, + "step": 3493 + }, + { + "epoch": 0.26934936786925684, + "grad_norm": 3.4078094959259033, + "learning_rate": 8.571979039786135e-06, + "loss": 0.9794, + "step": 3494 + }, + { + "epoch": 0.2694264569842738, + "grad_norm": 3.4169962406158447, + "learning_rate": 8.571105337685575e-06, + "loss": 0.9074, + "step": 3495 + }, + { + "epoch": 0.2695035460992908, + "grad_norm": 3.312422752380371, + "learning_rate": 8.570231412945092e-06, + "loss": 0.9965, + "step": 3496 + }, + { + "epoch": 0.26958063521430775, + "grad_norm": 4.496835231781006, + "learning_rate": 8.569357265619172e-06, + "loss": 1.0674, + "step": 3497 + }, + { + "epoch": 0.2696577243293247, + "grad_norm": 3.7908670902252197, + "learning_rate": 8.56848289576231e-06, + "loss": 0.9237, + "step": 3498 + }, + { + "epoch": 0.26973481344434164, + "grad_norm": 3.9191906452178955, + "learning_rate": 8.567608303429024e-06, + "loss": 0.9648, + "step": 3499 + }, + { + "epoch": 0.2698119025593586, + "grad_norm": 3.607534408569336, + "learning_rate": 8.566733488673837e-06, + "loss": 1.0898, + "step": 3500 + }, + { + "epoch": 0.2698889916743756, + "grad_norm": 4.031132698059082, + "learning_rate": 8.56585845155129e-06, + "loss": 0.9965, + "step": 3501 + }, + { + "epoch": 0.26996608078939255, + "grad_norm": 3.5480751991271973, + "learning_rate": 8.564983192115934e-06, + "loss": 0.9761, + "step": 3502 + }, + { + "epoch": 0.2700431699044095, + "grad_norm": 4.214536666870117, + "learning_rate": 8.564107710422343e-06, + "loss": 1.0331, + "step": 3503 + }, + { + "epoch": 0.27012025901942643, + "grad_norm": 3.7173125743865967, + "learning_rate": 8.563232006525093e-06, + "loss": 1.0236, + "step": 3504 + }, + { + "epoch": 0.2701973481344434, + "grad_norm": 4.071146011352539, + "learning_rate": 8.562356080478781e-06, + "loss": 1.0041, + "step": 3505 + }, + { + "epoch": 0.2702744372494604, + "grad_norm": 3.6947145462036133, + "learning_rate": 8.56147993233802e-06, + "loss": 1.0646, + "step": 3506 + }, + { + "epoch": 0.27035152636447735, + "grad_norm": 3.899477243423462, + "learning_rate": 8.560603562157428e-06, + "loss": 0.9962, + "step": 3507 + }, + { + "epoch": 0.2704286154794943, + "grad_norm": 3.6149401664733887, + "learning_rate": 8.559726969991646e-06, + "loss": 0.9407, + "step": 3508 + }, + { + "epoch": 0.27050570459451123, + "grad_norm": 3.5481820106506348, + "learning_rate": 8.558850155895325e-06, + "loss": 0.9833, + "step": 3509 + }, + { + "epoch": 0.2705827937095282, + "grad_norm": 3.8647804260253906, + "learning_rate": 8.557973119923126e-06, + "loss": 0.9107, + "step": 3510 + }, + { + "epoch": 0.2706598828245452, + "grad_norm": 4.967718601226807, + "learning_rate": 8.557095862129732e-06, + "loss": 1.1314, + "step": 3511 + }, + { + "epoch": 0.27073697193956214, + "grad_norm": 3.775641918182373, + "learning_rate": 8.556218382569832e-06, + "loss": 0.9328, + "step": 3512 + }, + { + "epoch": 0.2708140610545791, + "grad_norm": 3.7387449741363525, + "learning_rate": 8.555340681298136e-06, + "loss": 1.074, + "step": 3513 + }, + { + "epoch": 0.27089115016959603, + "grad_norm": 3.9792206287384033, + "learning_rate": 8.554462758369362e-06, + "loss": 1.0373, + "step": 3514 + }, + { + "epoch": 0.270968239284613, + "grad_norm": 4.240675926208496, + "learning_rate": 8.553584613838243e-06, + "loss": 1.0737, + "step": 3515 + }, + { + "epoch": 0.27104532839962997, + "grad_norm": 3.890157699584961, + "learning_rate": 8.552706247759527e-06, + "loss": 0.9872, + "step": 3516 + }, + { + "epoch": 0.27112241751464694, + "grad_norm": 3.4992425441741943, + "learning_rate": 8.551827660187978e-06, + "loss": 0.9768, + "step": 3517 + }, + { + "epoch": 0.2711995066296639, + "grad_norm": 3.600581407546997, + "learning_rate": 8.550948851178368e-06, + "loss": 0.9799, + "step": 3518 + }, + { + "epoch": 0.2712765957446808, + "grad_norm": 4.245118141174316, + "learning_rate": 8.55006982078549e-06, + "loss": 1.1133, + "step": 3519 + }, + { + "epoch": 0.2713536848596978, + "grad_norm": 3.7988290786743164, + "learning_rate": 8.549190569064144e-06, + "loss": 0.9969, + "step": 3520 + }, + { + "epoch": 0.27143077397471477, + "grad_norm": 4.244655132293701, + "learning_rate": 8.548311096069148e-06, + "loss": 1.1576, + "step": 3521 + }, + { + "epoch": 0.27150786308973174, + "grad_norm": 3.901003360748291, + "learning_rate": 8.547431401855333e-06, + "loss": 1.0142, + "step": 3522 + }, + { + "epoch": 0.2715849522047487, + "grad_norm": 3.8016793727874756, + "learning_rate": 8.546551486477542e-06, + "loss": 1.0619, + "step": 3523 + }, + { + "epoch": 0.2716620413197656, + "grad_norm": 3.773573637008667, + "learning_rate": 8.545671349990633e-06, + "loss": 1.0336, + "step": 3524 + }, + { + "epoch": 0.2717391304347826, + "grad_norm": 3.533284902572632, + "learning_rate": 8.544790992449479e-06, + "loss": 1.1181, + "step": 3525 + }, + { + "epoch": 0.27181621954979956, + "grad_norm": 3.5303707122802734, + "learning_rate": 8.543910413908967e-06, + "loss": 0.9646, + "step": 3526 + }, + { + "epoch": 0.27189330866481654, + "grad_norm": 3.93528413772583, + "learning_rate": 8.543029614423994e-06, + "loss": 1.0863, + "step": 3527 + }, + { + "epoch": 0.2719703977798335, + "grad_norm": 3.518523931503296, + "learning_rate": 8.542148594049475e-06, + "loss": 1.0908, + "step": 3528 + }, + { + "epoch": 0.2720474868948504, + "grad_norm": 3.6154558658599854, + "learning_rate": 8.541267352840336e-06, + "loss": 1.0267, + "step": 3529 + }, + { + "epoch": 0.2721245760098674, + "grad_norm": 3.5488035678863525, + "learning_rate": 8.54038589085152e-06, + "loss": 1.0351, + "step": 3530 + }, + { + "epoch": 0.27220166512488436, + "grad_norm": 3.697274923324585, + "learning_rate": 8.539504208137977e-06, + "loss": 1.0168, + "step": 3531 + }, + { + "epoch": 0.27227875423990133, + "grad_norm": 3.5824666023254395, + "learning_rate": 8.538622304754679e-06, + "loss": 1.0174, + "step": 3532 + }, + { + "epoch": 0.2723558433549183, + "grad_norm": 3.9082562923431396, + "learning_rate": 8.537740180756608e-06, + "loss": 1.1437, + "step": 3533 + }, + { + "epoch": 0.2724329324699352, + "grad_norm": 3.7677783966064453, + "learning_rate": 8.536857836198759e-06, + "loss": 0.9499, + "step": 3534 + }, + { + "epoch": 0.2725100215849522, + "grad_norm": 3.8008973598480225, + "learning_rate": 8.535975271136142e-06, + "loss": 0.9116, + "step": 3535 + }, + { + "epoch": 0.27258711069996916, + "grad_norm": 3.5162312984466553, + "learning_rate": 8.53509248562378e-06, + "loss": 0.9755, + "step": 3536 + }, + { + "epoch": 0.27266419981498613, + "grad_norm": 3.512768030166626, + "learning_rate": 8.534209479716708e-06, + "loss": 0.9291, + "step": 3537 + }, + { + "epoch": 0.2727412889300031, + "grad_norm": 3.6765835285186768, + "learning_rate": 8.533326253469983e-06, + "loss": 0.9966, + "step": 3538 + }, + { + "epoch": 0.27281837804502, + "grad_norm": 3.5264675617218018, + "learning_rate": 8.532442806938663e-06, + "loss": 0.985, + "step": 3539 + }, + { + "epoch": 0.272895467160037, + "grad_norm": 3.555009365081787, + "learning_rate": 8.531559140177828e-06, + "loss": 1.0493, + "step": 3540 + }, + { + "epoch": 0.27297255627505396, + "grad_norm": 3.558711528778076, + "learning_rate": 8.530675253242573e-06, + "loss": 0.8952, + "step": 3541 + }, + { + "epoch": 0.2730496453900709, + "grad_norm": 3.6628801822662354, + "learning_rate": 8.529791146188003e-06, + "loss": 1.0614, + "step": 3542 + }, + { + "epoch": 0.2731267345050879, + "grad_norm": 3.692908525466919, + "learning_rate": 8.528906819069234e-06, + "loss": 1.0244, + "step": 3543 + }, + { + "epoch": 0.27320382362010487, + "grad_norm": 3.419480085372925, + "learning_rate": 8.528022271941404e-06, + "loss": 0.9742, + "step": 3544 + }, + { + "epoch": 0.2732809127351218, + "grad_norm": 3.549691915512085, + "learning_rate": 8.527137504859654e-06, + "loss": 0.9707, + "step": 3545 + }, + { + "epoch": 0.27335800185013875, + "grad_norm": 3.792904853820801, + "learning_rate": 8.52625251787915e-06, + "loss": 1.008, + "step": 3546 + }, + { + "epoch": 0.2734350909651557, + "grad_norm": 3.560368776321411, + "learning_rate": 8.525367311055063e-06, + "loss": 0.961, + "step": 3547 + }, + { + "epoch": 0.2735121800801727, + "grad_norm": 3.6131019592285156, + "learning_rate": 8.524481884442583e-06, + "loss": 0.9666, + "step": 3548 + }, + { + "epoch": 0.27358926919518967, + "grad_norm": 3.8992390632629395, + "learning_rate": 8.523596238096913e-06, + "loss": 1.0684, + "step": 3549 + }, + { + "epoch": 0.2736663583102066, + "grad_norm": 3.5087990760803223, + "learning_rate": 8.522710372073265e-06, + "loss": 0.9931, + "step": 3550 + }, + { + "epoch": 0.27374344742522355, + "grad_norm": 4.000070095062256, + "learning_rate": 8.521824286426872e-06, + "loss": 1.0817, + "step": 3551 + }, + { + "epoch": 0.2738205365402405, + "grad_norm": 3.854365825653076, + "learning_rate": 8.520937981212973e-06, + "loss": 0.9886, + "step": 3552 + }, + { + "epoch": 0.2738976256552575, + "grad_norm": 3.266382932662964, + "learning_rate": 8.520051456486827e-06, + "loss": 0.9731, + "step": 3553 + }, + { + "epoch": 0.27397471477027446, + "grad_norm": 3.62668514251709, + "learning_rate": 8.519164712303703e-06, + "loss": 1.0892, + "step": 3554 + }, + { + "epoch": 0.2740518038852914, + "grad_norm": 3.941357135772705, + "learning_rate": 8.518277748718887e-06, + "loss": 1.0447, + "step": 3555 + }, + { + "epoch": 0.27412889300030835, + "grad_norm": 3.726266860961914, + "learning_rate": 8.517390565787672e-06, + "loss": 1.0465, + "step": 3556 + }, + { + "epoch": 0.2742059821153253, + "grad_norm": 3.8830811977386475, + "learning_rate": 8.516503163565374e-06, + "loss": 1.0176, + "step": 3557 + }, + { + "epoch": 0.2742830712303423, + "grad_norm": 3.832172393798828, + "learning_rate": 8.515615542107317e-06, + "loss": 1.066, + "step": 3558 + }, + { + "epoch": 0.27436016034535926, + "grad_norm": 3.768930196762085, + "learning_rate": 8.514727701468837e-06, + "loss": 1.2196, + "step": 3559 + }, + { + "epoch": 0.2744372494603762, + "grad_norm": 3.7359375953674316, + "learning_rate": 8.513839641705288e-06, + "loss": 1.0866, + "step": 3560 + }, + { + "epoch": 0.27451433857539315, + "grad_norm": 3.686213970184326, + "learning_rate": 8.512951362872037e-06, + "loss": 1.072, + "step": 3561 + }, + { + "epoch": 0.2745914276904101, + "grad_norm": 3.735572338104248, + "learning_rate": 8.512062865024463e-06, + "loss": 0.9229, + "step": 3562 + }, + { + "epoch": 0.2746685168054271, + "grad_norm": 3.594026803970337, + "learning_rate": 8.511174148217958e-06, + "loss": 1.0068, + "step": 3563 + }, + { + "epoch": 0.27474560592044406, + "grad_norm": 3.5880775451660156, + "learning_rate": 8.51028521250793e-06, + "loss": 1.0385, + "step": 3564 + }, + { + "epoch": 0.274822695035461, + "grad_norm": 3.8719637393951416, + "learning_rate": 8.509396057949799e-06, + "loss": 1.0011, + "step": 3565 + }, + { + "epoch": 0.27489978415047794, + "grad_norm": 3.555837631225586, + "learning_rate": 8.508506684598999e-06, + "loss": 0.9061, + "step": 3566 + }, + { + "epoch": 0.2749768732654949, + "grad_norm": 3.370540142059326, + "learning_rate": 8.50761709251098e-06, + "loss": 0.9201, + "step": 3567 + }, + { + "epoch": 0.2750539623805119, + "grad_norm": 3.50242280960083, + "learning_rate": 8.5067272817412e-06, + "loss": 0.9497, + "step": 3568 + }, + { + "epoch": 0.27513105149552886, + "grad_norm": 3.7081899642944336, + "learning_rate": 8.505837252345135e-06, + "loss": 0.9182, + "step": 3569 + }, + { + "epoch": 0.27520814061054577, + "grad_norm": 3.7657527923583984, + "learning_rate": 8.504947004378277e-06, + "loss": 1.0016, + "step": 3570 + }, + { + "epoch": 0.27528522972556274, + "grad_norm": 3.5126376152038574, + "learning_rate": 8.504056537896123e-06, + "loss": 0.93, + "step": 3571 + }, + { + "epoch": 0.2753623188405797, + "grad_norm": 3.6629865169525146, + "learning_rate": 8.503165852954193e-06, + "loss": 1.0295, + "step": 3572 + }, + { + "epoch": 0.2754394079555967, + "grad_norm": 3.717280149459839, + "learning_rate": 8.502274949608018e-06, + "loss": 1.0295, + "step": 3573 + }, + { + "epoch": 0.27551649707061365, + "grad_norm": 4.10526180267334, + "learning_rate": 8.501383827913137e-06, + "loss": 1.1103, + "step": 3574 + }, + { + "epoch": 0.27559358618563057, + "grad_norm": 3.4711949825286865, + "learning_rate": 8.500492487925107e-06, + "loss": 0.9718, + "step": 3575 + }, + { + "epoch": 0.27567067530064754, + "grad_norm": 3.5179035663604736, + "learning_rate": 8.499600929699501e-06, + "loss": 1.0066, + "step": 3576 + }, + { + "epoch": 0.2757477644156645, + "grad_norm": 3.881817579269409, + "learning_rate": 8.498709153291901e-06, + "loss": 1.0456, + "step": 3577 + }, + { + "epoch": 0.2758248535306815, + "grad_norm": 4.031117916107178, + "learning_rate": 8.497817158757906e-06, + "loss": 1.0358, + "step": 3578 + }, + { + "epoch": 0.27590194264569845, + "grad_norm": 4.566905498504639, + "learning_rate": 8.49692494615313e-06, + "loss": 1.0547, + "step": 3579 + }, + { + "epoch": 0.27597903176071537, + "grad_norm": 3.682335138320923, + "learning_rate": 8.49603251553319e-06, + "loss": 1.0357, + "step": 3580 + }, + { + "epoch": 0.27605612087573234, + "grad_norm": 3.5989487171173096, + "learning_rate": 8.495139866953732e-06, + "loss": 0.8995, + "step": 3581 + }, + { + "epoch": 0.2761332099907493, + "grad_norm": 3.775329351425171, + "learning_rate": 8.494247000470404e-06, + "loss": 1.0149, + "step": 3582 + }, + { + "epoch": 0.2762102991057663, + "grad_norm": 3.7653584480285645, + "learning_rate": 8.493353916138873e-06, + "loss": 1.0037, + "step": 3583 + }, + { + "epoch": 0.27628738822078325, + "grad_norm": 4.354711532592773, + "learning_rate": 8.492460614014816e-06, + "loss": 1.0801, + "step": 3584 + }, + { + "epoch": 0.27636447733580016, + "grad_norm": 3.7385101318359375, + "learning_rate": 8.49156709415393e-06, + "loss": 0.9805, + "step": 3585 + }, + { + "epoch": 0.27644156645081713, + "grad_norm": 3.78240704536438, + "learning_rate": 8.490673356611919e-06, + "loss": 1.0614, + "step": 3586 + }, + { + "epoch": 0.2765186555658341, + "grad_norm": 4.034103870391846, + "learning_rate": 8.489779401444503e-06, + "loss": 1.0574, + "step": 3587 + }, + { + "epoch": 0.2765957446808511, + "grad_norm": 3.6883256435394287, + "learning_rate": 8.488885228707413e-06, + "loss": 1.165, + "step": 3588 + }, + { + "epoch": 0.27667283379586805, + "grad_norm": 3.4828081130981445, + "learning_rate": 8.4879908384564e-06, + "loss": 0.924, + "step": 3589 + }, + { + "epoch": 0.27674992291088496, + "grad_norm": 3.475923776626587, + "learning_rate": 8.487096230747223e-06, + "loss": 0.9768, + "step": 3590 + }, + { + "epoch": 0.27682701202590193, + "grad_norm": 3.595651626586914, + "learning_rate": 8.486201405635655e-06, + "loss": 0.8966, + "step": 3591 + }, + { + "epoch": 0.2769041011409189, + "grad_norm": 4.367059707641602, + "learning_rate": 8.485306363177485e-06, + "loss": 0.9618, + "step": 3592 + }, + { + "epoch": 0.27698119025593587, + "grad_norm": 3.2522506713867188, + "learning_rate": 8.484411103428516e-06, + "loss": 0.9475, + "step": 3593 + }, + { + "epoch": 0.27705827937095284, + "grad_norm": 3.634666919708252, + "learning_rate": 8.48351562644456e-06, + "loss": 1.0517, + "step": 3594 + }, + { + "epoch": 0.27713536848596976, + "grad_norm": 3.7717621326446533, + "learning_rate": 8.482619932281446e-06, + "loss": 1.017, + "step": 3595 + }, + { + "epoch": 0.27721245760098673, + "grad_norm": 3.5509636402130127, + "learning_rate": 8.481724020995017e-06, + "loss": 0.9433, + "step": 3596 + }, + { + "epoch": 0.2772895467160037, + "grad_norm": 3.6704728603363037, + "learning_rate": 8.480827892641125e-06, + "loss": 0.991, + "step": 3597 + }, + { + "epoch": 0.27736663583102067, + "grad_norm": 3.710653305053711, + "learning_rate": 8.479931547275644e-06, + "loss": 1.1041, + "step": 3598 + }, + { + "epoch": 0.27744372494603764, + "grad_norm": 3.6124982833862305, + "learning_rate": 8.479034984954454e-06, + "loss": 0.9931, + "step": 3599 + }, + { + "epoch": 0.27752081406105455, + "grad_norm": 3.4628562927246094, + "learning_rate": 8.47813820573345e-06, + "loss": 1.1006, + "step": 3600 + }, + { + "epoch": 0.2775979031760715, + "grad_norm": 3.8903744220733643, + "learning_rate": 8.477241209668543e-06, + "loss": 1.072, + "step": 3601 + }, + { + "epoch": 0.2776749922910885, + "grad_norm": 3.902669906616211, + "learning_rate": 8.476343996815657e-06, + "loss": 0.9923, + "step": 3602 + }, + { + "epoch": 0.27775208140610547, + "grad_norm": 3.763310670852661, + "learning_rate": 8.475446567230727e-06, + "loss": 0.9965, + "step": 3603 + }, + { + "epoch": 0.27782917052112244, + "grad_norm": 3.7086451053619385, + "learning_rate": 8.4745489209697e-06, + "loss": 1.0033, + "step": 3604 + }, + { + "epoch": 0.27790625963613935, + "grad_norm": 3.9299726486206055, + "learning_rate": 8.473651058088548e-06, + "loss": 1.1179, + "step": 3605 + }, + { + "epoch": 0.2779833487511563, + "grad_norm": 3.552938222885132, + "learning_rate": 8.47275297864324e-06, + "loss": 1.0242, + "step": 3606 + }, + { + "epoch": 0.2780604378661733, + "grad_norm": 3.6222376823425293, + "learning_rate": 8.47185468268977e-06, + "loss": 1.0164, + "step": 3607 + }, + { + "epoch": 0.27813752698119026, + "grad_norm": 4.27018928527832, + "learning_rate": 8.470956170284141e-06, + "loss": 1.0589, + "step": 3608 + }, + { + "epoch": 0.27821461609620723, + "grad_norm": 3.718721628189087, + "learning_rate": 8.470057441482374e-06, + "loss": 0.9983, + "step": 3609 + }, + { + "epoch": 0.27829170521122415, + "grad_norm": 3.726409673690796, + "learning_rate": 8.469158496340496e-06, + "loss": 1.028, + "step": 3610 + }, + { + "epoch": 0.2783687943262411, + "grad_norm": 3.4664266109466553, + "learning_rate": 8.46825933491455e-06, + "loss": 0.9796, + "step": 3611 + }, + { + "epoch": 0.2784458834412581, + "grad_norm": 3.7428462505340576, + "learning_rate": 8.4673599572606e-06, + "loss": 1.0337, + "step": 3612 + }, + { + "epoch": 0.27852297255627506, + "grad_norm": 3.687169313430786, + "learning_rate": 8.466460363434714e-06, + "loss": 1.0132, + "step": 3613 + }, + { + "epoch": 0.27860006167129203, + "grad_norm": 4.0733113288879395, + "learning_rate": 8.465560553492978e-06, + "loss": 0.9581, + "step": 3614 + }, + { + "epoch": 0.27867715078630895, + "grad_norm": 3.739532709121704, + "learning_rate": 8.46466052749149e-06, + "loss": 0.9367, + "step": 3615 + }, + { + "epoch": 0.2787542399013259, + "grad_norm": 3.9746146202087402, + "learning_rate": 8.463760285486362e-06, + "loss": 0.9157, + "step": 3616 + }, + { + "epoch": 0.2788313290163429, + "grad_norm": 3.625314235687256, + "learning_rate": 8.462859827533718e-06, + "loss": 0.9241, + "step": 3617 + }, + { + "epoch": 0.27890841813135986, + "grad_norm": 3.546565532684326, + "learning_rate": 8.4619591536897e-06, + "loss": 0.9287, + "step": 3618 + }, + { + "epoch": 0.27898550724637683, + "grad_norm": 3.9717471599578857, + "learning_rate": 8.46105826401046e-06, + "loss": 1.0355, + "step": 3619 + }, + { + "epoch": 0.27906259636139374, + "grad_norm": 3.866204261779785, + "learning_rate": 8.46015715855216e-06, + "loss": 1.0423, + "step": 3620 + }, + { + "epoch": 0.2791396854764107, + "grad_norm": 3.634885549545288, + "learning_rate": 8.459255837370984e-06, + "loss": 1.0229, + "step": 3621 + }, + { + "epoch": 0.2792167745914277, + "grad_norm": 3.85805344581604, + "learning_rate": 8.45835430052312e-06, + "loss": 0.9998, + "step": 3622 + }, + { + "epoch": 0.27929386370644466, + "grad_norm": 3.9002537727355957, + "learning_rate": 8.457452548064778e-06, + "loss": 0.9979, + "step": 3623 + }, + { + "epoch": 0.2793709528214616, + "grad_norm": 3.3640360832214355, + "learning_rate": 8.456550580052177e-06, + "loss": 0.8977, + "step": 3624 + }, + { + "epoch": 0.27944804193647854, + "grad_norm": 3.474541664123535, + "learning_rate": 8.455648396541548e-06, + "loss": 1.0045, + "step": 3625 + }, + { + "epoch": 0.2795251310514955, + "grad_norm": 3.5307717323303223, + "learning_rate": 8.454745997589139e-06, + "loss": 0.9458, + "step": 3626 + }, + { + "epoch": 0.2796022201665125, + "grad_norm": 3.8024139404296875, + "learning_rate": 8.45384338325121e-06, + "loss": 1.0375, + "step": 3627 + }, + { + "epoch": 0.27967930928152945, + "grad_norm": 3.663609027862549, + "learning_rate": 8.452940553584032e-06, + "loss": 0.8606, + "step": 3628 + }, + { + "epoch": 0.2797563983965464, + "grad_norm": 3.6172802448272705, + "learning_rate": 8.452037508643897e-06, + "loss": 0.9782, + "step": 3629 + }, + { + "epoch": 0.2798334875115634, + "grad_norm": 3.8547043800354004, + "learning_rate": 8.4511342484871e-06, + "loss": 0.9364, + "step": 3630 + }, + { + "epoch": 0.2799105766265803, + "grad_norm": 3.862354040145874, + "learning_rate": 8.450230773169958e-06, + "loss": 1.0593, + "step": 3631 + }, + { + "epoch": 0.2799876657415973, + "grad_norm": 3.5931904315948486, + "learning_rate": 8.449327082748794e-06, + "loss": 1.0177, + "step": 3632 + }, + { + "epoch": 0.28006475485661425, + "grad_norm": 3.79412579536438, + "learning_rate": 8.448423177279954e-06, + "loss": 1.0407, + "step": 3633 + }, + { + "epoch": 0.2801418439716312, + "grad_norm": 3.9364240169525146, + "learning_rate": 8.447519056819787e-06, + "loss": 1.0459, + "step": 3634 + }, + { + "epoch": 0.2802189330866482, + "grad_norm": 3.5306715965270996, + "learning_rate": 8.446614721424661e-06, + "loss": 1.0045, + "step": 3635 + }, + { + "epoch": 0.2802960222016651, + "grad_norm": 4.1420111656188965, + "learning_rate": 8.445710171150957e-06, + "loss": 1.1036, + "step": 3636 + }, + { + "epoch": 0.2803731113166821, + "grad_norm": 3.604893684387207, + "learning_rate": 8.444805406055072e-06, + "loss": 0.9265, + "step": 3637 + }, + { + "epoch": 0.28045020043169905, + "grad_norm": 3.910055637359619, + "learning_rate": 8.44390042619341e-06, + "loss": 1.0674, + "step": 3638 + }, + { + "epoch": 0.280527289546716, + "grad_norm": 4.729837894439697, + "learning_rate": 8.442995231622393e-06, + "loss": 1.0476, + "step": 3639 + }, + { + "epoch": 0.280604378661733, + "grad_norm": 3.926619529724121, + "learning_rate": 8.442089822398456e-06, + "loss": 0.9571, + "step": 3640 + }, + { + "epoch": 0.2806814677767499, + "grad_norm": 3.6743786334991455, + "learning_rate": 8.441184198578044e-06, + "loss": 1.0757, + "step": 3641 + }, + { + "epoch": 0.2807585568917669, + "grad_norm": 3.5217857360839844, + "learning_rate": 8.44027836021762e-06, + "loss": 1.0539, + "step": 3642 + }, + { + "epoch": 0.28083564600678385, + "grad_norm": 3.6347432136535645, + "learning_rate": 8.439372307373658e-06, + "loss": 0.9301, + "step": 3643 + }, + { + "epoch": 0.2809127351218008, + "grad_norm": 4.677338123321533, + "learning_rate": 8.438466040102647e-06, + "loss": 0.9828, + "step": 3644 + }, + { + "epoch": 0.2809898242368178, + "grad_norm": 3.5795114040374756, + "learning_rate": 8.437559558461085e-06, + "loss": 0.8683, + "step": 3645 + }, + { + "epoch": 0.2810669133518347, + "grad_norm": 3.54679799079895, + "learning_rate": 8.436652862505488e-06, + "loss": 1.0562, + "step": 3646 + }, + { + "epoch": 0.2811440024668517, + "grad_norm": 4.047598838806152, + "learning_rate": 8.435745952292384e-06, + "loss": 0.981, + "step": 3647 + }, + { + "epoch": 0.28122109158186864, + "grad_norm": 3.662014961242676, + "learning_rate": 8.434838827878315e-06, + "loss": 0.9972, + "step": 3648 + }, + { + "epoch": 0.2812981806968856, + "grad_norm": 3.410169839859009, + "learning_rate": 8.433931489319835e-06, + "loss": 0.9462, + "step": 3649 + }, + { + "epoch": 0.2813752698119026, + "grad_norm": 3.725191354751587, + "learning_rate": 8.43302393667351e-06, + "loss": 0.9657, + "step": 3650 + }, + { + "epoch": 0.2814523589269195, + "grad_norm": 3.510698080062866, + "learning_rate": 8.432116169995923e-06, + "loss": 1.0496, + "step": 3651 + }, + { + "epoch": 0.28152944804193647, + "grad_norm": 3.6888866424560547, + "learning_rate": 8.43120818934367e-06, + "loss": 0.9789, + "step": 3652 + }, + { + "epoch": 0.28160653715695344, + "grad_norm": 3.4661200046539307, + "learning_rate": 8.430299994773354e-06, + "loss": 0.9091, + "step": 3653 + }, + { + "epoch": 0.2816836262719704, + "grad_norm": 3.9719278812408447, + "learning_rate": 8.429391586341602e-06, + "loss": 1.0466, + "step": 3654 + }, + { + "epoch": 0.2817607153869874, + "grad_norm": 3.836193323135376, + "learning_rate": 8.428482964105043e-06, + "loss": 1.0741, + "step": 3655 + }, + { + "epoch": 0.2818378045020043, + "grad_norm": 3.6839404106140137, + "learning_rate": 8.427574128120331e-06, + "loss": 0.9394, + "step": 3656 + }, + { + "epoch": 0.28191489361702127, + "grad_norm": 3.4464738368988037, + "learning_rate": 8.42666507844412e-06, + "loss": 1.0155, + "step": 3657 + }, + { + "epoch": 0.28199198273203824, + "grad_norm": 4.588211536407471, + "learning_rate": 8.425755815133092e-06, + "loss": 1.1744, + "step": 3658 + }, + { + "epoch": 0.2820690718470552, + "grad_norm": 4.032186508178711, + "learning_rate": 8.42484633824393e-06, + "loss": 1.0072, + "step": 3659 + }, + { + "epoch": 0.2821461609620722, + "grad_norm": 3.7464022636413574, + "learning_rate": 8.423936647833337e-06, + "loss": 0.9378, + "step": 3660 + }, + { + "epoch": 0.2822232500770891, + "grad_norm": 3.7981278896331787, + "learning_rate": 8.423026743958028e-06, + "loss": 1.0546, + "step": 3661 + }, + { + "epoch": 0.28230033919210606, + "grad_norm": 4.554448127746582, + "learning_rate": 8.422116626674728e-06, + "loss": 1.037, + "step": 3662 + }, + { + "epoch": 0.28237742830712304, + "grad_norm": 3.7701575756073, + "learning_rate": 8.42120629604018e-06, + "loss": 1.0272, + "step": 3663 + }, + { + "epoch": 0.28245451742214, + "grad_norm": 3.8832554817199707, + "learning_rate": 8.420295752111138e-06, + "loss": 1.0422, + "step": 3664 + }, + { + "epoch": 0.282531606537157, + "grad_norm": 4.044317245483398, + "learning_rate": 8.41938499494437e-06, + "loss": 1.0457, + "step": 3665 + }, + { + "epoch": 0.2826086956521739, + "grad_norm": 3.7039005756378174, + "learning_rate": 8.418474024596659e-06, + "loss": 0.9673, + "step": 3666 + }, + { + "epoch": 0.28268578476719086, + "grad_norm": 3.4527359008789062, + "learning_rate": 8.417562841124797e-06, + "loss": 0.9998, + "step": 3667 + }, + { + "epoch": 0.28276287388220783, + "grad_norm": 4.247575759887695, + "learning_rate": 8.416651444585591e-06, + "loss": 1.0611, + "step": 3668 + }, + { + "epoch": 0.2828399629972248, + "grad_norm": 3.4777634143829346, + "learning_rate": 8.415739835035864e-06, + "loss": 0.9773, + "step": 3669 + }, + { + "epoch": 0.2829170521122418, + "grad_norm": 4.208752155303955, + "learning_rate": 8.414828012532446e-06, + "loss": 0.9631, + "step": 3670 + }, + { + "epoch": 0.2829941412272587, + "grad_norm": 3.4110782146453857, + "learning_rate": 8.41391597713219e-06, + "loss": 1.0187, + "step": 3671 + }, + { + "epoch": 0.28307123034227566, + "grad_norm": 3.7919259071350098, + "learning_rate": 8.413003728891953e-06, + "loss": 1.1607, + "step": 3672 + }, + { + "epoch": 0.28314831945729263, + "grad_norm": 3.560163736343384, + "learning_rate": 8.41209126786861e-06, + "loss": 0.9773, + "step": 3673 + }, + { + "epoch": 0.2832254085723096, + "grad_norm": 3.5800511837005615, + "learning_rate": 8.411178594119046e-06, + "loss": 0.8893, + "step": 3674 + }, + { + "epoch": 0.28330249768732657, + "grad_norm": 4.5871901512146, + "learning_rate": 8.410265707700167e-06, + "loss": 1.1218, + "step": 3675 + }, + { + "epoch": 0.2833795868023435, + "grad_norm": 3.5251963138580322, + "learning_rate": 8.409352608668882e-06, + "loss": 1.1023, + "step": 3676 + }, + { + "epoch": 0.28345667591736046, + "grad_norm": 3.715285539627075, + "learning_rate": 8.408439297082118e-06, + "loss": 0.9962, + "step": 3677 + }, + { + "epoch": 0.2835337650323774, + "grad_norm": 3.63982892036438, + "learning_rate": 8.407525772996818e-06, + "loss": 1.043, + "step": 3678 + }, + { + "epoch": 0.2836108541473944, + "grad_norm": 4.022475242614746, + "learning_rate": 8.406612036469935e-06, + "loss": 1.2085, + "step": 3679 + }, + { + "epoch": 0.28368794326241137, + "grad_norm": 3.487618923187256, + "learning_rate": 8.405698087558432e-06, + "loss": 1.0158, + "step": 3680 + }, + { + "epoch": 0.2837650323774283, + "grad_norm": 3.937352180480957, + "learning_rate": 8.404783926319295e-06, + "loss": 1.0598, + "step": 3681 + }, + { + "epoch": 0.28384212149244525, + "grad_norm": 3.6437175273895264, + "learning_rate": 8.403869552809512e-06, + "loss": 0.9678, + "step": 3682 + }, + { + "epoch": 0.2839192106074622, + "grad_norm": 4.383546829223633, + "learning_rate": 8.402954967086093e-06, + "loss": 1.051, + "step": 3683 + }, + { + "epoch": 0.2839962997224792, + "grad_norm": 4.169625282287598, + "learning_rate": 8.402040169206054e-06, + "loss": 1.0238, + "step": 3684 + }, + { + "epoch": 0.28407338883749617, + "grad_norm": 3.8336832523345947, + "learning_rate": 8.40112515922643e-06, + "loss": 0.9547, + "step": 3685 + }, + { + "epoch": 0.2841504779525131, + "grad_norm": 3.8428425788879395, + "learning_rate": 8.40020993720427e-06, + "loss": 0.9351, + "step": 3686 + }, + { + "epoch": 0.28422756706753005, + "grad_norm": 3.674729108810425, + "learning_rate": 8.399294503196629e-06, + "loss": 1.0025, + "step": 3687 + }, + { + "epoch": 0.284304656182547, + "grad_norm": 3.847797155380249, + "learning_rate": 8.398378857260581e-06, + "loss": 1.0309, + "step": 3688 + }, + { + "epoch": 0.284381745297564, + "grad_norm": 3.6674485206604004, + "learning_rate": 8.397462999453212e-06, + "loss": 0.9968, + "step": 3689 + }, + { + "epoch": 0.28445883441258096, + "grad_norm": 4.155699253082275, + "learning_rate": 8.39654692983162e-06, + "loss": 1.1423, + "step": 3690 + }, + { + "epoch": 0.2845359235275979, + "grad_norm": 3.99698543548584, + "learning_rate": 8.395630648452919e-06, + "loss": 1.0867, + "step": 3691 + }, + { + "epoch": 0.28461301264261485, + "grad_norm": 3.8566694259643555, + "learning_rate": 8.394714155374234e-06, + "loss": 0.9227, + "step": 3692 + }, + { + "epoch": 0.2846901017576318, + "grad_norm": 3.5863726139068604, + "learning_rate": 8.393797450652701e-06, + "loss": 0.9605, + "step": 3693 + }, + { + "epoch": 0.2847671908726488, + "grad_norm": 3.88511323928833, + "learning_rate": 8.392880534345477e-06, + "loss": 0.9894, + "step": 3694 + }, + { + "epoch": 0.28484427998766576, + "grad_norm": 3.3645994663238525, + "learning_rate": 8.391963406509721e-06, + "loss": 1.0001, + "step": 3695 + }, + { + "epoch": 0.2849213691026827, + "grad_norm": 3.5766923427581787, + "learning_rate": 8.391046067202617e-06, + "loss": 0.9889, + "step": 3696 + }, + { + "epoch": 0.28499845821769965, + "grad_norm": 4.069863796234131, + "learning_rate": 8.390128516481351e-06, + "loss": 0.9495, + "step": 3697 + }, + { + "epoch": 0.2850755473327166, + "grad_norm": 3.9088964462280273, + "learning_rate": 8.389210754403132e-06, + "loss": 1.0693, + "step": 3698 + }, + { + "epoch": 0.2851526364477336, + "grad_norm": 4.604135990142822, + "learning_rate": 8.388292781025173e-06, + "loss": 1.1569, + "step": 3699 + }, + { + "epoch": 0.28522972556275056, + "grad_norm": 3.65946364402771, + "learning_rate": 8.38737459640471e-06, + "loss": 0.9248, + "step": 3700 + }, + { + "epoch": 0.2853068146777675, + "grad_norm": 3.6877028942108154, + "learning_rate": 8.386456200598982e-06, + "loss": 1.1605, + "step": 3701 + }, + { + "epoch": 0.28538390379278444, + "grad_norm": 3.4884543418884277, + "learning_rate": 8.38553759366525e-06, + "loss": 1.0238, + "step": 3702 + }, + { + "epoch": 0.2854609929078014, + "grad_norm": 3.4332854747772217, + "learning_rate": 8.384618775660784e-06, + "loss": 1.0155, + "step": 3703 + }, + { + "epoch": 0.2855380820228184, + "grad_norm": 4.001807689666748, + "learning_rate": 8.383699746642866e-06, + "loss": 1.1088, + "step": 3704 + }, + { + "epoch": 0.28561517113783536, + "grad_norm": 3.441525459289551, + "learning_rate": 8.382780506668792e-06, + "loss": 0.9096, + "step": 3705 + }, + { + "epoch": 0.28569226025285227, + "grad_norm": 3.314866542816162, + "learning_rate": 8.381861055795876e-06, + "loss": 1.0886, + "step": 3706 + }, + { + "epoch": 0.28576934936786924, + "grad_norm": 4.307245254516602, + "learning_rate": 8.380941394081437e-06, + "loss": 1.0525, + "step": 3707 + }, + { + "epoch": 0.2858464384828862, + "grad_norm": 3.5689730644226074, + "learning_rate": 8.380021521582813e-06, + "loss": 1.0542, + "step": 3708 + }, + { + "epoch": 0.2859235275979032, + "grad_norm": 3.881000518798828, + "learning_rate": 8.379101438357352e-06, + "loss": 0.9946, + "step": 3709 + }, + { + "epoch": 0.28600061671292015, + "grad_norm": 4.110035419464111, + "learning_rate": 8.378181144462418e-06, + "loss": 1.0302, + "step": 3710 + }, + { + "epoch": 0.28607770582793707, + "grad_norm": 3.9843645095825195, + "learning_rate": 8.377260639955385e-06, + "loss": 1.026, + "step": 3711 + }, + { + "epoch": 0.28615479494295404, + "grad_norm": 3.5178589820861816, + "learning_rate": 8.376339924893642e-06, + "loss": 0.9419, + "step": 3712 + }, + { + "epoch": 0.286231884057971, + "grad_norm": 3.800537109375, + "learning_rate": 8.375418999334591e-06, + "loss": 0.9474, + "step": 3713 + }, + { + "epoch": 0.286308973172988, + "grad_norm": 3.798588991165161, + "learning_rate": 8.374497863335649e-06, + "loss": 1.0826, + "step": 3714 + }, + { + "epoch": 0.28638606228800495, + "grad_norm": 3.9119489192962646, + "learning_rate": 8.37357651695424e-06, + "loss": 1.0957, + "step": 3715 + }, + { + "epoch": 0.2864631514030219, + "grad_norm": 3.5828166007995605, + "learning_rate": 8.37265496024781e-06, + "loss": 0.951, + "step": 3716 + }, + { + "epoch": 0.28654024051803884, + "grad_norm": 3.7555758953094482, + "learning_rate": 8.371733193273808e-06, + "loss": 0.9873, + "step": 3717 + }, + { + "epoch": 0.2866173296330558, + "grad_norm": 3.638547897338867, + "learning_rate": 8.370811216089705e-06, + "loss": 1.1217, + "step": 3718 + }, + { + "epoch": 0.2866944187480728, + "grad_norm": 3.6661555767059326, + "learning_rate": 8.36988902875298e-06, + "loss": 1.0319, + "step": 3719 + }, + { + "epoch": 0.28677150786308975, + "grad_norm": 3.355048179626465, + "learning_rate": 8.36896663132113e-06, + "loss": 0.9512, + "step": 3720 + }, + { + "epoch": 0.2868485969781067, + "grad_norm": 3.8450980186462402, + "learning_rate": 8.368044023851656e-06, + "loss": 1.0523, + "step": 3721 + }, + { + "epoch": 0.28692568609312363, + "grad_norm": 3.896583318710327, + "learning_rate": 8.36712120640208e-06, + "loss": 1.0813, + "step": 3722 + }, + { + "epoch": 0.2870027752081406, + "grad_norm": 3.890866279602051, + "learning_rate": 8.366198179029937e-06, + "loss": 1.0645, + "step": 3723 + }, + { + "epoch": 0.2870798643231576, + "grad_norm": 3.7487337589263916, + "learning_rate": 8.365274941792771e-06, + "loss": 0.9914, + "step": 3724 + }, + { + "epoch": 0.28715695343817454, + "grad_norm": 3.695530652999878, + "learning_rate": 8.364351494748141e-06, + "loss": 1.011, + "step": 3725 + }, + { + "epoch": 0.2872340425531915, + "grad_norm": 3.758422613143921, + "learning_rate": 8.363427837953622e-06, + "loss": 1.0366, + "step": 3726 + }, + { + "epoch": 0.28731113166820843, + "grad_norm": 4.224013328552246, + "learning_rate": 8.362503971466795e-06, + "loss": 1.0637, + "step": 3727 + }, + { + "epoch": 0.2873882207832254, + "grad_norm": 4.0081305503845215, + "learning_rate": 8.361579895345263e-06, + "loss": 1.133, + "step": 3728 + }, + { + "epoch": 0.28746530989824237, + "grad_norm": 3.7337818145751953, + "learning_rate": 8.36065560964663e-06, + "loss": 0.9504, + "step": 3729 + }, + { + "epoch": 0.28754239901325934, + "grad_norm": 3.62839412689209, + "learning_rate": 8.359731114428529e-06, + "loss": 1.093, + "step": 3730 + }, + { + "epoch": 0.2876194881282763, + "grad_norm": 3.5839309692382812, + "learning_rate": 8.358806409748592e-06, + "loss": 1.0897, + "step": 3731 + }, + { + "epoch": 0.28769657724329323, + "grad_norm": 3.644613265991211, + "learning_rate": 8.35788149566447e-06, + "loss": 1.0344, + "step": 3732 + }, + { + "epoch": 0.2877736663583102, + "grad_norm": 3.906480073928833, + "learning_rate": 8.356956372233829e-06, + "loss": 1.092, + "step": 3733 + }, + { + "epoch": 0.28785075547332717, + "grad_norm": 3.4734041690826416, + "learning_rate": 8.356031039514344e-06, + "loss": 1.0349, + "step": 3734 + }, + { + "epoch": 0.28792784458834414, + "grad_norm": 3.4001238346099854, + "learning_rate": 8.355105497563705e-06, + "loss": 0.9855, + "step": 3735 + }, + { + "epoch": 0.2880049337033611, + "grad_norm": 3.7522032260894775, + "learning_rate": 8.354179746439614e-06, + "loss": 1.0141, + "step": 3736 + }, + { + "epoch": 0.288082022818378, + "grad_norm": 3.7555811405181885, + "learning_rate": 8.353253786199788e-06, + "loss": 1.0243, + "step": 3737 + }, + { + "epoch": 0.288159111933395, + "grad_norm": 3.771195411682129, + "learning_rate": 8.352327616901956e-06, + "loss": 0.9578, + "step": 3738 + }, + { + "epoch": 0.28823620104841197, + "grad_norm": 3.630093574523926, + "learning_rate": 8.35140123860386e-06, + "loss": 1.0626, + "step": 3739 + }, + { + "epoch": 0.28831329016342894, + "grad_norm": 3.6648457050323486, + "learning_rate": 8.350474651363254e-06, + "loss": 1.1257, + "step": 3740 + }, + { + "epoch": 0.2883903792784459, + "grad_norm": 3.7269270420074463, + "learning_rate": 8.349547855237904e-06, + "loss": 1.0953, + "step": 3741 + }, + { + "epoch": 0.2884674683934628, + "grad_norm": 3.8183059692382812, + "learning_rate": 8.348620850285594e-06, + "loss": 1.1094, + "step": 3742 + }, + { + "epoch": 0.2885445575084798, + "grad_norm": 3.764166831970215, + "learning_rate": 8.347693636564119e-06, + "loss": 0.9972, + "step": 3743 + }, + { + "epoch": 0.28862164662349676, + "grad_norm": 3.2361299991607666, + "learning_rate": 8.34676621413128e-06, + "loss": 0.8723, + "step": 3744 + }, + { + "epoch": 0.28869873573851373, + "grad_norm": 4.178921699523926, + "learning_rate": 8.345838583044903e-06, + "loss": 0.8852, + "step": 3745 + }, + { + "epoch": 0.2887758248535307, + "grad_norm": 4.049960136413574, + "learning_rate": 8.344910743362819e-06, + "loss": 1.1337, + "step": 3746 + }, + { + "epoch": 0.2888529139685476, + "grad_norm": 4.307690143585205, + "learning_rate": 8.343982695142873e-06, + "loss": 1.0333, + "step": 3747 + }, + { + "epoch": 0.2889300030835646, + "grad_norm": 3.709958791732788, + "learning_rate": 8.343054438442926e-06, + "loss": 1.0488, + "step": 3748 + }, + { + "epoch": 0.28900709219858156, + "grad_norm": 3.61205792427063, + "learning_rate": 8.342125973320848e-06, + "loss": 0.9665, + "step": 3749 + }, + { + "epoch": 0.28908418131359853, + "grad_norm": 3.7218470573425293, + "learning_rate": 8.341197299834524e-06, + "loss": 1.0409, + "step": 3750 + }, + { + "epoch": 0.2891612704286155, + "grad_norm": 3.4968018531799316, + "learning_rate": 8.340268418041855e-06, + "loss": 0.9222, + "step": 3751 + }, + { + "epoch": 0.2892383595436324, + "grad_norm": 3.451004981994629, + "learning_rate": 8.339339328000749e-06, + "loss": 1.0035, + "step": 3752 + }, + { + "epoch": 0.2893154486586494, + "grad_norm": 3.4331042766571045, + "learning_rate": 8.338410029769133e-06, + "loss": 0.9606, + "step": 3753 + }, + { + "epoch": 0.28939253777366636, + "grad_norm": 3.4850504398345947, + "learning_rate": 8.337480523404938e-06, + "loss": 0.9549, + "step": 3754 + }, + { + "epoch": 0.28946962688868333, + "grad_norm": 3.5593273639678955, + "learning_rate": 8.33655080896612e-06, + "loss": 1.0124, + "step": 3755 + }, + { + "epoch": 0.2895467160037003, + "grad_norm": 3.5251691341400146, + "learning_rate": 8.335620886510637e-06, + "loss": 0.9167, + "step": 3756 + }, + { + "epoch": 0.2896238051187172, + "grad_norm": 3.627453088760376, + "learning_rate": 8.33469075609647e-06, + "loss": 0.9891, + "step": 3757 + }, + { + "epoch": 0.2897008942337342, + "grad_norm": 3.599273204803467, + "learning_rate": 8.333760417781605e-06, + "loss": 1.0972, + "step": 3758 + }, + { + "epoch": 0.28977798334875116, + "grad_norm": 3.6854488849639893, + "learning_rate": 8.332829871624042e-06, + "loss": 1.1368, + "step": 3759 + }, + { + "epoch": 0.2898550724637681, + "grad_norm": 3.5526113510131836, + "learning_rate": 8.331899117681799e-06, + "loss": 0.9347, + "step": 3760 + }, + { + "epoch": 0.2899321615787851, + "grad_norm": 3.6977930068969727, + "learning_rate": 8.330968156012902e-06, + "loss": 0.9981, + "step": 3761 + }, + { + "epoch": 0.290009250693802, + "grad_norm": 3.659409761428833, + "learning_rate": 8.330036986675392e-06, + "loss": 1.023, + "step": 3762 + }, + { + "epoch": 0.290086339808819, + "grad_norm": 4.15533971786499, + "learning_rate": 8.329105609727323e-06, + "loss": 1.018, + "step": 3763 + }, + { + "epoch": 0.29016342892383595, + "grad_norm": 3.824834108352661, + "learning_rate": 8.32817402522676e-06, + "loss": 0.8737, + "step": 3764 + }, + { + "epoch": 0.2902405180388529, + "grad_norm": 3.59409499168396, + "learning_rate": 8.327242233231784e-06, + "loss": 1.0568, + "step": 3765 + }, + { + "epoch": 0.2903176071538699, + "grad_norm": 3.5583786964416504, + "learning_rate": 8.326310233800488e-06, + "loss": 0.9326, + "step": 3766 + }, + { + "epoch": 0.2903946962688868, + "grad_norm": 3.7277417182922363, + "learning_rate": 8.325378026990976e-06, + "loss": 0.972, + "step": 3767 + }, + { + "epoch": 0.2904717853839038, + "grad_norm": 3.8586301803588867, + "learning_rate": 8.324445612861367e-06, + "loss": 1.0652, + "step": 3768 + }, + { + "epoch": 0.29054887449892075, + "grad_norm": 3.91520357131958, + "learning_rate": 8.32351299146979e-06, + "loss": 0.9839, + "step": 3769 + }, + { + "epoch": 0.2906259636139377, + "grad_norm": 3.715453863143921, + "learning_rate": 8.322580162874393e-06, + "loss": 1.1271, + "step": 3770 + }, + { + "epoch": 0.2907030527289547, + "grad_norm": 3.917518138885498, + "learning_rate": 8.321647127133327e-06, + "loss": 1.0822, + "step": 3771 + }, + { + "epoch": 0.2907801418439716, + "grad_norm": 4.15562629699707, + "learning_rate": 8.320713884304769e-06, + "loss": 0.9178, + "step": 3772 + }, + { + "epoch": 0.2908572309589886, + "grad_norm": 3.6164376735687256, + "learning_rate": 8.3197804344469e-06, + "loss": 0.9087, + "step": 3773 + }, + { + "epoch": 0.29093432007400555, + "grad_norm": 3.697296380996704, + "learning_rate": 8.318846777617913e-06, + "loss": 1.0643, + "step": 3774 + }, + { + "epoch": 0.2910114091890225, + "grad_norm": 3.5980827808380127, + "learning_rate": 8.317912913876019e-06, + "loss": 1.009, + "step": 3775 + }, + { + "epoch": 0.2910884983040395, + "grad_norm": 3.787522077560425, + "learning_rate": 8.316978843279438e-06, + "loss": 1.124, + "step": 3776 + }, + { + "epoch": 0.2911655874190564, + "grad_norm": 3.599940299987793, + "learning_rate": 8.316044565886405e-06, + "loss": 1.0432, + "step": 3777 + }, + { + "epoch": 0.2912426765340734, + "grad_norm": 3.8612630367279053, + "learning_rate": 8.315110081755166e-06, + "loss": 0.9916, + "step": 3778 + }, + { + "epoch": 0.29131976564909035, + "grad_norm": 3.687645673751831, + "learning_rate": 8.314175390943987e-06, + "loss": 1.0088, + "step": 3779 + }, + { + "epoch": 0.2913968547641073, + "grad_norm": 3.6906330585479736, + "learning_rate": 8.313240493511132e-06, + "loss": 0.9998, + "step": 3780 + }, + { + "epoch": 0.2914739438791243, + "grad_norm": 3.6079602241516113, + "learning_rate": 8.312305389514894e-06, + "loss": 1.0303, + "step": 3781 + }, + { + "epoch": 0.2915510329941412, + "grad_norm": 3.642810583114624, + "learning_rate": 8.31137007901357e-06, + "loss": 0.9044, + "step": 3782 + }, + { + "epoch": 0.29162812210915817, + "grad_norm": 3.4344608783721924, + "learning_rate": 8.310434562065472e-06, + "loss": 0.9379, + "step": 3783 + }, + { + "epoch": 0.29170521122417514, + "grad_norm": 3.8744451999664307, + "learning_rate": 8.309498838728924e-06, + "loss": 1.1047, + "step": 3784 + }, + { + "epoch": 0.2917823003391921, + "grad_norm": 3.5419445037841797, + "learning_rate": 8.308562909062264e-06, + "loss": 1.0705, + "step": 3785 + }, + { + "epoch": 0.2918593894542091, + "grad_norm": 3.666670560836792, + "learning_rate": 8.307626773123842e-06, + "loss": 1.0101, + "step": 3786 + }, + { + "epoch": 0.291936478569226, + "grad_norm": 3.46775221824646, + "learning_rate": 8.306690430972023e-06, + "loss": 0.983, + "step": 3787 + }, + { + "epoch": 0.29201356768424297, + "grad_norm": 3.711484909057617, + "learning_rate": 8.305753882665178e-06, + "loss": 0.97, + "step": 3788 + }, + { + "epoch": 0.29209065679925994, + "grad_norm": 3.968447685241699, + "learning_rate": 8.304817128261702e-06, + "loss": 1.0543, + "step": 3789 + }, + { + "epoch": 0.2921677459142769, + "grad_norm": 3.9683480262756348, + "learning_rate": 8.303880167819994e-06, + "loss": 0.97, + "step": 3790 + }, + { + "epoch": 0.2922448350292939, + "grad_norm": 3.354612112045288, + "learning_rate": 8.302943001398466e-06, + "loss": 1.0434, + "step": 3791 + }, + { + "epoch": 0.2923219241443108, + "grad_norm": 4.114500999450684, + "learning_rate": 8.302005629055549e-06, + "loss": 1.0238, + "step": 3792 + }, + { + "epoch": 0.29239901325932777, + "grad_norm": 3.511687994003296, + "learning_rate": 8.301068050849685e-06, + "loss": 0.9847, + "step": 3793 + }, + { + "epoch": 0.29247610237434474, + "grad_norm": 3.513735294342041, + "learning_rate": 8.300130266839323e-06, + "loss": 0.9874, + "step": 3794 + }, + { + "epoch": 0.2925531914893617, + "grad_norm": 3.9487833976745605, + "learning_rate": 8.29919227708293e-06, + "loss": 1.0424, + "step": 3795 + }, + { + "epoch": 0.2926302806043787, + "grad_norm": 3.702522039413452, + "learning_rate": 8.298254081638988e-06, + "loss": 0.9582, + "step": 3796 + }, + { + "epoch": 0.2927073697193956, + "grad_norm": 4.400388240814209, + "learning_rate": 8.297315680565984e-06, + "loss": 1.0504, + "step": 3797 + }, + { + "epoch": 0.29278445883441256, + "grad_norm": 3.7499802112579346, + "learning_rate": 8.296377073922427e-06, + "loss": 1.0402, + "step": 3798 + }, + { + "epoch": 0.29286154794942953, + "grad_norm": 3.343437433242798, + "learning_rate": 8.295438261766829e-06, + "loss": 0.8718, + "step": 3799 + }, + { + "epoch": 0.2929386370644465, + "grad_norm": 3.8932011127471924, + "learning_rate": 8.294499244157724e-06, + "loss": 1.0511, + "step": 3800 + }, + { + "epoch": 0.2930157261794635, + "grad_norm": 3.75187087059021, + "learning_rate": 8.293560021153652e-06, + "loss": 0.9994, + "step": 3801 + }, + { + "epoch": 0.29309281529448045, + "grad_norm": 4.034709930419922, + "learning_rate": 8.292620592813173e-06, + "loss": 1.0376, + "step": 3802 + }, + { + "epoch": 0.29316990440949736, + "grad_norm": 3.411324977874756, + "learning_rate": 8.291680959194852e-06, + "loss": 0.9321, + "step": 3803 + }, + { + "epoch": 0.29324699352451433, + "grad_norm": 3.779232978820801, + "learning_rate": 8.29074112035727e-06, + "loss": 1.0499, + "step": 3804 + }, + { + "epoch": 0.2933240826395313, + "grad_norm": 4.236926555633545, + "learning_rate": 8.289801076359025e-06, + "loss": 1.0988, + "step": 3805 + }, + { + "epoch": 0.2934011717545483, + "grad_norm": 3.3468666076660156, + "learning_rate": 8.28886082725872e-06, + "loss": 0.9899, + "step": 3806 + }, + { + "epoch": 0.29347826086956524, + "grad_norm": 3.675314426422119, + "learning_rate": 8.287920373114976e-06, + "loss": 0.9013, + "step": 3807 + }, + { + "epoch": 0.29355534998458216, + "grad_norm": 4.145412445068359, + "learning_rate": 8.286979713986426e-06, + "loss": 1.0496, + "step": 3808 + }, + { + "epoch": 0.29363243909959913, + "grad_norm": 4.000509738922119, + "learning_rate": 8.286038849931713e-06, + "loss": 1.0509, + "step": 3809 + }, + { + "epoch": 0.2937095282146161, + "grad_norm": 3.838998794555664, + "learning_rate": 8.285097781009497e-06, + "loss": 0.9234, + "step": 3810 + }, + { + "epoch": 0.29378661732963307, + "grad_norm": 3.82969069480896, + "learning_rate": 8.284156507278448e-06, + "loss": 1.0399, + "step": 3811 + }, + { + "epoch": 0.29386370644465004, + "grad_norm": 3.689297914505005, + "learning_rate": 8.283215028797252e-06, + "loss": 0.886, + "step": 3812 + }, + { + "epoch": 0.29394079555966696, + "grad_norm": 3.5808119773864746, + "learning_rate": 8.2822733456246e-06, + "loss": 1.0377, + "step": 3813 + }, + { + "epoch": 0.2940178846746839, + "grad_norm": 4.0584821701049805, + "learning_rate": 8.281331457819204e-06, + "loss": 1.1038, + "step": 3814 + }, + { + "epoch": 0.2940949737897009, + "grad_norm": 4.134930610656738, + "learning_rate": 8.28038936543979e-06, + "loss": 0.9839, + "step": 3815 + }, + { + "epoch": 0.29417206290471787, + "grad_norm": 3.6218936443328857, + "learning_rate": 8.279447068545085e-06, + "loss": 1.027, + "step": 3816 + }, + { + "epoch": 0.29424915201973484, + "grad_norm": 4.330170154571533, + "learning_rate": 8.27850456719384e-06, + "loss": 1.103, + "step": 3817 + }, + { + "epoch": 0.29432624113475175, + "grad_norm": 3.6379551887512207, + "learning_rate": 8.277561861444818e-06, + "loss": 1.0382, + "step": 3818 + }, + { + "epoch": 0.2944033302497687, + "grad_norm": 3.776359796524048, + "learning_rate": 8.276618951356787e-06, + "loss": 1.0924, + "step": 3819 + }, + { + "epoch": 0.2944804193647857, + "grad_norm": 3.7215919494628906, + "learning_rate": 8.275675836988535e-06, + "loss": 1.1014, + "step": 3820 + }, + { + "epoch": 0.29455750847980267, + "grad_norm": 3.9651782512664795, + "learning_rate": 8.27473251839886e-06, + "loss": 1.0527, + "step": 3821 + }, + { + "epoch": 0.29463459759481964, + "grad_norm": 3.5446691513061523, + "learning_rate": 8.273788995646571e-06, + "loss": 0.9641, + "step": 3822 + }, + { + "epoch": 0.29471168670983655, + "grad_norm": 3.6998002529144287, + "learning_rate": 8.272845268790494e-06, + "loss": 1.1796, + "step": 3823 + }, + { + "epoch": 0.2947887758248535, + "grad_norm": 3.6286091804504395, + "learning_rate": 8.271901337889468e-06, + "loss": 1.0408, + "step": 3824 + }, + { + "epoch": 0.2948658649398705, + "grad_norm": 3.6755175590515137, + "learning_rate": 8.270957203002337e-06, + "loss": 0.9034, + "step": 3825 + }, + { + "epoch": 0.29494295405488746, + "grad_norm": 3.6150128841400146, + "learning_rate": 8.270012864187965e-06, + "loss": 0.9529, + "step": 3826 + }, + { + "epoch": 0.29502004316990443, + "grad_norm": 3.8087635040283203, + "learning_rate": 8.269068321505225e-06, + "loss": 0.9891, + "step": 3827 + }, + { + "epoch": 0.29509713228492135, + "grad_norm": 3.8580355644226074, + "learning_rate": 8.268123575013008e-06, + "loss": 0.9276, + "step": 3828 + }, + { + "epoch": 0.2951742213999383, + "grad_norm": 3.485034227371216, + "learning_rate": 8.267178624770212e-06, + "loss": 1.0299, + "step": 3829 + }, + { + "epoch": 0.2952513105149553, + "grad_norm": 3.557379961013794, + "learning_rate": 8.26623347083575e-06, + "loss": 0.9734, + "step": 3830 + }, + { + "epoch": 0.29532839962997226, + "grad_norm": 3.9309725761413574, + "learning_rate": 8.265288113268548e-06, + "loss": 1.0063, + "step": 3831 + }, + { + "epoch": 0.29540548874498923, + "grad_norm": 3.5980284214019775, + "learning_rate": 8.264342552127542e-06, + "loss": 0.8885, + "step": 3832 + }, + { + "epoch": 0.29548257786000615, + "grad_norm": 3.4705045223236084, + "learning_rate": 8.263396787471685e-06, + "loss": 0.9333, + "step": 3833 + }, + { + "epoch": 0.2955596669750231, + "grad_norm": 3.6276614665985107, + "learning_rate": 8.26245081935994e-06, + "loss": 0.9775, + "step": 3834 + }, + { + "epoch": 0.2956367560900401, + "grad_norm": 3.9132349491119385, + "learning_rate": 8.261504647851283e-06, + "loss": 0.9186, + "step": 3835 + }, + { + "epoch": 0.29571384520505706, + "grad_norm": 3.6847267150878906, + "learning_rate": 8.260558273004703e-06, + "loss": 0.934, + "step": 3836 + }, + { + "epoch": 0.29579093432007403, + "grad_norm": 3.9422383308410645, + "learning_rate": 8.259611694879202e-06, + "loss": 0.9443, + "step": 3837 + }, + { + "epoch": 0.29586802343509094, + "grad_norm": 3.9089691638946533, + "learning_rate": 8.258664913533791e-06, + "loss": 1.0795, + "step": 3838 + }, + { + "epoch": 0.2959451125501079, + "grad_norm": 3.8518786430358887, + "learning_rate": 8.257717929027504e-06, + "loss": 1.1894, + "step": 3839 + }, + { + "epoch": 0.2960222016651249, + "grad_norm": 3.669287919998169, + "learning_rate": 8.256770741419374e-06, + "loss": 0.9979, + "step": 3840 + }, + { + "epoch": 0.29609929078014185, + "grad_norm": 4.07830286026001, + "learning_rate": 8.255823350768455e-06, + "loss": 1.0086, + "step": 3841 + }, + { + "epoch": 0.2961763798951588, + "grad_norm": 3.9762747287750244, + "learning_rate": 8.254875757133813e-06, + "loss": 1.0408, + "step": 3842 + }, + { + "epoch": 0.29625346901017574, + "grad_norm": 3.584479331970215, + "learning_rate": 8.253927960574525e-06, + "loss": 1.0119, + "step": 3843 + }, + { + "epoch": 0.2963305581251927, + "grad_norm": 3.4980578422546387, + "learning_rate": 8.252979961149683e-06, + "loss": 0.997, + "step": 3844 + }, + { + "epoch": 0.2964076472402097, + "grad_norm": 3.391723155975342, + "learning_rate": 8.252031758918386e-06, + "loss": 0.9273, + "step": 3845 + }, + { + "epoch": 0.29648473635522665, + "grad_norm": 3.5890274047851562, + "learning_rate": 8.251083353939752e-06, + "loss": 1.0653, + "step": 3846 + }, + { + "epoch": 0.2965618254702436, + "grad_norm": 3.8049674034118652, + "learning_rate": 8.250134746272909e-06, + "loss": 1.0471, + "step": 3847 + }, + { + "epoch": 0.29663891458526054, + "grad_norm": 3.6828417778015137, + "learning_rate": 8.249185935976998e-06, + "loss": 1.0122, + "step": 3848 + }, + { + "epoch": 0.2967160037002775, + "grad_norm": 3.8039426803588867, + "learning_rate": 8.24823692311117e-06, + "loss": 1.0566, + "step": 3849 + }, + { + "epoch": 0.2967930928152945, + "grad_norm": 4.135082721710205, + "learning_rate": 8.247287707734594e-06, + "loss": 1.0346, + "step": 3850 + }, + { + "epoch": 0.29687018193031145, + "grad_norm": 3.7773916721343994, + "learning_rate": 8.246338289906447e-06, + "loss": 0.9774, + "step": 3851 + }, + { + "epoch": 0.2969472710453284, + "grad_norm": 3.629441499710083, + "learning_rate": 8.245388669685922e-06, + "loss": 1.0698, + "step": 3852 + }, + { + "epoch": 0.29702436016034534, + "grad_norm": 3.7880666255950928, + "learning_rate": 8.24443884713222e-06, + "loss": 1.0878, + "step": 3853 + }, + { + "epoch": 0.2971014492753623, + "grad_norm": 3.7315597534179688, + "learning_rate": 8.243488822304561e-06, + "loss": 1.0335, + "step": 3854 + }, + { + "epoch": 0.2971785383903793, + "grad_norm": 4.079202651977539, + "learning_rate": 8.24253859526217e-06, + "loss": 1.0879, + "step": 3855 + }, + { + "epoch": 0.29725562750539625, + "grad_norm": 4.179590702056885, + "learning_rate": 8.241588166064294e-06, + "loss": 1.0974, + "step": 3856 + }, + { + "epoch": 0.2973327166204132, + "grad_norm": 3.6889772415161133, + "learning_rate": 8.240637534770182e-06, + "loss": 1.0824, + "step": 3857 + }, + { + "epoch": 0.29740980573543013, + "grad_norm": 3.830030679702759, + "learning_rate": 8.239686701439105e-06, + "loss": 1.0321, + "step": 3858 + }, + { + "epoch": 0.2974868948504471, + "grad_norm": 4.01475191116333, + "learning_rate": 8.23873566613034e-06, + "loss": 1.0373, + "step": 3859 + }, + { + "epoch": 0.2975639839654641, + "grad_norm": 3.6347076892852783, + "learning_rate": 8.237784428903182e-06, + "loss": 0.9952, + "step": 3860 + }, + { + "epoch": 0.29764107308048104, + "grad_norm": 3.510388135910034, + "learning_rate": 8.236832989816932e-06, + "loss": 0.9956, + "step": 3861 + }, + { + "epoch": 0.297718162195498, + "grad_norm": 3.1566152572631836, + "learning_rate": 8.23588134893091e-06, + "loss": 0.9462, + "step": 3862 + }, + { + "epoch": 0.29779525131051493, + "grad_norm": 4.099442481994629, + "learning_rate": 8.234929506304443e-06, + "loss": 0.8942, + "step": 3863 + }, + { + "epoch": 0.2978723404255319, + "grad_norm": 3.81709885597229, + "learning_rate": 8.233977461996879e-06, + "loss": 0.9744, + "step": 3864 + }, + { + "epoch": 0.29794942954054887, + "grad_norm": 3.949967861175537, + "learning_rate": 8.233025216067567e-06, + "loss": 1.0687, + "step": 3865 + }, + { + "epoch": 0.29802651865556584, + "grad_norm": 3.714970111846924, + "learning_rate": 8.232072768575875e-06, + "loss": 1.1127, + "step": 3866 + }, + { + "epoch": 0.2981036077705828, + "grad_norm": 3.600409984588623, + "learning_rate": 8.231120119581189e-06, + "loss": 0.9565, + "step": 3867 + }, + { + "epoch": 0.2981806968855997, + "grad_norm": 3.3856570720672607, + "learning_rate": 8.230167269142897e-06, + "loss": 0.9503, + "step": 3868 + }, + { + "epoch": 0.2982577860006167, + "grad_norm": 4.353588581085205, + "learning_rate": 8.229214217320405e-06, + "loss": 1.0665, + "step": 3869 + }, + { + "epoch": 0.29833487511563367, + "grad_norm": 3.9014816284179688, + "learning_rate": 8.22826096417313e-06, + "loss": 1.003, + "step": 3870 + }, + { + "epoch": 0.29841196423065064, + "grad_norm": 4.135391712188721, + "learning_rate": 8.227307509760505e-06, + "loss": 1.0497, + "step": 3871 + }, + { + "epoch": 0.2984890533456676, + "grad_norm": 3.6809937953948975, + "learning_rate": 8.22635385414197e-06, + "loss": 1.09, + "step": 3872 + }, + { + "epoch": 0.2985661424606845, + "grad_norm": 3.6578569412231445, + "learning_rate": 8.225399997376984e-06, + "loss": 1.0836, + "step": 3873 + }, + { + "epoch": 0.2986432315757015, + "grad_norm": 3.6818604469299316, + "learning_rate": 8.22444593952501e-06, + "loss": 1.0283, + "step": 3874 + }, + { + "epoch": 0.29872032069071847, + "grad_norm": 3.6182854175567627, + "learning_rate": 8.223491680645533e-06, + "loss": 0.9956, + "step": 3875 + }, + { + "epoch": 0.29879740980573544, + "grad_norm": 3.5641016960144043, + "learning_rate": 8.222537220798046e-06, + "loss": 1.107, + "step": 3876 + }, + { + "epoch": 0.2988744989207524, + "grad_norm": 3.3794901371002197, + "learning_rate": 8.22158256004205e-06, + "loss": 0.9449, + "step": 3877 + }, + { + "epoch": 0.2989515880357693, + "grad_norm": 3.4423718452453613, + "learning_rate": 8.220627698437069e-06, + "loss": 1.0393, + "step": 3878 + }, + { + "epoch": 0.2990286771507863, + "grad_norm": 4.050296306610107, + "learning_rate": 8.21967263604263e-06, + "loss": 1.0819, + "step": 3879 + }, + { + "epoch": 0.29910576626580326, + "grad_norm": 3.7262563705444336, + "learning_rate": 8.218717372918277e-06, + "loss": 1.0081, + "step": 3880 + }, + { + "epoch": 0.29918285538082023, + "grad_norm": 3.7771074771881104, + "learning_rate": 8.217761909123567e-06, + "loss": 0.932, + "step": 3881 + }, + { + "epoch": 0.2992599444958372, + "grad_norm": 3.4512460231781006, + "learning_rate": 8.216806244718068e-06, + "loss": 0.9152, + "step": 3882 + }, + { + "epoch": 0.2993370336108541, + "grad_norm": 3.5177505016326904, + "learning_rate": 8.215850379761357e-06, + "loss": 0.9113, + "step": 3883 + }, + { + "epoch": 0.2994141227258711, + "grad_norm": 3.6450557708740234, + "learning_rate": 8.214894314313034e-06, + "loss": 0.964, + "step": 3884 + }, + { + "epoch": 0.29949121184088806, + "grad_norm": 3.773829698562622, + "learning_rate": 8.213938048432697e-06, + "loss": 1.1156, + "step": 3885 + }, + { + "epoch": 0.29956830095590503, + "grad_norm": 3.8969192504882812, + "learning_rate": 8.21298158217997e-06, + "loss": 1.045, + "step": 3886 + }, + { + "epoch": 0.299645390070922, + "grad_norm": 3.853782892227173, + "learning_rate": 8.212024915614482e-06, + "loss": 1.0426, + "step": 3887 + }, + { + "epoch": 0.299722479185939, + "grad_norm": 3.673476219177246, + "learning_rate": 8.211068048795877e-06, + "loss": 1.0286, + "step": 3888 + }, + { + "epoch": 0.2997995683009559, + "grad_norm": 3.4116404056549072, + "learning_rate": 8.210110981783807e-06, + "loss": 0.8559, + "step": 3889 + }, + { + "epoch": 0.29987665741597286, + "grad_norm": 3.4718472957611084, + "learning_rate": 8.209153714637943e-06, + "loss": 0.9776, + "step": 3890 + }, + { + "epoch": 0.29995374653098983, + "grad_norm": 3.6767029762268066, + "learning_rate": 8.208196247417968e-06, + "loss": 1.1066, + "step": 3891 + }, + { + "epoch": 0.3000308356460068, + "grad_norm": 3.493272066116333, + "learning_rate": 8.207238580183571e-06, + "loss": 1.0995, + "step": 3892 + }, + { + "epoch": 0.30010792476102377, + "grad_norm": 3.817072868347168, + "learning_rate": 8.206280712994459e-06, + "loss": 1.033, + "step": 3893 + }, + { + "epoch": 0.3001850138760407, + "grad_norm": 4.376103401184082, + "learning_rate": 8.205322645910352e-06, + "loss": 0.8971, + "step": 3894 + }, + { + "epoch": 0.30026210299105766, + "grad_norm": 3.5054662227630615, + "learning_rate": 8.204364378990976e-06, + "loss": 1.0437, + "step": 3895 + }, + { + "epoch": 0.3003391921060746, + "grad_norm": 3.6018145084381104, + "learning_rate": 8.203405912296079e-06, + "loss": 0.9484, + "step": 3896 + }, + { + "epoch": 0.3004162812210916, + "grad_norm": 3.9170470237731934, + "learning_rate": 8.202447245885414e-06, + "loss": 0.9874, + "step": 3897 + }, + { + "epoch": 0.30049337033610857, + "grad_norm": 3.545153856277466, + "learning_rate": 8.20148837981875e-06, + "loss": 0.9289, + "step": 3898 + }, + { + "epoch": 0.3005704594511255, + "grad_norm": 4.0050482749938965, + "learning_rate": 8.200529314155865e-06, + "loss": 1.1052, + "step": 3899 + }, + { + "epoch": 0.30064754856614245, + "grad_norm": 3.5742580890655518, + "learning_rate": 8.199570048956553e-06, + "loss": 0.9474, + "step": 3900 + }, + { + "epoch": 0.3007246376811594, + "grad_norm": 3.7283453941345215, + "learning_rate": 8.19861058428062e-06, + "loss": 0.9697, + "step": 3901 + }, + { + "epoch": 0.3008017267961764, + "grad_norm": 3.4887049198150635, + "learning_rate": 8.197650920187882e-06, + "loss": 0.9355, + "step": 3902 + }, + { + "epoch": 0.30087881591119336, + "grad_norm": 4.035374641418457, + "learning_rate": 8.196691056738173e-06, + "loss": 1.0705, + "step": 3903 + }, + { + "epoch": 0.3009559050262103, + "grad_norm": 3.5932343006134033, + "learning_rate": 8.19573099399133e-06, + "loss": 1.0442, + "step": 3904 + }, + { + "epoch": 0.30103299414122725, + "grad_norm": 3.732898712158203, + "learning_rate": 8.19477073200721e-06, + "loss": 0.9415, + "step": 3905 + }, + { + "epoch": 0.3011100832562442, + "grad_norm": 3.413482666015625, + "learning_rate": 8.193810270845683e-06, + "loss": 0.8275, + "step": 3906 + }, + { + "epoch": 0.3011871723712612, + "grad_norm": 4.12077522277832, + "learning_rate": 8.192849610566627e-06, + "loss": 0.9177, + "step": 3907 + }, + { + "epoch": 0.30126426148627816, + "grad_norm": 3.9081289768218994, + "learning_rate": 8.191888751229934e-06, + "loss": 0.9281, + "step": 3908 + }, + { + "epoch": 0.3013413506012951, + "grad_norm": 3.6880979537963867, + "learning_rate": 8.190927692895508e-06, + "loss": 1.0174, + "step": 3909 + }, + { + "epoch": 0.30141843971631205, + "grad_norm": 3.402858018875122, + "learning_rate": 8.189966435623266e-06, + "loss": 1.0482, + "step": 3910 + }, + { + "epoch": 0.301495528831329, + "grad_norm": 3.7268900871276855, + "learning_rate": 8.189004979473138e-06, + "loss": 1.1489, + "step": 3911 + }, + { + "epoch": 0.301572617946346, + "grad_norm": 3.7350738048553467, + "learning_rate": 8.188043324505067e-06, + "loss": 1.0077, + "step": 3912 + }, + { + "epoch": 0.30164970706136296, + "grad_norm": 3.5450897216796875, + "learning_rate": 8.187081470779006e-06, + "loss": 0.9558, + "step": 3913 + }, + { + "epoch": 0.3017267961763799, + "grad_norm": 3.7310359477996826, + "learning_rate": 8.18611941835492e-06, + "loss": 0.9612, + "step": 3914 + }, + { + "epoch": 0.30180388529139685, + "grad_norm": 3.8756463527679443, + "learning_rate": 8.185157167292791e-06, + "loss": 0.9793, + "step": 3915 + }, + { + "epoch": 0.3018809744064138, + "grad_norm": 4.031418323516846, + "learning_rate": 8.184194717652609e-06, + "loss": 1.0657, + "step": 3916 + }, + { + "epoch": 0.3019580635214308, + "grad_norm": 3.99357008934021, + "learning_rate": 8.183232069494378e-06, + "loss": 0.984, + "step": 3917 + }, + { + "epoch": 0.30203515263644776, + "grad_norm": 3.870607852935791, + "learning_rate": 8.182269222878112e-06, + "loss": 0.9012, + "step": 3918 + }, + { + "epoch": 0.30211224175146467, + "grad_norm": 3.4795637130737305, + "learning_rate": 8.181306177863843e-06, + "loss": 0.8978, + "step": 3919 + }, + { + "epoch": 0.30218933086648164, + "grad_norm": 3.652648687362671, + "learning_rate": 8.18034293451161e-06, + "loss": 1.0422, + "step": 3920 + }, + { + "epoch": 0.3022664199814986, + "grad_norm": 3.868264675140381, + "learning_rate": 8.179379492881465e-06, + "loss": 1.0272, + "step": 3921 + }, + { + "epoch": 0.3023435090965156, + "grad_norm": 3.5765531063079834, + "learning_rate": 8.178415853033477e-06, + "loss": 0.9447, + "step": 3922 + }, + { + "epoch": 0.30242059821153255, + "grad_norm": 4.241714000701904, + "learning_rate": 8.177452015027721e-06, + "loss": 0.9008, + "step": 3923 + }, + { + "epoch": 0.30249768732654947, + "grad_norm": 3.668001413345337, + "learning_rate": 8.176487978924288e-06, + "loss": 0.9762, + "step": 3924 + }, + { + "epoch": 0.30257477644156644, + "grad_norm": 3.9289772510528564, + "learning_rate": 8.175523744783281e-06, + "loss": 1.0743, + "step": 3925 + }, + { + "epoch": 0.3026518655565834, + "grad_norm": 3.804590940475464, + "learning_rate": 8.174559312664815e-06, + "loss": 0.9993, + "step": 3926 + }, + { + "epoch": 0.3027289546716004, + "grad_norm": 3.694150924682617, + "learning_rate": 8.173594682629018e-06, + "loss": 0.9693, + "step": 3927 + }, + { + "epoch": 0.30280604378661735, + "grad_norm": 3.8130977153778076, + "learning_rate": 8.172629854736029e-06, + "loss": 1.0525, + "step": 3928 + }, + { + "epoch": 0.30288313290163427, + "grad_norm": 4.602511405944824, + "learning_rate": 8.171664829046e-06, + "loss": 1.1879, + "step": 3929 + }, + { + "epoch": 0.30296022201665124, + "grad_norm": 3.7024033069610596, + "learning_rate": 8.170699605619096e-06, + "loss": 0.9487, + "step": 3930 + }, + { + "epoch": 0.3030373111316682, + "grad_norm": 3.632319688796997, + "learning_rate": 8.169734184515493e-06, + "loss": 1.1081, + "step": 3931 + }, + { + "epoch": 0.3031144002466852, + "grad_norm": 3.9944117069244385, + "learning_rate": 8.168768565795377e-06, + "loss": 0.9995, + "step": 3932 + }, + { + "epoch": 0.30319148936170215, + "grad_norm": 3.9244701862335205, + "learning_rate": 8.167802749518956e-06, + "loss": 1.1025, + "step": 3933 + }, + { + "epoch": 0.30326857847671906, + "grad_norm": 3.9141428470611572, + "learning_rate": 8.166836735746438e-06, + "loss": 1.0337, + "step": 3934 + }, + { + "epoch": 0.30334566759173603, + "grad_norm": 3.821218967437744, + "learning_rate": 8.165870524538052e-06, + "loss": 1.0971, + "step": 3935 + }, + { + "epoch": 0.303422756706753, + "grad_norm": 4.062706470489502, + "learning_rate": 8.164904115954036e-06, + "loss": 1.1178, + "step": 3936 + }, + { + "epoch": 0.30349984582177, + "grad_norm": 3.5247349739074707, + "learning_rate": 8.163937510054638e-06, + "loss": 0.9598, + "step": 3937 + }, + { + "epoch": 0.30357693493678695, + "grad_norm": 3.914620876312256, + "learning_rate": 8.162970706900124e-06, + "loss": 1.0434, + "step": 3938 + }, + { + "epoch": 0.30365402405180386, + "grad_norm": 3.9191715717315674, + "learning_rate": 8.162003706550767e-06, + "loss": 1.163, + "step": 3939 + }, + { + "epoch": 0.30373111316682083, + "grad_norm": 3.459897994995117, + "learning_rate": 8.161036509066856e-06, + "loss": 1.0139, + "step": 3940 + }, + { + "epoch": 0.3038082022818378, + "grad_norm": 3.6575376987457275, + "learning_rate": 8.16006911450869e-06, + "loss": 0.9167, + "step": 3941 + }, + { + "epoch": 0.3038852913968548, + "grad_norm": 3.691288948059082, + "learning_rate": 8.159101522936582e-06, + "loss": 0.9456, + "step": 3942 + }, + { + "epoch": 0.30396238051187174, + "grad_norm": 4.538893222808838, + "learning_rate": 8.158133734410853e-06, + "loss": 1.0096, + "step": 3943 + }, + { + "epoch": 0.30403946962688866, + "grad_norm": 3.614013671875, + "learning_rate": 8.157165748991845e-06, + "loss": 1.0055, + "step": 3944 + }, + { + "epoch": 0.30411655874190563, + "grad_norm": 3.9665722846984863, + "learning_rate": 8.156197566739901e-06, + "loss": 1.1126, + "step": 3945 + }, + { + "epoch": 0.3041936478569226, + "grad_norm": 3.547726631164551, + "learning_rate": 8.155229187715385e-06, + "loss": 1.0368, + "step": 3946 + }, + { + "epoch": 0.30427073697193957, + "grad_norm": 3.3927464485168457, + "learning_rate": 8.154260611978673e-06, + "loss": 1.0033, + "step": 3947 + }, + { + "epoch": 0.30434782608695654, + "grad_norm": 3.441749334335327, + "learning_rate": 8.153291839590147e-06, + "loss": 0.9378, + "step": 3948 + }, + { + "epoch": 0.30442491520197346, + "grad_norm": 3.7463624477386475, + "learning_rate": 8.152322870610206e-06, + "loss": 1.0562, + "step": 3949 + }, + { + "epoch": 0.3045020043169904, + "grad_norm": 3.6333110332489014, + "learning_rate": 8.151353705099261e-06, + "loss": 1.0003, + "step": 3950 + }, + { + "epoch": 0.3045790934320074, + "grad_norm": 3.9135475158691406, + "learning_rate": 8.150384343117733e-06, + "loss": 1.1028, + "step": 3951 + }, + { + "epoch": 0.30465618254702437, + "grad_norm": 3.670753240585327, + "learning_rate": 8.149414784726058e-06, + "loss": 0.9757, + "step": 3952 + }, + { + "epoch": 0.30473327166204134, + "grad_norm": 3.691105365753174, + "learning_rate": 8.148445029984683e-06, + "loss": 1.0124, + "step": 3953 + }, + { + "epoch": 0.30481036077705825, + "grad_norm": 3.77278995513916, + "learning_rate": 8.147475078954067e-06, + "loss": 0.9402, + "step": 3954 + }, + { + "epoch": 0.3048874498920752, + "grad_norm": 3.847795009613037, + "learning_rate": 8.146504931694678e-06, + "loss": 1.0041, + "step": 3955 + }, + { + "epoch": 0.3049645390070922, + "grad_norm": 3.791046380996704, + "learning_rate": 8.145534588267006e-06, + "loss": 1.023, + "step": 3956 + }, + { + "epoch": 0.30504162812210917, + "grad_norm": 3.8239328861236572, + "learning_rate": 8.144564048731542e-06, + "loss": 0.9401, + "step": 3957 + }, + { + "epoch": 0.30511871723712614, + "grad_norm": 3.5764706134796143, + "learning_rate": 8.143593313148794e-06, + "loss": 0.9707, + "step": 3958 + }, + { + "epoch": 0.30519580635214305, + "grad_norm": 4.011223793029785, + "learning_rate": 8.142622381579285e-06, + "loss": 1.0103, + "step": 3959 + }, + { + "epoch": 0.30527289546716, + "grad_norm": 4.034478664398193, + "learning_rate": 8.14165125408355e-06, + "loss": 0.8388, + "step": 3960 + }, + { + "epoch": 0.305349984582177, + "grad_norm": 3.6760923862457275, + "learning_rate": 8.140679930722126e-06, + "loss": 1.0874, + "step": 3961 + }, + { + "epoch": 0.30542707369719396, + "grad_norm": 3.658113718032837, + "learning_rate": 8.139708411555575e-06, + "loss": 0.9909, + "step": 3962 + }, + { + "epoch": 0.30550416281221093, + "grad_norm": 4.3825364112854, + "learning_rate": 8.138736696644467e-06, + "loss": 1.0232, + "step": 3963 + }, + { + "epoch": 0.30558125192722785, + "grad_norm": 3.9493494033813477, + "learning_rate": 8.137764786049382e-06, + "loss": 0.9977, + "step": 3964 + }, + { + "epoch": 0.3056583410422448, + "grad_norm": 3.7453453540802, + "learning_rate": 8.136792679830913e-06, + "loss": 0.9792, + "step": 3965 + }, + { + "epoch": 0.3057354301572618, + "grad_norm": 3.767953634262085, + "learning_rate": 8.135820378049667e-06, + "loss": 1.0847, + "step": 3966 + }, + { + "epoch": 0.30581251927227876, + "grad_norm": 3.4496097564697266, + "learning_rate": 8.13484788076626e-06, + "loss": 0.9574, + "step": 3967 + }, + { + "epoch": 0.30588960838729573, + "grad_norm": 3.7329258918762207, + "learning_rate": 8.133875188041323e-06, + "loss": 0.9645, + "step": 3968 + }, + { + "epoch": 0.3059666975023127, + "grad_norm": 3.522775888442993, + "learning_rate": 8.1329022999355e-06, + "loss": 1.0296, + "step": 3969 + }, + { + "epoch": 0.3060437866173296, + "grad_norm": 3.753998041152954, + "learning_rate": 8.131929216509445e-06, + "loss": 1.0741, + "step": 3970 + }, + { + "epoch": 0.3061208757323466, + "grad_norm": 3.6926846504211426, + "learning_rate": 8.130955937823821e-06, + "loss": 0.963, + "step": 3971 + }, + { + "epoch": 0.30619796484736356, + "grad_norm": 3.8235857486724854, + "learning_rate": 8.129982463939313e-06, + "loss": 1.0803, + "step": 3972 + }, + { + "epoch": 0.30627505396238053, + "grad_norm": 3.8400890827178955, + "learning_rate": 8.129008794916609e-06, + "loss": 1.074, + "step": 3973 + }, + { + "epoch": 0.3063521430773975, + "grad_norm": 3.7315080165863037, + "learning_rate": 8.12803493081641e-06, + "loss": 1.1322, + "step": 3974 + }, + { + "epoch": 0.3064292321924144, + "grad_norm": 3.864604949951172, + "learning_rate": 8.127060871699435e-06, + "loss": 1.0681, + "step": 3975 + }, + { + "epoch": 0.3065063213074314, + "grad_norm": 4.137643337249756, + "learning_rate": 8.12608661762641e-06, + "loss": 1.0562, + "step": 3976 + }, + { + "epoch": 0.30658341042244835, + "grad_norm": 3.801588296890259, + "learning_rate": 8.125112168658074e-06, + "loss": 1.0874, + "step": 3977 + }, + { + "epoch": 0.3066604995374653, + "grad_norm": 3.4541311264038086, + "learning_rate": 8.12413752485518e-06, + "loss": 0.9565, + "step": 3978 + }, + { + "epoch": 0.3067375886524823, + "grad_norm": 3.6811790466308594, + "learning_rate": 8.123162686278493e-06, + "loss": 1.0322, + "step": 3979 + }, + { + "epoch": 0.3068146777674992, + "grad_norm": 4.22686243057251, + "learning_rate": 8.122187652988786e-06, + "loss": 0.9425, + "step": 3980 + }, + { + "epoch": 0.3068917668825162, + "grad_norm": 3.42470383644104, + "learning_rate": 8.12121242504685e-06, + "loss": 0.9987, + "step": 3981 + }, + { + "epoch": 0.30696885599753315, + "grad_norm": 3.6424388885498047, + "learning_rate": 8.120237002513484e-06, + "loss": 0.979, + "step": 3982 + }, + { + "epoch": 0.3070459451125501, + "grad_norm": 3.6647961139678955, + "learning_rate": 8.119261385449502e-06, + "loss": 1.0419, + "step": 3983 + }, + { + "epoch": 0.3071230342275671, + "grad_norm": 3.583043098449707, + "learning_rate": 8.118285573915726e-06, + "loss": 0.9044, + "step": 3984 + }, + { + "epoch": 0.307200123342584, + "grad_norm": 3.9360716342926025, + "learning_rate": 8.117309567972995e-06, + "loss": 1.0924, + "step": 3985 + }, + { + "epoch": 0.307277212457601, + "grad_norm": 3.999488592147827, + "learning_rate": 8.116333367682158e-06, + "loss": 1.0132, + "step": 3986 + }, + { + "epoch": 0.30735430157261795, + "grad_norm": 3.6380109786987305, + "learning_rate": 8.115356973104076e-06, + "loss": 1.0445, + "step": 3987 + }, + { + "epoch": 0.3074313906876349, + "grad_norm": 3.977320909500122, + "learning_rate": 8.11438038429962e-06, + "loss": 1.0968, + "step": 3988 + }, + { + "epoch": 0.3075084798026519, + "grad_norm": 3.90857195854187, + "learning_rate": 8.113403601329678e-06, + "loss": 1.1308, + "step": 3989 + }, + { + "epoch": 0.3075855689176688, + "grad_norm": 3.865142345428467, + "learning_rate": 8.112426624255145e-06, + "loss": 0.9358, + "step": 3990 + }, + { + "epoch": 0.3076626580326858, + "grad_norm": 3.489595890045166, + "learning_rate": 8.111449453136932e-06, + "loss": 0.9765, + "step": 3991 + }, + { + "epoch": 0.30773974714770275, + "grad_norm": 4.063758850097656, + "learning_rate": 8.110472088035961e-06, + "loss": 0.9821, + "step": 3992 + }, + { + "epoch": 0.3078168362627197, + "grad_norm": 3.65039324760437, + "learning_rate": 8.109494529013165e-06, + "loss": 1.0804, + "step": 3993 + }, + { + "epoch": 0.3078939253777367, + "grad_norm": 3.800696611404419, + "learning_rate": 8.108516776129489e-06, + "loss": 0.8819, + "step": 3994 + }, + { + "epoch": 0.3079710144927536, + "grad_norm": 3.8089771270751953, + "learning_rate": 8.107538829445891e-06, + "loss": 1.0294, + "step": 3995 + }, + { + "epoch": 0.3080481036077706, + "grad_norm": 3.4781205654144287, + "learning_rate": 8.106560689023342e-06, + "loss": 0.9528, + "step": 3996 + }, + { + "epoch": 0.30812519272278754, + "grad_norm": 3.5084052085876465, + "learning_rate": 8.105582354922822e-06, + "loss": 0.9852, + "step": 3997 + }, + { + "epoch": 0.3082022818378045, + "grad_norm": 3.660351037979126, + "learning_rate": 8.104603827205329e-06, + "loss": 1.032, + "step": 3998 + }, + { + "epoch": 0.3082793709528215, + "grad_norm": 3.5761704444885254, + "learning_rate": 8.103625105931865e-06, + "loss": 0.9191, + "step": 3999 + }, + { + "epoch": 0.3083564600678384, + "grad_norm": 3.8501265048980713, + "learning_rate": 8.10264619116345e-06, + "loss": 1.0343, + "step": 4000 + }, + { + "epoch": 0.30843354918285537, + "grad_norm": 3.4951345920562744, + "learning_rate": 8.101667082961114e-06, + "loss": 0.9921, + "step": 4001 + }, + { + "epoch": 0.30851063829787234, + "grad_norm": 3.844111204147339, + "learning_rate": 8.1006877813859e-06, + "loss": 0.979, + "step": 4002 + }, + { + "epoch": 0.3085877274128893, + "grad_norm": 3.7992372512817383, + "learning_rate": 8.09970828649886e-06, + "loss": 1.0304, + "step": 4003 + }, + { + "epoch": 0.3086648165279063, + "grad_norm": 4.003841876983643, + "learning_rate": 8.098728598361063e-06, + "loss": 1.0203, + "step": 4004 + }, + { + "epoch": 0.3087419056429232, + "grad_norm": 3.7176103591918945, + "learning_rate": 8.097748717033587e-06, + "loss": 0.9943, + "step": 4005 + }, + { + "epoch": 0.30881899475794017, + "grad_norm": 3.827420234680176, + "learning_rate": 8.096768642577521e-06, + "loss": 0.9806, + "step": 4006 + }, + { + "epoch": 0.30889608387295714, + "grad_norm": 3.841545820236206, + "learning_rate": 8.09578837505397e-06, + "loss": 1.0092, + "step": 4007 + }, + { + "epoch": 0.3089731729879741, + "grad_norm": 3.845114231109619, + "learning_rate": 8.094807914524048e-06, + "loss": 1.0792, + "step": 4008 + }, + { + "epoch": 0.3090502621029911, + "grad_norm": 3.8567841053009033, + "learning_rate": 8.093827261048879e-06, + "loss": 1.0728, + "step": 4009 + }, + { + "epoch": 0.309127351218008, + "grad_norm": 3.801530361175537, + "learning_rate": 8.092846414689605e-06, + "loss": 0.9051, + "step": 4010 + }, + { + "epoch": 0.30920444033302497, + "grad_norm": 3.59698748588562, + "learning_rate": 8.091865375507375e-06, + "loss": 1.0773, + "step": 4011 + }, + { + "epoch": 0.30928152944804194, + "grad_norm": 3.8643345832824707, + "learning_rate": 8.090884143563352e-06, + "loss": 0.9275, + "step": 4012 + }, + { + "epoch": 0.3093586185630589, + "grad_norm": 3.8212454319000244, + "learning_rate": 8.08990271891871e-06, + "loss": 1.0513, + "step": 4013 + }, + { + "epoch": 0.3094357076780759, + "grad_norm": 3.5950045585632324, + "learning_rate": 8.088921101634637e-06, + "loss": 1.0344, + "step": 4014 + }, + { + "epoch": 0.3095127967930928, + "grad_norm": 3.403311252593994, + "learning_rate": 8.087939291772331e-06, + "loss": 0.9926, + "step": 4015 + }, + { + "epoch": 0.30958988590810976, + "grad_norm": 4.000549793243408, + "learning_rate": 8.086957289393002e-06, + "loss": 1.0657, + "step": 4016 + }, + { + "epoch": 0.30966697502312673, + "grad_norm": 3.72603702545166, + "learning_rate": 8.085975094557876e-06, + "loss": 1.061, + "step": 4017 + }, + { + "epoch": 0.3097440641381437, + "grad_norm": 3.4024152755737305, + "learning_rate": 8.084992707328184e-06, + "loss": 0.9515, + "step": 4018 + }, + { + "epoch": 0.3098211532531607, + "grad_norm": 3.527376174926758, + "learning_rate": 8.084010127765174e-06, + "loss": 0.9454, + "step": 4019 + }, + { + "epoch": 0.3098982423681776, + "grad_norm": 3.594805955886841, + "learning_rate": 8.083027355930106e-06, + "loss": 0.9794, + "step": 4020 + }, + { + "epoch": 0.30997533148319456, + "grad_norm": 3.747175693511963, + "learning_rate": 8.08204439188425e-06, + "loss": 0.9221, + "step": 4021 + }, + { + "epoch": 0.31005242059821153, + "grad_norm": 3.6072657108306885, + "learning_rate": 8.081061235688889e-06, + "loss": 1.0778, + "step": 4022 + }, + { + "epoch": 0.3101295097132285, + "grad_norm": 3.919334888458252, + "learning_rate": 8.080077887405315e-06, + "loss": 0.9001, + "step": 4023 + }, + { + "epoch": 0.3102065988282455, + "grad_norm": 4.0752973556518555, + "learning_rate": 8.079094347094839e-06, + "loss": 1.0525, + "step": 4024 + }, + { + "epoch": 0.3102836879432624, + "grad_norm": 3.7410547733306885, + "learning_rate": 8.078110614818777e-06, + "loss": 0.9558, + "step": 4025 + }, + { + "epoch": 0.31036077705827936, + "grad_norm": 3.7749099731445312, + "learning_rate": 8.07712669063846e-06, + "loss": 0.9536, + "step": 4026 + }, + { + "epoch": 0.31043786617329633, + "grad_norm": 3.8618111610412598, + "learning_rate": 8.07614257461523e-06, + "loss": 0.9301, + "step": 4027 + }, + { + "epoch": 0.3105149552883133, + "grad_norm": 3.772061586380005, + "learning_rate": 8.075158266810442e-06, + "loss": 1.0186, + "step": 4028 + }, + { + "epoch": 0.31059204440333027, + "grad_norm": 4.043780326843262, + "learning_rate": 8.074173767285465e-06, + "loss": 1.0044, + "step": 4029 + }, + { + "epoch": 0.3106691335183472, + "grad_norm": 4.081550598144531, + "learning_rate": 8.073189076101673e-06, + "loss": 1.1462, + "step": 4030 + }, + { + "epoch": 0.31074622263336416, + "grad_norm": 3.861663818359375, + "learning_rate": 8.072204193320459e-06, + "loss": 1.1358, + "step": 4031 + }, + { + "epoch": 0.3108233117483811, + "grad_norm": 3.9835574626922607, + "learning_rate": 8.071219119003223e-06, + "loss": 1.1444, + "step": 4032 + }, + { + "epoch": 0.3109004008633981, + "grad_norm": 3.3269333839416504, + "learning_rate": 8.070233853211385e-06, + "loss": 0.8577, + "step": 4033 + }, + { + "epoch": 0.31097748997841507, + "grad_norm": 4.0250091552734375, + "learning_rate": 8.069248396006365e-06, + "loss": 0.9865, + "step": 4034 + }, + { + "epoch": 0.311054579093432, + "grad_norm": 3.576920986175537, + "learning_rate": 8.068262747449604e-06, + "loss": 1.0196, + "step": 4035 + }, + { + "epoch": 0.31113166820844895, + "grad_norm": 3.694272994995117, + "learning_rate": 8.067276907602551e-06, + "loss": 1.0175, + "step": 4036 + }, + { + "epoch": 0.3112087573234659, + "grad_norm": 3.5465962886810303, + "learning_rate": 8.06629087652667e-06, + "loss": 1.0006, + "step": 4037 + }, + { + "epoch": 0.3112858464384829, + "grad_norm": 3.7290923595428467, + "learning_rate": 8.065304654283434e-06, + "loss": 0.9712, + "step": 4038 + }, + { + "epoch": 0.31136293555349986, + "grad_norm": 3.9318015575408936, + "learning_rate": 8.064318240934327e-06, + "loss": 0.9798, + "step": 4039 + }, + { + "epoch": 0.3114400246685168, + "grad_norm": 3.745164394378662, + "learning_rate": 8.063331636540848e-06, + "loss": 0.9533, + "step": 4040 + }, + { + "epoch": 0.31151711378353375, + "grad_norm": 4.017617225646973, + "learning_rate": 8.062344841164508e-06, + "loss": 1.0166, + "step": 4041 + }, + { + "epoch": 0.3115942028985507, + "grad_norm": 3.4828991889953613, + "learning_rate": 8.061357854866827e-06, + "loss": 0.9534, + "step": 4042 + }, + { + "epoch": 0.3116712920135677, + "grad_norm": 3.8771891593933105, + "learning_rate": 8.060370677709338e-06, + "loss": 0.9229, + "step": 4043 + }, + { + "epoch": 0.31174838112858466, + "grad_norm": 4.284106731414795, + "learning_rate": 8.059383309753587e-06, + "loss": 0.8905, + "step": 4044 + }, + { + "epoch": 0.3118254702436016, + "grad_norm": 3.635425567626953, + "learning_rate": 8.058395751061135e-06, + "loss": 1.0268, + "step": 4045 + }, + { + "epoch": 0.31190255935861855, + "grad_norm": 3.7544236183166504, + "learning_rate": 8.057408001693544e-06, + "loss": 0.9735, + "step": 4046 + }, + { + "epoch": 0.3119796484736355, + "grad_norm": 3.6741573810577393, + "learning_rate": 8.0564200617124e-06, + "loss": 1.0407, + "step": 4047 + }, + { + "epoch": 0.3120567375886525, + "grad_norm": 3.6029105186462402, + "learning_rate": 8.055431931179296e-06, + "loss": 0.9976, + "step": 4048 + }, + { + "epoch": 0.31213382670366946, + "grad_norm": 3.3681766986846924, + "learning_rate": 8.054443610155836e-06, + "loss": 0.9445, + "step": 4049 + }, + { + "epoch": 0.3122109158186864, + "grad_norm": 3.6307098865509033, + "learning_rate": 8.053455098703635e-06, + "loss": 0.9859, + "step": 4050 + }, + { + "epoch": 0.31228800493370334, + "grad_norm": 3.4906904697418213, + "learning_rate": 8.052466396884323e-06, + "loss": 1.0344, + "step": 4051 + }, + { + "epoch": 0.3123650940487203, + "grad_norm": 3.574552297592163, + "learning_rate": 8.05147750475954e-06, + "loss": 0.9103, + "step": 4052 + }, + { + "epoch": 0.3124421831637373, + "grad_norm": 4.1314377784729, + "learning_rate": 8.050488422390939e-06, + "loss": 1.0707, + "step": 4053 + }, + { + "epoch": 0.31251927227875426, + "grad_norm": 4.234683990478516, + "learning_rate": 8.049499149840183e-06, + "loss": 1.0068, + "step": 4054 + }, + { + "epoch": 0.3125963613937712, + "grad_norm": 3.7397048473358154, + "learning_rate": 8.048509687168949e-06, + "loss": 0.9691, + "step": 4055 + }, + { + "epoch": 0.31267345050878814, + "grad_norm": 3.67864727973938, + "learning_rate": 8.047520034438925e-06, + "loss": 0.9445, + "step": 4056 + }, + { + "epoch": 0.3127505396238051, + "grad_norm": 4.083065032958984, + "learning_rate": 8.046530191711808e-06, + "loss": 1.1873, + "step": 4057 + }, + { + "epoch": 0.3128276287388221, + "grad_norm": 3.7557568550109863, + "learning_rate": 8.045540159049315e-06, + "loss": 1.0247, + "step": 4058 + }, + { + "epoch": 0.31290471785383905, + "grad_norm": 3.5163233280181885, + "learning_rate": 8.044549936513165e-06, + "loss": 1.039, + "step": 4059 + }, + { + "epoch": 0.312981806968856, + "grad_norm": 7.071890354156494, + "learning_rate": 8.043559524165096e-06, + "loss": 1.0503, + "step": 4060 + }, + { + "epoch": 0.31305889608387294, + "grad_norm": 3.8393161296844482, + "learning_rate": 8.042568922066852e-06, + "loss": 1.083, + "step": 4061 + }, + { + "epoch": 0.3131359851988899, + "grad_norm": 4.509868144989014, + "learning_rate": 8.041578130280194e-06, + "loss": 1.0581, + "step": 4062 + }, + { + "epoch": 0.3132130743139069, + "grad_norm": 4.245510578155518, + "learning_rate": 8.040587148866893e-06, + "loss": 0.9752, + "step": 4063 + }, + { + "epoch": 0.31329016342892385, + "grad_norm": 4.26503849029541, + "learning_rate": 8.03959597788873e-06, + "loss": 0.9107, + "step": 4064 + }, + { + "epoch": 0.3133672525439408, + "grad_norm": 3.7438037395477295, + "learning_rate": 8.038604617407501e-06, + "loss": 0.9565, + "step": 4065 + }, + { + "epoch": 0.31344434165895774, + "grad_norm": 5.089138507843018, + "learning_rate": 8.037613067485012e-06, + "loss": 1.0168, + "step": 4066 + }, + { + "epoch": 0.3135214307739747, + "grad_norm": 4.153073310852051, + "learning_rate": 8.036621328183079e-06, + "loss": 1.0259, + "step": 4067 + }, + { + "epoch": 0.3135985198889917, + "grad_norm": 3.4369683265686035, + "learning_rate": 8.035629399563533e-06, + "loss": 0.9818, + "step": 4068 + }, + { + "epoch": 0.31367560900400865, + "grad_norm": 3.7807652950286865, + "learning_rate": 8.034637281688219e-06, + "loss": 1.0017, + "step": 4069 + }, + { + "epoch": 0.3137526981190256, + "grad_norm": 3.68034291267395, + "learning_rate": 8.033644974618983e-06, + "loss": 0.9743, + "step": 4070 + }, + { + "epoch": 0.31382978723404253, + "grad_norm": 3.763885021209717, + "learning_rate": 8.032652478417697e-06, + "loss": 0.9948, + "step": 4071 + }, + { + "epoch": 0.3139068763490595, + "grad_norm": 3.9806506633758545, + "learning_rate": 8.031659793146237e-06, + "loss": 1.1496, + "step": 4072 + }, + { + "epoch": 0.3139839654640765, + "grad_norm": 3.6926496028900146, + "learning_rate": 8.030666918866487e-06, + "loss": 1.0143, + "step": 4073 + }, + { + "epoch": 0.31406105457909345, + "grad_norm": 3.7122464179992676, + "learning_rate": 8.029673855640352e-06, + "loss": 1.0272, + "step": 4074 + }, + { + "epoch": 0.3141381436941104, + "grad_norm": 3.5680923461914062, + "learning_rate": 8.028680603529742e-06, + "loss": 0.8766, + "step": 4075 + }, + { + "epoch": 0.31421523280912733, + "grad_norm": 3.5684890747070312, + "learning_rate": 8.027687162596584e-06, + "loss": 0.9874, + "step": 4076 + }, + { + "epoch": 0.3142923219241443, + "grad_norm": 3.833163022994995, + "learning_rate": 8.026693532902811e-06, + "loss": 1.0566, + "step": 4077 + }, + { + "epoch": 0.3143694110391613, + "grad_norm": 4.123239040374756, + "learning_rate": 8.025699714510374e-06, + "loss": 1.093, + "step": 4078 + }, + { + "epoch": 0.31444650015417824, + "grad_norm": 3.9509053230285645, + "learning_rate": 8.024705707481228e-06, + "loss": 0.9761, + "step": 4079 + }, + { + "epoch": 0.3145235892691952, + "grad_norm": 3.3474016189575195, + "learning_rate": 8.023711511877347e-06, + "loss": 0.9236, + "step": 4080 + }, + { + "epoch": 0.31460067838421213, + "grad_norm": 4.015679359436035, + "learning_rate": 8.022717127760715e-06, + "loss": 1.0176, + "step": 4081 + }, + { + "epoch": 0.3146777674992291, + "grad_norm": 3.6416444778442383, + "learning_rate": 8.021722555193324e-06, + "loss": 1.0104, + "step": 4082 + }, + { + "epoch": 0.31475485661424607, + "grad_norm": 3.4460465908050537, + "learning_rate": 8.020727794237182e-06, + "loss": 0.8351, + "step": 4083 + }, + { + "epoch": 0.31483194572926304, + "grad_norm": 3.7392327785491943, + "learning_rate": 8.019732844954306e-06, + "loss": 1.059, + "step": 4084 + }, + { + "epoch": 0.31490903484428, + "grad_norm": 3.5918447971343994, + "learning_rate": 8.018737707406728e-06, + "loss": 1.0905, + "step": 4085 + }, + { + "epoch": 0.3149861239592969, + "grad_norm": 3.957096576690674, + "learning_rate": 8.017742381656486e-06, + "loss": 0.9457, + "step": 4086 + }, + { + "epoch": 0.3150632130743139, + "grad_norm": 3.9353511333465576, + "learning_rate": 8.016746867765639e-06, + "loss": 0.9994, + "step": 4087 + }, + { + "epoch": 0.31514030218933087, + "grad_norm": 3.582047700881958, + "learning_rate": 8.015751165796247e-06, + "loss": 1.0187, + "step": 4088 + }, + { + "epoch": 0.31521739130434784, + "grad_norm": 3.996525287628174, + "learning_rate": 8.014755275810389e-06, + "loss": 1.0468, + "step": 4089 + }, + { + "epoch": 0.3152944804193648, + "grad_norm": 3.70849871635437, + "learning_rate": 8.013759197870153e-06, + "loss": 1.1356, + "step": 4090 + }, + { + "epoch": 0.3153715695343817, + "grad_norm": 3.3762667179107666, + "learning_rate": 8.012762932037638e-06, + "loss": 0.873, + "step": 4091 + }, + { + "epoch": 0.3154486586493987, + "grad_norm": 3.614732503890991, + "learning_rate": 8.011766478374961e-06, + "loss": 0.9919, + "step": 4092 + }, + { + "epoch": 0.31552574776441566, + "grad_norm": 3.7229347229003906, + "learning_rate": 8.010769836944241e-06, + "loss": 0.9781, + "step": 4093 + }, + { + "epoch": 0.31560283687943264, + "grad_norm": 3.661196231842041, + "learning_rate": 8.009773007807615e-06, + "loss": 1.0271, + "step": 4094 + }, + { + "epoch": 0.3156799259944496, + "grad_norm": 3.394929885864258, + "learning_rate": 8.00877599102723e-06, + "loss": 1.0682, + "step": 4095 + }, + { + "epoch": 0.3157570151094665, + "grad_norm": 3.5974974632263184, + "learning_rate": 8.007778786665246e-06, + "loss": 0.9009, + "step": 4096 + }, + { + "epoch": 0.3158341042244835, + "grad_norm": 3.423668622970581, + "learning_rate": 8.006781394783831e-06, + "loss": 0.9942, + "step": 4097 + }, + { + "epoch": 0.31591119333950046, + "grad_norm": 3.99906849861145, + "learning_rate": 8.005783815445168e-06, + "loss": 1.1061, + "step": 4098 + }, + { + "epoch": 0.31598828245451743, + "grad_norm": 3.8605785369873047, + "learning_rate": 8.004786048711452e-06, + "loss": 1.059, + "step": 4099 + }, + { + "epoch": 0.3160653715695344, + "grad_norm": 3.41093373298645, + "learning_rate": 8.003788094644888e-06, + "loss": 0.9386, + "step": 4100 + }, + { + "epoch": 0.3161424606845513, + "grad_norm": 4.836550235748291, + "learning_rate": 8.002789953307692e-06, + "loss": 0.9721, + "step": 4101 + }, + { + "epoch": 0.3162195497995683, + "grad_norm": 3.4783308506011963, + "learning_rate": 8.001791624762097e-06, + "loss": 0.8727, + "step": 4102 + }, + { + "epoch": 0.31629663891458526, + "grad_norm": 3.9497110843658447, + "learning_rate": 8.00079310907034e-06, + "loss": 1.0713, + "step": 4103 + }, + { + "epoch": 0.31637372802960223, + "grad_norm": 3.943293809890747, + "learning_rate": 7.999794406294674e-06, + "loss": 0.9778, + "step": 4104 + }, + { + "epoch": 0.3164508171446192, + "grad_norm": 4.106851100921631, + "learning_rate": 7.998795516497362e-06, + "loss": 1.0506, + "step": 4105 + }, + { + "epoch": 0.3165279062596361, + "grad_norm": 4.133145332336426, + "learning_rate": 7.997796439740682e-06, + "loss": 1.1453, + "step": 4106 + }, + { + "epoch": 0.3166049953746531, + "grad_norm": 3.896491289138794, + "learning_rate": 7.99679717608692e-06, + "loss": 1.0087, + "step": 4107 + }, + { + "epoch": 0.31668208448967006, + "grad_norm": 3.7162492275238037, + "learning_rate": 7.995797725598373e-06, + "loss": 1.0709, + "step": 4108 + }, + { + "epoch": 0.316759173604687, + "grad_norm": 4.147136688232422, + "learning_rate": 7.994798088337357e-06, + "loss": 1.0759, + "step": 4109 + }, + { + "epoch": 0.316836262719704, + "grad_norm": 3.5119404792785645, + "learning_rate": 7.993798264366189e-06, + "loss": 0.9334, + "step": 4110 + }, + { + "epoch": 0.3169133518347209, + "grad_norm": 3.72426438331604, + "learning_rate": 7.992798253747202e-06, + "loss": 1.002, + "step": 4111 + }, + { + "epoch": 0.3169904409497379, + "grad_norm": 3.564457654953003, + "learning_rate": 7.991798056542747e-06, + "loss": 1.0832, + "step": 4112 + }, + { + "epoch": 0.31706753006475485, + "grad_norm": 3.688892126083374, + "learning_rate": 7.990797672815177e-06, + "loss": 1.0163, + "step": 4113 + }, + { + "epoch": 0.3171446191797718, + "grad_norm": 3.9110841751098633, + "learning_rate": 7.989797102626862e-06, + "loss": 1.015, + "step": 4114 + }, + { + "epoch": 0.3172217082947888, + "grad_norm": 3.6422502994537354, + "learning_rate": 7.988796346040182e-06, + "loss": 1.0146, + "step": 4115 + }, + { + "epoch": 0.3172987974098057, + "grad_norm": 3.7644057273864746, + "learning_rate": 7.987795403117528e-06, + "loss": 1.0263, + "step": 4116 + }, + { + "epoch": 0.3173758865248227, + "grad_norm": 3.885549306869507, + "learning_rate": 7.986794273921309e-06, + "loss": 1.0733, + "step": 4117 + }, + { + "epoch": 0.31745297563983965, + "grad_norm": 3.661712169647217, + "learning_rate": 7.985792958513932e-06, + "loss": 1.0752, + "step": 4118 + }, + { + "epoch": 0.3175300647548566, + "grad_norm": 3.5156235694885254, + "learning_rate": 7.98479145695783e-06, + "loss": 0.9757, + "step": 4119 + }, + { + "epoch": 0.3176071538698736, + "grad_norm": 3.432779312133789, + "learning_rate": 7.983789769315438e-06, + "loss": 1.0079, + "step": 4120 + }, + { + "epoch": 0.3176842429848905, + "grad_norm": 3.5986807346343994, + "learning_rate": 7.982787895649207e-06, + "loss": 1.0681, + "step": 4121 + }, + { + "epoch": 0.3177613320999075, + "grad_norm": 4.053248405456543, + "learning_rate": 7.981785836021601e-06, + "loss": 0.9929, + "step": 4122 + }, + { + "epoch": 0.31783842121492445, + "grad_norm": 4.073676109313965, + "learning_rate": 7.980783590495089e-06, + "loss": 0.9481, + "step": 4123 + }, + { + "epoch": 0.3179155103299414, + "grad_norm": 4.069306373596191, + "learning_rate": 7.979781159132157e-06, + "loss": 1.1544, + "step": 4124 + }, + { + "epoch": 0.3179925994449584, + "grad_norm": 3.5793960094451904, + "learning_rate": 7.978778541995304e-06, + "loss": 1.0315, + "step": 4125 + }, + { + "epoch": 0.3180696885599753, + "grad_norm": 3.4428999423980713, + "learning_rate": 7.977775739147038e-06, + "loss": 0.9997, + "step": 4126 + }, + { + "epoch": 0.3181467776749923, + "grad_norm": 3.8016507625579834, + "learning_rate": 7.976772750649874e-06, + "loss": 0.9788, + "step": 4127 + }, + { + "epoch": 0.31822386679000925, + "grad_norm": 3.3368539810180664, + "learning_rate": 7.975769576566348e-06, + "loss": 0.9229, + "step": 4128 + }, + { + "epoch": 0.3183009559050262, + "grad_norm": 3.78816556930542, + "learning_rate": 7.974766216959001e-06, + "loss": 1.0265, + "step": 4129 + }, + { + "epoch": 0.3183780450200432, + "grad_norm": 3.6755471229553223, + "learning_rate": 7.973762671890387e-06, + "loss": 1.0043, + "step": 4130 + }, + { + "epoch": 0.3184551341350601, + "grad_norm": 3.3952746391296387, + "learning_rate": 7.972758941423071e-06, + "loss": 0.895, + "step": 4131 + }, + { + "epoch": 0.3185322232500771, + "grad_norm": 3.5413053035736084, + "learning_rate": 7.971755025619632e-06, + "loss": 0.9635, + "step": 4132 + }, + { + "epoch": 0.31860931236509404, + "grad_norm": 3.619885206222534, + "learning_rate": 7.970750924542659e-06, + "loss": 1.0574, + "step": 4133 + }, + { + "epoch": 0.318686401480111, + "grad_norm": 3.1986031532287598, + "learning_rate": 7.96974663825475e-06, + "loss": 0.9205, + "step": 4134 + }, + { + "epoch": 0.318763490595128, + "grad_norm": 3.69963002204895, + "learning_rate": 7.968742166818521e-06, + "loss": 1.0614, + "step": 4135 + }, + { + "epoch": 0.3188405797101449, + "grad_norm": 3.788604736328125, + "learning_rate": 7.967737510296591e-06, + "loss": 1.0176, + "step": 4136 + }, + { + "epoch": 0.31891766882516187, + "grad_norm": 3.8696343898773193, + "learning_rate": 7.9667326687516e-06, + "loss": 1.0235, + "step": 4137 + }, + { + "epoch": 0.31899475794017884, + "grad_norm": 3.4079203605651855, + "learning_rate": 7.965727642246191e-06, + "loss": 0.9388, + "step": 4138 + }, + { + "epoch": 0.3190718470551958, + "grad_norm": 4.057232856750488, + "learning_rate": 7.964722430843021e-06, + "loss": 1.059, + "step": 4139 + }, + { + "epoch": 0.3191489361702128, + "grad_norm": 3.7363650798797607, + "learning_rate": 7.963717034604765e-06, + "loss": 1.0545, + "step": 4140 + }, + { + "epoch": 0.31922602528522975, + "grad_norm": 3.5568385124206543, + "learning_rate": 7.962711453594101e-06, + "loss": 0.9876, + "step": 4141 + }, + { + "epoch": 0.31930311440024667, + "grad_norm": 3.632174491882324, + "learning_rate": 7.961705687873722e-06, + "loss": 0.9774, + "step": 4142 + }, + { + "epoch": 0.31938020351526364, + "grad_norm": 4.495326042175293, + "learning_rate": 7.960699737506333e-06, + "loss": 1.1305, + "step": 4143 + }, + { + "epoch": 0.3194572926302806, + "grad_norm": 3.538480281829834, + "learning_rate": 7.959693602554648e-06, + "loss": 1.0082, + "step": 4144 + }, + { + "epoch": 0.3195343817452976, + "grad_norm": 3.430492877960205, + "learning_rate": 7.958687283081394e-06, + "loss": 1.1023, + "step": 4145 + }, + { + "epoch": 0.31961147086031455, + "grad_norm": 3.7447872161865234, + "learning_rate": 7.957680779149315e-06, + "loss": 1.0244, + "step": 4146 + }, + { + "epoch": 0.31968855997533147, + "grad_norm": 3.6742663383483887, + "learning_rate": 7.956674090821156e-06, + "loss": 0.9341, + "step": 4147 + }, + { + "epoch": 0.31976564909034844, + "grad_norm": 4.139322280883789, + "learning_rate": 7.955667218159679e-06, + "loss": 0.9498, + "step": 4148 + }, + { + "epoch": 0.3198427382053654, + "grad_norm": 3.798419952392578, + "learning_rate": 7.95466016122766e-06, + "loss": 1.0443, + "step": 4149 + }, + { + "epoch": 0.3199198273203824, + "grad_norm": 3.6184287071228027, + "learning_rate": 7.953652920087884e-06, + "loss": 0.9629, + "step": 4150 + }, + { + "epoch": 0.31999691643539935, + "grad_norm": 3.602494478225708, + "learning_rate": 7.952645494803145e-06, + "loss": 1.0702, + "step": 4151 + }, + { + "epoch": 0.32007400555041626, + "grad_norm": 4.118215084075928, + "learning_rate": 7.95163788543625e-06, + "loss": 0.9996, + "step": 4152 + }, + { + "epoch": 0.32015109466543323, + "grad_norm": 3.864248037338257, + "learning_rate": 7.950630092050022e-06, + "loss": 1.0801, + "step": 4153 + }, + { + "epoch": 0.3202281837804502, + "grad_norm": 3.5450005531311035, + "learning_rate": 7.949622114707288e-06, + "loss": 0.9193, + "step": 4154 + }, + { + "epoch": 0.3203052728954672, + "grad_norm": 3.6530864238739014, + "learning_rate": 7.948613953470892e-06, + "loss": 0.8586, + "step": 4155 + }, + { + "epoch": 0.32038236201048415, + "grad_norm": 3.8235666751861572, + "learning_rate": 7.947605608403688e-06, + "loss": 0.9473, + "step": 4156 + }, + { + "epoch": 0.32045945112550106, + "grad_norm": 4.016197681427002, + "learning_rate": 7.946597079568538e-06, + "loss": 1.0146, + "step": 4157 + }, + { + "epoch": 0.32053654024051803, + "grad_norm": 3.6410627365112305, + "learning_rate": 7.945588367028324e-06, + "loss": 0.9481, + "step": 4158 + }, + { + "epoch": 0.320613629355535, + "grad_norm": 5.766059875488281, + "learning_rate": 7.94457947084593e-06, + "loss": 0.9786, + "step": 4159 + }, + { + "epoch": 0.32069071847055197, + "grad_norm": 3.8257815837860107, + "learning_rate": 7.943570391084254e-06, + "loss": 1.1402, + "step": 4160 + }, + { + "epoch": 0.32076780758556894, + "grad_norm": 4.60271692276001, + "learning_rate": 7.942561127806212e-06, + "loss": 1.0461, + "step": 4161 + }, + { + "epoch": 0.32084489670058586, + "grad_norm": 3.0530004501342773, + "learning_rate": 7.941551681074723e-06, + "loss": 0.8957, + "step": 4162 + }, + { + "epoch": 0.32092198581560283, + "grad_norm": 3.847745180130005, + "learning_rate": 7.94054205095272e-06, + "loss": 0.9406, + "step": 4163 + }, + { + "epoch": 0.3209990749306198, + "grad_norm": 3.877429962158203, + "learning_rate": 7.939532237503151e-06, + "loss": 0.9846, + "step": 4164 + }, + { + "epoch": 0.32107616404563677, + "grad_norm": 3.6080520153045654, + "learning_rate": 7.93852224078897e-06, + "loss": 1.0752, + "step": 4165 + }, + { + "epoch": 0.32115325316065374, + "grad_norm": 3.4427030086517334, + "learning_rate": 7.937512060873149e-06, + "loss": 0.9899, + "step": 4166 + }, + { + "epoch": 0.32123034227567066, + "grad_norm": 3.60064697265625, + "learning_rate": 7.936501697818661e-06, + "loss": 0.9592, + "step": 4167 + }, + { + "epoch": 0.3213074313906876, + "grad_norm": 3.334470510482788, + "learning_rate": 7.935491151688504e-06, + "loss": 1.0138, + "step": 4168 + }, + { + "epoch": 0.3213845205057046, + "grad_norm": 3.6496033668518066, + "learning_rate": 7.934480422545674e-06, + "loss": 1.0665, + "step": 4169 + }, + { + "epoch": 0.32146160962072157, + "grad_norm": 3.650097131729126, + "learning_rate": 7.933469510453189e-06, + "loss": 1.085, + "step": 4170 + }, + { + "epoch": 0.32153869873573854, + "grad_norm": 3.6096320152282715, + "learning_rate": 7.932458415474073e-06, + "loss": 0.9372, + "step": 4171 + }, + { + "epoch": 0.32161578785075545, + "grad_norm": 4.109353065490723, + "learning_rate": 7.931447137671364e-06, + "loss": 1.0568, + "step": 4172 + }, + { + "epoch": 0.3216928769657724, + "grad_norm": 3.8872742652893066, + "learning_rate": 7.930435677108106e-06, + "loss": 1.0137, + "step": 4173 + }, + { + "epoch": 0.3217699660807894, + "grad_norm": 4.055763244628906, + "learning_rate": 7.929424033847362e-06, + "loss": 1.0044, + "step": 4174 + }, + { + "epoch": 0.32184705519580636, + "grad_norm": 3.5979840755462646, + "learning_rate": 7.9284122079522e-06, + "loss": 0.982, + "step": 4175 + }, + { + "epoch": 0.32192414431082333, + "grad_norm": 3.5764944553375244, + "learning_rate": 7.927400199485705e-06, + "loss": 0.9204, + "step": 4176 + }, + { + "epoch": 0.32200123342584025, + "grad_norm": 3.7426390647888184, + "learning_rate": 7.926388008510968e-06, + "loss": 0.9989, + "step": 4177 + }, + { + "epoch": 0.3220783225408572, + "grad_norm": 3.4947009086608887, + "learning_rate": 7.925375635091094e-06, + "loss": 0.959, + "step": 4178 + }, + { + "epoch": 0.3221554116558742, + "grad_norm": 4.316697597503662, + "learning_rate": 7.924363079289203e-06, + "loss": 1.0835, + "step": 4179 + }, + { + "epoch": 0.32223250077089116, + "grad_norm": 4.12495756149292, + "learning_rate": 7.923350341168416e-06, + "loss": 1.0846, + "step": 4180 + }, + { + "epoch": 0.32230958988590813, + "grad_norm": 3.6122055053710938, + "learning_rate": 7.922337420791879e-06, + "loss": 0.8974, + "step": 4181 + }, + { + "epoch": 0.32238667900092505, + "grad_norm": 3.608186960220337, + "learning_rate": 7.921324318222737e-06, + "loss": 1.066, + "step": 4182 + }, + { + "epoch": 0.322463768115942, + "grad_norm": 3.8409266471862793, + "learning_rate": 7.920311033524156e-06, + "loss": 1.0132, + "step": 4183 + }, + { + "epoch": 0.322540857230959, + "grad_norm": 4.176751136779785, + "learning_rate": 7.919297566759304e-06, + "loss": 1.0244, + "step": 4184 + }, + { + "epoch": 0.32261794634597596, + "grad_norm": 3.635542869567871, + "learning_rate": 7.918283917991367e-06, + "loss": 1.0494, + "step": 4185 + }, + { + "epoch": 0.32269503546099293, + "grad_norm": 3.756164073944092, + "learning_rate": 7.917270087283544e-06, + "loss": 1.0204, + "step": 4186 + }, + { + "epoch": 0.32277212457600984, + "grad_norm": 3.905221939086914, + "learning_rate": 7.91625607469904e-06, + "loss": 0.9397, + "step": 4187 + }, + { + "epoch": 0.3228492136910268, + "grad_norm": 3.6469991207122803, + "learning_rate": 7.915241880301075e-06, + "loss": 1.0839, + "step": 4188 + }, + { + "epoch": 0.3229263028060438, + "grad_norm": 3.7983455657958984, + "learning_rate": 7.914227504152874e-06, + "loss": 0.9868, + "step": 4189 + }, + { + "epoch": 0.32300339192106076, + "grad_norm": 3.3824074268341064, + "learning_rate": 7.913212946317684e-06, + "loss": 0.8911, + "step": 4190 + }, + { + "epoch": 0.3230804810360777, + "grad_norm": 3.6762890815734863, + "learning_rate": 7.912198206858752e-06, + "loss": 1.0045, + "step": 4191 + }, + { + "epoch": 0.32315757015109464, + "grad_norm": 4.075868129730225, + "learning_rate": 7.911183285839347e-06, + "loss": 1.0257, + "step": 4192 + }, + { + "epoch": 0.3232346592661116, + "grad_norm": 3.5539023876190186, + "learning_rate": 7.91016818332274e-06, + "loss": 1.0092, + "step": 4193 + }, + { + "epoch": 0.3233117483811286, + "grad_norm": 3.818380355834961, + "learning_rate": 7.909152899372218e-06, + "loss": 1.097, + "step": 4194 + }, + { + "epoch": 0.32338883749614555, + "grad_norm": 3.38431453704834, + "learning_rate": 7.908137434051083e-06, + "loss": 0.896, + "step": 4195 + }, + { + "epoch": 0.3234659266111625, + "grad_norm": 3.587066888809204, + "learning_rate": 7.907121787422638e-06, + "loss": 0.9987, + "step": 4196 + }, + { + "epoch": 0.32354301572617944, + "grad_norm": 4.137657165527344, + "learning_rate": 7.906105959550206e-06, + "loss": 0.9355, + "step": 4197 + }, + { + "epoch": 0.3236201048411964, + "grad_norm": 3.3995325565338135, + "learning_rate": 7.90508995049712e-06, + "loss": 0.9642, + "step": 4198 + }, + { + "epoch": 0.3236971939562134, + "grad_norm": 3.7008519172668457, + "learning_rate": 7.90407376032672e-06, + "loss": 1.0123, + "step": 4199 + }, + { + "epoch": 0.32377428307123035, + "grad_norm": 3.6094677448272705, + "learning_rate": 7.903057389102361e-06, + "loss": 0.9955, + "step": 4200 + }, + { + "epoch": 0.3238513721862473, + "grad_norm": 3.729969024658203, + "learning_rate": 7.902040836887413e-06, + "loss": 0.9154, + "step": 4201 + }, + { + "epoch": 0.32392846130126424, + "grad_norm": 3.7781825065612793, + "learning_rate": 7.901024103745244e-06, + "loss": 1.0113, + "step": 4202 + }, + { + "epoch": 0.3240055504162812, + "grad_norm": 3.7726991176605225, + "learning_rate": 7.90000718973925e-06, + "loss": 1.0299, + "step": 4203 + }, + { + "epoch": 0.3240826395312982, + "grad_norm": 3.447260618209839, + "learning_rate": 7.898990094932826e-06, + "loss": 1.0302, + "step": 4204 + }, + { + "epoch": 0.32415972864631515, + "grad_norm": 3.4710397720336914, + "learning_rate": 7.897972819389385e-06, + "loss": 0.8961, + "step": 4205 + }, + { + "epoch": 0.3242368177613321, + "grad_norm": 3.3661441802978516, + "learning_rate": 7.896955363172347e-06, + "loss": 0.9486, + "step": 4206 + }, + { + "epoch": 0.32431390687634903, + "grad_norm": 3.684475898742676, + "learning_rate": 7.895937726345145e-06, + "loss": 0.9692, + "step": 4207 + }, + { + "epoch": 0.324390995991366, + "grad_norm": 3.5266542434692383, + "learning_rate": 7.894919908971225e-06, + "loss": 1.0262, + "step": 4208 + }, + { + "epoch": 0.324468085106383, + "grad_norm": 3.6420974731445312, + "learning_rate": 7.893901911114041e-06, + "loss": 1.0388, + "step": 4209 + }, + { + "epoch": 0.32454517422139995, + "grad_norm": 3.7208940982818604, + "learning_rate": 7.892883732837062e-06, + "loss": 0.9965, + "step": 4210 + }, + { + "epoch": 0.3246222633364169, + "grad_norm": 3.839670419692993, + "learning_rate": 7.891865374203765e-06, + "loss": 1.0087, + "step": 4211 + }, + { + "epoch": 0.32469935245143383, + "grad_norm": 3.901395559310913, + "learning_rate": 7.890846835277638e-06, + "loss": 0.9884, + "step": 4212 + }, + { + "epoch": 0.3247764415664508, + "grad_norm": 3.6374828815460205, + "learning_rate": 7.889828116122183e-06, + "loss": 1.0152, + "step": 4213 + }, + { + "epoch": 0.3248535306814678, + "grad_norm": 4.1036763191223145, + "learning_rate": 7.888809216800913e-06, + "loss": 1.0062, + "step": 4214 + }, + { + "epoch": 0.32493061979648474, + "grad_norm": 3.640856981277466, + "learning_rate": 7.887790137377348e-06, + "loss": 0.9845, + "step": 4215 + }, + { + "epoch": 0.3250077089115017, + "grad_norm": 3.1671056747436523, + "learning_rate": 7.886770877915027e-06, + "loss": 0.892, + "step": 4216 + }, + { + "epoch": 0.32508479802651863, + "grad_norm": 3.5967583656311035, + "learning_rate": 7.885751438477489e-06, + "loss": 1.0032, + "step": 4217 + }, + { + "epoch": 0.3251618871415356, + "grad_norm": 3.60170316696167, + "learning_rate": 7.884731819128298e-06, + "loss": 0.9933, + "step": 4218 + }, + { + "epoch": 0.32523897625655257, + "grad_norm": 3.8272790908813477, + "learning_rate": 7.883712019931017e-06, + "loss": 1.0296, + "step": 4219 + }, + { + "epoch": 0.32531606537156954, + "grad_norm": 4.294167995452881, + "learning_rate": 7.882692040949226e-06, + "loss": 1.1175, + "step": 4220 + }, + { + "epoch": 0.3253931544865865, + "grad_norm": 3.4372246265411377, + "learning_rate": 7.881671882246518e-06, + "loss": 1.0131, + "step": 4221 + }, + { + "epoch": 0.3254702436016034, + "grad_norm": 4.079759120941162, + "learning_rate": 7.880651543886491e-06, + "loss": 0.9886, + "step": 4222 + }, + { + "epoch": 0.3255473327166204, + "grad_norm": 5.348727226257324, + "learning_rate": 7.87963102593276e-06, + "loss": 0.9807, + "step": 4223 + }, + { + "epoch": 0.32562442183163737, + "grad_norm": 3.5309953689575195, + "learning_rate": 7.878610328448948e-06, + "loss": 0.9316, + "step": 4224 + }, + { + "epoch": 0.32570151094665434, + "grad_norm": 3.289616346359253, + "learning_rate": 7.877589451498692e-06, + "loss": 0.9425, + "step": 4225 + }, + { + "epoch": 0.3257786000616713, + "grad_norm": 3.5728421211242676, + "learning_rate": 7.876568395145636e-06, + "loss": 0.8591, + "step": 4226 + }, + { + "epoch": 0.3258556891766883, + "grad_norm": 3.5462687015533447, + "learning_rate": 7.87554715945344e-06, + "loss": 0.875, + "step": 4227 + }, + { + "epoch": 0.3259327782917052, + "grad_norm": 3.9166672229766846, + "learning_rate": 7.87452574448577e-06, + "loss": 1.0411, + "step": 4228 + }, + { + "epoch": 0.32600986740672216, + "grad_norm": 3.8413209915161133, + "learning_rate": 7.873504150306308e-06, + "loss": 0.9692, + "step": 4229 + }, + { + "epoch": 0.32608695652173914, + "grad_norm": 4.090485095977783, + "learning_rate": 7.872482376978746e-06, + "loss": 1.0688, + "step": 4230 + }, + { + "epoch": 0.3261640456367561, + "grad_norm": 3.474355697631836, + "learning_rate": 7.87146042456678e-06, + "loss": 1.0016, + "step": 4231 + }, + { + "epoch": 0.3262411347517731, + "grad_norm": 3.8390350341796875, + "learning_rate": 7.870438293134133e-06, + "loss": 1.0577, + "step": 4232 + }, + { + "epoch": 0.32631822386679, + "grad_norm": 4.005039691925049, + "learning_rate": 7.869415982744524e-06, + "loss": 0.8633, + "step": 4233 + }, + { + "epoch": 0.32639531298180696, + "grad_norm": 3.2557785511016846, + "learning_rate": 7.868393493461688e-06, + "loss": 0.8518, + "step": 4234 + }, + { + "epoch": 0.32647240209682393, + "grad_norm": 3.4779703617095947, + "learning_rate": 7.867370825349375e-06, + "loss": 1.043, + "step": 4235 + }, + { + "epoch": 0.3265494912118409, + "grad_norm": 3.7263035774230957, + "learning_rate": 7.86634797847134e-06, + "loss": 1.0086, + "step": 4236 + }, + { + "epoch": 0.3266265803268579, + "grad_norm": 3.4993364810943604, + "learning_rate": 7.865324952891354e-06, + "loss": 0.9986, + "step": 4237 + }, + { + "epoch": 0.3267036694418748, + "grad_norm": 31.114559173583984, + "learning_rate": 7.864301748673197e-06, + "loss": 1.041, + "step": 4238 + }, + { + "epoch": 0.32678075855689176, + "grad_norm": 3.9260265827178955, + "learning_rate": 7.863278365880662e-06, + "loss": 1.1013, + "step": 4239 + }, + { + "epoch": 0.32685784767190873, + "grad_norm": 3.4418816566467285, + "learning_rate": 7.862254804577549e-06, + "loss": 0.9764, + "step": 4240 + }, + { + "epoch": 0.3269349367869257, + "grad_norm": 3.3301095962524414, + "learning_rate": 7.861231064827673e-06, + "loss": 1.0296, + "step": 4241 + }, + { + "epoch": 0.32701202590194267, + "grad_norm": 3.416653633117676, + "learning_rate": 7.86020714669486e-06, + "loss": 1.0018, + "step": 4242 + }, + { + "epoch": 0.3270891150169596, + "grad_norm": 3.7522192001342773, + "learning_rate": 7.859183050242945e-06, + "loss": 0.8415, + "step": 4243 + }, + { + "epoch": 0.32716620413197656, + "grad_norm": 3.878190279006958, + "learning_rate": 7.858158775535773e-06, + "loss": 1.0617, + "step": 4244 + }, + { + "epoch": 0.3272432932469935, + "grad_norm": 3.7095284461975098, + "learning_rate": 7.857134322637205e-06, + "loss": 1.0556, + "step": 4245 + }, + { + "epoch": 0.3273203823620105, + "grad_norm": 3.8554065227508545, + "learning_rate": 7.85610969161111e-06, + "loss": 1.0377, + "step": 4246 + }, + { + "epoch": 0.32739747147702747, + "grad_norm": 3.8045499324798584, + "learning_rate": 7.855084882521366e-06, + "loss": 1.073, + "step": 4247 + }, + { + "epoch": 0.3274745605920444, + "grad_norm": 4.000002384185791, + "learning_rate": 7.854059895431869e-06, + "loss": 1.0777, + "step": 4248 + }, + { + "epoch": 0.32755164970706135, + "grad_norm": 4.595569610595703, + "learning_rate": 7.853034730406516e-06, + "loss": 0.9705, + "step": 4249 + }, + { + "epoch": 0.3276287388220783, + "grad_norm": 3.743229627609253, + "learning_rate": 7.852009387509227e-06, + "loss": 1.0785, + "step": 4250 + }, + { + "epoch": 0.3277058279370953, + "grad_norm": 3.717169761657715, + "learning_rate": 7.850983866803923e-06, + "loss": 1.0172, + "step": 4251 + }, + { + "epoch": 0.32778291705211227, + "grad_norm": 3.679903984069824, + "learning_rate": 7.849958168354538e-06, + "loss": 0.9691, + "step": 4252 + }, + { + "epoch": 0.3278600061671292, + "grad_norm": 3.682389497756958, + "learning_rate": 7.848932292225025e-06, + "loss": 1.0234, + "step": 4253 + }, + { + "epoch": 0.32793709528214615, + "grad_norm": 4.109451770782471, + "learning_rate": 7.847906238479337e-06, + "loss": 1.0486, + "step": 4254 + }, + { + "epoch": 0.3280141843971631, + "grad_norm": 4.467040061950684, + "learning_rate": 7.846880007181443e-06, + "loss": 1.0206, + "step": 4255 + }, + { + "epoch": 0.3280912735121801, + "grad_norm": 3.9203736782073975, + "learning_rate": 7.845853598395327e-06, + "loss": 0.9341, + "step": 4256 + }, + { + "epoch": 0.32816836262719706, + "grad_norm": 3.5691542625427246, + "learning_rate": 7.844827012184978e-06, + "loss": 0.9578, + "step": 4257 + }, + { + "epoch": 0.328245451742214, + "grad_norm": 3.7378454208374023, + "learning_rate": 7.843800248614396e-06, + "loss": 1.04, + "step": 4258 + }, + { + "epoch": 0.32832254085723095, + "grad_norm": 3.8135650157928467, + "learning_rate": 7.8427733077476e-06, + "loss": 1.021, + "step": 4259 + }, + { + "epoch": 0.3283996299722479, + "grad_norm": 3.978492021560669, + "learning_rate": 7.84174618964861e-06, + "loss": 1.0348, + "step": 4260 + }, + { + "epoch": 0.3284767190872649, + "grad_norm": 3.6482443809509277, + "learning_rate": 7.840718894381464e-06, + "loss": 0.9996, + "step": 4261 + }, + { + "epoch": 0.32855380820228186, + "grad_norm": 3.369802474975586, + "learning_rate": 7.839691422010208e-06, + "loss": 0.9249, + "step": 4262 + }, + { + "epoch": 0.3286308973172988, + "grad_norm": 3.726884603500366, + "learning_rate": 7.838663772598897e-06, + "loss": 0.9568, + "step": 4263 + }, + { + "epoch": 0.32870798643231575, + "grad_norm": 3.5390725135803223, + "learning_rate": 7.837635946211603e-06, + "loss": 1.0817, + "step": 4264 + }, + { + "epoch": 0.3287850755473327, + "grad_norm": 4.038122177124023, + "learning_rate": 7.836607942912403e-06, + "loss": 1.0059, + "step": 4265 + }, + { + "epoch": 0.3288621646623497, + "grad_norm": 3.1766395568847656, + "learning_rate": 7.83557976276539e-06, + "loss": 0.8842, + "step": 4266 + }, + { + "epoch": 0.32893925377736666, + "grad_norm": 3.975059986114502, + "learning_rate": 7.834551405834665e-06, + "loss": 1.1503, + "step": 4267 + }, + { + "epoch": 0.3290163428923836, + "grad_norm": 4.144785404205322, + "learning_rate": 7.833522872184338e-06, + "loss": 0.974, + "step": 4268 + }, + { + "epoch": 0.32909343200740054, + "grad_norm": 3.740612268447876, + "learning_rate": 7.832494161878537e-06, + "loss": 0.956, + "step": 4269 + }, + { + "epoch": 0.3291705211224175, + "grad_norm": 3.4083251953125, + "learning_rate": 7.831465274981395e-06, + "loss": 0.9164, + "step": 4270 + }, + { + "epoch": 0.3292476102374345, + "grad_norm": 3.277799367904663, + "learning_rate": 7.830436211557057e-06, + "loss": 0.9708, + "step": 4271 + }, + { + "epoch": 0.32932469935245146, + "grad_norm": 3.846964120864868, + "learning_rate": 7.82940697166968e-06, + "loss": 0.9867, + "step": 4272 + }, + { + "epoch": 0.32940178846746837, + "grad_norm": 3.877805233001709, + "learning_rate": 7.828377555383433e-06, + "loss": 1.0738, + "step": 4273 + }, + { + "epoch": 0.32947887758248534, + "grad_norm": 3.754934310913086, + "learning_rate": 7.827347962762495e-06, + "loss": 1.0322, + "step": 4274 + }, + { + "epoch": 0.3295559666975023, + "grad_norm": 4.1259284019470215, + "learning_rate": 7.826318193871052e-06, + "loss": 1.0407, + "step": 4275 + }, + { + "epoch": 0.3296330558125193, + "grad_norm": 4.191363334655762, + "learning_rate": 7.825288248773309e-06, + "loss": 1.0295, + "step": 4276 + }, + { + "epoch": 0.32971014492753625, + "grad_norm": 3.45347261428833, + "learning_rate": 7.824258127533477e-06, + "loss": 0.9255, + "step": 4277 + }, + { + "epoch": 0.32978723404255317, + "grad_norm": 3.499629497528076, + "learning_rate": 7.823227830215776e-06, + "loss": 1.0538, + "step": 4278 + }, + { + "epoch": 0.32986432315757014, + "grad_norm": 4.018231391906738, + "learning_rate": 7.822197356884442e-06, + "loss": 1.0472, + "step": 4279 + }, + { + "epoch": 0.3299414122725871, + "grad_norm": 3.717026472091675, + "learning_rate": 7.82116670760372e-06, + "loss": 0.9186, + "step": 4280 + }, + { + "epoch": 0.3300185013876041, + "grad_norm": 4.538501262664795, + "learning_rate": 7.820135882437866e-06, + "loss": 1.1079, + "step": 4281 + }, + { + "epoch": 0.33009559050262105, + "grad_norm": 3.6810877323150635, + "learning_rate": 7.819104881451145e-06, + "loss": 0.9466, + "step": 4282 + }, + { + "epoch": 0.33017267961763797, + "grad_norm": 3.926638603210449, + "learning_rate": 7.818073704707834e-06, + "loss": 0.9839, + "step": 4283 + }, + { + "epoch": 0.33024976873265494, + "grad_norm": 3.823995590209961, + "learning_rate": 7.817042352272224e-06, + "loss": 1.0753, + "step": 4284 + }, + { + "epoch": 0.3303268578476719, + "grad_norm": 3.6535205841064453, + "learning_rate": 7.816010824208613e-06, + "loss": 1.0285, + "step": 4285 + }, + { + "epoch": 0.3304039469626889, + "grad_norm": 3.4686245918273926, + "learning_rate": 7.814979120581311e-06, + "loss": 0.9721, + "step": 4286 + }, + { + "epoch": 0.33048103607770585, + "grad_norm": 3.8238701820373535, + "learning_rate": 7.81394724145464e-06, + "loss": 1.0513, + "step": 4287 + }, + { + "epoch": 0.33055812519272276, + "grad_norm": 3.5260727405548096, + "learning_rate": 7.812915186892933e-06, + "loss": 0.9242, + "step": 4288 + }, + { + "epoch": 0.33063521430773973, + "grad_norm": 3.374612331390381, + "learning_rate": 7.811882956960532e-06, + "loss": 0.984, + "step": 4289 + }, + { + "epoch": 0.3307123034227567, + "grad_norm": 3.7520065307617188, + "learning_rate": 7.810850551721793e-06, + "loss": 0.9958, + "step": 4290 + }, + { + "epoch": 0.3307893925377737, + "grad_norm": 3.351409435272217, + "learning_rate": 7.809817971241079e-06, + "loss": 0.9151, + "step": 4291 + }, + { + "epoch": 0.33086648165279064, + "grad_norm": 3.3581125736236572, + "learning_rate": 7.808785215582766e-06, + "loss": 0.9481, + "step": 4292 + }, + { + "epoch": 0.33094357076780756, + "grad_norm": 3.93084716796875, + "learning_rate": 7.807752284811243e-06, + "loss": 1.1273, + "step": 4293 + }, + { + "epoch": 0.33102065988282453, + "grad_norm": 4.633861064910889, + "learning_rate": 7.806719178990906e-06, + "loss": 1.0091, + "step": 4294 + }, + { + "epoch": 0.3310977489978415, + "grad_norm": 3.382524013519287, + "learning_rate": 7.805685898186164e-06, + "loss": 0.958, + "step": 4295 + }, + { + "epoch": 0.33117483811285847, + "grad_norm": 3.2215330600738525, + "learning_rate": 7.804652442461438e-06, + "loss": 0.9924, + "step": 4296 + }, + { + "epoch": 0.33125192722787544, + "grad_norm": 3.757098913192749, + "learning_rate": 7.80361881188116e-06, + "loss": 1.0064, + "step": 4297 + }, + { + "epoch": 0.33132901634289236, + "grad_norm": 3.6601150035858154, + "learning_rate": 7.802585006509766e-06, + "loss": 0.9142, + "step": 4298 + }, + { + "epoch": 0.33140610545790933, + "grad_norm": 4.117657661437988, + "learning_rate": 7.801551026411715e-06, + "loss": 1.1443, + "step": 4299 + }, + { + "epoch": 0.3314831945729263, + "grad_norm": 3.925748109817505, + "learning_rate": 7.800516871651465e-06, + "loss": 0.9226, + "step": 4300 + }, + { + "epoch": 0.33156028368794327, + "grad_norm": 3.6940126419067383, + "learning_rate": 7.799482542293491e-06, + "loss": 1.1027, + "step": 4301 + }, + { + "epoch": 0.33163737280296024, + "grad_norm": 3.6253130435943604, + "learning_rate": 7.798448038402283e-06, + "loss": 0.9479, + "step": 4302 + }, + { + "epoch": 0.33171446191797715, + "grad_norm": 3.360285758972168, + "learning_rate": 7.79741336004233e-06, + "loss": 0.9113, + "step": 4303 + }, + { + "epoch": 0.3317915510329941, + "grad_norm": 3.5105605125427246, + "learning_rate": 7.796378507278144e-06, + "loss": 0.9038, + "step": 4304 + }, + { + "epoch": 0.3318686401480111, + "grad_norm": 4.047402858734131, + "learning_rate": 7.79534348017424e-06, + "loss": 0.9321, + "step": 4305 + }, + { + "epoch": 0.33194572926302807, + "grad_norm": 3.2786002159118652, + "learning_rate": 7.794308278795148e-06, + "loss": 0.9316, + "step": 4306 + }, + { + "epoch": 0.33202281837804504, + "grad_norm": 3.5382752418518066, + "learning_rate": 7.793272903205406e-06, + "loss": 1.0196, + "step": 4307 + }, + { + "epoch": 0.33209990749306195, + "grad_norm": 3.999952793121338, + "learning_rate": 7.792237353469567e-06, + "loss": 0.9702, + "step": 4308 + }, + { + "epoch": 0.3321769966080789, + "grad_norm": 3.6367757320404053, + "learning_rate": 7.791201629652189e-06, + "loss": 0.9877, + "step": 4309 + }, + { + "epoch": 0.3322540857230959, + "grad_norm": 5.327296733856201, + "learning_rate": 7.790165731817847e-06, + "loss": 0.8918, + "step": 4310 + }, + { + "epoch": 0.33233117483811286, + "grad_norm": 3.5317442417144775, + "learning_rate": 7.789129660031121e-06, + "loss": 0.9834, + "step": 4311 + }, + { + "epoch": 0.33240826395312983, + "grad_norm": 3.6593918800354004, + "learning_rate": 7.788093414356605e-06, + "loss": 1.0577, + "step": 4312 + }, + { + "epoch": 0.3324853530681468, + "grad_norm": 3.4909262657165527, + "learning_rate": 7.787056994858906e-06, + "loss": 0.9607, + "step": 4313 + }, + { + "epoch": 0.3325624421831637, + "grad_norm": 3.5621049404144287, + "learning_rate": 7.786020401602638e-06, + "loss": 0.9476, + "step": 4314 + }, + { + "epoch": 0.3326395312981807, + "grad_norm": 3.4073727130889893, + "learning_rate": 7.784983634652425e-06, + "loss": 1.0184, + "step": 4315 + }, + { + "epoch": 0.33271662041319766, + "grad_norm": 4.091675281524658, + "learning_rate": 7.783946694072908e-06, + "loss": 1.0435, + "step": 4316 + }, + { + "epoch": 0.33279370952821463, + "grad_norm": 3.9657843112945557, + "learning_rate": 7.782909579928733e-06, + "loss": 1.0276, + "step": 4317 + }, + { + "epoch": 0.3328707986432316, + "grad_norm": 3.8811726570129395, + "learning_rate": 7.78187229228456e-06, + "loss": 1.022, + "step": 4318 + }, + { + "epoch": 0.3329478877582485, + "grad_norm": 4.559733867645264, + "learning_rate": 7.780834831205056e-06, + "loss": 1.0362, + "step": 4319 + }, + { + "epoch": 0.3330249768732655, + "grad_norm": 3.447314500808716, + "learning_rate": 7.779797196754901e-06, + "loss": 0.9834, + "step": 4320 + }, + { + "epoch": 0.33310206598828246, + "grad_norm": 3.546065330505371, + "learning_rate": 7.77875938899879e-06, + "loss": 0.9406, + "step": 4321 + }, + { + "epoch": 0.33317915510329943, + "grad_norm": 3.527249336242676, + "learning_rate": 7.777721408001421e-06, + "loss": 1.0047, + "step": 4322 + }, + { + "epoch": 0.3332562442183164, + "grad_norm": 3.692897319793701, + "learning_rate": 7.77668325382751e-06, + "loss": 1.1071, + "step": 4323 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 3.6986513137817383, + "learning_rate": 7.775644926541778e-06, + "loss": 0.9281, + "step": 4324 + }, + { + "epoch": 0.3334104224483503, + "grad_norm": 3.8168718814849854, + "learning_rate": 7.77460642620896e-06, + "loss": 1.0708, + "step": 4325 + }, + { + "epoch": 0.33348751156336726, + "grad_norm": 3.7896907329559326, + "learning_rate": 7.773567752893803e-06, + "loss": 0.9714, + "step": 4326 + }, + { + "epoch": 0.3335646006783842, + "grad_norm": 3.77909255027771, + "learning_rate": 7.772528906661059e-06, + "loss": 0.9923, + "step": 4327 + }, + { + "epoch": 0.3336416897934012, + "grad_norm": 3.9389283657073975, + "learning_rate": 7.771489887575498e-06, + "loss": 1.0688, + "step": 4328 + }, + { + "epoch": 0.3337187789084181, + "grad_norm": 3.601837635040283, + "learning_rate": 7.770450695701896e-06, + "loss": 1.0515, + "step": 4329 + }, + { + "epoch": 0.3337958680234351, + "grad_norm": 4.004146099090576, + "learning_rate": 7.769411331105044e-06, + "loss": 1.0052, + "step": 4330 + }, + { + "epoch": 0.33387295713845205, + "grad_norm": 3.7516090869903564, + "learning_rate": 7.768371793849736e-06, + "loss": 1.0585, + "step": 4331 + }, + { + "epoch": 0.333950046253469, + "grad_norm": 3.6058356761932373, + "learning_rate": 7.767332084000784e-06, + "loss": 1.0234, + "step": 4332 + }, + { + "epoch": 0.334027135368486, + "grad_norm": 3.795377016067505, + "learning_rate": 7.766292201623012e-06, + "loss": 1.0903, + "step": 4333 + }, + { + "epoch": 0.3341042244835029, + "grad_norm": 3.492107391357422, + "learning_rate": 7.765252146781245e-06, + "loss": 0.8837, + "step": 4334 + }, + { + "epoch": 0.3341813135985199, + "grad_norm": 3.525789976119995, + "learning_rate": 7.76421191954033e-06, + "loss": 0.8463, + "step": 4335 + }, + { + "epoch": 0.33425840271353685, + "grad_norm": 4.0740509033203125, + "learning_rate": 7.763171519965118e-06, + "loss": 1.152, + "step": 4336 + }, + { + "epoch": 0.3343354918285538, + "grad_norm": 3.6718363761901855, + "learning_rate": 7.762130948120472e-06, + "loss": 0.9143, + "step": 4337 + }, + { + "epoch": 0.3344125809435708, + "grad_norm": 3.7916324138641357, + "learning_rate": 7.761090204071267e-06, + "loss": 0.9976, + "step": 4338 + }, + { + "epoch": 0.3344896700585877, + "grad_norm": 3.5439155101776123, + "learning_rate": 7.760049287882388e-06, + "loss": 0.9737, + "step": 4339 + }, + { + "epoch": 0.3345667591736047, + "grad_norm": 3.6218690872192383, + "learning_rate": 7.75900819961873e-06, + "loss": 0.9956, + "step": 4340 + }, + { + "epoch": 0.33464384828862165, + "grad_norm": 3.703711986541748, + "learning_rate": 7.757966939345201e-06, + "loss": 0.854, + "step": 4341 + }, + { + "epoch": 0.3347209374036386, + "grad_norm": 3.890967845916748, + "learning_rate": 7.756925507126717e-06, + "loss": 1.0805, + "step": 4342 + }, + { + "epoch": 0.3347980265186556, + "grad_norm": 3.6906652450561523, + "learning_rate": 7.755883903028205e-06, + "loss": 1.0733, + "step": 4343 + }, + { + "epoch": 0.3348751156336725, + "grad_norm": 3.4065797328948975, + "learning_rate": 7.754842127114606e-06, + "loss": 0.9758, + "step": 4344 + }, + { + "epoch": 0.3349522047486895, + "grad_norm": 3.7064595222473145, + "learning_rate": 7.753800179450867e-06, + "loss": 1.0737, + "step": 4345 + }, + { + "epoch": 0.33502929386370645, + "grad_norm": 3.8116419315338135, + "learning_rate": 7.752758060101951e-06, + "loss": 1.0783, + "step": 4346 + }, + { + "epoch": 0.3351063829787234, + "grad_norm": 3.957881450653076, + "learning_rate": 7.751715769132823e-06, + "loss": 1.0667, + "step": 4347 + }, + { + "epoch": 0.3351834720937404, + "grad_norm": 3.905587911605835, + "learning_rate": 7.750673306608473e-06, + "loss": 1.0612, + "step": 4348 + }, + { + "epoch": 0.3352605612087573, + "grad_norm": 4.058816432952881, + "learning_rate": 7.749630672593886e-06, + "loss": 1.2289, + "step": 4349 + }, + { + "epoch": 0.3353376503237743, + "grad_norm": 3.449509382247925, + "learning_rate": 7.748587867154068e-06, + "loss": 0.9291, + "step": 4350 + }, + { + "epoch": 0.33541473943879124, + "grad_norm": 3.368961811065674, + "learning_rate": 7.747544890354031e-06, + "loss": 0.9528, + "step": 4351 + }, + { + "epoch": 0.3354918285538082, + "grad_norm": 3.8779165744781494, + "learning_rate": 7.7465017422588e-06, + "loss": 1.075, + "step": 4352 + }, + { + "epoch": 0.3355689176688252, + "grad_norm": 3.9211997985839844, + "learning_rate": 7.745458422933409e-06, + "loss": 0.9959, + "step": 4353 + }, + { + "epoch": 0.3356460067838421, + "grad_norm": 3.9596736431121826, + "learning_rate": 7.744414932442908e-06, + "loss": 1.0908, + "step": 4354 + }, + { + "epoch": 0.33572309589885907, + "grad_norm": 4.090301036834717, + "learning_rate": 7.743371270852346e-06, + "loss": 1.2187, + "step": 4355 + }, + { + "epoch": 0.33580018501387604, + "grad_norm": 3.7745120525360107, + "learning_rate": 7.742327438226796e-06, + "loss": 0.9202, + "step": 4356 + }, + { + "epoch": 0.335877274128893, + "grad_norm": 3.6797232627868652, + "learning_rate": 7.74128343463133e-06, + "loss": 0.8468, + "step": 4357 + }, + { + "epoch": 0.33595436324391, + "grad_norm": 3.7442944049835205, + "learning_rate": 7.740239260131042e-06, + "loss": 0.974, + "step": 4358 + }, + { + "epoch": 0.3360314523589269, + "grad_norm": 3.405045509338379, + "learning_rate": 7.739194914791028e-06, + "loss": 0.9661, + "step": 4359 + }, + { + "epoch": 0.33610854147394387, + "grad_norm": 3.6131653785705566, + "learning_rate": 7.738150398676397e-06, + "loss": 1.014, + "step": 4360 + }, + { + "epoch": 0.33618563058896084, + "grad_norm": 4.105853080749512, + "learning_rate": 7.73710571185227e-06, + "loss": 1.2013, + "step": 4361 + }, + { + "epoch": 0.3362627197039778, + "grad_norm": 3.5816614627838135, + "learning_rate": 7.736060854383778e-06, + "loss": 0.8909, + "step": 4362 + }, + { + "epoch": 0.3363398088189948, + "grad_norm": 3.8828015327453613, + "learning_rate": 7.735015826336064e-06, + "loss": 1.0176, + "step": 4363 + }, + { + "epoch": 0.3364168979340117, + "grad_norm": 3.9706432819366455, + "learning_rate": 7.733970627774275e-06, + "loss": 1.0422, + "step": 4364 + }, + { + "epoch": 0.33649398704902866, + "grad_norm": 4.061374664306641, + "learning_rate": 7.732925258763577e-06, + "loss": 0.9859, + "step": 4365 + }, + { + "epoch": 0.33657107616404563, + "grad_norm": 3.8068432807922363, + "learning_rate": 7.731879719369145e-06, + "loss": 1.1827, + "step": 4366 + }, + { + "epoch": 0.3366481652790626, + "grad_norm": 3.7083537578582764, + "learning_rate": 7.73083400965616e-06, + "loss": 0.994, + "step": 4367 + }, + { + "epoch": 0.3367252543940796, + "grad_norm": 3.8531298637390137, + "learning_rate": 7.72978812968982e-06, + "loss": 0.8946, + "step": 4368 + }, + { + "epoch": 0.3368023435090965, + "grad_norm": 3.4262845516204834, + "learning_rate": 7.728742079535326e-06, + "loss": 0.9598, + "step": 4369 + }, + { + "epoch": 0.33687943262411346, + "grad_norm": 3.567720413208008, + "learning_rate": 7.727695859257896e-06, + "loss": 0.9415, + "step": 4370 + }, + { + "epoch": 0.33695652173913043, + "grad_norm": 3.5674967765808105, + "learning_rate": 7.726649468922756e-06, + "loss": 0.9562, + "step": 4371 + }, + { + "epoch": 0.3370336108541474, + "grad_norm": 4.217917442321777, + "learning_rate": 7.725602908595144e-06, + "loss": 1.0205, + "step": 4372 + }, + { + "epoch": 0.3371106999691644, + "grad_norm": 3.5196731090545654, + "learning_rate": 7.724556178340307e-06, + "loss": 1.0654, + "step": 4373 + }, + { + "epoch": 0.3371877890841813, + "grad_norm": 4.0760369300842285, + "learning_rate": 7.723509278223504e-06, + "loss": 0.9772, + "step": 4374 + }, + { + "epoch": 0.33726487819919826, + "grad_norm": 3.8422629833221436, + "learning_rate": 7.722462208310002e-06, + "loss": 0.9518, + "step": 4375 + }, + { + "epoch": 0.33734196731421523, + "grad_norm": 3.41591739654541, + "learning_rate": 7.721414968665081e-06, + "loss": 1.0063, + "step": 4376 + }, + { + "epoch": 0.3374190564292322, + "grad_norm": 3.6463801860809326, + "learning_rate": 7.720367559354032e-06, + "loss": 1.0955, + "step": 4377 + }, + { + "epoch": 0.33749614554424917, + "grad_norm": 4.359663963317871, + "learning_rate": 7.719319980442154e-06, + "loss": 0.9804, + "step": 4378 + }, + { + "epoch": 0.3375732346592661, + "grad_norm": 3.783148765563965, + "learning_rate": 7.718272231994759e-06, + "loss": 0.9911, + "step": 4379 + }, + { + "epoch": 0.33765032377428306, + "grad_norm": 4.130781173706055, + "learning_rate": 7.71722431407717e-06, + "loss": 1.0291, + "step": 4380 + }, + { + "epoch": 0.3377274128893, + "grad_norm": 3.954774856567383, + "learning_rate": 7.716176226754716e-06, + "loss": 0.9907, + "step": 4381 + }, + { + "epoch": 0.337804502004317, + "grad_norm": 3.640958547592163, + "learning_rate": 7.715127970092742e-06, + "loss": 0.9913, + "step": 4382 + }, + { + "epoch": 0.33788159111933397, + "grad_norm": 3.703672170639038, + "learning_rate": 7.714079544156602e-06, + "loss": 0.9763, + "step": 4383 + }, + { + "epoch": 0.3379586802343509, + "grad_norm": 3.6405487060546875, + "learning_rate": 7.71303094901166e-06, + "loss": 0.8953, + "step": 4384 + }, + { + "epoch": 0.33803576934936785, + "grad_norm": 3.752634286880493, + "learning_rate": 7.711982184723287e-06, + "loss": 0.8807, + "step": 4385 + }, + { + "epoch": 0.3381128584643848, + "grad_norm": 3.402266025543213, + "learning_rate": 7.71093325135687e-06, + "loss": 0.9654, + "step": 4386 + }, + { + "epoch": 0.3381899475794018, + "grad_norm": 3.3396193981170654, + "learning_rate": 7.709884148977808e-06, + "loss": 0.8923, + "step": 4387 + }, + { + "epoch": 0.33826703669441877, + "grad_norm": 4.080255031585693, + "learning_rate": 7.708834877651504e-06, + "loss": 0.9685, + "step": 4388 + }, + { + "epoch": 0.3383441258094357, + "grad_norm": 3.752744436264038, + "learning_rate": 7.70778543744337e-06, + "loss": 1.0687, + "step": 4389 + }, + { + "epoch": 0.33842121492445265, + "grad_norm": 4.078830718994141, + "learning_rate": 7.706735828418844e-06, + "loss": 1.1614, + "step": 4390 + }, + { + "epoch": 0.3384983040394696, + "grad_norm": 3.583847999572754, + "learning_rate": 7.705686050643354e-06, + "loss": 1.0587, + "step": 4391 + }, + { + "epoch": 0.3385753931544866, + "grad_norm": 3.779529094696045, + "learning_rate": 7.704636104182353e-06, + "loss": 1.0217, + "step": 4392 + }, + { + "epoch": 0.33865248226950356, + "grad_norm": 3.4878971576690674, + "learning_rate": 7.703585989101298e-06, + "loss": 0.8588, + "step": 4393 + }, + { + "epoch": 0.3387295713845205, + "grad_norm": 3.9853031635284424, + "learning_rate": 7.70253570546566e-06, + "loss": 1.0825, + "step": 4394 + }, + { + "epoch": 0.33880666049953745, + "grad_norm": 3.818143606185913, + "learning_rate": 7.701485253340917e-06, + "loss": 1.0648, + "step": 4395 + }, + { + "epoch": 0.3388837496145544, + "grad_norm": 3.480123281478882, + "learning_rate": 7.70043463279256e-06, + "loss": 0.9691, + "step": 4396 + }, + { + "epoch": 0.3389608387295714, + "grad_norm": 3.3591983318328857, + "learning_rate": 7.69938384388609e-06, + "loss": 0.8988, + "step": 4397 + }, + { + "epoch": 0.33903792784458836, + "grad_norm": 3.428823232650757, + "learning_rate": 7.698332886687016e-06, + "loss": 0.8853, + "step": 4398 + }, + { + "epoch": 0.33911501695960533, + "grad_norm": 3.8618509769439697, + "learning_rate": 7.697281761260863e-06, + "loss": 1.0317, + "step": 4399 + }, + { + "epoch": 0.33919210607462225, + "grad_norm": 3.7116105556488037, + "learning_rate": 7.696230467673163e-06, + "loss": 1.0838, + "step": 4400 + }, + { + "epoch": 0.3392691951896392, + "grad_norm": 3.5771796703338623, + "learning_rate": 7.695179005989454e-06, + "loss": 0.995, + "step": 4401 + }, + { + "epoch": 0.3393462843046562, + "grad_norm": 3.8581273555755615, + "learning_rate": 7.694127376275295e-06, + "loss": 0.9682, + "step": 4402 + }, + { + "epoch": 0.33942337341967316, + "grad_norm": 3.888493776321411, + "learning_rate": 7.693075578596247e-06, + "loss": 1.0419, + "step": 4403 + }, + { + "epoch": 0.33950046253469013, + "grad_norm": 4.1317877769470215, + "learning_rate": 7.692023613017884e-06, + "loss": 1.0923, + "step": 4404 + }, + { + "epoch": 0.33957755164970704, + "grad_norm": 3.4401021003723145, + "learning_rate": 7.690971479605793e-06, + "loss": 0.9184, + "step": 4405 + }, + { + "epoch": 0.339654640764724, + "grad_norm": 3.6244630813598633, + "learning_rate": 7.689919178425565e-06, + "loss": 0.9051, + "step": 4406 + }, + { + "epoch": 0.339731729879741, + "grad_norm": 3.5508549213409424, + "learning_rate": 7.688866709542809e-06, + "loss": 1.0023, + "step": 4407 + }, + { + "epoch": 0.33980881899475796, + "grad_norm": 3.756072521209717, + "learning_rate": 7.687814073023138e-06, + "loss": 0.9766, + "step": 4408 + }, + { + "epoch": 0.3398859081097749, + "grad_norm": 4.206667423248291, + "learning_rate": 7.686761268932182e-06, + "loss": 1.0179, + "step": 4409 + }, + { + "epoch": 0.33996299722479184, + "grad_norm": 3.559318780899048, + "learning_rate": 7.685708297335575e-06, + "loss": 1.0625, + "step": 4410 + }, + { + "epoch": 0.3400400863398088, + "grad_norm": 3.7180213928222656, + "learning_rate": 7.684655158298963e-06, + "loss": 1.0346, + "step": 4411 + }, + { + "epoch": 0.3401171754548258, + "grad_norm": 3.8383474349975586, + "learning_rate": 7.68360185188801e-06, + "loss": 0.945, + "step": 4412 + }, + { + "epoch": 0.34019426456984275, + "grad_norm": 3.727611780166626, + "learning_rate": 7.682548378168379e-06, + "loss": 1.0908, + "step": 4413 + }, + { + "epoch": 0.3402713536848597, + "grad_norm": 3.4572296142578125, + "learning_rate": 7.681494737205748e-06, + "loss": 0.9101, + "step": 4414 + }, + { + "epoch": 0.34034844279987664, + "grad_norm": 4.0167741775512695, + "learning_rate": 7.680440929065811e-06, + "loss": 1.0263, + "step": 4415 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 4.128690719604492, + "learning_rate": 7.679386953814262e-06, + "loss": 1.1199, + "step": 4416 + }, + { + "epoch": 0.3405026210299106, + "grad_norm": 3.2454535961151123, + "learning_rate": 7.678332811516815e-06, + "loss": 0.9708, + "step": 4417 + }, + { + "epoch": 0.34057971014492755, + "grad_norm": 3.2851145267486572, + "learning_rate": 7.677278502239187e-06, + "loss": 0.8788, + "step": 4418 + }, + { + "epoch": 0.3406567992599445, + "grad_norm": 3.677515983581543, + "learning_rate": 7.676224026047112e-06, + "loss": 0.9379, + "step": 4419 + }, + { + "epoch": 0.34073388837496144, + "grad_norm": 3.384584426879883, + "learning_rate": 7.67516938300633e-06, + "loss": 0.8972, + "step": 4420 + }, + { + "epoch": 0.3408109774899784, + "grad_norm": 3.8261232376098633, + "learning_rate": 7.674114573182589e-06, + "loss": 1.0569, + "step": 4421 + }, + { + "epoch": 0.3408880666049954, + "grad_norm": 3.638108491897583, + "learning_rate": 7.673059596641657e-06, + "loss": 1.0784, + "step": 4422 + }, + { + "epoch": 0.34096515572001235, + "grad_norm": 3.599555730819702, + "learning_rate": 7.672004453449301e-06, + "loss": 1.0344, + "step": 4423 + }, + { + "epoch": 0.3410422448350293, + "grad_norm": 3.3416149616241455, + "learning_rate": 7.670949143671306e-06, + "loss": 0.8663, + "step": 4424 + }, + { + "epoch": 0.34111933395004623, + "grad_norm": 3.894286632537842, + "learning_rate": 7.669893667373466e-06, + "loss": 0.9725, + "step": 4425 + }, + { + "epoch": 0.3411964230650632, + "grad_norm": 3.740086078643799, + "learning_rate": 7.668838024621585e-06, + "loss": 0.9731, + "step": 4426 + }, + { + "epoch": 0.3412735121800802, + "grad_norm": 3.730048179626465, + "learning_rate": 7.667782215481474e-06, + "loss": 0.912, + "step": 4427 + }, + { + "epoch": 0.34135060129509714, + "grad_norm": 3.557861328125, + "learning_rate": 7.66672624001896e-06, + "loss": 0.9324, + "step": 4428 + }, + { + "epoch": 0.3414276904101141, + "grad_norm": 3.767613649368286, + "learning_rate": 7.665670098299874e-06, + "loss": 0.8642, + "step": 4429 + }, + { + "epoch": 0.34150477952513103, + "grad_norm": 4.100453853607178, + "learning_rate": 7.664613790390065e-06, + "loss": 1.0388, + "step": 4430 + }, + { + "epoch": 0.341581868640148, + "grad_norm": 3.8830795288085938, + "learning_rate": 7.663557316355386e-06, + "loss": 1.005, + "step": 4431 + }, + { + "epoch": 0.34165895775516497, + "grad_norm": 3.5792341232299805, + "learning_rate": 7.662500676261703e-06, + "loss": 0.954, + "step": 4432 + }, + { + "epoch": 0.34173604687018194, + "grad_norm": 3.8511476516723633, + "learning_rate": 7.661443870174892e-06, + "loss": 1.0085, + "step": 4433 + }, + { + "epoch": 0.3418131359851989, + "grad_norm": 3.9277069568634033, + "learning_rate": 7.66038689816084e-06, + "loss": 1.0202, + "step": 4434 + }, + { + "epoch": 0.3418902251002158, + "grad_norm": 3.5222573280334473, + "learning_rate": 7.659329760285443e-06, + "loss": 0.9149, + "step": 4435 + }, + { + "epoch": 0.3419673142152328, + "grad_norm": 3.536393404006958, + "learning_rate": 7.658272456614609e-06, + "loss": 0.9427, + "step": 4436 + }, + { + "epoch": 0.34204440333024977, + "grad_norm": 4.047040939331055, + "learning_rate": 7.657214987214254e-06, + "loss": 1.0167, + "step": 4437 + }, + { + "epoch": 0.34212149244526674, + "grad_norm": 3.8045451641082764, + "learning_rate": 7.65615735215031e-06, + "loss": 1.0209, + "step": 4438 + }, + { + "epoch": 0.3421985815602837, + "grad_norm": 3.7481818199157715, + "learning_rate": 7.655099551488708e-06, + "loss": 1.0584, + "step": 4439 + }, + { + "epoch": 0.3422756706753006, + "grad_norm": 3.487293004989624, + "learning_rate": 7.6540415852954e-06, + "loss": 0.9386, + "step": 4440 + }, + { + "epoch": 0.3423527597903176, + "grad_norm": 3.341665267944336, + "learning_rate": 7.652983453636346e-06, + "loss": 0.935, + "step": 4441 + }, + { + "epoch": 0.34242984890533457, + "grad_norm": 3.843733310699463, + "learning_rate": 7.651925156577514e-06, + "loss": 1.0647, + "step": 4442 + }, + { + "epoch": 0.34250693802035154, + "grad_norm": 3.3451244831085205, + "learning_rate": 7.650866694184883e-06, + "loss": 1.0161, + "step": 4443 + }, + { + "epoch": 0.3425840271353685, + "grad_norm": 3.6213300228118896, + "learning_rate": 7.649808066524442e-06, + "loss": 0.994, + "step": 4444 + }, + { + "epoch": 0.3426611162503854, + "grad_norm": 4.26886510848999, + "learning_rate": 7.648749273662194e-06, + "loss": 0.8898, + "step": 4445 + }, + { + "epoch": 0.3427382053654024, + "grad_norm": 3.5714519023895264, + "learning_rate": 7.647690315664145e-06, + "loss": 1.028, + "step": 4446 + }, + { + "epoch": 0.34281529448041936, + "grad_norm": 4.256858825683594, + "learning_rate": 7.646631192596317e-06, + "loss": 0.9492, + "step": 4447 + }, + { + "epoch": 0.34289238359543633, + "grad_norm": 3.999039649963379, + "learning_rate": 7.645571904524745e-06, + "loss": 1.0845, + "step": 4448 + }, + { + "epoch": 0.3429694727104533, + "grad_norm": 3.7288503646850586, + "learning_rate": 7.644512451515462e-06, + "loss": 0.9635, + "step": 4449 + }, + { + "epoch": 0.3430465618254702, + "grad_norm": 3.4463119506835938, + "learning_rate": 7.643452833634527e-06, + "loss": 0.9414, + "step": 4450 + }, + { + "epoch": 0.3431236509404872, + "grad_norm": 3.423847198486328, + "learning_rate": 7.642393050947997e-06, + "loss": 0.9491, + "step": 4451 + }, + { + "epoch": 0.34320074005550416, + "grad_norm": 4.031202793121338, + "learning_rate": 7.641333103521945e-06, + "loss": 1.1798, + "step": 4452 + }, + { + "epoch": 0.34327782917052113, + "grad_norm": 4.064019203186035, + "learning_rate": 7.640272991422456e-06, + "loss": 1.1341, + "step": 4453 + }, + { + "epoch": 0.3433549182855381, + "grad_norm": 3.5513041019439697, + "learning_rate": 7.63921271471562e-06, + "loss": 1.0552, + "step": 4454 + }, + { + "epoch": 0.343432007400555, + "grad_norm": 3.514437437057495, + "learning_rate": 7.638152273467538e-06, + "loss": 1.086, + "step": 4455 + }, + { + "epoch": 0.343509096515572, + "grad_norm": 3.334420680999756, + "learning_rate": 7.637091667744326e-06, + "loss": 0.8893, + "step": 4456 + }, + { + "epoch": 0.34358618563058896, + "grad_norm": 3.7241454124450684, + "learning_rate": 7.636030897612108e-06, + "loss": 0.9269, + "step": 4457 + }, + { + "epoch": 0.34366327474560593, + "grad_norm": 4.0841755867004395, + "learning_rate": 7.634969963137015e-06, + "loss": 0.9552, + "step": 4458 + }, + { + "epoch": 0.3437403638606229, + "grad_norm": 3.496382236480713, + "learning_rate": 7.633908864385191e-06, + "loss": 0.8975, + "step": 4459 + }, + { + "epoch": 0.3438174529756398, + "grad_norm": 3.9809958934783936, + "learning_rate": 7.63284760142279e-06, + "loss": 1.0904, + "step": 4460 + }, + { + "epoch": 0.3438945420906568, + "grad_norm": 3.6975362300872803, + "learning_rate": 7.631786174315978e-06, + "loss": 0.9786, + "step": 4461 + }, + { + "epoch": 0.34397163120567376, + "grad_norm": 4.022948265075684, + "learning_rate": 7.630724583130929e-06, + "loss": 0.9845, + "step": 4462 + }, + { + "epoch": 0.3440487203206907, + "grad_norm": 3.6614363193511963, + "learning_rate": 7.629662827933827e-06, + "loss": 1.0014, + "step": 4463 + }, + { + "epoch": 0.3441258094357077, + "grad_norm": 3.6465506553649902, + "learning_rate": 7.628600908790867e-06, + "loss": 0.97, + "step": 4464 + }, + { + "epoch": 0.3442028985507246, + "grad_norm": 3.7315773963928223, + "learning_rate": 7.627538825768253e-06, + "loss": 0.9663, + "step": 4465 + }, + { + "epoch": 0.3442799876657416, + "grad_norm": 3.598884344100952, + "learning_rate": 7.626476578932202e-06, + "loss": 0.9673, + "step": 4466 + }, + { + "epoch": 0.34435707678075855, + "grad_norm": 3.5586609840393066, + "learning_rate": 7.625414168348939e-06, + "loss": 1.0037, + "step": 4467 + }, + { + "epoch": 0.3444341658957755, + "grad_norm": 3.392164468765259, + "learning_rate": 7.6243515940847005e-06, + "loss": 1.0022, + "step": 4468 + }, + { + "epoch": 0.3445112550107925, + "grad_norm": 3.7672579288482666, + "learning_rate": 7.623288856205733e-06, + "loss": 0.9914, + "step": 4469 + }, + { + "epoch": 0.3445883441258094, + "grad_norm": 3.6310787200927734, + "learning_rate": 7.62222595477829e-06, + "loss": 0.9816, + "step": 4470 + }, + { + "epoch": 0.3446654332408264, + "grad_norm": 3.9517312049865723, + "learning_rate": 7.6211628898686386e-06, + "loss": 1.0428, + "step": 4471 + }, + { + "epoch": 0.34474252235584335, + "grad_norm": 3.518615245819092, + "learning_rate": 7.620099661543059e-06, + "loss": 0.8673, + "step": 4472 + }, + { + "epoch": 0.3448196114708603, + "grad_norm": 3.833643674850464, + "learning_rate": 7.619036269867835e-06, + "loss": 0.9825, + "step": 4473 + }, + { + "epoch": 0.3448967005858773, + "grad_norm": 3.776273012161255, + "learning_rate": 7.617972714909263e-06, + "loss": 0.9213, + "step": 4474 + }, + { + "epoch": 0.3449737897008942, + "grad_norm": 4.035387992858887, + "learning_rate": 7.616908996733651e-06, + "loss": 0.9992, + "step": 4475 + }, + { + "epoch": 0.3450508788159112, + "grad_norm": 3.4117751121520996, + "learning_rate": 7.615845115407316e-06, + "loss": 0.9576, + "step": 4476 + }, + { + "epoch": 0.34512796793092815, + "grad_norm": 3.680464267730713, + "learning_rate": 7.6147810709965864e-06, + "loss": 0.952, + "step": 4477 + }, + { + "epoch": 0.3452050570459451, + "grad_norm": 3.7311110496520996, + "learning_rate": 7.6137168635677996e-06, + "loss": 1.094, + "step": 4478 + }, + { + "epoch": 0.3452821461609621, + "grad_norm": 3.6351912021636963, + "learning_rate": 7.612652493187302e-06, + "loss": 1.0033, + "step": 4479 + }, + { + "epoch": 0.345359235275979, + "grad_norm": 3.4685473442077637, + "learning_rate": 7.611587959921453e-06, + "loss": 1.0091, + "step": 4480 + }, + { + "epoch": 0.345436324390996, + "grad_norm": 3.7951955795288086, + "learning_rate": 7.6105232638366225e-06, + "loss": 0.9334, + "step": 4481 + }, + { + "epoch": 0.34551341350601295, + "grad_norm": 3.6663379669189453, + "learning_rate": 7.609458404999186e-06, + "loss": 0.998, + "step": 4482 + }, + { + "epoch": 0.3455905026210299, + "grad_norm": 3.7089757919311523, + "learning_rate": 7.608393383475532e-06, + "loss": 0.9697, + "step": 4483 + }, + { + "epoch": 0.3456675917360469, + "grad_norm": 3.706681489944458, + "learning_rate": 7.607328199332059e-06, + "loss": 0.9436, + "step": 4484 + }, + { + "epoch": 0.34574468085106386, + "grad_norm": 3.7007081508636475, + "learning_rate": 7.606262852635178e-06, + "loss": 1.0404, + "step": 4485 + }, + { + "epoch": 0.34582176996608077, + "grad_norm": 3.912898063659668, + "learning_rate": 7.605197343451305e-06, + "loss": 1.0742, + "step": 4486 + }, + { + "epoch": 0.34589885908109774, + "grad_norm": 3.450777292251587, + "learning_rate": 7.604131671846872e-06, + "loss": 0.9221, + "step": 4487 + }, + { + "epoch": 0.3459759481961147, + "grad_norm": 3.4001710414886475, + "learning_rate": 7.6030658378883145e-06, + "loss": 0.9292, + "step": 4488 + }, + { + "epoch": 0.3460530373111317, + "grad_norm": 3.4011619091033936, + "learning_rate": 7.601999841642085e-06, + "loss": 0.9305, + "step": 4489 + }, + { + "epoch": 0.34613012642614865, + "grad_norm": 3.7454307079315186, + "learning_rate": 7.60093368317464e-06, + "loss": 1.0018, + "step": 4490 + }, + { + "epoch": 0.34620721554116557, + "grad_norm": 3.7179946899414062, + "learning_rate": 7.599867362552451e-06, + "loss": 0.9987, + "step": 4491 + }, + { + "epoch": 0.34628430465618254, + "grad_norm": 3.8563878536224365, + "learning_rate": 7.598800879841998e-06, + "loss": 1.0384, + "step": 4492 + }, + { + "epoch": 0.3463613937711995, + "grad_norm": 3.63057804107666, + "learning_rate": 7.5977342351097694e-06, + "loss": 1.0281, + "step": 4493 + }, + { + "epoch": 0.3464384828862165, + "grad_norm": 3.7689592838287354, + "learning_rate": 7.596667428422264e-06, + "loss": 1.0204, + "step": 4494 + }, + { + "epoch": 0.34651557200123345, + "grad_norm": 4.023124694824219, + "learning_rate": 7.595600459845994e-06, + "loss": 1.1758, + "step": 4495 + }, + { + "epoch": 0.34659266111625037, + "grad_norm": 3.7010927200317383, + "learning_rate": 7.594533329447479e-06, + "loss": 1.0144, + "step": 4496 + }, + { + "epoch": 0.34666975023126734, + "grad_norm": 3.7244632244110107, + "learning_rate": 7.593466037293247e-06, + "loss": 0.9995, + "step": 4497 + }, + { + "epoch": 0.3467468393462843, + "grad_norm": 3.411987781524658, + "learning_rate": 7.5923985834498405e-06, + "loss": 0.943, + "step": 4498 + }, + { + "epoch": 0.3468239284613013, + "grad_norm": 3.9309608936309814, + "learning_rate": 7.5913309679838074e-06, + "loss": 0.9554, + "step": 4499 + }, + { + "epoch": 0.34690101757631825, + "grad_norm": 3.5526673793792725, + "learning_rate": 7.590263190961711e-06, + "loss": 0.9885, + "step": 4500 + }, + { + "epoch": 0.34697810669133516, + "grad_norm": 3.6589465141296387, + "learning_rate": 7.589195252450118e-06, + "loss": 1.0778, + "step": 4501 + }, + { + "epoch": 0.34705519580635213, + "grad_norm": 3.7149205207824707, + "learning_rate": 7.588127152515611e-06, + "loss": 1.0306, + "step": 4502 + }, + { + "epoch": 0.3471322849213691, + "grad_norm": 4.081293106079102, + "learning_rate": 7.587058891224781e-06, + "loss": 0.9663, + "step": 4503 + }, + { + "epoch": 0.3472093740363861, + "grad_norm": 3.5740342140197754, + "learning_rate": 7.585990468644229e-06, + "loss": 1.0298, + "step": 4504 + }, + { + "epoch": 0.34728646315140305, + "grad_norm": 3.4450860023498535, + "learning_rate": 7.584921884840563e-06, + "loss": 0.9135, + "step": 4505 + }, + { + "epoch": 0.34736355226641996, + "grad_norm": 3.9143495559692383, + "learning_rate": 7.583853139880406e-06, + "loss": 1.092, + "step": 4506 + }, + { + "epoch": 0.34744064138143693, + "grad_norm": 3.25154447555542, + "learning_rate": 7.5827842338303866e-06, + "loss": 0.9825, + "step": 4507 + }, + { + "epoch": 0.3475177304964539, + "grad_norm": 3.853174924850464, + "learning_rate": 7.581715166757147e-06, + "loss": 0.9713, + "step": 4508 + }, + { + "epoch": 0.3475948196114709, + "grad_norm": 3.6071982383728027, + "learning_rate": 7.58064593872734e-06, + "loss": 0.9837, + "step": 4509 + }, + { + "epoch": 0.34767190872648784, + "grad_norm": 3.3574037551879883, + "learning_rate": 7.579576549807621e-06, + "loss": 0.8727, + "step": 4510 + }, + { + "epoch": 0.34774899784150476, + "grad_norm": 3.408640146255493, + "learning_rate": 7.578507000064668e-06, + "loss": 0.9032, + "step": 4511 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 3.5927014350891113, + "learning_rate": 7.5774372895651545e-06, + "loss": 1.0204, + "step": 4512 + }, + { + "epoch": 0.3479031760715387, + "grad_norm": 4.023251533508301, + "learning_rate": 7.576367418375776e-06, + "loss": 1.0703, + "step": 4513 + }, + { + "epoch": 0.34798026518655567, + "grad_norm": 3.498119831085205, + "learning_rate": 7.575297386563232e-06, + "loss": 1.0155, + "step": 4514 + }, + { + "epoch": 0.34805735430157264, + "grad_norm": 4.321279525756836, + "learning_rate": 7.574227194194234e-06, + "loss": 1.0152, + "step": 4515 + }, + { + "epoch": 0.34813444341658956, + "grad_norm": 3.662602186203003, + "learning_rate": 7.573156841335503e-06, + "loss": 1.0795, + "step": 4516 + }, + { + "epoch": 0.3482115325316065, + "grad_norm": 3.4204254150390625, + "learning_rate": 7.572086328053769e-06, + "loss": 0.9286, + "step": 4517 + }, + { + "epoch": 0.3482886216466235, + "grad_norm": 4.003498554229736, + "learning_rate": 7.571015654415774e-06, + "loss": 0.991, + "step": 4518 + }, + { + "epoch": 0.34836571076164047, + "grad_norm": 3.632580518722534, + "learning_rate": 7.5699448204882684e-06, + "loss": 0.9635, + "step": 4519 + }, + { + "epoch": 0.34844279987665744, + "grad_norm": 3.763371706008911, + "learning_rate": 7.568873826338015e-06, + "loss": 1.0865, + "step": 4520 + }, + { + "epoch": 0.34851988899167435, + "grad_norm": 3.367755889892578, + "learning_rate": 7.567802672031781e-06, + "loss": 0.9446, + "step": 4521 + }, + { + "epoch": 0.3485969781066913, + "grad_norm": 3.568023204803467, + "learning_rate": 7.56673135763635e-06, + "loss": 0.9173, + "step": 4522 + }, + { + "epoch": 0.3486740672217083, + "grad_norm": 3.6380043029785156, + "learning_rate": 7.565659883218512e-06, + "loss": 0.9864, + "step": 4523 + }, + { + "epoch": 0.34875115633672527, + "grad_norm": 3.404252290725708, + "learning_rate": 7.56458824884507e-06, + "loss": 0.9846, + "step": 4524 + }, + { + "epoch": 0.34882824545174224, + "grad_norm": 3.7746942043304443, + "learning_rate": 7.563516454582831e-06, + "loss": 0.9901, + "step": 4525 + }, + { + "epoch": 0.34890533456675915, + "grad_norm": 3.6926469802856445, + "learning_rate": 7.562444500498618e-06, + "loss": 1.0837, + "step": 4526 + }, + { + "epoch": 0.3489824236817761, + "grad_norm": 3.7840046882629395, + "learning_rate": 7.561372386659262e-06, + "loss": 1.1675, + "step": 4527 + }, + { + "epoch": 0.3490595127967931, + "grad_norm": 4.055497646331787, + "learning_rate": 7.560300113131604e-06, + "loss": 1.0338, + "step": 4528 + }, + { + "epoch": 0.34913660191181006, + "grad_norm": 4.383268356323242, + "learning_rate": 7.559227679982493e-06, + "loss": 1.0946, + "step": 4529 + }, + { + "epoch": 0.34921369102682703, + "grad_norm": 3.687899351119995, + "learning_rate": 7.558155087278791e-06, + "loss": 0.8959, + "step": 4530 + }, + { + "epoch": 0.34929078014184395, + "grad_norm": 4.111197471618652, + "learning_rate": 7.557082335087369e-06, + "loss": 0.9816, + "step": 4531 + }, + { + "epoch": 0.3493678692568609, + "grad_norm": 3.7356393337249756, + "learning_rate": 7.556009423475106e-06, + "loss": 0.9332, + "step": 4532 + }, + { + "epoch": 0.3494449583718779, + "grad_norm": 3.6677143573760986, + "learning_rate": 7.554936352508895e-06, + "loss": 1.0524, + "step": 4533 + }, + { + "epoch": 0.34952204748689486, + "grad_norm": 3.7901206016540527, + "learning_rate": 7.5538631222556325e-06, + "loss": 0.9411, + "step": 4534 + }, + { + "epoch": 0.34959913660191183, + "grad_norm": 3.7148752212524414, + "learning_rate": 7.552789732782233e-06, + "loss": 1.0595, + "step": 4535 + }, + { + "epoch": 0.34967622571692875, + "grad_norm": 3.53884220123291, + "learning_rate": 7.551716184155614e-06, + "loss": 0.969, + "step": 4536 + }, + { + "epoch": 0.3497533148319457, + "grad_norm": 3.777893543243408, + "learning_rate": 7.5506424764427065e-06, + "loss": 1.011, + "step": 4537 + }, + { + "epoch": 0.3498304039469627, + "grad_norm": 3.4374454021453857, + "learning_rate": 7.549568609710451e-06, + "loss": 0.9996, + "step": 4538 + }, + { + "epoch": 0.34990749306197966, + "grad_norm": 3.2913565635681152, + "learning_rate": 7.548494584025797e-06, + "loss": 0.9579, + "step": 4539 + }, + { + "epoch": 0.34998458217699663, + "grad_norm": 3.424445152282715, + "learning_rate": 7.547420399455705e-06, + "loss": 0.9822, + "step": 4540 + }, + { + "epoch": 0.35006167129201354, + "grad_norm": 3.8123433589935303, + "learning_rate": 7.5463460560671446e-06, + "loss": 0.9697, + "step": 4541 + }, + { + "epoch": 0.3501387604070305, + "grad_norm": 3.7088186740875244, + "learning_rate": 7.5452715539270945e-06, + "loss": 1.0171, + "step": 4542 + }, + { + "epoch": 0.3502158495220475, + "grad_norm": 3.776679754257202, + "learning_rate": 7.544196893102547e-06, + "loss": 1.0093, + "step": 4543 + }, + { + "epoch": 0.35029293863706445, + "grad_norm": 3.353478193283081, + "learning_rate": 7.543122073660498e-06, + "loss": 0.8819, + "step": 4544 + }, + { + "epoch": 0.3503700277520814, + "grad_norm": 3.6976723670959473, + "learning_rate": 7.542047095667959e-06, + "loss": 1.0509, + "step": 4545 + }, + { + "epoch": 0.35044711686709834, + "grad_norm": 4.122467517852783, + "learning_rate": 7.540971959191952e-06, + "loss": 1.1499, + "step": 4546 + }, + { + "epoch": 0.3505242059821153, + "grad_norm": 3.71630859375, + "learning_rate": 7.5398966642995e-06, + "loss": 1.0094, + "step": 4547 + }, + { + "epoch": 0.3506012950971323, + "grad_norm": 3.93642258644104, + "learning_rate": 7.538821211057648e-06, + "loss": 1.1285, + "step": 4548 + }, + { + "epoch": 0.35067838421214925, + "grad_norm": 3.9508731365203857, + "learning_rate": 7.537745599533442e-06, + "loss": 1.2053, + "step": 4549 + }, + { + "epoch": 0.3507554733271662, + "grad_norm": 4.088351726531982, + "learning_rate": 7.536669829793939e-06, + "loss": 0.9771, + "step": 4550 + }, + { + "epoch": 0.35083256244218314, + "grad_norm": 3.7594566345214844, + "learning_rate": 7.535593901906212e-06, + "loss": 1.0617, + "step": 4551 + }, + { + "epoch": 0.3509096515572001, + "grad_norm": 3.9711499214172363, + "learning_rate": 7.534517815937336e-06, + "loss": 1.0532, + "step": 4552 + }, + { + "epoch": 0.3509867406722171, + "grad_norm": 4.007244110107422, + "learning_rate": 7.533441571954401e-06, + "loss": 1.0847, + "step": 4553 + }, + { + "epoch": 0.35106382978723405, + "grad_norm": 3.750323534011841, + "learning_rate": 7.532365170024507e-06, + "loss": 0.951, + "step": 4554 + }, + { + "epoch": 0.351140918902251, + "grad_norm": 3.2767701148986816, + "learning_rate": 7.531288610214758e-06, + "loss": 1.0074, + "step": 4555 + }, + { + "epoch": 0.35121800801726794, + "grad_norm": 4.005361080169678, + "learning_rate": 7.530211892592274e-06, + "loss": 1.0625, + "step": 4556 + }, + { + "epoch": 0.3512950971322849, + "grad_norm": 3.4741199016571045, + "learning_rate": 7.529135017224185e-06, + "loss": 0.9702, + "step": 4557 + }, + { + "epoch": 0.3513721862473019, + "grad_norm": 4.308572292327881, + "learning_rate": 7.528057984177624e-06, + "loss": 1.1244, + "step": 4558 + }, + { + "epoch": 0.35144927536231885, + "grad_norm": 3.521604299545288, + "learning_rate": 7.526980793519742e-06, + "loss": 0.9298, + "step": 4559 + }, + { + "epoch": 0.3515263644773358, + "grad_norm": 3.766963243484497, + "learning_rate": 7.525903445317695e-06, + "loss": 0.9063, + "step": 4560 + }, + { + "epoch": 0.35160345359235273, + "grad_norm": 3.509277582168579, + "learning_rate": 7.524825939638649e-06, + "loss": 0.9582, + "step": 4561 + }, + { + "epoch": 0.3516805427073697, + "grad_norm": 3.2499780654907227, + "learning_rate": 7.5237482765497835e-06, + "loss": 0.9871, + "step": 4562 + }, + { + "epoch": 0.3517576318223867, + "grad_norm": 3.8931937217712402, + "learning_rate": 7.522670456118284e-06, + "loss": 1.0293, + "step": 4563 + }, + { + "epoch": 0.35183472093740364, + "grad_norm": 4.080838203430176, + "learning_rate": 7.521592478411346e-06, + "loss": 0.9787, + "step": 4564 + }, + { + "epoch": 0.3519118100524206, + "grad_norm": 3.7860987186431885, + "learning_rate": 7.520514343496177e-06, + "loss": 1.0544, + "step": 4565 + }, + { + "epoch": 0.3519888991674376, + "grad_norm": 3.832820415496826, + "learning_rate": 7.519436051439991e-06, + "loss": 1.045, + "step": 4566 + }, + { + "epoch": 0.3520659882824545, + "grad_norm": 3.489471912384033, + "learning_rate": 7.518357602310018e-06, + "loss": 1.0061, + "step": 4567 + }, + { + "epoch": 0.35214307739747147, + "grad_norm": 3.8605971336364746, + "learning_rate": 7.517278996173489e-06, + "loss": 1.1125, + "step": 4568 + }, + { + "epoch": 0.35222016651248844, + "grad_norm": 3.234241247177124, + "learning_rate": 7.516200233097655e-06, + "loss": 0.9031, + "step": 4569 + }, + { + "epoch": 0.3522972556275054, + "grad_norm": 3.4962003231048584, + "learning_rate": 7.515121313149767e-06, + "loss": 1.0268, + "step": 4570 + }, + { + "epoch": 0.3523743447425224, + "grad_norm": 4.1234211921691895, + "learning_rate": 7.51404223639709e-06, + "loss": 1.0732, + "step": 4571 + }, + { + "epoch": 0.3524514338575393, + "grad_norm": 3.3490140438079834, + "learning_rate": 7.512963002906902e-06, + "loss": 0.9798, + "step": 4572 + }, + { + "epoch": 0.35252852297255627, + "grad_norm": 3.6099109649658203, + "learning_rate": 7.5118836127464845e-06, + "loss": 1.0534, + "step": 4573 + }, + { + "epoch": 0.35260561208757324, + "grad_norm": 3.4530441761016846, + "learning_rate": 7.5108040659831325e-06, + "loss": 0.96, + "step": 4574 + }, + { + "epoch": 0.3526827012025902, + "grad_norm": 3.9595425128936768, + "learning_rate": 7.509724362684153e-06, + "loss": 0.99, + "step": 4575 + }, + { + "epoch": 0.3527597903176072, + "grad_norm": 3.436499834060669, + "learning_rate": 7.508644502916857e-06, + "loss": 1.0278, + "step": 4576 + }, + { + "epoch": 0.3528368794326241, + "grad_norm": 3.768009901046753, + "learning_rate": 7.507564486748567e-06, + "loss": 0.9352, + "step": 4577 + }, + { + "epoch": 0.35291396854764107, + "grad_norm": 3.8910608291625977, + "learning_rate": 7.50648431424662e-06, + "loss": 1.1054, + "step": 4578 + }, + { + "epoch": 0.35299105766265804, + "grad_norm": 3.816119909286499, + "learning_rate": 7.5054039854783565e-06, + "loss": 1.0385, + "step": 4579 + }, + { + "epoch": 0.353068146777675, + "grad_norm": 3.278871774673462, + "learning_rate": 7.5043235005111314e-06, + "loss": 0.9679, + "step": 4580 + }, + { + "epoch": 0.353145235892692, + "grad_norm": 3.655571222305298, + "learning_rate": 7.503242859412306e-06, + "loss": 1.0785, + "step": 4581 + }, + { + "epoch": 0.3532223250077089, + "grad_norm": 3.78136944770813, + "learning_rate": 7.502162062249252e-06, + "loss": 0.9802, + "step": 4582 + }, + { + "epoch": 0.35329941412272586, + "grad_norm": 3.6029317378997803, + "learning_rate": 7.501081109089354e-06, + "loss": 0.991, + "step": 4583 + }, + { + "epoch": 0.35337650323774283, + "grad_norm": 3.5113279819488525, + "learning_rate": 7.500000000000001e-06, + "loss": 1.0691, + "step": 4584 + }, + { + "epoch": 0.3534535923527598, + "grad_norm": 3.781364679336548, + "learning_rate": 7.498918735048596e-06, + "loss": 0.9593, + "step": 4585 + }, + { + "epoch": 0.3535306814677768, + "grad_norm": 3.832432270050049, + "learning_rate": 7.497837314302551e-06, + "loss": 0.933, + "step": 4586 + }, + { + "epoch": 0.3536077705827937, + "grad_norm": 3.3761022090911865, + "learning_rate": 7.496755737829284e-06, + "loss": 0.9601, + "step": 4587 + }, + { + "epoch": 0.35368485969781066, + "grad_norm": 3.549546241760254, + "learning_rate": 7.4956740056962294e-06, + "loss": 0.9896, + "step": 4588 + }, + { + "epoch": 0.35376194881282763, + "grad_norm": 4.057165622711182, + "learning_rate": 7.494592117970826e-06, + "loss": 1.0498, + "step": 4589 + }, + { + "epoch": 0.3538390379278446, + "grad_norm": 3.55377197265625, + "learning_rate": 7.493510074720523e-06, + "loss": 0.9978, + "step": 4590 + }, + { + "epoch": 0.3539161270428616, + "grad_norm": 3.4123051166534424, + "learning_rate": 7.492427876012782e-06, + "loss": 0.9557, + "step": 4591 + }, + { + "epoch": 0.3539932161578785, + "grad_norm": 3.7144315242767334, + "learning_rate": 7.491345521915071e-06, + "loss": 0.9637, + "step": 4592 + }, + { + "epoch": 0.35407030527289546, + "grad_norm": 3.7125496864318848, + "learning_rate": 7.490263012494869e-06, + "loss": 0.9757, + "step": 4593 + }, + { + "epoch": 0.35414739438791243, + "grad_norm": 3.407020092010498, + "learning_rate": 7.489180347819668e-06, + "loss": 0.987, + "step": 4594 + }, + { + "epoch": 0.3542244835029294, + "grad_norm": 3.3304386138916016, + "learning_rate": 7.488097527956962e-06, + "loss": 0.9076, + "step": 4595 + }, + { + "epoch": 0.35430157261794637, + "grad_norm": 3.7230682373046875, + "learning_rate": 7.487014552974263e-06, + "loss": 1.0456, + "step": 4596 + }, + { + "epoch": 0.3543786617329633, + "grad_norm": 3.5402743816375732, + "learning_rate": 7.4859314229390856e-06, + "loss": 1.0437, + "step": 4597 + }, + { + "epoch": 0.35445575084798026, + "grad_norm": 3.747596025466919, + "learning_rate": 7.484848137918958e-06, + "loss": 1.0386, + "step": 4598 + }, + { + "epoch": 0.3545328399629972, + "grad_norm": 3.624826192855835, + "learning_rate": 7.483764697981422e-06, + "loss": 0.9824, + "step": 4599 + }, + { + "epoch": 0.3546099290780142, + "grad_norm": 3.802626609802246, + "learning_rate": 7.482681103194018e-06, + "loss": 1.0693, + "step": 4600 + }, + { + "epoch": 0.35468701819303117, + "grad_norm": 3.6754977703094482, + "learning_rate": 7.481597353624306e-06, + "loss": 0.9418, + "step": 4601 + }, + { + "epoch": 0.3547641073080481, + "grad_norm": 3.6683568954467773, + "learning_rate": 7.480513449339851e-06, + "loss": 1.0815, + "step": 4602 + }, + { + "epoch": 0.35484119642306505, + "grad_norm": 3.4328901767730713, + "learning_rate": 7.47942939040823e-06, + "loss": 0.888, + "step": 4603 + }, + { + "epoch": 0.354918285538082, + "grad_norm": 3.344658374786377, + "learning_rate": 7.478345176897027e-06, + "loss": 0.8972, + "step": 4604 + }, + { + "epoch": 0.354995374653099, + "grad_norm": 3.514586925506592, + "learning_rate": 7.4772608088738395e-06, + "loss": 0.8603, + "step": 4605 + }, + { + "epoch": 0.35507246376811596, + "grad_norm": 3.7426164150238037, + "learning_rate": 7.4761762864062694e-06, + "loss": 0.8437, + "step": 4606 + }, + { + "epoch": 0.3551495528831329, + "grad_norm": 4.099521636962891, + "learning_rate": 7.475091609561933e-06, + "loss": 0.9417, + "step": 4607 + }, + { + "epoch": 0.35522664199814985, + "grad_norm": 4.204078674316406, + "learning_rate": 7.4740067784084534e-06, + "loss": 1.0172, + "step": 4608 + }, + { + "epoch": 0.3553037311131668, + "grad_norm": 3.2971291542053223, + "learning_rate": 7.472921793013463e-06, + "loss": 0.9418, + "step": 4609 + }, + { + "epoch": 0.3553808202281838, + "grad_norm": 3.2113230228424072, + "learning_rate": 7.4718366534446085e-06, + "loss": 0.9096, + "step": 4610 + }, + { + "epoch": 0.35545790934320076, + "grad_norm": 3.6125385761260986, + "learning_rate": 7.4707513597695396e-06, + "loss": 0.8899, + "step": 4611 + }, + { + "epoch": 0.3555349984582177, + "grad_norm": 3.8694043159484863, + "learning_rate": 7.469665912055919e-06, + "loss": 1.0594, + "step": 4612 + }, + { + "epoch": 0.35561208757323465, + "grad_norm": 3.883565902709961, + "learning_rate": 7.4685803103714204e-06, + "loss": 0.8622, + "step": 4613 + }, + { + "epoch": 0.3556891766882516, + "grad_norm": 3.652117967605591, + "learning_rate": 7.4674945547837245e-06, + "loss": 1.045, + "step": 4614 + }, + { + "epoch": 0.3557662658032686, + "grad_norm": 3.6539828777313232, + "learning_rate": 7.466408645360524e-06, + "loss": 1.117, + "step": 4615 + }, + { + "epoch": 0.35584335491828556, + "grad_norm": 3.4314699172973633, + "learning_rate": 7.465322582169516e-06, + "loss": 0.8358, + "step": 4616 + }, + { + "epoch": 0.3559204440333025, + "grad_norm": 3.411808729171753, + "learning_rate": 7.464236365278413e-06, + "loss": 0.9704, + "step": 4617 + }, + { + "epoch": 0.35599753314831944, + "grad_norm": 3.6066629886627197, + "learning_rate": 7.463149994754938e-06, + "loss": 0.9963, + "step": 4618 + }, + { + "epoch": 0.3560746222633364, + "grad_norm": 3.517822742462158, + "learning_rate": 7.462063470666816e-06, + "loss": 0.9606, + "step": 4619 + }, + { + "epoch": 0.3561517113783534, + "grad_norm": 3.6595687866210938, + "learning_rate": 7.460976793081789e-06, + "loss": 0.944, + "step": 4620 + }, + { + "epoch": 0.35622880049337036, + "grad_norm": 3.9949941635131836, + "learning_rate": 7.459889962067603e-06, + "loss": 1.1163, + "step": 4621 + }, + { + "epoch": 0.35630588960838727, + "grad_norm": 3.570941209793091, + "learning_rate": 7.458802977692018e-06, + "loss": 1.02, + "step": 4622 + }, + { + "epoch": 0.35638297872340424, + "grad_norm": 3.210967540740967, + "learning_rate": 7.4577158400228034e-06, + "loss": 0.8364, + "step": 4623 + }, + { + "epoch": 0.3564600678384212, + "grad_norm": 3.722440719604492, + "learning_rate": 7.4566285491277334e-06, + "loss": 0.9846, + "step": 4624 + }, + { + "epoch": 0.3565371569534382, + "grad_norm": 3.4632368087768555, + "learning_rate": 7.455541105074598e-06, + "loss": 1.0375, + "step": 4625 + }, + { + "epoch": 0.35661424606845515, + "grad_norm": 3.4531891345977783, + "learning_rate": 7.454453507931192e-06, + "loss": 0.8937, + "step": 4626 + }, + { + "epoch": 0.35669133518347207, + "grad_norm": 3.6453826427459717, + "learning_rate": 7.45336575776532e-06, + "loss": 0.9928, + "step": 4627 + }, + { + "epoch": 0.35676842429848904, + "grad_norm": 3.3601088523864746, + "learning_rate": 7.452277854644801e-06, + "loss": 0.8869, + "step": 4628 + }, + { + "epoch": 0.356845513413506, + "grad_norm": 3.36553955078125, + "learning_rate": 7.45118979863746e-06, + "loss": 0.9932, + "step": 4629 + }, + { + "epoch": 0.356922602528523, + "grad_norm": 3.6436736583709717, + "learning_rate": 7.450101589811127e-06, + "loss": 1.0522, + "step": 4630 + }, + { + "epoch": 0.35699969164353995, + "grad_norm": 3.701080083847046, + "learning_rate": 7.449013228233651e-06, + "loss": 0.7591, + "step": 4631 + }, + { + "epoch": 0.35707678075855687, + "grad_norm": 3.6947665214538574, + "learning_rate": 7.447924713972883e-06, + "loss": 0.9894, + "step": 4632 + }, + { + "epoch": 0.35715386987357384, + "grad_norm": 3.8331315517425537, + "learning_rate": 7.4468360470966875e-06, + "loss": 0.9405, + "step": 4633 + }, + { + "epoch": 0.3572309589885908, + "grad_norm": 3.559194803237915, + "learning_rate": 7.445747227672937e-06, + "loss": 1.0249, + "step": 4634 + }, + { + "epoch": 0.3573080481036078, + "grad_norm": 4.069003582000732, + "learning_rate": 7.444658255769513e-06, + "loss": 1.0037, + "step": 4635 + }, + { + "epoch": 0.35738513721862475, + "grad_norm": 3.7197611331939697, + "learning_rate": 7.4435691314543105e-06, + "loss": 1.0408, + "step": 4636 + }, + { + "epoch": 0.35746222633364166, + "grad_norm": 3.3894405364990234, + "learning_rate": 7.442479854795226e-06, + "loss": 1.0021, + "step": 4637 + }, + { + "epoch": 0.35753931544865863, + "grad_norm": 3.269364833831787, + "learning_rate": 7.441390425860172e-06, + "loss": 0.9195, + "step": 4638 + }, + { + "epoch": 0.3576164045636756, + "grad_norm": 3.6949777603149414, + "learning_rate": 7.440300844717071e-06, + "loss": 1.111, + "step": 4639 + }, + { + "epoch": 0.3576934936786926, + "grad_norm": 3.708047389984131, + "learning_rate": 7.43921111143385e-06, + "loss": 1.0689, + "step": 4640 + }, + { + "epoch": 0.35777058279370955, + "grad_norm": 3.682943105697632, + "learning_rate": 7.438121226078449e-06, + "loss": 1.041, + "step": 4641 + }, + { + "epoch": 0.35784767190872646, + "grad_norm": 3.447502613067627, + "learning_rate": 7.437031188718818e-06, + "loss": 0.9712, + "step": 4642 + }, + { + "epoch": 0.35792476102374343, + "grad_norm": 3.6242482662200928, + "learning_rate": 7.435940999422913e-06, + "loss": 0.9694, + "step": 4643 + }, + { + "epoch": 0.3580018501387604, + "grad_norm": 3.254422903060913, + "learning_rate": 7.434850658258704e-06, + "loss": 0.8993, + "step": 4644 + }, + { + "epoch": 0.3580789392537774, + "grad_norm": 3.3965585231781006, + "learning_rate": 7.433760165294166e-06, + "loss": 0.9396, + "step": 4645 + }, + { + "epoch": 0.35815602836879434, + "grad_norm": 3.665678024291992, + "learning_rate": 7.4326695205972865e-06, + "loss": 1.008, + "step": 4646 + }, + { + "epoch": 0.35823311748381126, + "grad_norm": 3.5869436264038086, + "learning_rate": 7.431578724236062e-06, + "loss": 0.965, + "step": 4647 + }, + { + "epoch": 0.35831020659882823, + "grad_norm": 3.809087038040161, + "learning_rate": 7.430487776278497e-06, + "loss": 0.9678, + "step": 4648 + }, + { + "epoch": 0.3583872957138452, + "grad_norm": 3.8319554328918457, + "learning_rate": 7.429396676792607e-06, + "loss": 1.0688, + "step": 4649 + }, + { + "epoch": 0.35846438482886217, + "grad_norm": 3.6857879161834717, + "learning_rate": 7.428305425846416e-06, + "loss": 1.0452, + "step": 4650 + }, + { + "epoch": 0.35854147394387914, + "grad_norm": 3.5876760482788086, + "learning_rate": 7.427214023507958e-06, + "loss": 1.0092, + "step": 4651 + }, + { + "epoch": 0.3586185630588961, + "grad_norm": 3.979463815689087, + "learning_rate": 7.426122469845277e-06, + "loss": 1.1185, + "step": 4652 + }, + { + "epoch": 0.358695652173913, + "grad_norm": 3.456955909729004, + "learning_rate": 7.4250307649264265e-06, + "loss": 0.9516, + "step": 4653 + }, + { + "epoch": 0.35877274128893, + "grad_norm": 3.4941182136535645, + "learning_rate": 7.423938908819466e-06, + "loss": 0.945, + "step": 4654 + }, + { + "epoch": 0.35884983040394697, + "grad_norm": 3.835120677947998, + "learning_rate": 7.4228469015924675e-06, + "loss": 0.9512, + "step": 4655 + }, + { + "epoch": 0.35892691951896394, + "grad_norm": 3.682276964187622, + "learning_rate": 7.421754743313514e-06, + "loss": 1.0016, + "step": 4656 + }, + { + "epoch": 0.3590040086339809, + "grad_norm": 3.423520088195801, + "learning_rate": 7.420662434050695e-06, + "loss": 0.8797, + "step": 4657 + }, + { + "epoch": 0.3590810977489978, + "grad_norm": 3.420644998550415, + "learning_rate": 7.41956997387211e-06, + "loss": 0.8758, + "step": 4658 + }, + { + "epoch": 0.3591581868640148, + "grad_norm": 3.3552937507629395, + "learning_rate": 7.418477362845868e-06, + "loss": 0.9692, + "step": 4659 + }, + { + "epoch": 0.35923527597903177, + "grad_norm": 3.6283655166625977, + "learning_rate": 7.41738460104009e-06, + "loss": 1.0536, + "step": 4660 + }, + { + "epoch": 0.35931236509404874, + "grad_norm": 3.8392016887664795, + "learning_rate": 7.4162916885229e-06, + "loss": 0.9439, + "step": 4661 + }, + { + "epoch": 0.3593894542090657, + "grad_norm": 3.771641492843628, + "learning_rate": 7.415198625362437e-06, + "loss": 1.0178, + "step": 4662 + }, + { + "epoch": 0.3594665433240826, + "grad_norm": 3.600102424621582, + "learning_rate": 7.414105411626851e-06, + "loss": 1.0072, + "step": 4663 + }, + { + "epoch": 0.3595436324390996, + "grad_norm": 3.391453742980957, + "learning_rate": 7.413012047384292e-06, + "loss": 0.9686, + "step": 4664 + }, + { + "epoch": 0.35962072155411656, + "grad_norm": 3.72904634475708, + "learning_rate": 7.411918532702932e-06, + "loss": 1.0089, + "step": 4665 + }, + { + "epoch": 0.35969781066913353, + "grad_norm": 3.5617527961730957, + "learning_rate": 7.410824867650943e-06, + "loss": 0.9439, + "step": 4666 + }, + { + "epoch": 0.3597748997841505, + "grad_norm": 3.4571316242218018, + "learning_rate": 7.409731052296508e-06, + "loss": 0.9568, + "step": 4667 + }, + { + "epoch": 0.3598519888991674, + "grad_norm": 3.765029191970825, + "learning_rate": 7.408637086707824e-06, + "loss": 1.071, + "step": 4668 + }, + { + "epoch": 0.3599290780141844, + "grad_norm": 3.57428240776062, + "learning_rate": 7.4075429709530896e-06, + "loss": 1.0565, + "step": 4669 + }, + { + "epoch": 0.36000616712920136, + "grad_norm": 3.6238112449645996, + "learning_rate": 7.4064487051005215e-06, + "loss": 1.02, + "step": 4670 + }, + { + "epoch": 0.36008325624421833, + "grad_norm": 3.5156960487365723, + "learning_rate": 7.405354289218341e-06, + "loss": 0.9572, + "step": 4671 + }, + { + "epoch": 0.3601603453592353, + "grad_norm": 3.847799062728882, + "learning_rate": 7.404259723374778e-06, + "loss": 0.8973, + "step": 4672 + }, + { + "epoch": 0.3602374344742522, + "grad_norm": 3.390068531036377, + "learning_rate": 7.403165007638071e-06, + "loss": 0.9345, + "step": 4673 + }, + { + "epoch": 0.3603145235892692, + "grad_norm": 4.004146575927734, + "learning_rate": 7.402070142076475e-06, + "loss": 0.9548, + "step": 4674 + }, + { + "epoch": 0.36039161270428616, + "grad_norm": 3.443192720413208, + "learning_rate": 7.400975126758244e-06, + "loss": 1.0265, + "step": 4675 + }, + { + "epoch": 0.36046870181930313, + "grad_norm": 3.7502102851867676, + "learning_rate": 7.39987996175165e-06, + "loss": 0.9425, + "step": 4676 + }, + { + "epoch": 0.3605457909343201, + "grad_norm": 3.6927671432495117, + "learning_rate": 7.398784647124969e-06, + "loss": 0.9693, + "step": 4677 + }, + { + "epoch": 0.360622880049337, + "grad_norm": 3.445166826248169, + "learning_rate": 7.397689182946489e-06, + "loss": 1.0462, + "step": 4678 + }, + { + "epoch": 0.360699969164354, + "grad_norm": 3.651033401489258, + "learning_rate": 7.396593569284507e-06, + "loss": 1.0167, + "step": 4679 + }, + { + "epoch": 0.36077705827937095, + "grad_norm": 3.4636173248291016, + "learning_rate": 7.395497806207328e-06, + "loss": 0.9994, + "step": 4680 + }, + { + "epoch": 0.3608541473943879, + "grad_norm": 3.4041786193847656, + "learning_rate": 7.394401893783267e-06, + "loss": 0.9804, + "step": 4681 + }, + { + "epoch": 0.3609312365094049, + "grad_norm": 3.829814910888672, + "learning_rate": 7.393305832080649e-06, + "loss": 0.893, + "step": 4682 + }, + { + "epoch": 0.3610083256244218, + "grad_norm": 3.543970823287964, + "learning_rate": 7.392209621167808e-06, + "loss": 0.9415, + "step": 4683 + }, + { + "epoch": 0.3610854147394388, + "grad_norm": 3.526637315750122, + "learning_rate": 7.3911132611130865e-06, + "loss": 0.8818, + "step": 4684 + }, + { + "epoch": 0.36116250385445575, + "grad_norm": 3.4459733963012695, + "learning_rate": 7.390016751984837e-06, + "loss": 0.9302, + "step": 4685 + }, + { + "epoch": 0.3612395929694727, + "grad_norm": 3.7464380264282227, + "learning_rate": 7.388920093851422e-06, + "loss": 0.9382, + "step": 4686 + }, + { + "epoch": 0.3613166820844897, + "grad_norm": 3.308462381362915, + "learning_rate": 7.387823286781211e-06, + "loss": 1.0056, + "step": 4687 + }, + { + "epoch": 0.3613937711995066, + "grad_norm": 3.8184940814971924, + "learning_rate": 7.386726330842584e-06, + "loss": 1.075, + "step": 4688 + }, + { + "epoch": 0.3614708603145236, + "grad_norm": 3.9437508583068848, + "learning_rate": 7.385629226103932e-06, + "loss": 1.0295, + "step": 4689 + }, + { + "epoch": 0.36154794942954055, + "grad_norm": 3.679063558578491, + "learning_rate": 7.384531972633654e-06, + "loss": 1.067, + "step": 4690 + }, + { + "epoch": 0.3616250385445575, + "grad_norm": 3.801813840866089, + "learning_rate": 7.383434570500156e-06, + "loss": 0.9291, + "step": 4691 + }, + { + "epoch": 0.3617021276595745, + "grad_norm": 3.7192230224609375, + "learning_rate": 7.382337019771859e-06, + "loss": 1.0775, + "step": 4692 + }, + { + "epoch": 0.3617792167745914, + "grad_norm": 3.7992069721221924, + "learning_rate": 7.381239320517185e-06, + "loss": 1.0711, + "step": 4693 + }, + { + "epoch": 0.3618563058896084, + "grad_norm": 3.7021334171295166, + "learning_rate": 7.380141472804572e-06, + "loss": 0.9285, + "step": 4694 + }, + { + "epoch": 0.36193339500462535, + "grad_norm": 3.7016396522521973, + "learning_rate": 7.379043476702468e-06, + "loss": 0.9494, + "step": 4695 + }, + { + "epoch": 0.3620104841196423, + "grad_norm": 3.413544178009033, + "learning_rate": 7.377945332279322e-06, + "loss": 0.9474, + "step": 4696 + }, + { + "epoch": 0.3620875732346593, + "grad_norm": 3.475022077560425, + "learning_rate": 7.376847039603601e-06, + "loss": 1.0284, + "step": 4697 + }, + { + "epoch": 0.3621646623496762, + "grad_norm": 4.036199569702148, + "learning_rate": 7.375748598743777e-06, + "loss": 0.9613, + "step": 4698 + }, + { + "epoch": 0.3622417514646932, + "grad_norm": 3.2614598274230957, + "learning_rate": 7.374650009768332e-06, + "loss": 0.9619, + "step": 4699 + }, + { + "epoch": 0.36231884057971014, + "grad_norm": 3.5047171115875244, + "learning_rate": 7.373551272745757e-06, + "loss": 0.995, + "step": 4700 + }, + { + "epoch": 0.3623959296947271, + "grad_norm": 3.3241288661956787, + "learning_rate": 7.372452387744554e-06, + "loss": 0.8698, + "step": 4701 + }, + { + "epoch": 0.3624730188097441, + "grad_norm": 3.509653091430664, + "learning_rate": 7.371353354833231e-06, + "loss": 1.053, + "step": 4702 + }, + { + "epoch": 0.362550107924761, + "grad_norm": 4.059185981750488, + "learning_rate": 7.370254174080309e-06, + "loss": 1.1282, + "step": 4703 + }, + { + "epoch": 0.36262719703977797, + "grad_norm": 3.687779426574707, + "learning_rate": 7.3691548455543146e-06, + "loss": 0.9489, + "step": 4704 + }, + { + "epoch": 0.36270428615479494, + "grad_norm": 3.582054615020752, + "learning_rate": 7.368055369323787e-06, + "loss": 0.9714, + "step": 4705 + }, + { + "epoch": 0.3627813752698119, + "grad_norm": 3.6636664867401123, + "learning_rate": 7.36695574545727e-06, + "loss": 1.0117, + "step": 4706 + }, + { + "epoch": 0.3628584643848289, + "grad_norm": 3.7531189918518066, + "learning_rate": 7.365855974023321e-06, + "loss": 0.9775, + "step": 4707 + }, + { + "epoch": 0.3629355534998458, + "grad_norm": 3.9122326374053955, + "learning_rate": 7.364756055090506e-06, + "loss": 1.0191, + "step": 4708 + }, + { + "epoch": 0.36301264261486277, + "grad_norm": 4.153857231140137, + "learning_rate": 7.363655988727398e-06, + "loss": 1.0468, + "step": 4709 + }, + { + "epoch": 0.36308973172987974, + "grad_norm": 3.230003833770752, + "learning_rate": 7.36255577500258e-06, + "loss": 0.8938, + "step": 4710 + }, + { + "epoch": 0.3631668208448967, + "grad_norm": 3.6824822425842285, + "learning_rate": 7.3614554139846475e-06, + "loss": 0.9479, + "step": 4711 + }, + { + "epoch": 0.3632439099599137, + "grad_norm": 3.7477197647094727, + "learning_rate": 7.3603549057421975e-06, + "loss": 1.0547, + "step": 4712 + }, + { + "epoch": 0.3633209990749306, + "grad_norm": 3.6179938316345215, + "learning_rate": 7.359254250343846e-06, + "loss": 0.9269, + "step": 4713 + }, + { + "epoch": 0.36339808818994757, + "grad_norm": 3.606153964996338, + "learning_rate": 7.35815344785821e-06, + "loss": 1.08, + "step": 4714 + }, + { + "epoch": 0.36347517730496454, + "grad_norm": 3.634908676147461, + "learning_rate": 7.35705249835392e-06, + "loss": 0.8419, + "step": 4715 + }, + { + "epoch": 0.3635522664199815, + "grad_norm": 3.726848602294922, + "learning_rate": 7.355951401899614e-06, + "loss": 1.0174, + "step": 4716 + }, + { + "epoch": 0.3636293555349985, + "grad_norm": 3.769972562789917, + "learning_rate": 7.35485015856394e-06, + "loss": 0.9593, + "step": 4717 + }, + { + "epoch": 0.3637064446500154, + "grad_norm": 4.418411731719971, + "learning_rate": 7.353748768415554e-06, + "loss": 1.0226, + "step": 4718 + }, + { + "epoch": 0.36378353376503236, + "grad_norm": 3.3944032192230225, + "learning_rate": 7.352647231523124e-06, + "loss": 0.9723, + "step": 4719 + }, + { + "epoch": 0.36386062288004933, + "grad_norm": 3.6218276023864746, + "learning_rate": 7.351545547955321e-06, + "loss": 1.0031, + "step": 4720 + }, + { + "epoch": 0.3639377119950663, + "grad_norm": 3.672168016433716, + "learning_rate": 7.350443717780834e-06, + "loss": 0.9893, + "step": 4721 + }, + { + "epoch": 0.3640148011100833, + "grad_norm": 3.8659307956695557, + "learning_rate": 7.3493417410683545e-06, + "loss": 0.9818, + "step": 4722 + }, + { + "epoch": 0.3640918902251002, + "grad_norm": 3.814786672592163, + "learning_rate": 7.3482396178865846e-06, + "loss": 1.0346, + "step": 4723 + }, + { + "epoch": 0.36416897934011716, + "grad_norm": 3.595031976699829, + "learning_rate": 7.347137348304237e-06, + "loss": 0.9958, + "step": 4724 + }, + { + "epoch": 0.36424606845513413, + "grad_norm": 3.76802921295166, + "learning_rate": 7.346034932390031e-06, + "loss": 0.9374, + "step": 4725 + }, + { + "epoch": 0.3643231575701511, + "grad_norm": 3.507568120956421, + "learning_rate": 7.3449323702126985e-06, + "loss": 0.9716, + "step": 4726 + }, + { + "epoch": 0.36440024668516807, + "grad_norm": 3.8154513835906982, + "learning_rate": 7.343829661840977e-06, + "loss": 1.0059, + "step": 4727 + }, + { + "epoch": 0.364477335800185, + "grad_norm": 3.319742202758789, + "learning_rate": 7.342726807343615e-06, + "loss": 0.9758, + "step": 4728 + }, + { + "epoch": 0.36455442491520196, + "grad_norm": 3.301845073699951, + "learning_rate": 7.341623806789371e-06, + "loss": 0.9537, + "step": 4729 + }, + { + "epoch": 0.36463151403021893, + "grad_norm": 3.977522611618042, + "learning_rate": 7.340520660247008e-06, + "loss": 1.0368, + "step": 4730 + }, + { + "epoch": 0.3647086031452359, + "grad_norm": 3.8079488277435303, + "learning_rate": 7.339417367785304e-06, + "loss": 0.9951, + "step": 4731 + }, + { + "epoch": 0.36478569226025287, + "grad_norm": 3.3151187896728516, + "learning_rate": 7.338313929473046e-06, + "loss": 1.0278, + "step": 4732 + }, + { + "epoch": 0.3648627813752698, + "grad_norm": 3.971615791320801, + "learning_rate": 7.337210345379022e-06, + "loss": 0.9549, + "step": 4733 + }, + { + "epoch": 0.36493987049028676, + "grad_norm": 3.8018195629119873, + "learning_rate": 7.3361066155720385e-06, + "loss": 1.0072, + "step": 4734 + }, + { + "epoch": 0.3650169596053037, + "grad_norm": 3.6749868392944336, + "learning_rate": 7.335002740120907e-06, + "loss": 0.9764, + "step": 4735 + }, + { + "epoch": 0.3650940487203207, + "grad_norm": 3.655430316925049, + "learning_rate": 7.333898719094448e-06, + "loss": 0.9881, + "step": 4736 + }, + { + "epoch": 0.36517113783533767, + "grad_norm": 3.9375107288360596, + "learning_rate": 7.3327945525614906e-06, + "loss": 1.0533, + "step": 4737 + }, + { + "epoch": 0.36524822695035464, + "grad_norm": 3.5556159019470215, + "learning_rate": 7.3316902405908756e-06, + "loss": 0.9244, + "step": 4738 + }, + { + "epoch": 0.36532531606537155, + "grad_norm": 3.914153575897217, + "learning_rate": 7.33058578325145e-06, + "loss": 1.0044, + "step": 4739 + }, + { + "epoch": 0.3654024051803885, + "grad_norm": 3.557025194168091, + "learning_rate": 7.329481180612072e-06, + "loss": 0.8395, + "step": 4740 + }, + { + "epoch": 0.3654794942954055, + "grad_norm": 3.5067689418792725, + "learning_rate": 7.328376432741605e-06, + "loss": 1.0821, + "step": 4741 + }, + { + "epoch": 0.36555658341042246, + "grad_norm": 3.890801191329956, + "learning_rate": 7.327271539708927e-06, + "loss": 1.0238, + "step": 4742 + }, + { + "epoch": 0.36563367252543943, + "grad_norm": 4.020145893096924, + "learning_rate": 7.326166501582922e-06, + "loss": 0.9887, + "step": 4743 + }, + { + "epoch": 0.36571076164045635, + "grad_norm": 3.579129219055176, + "learning_rate": 7.325061318432482e-06, + "loss": 1.0122, + "step": 4744 + }, + { + "epoch": 0.3657878507554733, + "grad_norm": 3.599555730819702, + "learning_rate": 7.323955990326514e-06, + "loss": 1.0213, + "step": 4745 + }, + { + "epoch": 0.3658649398704903, + "grad_norm": 3.7440710067749023, + "learning_rate": 7.322850517333924e-06, + "loss": 1.1071, + "step": 4746 + }, + { + "epoch": 0.36594202898550726, + "grad_norm": 3.8629238605499268, + "learning_rate": 7.321744899523634e-06, + "loss": 1.0812, + "step": 4747 + }, + { + "epoch": 0.36601911810052423, + "grad_norm": 3.8284130096435547, + "learning_rate": 7.320639136964576e-06, + "loss": 0.9624, + "step": 4748 + }, + { + "epoch": 0.36609620721554115, + "grad_norm": 3.523813247680664, + "learning_rate": 7.319533229725685e-06, + "loss": 1.0628, + "step": 4749 + }, + { + "epoch": 0.3661732963305581, + "grad_norm": 3.4013636112213135, + "learning_rate": 7.31842717787591e-06, + "loss": 0.9275, + "step": 4750 + }, + { + "epoch": 0.3662503854455751, + "grad_norm": 3.6408772468566895, + "learning_rate": 7.31732098148421e-06, + "loss": 1.0767, + "step": 4751 + }, + { + "epoch": 0.36632747456059206, + "grad_norm": 3.796874523162842, + "learning_rate": 7.316214640619546e-06, + "loss": 1.013, + "step": 4752 + }, + { + "epoch": 0.36640456367560903, + "grad_norm": 3.552623748779297, + "learning_rate": 7.3151081553508975e-06, + "loss": 0.9438, + "step": 4753 + }, + { + "epoch": 0.36648165279062594, + "grad_norm": 3.6614010334014893, + "learning_rate": 7.314001525747244e-06, + "loss": 0.981, + "step": 4754 + }, + { + "epoch": 0.3665587419056429, + "grad_norm": 3.726024627685547, + "learning_rate": 7.31289475187758e-06, + "loss": 0.9823, + "step": 4755 + }, + { + "epoch": 0.3666358310206599, + "grad_norm": 3.74556303024292, + "learning_rate": 7.311787833810908e-06, + "loss": 0.9622, + "step": 4756 + }, + { + "epoch": 0.36671292013567686, + "grad_norm": 3.351370096206665, + "learning_rate": 7.310680771616238e-06, + "loss": 0.9525, + "step": 4757 + }, + { + "epoch": 0.3667900092506938, + "grad_norm": 3.492908477783203, + "learning_rate": 7.309573565362588e-06, + "loss": 0.9845, + "step": 4758 + }, + { + "epoch": 0.36686709836571074, + "grad_norm": 3.719702959060669, + "learning_rate": 7.308466215118988e-06, + "loss": 0.9873, + "step": 4759 + }, + { + "epoch": 0.3669441874807277, + "grad_norm": 3.778932809829712, + "learning_rate": 7.307358720954476e-06, + "loss": 1.0009, + "step": 4760 + }, + { + "epoch": 0.3670212765957447, + "grad_norm": 3.6195807456970215, + "learning_rate": 7.306251082938096e-06, + "loss": 0.8939, + "step": 4761 + }, + { + "epoch": 0.36709836571076165, + "grad_norm": 3.6259400844573975, + "learning_rate": 7.305143301138908e-06, + "loss": 1.0111, + "step": 4762 + }, + { + "epoch": 0.3671754548257786, + "grad_norm": 4.1105637550354, + "learning_rate": 7.3040353756259726e-06, + "loss": 1.0318, + "step": 4763 + }, + { + "epoch": 0.36725254394079554, + "grad_norm": 4.061164379119873, + "learning_rate": 7.302927306468365e-06, + "loss": 0.9631, + "step": 4764 + }, + { + "epoch": 0.3673296330558125, + "grad_norm": 3.501284122467041, + "learning_rate": 7.301819093735165e-06, + "loss": 1.0098, + "step": 4765 + }, + { + "epoch": 0.3674067221708295, + "grad_norm": 3.7744534015655518, + "learning_rate": 7.3007107374954665e-06, + "loss": 1.1278, + "step": 4766 + }, + { + "epoch": 0.36748381128584645, + "grad_norm": 3.3170313835144043, + "learning_rate": 7.299602237818371e-06, + "loss": 1.0022, + "step": 4767 + }, + { + "epoch": 0.3675609004008634, + "grad_norm": 3.677799701690674, + "learning_rate": 7.298493594772985e-06, + "loss": 1.0452, + "step": 4768 + }, + { + "epoch": 0.36763798951588034, + "grad_norm": 3.811732292175293, + "learning_rate": 7.297384808428428e-06, + "loss": 0.8941, + "step": 4769 + }, + { + "epoch": 0.3677150786308973, + "grad_norm": 4.003880977630615, + "learning_rate": 7.296275878853826e-06, + "loss": 1.0617, + "step": 4770 + }, + { + "epoch": 0.3677921677459143, + "grad_norm": 3.452606439590454, + "learning_rate": 7.295166806118315e-06, + "loss": 0.9734, + "step": 4771 + }, + { + "epoch": 0.36786925686093125, + "grad_norm": 3.6096274852752686, + "learning_rate": 7.294057590291043e-06, + "loss": 0.8906, + "step": 4772 + }, + { + "epoch": 0.3679463459759482, + "grad_norm": 3.7491250038146973, + "learning_rate": 7.2929482314411596e-06, + "loss": 0.959, + "step": 4773 + }, + { + "epoch": 0.36802343509096513, + "grad_norm": 3.7471847534179688, + "learning_rate": 7.291838729637829e-06, + "loss": 1.1626, + "step": 4774 + }, + { + "epoch": 0.3681005242059821, + "grad_norm": 3.6201562881469727, + "learning_rate": 7.290729084950226e-06, + "loss": 0.9828, + "step": 4775 + }, + { + "epoch": 0.3681776133209991, + "grad_norm": 3.8552613258361816, + "learning_rate": 7.289619297447525e-06, + "loss": 0.9679, + "step": 4776 + }, + { + "epoch": 0.36825470243601605, + "grad_norm": 3.5603456497192383, + "learning_rate": 7.288509367198922e-06, + "loss": 0.9275, + "step": 4777 + }, + { + "epoch": 0.368331791551033, + "grad_norm": 3.6187384128570557, + "learning_rate": 7.28739929427361e-06, + "loss": 1.0267, + "step": 4778 + }, + { + "epoch": 0.36840888066604993, + "grad_norm": 3.5130581855773926, + "learning_rate": 7.2862890787408e-06, + "loss": 0.9113, + "step": 4779 + }, + { + "epoch": 0.3684859697810669, + "grad_norm": 3.5186641216278076, + "learning_rate": 7.2851787206697075e-06, + "loss": 0.9837, + "step": 4780 + }, + { + "epoch": 0.3685630588960839, + "grad_norm": 3.7700576782226562, + "learning_rate": 7.284068220129557e-06, + "loss": 1.0265, + "step": 4781 + }, + { + "epoch": 0.36864014801110084, + "grad_norm": 3.770547389984131, + "learning_rate": 7.282957577189581e-06, + "loss": 1.0115, + "step": 4782 + }, + { + "epoch": 0.3687172371261178, + "grad_norm": 3.8225295543670654, + "learning_rate": 7.281846791919025e-06, + "loss": 1.1445, + "step": 4783 + }, + { + "epoch": 0.36879432624113473, + "grad_norm": 3.4760961532592773, + "learning_rate": 7.280735864387138e-06, + "loss": 0.9826, + "step": 4784 + }, + { + "epoch": 0.3688714153561517, + "grad_norm": 3.5458803176879883, + "learning_rate": 7.279624794663184e-06, + "loss": 0.9137, + "step": 4785 + }, + { + "epoch": 0.36894850447116867, + "grad_norm": 3.6354055404663086, + "learning_rate": 7.27851358281643e-06, + "loss": 0.9363, + "step": 4786 + }, + { + "epoch": 0.36902559358618564, + "grad_norm": 3.4249465465545654, + "learning_rate": 7.277402228916155e-06, + "loss": 0.9732, + "step": 4787 + }, + { + "epoch": 0.3691026827012026, + "grad_norm": 4.003117561340332, + "learning_rate": 7.2762907330316446e-06, + "loss": 1.0536, + "step": 4788 + }, + { + "epoch": 0.3691797718162195, + "grad_norm": 3.547496795654297, + "learning_rate": 7.275179095232197e-06, + "loss": 0.9676, + "step": 4789 + }, + { + "epoch": 0.3692568609312365, + "grad_norm": 3.738219976425171, + "learning_rate": 7.274067315587116e-06, + "loss": 1.0024, + "step": 4790 + }, + { + "epoch": 0.36933395004625347, + "grad_norm": 4.052086353302002, + "learning_rate": 7.272955394165717e-06, + "loss": 0.9979, + "step": 4791 + }, + { + "epoch": 0.36941103916127044, + "grad_norm": 3.6811420917510986, + "learning_rate": 7.27184333103732e-06, + "loss": 1.0432, + "step": 4792 + }, + { + "epoch": 0.3694881282762874, + "grad_norm": 3.4397454261779785, + "learning_rate": 7.270731126271257e-06, + "loss": 0.8792, + "step": 4793 + }, + { + "epoch": 0.3695652173913043, + "grad_norm": 3.70704984664917, + "learning_rate": 7.26961877993687e-06, + "loss": 0.8806, + "step": 4794 + }, + { + "epoch": 0.3696423065063213, + "grad_norm": 4.066430568695068, + "learning_rate": 7.268506292103505e-06, + "loss": 1.0428, + "step": 4795 + }, + { + "epoch": 0.36971939562133826, + "grad_norm": 3.72969388961792, + "learning_rate": 7.267393662840525e-06, + "loss": 1.0417, + "step": 4796 + }, + { + "epoch": 0.36979648473635524, + "grad_norm": 3.8739657402038574, + "learning_rate": 7.26628089221729e-06, + "loss": 1.0413, + "step": 4797 + }, + { + "epoch": 0.3698735738513722, + "grad_norm": 3.4731998443603516, + "learning_rate": 7.265167980303179e-06, + "loss": 1.0326, + "step": 4798 + }, + { + "epoch": 0.3699506629663891, + "grad_norm": 3.697890281677246, + "learning_rate": 7.264054927167577e-06, + "loss": 1.0505, + "step": 4799 + }, + { + "epoch": 0.3700277520814061, + "grad_norm": 3.670499086380005, + "learning_rate": 7.2629417328798755e-06, + "loss": 0.9704, + "step": 4800 + }, + { + "epoch": 0.37010484119642306, + "grad_norm": 3.542783260345459, + "learning_rate": 7.2618283975094785e-06, + "loss": 1.0082, + "step": 4801 + }, + { + "epoch": 0.37018193031144003, + "grad_norm": 3.7526941299438477, + "learning_rate": 7.260714921125795e-06, + "loss": 1.05, + "step": 4802 + }, + { + "epoch": 0.370259019426457, + "grad_norm": 3.547579050064087, + "learning_rate": 7.259601303798243e-06, + "loss": 0.9653, + "step": 4803 + }, + { + "epoch": 0.3703361085414739, + "grad_norm": 3.573000192642212, + "learning_rate": 7.258487545596254e-06, + "loss": 0.9571, + "step": 4804 + }, + { + "epoch": 0.3704131976564909, + "grad_norm": 3.745025873184204, + "learning_rate": 7.257373646589262e-06, + "loss": 1.0854, + "step": 4805 + }, + { + "epoch": 0.37049028677150786, + "grad_norm": 3.414745330810547, + "learning_rate": 7.256259606846715e-06, + "loss": 0.9017, + "step": 4806 + }, + { + "epoch": 0.37056737588652483, + "grad_norm": 3.3791048526763916, + "learning_rate": 7.255145426438068e-06, + "loss": 0.946, + "step": 4807 + }, + { + "epoch": 0.3706444650015418, + "grad_norm": 3.603844165802002, + "learning_rate": 7.254031105432781e-06, + "loss": 0.956, + "step": 4808 + }, + { + "epoch": 0.3707215541165587, + "grad_norm": 4.20005464553833, + "learning_rate": 7.252916643900331e-06, + "loss": 1.0829, + "step": 4809 + }, + { + "epoch": 0.3707986432315757, + "grad_norm": 3.4412059783935547, + "learning_rate": 7.251802041910194e-06, + "loss": 1.0052, + "step": 4810 + }, + { + "epoch": 0.37087573234659266, + "grad_norm": 3.931774616241455, + "learning_rate": 7.250687299531864e-06, + "loss": 1.0152, + "step": 4811 + }, + { + "epoch": 0.3709528214616096, + "grad_norm": 3.7596309185028076, + "learning_rate": 7.249572416834838e-06, + "loss": 0.9358, + "step": 4812 + }, + { + "epoch": 0.3710299105766266, + "grad_norm": 3.739588975906372, + "learning_rate": 7.2484573938886215e-06, + "loss": 0.9234, + "step": 4813 + }, + { + "epoch": 0.3711069996916435, + "grad_norm": 3.4759035110473633, + "learning_rate": 7.2473422307627304e-06, + "loss": 1.0292, + "step": 4814 + }, + { + "epoch": 0.3711840888066605, + "grad_norm": 3.456209659576416, + "learning_rate": 7.246226927526693e-06, + "loss": 1.0245, + "step": 4815 + }, + { + "epoch": 0.37126117792167745, + "grad_norm": 3.7224795818328857, + "learning_rate": 7.245111484250038e-06, + "loss": 0.9233, + "step": 4816 + }, + { + "epoch": 0.3713382670366944, + "grad_norm": 3.3582236766815186, + "learning_rate": 7.243995901002312e-06, + "loss": 1.0253, + "step": 4817 + }, + { + "epoch": 0.3714153561517114, + "grad_norm": 3.8575472831726074, + "learning_rate": 7.242880177853062e-06, + "loss": 1.0134, + "step": 4818 + }, + { + "epoch": 0.3714924452667283, + "grad_norm": 3.423126697540283, + "learning_rate": 7.241764314871848e-06, + "loss": 0.9355, + "step": 4819 + }, + { + "epoch": 0.3715695343817453, + "grad_norm": 3.5420002937316895, + "learning_rate": 7.240648312128242e-06, + "loss": 0.9646, + "step": 4820 + }, + { + "epoch": 0.37164662349676225, + "grad_norm": 3.214689254760742, + "learning_rate": 7.239532169691817e-06, + "loss": 0.9524, + "step": 4821 + }, + { + "epoch": 0.3717237126117792, + "grad_norm": 3.500775098800659, + "learning_rate": 7.23841588763216e-06, + "loss": 1.0106, + "step": 4822 + }, + { + "epoch": 0.3718008017267962, + "grad_norm": 3.302901029586792, + "learning_rate": 7.237299466018866e-06, + "loss": 0.8822, + "step": 4823 + }, + { + "epoch": 0.37187789084181316, + "grad_norm": 3.5195157527923584, + "learning_rate": 7.236182904921536e-06, + "loss": 1.0378, + "step": 4824 + }, + { + "epoch": 0.3719549799568301, + "grad_norm": 3.8534128665924072, + "learning_rate": 7.2350662044097854e-06, + "loss": 1.0471, + "step": 4825 + }, + { + "epoch": 0.37203206907184705, + "grad_norm": 3.6526763439178467, + "learning_rate": 7.233949364553232e-06, + "loss": 1.0511, + "step": 4826 + }, + { + "epoch": 0.372109158186864, + "grad_norm": 3.9592645168304443, + "learning_rate": 7.2328323854215044e-06, + "loss": 0.9967, + "step": 4827 + }, + { + "epoch": 0.372186247301881, + "grad_norm": 3.569300889968872, + "learning_rate": 7.231715267084243e-06, + "loss": 0.9936, + "step": 4828 + }, + { + "epoch": 0.37226333641689796, + "grad_norm": 3.6688168048858643, + "learning_rate": 7.2305980096110925e-06, + "loss": 1.1023, + "step": 4829 + }, + { + "epoch": 0.3723404255319149, + "grad_norm": 3.4594991207122803, + "learning_rate": 7.229480613071709e-06, + "loss": 0.9457, + "step": 4830 + }, + { + "epoch": 0.37241751464693185, + "grad_norm": 3.949270009994507, + "learning_rate": 7.228363077535756e-06, + "loss": 0.9467, + "step": 4831 + }, + { + "epoch": 0.3724946037619488, + "grad_norm": 3.7275776863098145, + "learning_rate": 7.227245403072905e-06, + "loss": 0.909, + "step": 4832 + }, + { + "epoch": 0.3725716928769658, + "grad_norm": 3.561305046081543, + "learning_rate": 7.2261275897528405e-06, + "loss": 1.1122, + "step": 4833 + }, + { + "epoch": 0.37264878199198276, + "grad_norm": 3.612966299057007, + "learning_rate": 7.225009637645248e-06, + "loss": 1.0967, + "step": 4834 + }, + { + "epoch": 0.3727258711069997, + "grad_norm": 3.454404354095459, + "learning_rate": 7.223891546819829e-06, + "loss": 1.0302, + "step": 4835 + }, + { + "epoch": 0.37280296022201664, + "grad_norm": 3.8778316974639893, + "learning_rate": 7.222773317346291e-06, + "loss": 1.0213, + "step": 4836 + }, + { + "epoch": 0.3728800493370336, + "grad_norm": 3.4762234687805176, + "learning_rate": 7.221654949294348e-06, + "loss": 0.9744, + "step": 4837 + }, + { + "epoch": 0.3729571384520506, + "grad_norm": 3.4373879432678223, + "learning_rate": 7.220536442733724e-06, + "loss": 0.8598, + "step": 4838 + }, + { + "epoch": 0.37303422756706756, + "grad_norm": 4.246344089508057, + "learning_rate": 7.219417797734155e-06, + "loss": 1.0942, + "step": 4839 + }, + { + "epoch": 0.37311131668208447, + "grad_norm": 3.5066323280334473, + "learning_rate": 7.2182990143653795e-06, + "loss": 1.0291, + "step": 4840 + }, + { + "epoch": 0.37318840579710144, + "grad_norm": 3.613722562789917, + "learning_rate": 7.217180092697152e-06, + "loss": 0.9715, + "step": 4841 + }, + { + "epoch": 0.3732654949121184, + "grad_norm": 3.9713635444641113, + "learning_rate": 7.216061032799225e-06, + "loss": 1.0856, + "step": 4842 + }, + { + "epoch": 0.3733425840271354, + "grad_norm": 3.5957889556884766, + "learning_rate": 7.2149418347413724e-06, + "loss": 0.9641, + "step": 4843 + }, + { + "epoch": 0.37341967314215235, + "grad_norm": 3.5281994342803955, + "learning_rate": 7.213822498593368e-06, + "loss": 1.0026, + "step": 4844 + }, + { + "epoch": 0.37349676225716927, + "grad_norm": 4.489894390106201, + "learning_rate": 7.212703024424995e-06, + "loss": 1.061, + "step": 4845 + }, + { + "epoch": 0.37357385137218624, + "grad_norm": 4.678754806518555, + "learning_rate": 7.211583412306049e-06, + "loss": 0.9971, + "step": 4846 + }, + { + "epoch": 0.3736509404872032, + "grad_norm": 4.363256454467773, + "learning_rate": 7.2104636623063315e-06, + "loss": 1.0027, + "step": 4847 + }, + { + "epoch": 0.3737280296022202, + "grad_norm": 3.5184109210968018, + "learning_rate": 7.209343774495652e-06, + "loss": 1.0354, + "step": 4848 + }, + { + "epoch": 0.37380511871723715, + "grad_norm": 3.521500587463379, + "learning_rate": 7.208223748943832e-06, + "loss": 0.9084, + "step": 4849 + }, + { + "epoch": 0.37388220783225407, + "grad_norm": 3.56219744682312, + "learning_rate": 7.207103585720697e-06, + "loss": 1.0089, + "step": 4850 + }, + { + "epoch": 0.37395929694727104, + "grad_norm": 3.738111734390259, + "learning_rate": 7.2059832848960855e-06, + "loss": 1.0386, + "step": 4851 + }, + { + "epoch": 0.374036386062288, + "grad_norm": 4.135499954223633, + "learning_rate": 7.204862846539841e-06, + "loss": 1.0668, + "step": 4852 + }, + { + "epoch": 0.374113475177305, + "grad_norm": 3.822765827178955, + "learning_rate": 7.2037422707218165e-06, + "loss": 1.0167, + "step": 4853 + }, + { + "epoch": 0.37419056429232195, + "grad_norm": 3.540203094482422, + "learning_rate": 7.202621557511874e-06, + "loss": 0.9056, + "step": 4854 + }, + { + "epoch": 0.37426765340733886, + "grad_norm": 3.5464091300964355, + "learning_rate": 7.201500706979886e-06, + "loss": 0.9215, + "step": 4855 + }, + { + "epoch": 0.37434474252235583, + "grad_norm": 3.7455790042877197, + "learning_rate": 7.20037971919573e-06, + "loss": 0.9164, + "step": 4856 + }, + { + "epoch": 0.3744218316373728, + "grad_norm": 3.647853374481201, + "learning_rate": 7.199258594229297e-06, + "loss": 1.0188, + "step": 4857 + }, + { + "epoch": 0.3744989207523898, + "grad_norm": 3.654245615005493, + "learning_rate": 7.198137332150479e-06, + "loss": 1.0136, + "step": 4858 + }, + { + "epoch": 0.37457600986740675, + "grad_norm": 3.327587127685547, + "learning_rate": 7.197015933029184e-06, + "loss": 0.9465, + "step": 4859 + }, + { + "epoch": 0.37465309898242366, + "grad_norm": 4.306604385375977, + "learning_rate": 7.195894396935324e-06, + "loss": 1.0239, + "step": 4860 + }, + { + "epoch": 0.37473018809744063, + "grad_norm": 4.178218841552734, + "learning_rate": 7.194772723938819e-06, + "loss": 1.0016, + "step": 4861 + }, + { + "epoch": 0.3748072772124576, + "grad_norm": 4.198592185974121, + "learning_rate": 7.193650914109603e-06, + "loss": 1.0389, + "step": 4862 + }, + { + "epoch": 0.37488436632747457, + "grad_norm": 3.9807186126708984, + "learning_rate": 7.192528967517615e-06, + "loss": 0.9948, + "step": 4863 + }, + { + "epoch": 0.37496145544249154, + "grad_norm": 3.558039426803589, + "learning_rate": 7.1914068842328e-06, + "loss": 1.0135, + "step": 4864 + }, + { + "epoch": 0.37503854455750846, + "grad_norm": 3.5846774578094482, + "learning_rate": 7.190284664325116e-06, + "loss": 1.0469, + "step": 4865 + }, + { + "epoch": 0.37511563367252543, + "grad_norm": 3.2694685459136963, + "learning_rate": 7.189162307864525e-06, + "loss": 0.9575, + "step": 4866 + }, + { + "epoch": 0.3751927227875424, + "grad_norm": 3.3585803508758545, + "learning_rate": 7.188039814921004e-06, + "loss": 0.9909, + "step": 4867 + }, + { + "epoch": 0.37526981190255937, + "grad_norm": 3.606834650039673, + "learning_rate": 7.186917185564534e-06, + "loss": 0.999, + "step": 4868 + }, + { + "epoch": 0.37534690101757634, + "grad_norm": 3.2820441722869873, + "learning_rate": 7.185794419865102e-06, + "loss": 0.9831, + "step": 4869 + }, + { + "epoch": 0.37542399013259325, + "grad_norm": 3.559875011444092, + "learning_rate": 7.184671517892707e-06, + "loss": 0.9819, + "step": 4870 + }, + { + "epoch": 0.3755010792476102, + "grad_norm": 3.5334460735321045, + "learning_rate": 7.183548479717361e-06, + "loss": 1.031, + "step": 4871 + }, + { + "epoch": 0.3755781683626272, + "grad_norm": 3.8719050884246826, + "learning_rate": 7.1824253054090735e-06, + "loss": 0.9976, + "step": 4872 + }, + { + "epoch": 0.37565525747764417, + "grad_norm": 3.7610104084014893, + "learning_rate": 7.1813019950378724e-06, + "loss": 1.1075, + "step": 4873 + }, + { + "epoch": 0.37573234659266114, + "grad_norm": 3.8099474906921387, + "learning_rate": 7.1801785486737884e-06, + "loss": 1.114, + "step": 4874 + }, + { + "epoch": 0.37580943570767805, + "grad_norm": 3.353347063064575, + "learning_rate": 7.1790549663868644e-06, + "loss": 0.9258, + "step": 4875 + }, + { + "epoch": 0.375886524822695, + "grad_norm": 3.220855236053467, + "learning_rate": 7.1779312482471475e-06, + "loss": 0.9835, + "step": 4876 + }, + { + "epoch": 0.375963613937712, + "grad_norm": 3.72371244430542, + "learning_rate": 7.176807394324697e-06, + "loss": 1.0217, + "step": 4877 + }, + { + "epoch": 0.37604070305272896, + "grad_norm": 3.4218595027923584, + "learning_rate": 7.1756834046895815e-06, + "loss": 0.9421, + "step": 4878 + }, + { + "epoch": 0.37611779216774593, + "grad_norm": 3.7744340896606445, + "learning_rate": 7.174559279411872e-06, + "loss": 0.9703, + "step": 4879 + }, + { + "epoch": 0.37619488128276285, + "grad_norm": 3.9162957668304443, + "learning_rate": 7.173435018561654e-06, + "loss": 1.1117, + "step": 4880 + }, + { + "epoch": 0.3762719703977798, + "grad_norm": 3.7875723838806152, + "learning_rate": 7.17231062220902e-06, + "loss": 1.0843, + "step": 4881 + }, + { + "epoch": 0.3763490595127968, + "grad_norm": 3.8338139057159424, + "learning_rate": 7.171186090424069e-06, + "loss": 1.0342, + "step": 4882 + }, + { + "epoch": 0.37642614862781376, + "grad_norm": 3.3888375759124756, + "learning_rate": 7.170061423276911e-06, + "loss": 1.0047, + "step": 4883 + }, + { + "epoch": 0.37650323774283073, + "grad_norm": 3.7450151443481445, + "learning_rate": 7.168936620837661e-06, + "loss": 0.9721, + "step": 4884 + }, + { + "epoch": 0.37658032685784765, + "grad_norm": 3.4551503658294678, + "learning_rate": 7.167811683176446e-06, + "loss": 0.8549, + "step": 4885 + }, + { + "epoch": 0.3766574159728646, + "grad_norm": 3.826219320297241, + "learning_rate": 7.166686610363399e-06, + "loss": 1.0317, + "step": 4886 + }, + { + "epoch": 0.3767345050878816, + "grad_norm": 3.661606788635254, + "learning_rate": 7.165561402468666e-06, + "loss": 0.9179, + "step": 4887 + }, + { + "epoch": 0.37681159420289856, + "grad_norm": 3.9215991497039795, + "learning_rate": 7.164436059562393e-06, + "loss": 1.1039, + "step": 4888 + }, + { + "epoch": 0.37688868331791553, + "grad_norm": 3.983035087585449, + "learning_rate": 7.163310581714744e-06, + "loss": 0.9932, + "step": 4889 + }, + { + "epoch": 0.37696577243293244, + "grad_norm": 3.3933942317962646, + "learning_rate": 7.162184968995882e-06, + "loss": 0.9746, + "step": 4890 + }, + { + "epoch": 0.3770428615479494, + "grad_norm": 3.4548404216766357, + "learning_rate": 7.161059221475985e-06, + "loss": 0.8202, + "step": 4891 + }, + { + "epoch": 0.3771199506629664, + "grad_norm": 3.530745267868042, + "learning_rate": 7.15993333922524e-06, + "loss": 0.8811, + "step": 4892 + }, + { + "epoch": 0.37719703977798336, + "grad_norm": 3.4644525051116943, + "learning_rate": 7.158807322313837e-06, + "loss": 1.0706, + "step": 4893 + }, + { + "epoch": 0.3772741288930003, + "grad_norm": 3.4556727409362793, + "learning_rate": 7.157681170811979e-06, + "loss": 0.9343, + "step": 4894 + }, + { + "epoch": 0.37735121800801724, + "grad_norm": 3.663114547729492, + "learning_rate": 7.156554884789874e-06, + "loss": 1.0052, + "step": 4895 + }, + { + "epoch": 0.3774283071230342, + "grad_norm": 3.9174418449401855, + "learning_rate": 7.155428464317741e-06, + "loss": 0.9232, + "step": 4896 + }, + { + "epoch": 0.3775053962380512, + "grad_norm": 3.8079843521118164, + "learning_rate": 7.1543019094658074e-06, + "loss": 0.9495, + "step": 4897 + }, + { + "epoch": 0.37758248535306815, + "grad_norm": 3.580113649368286, + "learning_rate": 7.153175220304305e-06, + "loss": 1.0305, + "step": 4898 + }, + { + "epoch": 0.3776595744680851, + "grad_norm": 3.540668487548828, + "learning_rate": 7.152048396903479e-06, + "loss": 0.959, + "step": 4899 + }, + { + "epoch": 0.37773666358310204, + "grad_norm": 3.402894973754883, + "learning_rate": 7.150921439333584e-06, + "loss": 0.9978, + "step": 4900 + }, + { + "epoch": 0.377813752698119, + "grad_norm": 3.550427198410034, + "learning_rate": 7.149794347664876e-06, + "loss": 1.0077, + "step": 4901 + }, + { + "epoch": 0.377890841813136, + "grad_norm": 3.6682233810424805, + "learning_rate": 7.148667121967625e-06, + "loss": 0.9905, + "step": 4902 + }, + { + "epoch": 0.37796793092815295, + "grad_norm": 3.8685266971588135, + "learning_rate": 7.147539762312107e-06, + "loss": 1.0975, + "step": 4903 + }, + { + "epoch": 0.3780450200431699, + "grad_norm": 3.448533773422241, + "learning_rate": 7.146412268768605e-06, + "loss": 0.857, + "step": 4904 + }, + { + "epoch": 0.37812210915818684, + "grad_norm": 3.861265182495117, + "learning_rate": 7.145284641407418e-06, + "loss": 1.0235, + "step": 4905 + }, + { + "epoch": 0.3781991982732038, + "grad_norm": 3.419316053390503, + "learning_rate": 7.144156880298843e-06, + "loss": 1.0753, + "step": 4906 + }, + { + "epoch": 0.3782762873882208, + "grad_norm": 3.76204776763916, + "learning_rate": 7.143028985513191e-06, + "loss": 0.9979, + "step": 4907 + }, + { + "epoch": 0.37835337650323775, + "grad_norm": 3.591517686843872, + "learning_rate": 7.141900957120781e-06, + "loss": 1.0564, + "step": 4908 + }, + { + "epoch": 0.3784304656182547, + "grad_norm": 3.76042103767395, + "learning_rate": 7.140772795191939e-06, + "loss": 1.0371, + "step": 4909 + }, + { + "epoch": 0.3785075547332717, + "grad_norm": 4.168672561645508, + "learning_rate": 7.139644499797e-06, + "loss": 1.0409, + "step": 4910 + }, + { + "epoch": 0.3785846438482886, + "grad_norm": 3.331540822982788, + "learning_rate": 7.13851607100631e-06, + "loss": 0.9319, + "step": 4911 + }, + { + "epoch": 0.3786617329633056, + "grad_norm": 3.5602786540985107, + "learning_rate": 7.137387508890218e-06, + "loss": 0.8784, + "step": 4912 + }, + { + "epoch": 0.37873882207832255, + "grad_norm": 3.6382076740264893, + "learning_rate": 7.136258813519085e-06, + "loss": 1.0411, + "step": 4913 + }, + { + "epoch": 0.3788159111933395, + "grad_norm": 3.845329761505127, + "learning_rate": 7.135129984963277e-06, + "loss": 1.0158, + "step": 4914 + }, + { + "epoch": 0.3788930003083565, + "grad_norm": 3.546248197555542, + "learning_rate": 7.134001023293173e-06, + "loss": 1.044, + "step": 4915 + }, + { + "epoch": 0.3789700894233734, + "grad_norm": 3.4055960178375244, + "learning_rate": 7.132871928579159e-06, + "loss": 0.9337, + "step": 4916 + }, + { + "epoch": 0.3790471785383904, + "grad_norm": 3.8326213359832764, + "learning_rate": 7.131742700891626e-06, + "loss": 1.0974, + "step": 4917 + }, + { + "epoch": 0.37912426765340734, + "grad_norm": 3.4271841049194336, + "learning_rate": 7.130613340300976e-06, + "loss": 0.9532, + "step": 4918 + }, + { + "epoch": 0.3792013567684243, + "grad_norm": 3.5200610160827637, + "learning_rate": 7.1294838468776195e-06, + "loss": 1.0648, + "step": 4919 + }, + { + "epoch": 0.3792784458834413, + "grad_norm": 3.484342098236084, + "learning_rate": 7.128354220691973e-06, + "loss": 0.8582, + "step": 4920 + }, + { + "epoch": 0.3793555349984582, + "grad_norm": 3.4747977256774902, + "learning_rate": 7.127224461814465e-06, + "loss": 0.9661, + "step": 4921 + }, + { + "epoch": 0.37943262411347517, + "grad_norm": 3.514315366744995, + "learning_rate": 7.126094570315527e-06, + "loss": 1.0848, + "step": 4922 + }, + { + "epoch": 0.37950971322849214, + "grad_norm": 3.6075520515441895, + "learning_rate": 7.124964546265606e-06, + "loss": 1.0381, + "step": 4923 + }, + { + "epoch": 0.3795868023435091, + "grad_norm": 3.4969682693481445, + "learning_rate": 7.1238343897351505e-06, + "loss": 0.9393, + "step": 4924 + }, + { + "epoch": 0.3796638914585261, + "grad_norm": 3.8373172283172607, + "learning_rate": 7.12270410079462e-06, + "loss": 1.1534, + "step": 4925 + }, + { + "epoch": 0.379740980573543, + "grad_norm": 4.133366584777832, + "learning_rate": 7.121573679514484e-06, + "loss": 1.1071, + "step": 4926 + }, + { + "epoch": 0.37981806968855997, + "grad_norm": 3.4957773685455322, + "learning_rate": 7.120443125965215e-06, + "loss": 0.9435, + "step": 4927 + }, + { + "epoch": 0.37989515880357694, + "grad_norm": 3.298091411590576, + "learning_rate": 7.1193124402172995e-06, + "loss": 0.8858, + "step": 4928 + }, + { + "epoch": 0.3799722479185939, + "grad_norm": 3.720797061920166, + "learning_rate": 7.118181622341232e-06, + "loss": 1.0036, + "step": 4929 + }, + { + "epoch": 0.3800493370336109, + "grad_norm": 3.520009994506836, + "learning_rate": 7.117050672407507e-06, + "loss": 0.9223, + "step": 4930 + }, + { + "epoch": 0.3801264261486278, + "grad_norm": 3.883283853530884, + "learning_rate": 7.115919590486638e-06, + "loss": 0.8866, + "step": 4931 + }, + { + "epoch": 0.38020351526364476, + "grad_norm": 3.637392520904541, + "learning_rate": 7.114788376649143e-06, + "loss": 0.9867, + "step": 4932 + }, + { + "epoch": 0.38028060437866174, + "grad_norm": 3.6200270652770996, + "learning_rate": 7.113657030965544e-06, + "loss": 0.9339, + "step": 4933 + }, + { + "epoch": 0.3803576934936787, + "grad_norm": 3.7726197242736816, + "learning_rate": 7.1125255535063766e-06, + "loss": 0.9902, + "step": 4934 + }, + { + "epoch": 0.3804347826086957, + "grad_norm": 3.6421115398406982, + "learning_rate": 7.111393944342182e-06, + "loss": 1.0895, + "step": 4935 + }, + { + "epoch": 0.3805118717237126, + "grad_norm": 3.6255416870117188, + "learning_rate": 7.110262203543509e-06, + "loss": 0.9478, + "step": 4936 + }, + { + "epoch": 0.38058896083872956, + "grad_norm": 3.6377484798431396, + "learning_rate": 7.10913033118092e-06, + "loss": 1.0047, + "step": 4937 + }, + { + "epoch": 0.38066604995374653, + "grad_norm": 3.65698504447937, + "learning_rate": 7.107998327324975e-06, + "loss": 1.0363, + "step": 4938 + }, + { + "epoch": 0.3807431390687635, + "grad_norm": 3.5486626625061035, + "learning_rate": 7.106866192046254e-06, + "loss": 1.0382, + "step": 4939 + }, + { + "epoch": 0.3808202281837805, + "grad_norm": 3.589175224304199, + "learning_rate": 7.105733925415336e-06, + "loss": 0.9718, + "step": 4940 + }, + { + "epoch": 0.3808973172987974, + "grad_norm": 3.605515956878662, + "learning_rate": 7.104601527502815e-06, + "loss": 0.9097, + "step": 4941 + }, + { + "epoch": 0.38097440641381436, + "grad_norm": 3.868885040283203, + "learning_rate": 7.103468998379288e-06, + "loss": 1.0441, + "step": 4942 + }, + { + "epoch": 0.38105149552883133, + "grad_norm": 3.621760129928589, + "learning_rate": 7.102336338115363e-06, + "loss": 1.0506, + "step": 4943 + }, + { + "epoch": 0.3811285846438483, + "grad_norm": 4.027126789093018, + "learning_rate": 7.101203546781655e-06, + "loss": 1.0522, + "step": 4944 + }, + { + "epoch": 0.38120567375886527, + "grad_norm": 3.5115973949432373, + "learning_rate": 7.1000706244487896e-06, + "loss": 0.9613, + "step": 4945 + }, + { + "epoch": 0.3812827628738822, + "grad_norm": 3.698106050491333, + "learning_rate": 7.098937571187397e-06, + "loss": 0.9886, + "step": 4946 + }, + { + "epoch": 0.38135985198889916, + "grad_norm": 3.994877576828003, + "learning_rate": 7.097804387068117e-06, + "loss": 1.1597, + "step": 4947 + }, + { + "epoch": 0.3814369411039161, + "grad_norm": 3.984495162963867, + "learning_rate": 7.0966710721616e-06, + "loss": 0.9572, + "step": 4948 + }, + { + "epoch": 0.3815140302189331, + "grad_norm": 3.9467110633850098, + "learning_rate": 7.095537626538498e-06, + "loss": 1.0071, + "step": 4949 + }, + { + "epoch": 0.38159111933395007, + "grad_norm": 3.824596643447876, + "learning_rate": 7.09440405026948e-06, + "loss": 1.0244, + "step": 4950 + }, + { + "epoch": 0.381668208448967, + "grad_norm": 3.812662124633789, + "learning_rate": 7.093270343425216e-06, + "loss": 1.0646, + "step": 4951 + }, + { + "epoch": 0.38174529756398395, + "grad_norm": 3.604254961013794, + "learning_rate": 7.092136506076387e-06, + "loss": 1.0749, + "step": 4952 + }, + { + "epoch": 0.3818223866790009, + "grad_norm": 3.493272542953491, + "learning_rate": 7.091002538293683e-06, + "loss": 0.9379, + "step": 4953 + }, + { + "epoch": 0.3818994757940179, + "grad_norm": 3.4740121364593506, + "learning_rate": 7.0898684401478e-06, + "loss": 0.9795, + "step": 4954 + }, + { + "epoch": 0.38197656490903487, + "grad_norm": 4.244378566741943, + "learning_rate": 7.088734211709443e-06, + "loss": 0.9794, + "step": 4955 + }, + { + "epoch": 0.3820536540240518, + "grad_norm": 3.6403074264526367, + "learning_rate": 7.087599853049327e-06, + "loss": 1.0321, + "step": 4956 + }, + { + "epoch": 0.38213074313906875, + "grad_norm": 3.5275707244873047, + "learning_rate": 7.086465364238171e-06, + "loss": 0.971, + "step": 4957 + }, + { + "epoch": 0.3822078322540857, + "grad_norm": 4.26153564453125, + "learning_rate": 7.085330745346706e-06, + "loss": 1.0188, + "step": 4958 + }, + { + "epoch": 0.3822849213691027, + "grad_norm": 3.5671985149383545, + "learning_rate": 7.08419599644567e-06, + "loss": 1.0241, + "step": 4959 + }, + { + "epoch": 0.38236201048411966, + "grad_norm": 3.462334632873535, + "learning_rate": 7.083061117605806e-06, + "loss": 0.933, + "step": 4960 + }, + { + "epoch": 0.3824390995991366, + "grad_norm": 3.856916904449463, + "learning_rate": 7.081926108897872e-06, + "loss": 0.9347, + "step": 4961 + }, + { + "epoch": 0.38251618871415355, + "grad_norm": 3.425292730331421, + "learning_rate": 7.080790970392626e-06, + "loss": 0.9056, + "step": 4962 + }, + { + "epoch": 0.3825932778291705, + "grad_norm": 3.7738280296325684, + "learning_rate": 7.07965570216084e-06, + "loss": 0.904, + "step": 4963 + }, + { + "epoch": 0.3826703669441875, + "grad_norm": 3.893852710723877, + "learning_rate": 7.078520304273293e-06, + "loss": 0.9253, + "step": 4964 + }, + { + "epoch": 0.38274745605920446, + "grad_norm": 3.1743626594543457, + "learning_rate": 7.077384776800767e-06, + "loss": 0.9373, + "step": 4965 + }, + { + "epoch": 0.3828245451742214, + "grad_norm": 3.271000385284424, + "learning_rate": 7.076249119814062e-06, + "loss": 0.9536, + "step": 4966 + }, + { + "epoch": 0.38290163428923835, + "grad_norm": 3.6323864459991455, + "learning_rate": 7.075113333383976e-06, + "loss": 1.0153, + "step": 4967 + }, + { + "epoch": 0.3829787234042553, + "grad_norm": 4.131968975067139, + "learning_rate": 7.073977417581321e-06, + "loss": 1.0542, + "step": 4968 + }, + { + "epoch": 0.3830558125192723, + "grad_norm": 3.279594898223877, + "learning_rate": 7.072841372476918e-06, + "loss": 0.9579, + "step": 4969 + }, + { + "epoch": 0.38313290163428926, + "grad_norm": 3.7253003120422363, + "learning_rate": 7.071705198141588e-06, + "loss": 0.9364, + "step": 4970 + }, + { + "epoch": 0.3832099907493062, + "grad_norm": 3.599295139312744, + "learning_rate": 7.070568894646171e-06, + "loss": 0.9531, + "step": 4971 + }, + { + "epoch": 0.38328707986432314, + "grad_norm": 3.4222543239593506, + "learning_rate": 7.0694324620615054e-06, + "loss": 1.0856, + "step": 4972 + }, + { + "epoch": 0.3833641689793401, + "grad_norm": 3.6698365211486816, + "learning_rate": 7.0682959004584436e-06, + "loss": 0.9242, + "step": 4973 + }, + { + "epoch": 0.3834412580943571, + "grad_norm": 3.4762206077575684, + "learning_rate": 7.067159209907845e-06, + "loss": 0.933, + "step": 4974 + }, + { + "epoch": 0.38351834720937406, + "grad_norm": 3.3484156131744385, + "learning_rate": 7.066022390480576e-06, + "loss": 0.9227, + "step": 4975 + }, + { + "epoch": 0.38359543632439097, + "grad_norm": 4.061807155609131, + "learning_rate": 7.06488544224751e-06, + "loss": 1.0448, + "step": 4976 + }, + { + "epoch": 0.38367252543940794, + "grad_norm": 3.4074602127075195, + "learning_rate": 7.0637483652795325e-06, + "loss": 0.8229, + "step": 4977 + }, + { + "epoch": 0.3837496145544249, + "grad_norm": 3.6092684268951416, + "learning_rate": 7.062611159647532e-06, + "loss": 1.0388, + "step": 4978 + }, + { + "epoch": 0.3838267036694419, + "grad_norm": 3.7201006412506104, + "learning_rate": 7.061473825422408e-06, + "loss": 0.9342, + "step": 4979 + }, + { + "epoch": 0.38390379278445885, + "grad_norm": 3.228909969329834, + "learning_rate": 7.060336362675069e-06, + "loss": 0.9043, + "step": 4980 + }, + { + "epoch": 0.38398088189947577, + "grad_norm": 3.3236916065216064, + "learning_rate": 7.059198771476426e-06, + "loss": 0.9576, + "step": 4981 + }, + { + "epoch": 0.38405797101449274, + "grad_norm": 3.616847038269043, + "learning_rate": 7.0580610518974065e-06, + "loss": 1.0305, + "step": 4982 + }, + { + "epoch": 0.3841350601295097, + "grad_norm": 3.782071590423584, + "learning_rate": 7.05692320400894e-06, + "loss": 0.9386, + "step": 4983 + }, + { + "epoch": 0.3842121492445267, + "grad_norm": 3.3913662433624268, + "learning_rate": 7.055785227881963e-06, + "loss": 0.9801, + "step": 4984 + }, + { + "epoch": 0.38428923835954365, + "grad_norm": 3.7937633991241455, + "learning_rate": 7.054647123587426e-06, + "loss": 1.0372, + "step": 4985 + }, + { + "epoch": 0.38436632747456057, + "grad_norm": 3.505120038986206, + "learning_rate": 7.05350889119628e-06, + "loss": 0.9769, + "step": 4986 + }, + { + "epoch": 0.38444341658957754, + "grad_norm": 3.569885015487671, + "learning_rate": 7.052370530779491e-06, + "loss": 1.138, + "step": 4987 + }, + { + "epoch": 0.3845205057045945, + "grad_norm": 4.187367916107178, + "learning_rate": 7.051232042408028e-06, + "loss": 0.9448, + "step": 4988 + }, + { + "epoch": 0.3845975948196115, + "grad_norm": 3.4536428451538086, + "learning_rate": 7.050093426152872e-06, + "loss": 0.8804, + "step": 4989 + }, + { + "epoch": 0.38467468393462845, + "grad_norm": 3.314223289489746, + "learning_rate": 7.0489546820850086e-06, + "loss": 0.9758, + "step": 4990 + }, + { + "epoch": 0.38475177304964536, + "grad_norm": 3.603670120239258, + "learning_rate": 7.047815810275432e-06, + "loss": 0.9632, + "step": 4991 + }, + { + "epoch": 0.38482886216466233, + "grad_norm": 3.7399637699127197, + "learning_rate": 7.046676810795146e-06, + "loss": 1.1052, + "step": 4992 + }, + { + "epoch": 0.3849059512796793, + "grad_norm": 3.740985870361328, + "learning_rate": 7.045537683715161e-06, + "loss": 1.0086, + "step": 4993 + }, + { + "epoch": 0.3849830403946963, + "grad_norm": 3.6011998653411865, + "learning_rate": 7.044398429106495e-06, + "loss": 1.0373, + "step": 4994 + }, + { + "epoch": 0.38506012950971324, + "grad_norm": 3.508432388305664, + "learning_rate": 7.043259047040175e-06, + "loss": 1.0279, + "step": 4995 + }, + { + "epoch": 0.3851372186247302, + "grad_norm": 3.528916358947754, + "learning_rate": 7.042119537587237e-06, + "loss": 1.0366, + "step": 4996 + }, + { + "epoch": 0.38521430773974713, + "grad_norm": 3.8693268299102783, + "learning_rate": 7.04097990081872e-06, + "loss": 0.9748, + "step": 4997 + }, + { + "epoch": 0.3852913968547641, + "grad_norm": 3.851828098297119, + "learning_rate": 7.039840136805679e-06, + "loss": 1.0828, + "step": 4998 + }, + { + "epoch": 0.38536848596978107, + "grad_norm": 3.7920031547546387, + "learning_rate": 7.038700245619169e-06, + "loss": 1.1299, + "step": 4999 + }, + { + "epoch": 0.38544557508479804, + "grad_norm": 4.139936923980713, + "learning_rate": 7.037560227330258e-06, + "loss": 0.9281, + "step": 5000 + }, + { + "epoch": 0.385522664199815, + "grad_norm": 3.6004490852355957, + "learning_rate": 7.0364200820100194e-06, + "loss": 1.0097, + "step": 5001 + }, + { + "epoch": 0.38559975331483193, + "grad_norm": 3.911167621612549, + "learning_rate": 7.035279809729535e-06, + "loss": 1.0057, + "step": 5002 + }, + { + "epoch": 0.3856768424298489, + "grad_norm": 3.6797120571136475, + "learning_rate": 7.0341394105598944e-06, + "loss": 0.9339, + "step": 5003 + }, + { + "epoch": 0.38575393154486587, + "grad_norm": 3.733839750289917, + "learning_rate": 7.0329988845721996e-06, + "loss": 0.9404, + "step": 5004 + }, + { + "epoch": 0.38583102065988284, + "grad_norm": 3.474809169769287, + "learning_rate": 7.0318582318375514e-06, + "loss": 1.0068, + "step": 5005 + }, + { + "epoch": 0.3859081097748998, + "grad_norm": 4.166741371154785, + "learning_rate": 7.030717452427066e-06, + "loss": 1.0622, + "step": 5006 + }, + { + "epoch": 0.3859851988899167, + "grad_norm": 3.7879574298858643, + "learning_rate": 7.0295765464118645e-06, + "loss": 0.9869, + "step": 5007 + }, + { + "epoch": 0.3860622880049337, + "grad_norm": 3.560980796813965, + "learning_rate": 7.028435513863078e-06, + "loss": 1.022, + "step": 5008 + }, + { + "epoch": 0.38613937711995067, + "grad_norm": 3.5441577434539795, + "learning_rate": 7.027294354851842e-06, + "loss": 1.014, + "step": 5009 + }, + { + "epoch": 0.38621646623496764, + "grad_norm": 3.795830488204956, + "learning_rate": 7.0261530694493015e-06, + "loss": 0.8879, + "step": 5010 + }, + { + "epoch": 0.3862935553499846, + "grad_norm": 3.435690402984619, + "learning_rate": 7.025011657726613e-06, + "loss": 0.8957, + "step": 5011 + }, + { + "epoch": 0.3863706444650015, + "grad_norm": 3.815304756164551, + "learning_rate": 7.0238701197549345e-06, + "loss": 1.0519, + "step": 5012 + }, + { + "epoch": 0.3864477335800185, + "grad_norm": 3.4580211639404297, + "learning_rate": 7.0227284556054355e-06, + "loss": 0.9664, + "step": 5013 + }, + { + "epoch": 0.38652482269503546, + "grad_norm": 3.8483505249023438, + "learning_rate": 7.0215866653492936e-06, + "loss": 0.9913, + "step": 5014 + }, + { + "epoch": 0.38660191181005243, + "grad_norm": 3.6694788932800293, + "learning_rate": 7.020444749057693e-06, + "loss": 0.9433, + "step": 5015 + }, + { + "epoch": 0.3866790009250694, + "grad_norm": 3.6327717304229736, + "learning_rate": 7.019302706801826e-06, + "loss": 0.9477, + "step": 5016 + }, + { + "epoch": 0.3867560900400863, + "grad_norm": 3.4883623123168945, + "learning_rate": 7.018160538652895e-06, + "loss": 0.9981, + "step": 5017 + }, + { + "epoch": 0.3868331791551033, + "grad_norm": 3.332284927368164, + "learning_rate": 7.017018244682106e-06, + "loss": 0.8992, + "step": 5018 + }, + { + "epoch": 0.38691026827012026, + "grad_norm": 3.850799560546875, + "learning_rate": 7.015875824960675e-06, + "loss": 1.105, + "step": 5019 + }, + { + "epoch": 0.38698735738513723, + "grad_norm": 3.9528861045837402, + "learning_rate": 7.014733279559829e-06, + "loss": 0.9505, + "step": 5020 + }, + { + "epoch": 0.3870644465001542, + "grad_norm": 3.9368906021118164, + "learning_rate": 7.013590608550796e-06, + "loss": 0.9322, + "step": 5021 + }, + { + "epoch": 0.3871415356151711, + "grad_norm": 3.622251272201538, + "learning_rate": 7.0124478120048164e-06, + "loss": 0.9249, + "step": 5022 + }, + { + "epoch": 0.3872186247301881, + "grad_norm": 3.4808027744293213, + "learning_rate": 7.0113048899931406e-06, + "loss": 1.045, + "step": 5023 + }, + { + "epoch": 0.38729571384520506, + "grad_norm": 3.806081771850586, + "learning_rate": 7.010161842587019e-06, + "loss": 0.9854, + "step": 5024 + }, + { + "epoch": 0.38737280296022203, + "grad_norm": 3.621647834777832, + "learning_rate": 7.009018669857719e-06, + "loss": 0.9636, + "step": 5025 + }, + { + "epoch": 0.387449892075239, + "grad_norm": 3.846794605255127, + "learning_rate": 7.0078753718765105e-06, + "loss": 0.957, + "step": 5026 + }, + { + "epoch": 0.3875269811902559, + "grad_norm": 3.383054733276367, + "learning_rate": 7.00673194871467e-06, + "loss": 0.8777, + "step": 5027 + }, + { + "epoch": 0.3876040703052729, + "grad_norm": 3.6006200313568115, + "learning_rate": 7.005588400443487e-06, + "loss": 1.086, + "step": 5028 + }, + { + "epoch": 0.38768115942028986, + "grad_norm": 3.8396594524383545, + "learning_rate": 7.004444727134254e-06, + "loss": 1.1418, + "step": 5029 + }, + { + "epoch": 0.3877582485353068, + "grad_norm": 3.7753446102142334, + "learning_rate": 7.003300928858273e-06, + "loss": 0.9981, + "step": 5030 + }, + { + "epoch": 0.3878353376503238, + "grad_norm": 3.5758206844329834, + "learning_rate": 7.002157005686855e-06, + "loss": 0.9798, + "step": 5031 + }, + { + "epoch": 0.3879124267653407, + "grad_norm": 4.047423362731934, + "learning_rate": 7.001012957691317e-06, + "loss": 1.0324, + "step": 5032 + }, + { + "epoch": 0.3879895158803577, + "grad_norm": 3.57684326171875, + "learning_rate": 6.999868784942985e-06, + "loss": 0.9592, + "step": 5033 + }, + { + "epoch": 0.38806660499537465, + "grad_norm": 3.5978035926818848, + "learning_rate": 6.998724487513191e-06, + "loss": 0.94, + "step": 5034 + }, + { + "epoch": 0.3881436941103916, + "grad_norm": 3.3599936962127686, + "learning_rate": 6.9975800654732785e-06, + "loss": 0.9095, + "step": 5035 + }, + { + "epoch": 0.3882207832254086, + "grad_norm": 3.662285566329956, + "learning_rate": 6.996435518894593e-06, + "loss": 0.9509, + "step": 5036 + }, + { + "epoch": 0.3882978723404255, + "grad_norm": 3.5465025901794434, + "learning_rate": 6.9952908478484925e-06, + "loss": 0.9714, + "step": 5037 + }, + { + "epoch": 0.3883749614554425, + "grad_norm": 3.2565081119537354, + "learning_rate": 6.994146052406343e-06, + "loss": 0.9064, + "step": 5038 + }, + { + "epoch": 0.38845205057045945, + "grad_norm": 3.7436461448669434, + "learning_rate": 6.993001132639514e-06, + "loss": 0.9848, + "step": 5039 + }, + { + "epoch": 0.3885291396854764, + "grad_norm": 3.5175907611846924, + "learning_rate": 6.991856088619387e-06, + "loss": 1.0, + "step": 5040 + }, + { + "epoch": 0.3886062288004934, + "grad_norm": 3.567593574523926, + "learning_rate": 6.99071092041735e-06, + "loss": 1.0659, + "step": 5041 + }, + { + "epoch": 0.3886833179155103, + "grad_norm": 3.355001211166382, + "learning_rate": 6.989565628104795e-06, + "loss": 0.8182, + "step": 5042 + }, + { + "epoch": 0.3887604070305273, + "grad_norm": 3.528596878051758, + "learning_rate": 6.988420211753129e-06, + "loss": 0.9568, + "step": 5043 + }, + { + "epoch": 0.38883749614554425, + "grad_norm": 3.691378593444824, + "learning_rate": 6.987274671433761e-06, + "loss": 1.025, + "step": 5044 + }, + { + "epoch": 0.3889145852605612, + "grad_norm": 3.5705032348632812, + "learning_rate": 6.986129007218107e-06, + "loss": 0.9854, + "step": 5045 + }, + { + "epoch": 0.3889916743755782, + "grad_norm": 3.5841269493103027, + "learning_rate": 6.984983219177599e-06, + "loss": 1.1187, + "step": 5046 + }, + { + "epoch": 0.3890687634905951, + "grad_norm": 3.42091965675354, + "learning_rate": 6.983837307383666e-06, + "loss": 0.9753, + "step": 5047 + }, + { + "epoch": 0.3891458526056121, + "grad_norm": 3.6692841053009033, + "learning_rate": 6.982691271907752e-06, + "loss": 0.8831, + "step": 5048 + }, + { + "epoch": 0.38922294172062905, + "grad_norm": 3.5156326293945312, + "learning_rate": 6.981545112821306e-06, + "loss": 1.0276, + "step": 5049 + }, + { + "epoch": 0.389300030835646, + "grad_norm": 3.318147897720337, + "learning_rate": 6.980398830195785e-06, + "loss": 0.9131, + "step": 5050 + }, + { + "epoch": 0.389377119950663, + "grad_norm": 4.0214152336120605, + "learning_rate": 6.979252424102654e-06, + "loss": 1.0253, + "step": 5051 + }, + { + "epoch": 0.3894542090656799, + "grad_norm": 3.803903818130493, + "learning_rate": 6.978105894613385e-06, + "loss": 1.0602, + "step": 5052 + }, + { + "epoch": 0.38953129818069687, + "grad_norm": 3.736529588699341, + "learning_rate": 6.976959241799456e-06, + "loss": 0.9655, + "step": 5053 + }, + { + "epoch": 0.38960838729571384, + "grad_norm": 3.8049023151397705, + "learning_rate": 6.97581246573236e-06, + "loss": 0.9087, + "step": 5054 + }, + { + "epoch": 0.3896854764107308, + "grad_norm": 4.01686954498291, + "learning_rate": 6.974665566483588e-06, + "loss": 1.1021, + "step": 5055 + }, + { + "epoch": 0.3897625655257478, + "grad_norm": 3.589202642440796, + "learning_rate": 6.9735185441246466e-06, + "loss": 0.9415, + "step": 5056 + }, + { + "epoch": 0.3898396546407647, + "grad_norm": 3.462862014770508, + "learning_rate": 6.972371398727045e-06, + "loss": 0.9772, + "step": 5057 + }, + { + "epoch": 0.38991674375578167, + "grad_norm": 3.413395404815674, + "learning_rate": 6.971224130362301e-06, + "loss": 0.9391, + "step": 5058 + }, + { + "epoch": 0.38999383287079864, + "grad_norm": 3.5816166400909424, + "learning_rate": 6.970076739101942e-06, + "loss": 0.9687, + "step": 5059 + }, + { + "epoch": 0.3900709219858156, + "grad_norm": 3.3513641357421875, + "learning_rate": 6.968929225017501e-06, + "loss": 1.0188, + "step": 5060 + }, + { + "epoch": 0.3901480111008326, + "grad_norm": 3.450777292251587, + "learning_rate": 6.9677815881805215e-06, + "loss": 0.9195, + "step": 5061 + }, + { + "epoch": 0.3902251002158495, + "grad_norm": 3.946836471557617, + "learning_rate": 6.96663382866255e-06, + "loss": 0.9913, + "step": 5062 + }, + { + "epoch": 0.39030218933086647, + "grad_norm": 3.5374841690063477, + "learning_rate": 6.965485946535145e-06, + "loss": 0.945, + "step": 5063 + }, + { + "epoch": 0.39037927844588344, + "grad_norm": 3.718808174133301, + "learning_rate": 6.964337941869872e-06, + "loss": 1.0653, + "step": 5064 + }, + { + "epoch": 0.3904563675609004, + "grad_norm": 3.8617405891418457, + "learning_rate": 6.963189814738301e-06, + "loss": 1.0212, + "step": 5065 + }, + { + "epoch": 0.3905334566759174, + "grad_norm": 3.3130338191986084, + "learning_rate": 6.962041565212012e-06, + "loss": 0.9596, + "step": 5066 + }, + { + "epoch": 0.3906105457909343, + "grad_norm": 3.5554182529449463, + "learning_rate": 6.960893193362594e-06, + "loss": 1.0024, + "step": 5067 + }, + { + "epoch": 0.39068763490595126, + "grad_norm": 4.0424604415893555, + "learning_rate": 6.959744699261641e-06, + "loss": 0.9991, + "step": 5068 + }, + { + "epoch": 0.39076472402096823, + "grad_norm": 3.6211678981781006, + "learning_rate": 6.9585960829807555e-06, + "loss": 1.0449, + "step": 5069 + }, + { + "epoch": 0.3908418131359852, + "grad_norm": 3.4850916862487793, + "learning_rate": 6.9574473445915495e-06, + "loss": 0.9836, + "step": 5070 + }, + { + "epoch": 0.3909189022510022, + "grad_norm": 4.005676746368408, + "learning_rate": 6.956298484165638e-06, + "loss": 0.8692, + "step": 5071 + }, + { + "epoch": 0.3909959913660191, + "grad_norm": 3.5592994689941406, + "learning_rate": 6.955149501774648e-06, + "loss": 1.1276, + "step": 5072 + }, + { + "epoch": 0.39107308048103606, + "grad_norm": 3.935800313949585, + "learning_rate": 6.954000397490213e-06, + "loss": 0.992, + "step": 5073 + }, + { + "epoch": 0.39115016959605303, + "grad_norm": 3.423206090927124, + "learning_rate": 6.952851171383972e-06, + "loss": 0.923, + "step": 5074 + }, + { + "epoch": 0.39122725871107, + "grad_norm": 3.449836254119873, + "learning_rate": 6.951701823527575e-06, + "loss": 0.9982, + "step": 5075 + }, + { + "epoch": 0.391304347826087, + "grad_norm": 3.2396328449249268, + "learning_rate": 6.950552353992678e-06, + "loss": 0.8038, + "step": 5076 + }, + { + "epoch": 0.3913814369411039, + "grad_norm": 3.798563003540039, + "learning_rate": 6.949402762850943e-06, + "loss": 1.0334, + "step": 5077 + }, + { + "epoch": 0.39145852605612086, + "grad_norm": 3.7131567001342773, + "learning_rate": 6.948253050174043e-06, + "loss": 1.0026, + "step": 5078 + }, + { + "epoch": 0.39153561517113783, + "grad_norm": 4.187877655029297, + "learning_rate": 6.947103216033655e-06, + "loss": 1.1898, + "step": 5079 + }, + { + "epoch": 0.3916127042861548, + "grad_norm": 3.439176082611084, + "learning_rate": 6.945953260501466e-06, + "loss": 0.9082, + "step": 5080 + }, + { + "epoch": 0.39168979340117177, + "grad_norm": 3.9063847064971924, + "learning_rate": 6.9448031836491705e-06, + "loss": 1.0431, + "step": 5081 + }, + { + "epoch": 0.39176688251618874, + "grad_norm": 3.500028610229492, + "learning_rate": 6.943652985548468e-06, + "loss": 0.9663, + "step": 5082 + }, + { + "epoch": 0.39184397163120566, + "grad_norm": 3.206749439239502, + "learning_rate": 6.942502666271069e-06, + "loss": 0.8262, + "step": 5083 + }, + { + "epoch": 0.3919210607462226, + "grad_norm": 3.7229039669036865, + "learning_rate": 6.9413522258886874e-06, + "loss": 0.9906, + "step": 5084 + }, + { + "epoch": 0.3919981498612396, + "grad_norm": 3.451598882675171, + "learning_rate": 6.940201664473051e-06, + "loss": 0.9667, + "step": 5085 + }, + { + "epoch": 0.39207523897625657, + "grad_norm": 4.100324630737305, + "learning_rate": 6.939050982095889e-06, + "loss": 0.9991, + "step": 5086 + }, + { + "epoch": 0.39215232809127354, + "grad_norm": 3.7283709049224854, + "learning_rate": 6.93790017882894e-06, + "loss": 0.9816, + "step": 5087 + }, + { + "epoch": 0.39222941720629045, + "grad_norm": 3.6680102348327637, + "learning_rate": 6.936749254743951e-06, + "loss": 0.9915, + "step": 5088 + }, + { + "epoch": 0.3923065063213074, + "grad_norm": 3.723318576812744, + "learning_rate": 6.935598209912679e-06, + "loss": 0.9894, + "step": 5089 + }, + { + "epoch": 0.3923835954363244, + "grad_norm": 3.6859655380249023, + "learning_rate": 6.934447044406882e-06, + "loss": 1.0608, + "step": 5090 + }, + { + "epoch": 0.39246068455134137, + "grad_norm": 4.22714900970459, + "learning_rate": 6.9332957582983295e-06, + "loss": 0.977, + "step": 5091 + }, + { + "epoch": 0.39253777366635834, + "grad_norm": 3.4322969913482666, + "learning_rate": 6.932144351658801e-06, + "loss": 1.0241, + "step": 5092 + }, + { + "epoch": 0.39261486278137525, + "grad_norm": 3.7159926891326904, + "learning_rate": 6.930992824560078e-06, + "loss": 0.901, + "step": 5093 + }, + { + "epoch": 0.3926919518963922, + "grad_norm": 3.612903356552124, + "learning_rate": 6.9298411770739535e-06, + "loss": 0.9303, + "step": 5094 + }, + { + "epoch": 0.3927690410114092, + "grad_norm": 3.71382999420166, + "learning_rate": 6.9286894092722265e-06, + "loss": 0.9735, + "step": 5095 + }, + { + "epoch": 0.39284613012642616, + "grad_norm": 3.824847459793091, + "learning_rate": 6.9275375212267035e-06, + "loss": 1.0065, + "step": 5096 + }, + { + "epoch": 0.39292321924144313, + "grad_norm": 3.6681530475616455, + "learning_rate": 6.926385513009199e-06, + "loss": 1.0493, + "step": 5097 + }, + { + "epoch": 0.39300030835646005, + "grad_norm": 3.7533106803894043, + "learning_rate": 6.925233384691534e-06, + "loss": 1.077, + "step": 5098 + }, + { + "epoch": 0.393077397471477, + "grad_norm": 3.778269052505493, + "learning_rate": 6.924081136345541e-06, + "loss": 0.9683, + "step": 5099 + }, + { + "epoch": 0.393154486586494, + "grad_norm": 4.528807163238525, + "learning_rate": 6.9229287680430526e-06, + "loss": 1.0982, + "step": 5100 + }, + { + "epoch": 0.39323157570151096, + "grad_norm": 3.8181581497192383, + "learning_rate": 6.921776279855914e-06, + "loss": 0.8895, + "step": 5101 + }, + { + "epoch": 0.39330866481652793, + "grad_norm": 4.004360198974609, + "learning_rate": 6.92062367185598e-06, + "loss": 0.9677, + "step": 5102 + }, + { + "epoch": 0.39338575393154485, + "grad_norm": 3.5526719093322754, + "learning_rate": 6.919470944115104e-06, + "loss": 0.9523, + "step": 5103 + }, + { + "epoch": 0.3934628430465618, + "grad_norm": 3.5170533657073975, + "learning_rate": 6.918318096705157e-06, + "loss": 0.9552, + "step": 5104 + }, + { + "epoch": 0.3935399321615788, + "grad_norm": 3.4880666732788086, + "learning_rate": 6.917165129698013e-06, + "loss": 0.9412, + "step": 5105 + }, + { + "epoch": 0.39361702127659576, + "grad_norm": 3.667492628097534, + "learning_rate": 6.916012043165552e-06, + "loss": 0.889, + "step": 5106 + }, + { + "epoch": 0.39369411039161273, + "grad_norm": 3.4989163875579834, + "learning_rate": 6.9148588371796635e-06, + "loss": 1.009, + "step": 5107 + }, + { + "epoch": 0.39377119950662964, + "grad_norm": 3.465567111968994, + "learning_rate": 6.913705511812243e-06, + "loss": 0.9644, + "step": 5108 + }, + { + "epoch": 0.3938482886216466, + "grad_norm": 3.9162404537200928, + "learning_rate": 6.912552067135195e-06, + "loss": 0.9753, + "step": 5109 + }, + { + "epoch": 0.3939253777366636, + "grad_norm": 3.494250774383545, + "learning_rate": 6.9113985032204325e-06, + "loss": 1.0625, + "step": 5110 + }, + { + "epoch": 0.39400246685168056, + "grad_norm": 3.6587326526641846, + "learning_rate": 6.910244820139871e-06, + "loss": 0.9012, + "step": 5111 + }, + { + "epoch": 0.3940795559666975, + "grad_norm": 4.449551105499268, + "learning_rate": 6.909091017965439e-06, + "loss": 1.0422, + "step": 5112 + }, + { + "epoch": 0.39415664508171444, + "grad_norm": 3.5880303382873535, + "learning_rate": 6.907937096769071e-06, + "loss": 1.0439, + "step": 5113 + }, + { + "epoch": 0.3942337341967314, + "grad_norm": 4.151397228240967, + "learning_rate": 6.906783056622703e-06, + "loss": 1.1482, + "step": 5114 + }, + { + "epoch": 0.3943108233117484, + "grad_norm": 3.602203607559204, + "learning_rate": 6.9056288975982896e-06, + "loss": 0.9506, + "step": 5115 + }, + { + "epoch": 0.39438791242676535, + "grad_norm": 3.9229793548583984, + "learning_rate": 6.904474619767784e-06, + "loss": 1.0528, + "step": 5116 + }, + { + "epoch": 0.3944650015417823, + "grad_norm": 3.542268991470337, + "learning_rate": 6.903320223203148e-06, + "loss": 1.0154, + "step": 5117 + }, + { + "epoch": 0.39454209065679924, + "grad_norm": 3.427914619445801, + "learning_rate": 6.9021657079763545e-06, + "loss": 0.9821, + "step": 5118 + }, + { + "epoch": 0.3946191797718162, + "grad_norm": 3.257849931716919, + "learning_rate": 6.901011074159381e-06, + "loss": 1.0341, + "step": 5119 + }, + { + "epoch": 0.3946962688868332, + "grad_norm": 3.4966633319854736, + "learning_rate": 6.899856321824212e-06, + "loss": 0.9723, + "step": 5120 + }, + { + "epoch": 0.39477335800185015, + "grad_norm": 3.4750454425811768, + "learning_rate": 6.89870145104284e-06, + "loss": 0.9318, + "step": 5121 + }, + { + "epoch": 0.3948504471168671, + "grad_norm": 4.037153244018555, + "learning_rate": 6.897546461887268e-06, + "loss": 0.9593, + "step": 5122 + }, + { + "epoch": 0.39492753623188404, + "grad_norm": 3.800772190093994, + "learning_rate": 6.896391354429501e-06, + "loss": 1.0182, + "step": 5123 + }, + { + "epoch": 0.395004625346901, + "grad_norm": 3.4899842739105225, + "learning_rate": 6.895236128741554e-06, + "loss": 1.0558, + "step": 5124 + }, + { + "epoch": 0.395081714461918, + "grad_norm": 3.538621187210083, + "learning_rate": 6.8940807848954506e-06, + "loss": 0.9602, + "step": 5125 + }, + { + "epoch": 0.39515880357693495, + "grad_norm": 4.251044273376465, + "learning_rate": 6.892925322963221e-06, + "loss": 1.1921, + "step": 5126 + }, + { + "epoch": 0.3952358926919519, + "grad_norm": 3.6266791820526123, + "learning_rate": 6.8917697430169e-06, + "loss": 0.9136, + "step": 5127 + }, + { + "epoch": 0.39531298180696883, + "grad_norm": 3.5252368450164795, + "learning_rate": 6.890614045128533e-06, + "loss": 0.911, + "step": 5128 + }, + { + "epoch": 0.3953900709219858, + "grad_norm": 4.031729698181152, + "learning_rate": 6.889458229370173e-06, + "loss": 1.0345, + "step": 5129 + }, + { + "epoch": 0.3954671600370028, + "grad_norm": 3.6114397048950195, + "learning_rate": 6.888302295813878e-06, + "loss": 1.0215, + "step": 5130 + }, + { + "epoch": 0.39554424915201974, + "grad_norm": 3.7317934036254883, + "learning_rate": 6.887146244531715e-06, + "loss": 1.0653, + "step": 5131 + }, + { + "epoch": 0.3956213382670367, + "grad_norm": 3.487471580505371, + "learning_rate": 6.885990075595757e-06, + "loss": 1.0672, + "step": 5132 + }, + { + "epoch": 0.39569842738205363, + "grad_norm": 3.7505714893341064, + "learning_rate": 6.884833789078084e-06, + "loss": 1.0301, + "step": 5133 + }, + { + "epoch": 0.3957755164970706, + "grad_norm": 3.2812490463256836, + "learning_rate": 6.883677385050788e-06, + "loss": 0.9663, + "step": 5134 + }, + { + "epoch": 0.39585260561208757, + "grad_norm": 4.058938026428223, + "learning_rate": 6.882520863585962e-06, + "loss": 1.0798, + "step": 5135 + }, + { + "epoch": 0.39592969472710454, + "grad_norm": 3.5618515014648438, + "learning_rate": 6.88136422475571e-06, + "loss": 0.9328, + "step": 5136 + }, + { + "epoch": 0.3960067838421215, + "grad_norm": 3.4688501358032227, + "learning_rate": 6.880207468632143e-06, + "loss": 0.9176, + "step": 5137 + }, + { + "epoch": 0.3960838729571384, + "grad_norm": 3.658219575881958, + "learning_rate": 6.8790505952873775e-06, + "loss": 0.9566, + "step": 5138 + }, + { + "epoch": 0.3961609620721554, + "grad_norm": 3.3856353759765625, + "learning_rate": 6.877893604793539e-06, + "loss": 1.046, + "step": 5139 + }, + { + "epoch": 0.39623805118717237, + "grad_norm": 3.460387706756592, + "learning_rate": 6.876736497222762e-06, + "loss": 0.9436, + "step": 5140 + }, + { + "epoch": 0.39631514030218934, + "grad_norm": 3.813292980194092, + "learning_rate": 6.875579272647182e-06, + "loss": 1.0252, + "step": 5141 + }, + { + "epoch": 0.3963922294172063, + "grad_norm": 3.8571043014526367, + "learning_rate": 6.87442193113895e-06, + "loss": 1.0681, + "step": 5142 + }, + { + "epoch": 0.3964693185322232, + "grad_norm": 3.726741075515747, + "learning_rate": 6.873264472770217e-06, + "loss": 0.9728, + "step": 5143 + }, + { + "epoch": 0.3965464076472402, + "grad_norm": 3.4998526573181152, + "learning_rate": 6.8721068976131466e-06, + "loss": 0.966, + "step": 5144 + }, + { + "epoch": 0.39662349676225717, + "grad_norm": 3.4608278274536133, + "learning_rate": 6.870949205739907e-06, + "loss": 1.0177, + "step": 5145 + }, + { + "epoch": 0.39670058587727414, + "grad_norm": 3.6188559532165527, + "learning_rate": 6.869791397222674e-06, + "loss": 1.0666, + "step": 5146 + }, + { + "epoch": 0.3967776749922911, + "grad_norm": 3.4421889781951904, + "learning_rate": 6.868633472133632e-06, + "loss": 0.9928, + "step": 5147 + }, + { + "epoch": 0.396854764107308, + "grad_norm": 3.740668773651123, + "learning_rate": 6.867475430544971e-06, + "loss": 1.0269, + "step": 5148 + }, + { + "epoch": 0.396931853222325, + "grad_norm": 3.9305145740509033, + "learning_rate": 6.866317272528889e-06, + "loss": 0.9517, + "step": 5149 + }, + { + "epoch": 0.39700894233734196, + "grad_norm": 3.588923931121826, + "learning_rate": 6.865158998157591e-06, + "loss": 0.9619, + "step": 5150 + }, + { + "epoch": 0.39708603145235893, + "grad_norm": 3.9232397079467773, + "learning_rate": 6.8640006075032875e-06, + "loss": 1.1459, + "step": 5151 + }, + { + "epoch": 0.3971631205673759, + "grad_norm": 3.4399147033691406, + "learning_rate": 6.862842100638201e-06, + "loss": 0.8585, + "step": 5152 + }, + { + "epoch": 0.3972402096823928, + "grad_norm": 3.5570197105407715, + "learning_rate": 6.861683477634559e-06, + "loss": 1.0854, + "step": 5153 + }, + { + "epoch": 0.3973172987974098, + "grad_norm": 3.628215789794922, + "learning_rate": 6.860524738564591e-06, + "loss": 0.9911, + "step": 5154 + }, + { + "epoch": 0.39739438791242676, + "grad_norm": 4.154507160186768, + "learning_rate": 6.859365883500545e-06, + "loss": 1.0277, + "step": 5155 + }, + { + "epoch": 0.39747147702744373, + "grad_norm": 3.7878050804138184, + "learning_rate": 6.858206912514664e-06, + "loss": 1.075, + "step": 5156 + }, + { + "epoch": 0.3975485661424607, + "grad_norm": 3.571855068206787, + "learning_rate": 6.857047825679206e-06, + "loss": 0.9955, + "step": 5157 + }, + { + "epoch": 0.3976256552574776, + "grad_norm": 3.5463037490844727, + "learning_rate": 6.855888623066434e-06, + "loss": 0.969, + "step": 5158 + }, + { + "epoch": 0.3977027443724946, + "grad_norm": 3.6123862266540527, + "learning_rate": 6.854729304748619e-06, + "loss": 0.8627, + "step": 5159 + }, + { + "epoch": 0.39777983348751156, + "grad_norm": 4.434622764587402, + "learning_rate": 6.8535698707980356e-06, + "loss": 0.9712, + "step": 5160 + }, + { + "epoch": 0.39785692260252853, + "grad_norm": 3.731267213821411, + "learning_rate": 6.852410321286974e-06, + "loss": 0.9873, + "step": 5161 + }, + { + "epoch": 0.3979340117175455, + "grad_norm": 3.5864498615264893, + "learning_rate": 6.85125065628772e-06, + "loss": 0.9812, + "step": 5162 + }, + { + "epoch": 0.3980111008325624, + "grad_norm": 3.548417329788208, + "learning_rate": 6.850090875872577e-06, + "loss": 0.9825, + "step": 5163 + }, + { + "epoch": 0.3980881899475794, + "grad_norm": 3.9418680667877197, + "learning_rate": 6.848930980113848e-06, + "loss": 1.1676, + "step": 5164 + }, + { + "epoch": 0.39816527906259636, + "grad_norm": 3.502095937728882, + "learning_rate": 6.8477709690838486e-06, + "loss": 0.9376, + "step": 5165 + }, + { + "epoch": 0.3982423681776133, + "grad_norm": 3.5473270416259766, + "learning_rate": 6.846610842854902e-06, + "loss": 0.9449, + "step": 5166 + }, + { + "epoch": 0.3983194572926303, + "grad_norm": 3.6665074825286865, + "learning_rate": 6.845450601499331e-06, + "loss": 1.0283, + "step": 5167 + }, + { + "epoch": 0.39839654640764727, + "grad_norm": 3.162196636199951, + "learning_rate": 6.844290245089473e-06, + "loss": 0.8484, + "step": 5168 + }, + { + "epoch": 0.3984736355226642, + "grad_norm": 3.5580244064331055, + "learning_rate": 6.84312977369767e-06, + "loss": 0.958, + "step": 5169 + }, + { + "epoch": 0.39855072463768115, + "grad_norm": 3.2327682971954346, + "learning_rate": 6.841969187396271e-06, + "loss": 0.8806, + "step": 5170 + }, + { + "epoch": 0.3986278137526981, + "grad_norm": 3.59846568107605, + "learning_rate": 6.840808486257634e-06, + "loss": 0.8916, + "step": 5171 + }, + { + "epoch": 0.3987049028677151, + "grad_norm": 3.460810899734497, + "learning_rate": 6.83964767035412e-06, + "loss": 1.0101, + "step": 5172 + }, + { + "epoch": 0.39878199198273206, + "grad_norm": 3.336289882659912, + "learning_rate": 6.838486739758102e-06, + "loss": 0.9098, + "step": 5173 + }, + { + "epoch": 0.398859081097749, + "grad_norm": 3.8699560165405273, + "learning_rate": 6.83732569454196e-06, + "loss": 1.0069, + "step": 5174 + }, + { + "epoch": 0.39893617021276595, + "grad_norm": 3.5346524715423584, + "learning_rate": 6.836164534778074e-06, + "loss": 0.9581, + "step": 5175 + }, + { + "epoch": 0.3990132593277829, + "grad_norm": 3.468693733215332, + "learning_rate": 6.835003260538839e-06, + "loss": 0.9279, + "step": 5176 + }, + { + "epoch": 0.3990903484427999, + "grad_norm": 3.598850727081299, + "learning_rate": 6.833841871896656e-06, + "loss": 1.0076, + "step": 5177 + }, + { + "epoch": 0.39916743755781686, + "grad_norm": 3.525563955307007, + "learning_rate": 6.83268036892393e-06, + "loss": 0.9998, + "step": 5178 + }, + { + "epoch": 0.3992445266728338, + "grad_norm": 3.8953518867492676, + "learning_rate": 6.831518751693073e-06, + "loss": 0.9768, + "step": 5179 + }, + { + "epoch": 0.39932161578785075, + "grad_norm": 3.505366802215576, + "learning_rate": 6.830357020276509e-06, + "loss": 1.007, + "step": 5180 + }, + { + "epoch": 0.3993987049028677, + "grad_norm": 3.4880192279815674, + "learning_rate": 6.829195174746663e-06, + "loss": 1.0653, + "step": 5181 + }, + { + "epoch": 0.3994757940178847, + "grad_norm": 3.4839048385620117, + "learning_rate": 6.828033215175974e-06, + "loss": 0.9051, + "step": 5182 + }, + { + "epoch": 0.39955288313290166, + "grad_norm": 3.4201345443725586, + "learning_rate": 6.826871141636879e-06, + "loss": 0.9144, + "step": 5183 + }, + { + "epoch": 0.3996299722479186, + "grad_norm": 3.57698917388916, + "learning_rate": 6.825708954201832e-06, + "loss": 0.9932, + "step": 5184 + }, + { + "epoch": 0.39970706136293555, + "grad_norm": 3.7728288173675537, + "learning_rate": 6.824546652943287e-06, + "loss": 1.0721, + "step": 5185 + }, + { + "epoch": 0.3997841504779525, + "grad_norm": 3.7117831707000732, + "learning_rate": 6.823384237933706e-06, + "loss": 0.9387, + "step": 5186 + }, + { + "epoch": 0.3998612395929695, + "grad_norm": 3.299254894256592, + "learning_rate": 6.822221709245562e-06, + "loss": 0.9593, + "step": 5187 + }, + { + "epoch": 0.39993832870798646, + "grad_norm": 3.3196065425872803, + "learning_rate": 6.8210590669513325e-06, + "loss": 0.9527, + "step": 5188 + }, + { + "epoch": 0.40001541782300337, + "grad_norm": 3.578375816345215, + "learning_rate": 6.8198963111235e-06, + "loss": 0.9586, + "step": 5189 + }, + { + "epoch": 0.40009250693802034, + "grad_norm": 3.807652235031128, + "learning_rate": 6.8187334418345605e-06, + "loss": 0.8759, + "step": 5190 + }, + { + "epoch": 0.4001695960530373, + "grad_norm": 3.5907301902770996, + "learning_rate": 6.8175704591570105e-06, + "loss": 1.0146, + "step": 5191 + }, + { + "epoch": 0.4002466851680543, + "grad_norm": 3.224435806274414, + "learning_rate": 6.816407363163354e-06, + "loss": 0.9449, + "step": 5192 + }, + { + "epoch": 0.40032377428307125, + "grad_norm": 3.421980619430542, + "learning_rate": 6.815244153926106e-06, + "loss": 0.893, + "step": 5193 + }, + { + "epoch": 0.40040086339808817, + "grad_norm": 3.3092360496520996, + "learning_rate": 6.814080831517787e-06, + "loss": 0.8947, + "step": 5194 + }, + { + "epoch": 0.40047795251310514, + "grad_norm": 3.3089075088500977, + "learning_rate": 6.812917396010924e-06, + "loss": 0.8276, + "step": 5195 + }, + { + "epoch": 0.4005550416281221, + "grad_norm": 3.9031176567077637, + "learning_rate": 6.811753847478051e-06, + "loss": 1.0147, + "step": 5196 + }, + { + "epoch": 0.4006321307431391, + "grad_norm": 3.475313901901245, + "learning_rate": 6.810590185991707e-06, + "loss": 0.9532, + "step": 5197 + }, + { + "epoch": 0.40070921985815605, + "grad_norm": 3.7122201919555664, + "learning_rate": 6.8094264116244434e-06, + "loss": 1.0051, + "step": 5198 + }, + { + "epoch": 0.40078630897317297, + "grad_norm": 3.6033477783203125, + "learning_rate": 6.8082625244488145e-06, + "loss": 1.0563, + "step": 5199 + }, + { + "epoch": 0.40086339808818994, + "grad_norm": 4.018650054931641, + "learning_rate": 6.807098524537381e-06, + "loss": 0.922, + "step": 5200 + }, + { + "epoch": 0.4009404872032069, + "grad_norm": 3.791778087615967, + "learning_rate": 6.8059344119627155e-06, + "loss": 1.0258, + "step": 5201 + }, + { + "epoch": 0.4010175763182239, + "grad_norm": 3.5043606758117676, + "learning_rate": 6.804770186797391e-06, + "loss": 1.0035, + "step": 5202 + }, + { + "epoch": 0.40109466543324085, + "grad_norm": 4.14236307144165, + "learning_rate": 6.8036058491139944e-06, + "loss": 0.9195, + "step": 5203 + }, + { + "epoch": 0.40117175454825776, + "grad_norm": 3.829864263534546, + "learning_rate": 6.802441398985114e-06, + "loss": 0.9637, + "step": 5204 + }, + { + "epoch": 0.40124884366327473, + "grad_norm": 3.572660446166992, + "learning_rate": 6.801276836483346e-06, + "loss": 0.8871, + "step": 5205 + }, + { + "epoch": 0.4013259327782917, + "grad_norm": 3.405879259109497, + "learning_rate": 6.800112161681297e-06, + "loss": 0.8893, + "step": 5206 + }, + { + "epoch": 0.4014030218933087, + "grad_norm": 3.654069662094116, + "learning_rate": 6.798947374651578e-06, + "loss": 1.0638, + "step": 5207 + }, + { + "epoch": 0.40148011100832565, + "grad_norm": 3.986391067504883, + "learning_rate": 6.797782475466806e-06, + "loss": 1.0039, + "step": 5208 + }, + { + "epoch": 0.40155720012334256, + "grad_norm": 3.699451446533203, + "learning_rate": 6.7966174641996085e-06, + "loss": 0.9939, + "step": 5209 + }, + { + "epoch": 0.40163428923835953, + "grad_norm": 4.058764934539795, + "learning_rate": 6.795452340922617e-06, + "loss": 0.9998, + "step": 5210 + }, + { + "epoch": 0.4017113783533765, + "grad_norm": 4.153298854827881, + "learning_rate": 6.7942871057084715e-06, + "loss": 1.0098, + "step": 5211 + }, + { + "epoch": 0.4017884674683935, + "grad_norm": 3.8170831203460693, + "learning_rate": 6.793121758629817e-06, + "loss": 1.0502, + "step": 5212 + }, + { + "epoch": 0.40186555658341044, + "grad_norm": 4.2434844970703125, + "learning_rate": 6.791956299759307e-06, + "loss": 0.9763, + "step": 5213 + }, + { + "epoch": 0.40194264569842736, + "grad_norm": 4.152831077575684, + "learning_rate": 6.790790729169604e-06, + "loss": 1.0741, + "step": 5214 + }, + { + "epoch": 0.40201973481344433, + "grad_norm": 3.7323899269104004, + "learning_rate": 6.7896250469333725e-06, + "loss": 0.9627, + "step": 5215 + }, + { + "epoch": 0.4020968239284613, + "grad_norm": 4.520125389099121, + "learning_rate": 6.78845925312329e-06, + "loss": 1.0859, + "step": 5216 + }, + { + "epoch": 0.40217391304347827, + "grad_norm": 5.602902889251709, + "learning_rate": 6.787293347812034e-06, + "loss": 1.0761, + "step": 5217 + }, + { + "epoch": 0.40225100215849524, + "grad_norm": 3.4149789810180664, + "learning_rate": 6.7861273310722945e-06, + "loss": 0.9928, + "step": 5218 + }, + { + "epoch": 0.40232809127351216, + "grad_norm": 3.432222366333008, + "learning_rate": 6.784961202976768e-06, + "loss": 0.9324, + "step": 5219 + }, + { + "epoch": 0.4024051803885291, + "grad_norm": 3.2903270721435547, + "learning_rate": 6.7837949635981524e-06, + "loss": 0.9092, + "step": 5220 + }, + { + "epoch": 0.4024822695035461, + "grad_norm": 3.851072072982788, + "learning_rate": 6.782628613009161e-06, + "loss": 0.9979, + "step": 5221 + }, + { + "epoch": 0.40255935861856307, + "grad_norm": 3.850883960723877, + "learning_rate": 6.781462151282508e-06, + "loss": 0.9482, + "step": 5222 + }, + { + "epoch": 0.40263644773358004, + "grad_norm": 3.9080920219421387, + "learning_rate": 6.7802955784909165e-06, + "loss": 0.9543, + "step": 5223 + }, + { + "epoch": 0.40271353684859695, + "grad_norm": 3.799767255783081, + "learning_rate": 6.779128894707116e-06, + "loss": 0.9217, + "step": 5224 + }, + { + "epoch": 0.4027906259636139, + "grad_norm": 3.8918371200561523, + "learning_rate": 6.777962100003843e-06, + "loss": 1.1152, + "step": 5225 + }, + { + "epoch": 0.4028677150786309, + "grad_norm": 3.4487974643707275, + "learning_rate": 6.776795194453841e-06, + "loss": 1.0196, + "step": 5226 + }, + { + "epoch": 0.40294480419364787, + "grad_norm": 3.964271068572998, + "learning_rate": 6.7756281781298615e-06, + "loss": 1.0352, + "step": 5227 + }, + { + "epoch": 0.40302189330866484, + "grad_norm": 3.359325647354126, + "learning_rate": 6.7744610511046615e-06, + "loss": 0.933, + "step": 5228 + }, + { + "epoch": 0.40309898242368175, + "grad_norm": 3.6119518280029297, + "learning_rate": 6.773293813451004e-06, + "loss": 0.9631, + "step": 5229 + }, + { + "epoch": 0.4031760715386987, + "grad_norm": 3.3389744758605957, + "learning_rate": 6.772126465241663e-06, + "loss": 0.9816, + "step": 5230 + }, + { + "epoch": 0.4032531606537157, + "grad_norm": 3.3862054347991943, + "learning_rate": 6.7709590065494125e-06, + "loss": 0.9248, + "step": 5231 + }, + { + "epoch": 0.40333024976873266, + "grad_norm": 3.3513505458831787, + "learning_rate": 6.769791437447042e-06, + "loss": 1.0087, + "step": 5232 + }, + { + "epoch": 0.40340733888374963, + "grad_norm": 3.7678074836730957, + "learning_rate": 6.76862375800734e-06, + "loss": 0.8853, + "step": 5233 + }, + { + "epoch": 0.40348442799876655, + "grad_norm": 3.6383919715881348, + "learning_rate": 6.767455968303107e-06, + "loss": 0.8316, + "step": 5234 + }, + { + "epoch": 0.4035615171137835, + "grad_norm": 3.6129610538482666, + "learning_rate": 6.7662880684071495e-06, + "loss": 1.0224, + "step": 5235 + }, + { + "epoch": 0.4036386062288005, + "grad_norm": 3.6119580268859863, + "learning_rate": 6.765120058392278e-06, + "loss": 1.0199, + "step": 5236 + }, + { + "epoch": 0.40371569534381746, + "grad_norm": 3.5133492946624756, + "learning_rate": 6.763951938331313e-06, + "loss": 0.8906, + "step": 5237 + }, + { + "epoch": 0.40379278445883443, + "grad_norm": 3.35998272895813, + "learning_rate": 6.76278370829708e-06, + "loss": 0.9159, + "step": 5238 + }, + { + "epoch": 0.40386987357385135, + "grad_norm": 3.517707586288452, + "learning_rate": 6.761615368362412e-06, + "loss": 1.0151, + "step": 5239 + }, + { + "epoch": 0.4039469626888683, + "grad_norm": 3.719403028488159, + "learning_rate": 6.760446918600151e-06, + "loss": 1.0202, + "step": 5240 + }, + { + "epoch": 0.4040240518038853, + "grad_norm": 3.614016056060791, + "learning_rate": 6.759278359083141e-06, + "loss": 1.0067, + "step": 5241 + }, + { + "epoch": 0.40410114091890226, + "grad_norm": 3.3453943729400635, + "learning_rate": 6.758109689884236e-06, + "loss": 1.0046, + "step": 5242 + }, + { + "epoch": 0.40417823003391923, + "grad_norm": 3.9464776515960693, + "learning_rate": 6.756940911076299e-06, + "loss": 1.0484, + "step": 5243 + }, + { + "epoch": 0.40425531914893614, + "grad_norm": 3.7068941593170166, + "learning_rate": 6.755772022732194e-06, + "loss": 0.9906, + "step": 5244 + }, + { + "epoch": 0.4043324082639531, + "grad_norm": 3.643101215362549, + "learning_rate": 6.754603024924799e-06, + "loss": 1.0175, + "step": 5245 + }, + { + "epoch": 0.4044094973789701, + "grad_norm": 3.6078343391418457, + "learning_rate": 6.753433917726991e-06, + "loss": 1.0381, + "step": 5246 + }, + { + "epoch": 0.40448658649398705, + "grad_norm": 3.2811944484710693, + "learning_rate": 6.75226470121166e-06, + "loss": 0.946, + "step": 5247 + }, + { + "epoch": 0.404563675609004, + "grad_norm": 3.3540406227111816, + "learning_rate": 6.751095375451699e-06, + "loss": 1.0243, + "step": 5248 + }, + { + "epoch": 0.404640764724021, + "grad_norm": 3.783407211303711, + "learning_rate": 6.749925940520012e-06, + "loss": 1.0053, + "step": 5249 + }, + { + "epoch": 0.4047178538390379, + "grad_norm": 3.748215913772583, + "learning_rate": 6.7487563964895066e-06, + "loss": 0.9892, + "step": 5250 + }, + { + "epoch": 0.4047949429540549, + "grad_norm": 4.270641803741455, + "learning_rate": 6.747586743433096e-06, + "loss": 1.1076, + "step": 5251 + }, + { + "epoch": 0.40487203206907185, + "grad_norm": 3.66277813911438, + "learning_rate": 6.746416981423701e-06, + "loss": 0.9607, + "step": 5252 + }, + { + "epoch": 0.4049491211840888, + "grad_norm": 3.703913927078247, + "learning_rate": 6.7452471105342536e-06, + "loss": 0.9762, + "step": 5253 + }, + { + "epoch": 0.4050262102991058, + "grad_norm": 3.490093469619751, + "learning_rate": 6.744077130837687e-06, + "loss": 0.943, + "step": 5254 + }, + { + "epoch": 0.4051032994141227, + "grad_norm": 3.721001625061035, + "learning_rate": 6.742907042406945e-06, + "loss": 1.0145, + "step": 5255 + }, + { + "epoch": 0.4051803885291397, + "grad_norm": 3.3705310821533203, + "learning_rate": 6.741736845314977e-06, + "loss": 0.9094, + "step": 5256 + }, + { + "epoch": 0.40525747764415665, + "grad_norm": 3.7736752033233643, + "learning_rate": 6.7405665396347345e-06, + "loss": 0.8783, + "step": 5257 + }, + { + "epoch": 0.4053345667591736, + "grad_norm": 3.2952444553375244, + "learning_rate": 6.739396125439184e-06, + "loss": 0.9976, + "step": 5258 + }, + { + "epoch": 0.4054116558741906, + "grad_norm": 3.4504001140594482, + "learning_rate": 6.7382256028012945e-06, + "loss": 0.9702, + "step": 5259 + }, + { + "epoch": 0.4054887449892075, + "grad_norm": 3.792487621307373, + "learning_rate": 6.7370549717940405e-06, + "loss": 1.0202, + "step": 5260 + }, + { + "epoch": 0.4055658341042245, + "grad_norm": 3.9348297119140625, + "learning_rate": 6.735884232490405e-06, + "loss": 1.0302, + "step": 5261 + }, + { + "epoch": 0.40564292321924145, + "grad_norm": 4.174359321594238, + "learning_rate": 6.734713384963379e-06, + "loss": 1.0545, + "step": 5262 + }, + { + "epoch": 0.4057200123342584, + "grad_norm": 3.5805842876434326, + "learning_rate": 6.733542429285957e-06, + "loss": 1.0303, + "step": 5263 + }, + { + "epoch": 0.4057971014492754, + "grad_norm": 3.4815800189971924, + "learning_rate": 6.732371365531144e-06, + "loss": 0.9535, + "step": 5264 + }, + { + "epoch": 0.4058741905642923, + "grad_norm": 3.540489435195923, + "learning_rate": 6.731200193771947e-06, + "loss": 0.9182, + "step": 5265 + }, + { + "epoch": 0.4059512796793093, + "grad_norm": 3.4096593856811523, + "learning_rate": 6.730028914081384e-06, + "loss": 1.0019, + "step": 5266 + }, + { + "epoch": 0.40602836879432624, + "grad_norm": 3.462355136871338, + "learning_rate": 6.72885752653248e-06, + "loss": 0.9894, + "step": 5267 + }, + { + "epoch": 0.4061054579093432, + "grad_norm": 3.640557289123535, + "learning_rate": 6.7276860311982614e-06, + "loss": 0.9686, + "step": 5268 + }, + { + "epoch": 0.4061825470243602, + "grad_norm": 3.729519844055176, + "learning_rate": 6.726514428151767e-06, + "loss": 1.0958, + "step": 5269 + }, + { + "epoch": 0.4062596361393771, + "grad_norm": 3.461050033569336, + "learning_rate": 6.725342717466041e-06, + "loss": 0.961, + "step": 5270 + }, + { + "epoch": 0.40633672525439407, + "grad_norm": 3.6534311771392822, + "learning_rate": 6.724170899214131e-06, + "loss": 1.0216, + "step": 5271 + }, + { + "epoch": 0.40641381436941104, + "grad_norm": 3.3762400150299072, + "learning_rate": 6.7229989734690956e-06, + "loss": 0.9508, + "step": 5272 + }, + { + "epoch": 0.406490903484428, + "grad_norm": 3.6674129962921143, + "learning_rate": 6.721826940303999e-06, + "loss": 0.8763, + "step": 5273 + }, + { + "epoch": 0.406567992599445, + "grad_norm": 3.514495849609375, + "learning_rate": 6.720654799791908e-06, + "loss": 0.9273, + "step": 5274 + }, + { + "epoch": 0.4066450817144619, + "grad_norm": 3.849905490875244, + "learning_rate": 6.719482552005903e-06, + "loss": 1.0703, + "step": 5275 + }, + { + "epoch": 0.40672217082947887, + "grad_norm": 3.744799852371216, + "learning_rate": 6.718310197019066e-06, + "loss": 1.0608, + "step": 5276 + }, + { + "epoch": 0.40679925994449584, + "grad_norm": 3.5321202278137207, + "learning_rate": 6.7171377349044875e-06, + "loss": 0.9747, + "step": 5277 + }, + { + "epoch": 0.4068763490595128, + "grad_norm": 3.6646018028259277, + "learning_rate": 6.7159651657352656e-06, + "loss": 1.0208, + "step": 5278 + }, + { + "epoch": 0.4069534381745298, + "grad_norm": 3.462653636932373, + "learning_rate": 6.714792489584501e-06, + "loss": 0.9575, + "step": 5279 + }, + { + "epoch": 0.4070305272895467, + "grad_norm": 3.5436105728149414, + "learning_rate": 6.7136197065253075e-06, + "loss": 0.9758, + "step": 5280 + }, + { + "epoch": 0.40710761640456367, + "grad_norm": 3.5151219367980957, + "learning_rate": 6.7124468166308e-06, + "loss": 1.0746, + "step": 5281 + }, + { + "epoch": 0.40718470551958064, + "grad_norm": 3.4494199752807617, + "learning_rate": 6.711273819974101e-06, + "loss": 0.9407, + "step": 5282 + }, + { + "epoch": 0.4072617946345976, + "grad_norm": 3.708519220352173, + "learning_rate": 6.710100716628345e-06, + "loss": 1.044, + "step": 5283 + }, + { + "epoch": 0.4073388837496146, + "grad_norm": 3.8825523853302, + "learning_rate": 6.708927506666664e-06, + "loss": 0.9725, + "step": 5284 + }, + { + "epoch": 0.4074159728646315, + "grad_norm": 3.7221336364746094, + "learning_rate": 6.707754190162203e-06, + "loss": 0.9684, + "step": 5285 + }, + { + "epoch": 0.40749306197964846, + "grad_norm": 3.6459972858428955, + "learning_rate": 6.7065807671881155e-06, + "loss": 1.0946, + "step": 5286 + }, + { + "epoch": 0.40757015109466543, + "grad_norm": 3.2291955947875977, + "learning_rate": 6.705407237817554e-06, + "loss": 0.8986, + "step": 5287 + }, + { + "epoch": 0.4076472402096824, + "grad_norm": 3.8956899642944336, + "learning_rate": 6.704233602123685e-06, + "loss": 1.0724, + "step": 5288 + }, + { + "epoch": 0.4077243293246994, + "grad_norm": 3.887136459350586, + "learning_rate": 6.703059860179677e-06, + "loss": 1.0851, + "step": 5289 + }, + { + "epoch": 0.4078014184397163, + "grad_norm": 3.6385395526885986, + "learning_rate": 6.701886012058706e-06, + "loss": 0.9677, + "step": 5290 + }, + { + "epoch": 0.40787850755473326, + "grad_norm": 3.614091634750366, + "learning_rate": 6.700712057833958e-06, + "loss": 0.991, + "step": 5291 + }, + { + "epoch": 0.40795559666975023, + "grad_norm": 3.7074344158172607, + "learning_rate": 6.69953799757862e-06, + "loss": 0.8541, + "step": 5292 + }, + { + "epoch": 0.4080326857847672, + "grad_norm": 3.504150152206421, + "learning_rate": 6.69836383136589e-06, + "loss": 0.942, + "step": 5293 + }, + { + "epoch": 0.4081097748997842, + "grad_norm": 3.62353777885437, + "learning_rate": 6.697189559268973e-06, + "loss": 0.9787, + "step": 5294 + }, + { + "epoch": 0.4081868640148011, + "grad_norm": 3.5454580783843994, + "learning_rate": 6.696015181361076e-06, + "loss": 1.0769, + "step": 5295 + }, + { + "epoch": 0.40826395312981806, + "grad_norm": 3.8468873500823975, + "learning_rate": 6.694840697715415e-06, + "loss": 1.0924, + "step": 5296 + }, + { + "epoch": 0.40834104224483503, + "grad_norm": 3.6607978343963623, + "learning_rate": 6.693666108405215e-06, + "loss": 0.9552, + "step": 5297 + }, + { + "epoch": 0.408418131359852, + "grad_norm": 3.949882984161377, + "learning_rate": 6.692491413503704e-06, + "loss": 0.9816, + "step": 5298 + }, + { + "epoch": 0.40849522047486897, + "grad_norm": 3.432330369949341, + "learning_rate": 6.691316613084121e-06, + "loss": 0.8183, + "step": 5299 + }, + { + "epoch": 0.4085723095898859, + "grad_norm": 3.6200854778289795, + "learning_rate": 6.690141707219706e-06, + "loss": 0.992, + "step": 5300 + }, + { + "epoch": 0.40864939870490286, + "grad_norm": 3.9913625717163086, + "learning_rate": 6.688966695983708e-06, + "loss": 0.9583, + "step": 5301 + }, + { + "epoch": 0.4087264878199198, + "grad_norm": 3.4617879390716553, + "learning_rate": 6.6877915794493855e-06, + "loss": 0.9242, + "step": 5302 + }, + { + "epoch": 0.4088035769349368, + "grad_norm": 3.69291615486145, + "learning_rate": 6.6866163576899985e-06, + "loss": 0.9768, + "step": 5303 + }, + { + "epoch": 0.40888066604995377, + "grad_norm": 3.7546987533569336, + "learning_rate": 6.6854410307788175e-06, + "loss": 0.9785, + "step": 5304 + }, + { + "epoch": 0.4089577551649707, + "grad_norm": 3.5481560230255127, + "learning_rate": 6.684265598789117e-06, + "loss": 1.0147, + "step": 5305 + }, + { + "epoch": 0.40903484427998765, + "grad_norm": 4.156661510467529, + "learning_rate": 6.683090061794179e-06, + "loss": 0.9727, + "step": 5306 + }, + { + "epoch": 0.4091119333950046, + "grad_norm": 3.6532485485076904, + "learning_rate": 6.6819144198672925e-06, + "loss": 0.9808, + "step": 5307 + }, + { + "epoch": 0.4091890225100216, + "grad_norm": 3.576277494430542, + "learning_rate": 6.6807386730817525e-06, + "loss": 0.9245, + "step": 5308 + }, + { + "epoch": 0.40926611162503856, + "grad_norm": 3.498910903930664, + "learning_rate": 6.679562821510862e-06, + "loss": 0.9565, + "step": 5309 + }, + { + "epoch": 0.4093432007400555, + "grad_norm": 3.947864294052124, + "learning_rate": 6.6783868652279285e-06, + "loss": 1.0307, + "step": 5310 + }, + { + "epoch": 0.40942028985507245, + "grad_norm": 3.714496612548828, + "learning_rate": 6.677210804306266e-06, + "loss": 0.9651, + "step": 5311 + }, + { + "epoch": 0.4094973789700894, + "grad_norm": 3.6817400455474854, + "learning_rate": 6.676034638819197e-06, + "loss": 0.932, + "step": 5312 + }, + { + "epoch": 0.4095744680851064, + "grad_norm": 3.330031394958496, + "learning_rate": 6.674858368840047e-06, + "loss": 0.964, + "step": 5313 + }, + { + "epoch": 0.40965155720012336, + "grad_norm": 3.4984524250030518, + "learning_rate": 6.673681994442153e-06, + "loss": 0.9394, + "step": 5314 + }, + { + "epoch": 0.4097286463151403, + "grad_norm": 3.6142520904541016, + "learning_rate": 6.6725055156988545e-06, + "loss": 1.0045, + "step": 5315 + }, + { + "epoch": 0.40980573543015725, + "grad_norm": 3.7212975025177, + "learning_rate": 6.671328932683499e-06, + "loss": 1.0474, + "step": 5316 + }, + { + "epoch": 0.4098828245451742, + "grad_norm": 3.747412919998169, + "learning_rate": 6.670152245469441e-06, + "loss": 1.0149, + "step": 5317 + }, + { + "epoch": 0.4099599136601912, + "grad_norm": 3.9034531116485596, + "learning_rate": 6.6689754541300426e-06, + "loss": 1.1157, + "step": 5318 + }, + { + "epoch": 0.41003700277520816, + "grad_norm": 4.028858184814453, + "learning_rate": 6.667798558738664e-06, + "loss": 1.0, + "step": 5319 + }, + { + "epoch": 0.4101140918902251, + "grad_norm": 3.367152690887451, + "learning_rate": 6.666621559368687e-06, + "loss": 0.9779, + "step": 5320 + }, + { + "epoch": 0.41019118100524204, + "grad_norm": 5.969235420227051, + "learning_rate": 6.665444456093485e-06, + "loss": 1.0506, + "step": 5321 + }, + { + "epoch": 0.410268270120259, + "grad_norm": 3.4636454582214355, + "learning_rate": 6.664267248986447e-06, + "loss": 0.941, + "step": 5322 + }, + { + "epoch": 0.410345359235276, + "grad_norm": 3.7874643802642822, + "learning_rate": 6.663089938120967e-06, + "loss": 1.089, + "step": 5323 + }, + { + "epoch": 0.41042244835029296, + "grad_norm": 3.6759419441223145, + "learning_rate": 6.6619125235704414e-06, + "loss": 0.9959, + "step": 5324 + }, + { + "epoch": 0.41049953746530987, + "grad_norm": 3.6804399490356445, + "learning_rate": 6.660735005408278e-06, + "loss": 1.0022, + "step": 5325 + }, + { + "epoch": 0.41057662658032684, + "grad_norm": 3.5297484397888184, + "learning_rate": 6.659557383707887e-06, + "loss": 1.0264, + "step": 5326 + }, + { + "epoch": 0.4106537156953438, + "grad_norm": 3.6392202377319336, + "learning_rate": 6.65837965854269e-06, + "loss": 0.9898, + "step": 5327 + }, + { + "epoch": 0.4107308048103608, + "grad_norm": 3.438964366912842, + "learning_rate": 6.65720182998611e-06, + "loss": 0.9249, + "step": 5328 + }, + { + "epoch": 0.41080789392537775, + "grad_norm": 3.7228152751922607, + "learning_rate": 6.656023898111577e-06, + "loss": 1.0178, + "step": 5329 + }, + { + "epoch": 0.41088498304039467, + "grad_norm": 3.804844856262207, + "learning_rate": 6.654845862992532e-06, + "loss": 0.8827, + "step": 5330 + }, + { + "epoch": 0.41096207215541164, + "grad_norm": 3.4772403240203857, + "learning_rate": 6.653667724702419e-06, + "loss": 1.0009, + "step": 5331 + }, + { + "epoch": 0.4110391612704286, + "grad_norm": 3.5902342796325684, + "learning_rate": 6.652489483314686e-06, + "loss": 1.0335, + "step": 5332 + }, + { + "epoch": 0.4111162503854456, + "grad_norm": 3.510636329650879, + "learning_rate": 6.651311138902792e-06, + "loss": 1.0807, + "step": 5333 + }, + { + "epoch": 0.41119333950046255, + "grad_norm": 3.3497116565704346, + "learning_rate": 6.650132691540203e-06, + "loss": 0.9117, + "step": 5334 + }, + { + "epoch": 0.4112704286154795, + "grad_norm": 3.712567090988159, + "learning_rate": 6.648954141300386e-06, + "loss": 0.9818, + "step": 5335 + }, + { + "epoch": 0.41134751773049644, + "grad_norm": 3.705669641494751, + "learning_rate": 6.647775488256818e-06, + "loss": 0.957, + "step": 5336 + }, + { + "epoch": 0.4114246068455134, + "grad_norm": 3.471576452255249, + "learning_rate": 6.646596732482982e-06, + "loss": 0.9049, + "step": 5337 + }, + { + "epoch": 0.4115016959605304, + "grad_norm": 3.4686636924743652, + "learning_rate": 6.645417874052368e-06, + "loss": 1.0353, + "step": 5338 + }, + { + "epoch": 0.41157878507554735, + "grad_norm": 3.8832995891571045, + "learning_rate": 6.644238913038472e-06, + "loss": 1.0707, + "step": 5339 + }, + { + "epoch": 0.4116558741905643, + "grad_norm": 3.786958694458008, + "learning_rate": 6.643059849514795e-06, + "loss": 1.0643, + "step": 5340 + }, + { + "epoch": 0.41173296330558123, + "grad_norm": 3.698564052581787, + "learning_rate": 6.641880683554846e-06, + "loss": 1.0087, + "step": 5341 + }, + { + "epoch": 0.4118100524205982, + "grad_norm": 3.6572458744049072, + "learning_rate": 6.640701415232139e-06, + "loss": 0.9285, + "step": 5342 + }, + { + "epoch": 0.4118871415356152, + "grad_norm": 3.6328537464141846, + "learning_rate": 6.639522044620197e-06, + "loss": 1.0668, + "step": 5343 + }, + { + "epoch": 0.41196423065063215, + "grad_norm": 3.2839369773864746, + "learning_rate": 6.638342571792548e-06, + "loss": 0.8407, + "step": 5344 + }, + { + "epoch": 0.4120413197656491, + "grad_norm": 3.6459763050079346, + "learning_rate": 6.637162996822724e-06, + "loss": 0.9962, + "step": 5345 + }, + { + "epoch": 0.41211840888066603, + "grad_norm": 3.8563995361328125, + "learning_rate": 6.635983319784265e-06, + "loss": 0.9631, + "step": 5346 + }, + { + "epoch": 0.412195497995683, + "grad_norm": 3.544633388519287, + "learning_rate": 6.63480354075072e-06, + "loss": 0.8998, + "step": 5347 + }, + { + "epoch": 0.4122725871107, + "grad_norm": 3.736931085586548, + "learning_rate": 6.633623659795642e-06, + "loss": 0.985, + "step": 5348 + }, + { + "epoch": 0.41234967622571694, + "grad_norm": 3.825957775115967, + "learning_rate": 6.632443676992588e-06, + "loss": 0.9953, + "step": 5349 + }, + { + "epoch": 0.4124267653407339, + "grad_norm": 3.397263288497925, + "learning_rate": 6.631263592415127e-06, + "loss": 0.9607, + "step": 5350 + }, + { + "epoch": 0.41250385445575083, + "grad_norm": 3.639387845993042, + "learning_rate": 6.630083406136829e-06, + "loss": 0.9568, + "step": 5351 + }, + { + "epoch": 0.4125809435707678, + "grad_norm": 3.8172554969787598, + "learning_rate": 6.628903118231274e-06, + "loss": 0.9392, + "step": 5352 + }, + { + "epoch": 0.41265803268578477, + "grad_norm": 3.8433003425598145, + "learning_rate": 6.627722728772044e-06, + "loss": 0.987, + "step": 5353 + }, + { + "epoch": 0.41273512180080174, + "grad_norm": 3.482104778289795, + "learning_rate": 6.6265422378327314e-06, + "loss": 0.9403, + "step": 5354 + }, + { + "epoch": 0.4128122109158187, + "grad_norm": 3.8958816528320312, + "learning_rate": 6.625361645486936e-06, + "loss": 0.9892, + "step": 5355 + }, + { + "epoch": 0.4128893000308356, + "grad_norm": 3.7703473567962646, + "learning_rate": 6.62418095180826e-06, + "loss": 0.951, + "step": 5356 + }, + { + "epoch": 0.4129663891458526, + "grad_norm": 3.737750291824341, + "learning_rate": 6.623000156870313e-06, + "loss": 0.9753, + "step": 5357 + }, + { + "epoch": 0.41304347826086957, + "grad_norm": 3.6272356510162354, + "learning_rate": 6.621819260746713e-06, + "loss": 1.0136, + "step": 5358 + }, + { + "epoch": 0.41312056737588654, + "grad_norm": 3.797292470932007, + "learning_rate": 6.62063826351108e-06, + "loss": 1.0478, + "step": 5359 + }, + { + "epoch": 0.4131976564909035, + "grad_norm": 4.469625949859619, + "learning_rate": 6.619457165237046e-06, + "loss": 1.0983, + "step": 5360 + }, + { + "epoch": 0.4132747456059204, + "grad_norm": 3.5453269481658936, + "learning_rate": 6.618275965998244e-06, + "loss": 1.0379, + "step": 5361 + }, + { + "epoch": 0.4133518347209374, + "grad_norm": 3.681480646133423, + "learning_rate": 6.617094665868317e-06, + "loss": 0.9983, + "step": 5362 + }, + { + "epoch": 0.41342892383595437, + "grad_norm": 3.6391947269439697, + "learning_rate": 6.615913264920912e-06, + "loss": 0.9968, + "step": 5363 + }, + { + "epoch": 0.41350601295097134, + "grad_norm": 3.4914567470550537, + "learning_rate": 6.614731763229686e-06, + "loss": 0.9064, + "step": 5364 + }, + { + "epoch": 0.4135831020659883, + "grad_norm": 3.5731704235076904, + "learning_rate": 6.613550160868297e-06, + "loss": 1.0242, + "step": 5365 + }, + { + "epoch": 0.4136601911810052, + "grad_norm": 4.226243019104004, + "learning_rate": 6.612368457910412e-06, + "loss": 1.0554, + "step": 5366 + }, + { + "epoch": 0.4137372802960222, + "grad_norm": 3.477130889892578, + "learning_rate": 6.611186654429704e-06, + "loss": 0.8728, + "step": 5367 + }, + { + "epoch": 0.41381436941103916, + "grad_norm": 3.375094175338745, + "learning_rate": 6.6100047504998535e-06, + "loss": 0.9865, + "step": 5368 + }, + { + "epoch": 0.41389145852605613, + "grad_norm": 3.527829647064209, + "learning_rate": 6.6088227461945434e-06, + "loss": 1.0235, + "step": 5369 + }, + { + "epoch": 0.4139685476410731, + "grad_norm": 3.82974910736084, + "learning_rate": 6.607640641587469e-06, + "loss": 0.9777, + "step": 5370 + }, + { + "epoch": 0.41404563675609, + "grad_norm": 3.5548770427703857, + "learning_rate": 6.606458436752327e-06, + "loss": 0.965, + "step": 5371 + }, + { + "epoch": 0.414122725871107, + "grad_norm": 3.665213108062744, + "learning_rate": 6.60527613176282e-06, + "loss": 0.9712, + "step": 5372 + }, + { + "epoch": 0.41419981498612396, + "grad_norm": 3.664933443069458, + "learning_rate": 6.604093726692661e-06, + "loss": 0.91, + "step": 5373 + }, + { + "epoch": 0.41427690410114093, + "grad_norm": 3.6058082580566406, + "learning_rate": 6.602911221615567e-06, + "loss": 1.0525, + "step": 5374 + }, + { + "epoch": 0.4143539932161579, + "grad_norm": 3.68048095703125, + "learning_rate": 6.601728616605259e-06, + "loss": 0.9375, + "step": 5375 + }, + { + "epoch": 0.4144310823311748, + "grad_norm": 3.4503397941589355, + "learning_rate": 6.600545911735468e-06, + "loss": 0.9779, + "step": 5376 + }, + { + "epoch": 0.4145081714461918, + "grad_norm": 3.573887586593628, + "learning_rate": 6.599363107079927e-06, + "loss": 0.9744, + "step": 5377 + }, + { + "epoch": 0.41458526056120876, + "grad_norm": 3.5022692680358887, + "learning_rate": 6.59818020271238e-06, + "loss": 0.9208, + "step": 5378 + }, + { + "epoch": 0.4146623496762257, + "grad_norm": 3.4239299297332764, + "learning_rate": 6.596997198706576e-06, + "loss": 0.9342, + "step": 5379 + }, + { + "epoch": 0.4147394387912427, + "grad_norm": 4.041162014007568, + "learning_rate": 6.595814095136267e-06, + "loss": 0.8855, + "step": 5380 + }, + { + "epoch": 0.4148165279062596, + "grad_norm": 4.2460761070251465, + "learning_rate": 6.594630892075213e-06, + "loss": 1.0153, + "step": 5381 + }, + { + "epoch": 0.4148936170212766, + "grad_norm": 3.367917537689209, + "learning_rate": 6.593447589597184e-06, + "loss": 0.9544, + "step": 5382 + }, + { + "epoch": 0.41497070613629355, + "grad_norm": 3.5523362159729004, + "learning_rate": 6.5922641877759484e-06, + "loss": 0.9909, + "step": 5383 + }, + { + "epoch": 0.4150477952513105, + "grad_norm": 3.4318056106567383, + "learning_rate": 6.591080686685289e-06, + "loss": 0.9482, + "step": 5384 + }, + { + "epoch": 0.4151248843663275, + "grad_norm": 3.5694563388824463, + "learning_rate": 6.589897086398989e-06, + "loss": 0.9606, + "step": 5385 + }, + { + "epoch": 0.4152019734813444, + "grad_norm": 3.1972849369049072, + "learning_rate": 6.588713386990837e-06, + "loss": 0.9337, + "step": 5386 + }, + { + "epoch": 0.4152790625963614, + "grad_norm": 3.958773136138916, + "learning_rate": 6.5875295885346356e-06, + "loss": 0.8853, + "step": 5387 + }, + { + "epoch": 0.41535615171137835, + "grad_norm": 4.037240982055664, + "learning_rate": 6.5863456911041865e-06, + "loss": 1.0532, + "step": 5388 + }, + { + "epoch": 0.4154332408263953, + "grad_norm": 3.808276891708374, + "learning_rate": 6.585161694773301e-06, + "loss": 1.0308, + "step": 5389 + }, + { + "epoch": 0.4155103299414123, + "grad_norm": 3.6094319820404053, + "learning_rate": 6.583977599615792e-06, + "loss": 1.0333, + "step": 5390 + }, + { + "epoch": 0.4155874190564292, + "grad_norm": 3.879518747329712, + "learning_rate": 6.582793405705482e-06, + "loss": 1.0241, + "step": 5391 + }, + { + "epoch": 0.4156645081714462, + "grad_norm": 3.571319818496704, + "learning_rate": 6.581609113116203e-06, + "loss": 0.9956, + "step": 5392 + }, + { + "epoch": 0.41574159728646315, + "grad_norm": 3.7715680599212646, + "learning_rate": 6.580424721921785e-06, + "loss": 0.9709, + "step": 5393 + }, + { + "epoch": 0.4158186864014801, + "grad_norm": 3.707052230834961, + "learning_rate": 6.579240232196073e-06, + "loss": 0.9661, + "step": 5394 + }, + { + "epoch": 0.4158957755164971, + "grad_norm": 3.3030142784118652, + "learning_rate": 6.578055644012911e-06, + "loss": 0.9728, + "step": 5395 + }, + { + "epoch": 0.415972864631514, + "grad_norm": 3.5438342094421387, + "learning_rate": 6.576870957446151e-06, + "loss": 0.9302, + "step": 5396 + }, + { + "epoch": 0.416049953746531, + "grad_norm": 3.5697009563446045, + "learning_rate": 6.575686172569655e-06, + "loss": 1.1335, + "step": 5397 + }, + { + "epoch": 0.41612704286154795, + "grad_norm": 3.6315059661865234, + "learning_rate": 6.574501289457287e-06, + "loss": 1.1065, + "step": 5398 + }, + { + "epoch": 0.4162041319765649, + "grad_norm": 3.3344013690948486, + "learning_rate": 6.573316308182917e-06, + "loss": 0.9045, + "step": 5399 + }, + { + "epoch": 0.4162812210915819, + "grad_norm": 3.6112499237060547, + "learning_rate": 6.5721312288204254e-06, + "loss": 1.0306, + "step": 5400 + }, + { + "epoch": 0.4163583102065988, + "grad_norm": 3.63188099861145, + "learning_rate": 6.570946051443693e-06, + "loss": 0.9627, + "step": 5401 + }, + { + "epoch": 0.4164353993216158, + "grad_norm": 3.795747995376587, + "learning_rate": 6.5697607761266105e-06, + "loss": 1.1105, + "step": 5402 + }, + { + "epoch": 0.41651248843663274, + "grad_norm": 3.7781665325164795, + "learning_rate": 6.568575402943073e-06, + "loss": 0.9043, + "step": 5403 + }, + { + "epoch": 0.4165895775516497, + "grad_norm": 3.3593337535858154, + "learning_rate": 6.567389931966983e-06, + "loss": 1.0182, + "step": 5404 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 4.008146286010742, + "learning_rate": 6.566204363272248e-06, + "loss": 1.0122, + "step": 5405 + }, + { + "epoch": 0.4167437557816836, + "grad_norm": 3.6243855953216553, + "learning_rate": 6.565018696932786e-06, + "loss": 0.9194, + "step": 5406 + }, + { + "epoch": 0.41682084489670057, + "grad_norm": 3.93573260307312, + "learning_rate": 6.5638329330225096e-06, + "loss": 0.9881, + "step": 5407 + }, + { + "epoch": 0.41689793401171754, + "grad_norm": 3.9312803745269775, + "learning_rate": 6.5626470716153515e-06, + "loss": 0.9584, + "step": 5408 + }, + { + "epoch": 0.4169750231267345, + "grad_norm": 3.7972381114959717, + "learning_rate": 6.561461112785239e-06, + "loss": 0.8669, + "step": 5409 + }, + { + "epoch": 0.4170521122417515, + "grad_norm": 3.8700931072235107, + "learning_rate": 6.5602750566061154e-06, + "loss": 1.0306, + "step": 5410 + }, + { + "epoch": 0.4171292013567684, + "grad_norm": 3.5603837966918945, + "learning_rate": 6.559088903151923e-06, + "loss": 1.0485, + "step": 5411 + }, + { + "epoch": 0.41720629047178537, + "grad_norm": 3.537443161010742, + "learning_rate": 6.5579026524966106e-06, + "loss": 1.0481, + "step": 5412 + }, + { + "epoch": 0.41728337958680234, + "grad_norm": 3.662034511566162, + "learning_rate": 6.5567163047141395e-06, + "loss": 1.0871, + "step": 5413 + }, + { + "epoch": 0.4173604687018193, + "grad_norm": 3.207895517349243, + "learning_rate": 6.555529859878466e-06, + "loss": 0.8474, + "step": 5414 + }, + { + "epoch": 0.4174375578168363, + "grad_norm": 3.5553321838378906, + "learning_rate": 6.554343318063563e-06, + "loss": 1.0164, + "step": 5415 + }, + { + "epoch": 0.4175146469318532, + "grad_norm": 4.284764766693115, + "learning_rate": 6.553156679343404e-06, + "loss": 0.9951, + "step": 5416 + }, + { + "epoch": 0.41759173604687017, + "grad_norm": 3.8002851009368896, + "learning_rate": 6.551969943791972e-06, + "loss": 1.0258, + "step": 5417 + }, + { + "epoch": 0.41766882516188714, + "grad_norm": 4.005855560302734, + "learning_rate": 6.550783111483249e-06, + "loss": 0.9425, + "step": 5418 + }, + { + "epoch": 0.4177459142769041, + "grad_norm": 3.5410430431365967, + "learning_rate": 6.549596182491233e-06, + "loss": 0.9774, + "step": 5419 + }, + { + "epoch": 0.4178230033919211, + "grad_norm": 3.704803943634033, + "learning_rate": 6.548409156889919e-06, + "loss": 0.9802, + "step": 5420 + }, + { + "epoch": 0.41790009250693805, + "grad_norm": 3.7099010944366455, + "learning_rate": 6.5472220347533145e-06, + "loss": 0.9581, + "step": 5421 + }, + { + "epoch": 0.41797718162195496, + "grad_norm": 3.5973217487335205, + "learning_rate": 6.546034816155429e-06, + "loss": 0.9321, + "step": 5422 + }, + { + "epoch": 0.41805427073697193, + "grad_norm": 4.097982406616211, + "learning_rate": 6.54484750117028e-06, + "loss": 1.0699, + "step": 5423 + }, + { + "epoch": 0.4181313598519889, + "grad_norm": 3.9153554439544678, + "learning_rate": 6.543660089871891e-06, + "loss": 0.9045, + "step": 5424 + }, + { + "epoch": 0.4182084489670059, + "grad_norm": 3.366682767868042, + "learning_rate": 6.5424725823342895e-06, + "loss": 1.0124, + "step": 5425 + }, + { + "epoch": 0.41828553808202285, + "grad_norm": 3.677605390548706, + "learning_rate": 6.54128497863151e-06, + "loss": 1.0661, + "step": 5426 + }, + { + "epoch": 0.41836262719703976, + "grad_norm": 3.49324369430542, + "learning_rate": 6.540097278837596e-06, + "loss": 0.9011, + "step": 5427 + }, + { + "epoch": 0.41843971631205673, + "grad_norm": 3.3589603900909424, + "learning_rate": 6.538909483026593e-06, + "loss": 0.9454, + "step": 5428 + }, + { + "epoch": 0.4185168054270737, + "grad_norm": 3.4064254760742188, + "learning_rate": 6.537721591272553e-06, + "loss": 0.9948, + "step": 5429 + }, + { + "epoch": 0.41859389454209067, + "grad_norm": 3.785304546356201, + "learning_rate": 6.536533603649536e-06, + "loss": 0.9664, + "step": 5430 + }, + { + "epoch": 0.41867098365710764, + "grad_norm": 3.5690832138061523, + "learning_rate": 6.5353455202316075e-06, + "loss": 0.9863, + "step": 5431 + }, + { + "epoch": 0.41874807277212456, + "grad_norm": 3.667205572128296, + "learning_rate": 6.5341573410928376e-06, + "loss": 1.0341, + "step": 5432 + }, + { + "epoch": 0.41882516188714153, + "grad_norm": 3.1317427158355713, + "learning_rate": 6.532969066307302e-06, + "loss": 0.9154, + "step": 5433 + }, + { + "epoch": 0.4189022510021585, + "grad_norm": 3.749356746673584, + "learning_rate": 6.531780695949086e-06, + "loss": 0.9678, + "step": 5434 + }, + { + "epoch": 0.41897934011717547, + "grad_norm": 3.7041513919830322, + "learning_rate": 6.530592230092276e-06, + "loss": 1.0266, + "step": 5435 + }, + { + "epoch": 0.41905642923219244, + "grad_norm": 3.199434518814087, + "learning_rate": 6.529403668810968e-06, + "loss": 0.964, + "step": 5436 + }, + { + "epoch": 0.41913351834720936, + "grad_norm": 3.5290520191192627, + "learning_rate": 6.528215012179262e-06, + "loss": 0.9869, + "step": 5437 + }, + { + "epoch": 0.4192106074622263, + "grad_norm": 3.9519081115722656, + "learning_rate": 6.527026260271265e-06, + "loss": 1.0839, + "step": 5438 + }, + { + "epoch": 0.4192876965772433, + "grad_norm": 3.520599603652954, + "learning_rate": 6.525837413161089e-06, + "loss": 1.0857, + "step": 5439 + }, + { + "epoch": 0.41936478569226027, + "grad_norm": 3.515936851501465, + "learning_rate": 6.524648470922854e-06, + "loss": 1.067, + "step": 5440 + }, + { + "epoch": 0.41944187480727724, + "grad_norm": 3.486137628555298, + "learning_rate": 6.523459433630681e-06, + "loss": 0.9586, + "step": 5441 + }, + { + "epoch": 0.41951896392229415, + "grad_norm": 3.7735965251922607, + "learning_rate": 6.522270301358704e-06, + "loss": 1.0085, + "step": 5442 + }, + { + "epoch": 0.4195960530373111, + "grad_norm": 3.485811948776245, + "learning_rate": 6.521081074181058e-06, + "loss": 0.9666, + "step": 5443 + }, + { + "epoch": 0.4196731421523281, + "grad_norm": 3.6323955059051514, + "learning_rate": 6.519891752171884e-06, + "loss": 1.0444, + "step": 5444 + }, + { + "epoch": 0.41975023126734506, + "grad_norm": 3.7233123779296875, + "learning_rate": 6.518702335405331e-06, + "loss": 0.9007, + "step": 5445 + }, + { + "epoch": 0.41982732038236203, + "grad_norm": 3.387659788131714, + "learning_rate": 6.517512823955554e-06, + "loss": 0.9503, + "step": 5446 + }, + { + "epoch": 0.41990440949737895, + "grad_norm": 3.6936397552490234, + "learning_rate": 6.516323217896712e-06, + "loss": 1.012, + "step": 5447 + }, + { + "epoch": 0.4199814986123959, + "grad_norm": 3.6477158069610596, + "learning_rate": 6.515133517302969e-06, + "loss": 1.0494, + "step": 5448 + }, + { + "epoch": 0.4200585877274129, + "grad_norm": 3.4203903675079346, + "learning_rate": 6.513943722248499e-06, + "loss": 0.8738, + "step": 5449 + }, + { + "epoch": 0.42013567684242986, + "grad_norm": 3.43131160736084, + "learning_rate": 6.512753832807479e-06, + "loss": 0.8789, + "step": 5450 + }, + { + "epoch": 0.42021276595744683, + "grad_norm": 3.5120580196380615, + "learning_rate": 6.5115638490540925e-06, + "loss": 0.9803, + "step": 5451 + }, + { + "epoch": 0.42028985507246375, + "grad_norm": 3.5579140186309814, + "learning_rate": 6.510373771062527e-06, + "loss": 0.893, + "step": 5452 + }, + { + "epoch": 0.4203669441874807, + "grad_norm": 4.425456523895264, + "learning_rate": 6.50918359890698e-06, + "loss": 0.9491, + "step": 5453 + }, + { + "epoch": 0.4204440333024977, + "grad_norm": 3.6898512840270996, + "learning_rate": 6.507993332661653e-06, + "loss": 0.9606, + "step": 5454 + }, + { + "epoch": 0.42052112241751466, + "grad_norm": 3.4359302520751953, + "learning_rate": 6.506802972400751e-06, + "loss": 0.9471, + "step": 5455 + }, + { + "epoch": 0.42059821153253163, + "grad_norm": 3.5586142539978027, + "learning_rate": 6.5056125181984885e-06, + "loss": 0.958, + "step": 5456 + }, + { + "epoch": 0.42067530064754854, + "grad_norm": 3.591996192932129, + "learning_rate": 6.5044219701290825e-06, + "loss": 1.0096, + "step": 5457 + }, + { + "epoch": 0.4207523897625655, + "grad_norm": 3.7730772495269775, + "learning_rate": 6.503231328266758e-06, + "loss": 0.9814, + "step": 5458 + }, + { + "epoch": 0.4208294788775825, + "grad_norm": 3.378232717514038, + "learning_rate": 6.5020405926857465e-06, + "loss": 0.915, + "step": 5459 + }, + { + "epoch": 0.42090656799259946, + "grad_norm": 3.841001033782959, + "learning_rate": 6.500849763460283e-06, + "loss": 1.0572, + "step": 5460 + }, + { + "epoch": 0.4209836571076164, + "grad_norm": 3.5528838634490967, + "learning_rate": 6.4996588406646096e-06, + "loss": 1.0324, + "step": 5461 + }, + { + "epoch": 0.42106074622263334, + "grad_norm": 3.5531325340270996, + "learning_rate": 6.498467824372974e-06, + "loss": 0.9234, + "step": 5462 + }, + { + "epoch": 0.4211378353376503, + "grad_norm": 3.503661870956421, + "learning_rate": 6.497276714659631e-06, + "loss": 0.9227, + "step": 5463 + }, + { + "epoch": 0.4212149244526673, + "grad_norm": 3.5585978031158447, + "learning_rate": 6.49608551159884e-06, + "loss": 0.9453, + "step": 5464 + }, + { + "epoch": 0.42129201356768425, + "grad_norm": 3.890265941619873, + "learning_rate": 6.494894215264864e-06, + "loss": 0.9202, + "step": 5465 + }, + { + "epoch": 0.4213691026827012, + "grad_norm": 3.9904141426086426, + "learning_rate": 6.493702825731977e-06, + "loss": 0.9892, + "step": 5466 + }, + { + "epoch": 0.42144619179771814, + "grad_norm": 3.911604881286621, + "learning_rate": 6.492511343074456e-06, + "loss": 1.007, + "step": 5467 + }, + { + "epoch": 0.4215232809127351, + "grad_norm": 3.6176915168762207, + "learning_rate": 6.491319767366581e-06, + "loss": 1.0362, + "step": 5468 + }, + { + "epoch": 0.4216003700277521, + "grad_norm": 3.815973997116089, + "learning_rate": 6.490128098682642e-06, + "loss": 0.9862, + "step": 5469 + }, + { + "epoch": 0.42167745914276905, + "grad_norm": 4.3016037940979, + "learning_rate": 6.488936337096935e-06, + "loss": 0.9584, + "step": 5470 + }, + { + "epoch": 0.421754548257786, + "grad_norm": 3.49375319480896, + "learning_rate": 6.487744482683758e-06, + "loss": 0.9048, + "step": 5471 + }, + { + "epoch": 0.42183163737280294, + "grad_norm": 4.007232189178467, + "learning_rate": 6.486552535517419e-06, + "loss": 1.0449, + "step": 5472 + }, + { + "epoch": 0.4219087264878199, + "grad_norm": 3.4516263008117676, + "learning_rate": 6.485360495672226e-06, + "loss": 0.9911, + "step": 5473 + }, + { + "epoch": 0.4219858156028369, + "grad_norm": 3.6937084197998047, + "learning_rate": 6.4841683632225005e-06, + "loss": 0.9007, + "step": 5474 + }, + { + "epoch": 0.42206290471785385, + "grad_norm": 4.0262627601623535, + "learning_rate": 6.482976138242564e-06, + "loss": 1.1981, + "step": 5475 + }, + { + "epoch": 0.4221399938328708, + "grad_norm": 3.6597800254821777, + "learning_rate": 6.481783820806745e-06, + "loss": 0.898, + "step": 5476 + }, + { + "epoch": 0.42221708294788773, + "grad_norm": 3.7206947803497314, + "learning_rate": 6.4805914109893805e-06, + "loss": 0.9356, + "step": 5477 + }, + { + "epoch": 0.4222941720629047, + "grad_norm": 3.3729729652404785, + "learning_rate": 6.479398908864809e-06, + "loss": 0.9129, + "step": 5478 + }, + { + "epoch": 0.4223712611779217, + "grad_norm": 3.6832430362701416, + "learning_rate": 6.478206314507378e-06, + "loss": 0.9142, + "step": 5479 + }, + { + "epoch": 0.42244835029293865, + "grad_norm": 3.764679431915283, + "learning_rate": 6.47701362799144e-06, + "loss": 1.0789, + "step": 5480 + }, + { + "epoch": 0.4225254394079556, + "grad_norm": 3.5273795127868652, + "learning_rate": 6.47582084939135e-06, + "loss": 0.9582, + "step": 5481 + }, + { + "epoch": 0.42260252852297253, + "grad_norm": 3.7401888370513916, + "learning_rate": 6.474627978781474e-06, + "loss": 1.0972, + "step": 5482 + }, + { + "epoch": 0.4226796176379895, + "grad_norm": 3.4443490505218506, + "learning_rate": 6.473435016236181e-06, + "loss": 0.9941, + "step": 5483 + }, + { + "epoch": 0.4227567067530065, + "grad_norm": 3.8706183433532715, + "learning_rate": 6.472241961829846e-06, + "loss": 0.9752, + "step": 5484 + }, + { + "epoch": 0.42283379586802344, + "grad_norm": 3.611464023590088, + "learning_rate": 6.47104881563685e-06, + "loss": 0.9907, + "step": 5485 + }, + { + "epoch": 0.4229108849830404, + "grad_norm": 3.5316667556762695, + "learning_rate": 6.469855577731579e-06, + "loss": 0.8496, + "step": 5486 + }, + { + "epoch": 0.42298797409805733, + "grad_norm": 3.4962635040283203, + "learning_rate": 6.468662248188424e-06, + "loss": 1.0317, + "step": 5487 + }, + { + "epoch": 0.4230650632130743, + "grad_norm": 4.126564025878906, + "learning_rate": 6.467468827081786e-06, + "loss": 0.9859, + "step": 5488 + }, + { + "epoch": 0.42314215232809127, + "grad_norm": 3.768683671951294, + "learning_rate": 6.466275314486066e-06, + "loss": 0.8975, + "step": 5489 + }, + { + "epoch": 0.42321924144310824, + "grad_norm": 3.997591733932495, + "learning_rate": 6.4650817104756735e-06, + "loss": 0.9813, + "step": 5490 + }, + { + "epoch": 0.4232963305581252, + "grad_norm": 3.633054494857788, + "learning_rate": 6.463888015125026e-06, + "loss": 0.8793, + "step": 5491 + }, + { + "epoch": 0.4233734196731421, + "grad_norm": 3.5406949520111084, + "learning_rate": 6.4626942285085414e-06, + "loss": 0.8675, + "step": 5492 + }, + { + "epoch": 0.4234505087881591, + "grad_norm": 3.7187070846557617, + "learning_rate": 6.461500350700648e-06, + "loss": 1.0153, + "step": 5493 + }, + { + "epoch": 0.42352759790317607, + "grad_norm": 3.6261672973632812, + "learning_rate": 6.4603063817757746e-06, + "loss": 0.9925, + "step": 5494 + }, + { + "epoch": 0.42360468701819304, + "grad_norm": 3.54945969581604, + "learning_rate": 6.459112321808363e-06, + "loss": 1.0348, + "step": 5495 + }, + { + "epoch": 0.42368177613321, + "grad_norm": 3.686965227127075, + "learning_rate": 6.457918170872855e-06, + "loss": 0.9913, + "step": 5496 + }, + { + "epoch": 0.4237588652482269, + "grad_norm": 3.7915258407592773, + "learning_rate": 6.4567239290437e-06, + "loss": 1.025, + "step": 5497 + }, + { + "epoch": 0.4238359543632439, + "grad_norm": 3.7425386905670166, + "learning_rate": 6.455529596395353e-06, + "loss": 0.9447, + "step": 5498 + }, + { + "epoch": 0.42391304347826086, + "grad_norm": 3.5631515979766846, + "learning_rate": 6.454335173002273e-06, + "loss": 0.9225, + "step": 5499 + }, + { + "epoch": 0.42399013259327784, + "grad_norm": 3.4709506034851074, + "learning_rate": 6.4531406589389275e-06, + "loss": 0.9846, + "step": 5500 + }, + { + "epoch": 0.4240672217082948, + "grad_norm": 3.696099281311035, + "learning_rate": 6.451946054279788e-06, + "loss": 0.9241, + "step": 5501 + }, + { + "epoch": 0.4241443108233117, + "grad_norm": 3.3619208335876465, + "learning_rate": 6.450751359099332e-06, + "loss": 0.9098, + "step": 5502 + }, + { + "epoch": 0.4242213999383287, + "grad_norm": 3.8258392810821533, + "learning_rate": 6.449556573472042e-06, + "loss": 1.1477, + "step": 5503 + }, + { + "epoch": 0.42429848905334566, + "grad_norm": 3.742774724960327, + "learning_rate": 6.448361697472408e-06, + "loss": 1.0046, + "step": 5504 + }, + { + "epoch": 0.42437557816836263, + "grad_norm": 3.6903765201568604, + "learning_rate": 6.447166731174923e-06, + "loss": 0.9383, + "step": 5505 + }, + { + "epoch": 0.4244526672833796, + "grad_norm": 4.07509183883667, + "learning_rate": 6.445971674654087e-06, + "loss": 1.0482, + "step": 5506 + }, + { + "epoch": 0.4245297563983966, + "grad_norm": 3.969420909881592, + "learning_rate": 6.444776527984406e-06, + "loss": 0.9808, + "step": 5507 + }, + { + "epoch": 0.4246068455134135, + "grad_norm": 3.540527820587158, + "learning_rate": 6.443581291240392e-06, + "loss": 0.9194, + "step": 5508 + }, + { + "epoch": 0.42468393462843046, + "grad_norm": 3.800278663635254, + "learning_rate": 6.44238596449656e-06, + "loss": 0.9855, + "step": 5509 + }, + { + "epoch": 0.42476102374344743, + "grad_norm": 3.841402769088745, + "learning_rate": 6.441190547827434e-06, + "loss": 1.0468, + "step": 5510 + }, + { + "epoch": 0.4248381128584644, + "grad_norm": 3.673065423965454, + "learning_rate": 6.439995041307541e-06, + "loss": 1.0121, + "step": 5511 + }, + { + "epoch": 0.42491520197348137, + "grad_norm": 3.586517810821533, + "learning_rate": 6.438799445011415e-06, + "loss": 1.0284, + "step": 5512 + }, + { + "epoch": 0.4249922910884983, + "grad_norm": 4.116529941558838, + "learning_rate": 6.4376037590135955e-06, + "loss": 1.06, + "step": 5513 + }, + { + "epoch": 0.42506938020351526, + "grad_norm": 3.565775156021118, + "learning_rate": 6.436407983388627e-06, + "loss": 0.9655, + "step": 5514 + }, + { + "epoch": 0.4251464693185322, + "grad_norm": 3.403215169906616, + "learning_rate": 6.435212118211062e-06, + "loss": 0.9076, + "step": 5515 + }, + { + "epoch": 0.4252235584335492, + "grad_norm": 4.086001396179199, + "learning_rate": 6.434016163555452e-06, + "loss": 1.0058, + "step": 5516 + }, + { + "epoch": 0.42530064754856617, + "grad_norm": 3.650869131088257, + "learning_rate": 6.432820119496363e-06, + "loss": 0.941, + "step": 5517 + }, + { + "epoch": 0.4253777366635831, + "grad_norm": 3.593628168106079, + "learning_rate": 6.431623986108359e-06, + "loss": 1.0367, + "step": 5518 + }, + { + "epoch": 0.42545482577860005, + "grad_norm": 3.7632596492767334, + "learning_rate": 6.430427763466014e-06, + "loss": 0.9751, + "step": 5519 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 3.490562677383423, + "learning_rate": 6.429231451643907e-06, + "loss": 0.9234, + "step": 5520 + }, + { + "epoch": 0.425609004008634, + "grad_norm": 3.740706443786621, + "learning_rate": 6.428035050716621e-06, + "loss": 1.023, + "step": 5521 + }, + { + "epoch": 0.42568609312365097, + "grad_norm": 3.898951292037964, + "learning_rate": 6.426838560758746e-06, + "loss": 0.9387, + "step": 5522 + }, + { + "epoch": 0.4257631822386679, + "grad_norm": 3.8880972862243652, + "learning_rate": 6.425641981844876e-06, + "loss": 0.8878, + "step": 5523 + }, + { + "epoch": 0.42584027135368485, + "grad_norm": 3.443209648132324, + "learning_rate": 6.424445314049611e-06, + "loss": 0.8749, + "step": 5524 + }, + { + "epoch": 0.4259173604687018, + "grad_norm": 4.383835315704346, + "learning_rate": 6.42324855744756e-06, + "loss": 1.0518, + "step": 5525 + }, + { + "epoch": 0.4259944495837188, + "grad_norm": 3.5138866901397705, + "learning_rate": 6.422051712113332e-06, + "loss": 1.003, + "step": 5526 + }, + { + "epoch": 0.42607153869873576, + "grad_norm": 3.474919080734253, + "learning_rate": 6.420854778121543e-06, + "loss": 1.0002, + "step": 5527 + }, + { + "epoch": 0.4261486278137527, + "grad_norm": 3.776796817779541, + "learning_rate": 6.41965775554682e-06, + "loss": 1.0651, + "step": 5528 + }, + { + "epoch": 0.42622571692876965, + "grad_norm": 3.249838352203369, + "learning_rate": 6.418460644463787e-06, + "loss": 0.9011, + "step": 5529 + }, + { + "epoch": 0.4263028060437866, + "grad_norm": 3.8011159896850586, + "learning_rate": 6.41726344494708e-06, + "loss": 0.8613, + "step": 5530 + }, + { + "epoch": 0.4263798951588036, + "grad_norm": 4.086413383483887, + "learning_rate": 6.416066157071338e-06, + "loss": 1.0941, + "step": 5531 + }, + { + "epoch": 0.42645698427382056, + "grad_norm": 3.662161111831665, + "learning_rate": 6.414868780911203e-06, + "loss": 1.0212, + "step": 5532 + }, + { + "epoch": 0.4265340733888375, + "grad_norm": 3.3614611625671387, + "learning_rate": 6.41367131654133e-06, + "loss": 0.925, + "step": 5533 + }, + { + "epoch": 0.42661116250385445, + "grad_norm": 3.57147216796875, + "learning_rate": 6.41247376403637e-06, + "loss": 0.9396, + "step": 5534 + }, + { + "epoch": 0.4266882516188714, + "grad_norm": 3.516568422317505, + "learning_rate": 6.4112761234709866e-06, + "loss": 0.8563, + "step": 5535 + }, + { + "epoch": 0.4267653407338884, + "grad_norm": 3.589931011199951, + "learning_rate": 6.4100783949198465e-06, + "loss": 0.9974, + "step": 5536 + }, + { + "epoch": 0.42684242984890536, + "grad_norm": 3.5021722316741943, + "learning_rate": 6.408880578457622e-06, + "loss": 1.0911, + "step": 5537 + }, + { + "epoch": 0.4269195189639223, + "grad_norm": 3.5328221321105957, + "learning_rate": 6.407682674158988e-06, + "loss": 0.8483, + "step": 5538 + }, + { + "epoch": 0.42699660807893924, + "grad_norm": 4.008224010467529, + "learning_rate": 6.406484682098632e-06, + "loss": 1.0525, + "step": 5539 + }, + { + "epoch": 0.4270736971939562, + "grad_norm": 3.6036767959594727, + "learning_rate": 6.40528660235124e-06, + "loss": 0.9958, + "step": 5540 + }, + { + "epoch": 0.4271507863089732, + "grad_norm": 4.159304618835449, + "learning_rate": 6.404088434991504e-06, + "loss": 0.9268, + "step": 5541 + }, + { + "epoch": 0.42722787542399016, + "grad_norm": 4.040691375732422, + "learning_rate": 6.402890180094129e-06, + "loss": 1.1215, + "step": 5542 + }, + { + "epoch": 0.42730496453900707, + "grad_norm": 3.66817307472229, + "learning_rate": 6.401691837733815e-06, + "loss": 0.9277, + "step": 5543 + }, + { + "epoch": 0.42738205365402404, + "grad_norm": 4.073860168457031, + "learning_rate": 6.4004934079852775e-06, + "loss": 0.986, + "step": 5544 + }, + { + "epoch": 0.427459142769041, + "grad_norm": 3.9062860012054443, + "learning_rate": 6.399294890923227e-06, + "loss": 1.0575, + "step": 5545 + }, + { + "epoch": 0.427536231884058, + "grad_norm": 3.5801217555999756, + "learning_rate": 6.398096286622388e-06, + "loss": 0.9702, + "step": 5546 + }, + { + "epoch": 0.42761332099907495, + "grad_norm": 3.972623825073242, + "learning_rate": 6.396897595157487e-06, + "loss": 0.9699, + "step": 5547 + }, + { + "epoch": 0.42769041011409187, + "grad_norm": 3.516526222229004, + "learning_rate": 6.395698816603253e-06, + "loss": 0.9262, + "step": 5548 + }, + { + "epoch": 0.42776749922910884, + "grad_norm": 3.5756266117095947, + "learning_rate": 6.39449995103443e-06, + "loss": 0.9331, + "step": 5549 + }, + { + "epoch": 0.4278445883441258, + "grad_norm": 3.659475326538086, + "learning_rate": 6.393300998525754e-06, + "loss": 0.8752, + "step": 5550 + }, + { + "epoch": 0.4279216774591428, + "grad_norm": 4.002374649047852, + "learning_rate": 6.392101959151978e-06, + "loss": 1.1071, + "step": 5551 + }, + { + "epoch": 0.42799876657415975, + "grad_norm": 3.4947450160980225, + "learning_rate": 6.390902832987857e-06, + "loss": 0.9502, + "step": 5552 + }, + { + "epoch": 0.42807585568917667, + "grad_norm": 3.6659510135650635, + "learning_rate": 6.389703620108145e-06, + "loss": 0.9983, + "step": 5553 + }, + { + "epoch": 0.42815294480419364, + "grad_norm": 3.892176628112793, + "learning_rate": 6.388504320587611e-06, + "loss": 0.9481, + "step": 5554 + }, + { + "epoch": 0.4282300339192106, + "grad_norm": 3.580537796020508, + "learning_rate": 6.387304934501024e-06, + "loss": 0.9526, + "step": 5555 + }, + { + "epoch": 0.4283071230342276, + "grad_norm": 3.420477867126465, + "learning_rate": 6.386105461923159e-06, + "loss": 0.9092, + "step": 5556 + }, + { + "epoch": 0.42838421214924455, + "grad_norm": 3.7150497436523438, + "learning_rate": 6.384905902928797e-06, + "loss": 1.0413, + "step": 5557 + }, + { + "epoch": 0.42846130126426146, + "grad_norm": 3.4000418186187744, + "learning_rate": 6.383706257592725e-06, + "loss": 0.9897, + "step": 5558 + }, + { + "epoch": 0.42853839037927843, + "grad_norm": 3.336329698562622, + "learning_rate": 6.382506525989734e-06, + "loss": 1.0003, + "step": 5559 + }, + { + "epoch": 0.4286154794942954, + "grad_norm": 3.510364055633545, + "learning_rate": 6.381306708194622e-06, + "loss": 0.8863, + "step": 5560 + }, + { + "epoch": 0.4286925686093124, + "grad_norm": 3.6670730113983154, + "learning_rate": 6.38010680428219e-06, + "loss": 1.0318, + "step": 5561 + }, + { + "epoch": 0.42876965772432934, + "grad_norm": 3.3945372104644775, + "learning_rate": 6.378906814327246e-06, + "loss": 1.0504, + "step": 5562 + }, + { + "epoch": 0.42884674683934626, + "grad_norm": 4.040469169616699, + "learning_rate": 6.377706738404604e-06, + "loss": 1.1041, + "step": 5563 + }, + { + "epoch": 0.42892383595436323, + "grad_norm": 4.579511642456055, + "learning_rate": 6.376506576589082e-06, + "loss": 1.0169, + "step": 5564 + }, + { + "epoch": 0.4290009250693802, + "grad_norm": 3.5444085597991943, + "learning_rate": 6.375306328955506e-06, + "loss": 1.0349, + "step": 5565 + }, + { + "epoch": 0.42907801418439717, + "grad_norm": 3.6533939838409424, + "learning_rate": 6.374105995578701e-06, + "loss": 0.9106, + "step": 5566 + }, + { + "epoch": 0.42915510329941414, + "grad_norm": 3.527340888977051, + "learning_rate": 6.372905576533505e-06, + "loss": 0.9116, + "step": 5567 + }, + { + "epoch": 0.42923219241443106, + "grad_norm": 3.571380853652954, + "learning_rate": 6.371705071894756e-06, + "loss": 1.0186, + "step": 5568 + }, + { + "epoch": 0.42930928152944803, + "grad_norm": 3.8140509128570557, + "learning_rate": 6.3705044817373006e-06, + "loss": 0.9015, + "step": 5569 + }, + { + "epoch": 0.429386370644465, + "grad_norm": 3.546802520751953, + "learning_rate": 6.369303806135989e-06, + "loss": 1.0005, + "step": 5570 + }, + { + "epoch": 0.42946345975948197, + "grad_norm": 3.889112949371338, + "learning_rate": 6.368103045165677e-06, + "loss": 1.0829, + "step": 5571 + }, + { + "epoch": 0.42954054887449894, + "grad_norm": 3.435272216796875, + "learning_rate": 6.366902198901225e-06, + "loss": 1.0089, + "step": 5572 + }, + { + "epoch": 0.42961763798951585, + "grad_norm": 3.3787803649902344, + "learning_rate": 6.365701267417501e-06, + "loss": 0.934, + "step": 5573 + }, + { + "epoch": 0.4296947271045328, + "grad_norm": 3.8699758052825928, + "learning_rate": 6.364500250789375e-06, + "loss": 1.1315, + "step": 5574 + }, + { + "epoch": 0.4297718162195498, + "grad_norm": 3.6572110652923584, + "learning_rate": 6.363299149091726e-06, + "loss": 1.0157, + "step": 5575 + }, + { + "epoch": 0.42984890533456677, + "grad_norm": 3.4599862098693848, + "learning_rate": 6.362097962399436e-06, + "loss": 0.9634, + "step": 5576 + }, + { + "epoch": 0.42992599444958374, + "grad_norm": 3.603281259536743, + "learning_rate": 6.3608966907873925e-06, + "loss": 0.9877, + "step": 5577 + }, + { + "epoch": 0.43000308356460065, + "grad_norm": 3.7701170444488525, + "learning_rate": 6.359695334330488e-06, + "loss": 1.0674, + "step": 5578 + }, + { + "epoch": 0.4300801726796176, + "grad_norm": 3.7312378883361816, + "learning_rate": 6.358493893103623e-06, + "loss": 0.9849, + "step": 5579 + }, + { + "epoch": 0.4301572617946346, + "grad_norm": 3.3905229568481445, + "learning_rate": 6.357292367181698e-06, + "loss": 0.8549, + "step": 5580 + }, + { + "epoch": 0.43023435090965156, + "grad_norm": 3.478235960006714, + "learning_rate": 6.356090756639623e-06, + "loss": 0.8909, + "step": 5581 + }, + { + "epoch": 0.43031144002466853, + "grad_norm": 3.4896721839904785, + "learning_rate": 6.354889061552314e-06, + "loss": 0.9935, + "step": 5582 + }, + { + "epoch": 0.43038852913968545, + "grad_norm": 3.669250965118408, + "learning_rate": 6.353687281994688e-06, + "loss": 1.0689, + "step": 5583 + }, + { + "epoch": 0.4304656182547024, + "grad_norm": 3.8079261779785156, + "learning_rate": 6.352485418041673e-06, + "loss": 0.9981, + "step": 5584 + }, + { + "epoch": 0.4305427073697194, + "grad_norm": 3.5822927951812744, + "learning_rate": 6.351283469768195e-06, + "loss": 1.0457, + "step": 5585 + }, + { + "epoch": 0.43061979648473636, + "grad_norm": 3.62357759475708, + "learning_rate": 6.350081437249191e-06, + "loss": 0.9444, + "step": 5586 + }, + { + "epoch": 0.43069688559975333, + "grad_norm": 3.627352476119995, + "learning_rate": 6.348879320559602e-06, + "loss": 0.9715, + "step": 5587 + }, + { + "epoch": 0.43077397471477025, + "grad_norm": 3.5861945152282715, + "learning_rate": 6.3476771197743735e-06, + "loss": 0.9506, + "step": 5588 + }, + { + "epoch": 0.4308510638297872, + "grad_norm": 3.5179619789123535, + "learning_rate": 6.346474834968458e-06, + "loss": 1.0651, + "step": 5589 + }, + { + "epoch": 0.4309281529448042, + "grad_norm": 4.048654556274414, + "learning_rate": 6.345272466216807e-06, + "loss": 1.0671, + "step": 5590 + }, + { + "epoch": 0.43100524205982116, + "grad_norm": 3.6038835048675537, + "learning_rate": 6.344070013594388e-06, + "loss": 1.0723, + "step": 5591 + }, + { + "epoch": 0.43108233117483813, + "grad_norm": 3.3801581859588623, + "learning_rate": 6.342867477176164e-06, + "loss": 0.9918, + "step": 5592 + }, + { + "epoch": 0.4311594202898551, + "grad_norm": 4.134720325469971, + "learning_rate": 6.3416648570371065e-06, + "loss": 1.0093, + "step": 5593 + }, + { + "epoch": 0.431236509404872, + "grad_norm": 3.704859495162964, + "learning_rate": 6.340462153252195e-06, + "loss": 0.953, + "step": 5594 + }, + { + "epoch": 0.431313598519889, + "grad_norm": 3.9466912746429443, + "learning_rate": 6.339259365896411e-06, + "loss": 1.0878, + "step": 5595 + }, + { + "epoch": 0.43139068763490596, + "grad_norm": 3.7357873916625977, + "learning_rate": 6.33805649504474e-06, + "loss": 0.9632, + "step": 5596 + }, + { + "epoch": 0.4314677767499229, + "grad_norm": 3.6938416957855225, + "learning_rate": 6.336853540772178e-06, + "loss": 0.9337, + "step": 5597 + }, + { + "epoch": 0.4315448658649399, + "grad_norm": 3.942068099975586, + "learning_rate": 6.335650503153721e-06, + "loss": 0.974, + "step": 5598 + }, + { + "epoch": 0.4316219549799568, + "grad_norm": 4.033871650695801, + "learning_rate": 6.334447382264372e-06, + "loss": 0.9456, + "step": 5599 + }, + { + "epoch": 0.4316990440949738, + "grad_norm": 3.7592177391052246, + "learning_rate": 6.333244178179141e-06, + "loss": 1.0018, + "step": 5600 + }, + { + "epoch": 0.43177613320999075, + "grad_norm": 3.3277747631073, + "learning_rate": 6.3320408909730405e-06, + "loss": 0.9049, + "step": 5601 + }, + { + "epoch": 0.4318532223250077, + "grad_norm": 3.5727672576904297, + "learning_rate": 6.330837520721088e-06, + "loss": 0.9017, + "step": 5602 + }, + { + "epoch": 0.4319303114400247, + "grad_norm": 3.610518455505371, + "learning_rate": 6.329634067498311e-06, + "loss": 1.0287, + "step": 5603 + }, + { + "epoch": 0.4320074005550416, + "grad_norm": 3.8098137378692627, + "learning_rate": 6.3284305313797335e-06, + "loss": 0.9755, + "step": 5604 + }, + { + "epoch": 0.4320844896700586, + "grad_norm": 3.6883254051208496, + "learning_rate": 6.327226912440394e-06, + "loss": 1.021, + "step": 5605 + }, + { + "epoch": 0.43216157878507555, + "grad_norm": 3.6646480560302734, + "learning_rate": 6.32602321075533e-06, + "loss": 0.9877, + "step": 5606 + }, + { + "epoch": 0.4322386679000925, + "grad_norm": 3.433976173400879, + "learning_rate": 6.324819426399587e-06, + "loss": 0.9482, + "step": 5607 + }, + { + "epoch": 0.4323157570151095, + "grad_norm": 3.4387431144714355, + "learning_rate": 6.323615559448213e-06, + "loss": 0.8817, + "step": 5608 + }, + { + "epoch": 0.4323928461301264, + "grad_norm": 3.4525887966156006, + "learning_rate": 6.322411609976265e-06, + "loss": 0.8509, + "step": 5609 + }, + { + "epoch": 0.4324699352451434, + "grad_norm": 3.7124297618865967, + "learning_rate": 6.321207578058803e-06, + "loss": 0.962, + "step": 5610 + }, + { + "epoch": 0.43254702436016035, + "grad_norm": 3.817814350128174, + "learning_rate": 6.32000346377089e-06, + "loss": 1.051, + "step": 5611 + }, + { + "epoch": 0.4326241134751773, + "grad_norm": 3.616466760635376, + "learning_rate": 6.318799267187596e-06, + "loss": 0.9733, + "step": 5612 + }, + { + "epoch": 0.4327012025901943, + "grad_norm": 3.7230231761932373, + "learning_rate": 6.317594988384e-06, + "loss": 1.0263, + "step": 5613 + }, + { + "epoch": 0.4327782917052112, + "grad_norm": 3.632129430770874, + "learning_rate": 6.31639062743518e-06, + "loss": 0.9545, + "step": 5614 + }, + { + "epoch": 0.4328553808202282, + "grad_norm": 4.578291416168213, + "learning_rate": 6.315186184416222e-06, + "loss": 1.0831, + "step": 5615 + }, + { + "epoch": 0.43293246993524515, + "grad_norm": 3.434826374053955, + "learning_rate": 6.313981659402218e-06, + "loss": 0.9454, + "step": 5616 + }, + { + "epoch": 0.4330095590502621, + "grad_norm": 3.2984015941619873, + "learning_rate": 6.312777052468262e-06, + "loss": 0.9104, + "step": 5617 + }, + { + "epoch": 0.4330866481652791, + "grad_norm": 3.349336862564087, + "learning_rate": 6.3115723636894565e-06, + "loss": 1.0299, + "step": 5618 + }, + { + "epoch": 0.433163737280296, + "grad_norm": 3.6265594959259033, + "learning_rate": 6.310367593140906e-06, + "loss": 0.9828, + "step": 5619 + }, + { + "epoch": 0.433240826395313, + "grad_norm": 3.9649040699005127, + "learning_rate": 6.3091627408977215e-06, + "loss": 0.9659, + "step": 5620 + }, + { + "epoch": 0.43331791551032994, + "grad_norm": 3.6127185821533203, + "learning_rate": 6.3079578070350235e-06, + "loss": 0.8698, + "step": 5621 + }, + { + "epoch": 0.4333950046253469, + "grad_norm": 3.442434310913086, + "learning_rate": 6.306752791627928e-06, + "loss": 0.8926, + "step": 5622 + }, + { + "epoch": 0.4334720937403639, + "grad_norm": 3.6911134719848633, + "learning_rate": 6.3055476947515635e-06, + "loss": 1.0027, + "step": 5623 + }, + { + "epoch": 0.4335491828553808, + "grad_norm": 3.4410293102264404, + "learning_rate": 6.3043425164810635e-06, + "loss": 0.8793, + "step": 5624 + }, + { + "epoch": 0.43362627197039777, + "grad_norm": 3.6440846920013428, + "learning_rate": 6.303137256891563e-06, + "loss": 1.027, + "step": 5625 + }, + { + "epoch": 0.43370336108541474, + "grad_norm": 3.805568218231201, + "learning_rate": 6.301931916058201e-06, + "loss": 0.9033, + "step": 5626 + }, + { + "epoch": 0.4337804502004317, + "grad_norm": 3.4418349266052246, + "learning_rate": 6.300726494056131e-06, + "loss": 0.8662, + "step": 5627 + }, + { + "epoch": 0.4338575393154487, + "grad_norm": 3.30560302734375, + "learning_rate": 6.299520990960497e-06, + "loss": 0.959, + "step": 5628 + }, + { + "epoch": 0.4339346284304656, + "grad_norm": 3.5205271244049072, + "learning_rate": 6.298315406846462e-06, + "loss": 0.9701, + "step": 5629 + }, + { + "epoch": 0.43401171754548257, + "grad_norm": 3.5560543537139893, + "learning_rate": 6.297109741789184e-06, + "loss": 0.8881, + "step": 5630 + }, + { + "epoch": 0.43408880666049954, + "grad_norm": 3.4408164024353027, + "learning_rate": 6.2959039958638325e-06, + "loss": 0.9109, + "step": 5631 + }, + { + "epoch": 0.4341658957755165, + "grad_norm": 3.3632290363311768, + "learning_rate": 6.2946981691455775e-06, + "loss": 0.8839, + "step": 5632 + }, + { + "epoch": 0.4342429848905335, + "grad_norm": 3.5588736534118652, + "learning_rate": 6.293492261709597e-06, + "loss": 0.9453, + "step": 5633 + }, + { + "epoch": 0.4343200740055504, + "grad_norm": 3.7234859466552734, + "learning_rate": 6.292286273631074e-06, + "loss": 1.0011, + "step": 5634 + }, + { + "epoch": 0.43439716312056736, + "grad_norm": 3.504631280899048, + "learning_rate": 6.291080204985195e-06, + "loss": 0.8339, + "step": 5635 + }, + { + "epoch": 0.43447425223558434, + "grad_norm": 3.596510171890259, + "learning_rate": 6.289874055847152e-06, + "loss": 0.8935, + "step": 5636 + }, + { + "epoch": 0.4345513413506013, + "grad_norm": 3.4602015018463135, + "learning_rate": 6.288667826292142e-06, + "loss": 0.9409, + "step": 5637 + }, + { + "epoch": 0.4346284304656183, + "grad_norm": 3.5766189098358154, + "learning_rate": 6.2874615163953654e-06, + "loss": 0.9501, + "step": 5638 + }, + { + "epoch": 0.4347055195806352, + "grad_norm": 3.890477180480957, + "learning_rate": 6.286255126232032e-06, + "loss": 0.8271, + "step": 5639 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 3.6183977127075195, + "learning_rate": 6.285048655877355e-06, + "loss": 0.9268, + "step": 5640 + }, + { + "epoch": 0.43485969781066913, + "grad_norm": 3.9595730304718018, + "learning_rate": 6.283842105406548e-06, + "loss": 1.0355, + "step": 5641 + }, + { + "epoch": 0.4349367869256861, + "grad_norm": 3.801222801208496, + "learning_rate": 6.282635474894836e-06, + "loss": 0.9763, + "step": 5642 + }, + { + "epoch": 0.4350138760407031, + "grad_norm": 3.6348776817321777, + "learning_rate": 6.281428764417444e-06, + "loss": 1.0238, + "step": 5643 + }, + { + "epoch": 0.43509096515572, + "grad_norm": 3.195066452026367, + "learning_rate": 6.280221974049606e-06, + "loss": 0.8491, + "step": 5644 + }, + { + "epoch": 0.43516805427073696, + "grad_norm": 3.7148594856262207, + "learning_rate": 6.279015103866559e-06, + "loss": 0.871, + "step": 5645 + }, + { + "epoch": 0.43524514338575393, + "grad_norm": 3.536971092224121, + "learning_rate": 6.2778081539435436e-06, + "loss": 0.9518, + "step": 5646 + }, + { + "epoch": 0.4353222325007709, + "grad_norm": 3.973155975341797, + "learning_rate": 6.276601124355807e-06, + "loss": 1.012, + "step": 5647 + }, + { + "epoch": 0.43539932161578787, + "grad_norm": 3.6045761108398438, + "learning_rate": 6.275394015178603e-06, + "loss": 0.983, + "step": 5648 + }, + { + "epoch": 0.4354764107308048, + "grad_norm": 3.582974433898926, + "learning_rate": 6.274186826487187e-06, + "loss": 0.9876, + "step": 5649 + }, + { + "epoch": 0.43555349984582176, + "grad_norm": 3.7312257289886475, + "learning_rate": 6.272979558356821e-06, + "loss": 1.029, + "step": 5650 + }, + { + "epoch": 0.4356305889608387, + "grad_norm": 3.7958641052246094, + "learning_rate": 6.271772210862773e-06, + "loss": 0.9989, + "step": 5651 + }, + { + "epoch": 0.4357076780758557, + "grad_norm": 3.4761128425598145, + "learning_rate": 6.270564784080314e-06, + "loss": 1.0541, + "step": 5652 + }, + { + "epoch": 0.43578476719087267, + "grad_norm": 3.6425018310546875, + "learning_rate": 6.269357278084723e-06, + "loss": 1.0119, + "step": 5653 + }, + { + "epoch": 0.4358618563058896, + "grad_norm": 3.6321396827697754, + "learning_rate": 6.268149692951278e-06, + "loss": 1.0429, + "step": 5654 + }, + { + "epoch": 0.43593894542090655, + "grad_norm": 3.9710259437561035, + "learning_rate": 6.266942028755269e-06, + "loss": 0.9756, + "step": 5655 + }, + { + "epoch": 0.4360160345359235, + "grad_norm": 3.927583694458008, + "learning_rate": 6.265734285571985e-06, + "loss": 1.0705, + "step": 5656 + }, + { + "epoch": 0.4360931236509405, + "grad_norm": 3.5504817962646484, + "learning_rate": 6.2645264634767236e-06, + "loss": 1.0056, + "step": 5657 + }, + { + "epoch": 0.43617021276595747, + "grad_norm": 3.7636258602142334, + "learning_rate": 6.263318562544787e-06, + "loss": 1.011, + "step": 5658 + }, + { + "epoch": 0.4362473018809744, + "grad_norm": 4.358571529388428, + "learning_rate": 6.262110582851481e-06, + "loss": 0.9815, + "step": 5659 + }, + { + "epoch": 0.43632439099599135, + "grad_norm": 3.5723724365234375, + "learning_rate": 6.260902524472116e-06, + "loss": 0.8575, + "step": 5660 + }, + { + "epoch": 0.4364014801110083, + "grad_norm": 3.437037229537964, + "learning_rate": 6.2596943874820105e-06, + "loss": 0.9603, + "step": 5661 + }, + { + "epoch": 0.4364785692260253, + "grad_norm": 3.732524871826172, + "learning_rate": 6.2584861719564825e-06, + "loss": 0.8858, + "step": 5662 + }, + { + "epoch": 0.43655565834104226, + "grad_norm": 3.429790496826172, + "learning_rate": 6.25727787797086e-06, + "loss": 1.0058, + "step": 5663 + }, + { + "epoch": 0.4366327474560592, + "grad_norm": 3.6642956733703613, + "learning_rate": 6.256069505600474e-06, + "loss": 0.9124, + "step": 5664 + }, + { + "epoch": 0.43670983657107615, + "grad_norm": 3.6058828830718994, + "learning_rate": 6.254861054920659e-06, + "loss": 0.9339, + "step": 5665 + }, + { + "epoch": 0.4367869256860931, + "grad_norm": 3.4865617752075195, + "learning_rate": 6.2536525260067575e-06, + "loss": 0.9652, + "step": 5666 + }, + { + "epoch": 0.4368640148011101, + "grad_norm": 3.6662580966949463, + "learning_rate": 6.252443918934112e-06, + "loss": 0.9143, + "step": 5667 + }, + { + "epoch": 0.43694110391612706, + "grad_norm": 4.207685470581055, + "learning_rate": 6.251235233778075e-06, + "loss": 0.9565, + "step": 5668 + }, + { + "epoch": 0.437018193031144, + "grad_norm": 3.748297929763794, + "learning_rate": 6.250026470614003e-06, + "loss": 1.1053, + "step": 5669 + }, + { + "epoch": 0.43709528214616095, + "grad_norm": 3.7725093364715576, + "learning_rate": 6.248817629517253e-06, + "loss": 0.9306, + "step": 5670 + }, + { + "epoch": 0.4371723712611779, + "grad_norm": 3.868713617324829, + "learning_rate": 6.247608710563192e-06, + "loss": 0.9549, + "step": 5671 + }, + { + "epoch": 0.4372494603761949, + "grad_norm": 3.3361618518829346, + "learning_rate": 6.2463997138271905e-06, + "loss": 0.9246, + "step": 5672 + }, + { + "epoch": 0.43732654949121186, + "grad_norm": 3.693483591079712, + "learning_rate": 6.24519063938462e-06, + "loss": 0.9676, + "step": 5673 + }, + { + "epoch": 0.4374036386062288, + "grad_norm": 3.5362865924835205, + "learning_rate": 6.243981487310864e-06, + "loss": 1.0187, + "step": 5674 + }, + { + "epoch": 0.43748072772124574, + "grad_norm": 3.2017054557800293, + "learning_rate": 6.242772257681304e-06, + "loss": 0.9676, + "step": 5675 + }, + { + "epoch": 0.4375578168362627, + "grad_norm": 3.7719571590423584, + "learning_rate": 6.241562950571331e-06, + "loss": 0.9562, + "step": 5676 + }, + { + "epoch": 0.4376349059512797, + "grad_norm": 3.446551561355591, + "learning_rate": 6.240353566056339e-06, + "loss": 0.9793, + "step": 5677 + }, + { + "epoch": 0.43771199506629666, + "grad_norm": 4.14500617980957, + "learning_rate": 6.239144104211724e-06, + "loss": 1.0242, + "step": 5678 + }, + { + "epoch": 0.4377890841813136, + "grad_norm": 4.008180141448975, + "learning_rate": 6.237934565112893e-06, + "loss": 1.0776, + "step": 5679 + }, + { + "epoch": 0.43786617329633054, + "grad_norm": 3.641493082046509, + "learning_rate": 6.2367249488352535e-06, + "loss": 1.0238, + "step": 5680 + }, + { + "epoch": 0.4379432624113475, + "grad_norm": 3.8391215801239014, + "learning_rate": 6.235515255454218e-06, + "loss": 1.0155, + "step": 5681 + }, + { + "epoch": 0.4380203515263645, + "grad_norm": 3.3518407344818115, + "learning_rate": 6.234305485045205e-06, + "loss": 0.8861, + "step": 5682 + }, + { + "epoch": 0.43809744064138145, + "grad_norm": 3.9264771938323975, + "learning_rate": 6.2330956376836384e-06, + "loss": 1.0198, + "step": 5683 + }, + { + "epoch": 0.4381745297563984, + "grad_norm": 3.6416938304901123, + "learning_rate": 6.231885713444944e-06, + "loss": 0.9194, + "step": 5684 + }, + { + "epoch": 0.43825161887141534, + "grad_norm": 3.5663013458251953, + "learning_rate": 6.230675712404557e-06, + "loss": 0.8705, + "step": 5685 + }, + { + "epoch": 0.4383287079864323, + "grad_norm": 3.5762104988098145, + "learning_rate": 6.229465634637912e-06, + "loss": 1.0757, + "step": 5686 + }, + { + "epoch": 0.4384057971014493, + "grad_norm": 4.03668212890625, + "learning_rate": 6.2282554802204535e-06, + "loss": 0.9993, + "step": 5687 + }, + { + "epoch": 0.43848288621646625, + "grad_norm": 3.1501452922821045, + "learning_rate": 6.2270452492276265e-06, + "loss": 0.8529, + "step": 5688 + }, + { + "epoch": 0.4385599753314832, + "grad_norm": 4.269925117492676, + "learning_rate": 6.225834941734883e-06, + "loss": 0.9343, + "step": 5689 + }, + { + "epoch": 0.43863706444650014, + "grad_norm": 3.7110297679901123, + "learning_rate": 6.22462455781768e-06, + "loss": 0.9587, + "step": 5690 + }, + { + "epoch": 0.4387141535615171, + "grad_norm": 3.938750743865967, + "learning_rate": 6.223414097551478e-06, + "loss": 0.979, + "step": 5691 + }, + { + "epoch": 0.4387912426765341, + "grad_norm": 3.5555200576782227, + "learning_rate": 6.2222035610117424e-06, + "loss": 0.9671, + "step": 5692 + }, + { + "epoch": 0.43886833179155105, + "grad_norm": 3.7331223487854004, + "learning_rate": 6.220992948273947e-06, + "loss": 0.8927, + "step": 5693 + }, + { + "epoch": 0.438945420906568, + "grad_norm": 3.361193895339966, + "learning_rate": 6.219782259413562e-06, + "loss": 0.8326, + "step": 5694 + }, + { + "epoch": 0.43902251002158493, + "grad_norm": 3.461210250854492, + "learning_rate": 6.218571494506073e-06, + "loss": 0.8966, + "step": 5695 + }, + { + "epoch": 0.4390995991366019, + "grad_norm": 3.2159628868103027, + "learning_rate": 6.2173606536269605e-06, + "loss": 0.8993, + "step": 5696 + }, + { + "epoch": 0.4391766882516189, + "grad_norm": 3.616848945617676, + "learning_rate": 6.2161497368517175e-06, + "loss": 0.9901, + "step": 5697 + }, + { + "epoch": 0.43925377736663584, + "grad_norm": 3.6139261722564697, + "learning_rate": 6.214938744255837e-06, + "loss": 0.8664, + "step": 5698 + }, + { + "epoch": 0.4393308664816528, + "grad_norm": 3.526050567626953, + "learning_rate": 6.213727675914818e-06, + "loss": 1.0925, + "step": 5699 + }, + { + "epoch": 0.43940795559666973, + "grad_norm": 3.36438250541687, + "learning_rate": 6.212516531904164e-06, + "loss": 0.9525, + "step": 5700 + }, + { + "epoch": 0.4394850447116867, + "grad_norm": 3.640000343322754, + "learning_rate": 6.2113053122993846e-06, + "loss": 1.0309, + "step": 5701 + }, + { + "epoch": 0.43956213382670367, + "grad_norm": 3.7838895320892334, + "learning_rate": 6.210094017175991e-06, + "loss": 0.9115, + "step": 5702 + }, + { + "epoch": 0.43963922294172064, + "grad_norm": 3.5852866172790527, + "learning_rate": 6.208882646609505e-06, + "loss": 0.918, + "step": 5703 + }, + { + "epoch": 0.4397163120567376, + "grad_norm": 3.5891425609588623, + "learning_rate": 6.207671200675446e-06, + "loss": 0.961, + "step": 5704 + }, + { + "epoch": 0.4397934011717545, + "grad_norm": 3.654510974884033, + "learning_rate": 6.206459679449341e-06, + "loss": 0.9256, + "step": 5705 + }, + { + "epoch": 0.4398704902867715, + "grad_norm": 3.4456582069396973, + "learning_rate": 6.205248083006725e-06, + "loss": 0.9671, + "step": 5706 + }, + { + "epoch": 0.43994757940178847, + "grad_norm": 3.725618362426758, + "learning_rate": 6.204036411423133e-06, + "loss": 1.0621, + "step": 5707 + }, + { + "epoch": 0.44002466851680544, + "grad_norm": 3.431455135345459, + "learning_rate": 6.202824664774107e-06, + "loss": 0.9588, + "step": 5708 + }, + { + "epoch": 0.4401017576318224, + "grad_norm": 3.980421543121338, + "learning_rate": 6.201612843135192e-06, + "loss": 1.0372, + "step": 5709 + }, + { + "epoch": 0.4401788467468393, + "grad_norm": 3.611001491546631, + "learning_rate": 6.200400946581939e-06, + "loss": 0.9616, + "step": 5710 + }, + { + "epoch": 0.4402559358618563, + "grad_norm": 3.704310417175293, + "learning_rate": 6.199188975189905e-06, + "loss": 1.0496, + "step": 5711 + }, + { + "epoch": 0.44033302497687327, + "grad_norm": 3.831941843032837, + "learning_rate": 6.197976929034649e-06, + "loss": 0.9209, + "step": 5712 + }, + { + "epoch": 0.44041011409189024, + "grad_norm": 3.790203094482422, + "learning_rate": 6.196764808191735e-06, + "loss": 0.9974, + "step": 5713 + }, + { + "epoch": 0.4404872032069072, + "grad_norm": 3.4942948818206787, + "learning_rate": 6.195552612736735e-06, + "loss": 1.0154, + "step": 5714 + }, + { + "epoch": 0.4405642923219241, + "grad_norm": 3.5759153366088867, + "learning_rate": 6.19434034274522e-06, + "loss": 0.9808, + "step": 5715 + }, + { + "epoch": 0.4406413814369411, + "grad_norm": 3.54484486579895, + "learning_rate": 6.19312799829277e-06, + "loss": 0.8969, + "step": 5716 + }, + { + "epoch": 0.44071847055195806, + "grad_norm": 3.6887714862823486, + "learning_rate": 6.191915579454971e-06, + "loss": 0.9693, + "step": 5717 + }, + { + "epoch": 0.44079555966697503, + "grad_norm": 3.173921585083008, + "learning_rate": 6.1907030863074055e-06, + "loss": 0.9008, + "step": 5718 + }, + { + "epoch": 0.440872648781992, + "grad_norm": 3.73128080368042, + "learning_rate": 6.189490518925673e-06, + "loss": 1.0533, + "step": 5719 + }, + { + "epoch": 0.4409497378970089, + "grad_norm": 3.7085869312286377, + "learning_rate": 6.188277877385365e-06, + "loss": 0.9966, + "step": 5720 + }, + { + "epoch": 0.4410268270120259, + "grad_norm": 3.4095280170440674, + "learning_rate": 6.187065161762085e-06, + "loss": 0.9337, + "step": 5721 + }, + { + "epoch": 0.44110391612704286, + "grad_norm": 3.37434720993042, + "learning_rate": 6.1858523721314425e-06, + "loss": 0.9576, + "step": 5722 + }, + { + "epoch": 0.44118100524205983, + "grad_norm": 3.524092674255371, + "learning_rate": 6.184639508569043e-06, + "loss": 0.943, + "step": 5723 + }, + { + "epoch": 0.4412580943570768, + "grad_norm": 3.7031326293945312, + "learning_rate": 6.183426571150508e-06, + "loss": 0.941, + "step": 5724 + }, + { + "epoch": 0.4413351834720937, + "grad_norm": 3.827249050140381, + "learning_rate": 6.182213559951456e-06, + "loss": 0.9173, + "step": 5725 + }, + { + "epoch": 0.4414122725871107, + "grad_norm": 3.6542346477508545, + "learning_rate": 6.181000475047509e-06, + "loss": 0.9171, + "step": 5726 + }, + { + "epoch": 0.44148936170212766, + "grad_norm": 3.8441221714019775, + "learning_rate": 6.1797873165143005e-06, + "loss": 1.0174, + "step": 5727 + }, + { + "epoch": 0.44156645081714463, + "grad_norm": 3.4966237545013428, + "learning_rate": 6.178574084427464e-06, + "loss": 0.9042, + "step": 5728 + }, + { + "epoch": 0.4416435399321616, + "grad_norm": 3.398782968521118, + "learning_rate": 6.1773607788626355e-06, + "loss": 0.9617, + "step": 5729 + }, + { + "epoch": 0.4417206290471785, + "grad_norm": 3.6098642349243164, + "learning_rate": 6.176147399895461e-06, + "loss": 0.9554, + "step": 5730 + }, + { + "epoch": 0.4417977181621955, + "grad_norm": 4.328033924102783, + "learning_rate": 6.174933947601587e-06, + "loss": 1.0728, + "step": 5731 + }, + { + "epoch": 0.44187480727721246, + "grad_norm": 3.6380655765533447, + "learning_rate": 6.173720422056666e-06, + "loss": 0.9786, + "step": 5732 + }, + { + "epoch": 0.4419518963922294, + "grad_norm": 3.8986289501190186, + "learning_rate": 6.172506823336357e-06, + "loss": 0.9966, + "step": 5733 + }, + { + "epoch": 0.4420289855072464, + "grad_norm": 3.5367014408111572, + "learning_rate": 6.17129315151632e-06, + "loss": 0.9705, + "step": 5734 + }, + { + "epoch": 0.4421060746222633, + "grad_norm": 3.6071369647979736, + "learning_rate": 6.17007940667222e-06, + "loss": 0.8371, + "step": 5735 + }, + { + "epoch": 0.4421831637372803, + "grad_norm": 3.784031867980957, + "learning_rate": 6.16886558887973e-06, + "loss": 0.9845, + "step": 5736 + }, + { + "epoch": 0.44226025285229725, + "grad_norm": 3.7865121364593506, + "learning_rate": 6.167651698214524e-06, + "loss": 0.9009, + "step": 5737 + }, + { + "epoch": 0.4423373419673142, + "grad_norm": 3.29433274269104, + "learning_rate": 6.166437734752282e-06, + "loss": 0.8721, + "step": 5738 + }, + { + "epoch": 0.4424144310823312, + "grad_norm": 3.7611396312713623, + "learning_rate": 6.165223698568689e-06, + "loss": 1.0397, + "step": 5739 + }, + { + "epoch": 0.4424915201973481, + "grad_norm": 3.6707139015197754, + "learning_rate": 6.164009589739431e-06, + "loss": 0.9511, + "step": 5740 + }, + { + "epoch": 0.4425686093123651, + "grad_norm": 3.516786575317383, + "learning_rate": 6.162795408340206e-06, + "loss": 0.9583, + "step": 5741 + }, + { + "epoch": 0.44264569842738205, + "grad_norm": 3.280879259109497, + "learning_rate": 6.161581154446709e-06, + "loss": 0.937, + "step": 5742 + }, + { + "epoch": 0.442722787542399, + "grad_norm": 3.570941209793091, + "learning_rate": 6.1603668281346425e-06, + "loss": 1.048, + "step": 5743 + }, + { + "epoch": 0.442799876657416, + "grad_norm": 3.6662280559539795, + "learning_rate": 6.159152429479714e-06, + "loss": 0.9452, + "step": 5744 + }, + { + "epoch": 0.4428769657724329, + "grad_norm": 3.7819111347198486, + "learning_rate": 6.157937958557635e-06, + "loss": 1.0653, + "step": 5745 + }, + { + "epoch": 0.4429540548874499, + "grad_norm": 3.768911361694336, + "learning_rate": 6.156723415444123e-06, + "loss": 0.8976, + "step": 5746 + }, + { + "epoch": 0.44303114400246685, + "grad_norm": 3.1364481449127197, + "learning_rate": 6.155508800214894e-06, + "loss": 0.9241, + "step": 5747 + }, + { + "epoch": 0.4431082331174838, + "grad_norm": 4.069226264953613, + "learning_rate": 6.154294112945678e-06, + "loss": 1.0962, + "step": 5748 + }, + { + "epoch": 0.4431853222325008, + "grad_norm": 3.394063711166382, + "learning_rate": 6.153079353712201e-06, + "loss": 0.8823, + "step": 5749 + }, + { + "epoch": 0.4432624113475177, + "grad_norm": 3.523238182067871, + "learning_rate": 6.1518645225902e-06, + "loss": 1.007, + "step": 5750 + }, + { + "epoch": 0.4433395004625347, + "grad_norm": 3.942383050918579, + "learning_rate": 6.150649619655411e-06, + "loss": 0.9954, + "step": 5751 + }, + { + "epoch": 0.44341658957755165, + "grad_norm": 3.7958526611328125, + "learning_rate": 6.149434644983576e-06, + "loss": 0.9991, + "step": 5752 + }, + { + "epoch": 0.4434936786925686, + "grad_norm": 3.4549882411956787, + "learning_rate": 6.148219598650444e-06, + "loss": 0.9293, + "step": 5753 + }, + { + "epoch": 0.4435707678075856, + "grad_norm": 4.054780960083008, + "learning_rate": 6.1470044807317695e-06, + "loss": 0.8547, + "step": 5754 + }, + { + "epoch": 0.4436478569226025, + "grad_norm": 3.937290906906128, + "learning_rate": 6.145789291303305e-06, + "loss": 0.9548, + "step": 5755 + }, + { + "epoch": 0.44372494603761947, + "grad_norm": 3.627803325653076, + "learning_rate": 6.144574030440811e-06, + "loss": 0.9164, + "step": 5756 + }, + { + "epoch": 0.44380203515263644, + "grad_norm": 3.1679458618164062, + "learning_rate": 6.143358698220055e-06, + "loss": 0.9428, + "step": 5757 + }, + { + "epoch": 0.4438791242676534, + "grad_norm": 3.7125744819641113, + "learning_rate": 6.142143294716806e-06, + "loss": 0.91, + "step": 5758 + }, + { + "epoch": 0.4439562133826704, + "grad_norm": 4.144897937774658, + "learning_rate": 6.140927820006838e-06, + "loss": 1.0763, + "step": 5759 + }, + { + "epoch": 0.4440333024976873, + "grad_norm": 3.8823111057281494, + "learning_rate": 6.139712274165929e-06, + "loss": 0.9906, + "step": 5760 + }, + { + "epoch": 0.44411039161270427, + "grad_norm": 3.5604453086853027, + "learning_rate": 6.138496657269862e-06, + "loss": 0.8591, + "step": 5761 + }, + { + "epoch": 0.44418748072772124, + "grad_norm": 3.5651559829711914, + "learning_rate": 6.1372809693944255e-06, + "loss": 1.0111, + "step": 5762 + }, + { + "epoch": 0.4442645698427382, + "grad_norm": 3.4907076358795166, + "learning_rate": 6.1360652106154095e-06, + "loss": 0.9794, + "step": 5763 + }, + { + "epoch": 0.4443416589577552, + "grad_norm": 3.5271456241607666, + "learning_rate": 6.134849381008613e-06, + "loss": 1.0717, + "step": 5764 + }, + { + "epoch": 0.44441874807277215, + "grad_norm": 3.589846611022949, + "learning_rate": 6.1336334806498356e-06, + "loss": 1.0364, + "step": 5765 + }, + { + "epoch": 0.44449583718778907, + "grad_norm": 3.321850299835205, + "learning_rate": 6.13241750961488e-06, + "loss": 0.9513, + "step": 5766 + }, + { + "epoch": 0.44457292630280604, + "grad_norm": 3.9627490043640137, + "learning_rate": 6.13120146797956e-06, + "loss": 0.9762, + "step": 5767 + }, + { + "epoch": 0.444650015417823, + "grad_norm": 3.2399096488952637, + "learning_rate": 6.129985355819684e-06, + "loss": 0.9448, + "step": 5768 + }, + { + "epoch": 0.44472710453284, + "grad_norm": 3.653228521347046, + "learning_rate": 6.128769173211075e-06, + "loss": 0.9295, + "step": 5769 + }, + { + "epoch": 0.44480419364785695, + "grad_norm": 3.379409074783325, + "learning_rate": 6.127552920229556e-06, + "loss": 0.8892, + "step": 5770 + }, + { + "epoch": 0.44488128276287386, + "grad_norm": 3.461467981338501, + "learning_rate": 6.126336596950949e-06, + "loss": 0.9007, + "step": 5771 + }, + { + "epoch": 0.44495837187789083, + "grad_norm": 3.425234794616699, + "learning_rate": 6.1251202034510905e-06, + "loss": 0.9136, + "step": 5772 + }, + { + "epoch": 0.4450354609929078, + "grad_norm": 3.859802007675171, + "learning_rate": 6.123903739805815e-06, + "loss": 0.9927, + "step": 5773 + }, + { + "epoch": 0.4451125501079248, + "grad_norm": 3.9901375770568848, + "learning_rate": 6.1226872060909606e-06, + "loss": 1.0381, + "step": 5774 + }, + { + "epoch": 0.44518963922294175, + "grad_norm": 3.5192387104034424, + "learning_rate": 6.121470602382375e-06, + "loss": 0.9129, + "step": 5775 + }, + { + "epoch": 0.44526672833795866, + "grad_norm": 3.422767400741577, + "learning_rate": 6.1202539287559035e-06, + "loss": 0.9601, + "step": 5776 + }, + { + "epoch": 0.44534381745297563, + "grad_norm": 3.6714818477630615, + "learning_rate": 6.119037185287402e-06, + "loss": 0.9204, + "step": 5777 + }, + { + "epoch": 0.4454209065679926, + "grad_norm": 3.7968335151672363, + "learning_rate": 6.1178203720527285e-06, + "loss": 0.9916, + "step": 5778 + }, + { + "epoch": 0.4454979956830096, + "grad_norm": 3.772531747817993, + "learning_rate": 6.116603489127744e-06, + "loss": 1.0515, + "step": 5779 + }, + { + "epoch": 0.44557508479802654, + "grad_norm": 3.4873077869415283, + "learning_rate": 6.1153865365883146e-06, + "loss": 0.9606, + "step": 5780 + }, + { + "epoch": 0.44565217391304346, + "grad_norm": 3.7624447345733643, + "learning_rate": 6.114169514510312e-06, + "loss": 0.9, + "step": 5781 + }, + { + "epoch": 0.44572926302806043, + "grad_norm": 3.8201286792755127, + "learning_rate": 6.112952422969611e-06, + "loss": 0.9601, + "step": 5782 + }, + { + "epoch": 0.4458063521430774, + "grad_norm": 4.063814640045166, + "learning_rate": 6.111735262042088e-06, + "loss": 0.9778, + "step": 5783 + }, + { + "epoch": 0.44588344125809437, + "grad_norm": 3.7615840435028076, + "learning_rate": 6.11051803180363e-06, + "loss": 0.9212, + "step": 5784 + }, + { + "epoch": 0.44596053037311134, + "grad_norm": 3.5406413078308105, + "learning_rate": 6.109300732330126e-06, + "loss": 0.9414, + "step": 5785 + }, + { + "epoch": 0.44603761948812826, + "grad_norm": 3.9752652645111084, + "learning_rate": 6.1080833636974655e-06, + "loss": 1.0694, + "step": 5786 + }, + { + "epoch": 0.4461147086031452, + "grad_norm": 3.561919689178467, + "learning_rate": 6.106865925981547e-06, + "loss": 0.9985, + "step": 5787 + }, + { + "epoch": 0.4461917977181622, + "grad_norm": 3.6946768760681152, + "learning_rate": 6.105648419258271e-06, + "loss": 1.0049, + "step": 5788 + }, + { + "epoch": 0.44626888683317917, + "grad_norm": 3.748198986053467, + "learning_rate": 6.10443084360354e-06, + "loss": 0.9642, + "step": 5789 + }, + { + "epoch": 0.44634597594819614, + "grad_norm": 3.938978672027588, + "learning_rate": 6.103213199093267e-06, + "loss": 1.0989, + "step": 5790 + }, + { + "epoch": 0.44642306506321305, + "grad_norm": 3.319303035736084, + "learning_rate": 6.101995485803367e-06, + "loss": 1.0071, + "step": 5791 + }, + { + "epoch": 0.44650015417823, + "grad_norm": 3.9416778087615967, + "learning_rate": 6.100777703809753e-06, + "loss": 1.038, + "step": 5792 + }, + { + "epoch": 0.446577243293247, + "grad_norm": 3.5364232063293457, + "learning_rate": 6.0995598531883504e-06, + "loss": 0.862, + "step": 5793 + }, + { + "epoch": 0.44665433240826397, + "grad_norm": 3.440767526626587, + "learning_rate": 6.098341934015088e-06, + "loss": 0.9708, + "step": 5794 + }, + { + "epoch": 0.44673142152328094, + "grad_norm": 3.8628475666046143, + "learning_rate": 6.097123946365893e-06, + "loss": 0.936, + "step": 5795 + }, + { + "epoch": 0.44680851063829785, + "grad_norm": 3.6398117542266846, + "learning_rate": 6.095905890316701e-06, + "loss": 1.0004, + "step": 5796 + }, + { + "epoch": 0.4468855997533148, + "grad_norm": 4.144468307495117, + "learning_rate": 6.094687765943455e-06, + "loss": 1.0691, + "step": 5797 + }, + { + "epoch": 0.4469626888683318, + "grad_norm": 3.321753740310669, + "learning_rate": 6.093469573322094e-06, + "loss": 1.0334, + "step": 5798 + }, + { + "epoch": 0.44703977798334876, + "grad_norm": 3.512993574142456, + "learning_rate": 6.09225131252857e-06, + "loss": 1.0446, + "step": 5799 + }, + { + "epoch": 0.44711686709836573, + "grad_norm": 3.447844982147217, + "learning_rate": 6.091032983638833e-06, + "loss": 0.9838, + "step": 5800 + }, + { + "epoch": 0.44719395621338265, + "grad_norm": 3.6095683574676514, + "learning_rate": 6.0898145867288395e-06, + "loss": 0.9409, + "step": 5801 + }, + { + "epoch": 0.4472710453283996, + "grad_norm": 3.7775659561157227, + "learning_rate": 6.088596121874552e-06, + "loss": 1.0464, + "step": 5802 + }, + { + "epoch": 0.4473481344434166, + "grad_norm": 3.305006980895996, + "learning_rate": 6.087377589151933e-06, + "loss": 0.9671, + "step": 5803 + }, + { + "epoch": 0.44742522355843356, + "grad_norm": 3.450387477874756, + "learning_rate": 6.086158988636953e-06, + "loss": 0.9715, + "step": 5804 + }, + { + "epoch": 0.44750231267345053, + "grad_norm": 3.848440647125244, + "learning_rate": 6.084940320405587e-06, + "loss": 1.0587, + "step": 5805 + }, + { + "epoch": 0.44757940178846745, + "grad_norm": 3.4928512573242188, + "learning_rate": 6.08372158453381e-06, + "loss": 0.922, + "step": 5806 + }, + { + "epoch": 0.4476564909034844, + "grad_norm": 3.3881824016571045, + "learning_rate": 6.082502781097603e-06, + "loss": 0.9111, + "step": 5807 + }, + { + "epoch": 0.4477335800185014, + "grad_norm": 3.2955267429351807, + "learning_rate": 6.081283910172956e-06, + "loss": 0.9145, + "step": 5808 + }, + { + "epoch": 0.44781066913351836, + "grad_norm": 3.5794951915740967, + "learning_rate": 6.080064971835857e-06, + "loss": 0.9883, + "step": 5809 + }, + { + "epoch": 0.44788775824853533, + "grad_norm": 3.3705379962921143, + "learning_rate": 6.078845966162302e-06, + "loss": 0.9652, + "step": 5810 + }, + { + "epoch": 0.44796484736355224, + "grad_norm": 3.782891273498535, + "learning_rate": 6.077626893228288e-06, + "loss": 1.0398, + "step": 5811 + }, + { + "epoch": 0.4480419364785692, + "grad_norm": 3.3794403076171875, + "learning_rate": 6.076407753109818e-06, + "loss": 0.9309, + "step": 5812 + }, + { + "epoch": 0.4481190255935862, + "grad_norm": 3.325587272644043, + "learning_rate": 6.0751885458829e-06, + "loss": 0.9469, + "step": 5813 + }, + { + "epoch": 0.44819611470860315, + "grad_norm": 3.5415618419647217, + "learning_rate": 6.073969271623543e-06, + "loss": 0.9648, + "step": 5814 + }, + { + "epoch": 0.4482732038236201, + "grad_norm": 3.56663179397583, + "learning_rate": 6.072749930407767e-06, + "loss": 1.0515, + "step": 5815 + }, + { + "epoch": 0.44835029293863704, + "grad_norm": 4.031259536743164, + "learning_rate": 6.071530522311586e-06, + "loss": 1.0355, + "step": 5816 + }, + { + "epoch": 0.448427382053654, + "grad_norm": 3.370077610015869, + "learning_rate": 6.070311047411027e-06, + "loss": 0.8723, + "step": 5817 + }, + { + "epoch": 0.448504471168671, + "grad_norm": 3.9571077823638916, + "learning_rate": 6.069091505782119e-06, + "loss": 0.9908, + "step": 5818 + }, + { + "epoch": 0.44858156028368795, + "grad_norm": 3.552165985107422, + "learning_rate": 6.067871897500891e-06, + "loss": 0.9056, + "step": 5819 + }, + { + "epoch": 0.4486586493987049, + "grad_norm": 3.8415639400482178, + "learning_rate": 6.066652222643381e-06, + "loss": 1.0353, + "step": 5820 + }, + { + "epoch": 0.44873573851372184, + "grad_norm": 3.4832634925842285, + "learning_rate": 6.0654324812856305e-06, + "loss": 0.9881, + "step": 5821 + }, + { + "epoch": 0.4488128276287388, + "grad_norm": 3.5420501232147217, + "learning_rate": 6.064212673503682e-06, + "loss": 0.9398, + "step": 5822 + }, + { + "epoch": 0.4488899167437558, + "grad_norm": 3.6258022785186768, + "learning_rate": 6.062992799373585e-06, + "loss": 1.0994, + "step": 5823 + }, + { + "epoch": 0.44896700585877275, + "grad_norm": 3.758962631225586, + "learning_rate": 6.061772858971392e-06, + "loss": 0.9196, + "step": 5824 + }, + { + "epoch": 0.4490440949737897, + "grad_norm": 3.7305667400360107, + "learning_rate": 6.060552852373161e-06, + "loss": 0.9636, + "step": 5825 + }, + { + "epoch": 0.44912118408880664, + "grad_norm": 3.6690361499786377, + "learning_rate": 6.059332779654953e-06, + "loss": 1.0034, + "step": 5826 + }, + { + "epoch": 0.4491982732038236, + "grad_norm": 3.7183585166931152, + "learning_rate": 6.05811264089283e-06, + "loss": 1.0183, + "step": 5827 + }, + { + "epoch": 0.4492753623188406, + "grad_norm": 3.5112946033477783, + "learning_rate": 6.056892436162866e-06, + "loss": 0.9799, + "step": 5828 + }, + { + "epoch": 0.44935245143385755, + "grad_norm": 3.783031940460205, + "learning_rate": 6.055672165541132e-06, + "loss": 0.9735, + "step": 5829 + }, + { + "epoch": 0.4494295405488745, + "grad_norm": 4.117906093597412, + "learning_rate": 6.0544518291037055e-06, + "loss": 1.0281, + "step": 5830 + }, + { + "epoch": 0.44950662966389143, + "grad_norm": 3.5739166736602783, + "learning_rate": 6.05323142692667e-06, + "loss": 1.0257, + "step": 5831 + }, + { + "epoch": 0.4495837187789084, + "grad_norm": 3.51526141166687, + "learning_rate": 6.052010959086109e-06, + "loss": 0.9378, + "step": 5832 + }, + { + "epoch": 0.4496608078939254, + "grad_norm": 4.066465377807617, + "learning_rate": 6.050790425658112e-06, + "loss": 1.0126, + "step": 5833 + }, + { + "epoch": 0.44973789700894234, + "grad_norm": 3.460562229156494, + "learning_rate": 6.049569826718776e-06, + "loss": 0.9873, + "step": 5834 + }, + { + "epoch": 0.4498149861239593, + "grad_norm": 3.6307222843170166, + "learning_rate": 6.048349162344196e-06, + "loss": 0.9094, + "step": 5835 + }, + { + "epoch": 0.44989207523897623, + "grad_norm": 4.065826892852783, + "learning_rate": 6.047128432610476e-06, + "loss": 0.8306, + "step": 5836 + }, + { + "epoch": 0.4499691643539932, + "grad_norm": 3.523407459259033, + "learning_rate": 6.045907637593722e-06, + "loss": 0.9603, + "step": 5837 + }, + { + "epoch": 0.45004625346901017, + "grad_norm": 3.768115997314453, + "learning_rate": 6.044686777370042e-06, + "loss": 1.0456, + "step": 5838 + }, + { + "epoch": 0.45012334258402714, + "grad_norm": 3.527888059616089, + "learning_rate": 6.043465852015553e-06, + "loss": 1.0582, + "step": 5839 + }, + { + "epoch": 0.4502004316990441, + "grad_norm": 3.7846176624298096, + "learning_rate": 6.042244861606373e-06, + "loss": 1.0712, + "step": 5840 + }, + { + "epoch": 0.450277520814061, + "grad_norm": 3.638317584991455, + "learning_rate": 6.041023806218622e-06, + "loss": 0.9448, + "step": 5841 + }, + { + "epoch": 0.450354609929078, + "grad_norm": 3.303644895553589, + "learning_rate": 6.03980268592843e-06, + "loss": 0.914, + "step": 5842 + }, + { + "epoch": 0.45043169904409497, + "grad_norm": 3.3192503452301025, + "learning_rate": 6.0385815008119254e-06, + "loss": 0.9118, + "step": 5843 + }, + { + "epoch": 0.45050878815911194, + "grad_norm": 3.531160831451416, + "learning_rate": 6.037360250945243e-06, + "loss": 0.966, + "step": 5844 + }, + { + "epoch": 0.4505858772741289, + "grad_norm": 3.8585782051086426, + "learning_rate": 6.036138936404521e-06, + "loss": 0.9231, + "step": 5845 + }, + { + "epoch": 0.4506629663891459, + "grad_norm": 3.3526854515075684, + "learning_rate": 6.034917557265903e-06, + "loss": 0.9663, + "step": 5846 + }, + { + "epoch": 0.4507400555041628, + "grad_norm": 3.64371919631958, + "learning_rate": 6.033696113605536e-06, + "loss": 0.9702, + "step": 5847 + }, + { + "epoch": 0.45081714461917977, + "grad_norm": 3.5181994438171387, + "learning_rate": 6.0324746054995685e-06, + "loss": 0.9026, + "step": 5848 + }, + { + "epoch": 0.45089423373419674, + "grad_norm": 3.487680196762085, + "learning_rate": 6.031253033024158e-06, + "loss": 0.9488, + "step": 5849 + }, + { + "epoch": 0.4509713228492137, + "grad_norm": 3.600533962249756, + "learning_rate": 6.030031396255462e-06, + "loss": 1.0593, + "step": 5850 + }, + { + "epoch": 0.4510484119642307, + "grad_norm": 3.4745328426361084, + "learning_rate": 6.028809695269641e-06, + "loss": 0.937, + "step": 5851 + }, + { + "epoch": 0.4511255010792476, + "grad_norm": 3.617902994155884, + "learning_rate": 6.027587930142866e-06, + "loss": 0.9552, + "step": 5852 + }, + { + "epoch": 0.45120259019426456, + "grad_norm": 3.4752037525177, + "learning_rate": 6.026366100951304e-06, + "loss": 0.9546, + "step": 5853 + }, + { + "epoch": 0.45127967930928153, + "grad_norm": 3.3312973976135254, + "learning_rate": 6.025144207771132e-06, + "loss": 0.9142, + "step": 5854 + }, + { + "epoch": 0.4513567684242985, + "grad_norm": 4.001164436340332, + "learning_rate": 6.0239222506785285e-06, + "loss": 1.0792, + "step": 5855 + }, + { + "epoch": 0.4514338575393155, + "grad_norm": 3.3809666633605957, + "learning_rate": 6.0227002297496765e-06, + "loss": 0.9077, + "step": 5856 + }, + { + "epoch": 0.4515109466543324, + "grad_norm": 3.638946771621704, + "learning_rate": 6.02147814506076e-06, + "loss": 0.9814, + "step": 5857 + }, + { + "epoch": 0.45158803576934936, + "grad_norm": 3.711111545562744, + "learning_rate": 6.020255996687973e-06, + "loss": 0.9074, + "step": 5858 + }, + { + "epoch": 0.45166512488436633, + "grad_norm": 3.797400712966919, + "learning_rate": 6.019033784707507e-06, + "loss": 1.0543, + "step": 5859 + }, + { + "epoch": 0.4517422139993833, + "grad_norm": 3.4948596954345703, + "learning_rate": 6.017811509195565e-06, + "loss": 0.9662, + "step": 5860 + }, + { + "epoch": 0.4518193031144003, + "grad_norm": 3.3635761737823486, + "learning_rate": 6.0165891702283444e-06, + "loss": 0.8524, + "step": 5861 + }, + { + "epoch": 0.4518963922294172, + "grad_norm": 3.4985485076904297, + "learning_rate": 6.015366767882054e-06, + "loss": 0.9585, + "step": 5862 + }, + { + "epoch": 0.45197348134443416, + "grad_norm": 3.706798791885376, + "learning_rate": 6.014144302232906e-06, + "loss": 1.0279, + "step": 5863 + }, + { + "epoch": 0.45205057045945113, + "grad_norm": 3.334749460220337, + "learning_rate": 6.012921773357112e-06, + "loss": 1.0438, + "step": 5864 + }, + { + "epoch": 0.4521276595744681, + "grad_norm": 4.0636091232299805, + "learning_rate": 6.011699181330891e-06, + "loss": 1.0819, + "step": 5865 + }, + { + "epoch": 0.45220474868948507, + "grad_norm": 3.427274703979492, + "learning_rate": 6.0104765262304676e-06, + "loss": 0.9706, + "step": 5866 + }, + { + "epoch": 0.452281837804502, + "grad_norm": 3.34578800201416, + "learning_rate": 6.009253808132064e-06, + "loss": 0.8081, + "step": 5867 + }, + { + "epoch": 0.45235892691951896, + "grad_norm": 4.062625408172607, + "learning_rate": 6.008031027111913e-06, + "loss": 1.0353, + "step": 5868 + }, + { + "epoch": 0.4524360160345359, + "grad_norm": 3.454693078994751, + "learning_rate": 6.00680818324625e-06, + "loss": 0.9642, + "step": 5869 + }, + { + "epoch": 0.4525131051495529, + "grad_norm": 3.7603514194488525, + "learning_rate": 6.00558527661131e-06, + "loss": 0.9058, + "step": 5870 + }, + { + "epoch": 0.45259019426456987, + "grad_norm": 3.822753667831421, + "learning_rate": 6.004362307283335e-06, + "loss": 1.0457, + "step": 5871 + }, + { + "epoch": 0.4526672833795868, + "grad_norm": 3.4553167819976807, + "learning_rate": 6.003139275338573e-06, + "loss": 0.9248, + "step": 5872 + }, + { + "epoch": 0.45274437249460375, + "grad_norm": 3.557408332824707, + "learning_rate": 6.001916180853271e-06, + "loss": 1.0115, + "step": 5873 + }, + { + "epoch": 0.4528214616096207, + "grad_norm": 3.668583393096924, + "learning_rate": 6.0006930239036865e-06, + "loss": 1.0528, + "step": 5874 + }, + { + "epoch": 0.4528985507246377, + "grad_norm": 3.564915418624878, + "learning_rate": 5.999469804566074e-06, + "loss": 0.986, + "step": 5875 + }, + { + "epoch": 0.45297563983965466, + "grad_norm": 3.27848744392395, + "learning_rate": 5.998246522916695e-06, + "loss": 0.8666, + "step": 5876 + }, + { + "epoch": 0.4530527289546716, + "grad_norm": 3.95578932762146, + "learning_rate": 5.997023179031815e-06, + "loss": 1.0293, + "step": 5877 + }, + { + "epoch": 0.45312981806968855, + "grad_norm": 3.2330212593078613, + "learning_rate": 5.995799772987705e-06, + "loss": 0.8926, + "step": 5878 + }, + { + "epoch": 0.4532069071847055, + "grad_norm": 3.7614970207214355, + "learning_rate": 5.994576304860636e-06, + "loss": 0.9709, + "step": 5879 + }, + { + "epoch": 0.4532839962997225, + "grad_norm": 3.9075229167938232, + "learning_rate": 5.993352774726885e-06, + "loss": 0.9729, + "step": 5880 + }, + { + "epoch": 0.45336108541473946, + "grad_norm": 3.5691709518432617, + "learning_rate": 5.992129182662733e-06, + "loss": 0.9145, + "step": 5881 + }, + { + "epoch": 0.4534381745297564, + "grad_norm": 3.983722686767578, + "learning_rate": 5.990905528744466e-06, + "loss": 0.9745, + "step": 5882 + }, + { + "epoch": 0.45351526364477335, + "grad_norm": 3.6609230041503906, + "learning_rate": 5.98968181304837e-06, + "loss": 0.9585, + "step": 5883 + }, + { + "epoch": 0.4535923527597903, + "grad_norm": 3.4482264518737793, + "learning_rate": 5.98845803565074e-06, + "loss": 1.0082, + "step": 5884 + }, + { + "epoch": 0.4536694418748073, + "grad_norm": 3.4838855266571045, + "learning_rate": 5.987234196627869e-06, + "loss": 0.8997, + "step": 5885 + }, + { + "epoch": 0.45374653098982426, + "grad_norm": 3.4517669677734375, + "learning_rate": 5.986010296056059e-06, + "loss": 1.0314, + "step": 5886 + }, + { + "epoch": 0.4538236201048412, + "grad_norm": 3.517615556716919, + "learning_rate": 5.984786334011617e-06, + "loss": 0.9729, + "step": 5887 + }, + { + "epoch": 0.45390070921985815, + "grad_norm": 3.7334585189819336, + "learning_rate": 5.983562310570844e-06, + "loss": 0.9424, + "step": 5888 + }, + { + "epoch": 0.4539777983348751, + "grad_norm": 3.1004371643066406, + "learning_rate": 5.982338225810056e-06, + "loss": 0.8848, + "step": 5889 + }, + { + "epoch": 0.4540548874498921, + "grad_norm": 3.9406261444091797, + "learning_rate": 5.9811140798055674e-06, + "loss": 1.0372, + "step": 5890 + }, + { + "epoch": 0.45413197656490906, + "grad_norm": 3.5485575199127197, + "learning_rate": 5.9798898726336965e-06, + "loss": 0.8899, + "step": 5891 + }, + { + "epoch": 0.45420906567992597, + "grad_norm": 3.7856481075286865, + "learning_rate": 5.978665604370767e-06, + "loss": 0.86, + "step": 5892 + }, + { + "epoch": 0.45428615479494294, + "grad_norm": 3.8746819496154785, + "learning_rate": 5.977441275093108e-06, + "loss": 0.8669, + "step": 5893 + }, + { + "epoch": 0.4543632439099599, + "grad_norm": 3.443347215652466, + "learning_rate": 5.976216884877045e-06, + "loss": 0.8895, + "step": 5894 + }, + { + "epoch": 0.4544403330249769, + "grad_norm": 3.6835126876831055, + "learning_rate": 5.974992433798916e-06, + "loss": 0.9634, + "step": 5895 + }, + { + "epoch": 0.45451742213999385, + "grad_norm": 4.053088665008545, + "learning_rate": 5.973767921935059e-06, + "loss": 0.933, + "step": 5896 + }, + { + "epoch": 0.45459451125501077, + "grad_norm": 3.9018402099609375, + "learning_rate": 5.972543349361813e-06, + "loss": 0.8993, + "step": 5897 + }, + { + "epoch": 0.45467160037002774, + "grad_norm": 3.5930070877075195, + "learning_rate": 5.97131871615553e-06, + "loss": 1.0334, + "step": 5898 + }, + { + "epoch": 0.4547486894850447, + "grad_norm": 3.705622434616089, + "learning_rate": 5.970094022392553e-06, + "loss": 0.8647, + "step": 5899 + }, + { + "epoch": 0.4548257786000617, + "grad_norm": 3.460371971130371, + "learning_rate": 5.968869268149239e-06, + "loss": 0.8912, + "step": 5900 + }, + { + "epoch": 0.45490286771507865, + "grad_norm": 3.655982255935669, + "learning_rate": 5.967644453501944e-06, + "loss": 1.004, + "step": 5901 + }, + { + "epoch": 0.45497995683009557, + "grad_norm": 3.730700969696045, + "learning_rate": 5.966419578527027e-06, + "loss": 0.9948, + "step": 5902 + }, + { + "epoch": 0.45505704594511254, + "grad_norm": 3.825910806655884, + "learning_rate": 5.965194643300858e-06, + "loss": 1.0192, + "step": 5903 + }, + { + "epoch": 0.4551341350601295, + "grad_norm": 3.2197327613830566, + "learning_rate": 5.9639696478997985e-06, + "loss": 0.8807, + "step": 5904 + }, + { + "epoch": 0.4552112241751465, + "grad_norm": 3.4035542011260986, + "learning_rate": 5.962744592400226e-06, + "loss": 0.8079, + "step": 5905 + }, + { + "epoch": 0.45528831329016345, + "grad_norm": 3.4863269329071045, + "learning_rate": 5.961519476878513e-06, + "loss": 0.8259, + "step": 5906 + }, + { + "epoch": 0.45536540240518036, + "grad_norm": 3.9149959087371826, + "learning_rate": 5.960294301411041e-06, + "loss": 0.9398, + "step": 5907 + }, + { + "epoch": 0.45544249152019733, + "grad_norm": 3.782867670059204, + "learning_rate": 5.959069066074195e-06, + "loss": 1.1127, + "step": 5908 + }, + { + "epoch": 0.4555195806352143, + "grad_norm": 3.4562859535217285, + "learning_rate": 5.957843770944357e-06, + "loss": 0.9791, + "step": 5909 + }, + { + "epoch": 0.4555966697502313, + "grad_norm": 3.538780689239502, + "learning_rate": 5.956618416097921e-06, + "loss": 0.965, + "step": 5910 + }, + { + "epoch": 0.45567375886524825, + "grad_norm": 3.6963050365448, + "learning_rate": 5.955393001611283e-06, + "loss": 1.0603, + "step": 5911 + }, + { + "epoch": 0.45575084798026516, + "grad_norm": 3.4785306453704834, + "learning_rate": 5.954167527560837e-06, + "loss": 0.9417, + "step": 5912 + }, + { + "epoch": 0.45582793709528213, + "grad_norm": 3.757713556289673, + "learning_rate": 5.952941994022988e-06, + "loss": 1.0384, + "step": 5913 + }, + { + "epoch": 0.4559050262102991, + "grad_norm": 3.715573310852051, + "learning_rate": 5.951716401074143e-06, + "loss": 0.9363, + "step": 5914 + }, + { + "epoch": 0.4559821153253161, + "grad_norm": 3.4896347522735596, + "learning_rate": 5.9504907487907086e-06, + "loss": 1.0017, + "step": 5915 + }, + { + "epoch": 0.45605920444033304, + "grad_norm": 3.7236409187316895, + "learning_rate": 5.949265037249096e-06, + "loss": 1.0504, + "step": 5916 + }, + { + "epoch": 0.45613629355534996, + "grad_norm": 3.842233657836914, + "learning_rate": 5.948039266525728e-06, + "loss": 1.0006, + "step": 5917 + }, + { + "epoch": 0.45621338267036693, + "grad_norm": 3.868191719055176, + "learning_rate": 5.946813436697021e-06, + "loss": 1.0097, + "step": 5918 + }, + { + "epoch": 0.4562904717853839, + "grad_norm": 3.6733784675598145, + "learning_rate": 5.9455875478394e-06, + "loss": 1.0119, + "step": 5919 + }, + { + "epoch": 0.45636756090040087, + "grad_norm": 3.975710153579712, + "learning_rate": 5.944361600029291e-06, + "loss": 1.0206, + "step": 5920 + }, + { + "epoch": 0.45644465001541784, + "grad_norm": 3.2998998165130615, + "learning_rate": 5.9431355933431285e-06, + "loss": 0.8406, + "step": 5921 + }, + { + "epoch": 0.45652173913043476, + "grad_norm": 3.8829758167266846, + "learning_rate": 5.941909527857348e-06, + "loss": 0.9318, + "step": 5922 + }, + { + "epoch": 0.4565988282454517, + "grad_norm": 3.46357798576355, + "learning_rate": 5.940683403648384e-06, + "loss": 0.927, + "step": 5923 + }, + { + "epoch": 0.4566759173604687, + "grad_norm": 3.7031266689300537, + "learning_rate": 5.939457220792684e-06, + "loss": 0.9815, + "step": 5924 + }, + { + "epoch": 0.45675300647548567, + "grad_norm": 3.6570425033569336, + "learning_rate": 5.938230979366691e-06, + "loss": 1.0229, + "step": 5925 + }, + { + "epoch": 0.45683009559050264, + "grad_norm": 3.5698962211608887, + "learning_rate": 5.937004679446854e-06, + "loss": 0.9961, + "step": 5926 + }, + { + "epoch": 0.45690718470551955, + "grad_norm": 3.748589515686035, + "learning_rate": 5.935778321109631e-06, + "loss": 0.9987, + "step": 5927 + }, + { + "epoch": 0.4569842738205365, + "grad_norm": 3.5768892765045166, + "learning_rate": 5.934551904431473e-06, + "loss": 0.9889, + "step": 5928 + }, + { + "epoch": 0.4570613629355535, + "grad_norm": 3.2622320652008057, + "learning_rate": 5.933325429488847e-06, + "loss": 0.9082, + "step": 5929 + }, + { + "epoch": 0.45713845205057047, + "grad_norm": 3.7464730739593506, + "learning_rate": 5.9320988963582125e-06, + "loss": 1.0409, + "step": 5930 + }, + { + "epoch": 0.45721554116558744, + "grad_norm": 3.6139042377471924, + "learning_rate": 5.93087230511604e-06, + "loss": 1.0702, + "step": 5931 + }, + { + "epoch": 0.4572926302806044, + "grad_norm": 3.8561818599700928, + "learning_rate": 5.929645655838801e-06, + "loss": 1.1142, + "step": 5932 + }, + { + "epoch": 0.4573697193956213, + "grad_norm": 3.627636194229126, + "learning_rate": 5.9284189486029684e-06, + "loss": 0.9967, + "step": 5933 + }, + { + "epoch": 0.4574468085106383, + "grad_norm": 4.026650428771973, + "learning_rate": 5.927192183485023e-06, + "loss": 1.0336, + "step": 5934 + }, + { + "epoch": 0.45752389762565526, + "grad_norm": 3.624974012374878, + "learning_rate": 5.925965360561448e-06, + "loss": 0.9791, + "step": 5935 + }, + { + "epoch": 0.45760098674067223, + "grad_norm": 3.952138662338257, + "learning_rate": 5.924738479908728e-06, + "loss": 0.9589, + "step": 5936 + }, + { + "epoch": 0.4576780758556892, + "grad_norm": 4.19709587097168, + "learning_rate": 5.923511541603353e-06, + "loss": 1.0059, + "step": 5937 + }, + { + "epoch": 0.4577551649707061, + "grad_norm": 4.1676530838012695, + "learning_rate": 5.922284545721817e-06, + "loss": 1.0851, + "step": 5938 + }, + { + "epoch": 0.4578322540857231, + "grad_norm": 3.9117021560668945, + "learning_rate": 5.921057492340614e-06, + "loss": 1.0179, + "step": 5939 + }, + { + "epoch": 0.45790934320074006, + "grad_norm": 3.553438186645508, + "learning_rate": 5.919830381536249e-06, + "loss": 0.9542, + "step": 5940 + }, + { + "epoch": 0.45798643231575703, + "grad_norm": 3.3051836490631104, + "learning_rate": 5.918603213385223e-06, + "loss": 0.9783, + "step": 5941 + }, + { + "epoch": 0.458063521430774, + "grad_norm": 4.240174770355225, + "learning_rate": 5.917375987964044e-06, + "loss": 1.0063, + "step": 5942 + }, + { + "epoch": 0.4581406105457909, + "grad_norm": 3.5065135955810547, + "learning_rate": 5.916148705349224e-06, + "loss": 0.8109, + "step": 5943 + }, + { + "epoch": 0.4582176996608079, + "grad_norm": 3.668158531188965, + "learning_rate": 5.914921365617276e-06, + "loss": 0.879, + "step": 5944 + }, + { + "epoch": 0.45829478877582486, + "grad_norm": 3.529534339904785, + "learning_rate": 5.9136939688447205e-06, + "loss": 0.9102, + "step": 5945 + }, + { + "epoch": 0.45837187789084183, + "grad_norm": 3.5600032806396484, + "learning_rate": 5.912466515108078e-06, + "loss": 0.9238, + "step": 5946 + }, + { + "epoch": 0.4584489670058588, + "grad_norm": 3.8036692142486572, + "learning_rate": 5.911239004483874e-06, + "loss": 1.1042, + "step": 5947 + }, + { + "epoch": 0.4585260561208757, + "grad_norm": 3.5569405555725098, + "learning_rate": 5.9100114370486375e-06, + "loss": 0.9662, + "step": 5948 + }, + { + "epoch": 0.4586031452358927, + "grad_norm": 3.4714467525482178, + "learning_rate": 5.9087838128789e-06, + "loss": 0.9444, + "step": 5949 + }, + { + "epoch": 0.45868023435090965, + "grad_norm": 3.8040552139282227, + "learning_rate": 5.9075561320511994e-06, + "loss": 0.9892, + "step": 5950 + }, + { + "epoch": 0.4587573234659266, + "grad_norm": 3.3058183193206787, + "learning_rate": 5.906328394642075e-06, + "loss": 1.0018, + "step": 5951 + }, + { + "epoch": 0.4588344125809436, + "grad_norm": 3.314758539199829, + "learning_rate": 5.905100600728067e-06, + "loss": 0.831, + "step": 5952 + }, + { + "epoch": 0.4589115016959605, + "grad_norm": 3.5863702297210693, + "learning_rate": 5.903872750385726e-06, + "loss": 0.9611, + "step": 5953 + }, + { + "epoch": 0.4589885908109775, + "grad_norm": 3.328200101852417, + "learning_rate": 5.902644843691601e-06, + "loss": 0.9919, + "step": 5954 + }, + { + "epoch": 0.45906567992599445, + "grad_norm": 3.5870213508605957, + "learning_rate": 5.901416880722242e-06, + "loss": 0.9036, + "step": 5955 + }, + { + "epoch": 0.4591427690410114, + "grad_norm": 3.978027582168579, + "learning_rate": 5.900188861554213e-06, + "loss": 0.9305, + "step": 5956 + }, + { + "epoch": 0.4592198581560284, + "grad_norm": 3.5891008377075195, + "learning_rate": 5.898960786264067e-06, + "loss": 1.0179, + "step": 5957 + }, + { + "epoch": 0.4592969472710453, + "grad_norm": 3.536594867706299, + "learning_rate": 5.897732654928373e-06, + "loss": 0.9776, + "step": 5958 + }, + { + "epoch": 0.4593740363860623, + "grad_norm": 3.7737505435943604, + "learning_rate": 5.896504467623698e-06, + "loss": 1.121, + "step": 5959 + }, + { + "epoch": 0.45945112550107925, + "grad_norm": 3.9404971599578857, + "learning_rate": 5.895276224426613e-06, + "loss": 1.0788, + "step": 5960 + }, + { + "epoch": 0.4595282146160962, + "grad_norm": 3.931784152984619, + "learning_rate": 5.894047925413691e-06, + "loss": 0.988, + "step": 5961 + }, + { + "epoch": 0.4596053037311132, + "grad_norm": 3.751889228820801, + "learning_rate": 5.892819570661511e-06, + "loss": 0.9438, + "step": 5962 + }, + { + "epoch": 0.4596823928461301, + "grad_norm": 4.397334098815918, + "learning_rate": 5.891591160246655e-06, + "loss": 0.9381, + "step": 5963 + }, + { + "epoch": 0.4597594819611471, + "grad_norm": 3.8391776084899902, + "learning_rate": 5.890362694245709e-06, + "loss": 0.8604, + "step": 5964 + }, + { + "epoch": 0.45983657107616405, + "grad_norm": 3.5146422386169434, + "learning_rate": 5.889134172735259e-06, + "loss": 1.0192, + "step": 5965 + }, + { + "epoch": 0.459913660191181, + "grad_norm": 3.9917097091674805, + "learning_rate": 5.887905595791899e-06, + "loss": 1.0062, + "step": 5966 + }, + { + "epoch": 0.459990749306198, + "grad_norm": 3.921919345855713, + "learning_rate": 5.886676963492224e-06, + "loss": 0.9134, + "step": 5967 + }, + { + "epoch": 0.4600678384212149, + "grad_norm": 3.3720543384552, + "learning_rate": 5.885448275912832e-06, + "loss": 0.8687, + "step": 5968 + }, + { + "epoch": 0.4601449275362319, + "grad_norm": 3.6408445835113525, + "learning_rate": 5.884219533130325e-06, + "loss": 1.0733, + "step": 5969 + }, + { + "epoch": 0.46022201665124884, + "grad_norm": 3.81526255607605, + "learning_rate": 5.882990735221312e-06, + "loss": 0.9489, + "step": 5970 + }, + { + "epoch": 0.4602991057662658, + "grad_norm": 4.006250858306885, + "learning_rate": 5.881761882262398e-06, + "loss": 0.9353, + "step": 5971 + }, + { + "epoch": 0.4603761948812828, + "grad_norm": 3.7868504524230957, + "learning_rate": 5.880532974330197e-06, + "loss": 1.0833, + "step": 5972 + }, + { + "epoch": 0.4604532839962997, + "grad_norm": 3.490978956222534, + "learning_rate": 5.879304011501327e-06, + "loss": 1.0075, + "step": 5973 + }, + { + "epoch": 0.46053037311131667, + "grad_norm": 3.4989683628082275, + "learning_rate": 5.878074993852405e-06, + "loss": 1.0013, + "step": 5974 + }, + { + "epoch": 0.46060746222633364, + "grad_norm": 3.448817729949951, + "learning_rate": 5.876845921460055e-06, + "loss": 0.8911, + "step": 5975 + }, + { + "epoch": 0.4606845513413506, + "grad_norm": 3.613193988800049, + "learning_rate": 5.875616794400902e-06, + "loss": 0.9418, + "step": 5976 + }, + { + "epoch": 0.4607616404563676, + "grad_norm": 3.5848329067230225, + "learning_rate": 5.874387612751579e-06, + "loss": 1.0202, + "step": 5977 + }, + { + "epoch": 0.4608387295713845, + "grad_norm": 3.828852415084839, + "learning_rate": 5.8731583765887156e-06, + "loss": 1.0841, + "step": 5978 + }, + { + "epoch": 0.46091581868640147, + "grad_norm": 3.3265674114227295, + "learning_rate": 5.87192908598895e-06, + "loss": 0.8589, + "step": 5979 + }, + { + "epoch": 0.46099290780141844, + "grad_norm": 3.5973331928253174, + "learning_rate": 5.870699741028922e-06, + "loss": 0.9637, + "step": 5980 + }, + { + "epoch": 0.4610699969164354, + "grad_norm": 3.2978997230529785, + "learning_rate": 5.869470341785274e-06, + "loss": 0.9569, + "step": 5981 + }, + { + "epoch": 0.4611470860314524, + "grad_norm": 3.7744216918945312, + "learning_rate": 5.8682408883346535e-06, + "loss": 0.9753, + "step": 5982 + }, + { + "epoch": 0.4612241751464693, + "grad_norm": 3.4844672679901123, + "learning_rate": 5.8670113807537095e-06, + "loss": 0.9662, + "step": 5983 + }, + { + "epoch": 0.46130126426148627, + "grad_norm": 3.5159308910369873, + "learning_rate": 5.865781819119096e-06, + "loss": 0.9549, + "step": 5984 + }, + { + "epoch": 0.46137835337650324, + "grad_norm": 3.471386194229126, + "learning_rate": 5.864552203507472e-06, + "loss": 0.9481, + "step": 5985 + }, + { + "epoch": 0.4614554424915202, + "grad_norm": 3.8364601135253906, + "learning_rate": 5.863322533995495e-06, + "loss": 0.9589, + "step": 5986 + }, + { + "epoch": 0.4615325316065372, + "grad_norm": 3.745004415512085, + "learning_rate": 5.862092810659829e-06, + "loss": 0.9863, + "step": 5987 + }, + { + "epoch": 0.4616096207215541, + "grad_norm": 3.7336912155151367, + "learning_rate": 5.860863033577141e-06, + "loss": 1.0487, + "step": 5988 + }, + { + "epoch": 0.46168670983657106, + "grad_norm": 3.603816509246826, + "learning_rate": 5.859633202824101e-06, + "loss": 1.0228, + "step": 5989 + }, + { + "epoch": 0.46176379895158803, + "grad_norm": 3.479917049407959, + "learning_rate": 5.858403318477384e-06, + "loss": 0.9293, + "step": 5990 + }, + { + "epoch": 0.461840888066605, + "grad_norm": 3.8286755084991455, + "learning_rate": 5.857173380613665e-06, + "loss": 1.0632, + "step": 5991 + }, + { + "epoch": 0.461917977181622, + "grad_norm": 3.5856165885925293, + "learning_rate": 5.855943389309626e-06, + "loss": 1.0216, + "step": 5992 + }, + { + "epoch": 0.4619950662966389, + "grad_norm": 4.016223907470703, + "learning_rate": 5.8547133446419495e-06, + "loss": 0.9528, + "step": 5993 + }, + { + "epoch": 0.46207215541165586, + "grad_norm": 3.4276955127716064, + "learning_rate": 5.853483246687323e-06, + "loss": 1.0603, + "step": 5994 + }, + { + "epoch": 0.46214924452667283, + "grad_norm": 3.522810459136963, + "learning_rate": 5.852253095522435e-06, + "loss": 0.9513, + "step": 5995 + }, + { + "epoch": 0.4622263336416898, + "grad_norm": 3.607531785964966, + "learning_rate": 5.851022891223982e-06, + "loss": 0.9798, + "step": 5996 + }, + { + "epoch": 0.46230342275670677, + "grad_norm": 3.3895387649536133, + "learning_rate": 5.849792633868659e-06, + "loss": 0.9795, + "step": 5997 + }, + { + "epoch": 0.4623805118717237, + "grad_norm": 4.266413688659668, + "learning_rate": 5.848562323533165e-06, + "loss": 0.9955, + "step": 5998 + }, + { + "epoch": 0.46245760098674066, + "grad_norm": 3.460974931716919, + "learning_rate": 5.8473319602942065e-06, + "loss": 0.9737, + "step": 5999 + }, + { + "epoch": 0.46253469010175763, + "grad_norm": 3.6543521881103516, + "learning_rate": 5.846101544228488e-06, + "loss": 0.898, + "step": 6000 + }, + { + "epoch": 0.4626117792167746, + "grad_norm": 3.48816180229187, + "learning_rate": 5.84487107541272e-06, + "loss": 0.9795, + "step": 6001 + }, + { + "epoch": 0.46268886833179157, + "grad_norm": 3.694929838180542, + "learning_rate": 5.843640553923618e-06, + "loss": 0.9415, + "step": 6002 + }, + { + "epoch": 0.4627659574468085, + "grad_norm": 3.641181230545044, + "learning_rate": 5.842409979837894e-06, + "loss": 0.9422, + "step": 6003 + }, + { + "epoch": 0.46284304656182546, + "grad_norm": 5.558597087860107, + "learning_rate": 5.841179353232273e-06, + "loss": 0.9911, + "step": 6004 + }, + { + "epoch": 0.4629201356768424, + "grad_norm": 3.562645196914673, + "learning_rate": 5.839948674183476e-06, + "loss": 1.0034, + "step": 6005 + }, + { + "epoch": 0.4629972247918594, + "grad_norm": 3.8084611892700195, + "learning_rate": 5.8387179427682265e-06, + "loss": 1.0519, + "step": 6006 + }, + { + "epoch": 0.46307431390687637, + "grad_norm": 3.4862160682678223, + "learning_rate": 5.837487159063259e-06, + "loss": 0.9177, + "step": 6007 + }, + { + "epoch": 0.4631514030218933, + "grad_norm": 3.7302255630493164, + "learning_rate": 5.836256323145304e-06, + "loss": 0.9929, + "step": 6008 + }, + { + "epoch": 0.46322849213691025, + "grad_norm": 4.211978912353516, + "learning_rate": 5.835025435091099e-06, + "loss": 1.0944, + "step": 6009 + }, + { + "epoch": 0.4633055812519272, + "grad_norm": 3.287256956100464, + "learning_rate": 5.8337944949773825e-06, + "loss": 0.9778, + "step": 6010 + }, + { + "epoch": 0.4633826703669442, + "grad_norm": 3.5103518962860107, + "learning_rate": 5.832563502880898e-06, + "loss": 0.9329, + "step": 6011 + }, + { + "epoch": 0.46345975948196116, + "grad_norm": 3.8197262287139893, + "learning_rate": 5.831332458878391e-06, + "loss": 1.0127, + "step": 6012 + }, + { + "epoch": 0.4635368485969781, + "grad_norm": 3.5835108757019043, + "learning_rate": 5.830101363046611e-06, + "loss": 1.0235, + "step": 6013 + }, + { + "epoch": 0.46361393771199505, + "grad_norm": 3.5647315979003906, + "learning_rate": 5.82887021546231e-06, + "loss": 0.9782, + "step": 6014 + }, + { + "epoch": 0.463691026827012, + "grad_norm": 3.430677652359009, + "learning_rate": 5.827639016202244e-06, + "loss": 1.0517, + "step": 6015 + }, + { + "epoch": 0.463768115942029, + "grad_norm": 4.109676837921143, + "learning_rate": 5.826407765343172e-06, + "loss": 0.8909, + "step": 6016 + }, + { + "epoch": 0.46384520505704596, + "grad_norm": 5.432858943939209, + "learning_rate": 5.825176462961854e-06, + "loss": 0.9727, + "step": 6017 + }, + { + "epoch": 0.46392229417206293, + "grad_norm": 3.428158760070801, + "learning_rate": 5.82394510913506e-06, + "loss": 0.9286, + "step": 6018 + }, + { + "epoch": 0.46399938328707985, + "grad_norm": 3.5705456733703613, + "learning_rate": 5.822713703939554e-06, + "loss": 0.9397, + "step": 6019 + }, + { + "epoch": 0.4640764724020968, + "grad_norm": 3.492794990539551, + "learning_rate": 5.82148224745211e-06, + "loss": 0.943, + "step": 6020 + }, + { + "epoch": 0.4641535615171138, + "grad_norm": 3.4019174575805664, + "learning_rate": 5.820250739749502e-06, + "loss": 0.851, + "step": 6021 + }, + { + "epoch": 0.46423065063213076, + "grad_norm": 3.375807046890259, + "learning_rate": 5.819019180908509e-06, + "loss": 0.9877, + "step": 6022 + }, + { + "epoch": 0.46430773974714773, + "grad_norm": 3.3332841396331787, + "learning_rate": 5.817787571005913e-06, + "loss": 0.9421, + "step": 6023 + }, + { + "epoch": 0.46438482886216464, + "grad_norm": 3.4136815071105957, + "learning_rate": 5.8165559101184955e-06, + "loss": 1.0011, + "step": 6024 + }, + { + "epoch": 0.4644619179771816, + "grad_norm": 3.7559056282043457, + "learning_rate": 5.8153241983230464e-06, + "loss": 1.0067, + "step": 6025 + }, + { + "epoch": 0.4645390070921986, + "grad_norm": 3.7920331954956055, + "learning_rate": 5.814092435696358e-06, + "loss": 0.9631, + "step": 6026 + }, + { + "epoch": 0.46461609620721556, + "grad_norm": 3.2758357524871826, + "learning_rate": 5.81286062231522e-06, + "loss": 0.857, + "step": 6027 + }, + { + "epoch": 0.4646931853222325, + "grad_norm": 3.8748836517333984, + "learning_rate": 5.811628758256433e-06, + "loss": 1.0907, + "step": 6028 + }, + { + "epoch": 0.46477027443724944, + "grad_norm": 3.435882568359375, + "learning_rate": 5.8103968435967965e-06, + "loss": 0.8907, + "step": 6029 + }, + { + "epoch": 0.4648473635522664, + "grad_norm": 3.716470956802368, + "learning_rate": 5.809164878413114e-06, + "loss": 0.9661, + "step": 6030 + }, + { + "epoch": 0.4649244526672834, + "grad_norm": 3.935591697692871, + "learning_rate": 5.807932862782193e-06, + "loss": 1.005, + "step": 6031 + }, + { + "epoch": 0.46500154178230035, + "grad_norm": 3.4351768493652344, + "learning_rate": 5.8067007967808405e-06, + "loss": 0.9601, + "step": 6032 + }, + { + "epoch": 0.4650786308973173, + "grad_norm": 3.390435218811035, + "learning_rate": 5.805468680485874e-06, + "loss": 0.9108, + "step": 6033 + }, + { + "epoch": 0.46515572001233424, + "grad_norm": 3.5145466327667236, + "learning_rate": 5.804236513974104e-06, + "loss": 0.9561, + "step": 6034 + }, + { + "epoch": 0.4652328091273512, + "grad_norm": 3.6625545024871826, + "learning_rate": 5.8030042973223545e-06, + "loss": 0.896, + "step": 6035 + }, + { + "epoch": 0.4653098982423682, + "grad_norm": 3.7747929096221924, + "learning_rate": 5.801772030607445e-06, + "loss": 0.9278, + "step": 6036 + }, + { + "epoch": 0.46538698735738515, + "grad_norm": 3.4798998832702637, + "learning_rate": 5.800539713906203e-06, + "loss": 1.0179, + "step": 6037 + }, + { + "epoch": 0.4654640764724021, + "grad_norm": 3.7864067554473877, + "learning_rate": 5.799307347295455e-06, + "loss": 0.9142, + "step": 6038 + }, + { + "epoch": 0.46554116558741904, + "grad_norm": 3.8589086532592773, + "learning_rate": 5.798074930852035e-06, + "loss": 0.8484, + "step": 6039 + }, + { + "epoch": 0.465618254702436, + "grad_norm": 3.9709889888763428, + "learning_rate": 5.796842464652774e-06, + "loss": 1.1772, + "step": 6040 + }, + { + "epoch": 0.465695343817453, + "grad_norm": 3.7802610397338867, + "learning_rate": 5.7956099487745135e-06, + "loss": 1.0716, + "step": 6041 + }, + { + "epoch": 0.46577243293246995, + "grad_norm": 3.4964866638183594, + "learning_rate": 5.794377383294094e-06, + "loss": 0.9444, + "step": 6042 + }, + { + "epoch": 0.4658495220474869, + "grad_norm": 3.519301176071167, + "learning_rate": 5.7931447682883565e-06, + "loss": 1.0331, + "step": 6043 + }, + { + "epoch": 0.46592661116250383, + "grad_norm": 3.5994677543640137, + "learning_rate": 5.791912103834154e-06, + "loss": 0.9118, + "step": 6044 + }, + { + "epoch": 0.4660037002775208, + "grad_norm": 3.669741630554199, + "learning_rate": 5.79067939000833e-06, + "loss": 0.9314, + "step": 6045 + }, + { + "epoch": 0.4660807893925378, + "grad_norm": 3.6019272804260254, + "learning_rate": 5.7894466268877426e-06, + "loss": 0.9013, + "step": 6046 + }, + { + "epoch": 0.46615787850755475, + "grad_norm": 4.028010845184326, + "learning_rate": 5.788213814549247e-06, + "loss": 0.9965, + "step": 6047 + }, + { + "epoch": 0.4662349676225717, + "grad_norm": 3.4716103076934814, + "learning_rate": 5.786980953069702e-06, + "loss": 0.8853, + "step": 6048 + }, + { + "epoch": 0.46631205673758863, + "grad_norm": 3.9504270553588867, + "learning_rate": 5.785748042525969e-06, + "loss": 1.1174, + "step": 6049 + }, + { + "epoch": 0.4663891458526056, + "grad_norm": 3.453526258468628, + "learning_rate": 5.784515082994917e-06, + "loss": 1.031, + "step": 6050 + }, + { + "epoch": 0.4664662349676226, + "grad_norm": 3.568610668182373, + "learning_rate": 5.783282074553412e-06, + "loss": 0.9396, + "step": 6051 + }, + { + "epoch": 0.46654332408263954, + "grad_norm": 3.7350780963897705, + "learning_rate": 5.782049017278326e-06, + "loss": 0.9443, + "step": 6052 + }, + { + "epoch": 0.4666204131976565, + "grad_norm": 4.244739532470703, + "learning_rate": 5.7808159112465344e-06, + "loss": 1.0614, + "step": 6053 + }, + { + "epoch": 0.46669750231267343, + "grad_norm": 3.688356637954712, + "learning_rate": 5.779582756534914e-06, + "loss": 0.8876, + "step": 6054 + }, + { + "epoch": 0.4667745914276904, + "grad_norm": 3.4765431880950928, + "learning_rate": 5.778349553220348e-06, + "loss": 0.9501, + "step": 6055 + }, + { + "epoch": 0.46685168054270737, + "grad_norm": 3.3060991764068604, + "learning_rate": 5.777116301379717e-06, + "loss": 0.9299, + "step": 6056 + }, + { + "epoch": 0.46692876965772434, + "grad_norm": 3.5781679153442383, + "learning_rate": 5.775883001089911e-06, + "loss": 0.9658, + "step": 6057 + }, + { + "epoch": 0.4670058587727413, + "grad_norm": 3.767371892929077, + "learning_rate": 5.7746496524278176e-06, + "loss": 1.0249, + "step": 6058 + }, + { + "epoch": 0.4670829478877582, + "grad_norm": 3.511653184890747, + "learning_rate": 5.7734162554703284e-06, + "loss": 0.8842, + "step": 6059 + }, + { + "epoch": 0.4671600370027752, + "grad_norm": 3.546081781387329, + "learning_rate": 5.7721828102943445e-06, + "loss": 1.0454, + "step": 6060 + }, + { + "epoch": 0.46723712611779217, + "grad_norm": 3.7766621112823486, + "learning_rate": 5.770949316976759e-06, + "loss": 0.9635, + "step": 6061 + }, + { + "epoch": 0.46731421523280914, + "grad_norm": 3.4135143756866455, + "learning_rate": 5.7697157755944775e-06, + "loss": 0.9405, + "step": 6062 + }, + { + "epoch": 0.4673913043478261, + "grad_norm": 3.601234197616577, + "learning_rate": 5.768482186224405e-06, + "loss": 0.9234, + "step": 6063 + }, + { + "epoch": 0.467468393462843, + "grad_norm": 3.9338316917419434, + "learning_rate": 5.7672485489434456e-06, + "loss": 1.032, + "step": 6064 + }, + { + "epoch": 0.46754548257786, + "grad_norm": 4.0997633934021, + "learning_rate": 5.766014863828515e-06, + "loss": 0.956, + "step": 6065 + }, + { + "epoch": 0.46762257169287696, + "grad_norm": 3.778200387954712, + "learning_rate": 5.764781130956525e-06, + "loss": 0.9533, + "step": 6066 + }, + { + "epoch": 0.46769966080789394, + "grad_norm": 3.8161983489990234, + "learning_rate": 5.763547350404391e-06, + "loss": 0.9875, + "step": 6067 + }, + { + "epoch": 0.4677767499229109, + "grad_norm": 3.7838432788848877, + "learning_rate": 5.762313522249036e-06, + "loss": 0.9397, + "step": 6068 + }, + { + "epoch": 0.4678538390379278, + "grad_norm": 3.383547067642212, + "learning_rate": 5.761079646567379e-06, + "loss": 0.95, + "step": 6069 + }, + { + "epoch": 0.4679309281529448, + "grad_norm": 3.7536256313323975, + "learning_rate": 5.7598457234363484e-06, + "loss": 1.0254, + "step": 6070 + }, + { + "epoch": 0.46800801726796176, + "grad_norm": 3.5894887447357178, + "learning_rate": 5.7586117529328735e-06, + "loss": 0.9251, + "step": 6071 + }, + { + "epoch": 0.46808510638297873, + "grad_norm": 3.460188627243042, + "learning_rate": 5.757377735133882e-06, + "loss": 0.9488, + "step": 6072 + }, + { + "epoch": 0.4681621954979957, + "grad_norm": 3.609666109085083, + "learning_rate": 5.756143670116315e-06, + "loss": 0.8807, + "step": 6073 + }, + { + "epoch": 0.4682392846130126, + "grad_norm": 3.767493724822998, + "learning_rate": 5.754909557957104e-06, + "loss": 1.1737, + "step": 6074 + }, + { + "epoch": 0.4683163737280296, + "grad_norm": 3.6794867515563965, + "learning_rate": 5.753675398733192e-06, + "loss": 1.0523, + "step": 6075 + }, + { + "epoch": 0.46839346284304656, + "grad_norm": 3.692859649658203, + "learning_rate": 5.752441192521523e-06, + "loss": 0.9905, + "step": 6076 + }, + { + "epoch": 0.46847055195806353, + "grad_norm": 3.4956281185150146, + "learning_rate": 5.751206939399041e-06, + "loss": 0.8996, + "step": 6077 + }, + { + "epoch": 0.4685476410730805, + "grad_norm": 3.6580235958099365, + "learning_rate": 5.749972639442698e-06, + "loss": 1.0527, + "step": 6078 + }, + { + "epoch": 0.4686247301880974, + "grad_norm": 3.3004517555236816, + "learning_rate": 5.748738292729445e-06, + "loss": 1.0016, + "step": 6079 + }, + { + "epoch": 0.4687018193031144, + "grad_norm": 3.7859244346618652, + "learning_rate": 5.747503899336238e-06, + "loss": 1.0551, + "step": 6080 + }, + { + "epoch": 0.46877890841813136, + "grad_norm": 3.9992268085479736, + "learning_rate": 5.746269459340034e-06, + "loss": 0.9909, + "step": 6081 + }, + { + "epoch": 0.4688559975331483, + "grad_norm": 3.6973602771759033, + "learning_rate": 5.7450349728177945e-06, + "loss": 0.9538, + "step": 6082 + }, + { + "epoch": 0.4689330866481653, + "grad_norm": 3.4329545497894287, + "learning_rate": 5.743800439846482e-06, + "loss": 0.9311, + "step": 6083 + }, + { + "epoch": 0.4690101757631822, + "grad_norm": 3.846623182296753, + "learning_rate": 5.742565860503066e-06, + "loss": 0.9475, + "step": 6084 + }, + { + "epoch": 0.4690872648781992, + "grad_norm": 4.217624664306641, + "learning_rate": 5.741331234864513e-06, + "loss": 1.204, + "step": 6085 + }, + { + "epoch": 0.46916435399321615, + "grad_norm": 4.2322258949279785, + "learning_rate": 5.740096563007797e-06, + "loss": 0.9926, + "step": 6086 + }, + { + "epoch": 0.4692414431082331, + "grad_norm": 4.023532867431641, + "learning_rate": 5.738861845009894e-06, + "loss": 1.0084, + "step": 6087 + }, + { + "epoch": 0.4693185322232501, + "grad_norm": 3.6507327556610107, + "learning_rate": 5.737627080947781e-06, + "loss": 0.9628, + "step": 6088 + }, + { + "epoch": 0.469395621338267, + "grad_norm": 3.5047266483306885, + "learning_rate": 5.73639227089844e-06, + "loss": 0.9771, + "step": 6089 + }, + { + "epoch": 0.469472710453284, + "grad_norm": 3.76613450050354, + "learning_rate": 5.735157414938855e-06, + "loss": 0.981, + "step": 6090 + }, + { + "epoch": 0.46954979956830095, + "grad_norm": 3.5989575386047363, + "learning_rate": 5.733922513146013e-06, + "loss": 0.8601, + "step": 6091 + }, + { + "epoch": 0.4696268886833179, + "grad_norm": 4.787153720855713, + "learning_rate": 5.732687565596904e-06, + "loss": 1.1043, + "step": 6092 + }, + { + "epoch": 0.4697039777983349, + "grad_norm": 3.478583812713623, + "learning_rate": 5.731452572368517e-06, + "loss": 0.9255, + "step": 6093 + }, + { + "epoch": 0.4697810669133518, + "grad_norm": 3.7596375942230225, + "learning_rate": 5.730217533537853e-06, + "loss": 1.1279, + "step": 6094 + }, + { + "epoch": 0.4698581560283688, + "grad_norm": 3.626962661743164, + "learning_rate": 5.728982449181907e-06, + "loss": 1.0483, + "step": 6095 + }, + { + "epoch": 0.46993524514338575, + "grad_norm": 3.686464548110962, + "learning_rate": 5.72774731937768e-06, + "loss": 0.9567, + "step": 6096 + }, + { + "epoch": 0.4700123342584027, + "grad_norm": 4.225037574768066, + "learning_rate": 5.7265121442021784e-06, + "loss": 1.0122, + "step": 6097 + }, + { + "epoch": 0.4700894233734197, + "grad_norm": 3.9033050537109375, + "learning_rate": 5.725276923732406e-06, + "loss": 0.9657, + "step": 6098 + }, + { + "epoch": 0.4701665124884366, + "grad_norm": 3.35575532913208, + "learning_rate": 5.724041658045374e-06, + "loss": 0.9243, + "step": 6099 + }, + { + "epoch": 0.4702436016034536, + "grad_norm": 3.7150776386260986, + "learning_rate": 5.722806347218095e-06, + "loss": 0.98, + "step": 6100 + }, + { + "epoch": 0.47032069071847055, + "grad_norm": 3.7382919788360596, + "learning_rate": 5.721570991327583e-06, + "loss": 1.1083, + "step": 6101 + }, + { + "epoch": 0.4703977798334875, + "grad_norm": 3.8335604667663574, + "learning_rate": 5.720335590450858e-06, + "loss": 0.9726, + "step": 6102 + }, + { + "epoch": 0.4704748689485045, + "grad_norm": 3.5674989223480225, + "learning_rate": 5.719100144664938e-06, + "loss": 1.0163, + "step": 6103 + }, + { + "epoch": 0.47055195806352146, + "grad_norm": 3.2261412143707275, + "learning_rate": 5.71786465404685e-06, + "loss": 0.9454, + "step": 6104 + }, + { + "epoch": 0.4706290471785384, + "grad_norm": 3.453087091445923, + "learning_rate": 5.716629118673618e-06, + "loss": 0.8817, + "step": 6105 + }, + { + "epoch": 0.47070613629355534, + "grad_norm": 3.143789768218994, + "learning_rate": 5.7153935386222715e-06, + "loss": 0.9472, + "step": 6106 + }, + { + "epoch": 0.4707832254085723, + "grad_norm": 3.5191619396209717, + "learning_rate": 5.714157913969843e-06, + "loss": 0.9291, + "step": 6107 + }, + { + "epoch": 0.4708603145235893, + "grad_norm": 3.2179696559906006, + "learning_rate": 5.712922244793369e-06, + "loss": 0.9505, + "step": 6108 + }, + { + "epoch": 0.47093740363860626, + "grad_norm": 3.557729959487915, + "learning_rate": 5.711686531169883e-06, + "loss": 0.999, + "step": 6109 + }, + { + "epoch": 0.47101449275362317, + "grad_norm": 3.841547727584839, + "learning_rate": 5.710450773176428e-06, + "loss": 0.987, + "step": 6110 + }, + { + "epoch": 0.47109158186864014, + "grad_norm": 3.502171039581299, + "learning_rate": 5.709214970890049e-06, + "loss": 0.9596, + "step": 6111 + }, + { + "epoch": 0.4711686709836571, + "grad_norm": 3.86859393119812, + "learning_rate": 5.707979124387788e-06, + "loss": 0.9704, + "step": 6112 + }, + { + "epoch": 0.4712457600986741, + "grad_norm": 3.8467605113983154, + "learning_rate": 5.706743233746695e-06, + "loss": 0.9401, + "step": 6113 + }, + { + "epoch": 0.47132284921369105, + "grad_norm": 3.454573392868042, + "learning_rate": 5.705507299043822e-06, + "loss": 0.8615, + "step": 6114 + }, + { + "epoch": 0.47139993832870797, + "grad_norm": 3.3528759479522705, + "learning_rate": 5.704271320356223e-06, + "loss": 0.8713, + "step": 6115 + }, + { + "epoch": 0.47147702744372494, + "grad_norm": 3.8421576023101807, + "learning_rate": 5.703035297760956e-06, + "loss": 1.066, + "step": 6116 + }, + { + "epoch": 0.4715541165587419, + "grad_norm": 3.7155203819274902, + "learning_rate": 5.7017992313350765e-06, + "loss": 1.0062, + "step": 6117 + }, + { + "epoch": 0.4716312056737589, + "grad_norm": 3.5856966972351074, + "learning_rate": 5.700563121155651e-06, + "loss": 1.0004, + "step": 6118 + }, + { + "epoch": 0.47170829478877585, + "grad_norm": 4.203090190887451, + "learning_rate": 5.699326967299743e-06, + "loss": 1.0252, + "step": 6119 + }, + { + "epoch": 0.47178538390379277, + "grad_norm": 3.741010904312134, + "learning_rate": 5.698090769844421e-06, + "loss": 1.0402, + "step": 6120 + }, + { + "epoch": 0.47186247301880974, + "grad_norm": 3.547619104385376, + "learning_rate": 5.696854528866755e-06, + "loss": 0.8535, + "step": 6121 + }, + { + "epoch": 0.4719395621338267, + "grad_norm": 3.611316442489624, + "learning_rate": 5.695618244443818e-06, + "loss": 1.0149, + "step": 6122 + }, + { + "epoch": 0.4720166512488437, + "grad_norm": 3.562122344970703, + "learning_rate": 5.694381916652686e-06, + "loss": 0.9871, + "step": 6123 + }, + { + "epoch": 0.47209374036386065, + "grad_norm": 3.538245916366577, + "learning_rate": 5.693145545570439e-06, + "loss": 0.8019, + "step": 6124 + }, + { + "epoch": 0.47217082947887756, + "grad_norm": 3.3615951538085938, + "learning_rate": 5.691909131274156e-06, + "loss": 0.9203, + "step": 6125 + }, + { + "epoch": 0.47224791859389453, + "grad_norm": 3.911485433578491, + "learning_rate": 5.6906726738409215e-06, + "loss": 0.9443, + "step": 6126 + }, + { + "epoch": 0.4723250077089115, + "grad_norm": 3.790710926055908, + "learning_rate": 5.689436173347825e-06, + "loss": 0.9956, + "step": 6127 + }, + { + "epoch": 0.4724020968239285, + "grad_norm": 3.447726249694824, + "learning_rate": 5.688199629871952e-06, + "loss": 0.9532, + "step": 6128 + }, + { + "epoch": 0.47247918593894545, + "grad_norm": 3.5070300102233887, + "learning_rate": 5.686963043490398e-06, + "loss": 0.9041, + "step": 6129 + }, + { + "epoch": 0.47255627505396236, + "grad_norm": 3.885089635848999, + "learning_rate": 5.6857264142802535e-06, + "loss": 1.0288, + "step": 6130 + }, + { + "epoch": 0.47263336416897933, + "grad_norm": 3.5010008811950684, + "learning_rate": 5.68448974231862e-06, + "loss": 0.8738, + "step": 6131 + }, + { + "epoch": 0.4727104532839963, + "grad_norm": 3.6111185550689697, + "learning_rate": 5.683253027682597e-06, + "loss": 1.0589, + "step": 6132 + }, + { + "epoch": 0.47278754239901327, + "grad_norm": 3.557774305343628, + "learning_rate": 5.682016270449286e-06, + "loss": 1.004, + "step": 6133 + }, + { + "epoch": 0.47286463151403024, + "grad_norm": 3.7765564918518066, + "learning_rate": 5.680779470695791e-06, + "loss": 0.9748, + "step": 6134 + }, + { + "epoch": 0.47294172062904716, + "grad_norm": 3.426037549972534, + "learning_rate": 5.679542628499224e-06, + "loss": 0.8693, + "step": 6135 + }, + { + "epoch": 0.47301880974406413, + "grad_norm": 3.553133487701416, + "learning_rate": 5.678305743936692e-06, + "loss": 0.9648, + "step": 6136 + }, + { + "epoch": 0.4730958988590811, + "grad_norm": 3.720686197280884, + "learning_rate": 5.677068817085311e-06, + "loss": 1.0429, + "step": 6137 + }, + { + "epoch": 0.47317298797409807, + "grad_norm": 3.6694304943084717, + "learning_rate": 5.675831848022195e-06, + "loss": 0.9811, + "step": 6138 + }, + { + "epoch": 0.47325007708911504, + "grad_norm": 3.528355121612549, + "learning_rate": 5.674594836824463e-06, + "loss": 1.0162, + "step": 6139 + }, + { + "epoch": 0.47332716620413195, + "grad_norm": 3.9201457500457764, + "learning_rate": 5.673357783569238e-06, + "loss": 1.0571, + "step": 6140 + }, + { + "epoch": 0.4734042553191489, + "grad_norm": 3.4825947284698486, + "learning_rate": 5.672120688333642e-06, + "loss": 0.8942, + "step": 6141 + }, + { + "epoch": 0.4734813444341659, + "grad_norm": 3.4814279079437256, + "learning_rate": 5.6708835511948035e-06, + "loss": 0.8944, + "step": 6142 + }, + { + "epoch": 0.47355843354918287, + "grad_norm": 3.8149969577789307, + "learning_rate": 5.669646372229849e-06, + "loss": 0.9892, + "step": 6143 + }, + { + "epoch": 0.47363552266419984, + "grad_norm": 3.37603497505188, + "learning_rate": 5.6684091515159105e-06, + "loss": 0.8202, + "step": 6144 + }, + { + "epoch": 0.47371261177921675, + "grad_norm": 3.226973056793213, + "learning_rate": 5.667171889130125e-06, + "loss": 0.9294, + "step": 6145 + }, + { + "epoch": 0.4737897008942337, + "grad_norm": 3.621938467025757, + "learning_rate": 5.6659345851496265e-06, + "loss": 1.0182, + "step": 6146 + }, + { + "epoch": 0.4738667900092507, + "grad_norm": 3.5621347427368164, + "learning_rate": 5.6646972396515555e-06, + "loss": 0.9215, + "step": 6147 + }, + { + "epoch": 0.47394387912426766, + "grad_norm": 4.244327545166016, + "learning_rate": 5.663459852713055e-06, + "loss": 1.0142, + "step": 6148 + }, + { + "epoch": 0.47402096823928463, + "grad_norm": 3.6657721996307373, + "learning_rate": 5.662222424411268e-06, + "loss": 1.0159, + "step": 6149 + }, + { + "epoch": 0.47409805735430155, + "grad_norm": 3.6783275604248047, + "learning_rate": 5.660984954823342e-06, + "loss": 1.0533, + "step": 6150 + }, + { + "epoch": 0.4741751464693185, + "grad_norm": 3.747204542160034, + "learning_rate": 5.659747444026429e-06, + "loss": 1.04, + "step": 6151 + }, + { + "epoch": 0.4742522355843355, + "grad_norm": 3.452821969985962, + "learning_rate": 5.658509892097679e-06, + "loss": 0.9084, + "step": 6152 + }, + { + "epoch": 0.47432932469935246, + "grad_norm": 3.9301180839538574, + "learning_rate": 5.657272299114248e-06, + "loss": 0.8685, + "step": 6153 + }, + { + "epoch": 0.47440641381436943, + "grad_norm": 3.441697120666504, + "learning_rate": 5.656034665153294e-06, + "loss": 1.0103, + "step": 6154 + }, + { + "epoch": 0.47448350292938635, + "grad_norm": 3.8465073108673096, + "learning_rate": 5.654796990291974e-06, + "loss": 1.0009, + "step": 6155 + }, + { + "epoch": 0.4745605920444033, + "grad_norm": 3.6904850006103516, + "learning_rate": 5.6535592746074554e-06, + "loss": 0.9484, + "step": 6156 + }, + { + "epoch": 0.4746376811594203, + "grad_norm": 3.667325019836426, + "learning_rate": 5.6523215181769e-06, + "loss": 0.9456, + "step": 6157 + }, + { + "epoch": 0.47471477027443726, + "grad_norm": 3.483203172683716, + "learning_rate": 5.651083721077475e-06, + "loss": 1.0455, + "step": 6158 + }, + { + "epoch": 0.47479185938945423, + "grad_norm": 3.435410499572754, + "learning_rate": 5.649845883386355e-06, + "loss": 0.8986, + "step": 6159 + }, + { + "epoch": 0.47486894850447114, + "grad_norm": 3.477846384048462, + "learning_rate": 5.6486080051807066e-06, + "loss": 0.8902, + "step": 6160 + }, + { + "epoch": 0.4749460376194881, + "grad_norm": 3.9039952754974365, + "learning_rate": 5.647370086537709e-06, + "loss": 1.0054, + "step": 6161 + }, + { + "epoch": 0.4750231267345051, + "grad_norm": 3.6027846336364746, + "learning_rate": 5.646132127534541e-06, + "loss": 1.1071, + "step": 6162 + }, + { + "epoch": 0.47510021584952206, + "grad_norm": 4.0150370597839355, + "learning_rate": 5.6448941282483795e-06, + "loss": 0.9272, + "step": 6163 + }, + { + "epoch": 0.475177304964539, + "grad_norm": 3.5615651607513428, + "learning_rate": 5.64365608875641e-06, + "loss": 0.9647, + "step": 6164 + }, + { + "epoch": 0.47525439407955594, + "grad_norm": 3.8740906715393066, + "learning_rate": 5.6424180091358175e-06, + "loss": 1.0794, + "step": 6165 + }, + { + "epoch": 0.4753314831945729, + "grad_norm": 3.5737662315368652, + "learning_rate": 5.641179889463788e-06, + "loss": 0.9521, + "step": 6166 + }, + { + "epoch": 0.4754085723095899, + "grad_norm": 3.58293080329895, + "learning_rate": 5.639941729817514e-06, + "loss": 1.0373, + "step": 6167 + }, + { + "epoch": 0.47548566142460685, + "grad_norm": 3.5029170513153076, + "learning_rate": 5.638703530274187e-06, + "loss": 1.0038, + "step": 6168 + }, + { + "epoch": 0.4755627505396238, + "grad_norm": 3.324521064758301, + "learning_rate": 5.637465290911004e-06, + "loss": 0.9587, + "step": 6169 + }, + { + "epoch": 0.47563983965464074, + "grad_norm": 3.4700326919555664, + "learning_rate": 5.63622701180516e-06, + "loss": 0.9135, + "step": 6170 + }, + { + "epoch": 0.4757169287696577, + "grad_norm": 3.6384148597717285, + "learning_rate": 5.634988693033857e-06, + "loss": 0.868, + "step": 6171 + }, + { + "epoch": 0.4757940178846747, + "grad_norm": 3.50246000289917, + "learning_rate": 5.6337503346743e-06, + "loss": 0.8695, + "step": 6172 + }, + { + "epoch": 0.47587110699969165, + "grad_norm": 3.835340738296509, + "learning_rate": 5.632511936803689e-06, + "loss": 1.0377, + "step": 6173 + }, + { + "epoch": 0.4759481961147086, + "grad_norm": 3.505685567855835, + "learning_rate": 5.631273499499236e-06, + "loss": 0.902, + "step": 6174 + }, + { + "epoch": 0.47602528522972554, + "grad_norm": 3.740123987197876, + "learning_rate": 5.630035022838151e-06, + "loss": 1.0631, + "step": 6175 + }, + { + "epoch": 0.4761023743447425, + "grad_norm": 3.739340305328369, + "learning_rate": 5.628796506897642e-06, + "loss": 1.0683, + "step": 6176 + }, + { + "epoch": 0.4761794634597595, + "grad_norm": 3.675077438354492, + "learning_rate": 5.6275579517549306e-06, + "loss": 1.016, + "step": 6177 + }, + { + "epoch": 0.47625655257477645, + "grad_norm": 3.387910842895508, + "learning_rate": 5.62631935748723e-06, + "loss": 0.8774, + "step": 6178 + }, + { + "epoch": 0.4763336416897934, + "grad_norm": 3.4367949962615967, + "learning_rate": 5.625080724171761e-06, + "loss": 0.9452, + "step": 6179 + }, + { + "epoch": 0.47641073080481033, + "grad_norm": 3.6171581745147705, + "learning_rate": 5.623842051885747e-06, + "loss": 0.9478, + "step": 6180 + }, + { + "epoch": 0.4764878199198273, + "grad_norm": 3.505159854888916, + "learning_rate": 5.622603340706411e-06, + "loss": 0.9781, + "step": 6181 + }, + { + "epoch": 0.4765649090348443, + "grad_norm": 3.581728219985962, + "learning_rate": 5.621364590710981e-06, + "loss": 0.9409, + "step": 6182 + }, + { + "epoch": 0.47664199814986125, + "grad_norm": 3.7894256114959717, + "learning_rate": 5.620125801976687e-06, + "loss": 0.9496, + "step": 6183 + }, + { + "epoch": 0.4767190872648782, + "grad_norm": 3.7810251712799072, + "learning_rate": 5.6188869745807614e-06, + "loss": 1.0094, + "step": 6184 + }, + { + "epoch": 0.47679617637989513, + "grad_norm": 3.6951205730438232, + "learning_rate": 5.6176481086004395e-06, + "loss": 0.8738, + "step": 6185 + }, + { + "epoch": 0.4768732654949121, + "grad_norm": 3.578882932662964, + "learning_rate": 5.6164092041129544e-06, + "loss": 0.9924, + "step": 6186 + }, + { + "epoch": 0.4769503546099291, + "grad_norm": 3.560987949371338, + "learning_rate": 5.615170261195549e-06, + "loss": 0.9863, + "step": 6187 + }, + { + "epoch": 0.47702744372494604, + "grad_norm": 3.433151960372925, + "learning_rate": 5.613931279925465e-06, + "loss": 0.9885, + "step": 6188 + }, + { + "epoch": 0.477104532839963, + "grad_norm": 3.652071237564087, + "learning_rate": 5.612692260379945e-06, + "loss": 0.9823, + "step": 6189 + }, + { + "epoch": 0.47718162195498, + "grad_norm": 3.784919261932373, + "learning_rate": 5.611453202636236e-06, + "loss": 1.0343, + "step": 6190 + }, + { + "epoch": 0.4772587110699969, + "grad_norm": 4.090549945831299, + "learning_rate": 5.610214106771585e-06, + "loss": 1.0221, + "step": 6191 + }, + { + "epoch": 0.47733580018501387, + "grad_norm": 3.437002182006836, + "learning_rate": 5.608974972863245e-06, + "loss": 0.9871, + "step": 6192 + }, + { + "epoch": 0.47741288930003084, + "grad_norm": 3.5076773166656494, + "learning_rate": 5.6077358009884705e-06, + "loss": 0.8728, + "step": 6193 + }, + { + "epoch": 0.4774899784150478, + "grad_norm": 3.560811996459961, + "learning_rate": 5.606496591224516e-06, + "loss": 1.0416, + "step": 6194 + }, + { + "epoch": 0.4775670675300648, + "grad_norm": 3.939362049102783, + "learning_rate": 5.60525734364864e-06, + "loss": 1.0182, + "step": 6195 + }, + { + "epoch": 0.4776441566450817, + "grad_norm": 3.568834066390991, + "learning_rate": 5.604018058338104e-06, + "loss": 0.8795, + "step": 6196 + }, + { + "epoch": 0.47772124576009867, + "grad_norm": 3.514498710632324, + "learning_rate": 5.602778735370169e-06, + "loss": 0.9279, + "step": 6197 + }, + { + "epoch": 0.47779833487511564, + "grad_norm": 3.4076337814331055, + "learning_rate": 5.601539374822103e-06, + "loss": 0.9863, + "step": 6198 + }, + { + "epoch": 0.4778754239901326, + "grad_norm": 3.399474859237671, + "learning_rate": 5.600299976771172e-06, + "loss": 0.9236, + "step": 6199 + }, + { + "epoch": 0.4779525131051496, + "grad_norm": 3.3580496311187744, + "learning_rate": 5.5990605412946466e-06, + "loss": 0.9451, + "step": 6200 + }, + { + "epoch": 0.4780296022201665, + "grad_norm": 3.756740093231201, + "learning_rate": 5.597821068469799e-06, + "loss": 1.0399, + "step": 6201 + }, + { + "epoch": 0.47810669133518346, + "grad_norm": 3.5835258960723877, + "learning_rate": 5.596581558373903e-06, + "loss": 0.9042, + "step": 6202 + }, + { + "epoch": 0.47818378045020044, + "grad_norm": 3.941892385482788, + "learning_rate": 5.595342011084237e-06, + "loss": 1.0083, + "step": 6203 + }, + { + "epoch": 0.4782608695652174, + "grad_norm": 3.4614713191986084, + "learning_rate": 5.594102426678082e-06, + "loss": 0.9398, + "step": 6204 + }, + { + "epoch": 0.4783379586802344, + "grad_norm": 3.545919179916382, + "learning_rate": 5.592862805232714e-06, + "loss": 1.0258, + "step": 6205 + }, + { + "epoch": 0.4784150477952513, + "grad_norm": 3.4241621494293213, + "learning_rate": 5.591623146825423e-06, + "loss": 0.8465, + "step": 6206 + }, + { + "epoch": 0.47849213691026826, + "grad_norm": 3.324366807937622, + "learning_rate": 5.590383451533493e-06, + "loss": 0.9298, + "step": 6207 + }, + { + "epoch": 0.47856922602528523, + "grad_norm": 3.5040102005004883, + "learning_rate": 5.589143719434211e-06, + "loss": 0.9745, + "step": 6208 + }, + { + "epoch": 0.4786463151403022, + "grad_norm": 3.782944440841675, + "learning_rate": 5.587903950604872e-06, + "loss": 1.0106, + "step": 6209 + }, + { + "epoch": 0.4787234042553192, + "grad_norm": 3.437720537185669, + "learning_rate": 5.586664145122764e-06, + "loss": 0.9793, + "step": 6210 + }, + { + "epoch": 0.4788004933703361, + "grad_norm": 4.048053741455078, + "learning_rate": 5.585424303065186e-06, + "loss": 1.0484, + "step": 6211 + }, + { + "epoch": 0.47887758248535306, + "grad_norm": 3.8065176010131836, + "learning_rate": 5.5841844245094345e-06, + "loss": 0.9522, + "step": 6212 + }, + { + "epoch": 0.47895467160037003, + "grad_norm": 3.6226398944854736, + "learning_rate": 5.582944509532809e-06, + "loss": 0.974, + "step": 6213 + }, + { + "epoch": 0.479031760715387, + "grad_norm": 4.275754451751709, + "learning_rate": 5.581704558212615e-06, + "loss": 1.037, + "step": 6214 + }, + { + "epoch": 0.47910884983040397, + "grad_norm": 3.503654718399048, + "learning_rate": 5.5804645706261515e-06, + "loss": 1.0253, + "step": 6215 + }, + { + "epoch": 0.4791859389454209, + "grad_norm": 3.7088232040405273, + "learning_rate": 5.57922454685073e-06, + "loss": 1.0347, + "step": 6216 + }, + { + "epoch": 0.47926302806043786, + "grad_norm": 3.6020233631134033, + "learning_rate": 5.577984486963658e-06, + "loss": 0.9362, + "step": 6217 + }, + { + "epoch": 0.4793401171754548, + "grad_norm": 3.534729242324829, + "learning_rate": 5.576744391042246e-06, + "loss": 1.0147, + "step": 6218 + }, + { + "epoch": 0.4794172062904718, + "grad_norm": 3.644331693649292, + "learning_rate": 5.575504259163807e-06, + "loss": 0.9651, + "step": 6219 + }, + { + "epoch": 0.47949429540548877, + "grad_norm": 3.5781354904174805, + "learning_rate": 5.5742640914056615e-06, + "loss": 0.9953, + "step": 6220 + }, + { + "epoch": 0.4795713845205057, + "grad_norm": 3.2821385860443115, + "learning_rate": 5.573023887845122e-06, + "loss": 0.9041, + "step": 6221 + }, + { + "epoch": 0.47964847363552265, + "grad_norm": 3.6788458824157715, + "learning_rate": 5.57178364855951e-06, + "loss": 1.1181, + "step": 6222 + }, + { + "epoch": 0.4797255627505396, + "grad_norm": 3.8085317611694336, + "learning_rate": 5.57054337362615e-06, + "loss": 0.8443, + "step": 6223 + }, + { + "epoch": 0.4798026518655566, + "grad_norm": 3.922316789627075, + "learning_rate": 5.569303063122364e-06, + "loss": 1.0382, + "step": 6224 + }, + { + "epoch": 0.47987974098057357, + "grad_norm": 3.4889650344848633, + "learning_rate": 5.568062717125483e-06, + "loss": 0.8957, + "step": 6225 + }, + { + "epoch": 0.4799568300955905, + "grad_norm": 3.5855491161346436, + "learning_rate": 5.5668223357128325e-06, + "loss": 0.9909, + "step": 6226 + }, + { + "epoch": 0.48003391921060745, + "grad_norm": 3.6739110946655273, + "learning_rate": 5.5655819189617445e-06, + "loss": 0.8751, + "step": 6227 + }, + { + "epoch": 0.4801110083256244, + "grad_norm": 3.6241581439971924, + "learning_rate": 5.564341466949553e-06, + "loss": 0.9153, + "step": 6228 + }, + { + "epoch": 0.4801880974406414, + "grad_norm": 3.6307456493377686, + "learning_rate": 5.5631009797535955e-06, + "loss": 0.9424, + "step": 6229 + }, + { + "epoch": 0.48026518655565836, + "grad_norm": 3.7684690952301025, + "learning_rate": 5.561860457451207e-06, + "loss": 0.9433, + "step": 6230 + }, + { + "epoch": 0.4803422756706753, + "grad_norm": 3.5321202278137207, + "learning_rate": 5.560619900119729e-06, + "loss": 0.9128, + "step": 6231 + }, + { + "epoch": 0.48041936478569225, + "grad_norm": 3.7544522285461426, + "learning_rate": 5.5593793078365036e-06, + "loss": 1.0287, + "step": 6232 + }, + { + "epoch": 0.4804964539007092, + "grad_norm": 3.454862356185913, + "learning_rate": 5.5581386806788765e-06, + "loss": 0.9557, + "step": 6233 + }, + { + "epoch": 0.4805735430157262, + "grad_norm": 3.669862747192383, + "learning_rate": 5.5568980187241915e-06, + "loss": 1.0802, + "step": 6234 + }, + { + "epoch": 0.48065063213074316, + "grad_norm": 3.8794069290161133, + "learning_rate": 5.5556573220498e-06, + "loss": 0.9532, + "step": 6235 + }, + { + "epoch": 0.4807277212457601, + "grad_norm": 3.671264171600342, + "learning_rate": 5.554416590733054e-06, + "loss": 0.9586, + "step": 6236 + }, + { + "epoch": 0.48080481036077705, + "grad_norm": 3.6556334495544434, + "learning_rate": 5.553175824851304e-06, + "loss": 0.8997, + "step": 6237 + }, + { + "epoch": 0.480881899475794, + "grad_norm": 3.8295559883117676, + "learning_rate": 5.551935024481906e-06, + "loss": 1.142, + "step": 6238 + }, + { + "epoch": 0.480958988590811, + "grad_norm": 3.51387357711792, + "learning_rate": 5.5506941897022175e-06, + "loss": 0.9573, + "step": 6239 + }, + { + "epoch": 0.48103607770582796, + "grad_norm": 3.6698529720306396, + "learning_rate": 5.549453320589598e-06, + "loss": 0.9651, + "step": 6240 + }, + { + "epoch": 0.4811131668208449, + "grad_norm": 3.5171773433685303, + "learning_rate": 5.54821241722141e-06, + "loss": 0.8298, + "step": 6241 + }, + { + "epoch": 0.48119025593586184, + "grad_norm": 4.172097682952881, + "learning_rate": 5.5469714796750175e-06, + "loss": 1.0308, + "step": 6242 + }, + { + "epoch": 0.4812673450508788, + "grad_norm": 3.4253857135772705, + "learning_rate": 5.5457305080277855e-06, + "loss": 0.9464, + "step": 6243 + }, + { + "epoch": 0.4813444341658958, + "grad_norm": 3.268526554107666, + "learning_rate": 5.544489502357085e-06, + "loss": 0.91, + "step": 6244 + }, + { + "epoch": 0.48142152328091276, + "grad_norm": 3.801954507827759, + "learning_rate": 5.543248462740281e-06, + "loss": 0.9596, + "step": 6245 + }, + { + "epoch": 0.48149861239592967, + "grad_norm": 3.7973077297210693, + "learning_rate": 5.542007389254749e-06, + "loss": 1.0331, + "step": 6246 + }, + { + "epoch": 0.48157570151094664, + "grad_norm": 3.8994927406311035, + "learning_rate": 5.540766281977865e-06, + "loss": 0.9226, + "step": 6247 + }, + { + "epoch": 0.4816527906259636, + "grad_norm": 3.246563196182251, + "learning_rate": 5.539525140987003e-06, + "loss": 0.8147, + "step": 6248 + }, + { + "epoch": 0.4817298797409806, + "grad_norm": 3.5368645191192627, + "learning_rate": 5.538283966359545e-06, + "loss": 0.9997, + "step": 6249 + }, + { + "epoch": 0.48180696885599755, + "grad_norm": 4.268583297729492, + "learning_rate": 5.537042758172866e-06, + "loss": 1.0206, + "step": 6250 + }, + { + "epoch": 0.48188405797101447, + "grad_norm": 3.504702568054199, + "learning_rate": 5.535801516504354e-06, + "loss": 0.8864, + "step": 6251 + }, + { + "epoch": 0.48196114708603144, + "grad_norm": 3.5273540019989014, + "learning_rate": 5.534560241431393e-06, + "loss": 0.9426, + "step": 6252 + }, + { + "epoch": 0.4820382362010484, + "grad_norm": 3.7020440101623535, + "learning_rate": 5.533318933031368e-06, + "loss": 1.0103, + "step": 6253 + }, + { + "epoch": 0.4821153253160654, + "grad_norm": 3.3781721591949463, + "learning_rate": 5.532077591381672e-06, + "loss": 0.8451, + "step": 6254 + }, + { + "epoch": 0.48219241443108235, + "grad_norm": 3.646610975265503, + "learning_rate": 5.530836216559692e-06, + "loss": 1.0605, + "step": 6255 + }, + { + "epoch": 0.48226950354609927, + "grad_norm": 3.75663423538208, + "learning_rate": 5.5295948086428245e-06, + "loss": 1.0645, + "step": 6256 + }, + { + "epoch": 0.48234659266111624, + "grad_norm": 3.898165702819824, + "learning_rate": 5.528353367708462e-06, + "loss": 1.0525, + "step": 6257 + }, + { + "epoch": 0.4824236817761332, + "grad_norm": 3.5271053314208984, + "learning_rate": 5.527111893834004e-06, + "loss": 0.8598, + "step": 6258 + }, + { + "epoch": 0.4825007708911502, + "grad_norm": 3.4338533878326416, + "learning_rate": 5.52587038709685e-06, + "loss": 0.9286, + "step": 6259 + }, + { + "epoch": 0.48257786000616715, + "grad_norm": 4.084996223449707, + "learning_rate": 5.5246288475744016e-06, + "loss": 1.1074, + "step": 6260 + }, + { + "epoch": 0.48265494912118406, + "grad_norm": 3.887216806411743, + "learning_rate": 5.52338727534406e-06, + "loss": 0.8895, + "step": 6261 + }, + { + "epoch": 0.48273203823620103, + "grad_norm": 3.458646774291992, + "learning_rate": 5.522145670483233e-06, + "loss": 0.938, + "step": 6262 + }, + { + "epoch": 0.482809127351218, + "grad_norm": 3.734957456588745, + "learning_rate": 5.520904033069328e-06, + "loss": 1.0008, + "step": 6263 + }, + { + "epoch": 0.482886216466235, + "grad_norm": 3.3216323852539062, + "learning_rate": 5.519662363179754e-06, + "loss": 1.0191, + "step": 6264 + }, + { + "epoch": 0.48296330558125194, + "grad_norm": 3.6311726570129395, + "learning_rate": 5.518420660891924e-06, + "loss": 0.9028, + "step": 6265 + }, + { + "epoch": 0.48304039469626886, + "grad_norm": 3.6047608852386475, + "learning_rate": 5.51717892628325e-06, + "loss": 0.9282, + "step": 6266 + }, + { + "epoch": 0.48311748381128583, + "grad_norm": 3.5420308113098145, + "learning_rate": 5.515937159431147e-06, + "loss": 0.8657, + "step": 6267 + }, + { + "epoch": 0.4831945729263028, + "grad_norm": 3.5414748191833496, + "learning_rate": 5.514695360413037e-06, + "loss": 0.9715, + "step": 6268 + }, + { + "epoch": 0.48327166204131977, + "grad_norm": 3.513932228088379, + "learning_rate": 5.5134535293063355e-06, + "loss": 0.8782, + "step": 6269 + }, + { + "epoch": 0.48334875115633674, + "grad_norm": 3.6613216400146484, + "learning_rate": 5.512211666188465e-06, + "loss": 1.0077, + "step": 6270 + }, + { + "epoch": 0.48342584027135366, + "grad_norm": 4.074599742889404, + "learning_rate": 5.510969771136852e-06, + "loss": 0.9404, + "step": 6271 + }, + { + "epoch": 0.48350292938637063, + "grad_norm": 3.5803050994873047, + "learning_rate": 5.509727844228917e-06, + "loss": 0.9411, + "step": 6272 + }, + { + "epoch": 0.4835800185013876, + "grad_norm": 3.4778783321380615, + "learning_rate": 5.5084858855420945e-06, + "loss": 0.917, + "step": 6273 + }, + { + "epoch": 0.48365710761640457, + "grad_norm": 3.6360175609588623, + "learning_rate": 5.507243895153808e-06, + "loss": 1.0272, + "step": 6274 + }, + { + "epoch": 0.48373419673142154, + "grad_norm": 3.4350030422210693, + "learning_rate": 5.506001873141493e-06, + "loss": 0.8781, + "step": 6275 + }, + { + "epoch": 0.4838112858464385, + "grad_norm": 3.641852378845215, + "learning_rate": 5.504759819582581e-06, + "loss": 0.9962, + "step": 6276 + }, + { + "epoch": 0.4838883749614554, + "grad_norm": 3.9919748306274414, + "learning_rate": 5.5035177345545085e-06, + "loss": 0.9799, + "step": 6277 + }, + { + "epoch": 0.4839654640764724, + "grad_norm": 3.5844922065734863, + "learning_rate": 5.502275618134715e-06, + "loss": 0.9189, + "step": 6278 + }, + { + "epoch": 0.48404255319148937, + "grad_norm": 3.5310840606689453, + "learning_rate": 5.5010334704006364e-06, + "loss": 0.9687, + "step": 6279 + }, + { + "epoch": 0.48411964230650634, + "grad_norm": 4.053980350494385, + "learning_rate": 5.499791291429714e-06, + "loss": 0.8998, + "step": 6280 + }, + { + "epoch": 0.4841967314215233, + "grad_norm": 3.388463020324707, + "learning_rate": 5.498549081299397e-06, + "loss": 0.9235, + "step": 6281 + }, + { + "epoch": 0.4842738205365402, + "grad_norm": 3.387619972229004, + "learning_rate": 5.497306840087124e-06, + "loss": 0.9145, + "step": 6282 + }, + { + "epoch": 0.4843509096515572, + "grad_norm": 3.7833995819091797, + "learning_rate": 5.496064567870346e-06, + "loss": 0.9596, + "step": 6283 + }, + { + "epoch": 0.48442799876657416, + "grad_norm": 3.767986536026001, + "learning_rate": 5.494822264726512e-06, + "loss": 1.0506, + "step": 6284 + }, + { + "epoch": 0.48450508788159113, + "grad_norm": 3.8580732345581055, + "learning_rate": 5.4935799307330715e-06, + "loss": 1.1381, + "step": 6285 + }, + { + "epoch": 0.4845821769966081, + "grad_norm": 3.7080979347229004, + "learning_rate": 5.492337565967479e-06, + "loss": 0.9691, + "step": 6286 + }, + { + "epoch": 0.484659266111625, + "grad_norm": 3.4065332412719727, + "learning_rate": 5.491095170507189e-06, + "loss": 0.908, + "step": 6287 + }, + { + "epoch": 0.484736355226642, + "grad_norm": 3.787928819656372, + "learning_rate": 5.4898527444296586e-06, + "loss": 0.8999, + "step": 6288 + }, + { + "epoch": 0.48481344434165896, + "grad_norm": 3.6937663555145264, + "learning_rate": 5.488610287812348e-06, + "loss": 0.976, + "step": 6289 + }, + { + "epoch": 0.48489053345667593, + "grad_norm": 3.6718273162841797, + "learning_rate": 5.487367800732715e-06, + "loss": 0.9509, + "step": 6290 + }, + { + "epoch": 0.4849676225716929, + "grad_norm": 3.9421653747558594, + "learning_rate": 5.486125283268223e-06, + "loss": 0.9137, + "step": 6291 + }, + { + "epoch": 0.4850447116867098, + "grad_norm": 3.546727418899536, + "learning_rate": 5.48488273549634e-06, + "loss": 0.9182, + "step": 6292 + }, + { + "epoch": 0.4851218008017268, + "grad_norm": 3.6610233783721924, + "learning_rate": 5.483640157494528e-06, + "loss": 0.9811, + "step": 6293 + }, + { + "epoch": 0.48519888991674376, + "grad_norm": 3.579312801361084, + "learning_rate": 5.482397549340256e-06, + "loss": 0.879, + "step": 6294 + }, + { + "epoch": 0.48527597903176073, + "grad_norm": 3.8670926094055176, + "learning_rate": 5.481154911110995e-06, + "loss": 1.0397, + "step": 6295 + }, + { + "epoch": 0.4853530681467777, + "grad_norm": 3.6566736698150635, + "learning_rate": 5.4799122428842185e-06, + "loss": 1.0115, + "step": 6296 + }, + { + "epoch": 0.4854301572617946, + "grad_norm": 3.503852605819702, + "learning_rate": 5.478669544737401e-06, + "loss": 1.0862, + "step": 6297 + }, + { + "epoch": 0.4855072463768116, + "grad_norm": 3.8774027824401855, + "learning_rate": 5.477426816748014e-06, + "loss": 1.0201, + "step": 6298 + }, + { + "epoch": 0.48558433549182856, + "grad_norm": 3.327522039413452, + "learning_rate": 5.476184058993539e-06, + "loss": 0.9576, + "step": 6299 + }, + { + "epoch": 0.4856614246068455, + "grad_norm": 3.4257946014404297, + "learning_rate": 5.4749412715514525e-06, + "loss": 0.9476, + "step": 6300 + }, + { + "epoch": 0.4857385137218625, + "grad_norm": 4.544504165649414, + "learning_rate": 5.473698454499239e-06, + "loss": 1.0781, + "step": 6301 + }, + { + "epoch": 0.4858156028368794, + "grad_norm": 3.3634119033813477, + "learning_rate": 5.47245560791438e-06, + "loss": 0.936, + "step": 6302 + }, + { + "epoch": 0.4858926919518964, + "grad_norm": 3.5162649154663086, + "learning_rate": 5.47121273187436e-06, + "loss": 1.0161, + "step": 6303 + }, + { + "epoch": 0.48596978106691335, + "grad_norm": 3.8736398220062256, + "learning_rate": 5.4699698264566665e-06, + "loss": 0.9404, + "step": 6304 + }, + { + "epoch": 0.4860468701819303, + "grad_norm": 3.300621509552002, + "learning_rate": 5.468726891738789e-06, + "loss": 0.9441, + "step": 6305 + }, + { + "epoch": 0.4861239592969473, + "grad_norm": 3.4393205642700195, + "learning_rate": 5.467483927798217e-06, + "loss": 0.9706, + "step": 6306 + }, + { + "epoch": 0.4862010484119642, + "grad_norm": 3.7706298828125, + "learning_rate": 5.4662409347124436e-06, + "loss": 0.9214, + "step": 6307 + }, + { + "epoch": 0.4862781375269812, + "grad_norm": 3.6923952102661133, + "learning_rate": 5.464997912558963e-06, + "loss": 0.8973, + "step": 6308 + }, + { + "epoch": 0.48635522664199815, + "grad_norm": 3.481060743331909, + "learning_rate": 5.46375486141527e-06, + "loss": 0.867, + "step": 6309 + }, + { + "epoch": 0.4864323157570151, + "grad_norm": 3.6562881469726562, + "learning_rate": 5.462511781358866e-06, + "loss": 0.9865, + "step": 6310 + }, + { + "epoch": 0.4865094048720321, + "grad_norm": 3.6033737659454346, + "learning_rate": 5.461268672467245e-06, + "loss": 1.0277, + "step": 6311 + }, + { + "epoch": 0.486586493987049, + "grad_norm": 3.8710803985595703, + "learning_rate": 5.460025534817911e-06, + "loss": 0.985, + "step": 6312 + }, + { + "epoch": 0.486663583102066, + "grad_norm": 3.498439073562622, + "learning_rate": 5.458782368488369e-06, + "loss": 1.0291, + "step": 6313 + }, + { + "epoch": 0.48674067221708295, + "grad_norm": 3.638658046722412, + "learning_rate": 5.4575391735561216e-06, + "loss": 0.9331, + "step": 6314 + }, + { + "epoch": 0.4868177613320999, + "grad_norm": 3.5828909873962402, + "learning_rate": 5.456295950098676e-06, + "loss": 1.0794, + "step": 6315 + }, + { + "epoch": 0.4868948504471169, + "grad_norm": 3.282355308532715, + "learning_rate": 5.45505269819354e-06, + "loss": 0.9155, + "step": 6316 + }, + { + "epoch": 0.4869719395621338, + "grad_norm": 3.461933135986328, + "learning_rate": 5.453809417918227e-06, + "loss": 0.9608, + "step": 6317 + }, + { + "epoch": 0.4870490286771508, + "grad_norm": 3.6630191802978516, + "learning_rate": 5.452566109350248e-06, + "loss": 1.0117, + "step": 6318 + }, + { + "epoch": 0.48712611779216775, + "grad_norm": 3.436558485031128, + "learning_rate": 5.451322772567114e-06, + "loss": 0.8844, + "step": 6319 + }, + { + "epoch": 0.4872032069071847, + "grad_norm": 3.7390990257263184, + "learning_rate": 5.450079407646343e-06, + "loss": 1.0276, + "step": 6320 + }, + { + "epoch": 0.4872802960222017, + "grad_norm": 3.4452319145202637, + "learning_rate": 5.448836014665453e-06, + "loss": 0.8985, + "step": 6321 + }, + { + "epoch": 0.4873573851372186, + "grad_norm": 3.383016347885132, + "learning_rate": 5.447592593701961e-06, + "loss": 0.969, + "step": 6322 + }, + { + "epoch": 0.48743447425223557, + "grad_norm": 3.7448880672454834, + "learning_rate": 5.446349144833389e-06, + "loss": 1.0328, + "step": 6323 + }, + { + "epoch": 0.48751156336725254, + "grad_norm": 3.59957218170166, + "learning_rate": 5.44510566813726e-06, + "loss": 1.0494, + "step": 6324 + }, + { + "epoch": 0.4875886524822695, + "grad_norm": 3.316612720489502, + "learning_rate": 5.443862163691097e-06, + "loss": 1.0422, + "step": 6325 + }, + { + "epoch": 0.4876657415972865, + "grad_norm": 3.4998679161071777, + "learning_rate": 5.442618631572428e-06, + "loss": 0.9016, + "step": 6326 + }, + { + "epoch": 0.4877428307123034, + "grad_norm": 3.8718786239624023, + "learning_rate": 5.44137507185878e-06, + "loss": 0.9931, + "step": 6327 + }, + { + "epoch": 0.48781991982732037, + "grad_norm": 3.7852871417999268, + "learning_rate": 5.440131484627681e-06, + "loss": 0.9991, + "step": 6328 + }, + { + "epoch": 0.48789700894233734, + "grad_norm": 3.4875009059906006, + "learning_rate": 5.438887869956664e-06, + "loss": 0.9499, + "step": 6329 + }, + { + "epoch": 0.4879740980573543, + "grad_norm": 3.2732343673706055, + "learning_rate": 5.437644227923261e-06, + "loss": 1.0069, + "step": 6330 + }, + { + "epoch": 0.4880511871723713, + "grad_norm": 3.7164502143859863, + "learning_rate": 5.4364005586050075e-06, + "loss": 1.0977, + "step": 6331 + }, + { + "epoch": 0.4881282762873882, + "grad_norm": 3.7878737449645996, + "learning_rate": 5.4351568620794395e-06, + "loss": 1.0551, + "step": 6332 + }, + { + "epoch": 0.48820536540240517, + "grad_norm": 3.5380685329437256, + "learning_rate": 5.433913138424094e-06, + "loss": 0.8823, + "step": 6333 + }, + { + "epoch": 0.48828245451742214, + "grad_norm": 3.8582963943481445, + "learning_rate": 5.4326693877165125e-06, + "loss": 0.9363, + "step": 6334 + }, + { + "epoch": 0.4883595436324391, + "grad_norm": 3.4837450981140137, + "learning_rate": 5.431425610034235e-06, + "loss": 0.8931, + "step": 6335 + }, + { + "epoch": 0.4884366327474561, + "grad_norm": 4.541477680206299, + "learning_rate": 5.430181805454805e-06, + "loss": 0.9786, + "step": 6336 + }, + { + "epoch": 0.488513721862473, + "grad_norm": 3.627601146697998, + "learning_rate": 5.428937974055769e-06, + "loss": 0.9365, + "step": 6337 + }, + { + "epoch": 0.48859081097748996, + "grad_norm": 3.725983142852783, + "learning_rate": 5.427694115914669e-06, + "loss": 0.9854, + "step": 6338 + }, + { + "epoch": 0.48866790009250693, + "grad_norm": 3.6275992393493652, + "learning_rate": 5.426450231109058e-06, + "loss": 0.9704, + "step": 6339 + }, + { + "epoch": 0.4887449892075239, + "grad_norm": 3.6166484355926514, + "learning_rate": 5.425206319716483e-06, + "loss": 1.0062, + "step": 6340 + }, + { + "epoch": 0.4888220783225409, + "grad_norm": 3.833855152130127, + "learning_rate": 5.423962381814496e-06, + "loss": 0.983, + "step": 6341 + }, + { + "epoch": 0.4888991674375578, + "grad_norm": 3.7868459224700928, + "learning_rate": 5.422718417480651e-06, + "loss": 0.9064, + "step": 6342 + }, + { + "epoch": 0.48897625655257476, + "grad_norm": 3.3974921703338623, + "learning_rate": 5.421474426792501e-06, + "loss": 0.965, + "step": 6343 + }, + { + "epoch": 0.48905334566759173, + "grad_norm": 3.812864065170288, + "learning_rate": 5.420230409827604e-06, + "loss": 1.0489, + "step": 6344 + }, + { + "epoch": 0.4891304347826087, + "grad_norm": 3.837310552597046, + "learning_rate": 5.418986366663518e-06, + "loss": 0.9851, + "step": 6345 + }, + { + "epoch": 0.4892075238976257, + "grad_norm": 3.419736385345459, + "learning_rate": 5.4177422973778015e-06, + "loss": 0.9357, + "step": 6346 + }, + { + "epoch": 0.4892846130126426, + "grad_norm": 3.845294237136841, + "learning_rate": 5.416498202048016e-06, + "loss": 1.0626, + "step": 6347 + }, + { + "epoch": 0.48936170212765956, + "grad_norm": 3.5029916763305664, + "learning_rate": 5.415254080751725e-06, + "loss": 0.9745, + "step": 6348 + }, + { + "epoch": 0.48943879124267653, + "grad_norm": 3.959080696105957, + "learning_rate": 5.414009933566492e-06, + "loss": 1.0339, + "step": 6349 + }, + { + "epoch": 0.4895158803576935, + "grad_norm": 3.6839077472686768, + "learning_rate": 5.412765760569886e-06, + "loss": 0.9517, + "step": 6350 + }, + { + "epoch": 0.48959296947271047, + "grad_norm": 3.2615315914154053, + "learning_rate": 5.41152156183947e-06, + "loss": 0.8866, + "step": 6351 + }, + { + "epoch": 0.4896700585877274, + "grad_norm": 3.5984644889831543, + "learning_rate": 5.410277337452817e-06, + "loss": 0.9906, + "step": 6352 + }, + { + "epoch": 0.48974714770274436, + "grad_norm": 3.3124704360961914, + "learning_rate": 5.409033087487498e-06, + "loss": 0.8656, + "step": 6353 + }, + { + "epoch": 0.4898242368177613, + "grad_norm": 3.2847061157226562, + "learning_rate": 5.407788812021082e-06, + "loss": 0.8378, + "step": 6354 + }, + { + "epoch": 0.4899013259327783, + "grad_norm": 3.559105157852173, + "learning_rate": 5.406544511131146e-06, + "loss": 1.0279, + "step": 6355 + }, + { + "epoch": 0.48997841504779527, + "grad_norm": 3.41558575630188, + "learning_rate": 5.405300184895268e-06, + "loss": 0.9121, + "step": 6356 + }, + { + "epoch": 0.4900555041628122, + "grad_norm": 3.4175400733947754, + "learning_rate": 5.40405583339102e-06, + "loss": 0.8206, + "step": 6357 + }, + { + "epoch": 0.49013259327782915, + "grad_norm": 3.7878329753875732, + "learning_rate": 5.402811456695985e-06, + "loss": 0.9255, + "step": 6358 + }, + { + "epoch": 0.4902096823928461, + "grad_norm": 3.816235303878784, + "learning_rate": 5.401567054887741e-06, + "loss": 1.1515, + "step": 6359 + }, + { + "epoch": 0.4902867715078631, + "grad_norm": 3.481640100479126, + "learning_rate": 5.400322628043869e-06, + "loss": 0.8833, + "step": 6360 + }, + { + "epoch": 0.49036386062288007, + "grad_norm": 3.5390591621398926, + "learning_rate": 5.399078176241958e-06, + "loss": 0.9331, + "step": 6361 + }, + { + "epoch": 0.49044094973789704, + "grad_norm": 3.5117104053497314, + "learning_rate": 5.397833699559587e-06, + "loss": 0.994, + "step": 6362 + }, + { + "epoch": 0.49051803885291395, + "grad_norm": 3.922240972518921, + "learning_rate": 5.396589198074347e-06, + "loss": 1.063, + "step": 6363 + }, + { + "epoch": 0.4905951279679309, + "grad_norm": 3.7044625282287598, + "learning_rate": 5.3953446718638235e-06, + "loss": 1.0509, + "step": 6364 + }, + { + "epoch": 0.4906722170829479, + "grad_norm": 3.929018259048462, + "learning_rate": 5.394100121005608e-06, + "loss": 1.0066, + "step": 6365 + }, + { + "epoch": 0.49074930619796486, + "grad_norm": 3.511359214782715, + "learning_rate": 5.39285554557729e-06, + "loss": 0.959, + "step": 6366 + }, + { + "epoch": 0.49082639531298183, + "grad_norm": 3.469261646270752, + "learning_rate": 5.391610945656464e-06, + "loss": 1.0333, + "step": 6367 + }, + { + "epoch": 0.49090348442799875, + "grad_norm": 3.302649736404419, + "learning_rate": 5.390366321320723e-06, + "loss": 0.9105, + "step": 6368 + }, + { + "epoch": 0.4909805735430157, + "grad_norm": 3.9019734859466553, + "learning_rate": 5.389121672647666e-06, + "loss": 1.1229, + "step": 6369 + }, + { + "epoch": 0.4910576626580327, + "grad_norm": 3.741459846496582, + "learning_rate": 5.387876999714885e-06, + "loss": 1.0295, + "step": 6370 + }, + { + "epoch": 0.49113475177304966, + "grad_norm": 3.3575656414031982, + "learning_rate": 5.386632302599985e-06, + "loss": 0.9632, + "step": 6371 + }, + { + "epoch": 0.49121184088806663, + "grad_norm": 3.4289495944976807, + "learning_rate": 5.385387581380561e-06, + "loss": 0.9537, + "step": 6372 + }, + { + "epoch": 0.49128893000308355, + "grad_norm": 3.718967914581299, + "learning_rate": 5.384142836134217e-06, + "loss": 0.953, + "step": 6373 + }, + { + "epoch": 0.4913660191181005, + "grad_norm": 3.8961894512176514, + "learning_rate": 5.382898066938559e-06, + "loss": 0.9896, + "step": 6374 + }, + { + "epoch": 0.4914431082331175, + "grad_norm": 3.611521005630493, + "learning_rate": 5.381653273871188e-06, + "loss": 1.0437, + "step": 6375 + }, + { + "epoch": 0.49152019734813446, + "grad_norm": 3.494535446166992, + "learning_rate": 5.380408457009711e-06, + "loss": 0.9011, + "step": 6376 + }, + { + "epoch": 0.49159728646315143, + "grad_norm": 3.385871648788452, + "learning_rate": 5.37916361643174e-06, + "loss": 0.9788, + "step": 6377 + }, + { + "epoch": 0.49167437557816834, + "grad_norm": 3.541168212890625, + "learning_rate": 5.377918752214878e-06, + "loss": 0.9361, + "step": 6378 + }, + { + "epoch": 0.4917514646931853, + "grad_norm": 3.743074417114258, + "learning_rate": 5.37667386443674e-06, + "loss": 0.9046, + "step": 6379 + }, + { + "epoch": 0.4918285538082023, + "grad_norm": 3.602600574493408, + "learning_rate": 5.375428953174939e-06, + "loss": 1.0171, + "step": 6380 + }, + { + "epoch": 0.49190564292321926, + "grad_norm": 3.6449949741363525, + "learning_rate": 5.374184018507086e-06, + "loss": 1.0, + "step": 6381 + }, + { + "epoch": 0.4919827320382362, + "grad_norm": 3.660562515258789, + "learning_rate": 5.372939060510797e-06, + "loss": 1.0855, + "step": 6382 + }, + { + "epoch": 0.49205982115325314, + "grad_norm": 3.2304728031158447, + "learning_rate": 5.371694079263688e-06, + "loss": 0.8088, + "step": 6383 + }, + { + "epoch": 0.4921369102682701, + "grad_norm": 3.455740451812744, + "learning_rate": 5.37044907484338e-06, + "loss": 0.9095, + "step": 6384 + }, + { + "epoch": 0.4922139993832871, + "grad_norm": 3.787229061126709, + "learning_rate": 5.369204047327491e-06, + "loss": 0.9225, + "step": 6385 + }, + { + "epoch": 0.49229108849830405, + "grad_norm": 3.7648141384124756, + "learning_rate": 5.367958996793641e-06, + "loss": 0.9692, + "step": 6386 + }, + { + "epoch": 0.492368177613321, + "grad_norm": 3.6073803901672363, + "learning_rate": 5.366713923319455e-06, + "loss": 0.9865, + "step": 6387 + }, + { + "epoch": 0.49244526672833794, + "grad_norm": 3.283367872238159, + "learning_rate": 5.365468826982553e-06, + "loss": 0.991, + "step": 6388 + }, + { + "epoch": 0.4925223558433549, + "grad_norm": 3.9354822635650635, + "learning_rate": 5.364223707860563e-06, + "loss": 0.9923, + "step": 6389 + }, + { + "epoch": 0.4925994449583719, + "grad_norm": 3.2957966327667236, + "learning_rate": 5.362978566031112e-06, + "loss": 0.9225, + "step": 6390 + }, + { + "epoch": 0.49267653407338885, + "grad_norm": 3.9109017848968506, + "learning_rate": 5.361733401571826e-06, + "loss": 0.8456, + "step": 6391 + }, + { + "epoch": 0.4927536231884058, + "grad_norm": 3.773970603942871, + "learning_rate": 5.360488214560336e-06, + "loss": 0.9238, + "step": 6392 + }, + { + "epoch": 0.49283071230342274, + "grad_norm": 3.3572847843170166, + "learning_rate": 5.359243005074274e-06, + "loss": 0.9182, + "step": 6393 + }, + { + "epoch": 0.4929078014184397, + "grad_norm": 3.6737236976623535, + "learning_rate": 5.35799777319127e-06, + "loss": 0.994, + "step": 6394 + }, + { + "epoch": 0.4929848905334567, + "grad_norm": 4.093838214874268, + "learning_rate": 5.356752518988961e-06, + "loss": 1.013, + "step": 6395 + }, + { + "epoch": 0.49306197964847365, + "grad_norm": 3.745969295501709, + "learning_rate": 5.355507242544978e-06, + "loss": 0.9943, + "step": 6396 + }, + { + "epoch": 0.4931390687634906, + "grad_norm": 3.748204469680786, + "learning_rate": 5.35426194393696e-06, + "loss": 0.8421, + "step": 6397 + }, + { + "epoch": 0.49321615787850753, + "grad_norm": 3.6849241256713867, + "learning_rate": 5.3530166232425454e-06, + "loss": 0.8745, + "step": 6398 + }, + { + "epoch": 0.4932932469935245, + "grad_norm": 3.651228189468384, + "learning_rate": 5.351771280539372e-06, + "loss": 1.0544, + "step": 6399 + }, + { + "epoch": 0.4933703361085415, + "grad_norm": 4.058038234710693, + "learning_rate": 5.35052591590508e-06, + "loss": 1.0527, + "step": 6400 + }, + { + "epoch": 0.49344742522355844, + "grad_norm": 4.054307460784912, + "learning_rate": 5.349280529417316e-06, + "loss": 0.9891, + "step": 6401 + }, + { + "epoch": 0.4935245143385754, + "grad_norm": 3.8959100246429443, + "learning_rate": 5.348035121153716e-06, + "loss": 1.0433, + "step": 6402 + }, + { + "epoch": 0.49360160345359233, + "grad_norm": 3.7364206314086914, + "learning_rate": 5.346789691191931e-06, + "loss": 0.9561, + "step": 6403 + }, + { + "epoch": 0.4936786925686093, + "grad_norm": 4.071012496948242, + "learning_rate": 5.3455442396096045e-06, + "loss": 0.9654, + "step": 6404 + }, + { + "epoch": 0.49375578168362627, + "grad_norm": 3.570204019546509, + "learning_rate": 5.344298766484382e-06, + "loss": 0.9465, + "step": 6405 + }, + { + "epoch": 0.49383287079864324, + "grad_norm": 3.7908997535705566, + "learning_rate": 5.343053271893919e-06, + "loss": 1.0456, + "step": 6406 + }, + { + "epoch": 0.4939099599136602, + "grad_norm": 3.9499940872192383, + "learning_rate": 5.3418077559158575e-06, + "loss": 1.0423, + "step": 6407 + }, + { + "epoch": 0.4939870490286771, + "grad_norm": 3.3631200790405273, + "learning_rate": 5.340562218627854e-06, + "loss": 0.9541, + "step": 6408 + }, + { + "epoch": 0.4940641381436941, + "grad_norm": 3.548306941986084, + "learning_rate": 5.339316660107561e-06, + "loss": 0.9153, + "step": 6409 + }, + { + "epoch": 0.49414122725871107, + "grad_norm": 3.9176387786865234, + "learning_rate": 5.3380710804326304e-06, + "loss": 0.9603, + "step": 6410 + }, + { + "epoch": 0.49421831637372804, + "grad_norm": 4.336452484130859, + "learning_rate": 5.33682547968072e-06, + "loss": 1.0792, + "step": 6411 + }, + { + "epoch": 0.494295405488745, + "grad_norm": 3.4913458824157715, + "learning_rate": 5.3355798579294834e-06, + "loss": 0.9811, + "step": 6412 + }, + { + "epoch": 0.4943724946037619, + "grad_norm": 3.5579373836517334, + "learning_rate": 5.334334215256582e-06, + "loss": 0.9181, + "step": 6413 + }, + { + "epoch": 0.4944495837187789, + "grad_norm": 3.506436586380005, + "learning_rate": 5.333088551739674e-06, + "loss": 0.9107, + "step": 6414 + }, + { + "epoch": 0.49452667283379587, + "grad_norm": 3.682651996612549, + "learning_rate": 5.3318428674564196e-06, + "loss": 0.989, + "step": 6415 + }, + { + "epoch": 0.49460376194881284, + "grad_norm": 3.929532527923584, + "learning_rate": 5.330597162484481e-06, + "loss": 0.8853, + "step": 6416 + }, + { + "epoch": 0.4946808510638298, + "grad_norm": 3.4837441444396973, + "learning_rate": 5.329351436901522e-06, + "loss": 0.904, + "step": 6417 + }, + { + "epoch": 0.4947579401788467, + "grad_norm": 3.567117691040039, + "learning_rate": 5.3281056907852054e-06, + "loss": 0.9982, + "step": 6418 + }, + { + "epoch": 0.4948350292938637, + "grad_norm": 3.655592918395996, + "learning_rate": 5.3268599242132e-06, + "loss": 0.9322, + "step": 6419 + }, + { + "epoch": 0.49491211840888066, + "grad_norm": 3.478694200515747, + "learning_rate": 5.32561413726317e-06, + "loss": 0.9695, + "step": 6420 + }, + { + "epoch": 0.49498920752389763, + "grad_norm": 3.512316942214966, + "learning_rate": 5.324368330012785e-06, + "loss": 0.9506, + "step": 6421 + }, + { + "epoch": 0.4950662966389146, + "grad_norm": 3.5826921463012695, + "learning_rate": 5.323122502539715e-06, + "loss": 0.9291, + "step": 6422 + }, + { + "epoch": 0.4951433857539315, + "grad_norm": 3.367875337600708, + "learning_rate": 5.32187665492163e-06, + "loss": 1.0068, + "step": 6423 + }, + { + "epoch": 0.4952204748689485, + "grad_norm": 3.737788200378418, + "learning_rate": 5.320630787236203e-06, + "loss": 1.0055, + "step": 6424 + }, + { + "epoch": 0.49529756398396546, + "grad_norm": 3.3039023876190186, + "learning_rate": 5.3193848995611075e-06, + "loss": 1.0002, + "step": 6425 + }, + { + "epoch": 0.49537465309898243, + "grad_norm": 3.7930006980895996, + "learning_rate": 5.318138991974016e-06, + "loss": 1.1429, + "step": 6426 + }, + { + "epoch": 0.4954517422139994, + "grad_norm": 3.4855892658233643, + "learning_rate": 5.316893064552607e-06, + "loss": 0.8802, + "step": 6427 + }, + { + "epoch": 0.4955288313290163, + "grad_norm": 3.870096206665039, + "learning_rate": 5.315647117374556e-06, + "loss": 1.0267, + "step": 6428 + }, + { + "epoch": 0.4956059204440333, + "grad_norm": 3.2889981269836426, + "learning_rate": 5.314401150517543e-06, + "loss": 0.9023, + "step": 6429 + }, + { + "epoch": 0.49568300955905026, + "grad_norm": 3.54913330078125, + "learning_rate": 5.313155164059247e-06, + "loss": 1.023, + "step": 6430 + }, + { + "epoch": 0.49576009867406723, + "grad_norm": 3.7795615196228027, + "learning_rate": 5.311909158077347e-06, + "loss": 1.0105, + "step": 6431 + }, + { + "epoch": 0.4958371877890842, + "grad_norm": 3.661637306213379, + "learning_rate": 5.310663132649526e-06, + "loss": 1.0102, + "step": 6432 + }, + { + "epoch": 0.4959142769041011, + "grad_norm": 3.3759381771087646, + "learning_rate": 5.30941708785347e-06, + "loss": 0.88, + "step": 6433 + }, + { + "epoch": 0.4959913660191181, + "grad_norm": 3.458970546722412, + "learning_rate": 5.3081710237668595e-06, + "loss": 0.9249, + "step": 6434 + }, + { + "epoch": 0.49606845513413506, + "grad_norm": 3.838961362838745, + "learning_rate": 5.306924940467383e-06, + "loss": 1.082, + "step": 6435 + }, + { + "epoch": 0.496145544249152, + "grad_norm": 3.4712164402008057, + "learning_rate": 5.3056788380327255e-06, + "loss": 0.8727, + "step": 6436 + }, + { + "epoch": 0.496222633364169, + "grad_norm": 3.2734153270721436, + "learning_rate": 5.304432716540574e-06, + "loss": 0.8984, + "step": 6437 + }, + { + "epoch": 0.4962997224791859, + "grad_norm": 3.8021316528320312, + "learning_rate": 5.303186576068621e-06, + "loss": 1.051, + "step": 6438 + }, + { + "epoch": 0.4963768115942029, + "grad_norm": 3.4965784549713135, + "learning_rate": 5.301940416694554e-06, + "loss": 0.8909, + "step": 6439 + }, + { + "epoch": 0.49645390070921985, + "grad_norm": 3.5403904914855957, + "learning_rate": 5.300694238496067e-06, + "loss": 0.9986, + "step": 6440 + }, + { + "epoch": 0.4965309898242368, + "grad_norm": 3.335237503051758, + "learning_rate": 5.299448041550852e-06, + "loss": 0.9258, + "step": 6441 + }, + { + "epoch": 0.4966080789392538, + "grad_norm": 3.8967440128326416, + "learning_rate": 5.2982018259366004e-06, + "loss": 1.0064, + "step": 6442 + }, + { + "epoch": 0.4966851680542707, + "grad_norm": 3.5903615951538086, + "learning_rate": 5.296955591731012e-06, + "loss": 0.9383, + "step": 6443 + }, + { + "epoch": 0.4967622571692877, + "grad_norm": 3.501309871673584, + "learning_rate": 5.295709339011779e-06, + "loss": 0.895, + "step": 6444 + }, + { + "epoch": 0.49683934628430465, + "grad_norm": 3.6556556224823, + "learning_rate": 5.2944630678566e-06, + "loss": 0.9801, + "step": 6445 + }, + { + "epoch": 0.4969164353993216, + "grad_norm": 3.3892955780029297, + "learning_rate": 5.293216778343175e-06, + "loss": 0.8553, + "step": 6446 + }, + { + "epoch": 0.4969935245143386, + "grad_norm": 3.6636784076690674, + "learning_rate": 5.2919704705492e-06, + "loss": 0.9744, + "step": 6447 + }, + { + "epoch": 0.49707061362935556, + "grad_norm": 3.7394680976867676, + "learning_rate": 5.290724144552379e-06, + "loss": 0.9662, + "step": 6448 + }, + { + "epoch": 0.4971477027443725, + "grad_norm": 3.7077627182006836, + "learning_rate": 5.2894778004304146e-06, + "loss": 0.8284, + "step": 6449 + }, + { + "epoch": 0.49722479185938945, + "grad_norm": 3.826967716217041, + "learning_rate": 5.288231438261008e-06, + "loss": 0.8625, + "step": 6450 + }, + { + "epoch": 0.4973018809744064, + "grad_norm": 3.9576635360717773, + "learning_rate": 5.286985058121865e-06, + "loss": 0.9651, + "step": 6451 + }, + { + "epoch": 0.4973789700894234, + "grad_norm": 3.6830098628997803, + "learning_rate": 5.285738660090688e-06, + "loss": 1.1414, + "step": 6452 + }, + { + "epoch": 0.49745605920444036, + "grad_norm": 3.410430908203125, + "learning_rate": 5.284492244245187e-06, + "loss": 0.8979, + "step": 6453 + }, + { + "epoch": 0.4975331483194573, + "grad_norm": 3.495670795440674, + "learning_rate": 5.283245810663068e-06, + "loss": 0.8779, + "step": 6454 + }, + { + "epoch": 0.49761023743447425, + "grad_norm": 3.549842596054077, + "learning_rate": 5.281999359422039e-06, + "loss": 0.8665, + "step": 6455 + }, + { + "epoch": 0.4976873265494912, + "grad_norm": 3.7550711631774902, + "learning_rate": 5.28075289059981e-06, + "loss": 0.9765, + "step": 6456 + }, + { + "epoch": 0.4977644156645082, + "grad_norm": 3.494971990585327, + "learning_rate": 5.279506404274094e-06, + "loss": 0.9499, + "step": 6457 + }, + { + "epoch": 0.49784150477952516, + "grad_norm": 3.89752197265625, + "learning_rate": 5.2782599005226e-06, + "loss": 1.0034, + "step": 6458 + }, + { + "epoch": 0.49791859389454207, + "grad_norm": 3.6488029956817627, + "learning_rate": 5.277013379423043e-06, + "loss": 1.037, + "step": 6459 + }, + { + "epoch": 0.49799568300955904, + "grad_norm": 3.672584295272827, + "learning_rate": 5.275766841053136e-06, + "loss": 1.0282, + "step": 6460 + }, + { + "epoch": 0.498072772124576, + "grad_norm": 3.7023403644561768, + "learning_rate": 5.2745202854905946e-06, + "loss": 0.9883, + "step": 6461 + }, + { + "epoch": 0.498149861239593, + "grad_norm": 3.654310941696167, + "learning_rate": 5.273273712813135e-06, + "loss": 0.9355, + "step": 6462 + }, + { + "epoch": 0.49822695035460995, + "grad_norm": 3.7347729206085205, + "learning_rate": 5.272027123098475e-06, + "loss": 1.0296, + "step": 6463 + }, + { + "epoch": 0.49830403946962687, + "grad_norm": 3.6972110271453857, + "learning_rate": 5.2707805164243335e-06, + "loss": 0.8939, + "step": 6464 + }, + { + "epoch": 0.49838112858464384, + "grad_norm": 3.7917075157165527, + "learning_rate": 5.269533892868428e-06, + "loss": 0.959, + "step": 6465 + }, + { + "epoch": 0.4984582176996608, + "grad_norm": 3.710469961166382, + "learning_rate": 5.268287252508481e-06, + "loss": 1.0708, + "step": 6466 + }, + { + "epoch": 0.4985353068146778, + "grad_norm": 3.4452409744262695, + "learning_rate": 5.2670405954222134e-06, + "loss": 0.9258, + "step": 6467 + }, + { + "epoch": 0.49861239592969475, + "grad_norm": 3.4874088764190674, + "learning_rate": 5.265793921687347e-06, + "loss": 0.9412, + "step": 6468 + }, + { + "epoch": 0.49868948504471167, + "grad_norm": 3.5041942596435547, + "learning_rate": 5.264547231381606e-06, + "loss": 0.8923, + "step": 6469 + }, + { + "epoch": 0.49876657415972864, + "grad_norm": 3.852966785430908, + "learning_rate": 5.263300524582717e-06, + "loss": 1.0018, + "step": 6470 + }, + { + "epoch": 0.4988436632747456, + "grad_norm": 3.813434600830078, + "learning_rate": 5.2620538013684005e-06, + "loss": 1.0497, + "step": 6471 + }, + { + "epoch": 0.4989207523897626, + "grad_norm": 3.9356420040130615, + "learning_rate": 5.26080706181639e-06, + "loss": 0.9555, + "step": 6472 + }, + { + "epoch": 0.49899784150477955, + "grad_norm": 3.891258716583252, + "learning_rate": 5.259560306004409e-06, + "loss": 0.9037, + "step": 6473 + }, + { + "epoch": 0.49907493061979646, + "grad_norm": 3.454547643661499, + "learning_rate": 5.258313534010187e-06, + "loss": 0.8702, + "step": 6474 + }, + { + "epoch": 0.49915201973481343, + "grad_norm": 3.695034980773926, + "learning_rate": 5.257066745911454e-06, + "loss": 0.8333, + "step": 6475 + }, + { + "epoch": 0.4992291088498304, + "grad_norm": 3.522186040878296, + "learning_rate": 5.2558199417859415e-06, + "loss": 0.948, + "step": 6476 + }, + { + "epoch": 0.4993061979648474, + "grad_norm": 3.7446117401123047, + "learning_rate": 5.25457312171138e-06, + "loss": 0.9535, + "step": 6477 + }, + { + "epoch": 0.49938328707986435, + "grad_norm": 3.8174216747283936, + "learning_rate": 5.253326285765502e-06, + "loss": 0.9085, + "step": 6478 + }, + { + "epoch": 0.49946037619488126, + "grad_norm": 3.598435640335083, + "learning_rate": 5.252079434026043e-06, + "loss": 0.9694, + "step": 6479 + }, + { + "epoch": 0.49953746530989823, + "grad_norm": 4.055591583251953, + "learning_rate": 5.250832566570736e-06, + "loss": 0.998, + "step": 6480 + }, + { + "epoch": 0.4996145544249152, + "grad_norm": 3.655466318130493, + "learning_rate": 5.2495856834773195e-06, + "loss": 0.8998, + "step": 6481 + }, + { + "epoch": 0.4996916435399322, + "grad_norm": 3.346543788909912, + "learning_rate": 5.248338784823526e-06, + "loss": 0.9427, + "step": 6482 + }, + { + "epoch": 0.49976873265494914, + "grad_norm": 3.703274965286255, + "learning_rate": 5.2470918706870975e-06, + "loss": 0.8559, + "step": 6483 + }, + { + "epoch": 0.49984582176996606, + "grad_norm": 3.716444492340088, + "learning_rate": 5.245844941145769e-06, + "loss": 1.017, + "step": 6484 + }, + { + "epoch": 0.49992291088498303, + "grad_norm": 3.966560125350952, + "learning_rate": 5.244597996277282e-06, + "loss": 0.8255, + "step": 6485 + }, + { + "epoch": 0.5, + "grad_norm": 3.69476318359375, + "learning_rate": 5.243351036159377e-06, + "loss": 0.8626, + "step": 6486 + }, + { + "epoch": 0.5000770891150169, + "grad_norm": 3.7048497200012207, + "learning_rate": 5.242104060869796e-06, + "loss": 0.9411, + "step": 6487 + }, + { + "epoch": 0.5001541782300339, + "grad_norm": 3.5426058769226074, + "learning_rate": 5.2408570704862795e-06, + "loss": 0.9875, + "step": 6488 + }, + { + "epoch": 0.5002312673450509, + "grad_norm": 3.579333782196045, + "learning_rate": 5.239610065086574e-06, + "loss": 1.0032, + "step": 6489 + }, + { + "epoch": 0.5003083564600679, + "grad_norm": 3.5092763900756836, + "learning_rate": 5.238363044748419e-06, + "loss": 0.8675, + "step": 6490 + }, + { + "epoch": 0.5003854455750848, + "grad_norm": 4.748165130615234, + "learning_rate": 5.237116009549565e-06, + "loss": 1.0171, + "step": 6491 + }, + { + "epoch": 0.5004625346901017, + "grad_norm": 3.549006938934326, + "learning_rate": 5.235868959567755e-06, + "loss": 0.968, + "step": 6492 + }, + { + "epoch": 0.5005396238051187, + "grad_norm": 4.191278457641602, + "learning_rate": 5.2346218948807345e-06, + "loss": 0.9313, + "step": 6493 + }, + { + "epoch": 0.5006167129201357, + "grad_norm": 3.531649589538574, + "learning_rate": 5.233374815566258e-06, + "loss": 0.8907, + "step": 6494 + }, + { + "epoch": 0.5006938020351527, + "grad_norm": 3.5904295444488525, + "learning_rate": 5.232127721702069e-06, + "loss": 0.9111, + "step": 6495 + }, + { + "epoch": 0.5007708911501696, + "grad_norm": 3.7466869354248047, + "learning_rate": 5.230880613365918e-06, + "loss": 0.9371, + "step": 6496 + }, + { + "epoch": 0.5008479802651865, + "grad_norm": 3.7517306804656982, + "learning_rate": 5.229633490635558e-06, + "loss": 1.0557, + "step": 6497 + }, + { + "epoch": 0.5009250693802035, + "grad_norm": 3.6371867656707764, + "learning_rate": 5.228386353588737e-06, + "loss": 1.0518, + "step": 6498 + }, + { + "epoch": 0.5010021584952205, + "grad_norm": 3.3489437103271484, + "learning_rate": 5.2271392023032115e-06, + "loss": 0.9757, + "step": 6499 + }, + { + "epoch": 0.5010792476102375, + "grad_norm": 3.531639575958252, + "learning_rate": 5.225892036856734e-06, + "loss": 0.9355, + "step": 6500 + }, + { + "epoch": 0.5011563367252544, + "grad_norm": 3.9964072704315186, + "learning_rate": 5.224644857327055e-06, + "loss": 0.9836, + "step": 6501 + }, + { + "epoch": 0.5012334258402713, + "grad_norm": 3.8607258796691895, + "learning_rate": 5.223397663791935e-06, + "loss": 1.0546, + "step": 6502 + }, + { + "epoch": 0.5013105149552883, + "grad_norm": 3.4289846420288086, + "learning_rate": 5.222150456329127e-06, + "loss": 0.8784, + "step": 6503 + }, + { + "epoch": 0.5013876040703052, + "grad_norm": 3.9486873149871826, + "learning_rate": 5.220903235016388e-06, + "loss": 1.0408, + "step": 6504 + }, + { + "epoch": 0.5014646931853223, + "grad_norm": 3.4474892616271973, + "learning_rate": 5.2196559999314765e-06, + "loss": 0.8746, + "step": 6505 + }, + { + "epoch": 0.5015417823003392, + "grad_norm": 3.749389171600342, + "learning_rate": 5.218408751152152e-06, + "loss": 1.0857, + "step": 6506 + }, + { + "epoch": 0.5016188714153561, + "grad_norm": 3.59549880027771, + "learning_rate": 5.217161488756172e-06, + "loss": 0.9574, + "step": 6507 + }, + { + "epoch": 0.5016959605303731, + "grad_norm": 3.458582878112793, + "learning_rate": 5.215914212821298e-06, + "loss": 0.9595, + "step": 6508 + }, + { + "epoch": 0.50177304964539, + "grad_norm": 3.547104835510254, + "learning_rate": 5.214666923425291e-06, + "loss": 0.9317, + "step": 6509 + }, + { + "epoch": 0.5018501387604071, + "grad_norm": 3.983787775039673, + "learning_rate": 5.213419620645914e-06, + "loss": 1.096, + "step": 6510 + }, + { + "epoch": 0.501927227875424, + "grad_norm": 3.5246825218200684, + "learning_rate": 5.212172304560928e-06, + "loss": 0.9323, + "step": 6511 + }, + { + "epoch": 0.5020043169904409, + "grad_norm": 3.895063877105713, + "learning_rate": 5.2109249752480985e-06, + "loss": 0.9427, + "step": 6512 + }, + { + "epoch": 0.5020814061054579, + "grad_norm": 4.591753005981445, + "learning_rate": 5.20967763278519e-06, + "loss": 1.0114, + "step": 6513 + }, + { + "epoch": 0.5021584952204748, + "grad_norm": 4.11314582824707, + "learning_rate": 5.208430277249965e-06, + "loss": 1.0685, + "step": 6514 + }, + { + "epoch": 0.5022355843354919, + "grad_norm": 3.453099012374878, + "learning_rate": 5.207182908720192e-06, + "loss": 0.9276, + "step": 6515 + }, + { + "epoch": 0.5023126734505088, + "grad_norm": 3.538799285888672, + "learning_rate": 5.205935527273638e-06, + "loss": 0.9903, + "step": 6516 + }, + { + "epoch": 0.5023897625655257, + "grad_norm": 3.893820285797119, + "learning_rate": 5.204688132988071e-06, + "loss": 0.8881, + "step": 6517 + }, + { + "epoch": 0.5024668516805427, + "grad_norm": 4.181878089904785, + "learning_rate": 5.203440725941259e-06, + "loss": 0.9407, + "step": 6518 + }, + { + "epoch": 0.5025439407955596, + "grad_norm": 3.6423377990722656, + "learning_rate": 5.2021933062109705e-06, + "loss": 1.031, + "step": 6519 + }, + { + "epoch": 0.5026210299105767, + "grad_norm": 3.723810911178589, + "learning_rate": 5.200945873874979e-06, + "loss": 1.0118, + "step": 6520 + }, + { + "epoch": 0.5026981190255936, + "grad_norm": 3.465095043182373, + "learning_rate": 5.19969842901105e-06, + "loss": 0.942, + "step": 6521 + }, + { + "epoch": 0.5027752081406105, + "grad_norm": 3.6051387786865234, + "learning_rate": 5.198450971696959e-06, + "loss": 0.8803, + "step": 6522 + }, + { + "epoch": 0.5028522972556275, + "grad_norm": 4.09804630279541, + "learning_rate": 5.197203502010478e-06, + "loss": 0.8432, + "step": 6523 + }, + { + "epoch": 0.5029293863706444, + "grad_norm": 4.076051712036133, + "learning_rate": 5.19595602002938e-06, + "loss": 0.9762, + "step": 6524 + }, + { + "epoch": 0.5030064754856615, + "grad_norm": 3.7992618083953857, + "learning_rate": 5.194708525831439e-06, + "loss": 0.8912, + "step": 6525 + }, + { + "epoch": 0.5030835646006784, + "grad_norm": 3.59840989112854, + "learning_rate": 5.1934610194944306e-06, + "loss": 0.874, + "step": 6526 + }, + { + "epoch": 0.5031606537156953, + "grad_norm": 3.5978050231933594, + "learning_rate": 5.192213501096129e-06, + "loss": 0.9612, + "step": 6527 + }, + { + "epoch": 0.5032377428307123, + "grad_norm": 3.7155649662017822, + "learning_rate": 5.1909659707143105e-06, + "loss": 1.042, + "step": 6528 + }, + { + "epoch": 0.5033148319457292, + "grad_norm": 3.7677228450775146, + "learning_rate": 5.189718428426753e-06, + "loss": 1.0446, + "step": 6529 + }, + { + "epoch": 0.5033919210607463, + "grad_norm": 3.759453773498535, + "learning_rate": 5.188470874311234e-06, + "loss": 1.0825, + "step": 6530 + }, + { + "epoch": 0.5034690101757632, + "grad_norm": 3.622924327850342, + "learning_rate": 5.187223308445534e-06, + "loss": 0.9703, + "step": 6531 + }, + { + "epoch": 0.5035460992907801, + "grad_norm": 3.712109088897705, + "learning_rate": 5.185975730907428e-06, + "loss": 1.0215, + "step": 6532 + }, + { + "epoch": 0.5036231884057971, + "grad_norm": 3.5897741317749023, + "learning_rate": 5.184728141774699e-06, + "loss": 0.919, + "step": 6533 + }, + { + "epoch": 0.503700277520814, + "grad_norm": 3.6697773933410645, + "learning_rate": 5.183480541125128e-06, + "loss": 1.0769, + "step": 6534 + }, + { + "epoch": 0.5037773666358311, + "grad_norm": 3.3042874336242676, + "learning_rate": 5.182232929036495e-06, + "loss": 0.9561, + "step": 6535 + }, + { + "epoch": 0.503854455750848, + "grad_norm": 3.4754981994628906, + "learning_rate": 5.180985305586581e-06, + "loss": 0.9105, + "step": 6536 + }, + { + "epoch": 0.5039315448658649, + "grad_norm": 3.574561834335327, + "learning_rate": 5.179737670853173e-06, + "loss": 1.0152, + "step": 6537 + }, + { + "epoch": 0.5040086339808819, + "grad_norm": 3.8162965774536133, + "learning_rate": 5.17849002491405e-06, + "loss": 0.8418, + "step": 6538 + }, + { + "epoch": 0.5040857230958988, + "grad_norm": 3.736490249633789, + "learning_rate": 5.177242367846999e-06, + "loss": 1.0162, + "step": 6539 + }, + { + "epoch": 0.5041628122109159, + "grad_norm": 3.601933240890503, + "learning_rate": 5.175994699729806e-06, + "loss": 0.9982, + "step": 6540 + }, + { + "epoch": 0.5042399013259328, + "grad_norm": 4.0474443435668945, + "learning_rate": 5.174747020640253e-06, + "loss": 1.0384, + "step": 6541 + }, + { + "epoch": 0.5043169904409497, + "grad_norm": 3.8889949321746826, + "learning_rate": 5.17349933065613e-06, + "loss": 0.8445, + "step": 6542 + }, + { + "epoch": 0.5043940795559667, + "grad_norm": 3.7362418174743652, + "learning_rate": 5.1722516298552206e-06, + "loss": 0.9535, + "step": 6543 + }, + { + "epoch": 0.5044711686709836, + "grad_norm": 3.572842597961426, + "learning_rate": 5.171003918315316e-06, + "loss": 0.9203, + "step": 6544 + }, + { + "epoch": 0.5045482577860007, + "grad_norm": 3.86114501953125, + "learning_rate": 5.169756196114202e-06, + "loss": 0.9648, + "step": 6545 + }, + { + "epoch": 0.5046253469010176, + "grad_norm": 3.9800467491149902, + "learning_rate": 5.1685084633296665e-06, + "loss": 0.9466, + "step": 6546 + }, + { + "epoch": 0.5047024360160345, + "grad_norm": 3.3460676670074463, + "learning_rate": 5.167260720039504e-06, + "loss": 0.8801, + "step": 6547 + }, + { + "epoch": 0.5047795251310515, + "grad_norm": 3.8302457332611084, + "learning_rate": 5.1660129663215e-06, + "loss": 1.0093, + "step": 6548 + }, + { + "epoch": 0.5048566142460684, + "grad_norm": 4.078864574432373, + "learning_rate": 5.164765202253448e-06, + "loss": 1.0053, + "step": 6549 + }, + { + "epoch": 0.5049337033610855, + "grad_norm": 3.9565751552581787, + "learning_rate": 5.163517427913139e-06, + "loss": 1.0029, + "step": 6550 + }, + { + "epoch": 0.5050107924761024, + "grad_norm": 3.460613250732422, + "learning_rate": 5.162269643378365e-06, + "loss": 0.9372, + "step": 6551 + }, + { + "epoch": 0.5050878815911193, + "grad_norm": 3.854912757873535, + "learning_rate": 5.161021848726919e-06, + "loss": 0.9244, + "step": 6552 + }, + { + "epoch": 0.5051649707061363, + "grad_norm": 3.741788148880005, + "learning_rate": 5.159774044036595e-06, + "loss": 0.9823, + "step": 6553 + }, + { + "epoch": 0.5052420598211532, + "grad_norm": 3.4191572666168213, + "learning_rate": 5.1585262293851865e-06, + "loss": 0.9462, + "step": 6554 + }, + { + "epoch": 0.5053191489361702, + "grad_norm": 3.4478116035461426, + "learning_rate": 5.1572784048504894e-06, + "loss": 0.9788, + "step": 6555 + }, + { + "epoch": 0.5053962380511872, + "grad_norm": 3.257162094116211, + "learning_rate": 5.156030570510298e-06, + "loss": 0.8265, + "step": 6556 + }, + { + "epoch": 0.5054733271662041, + "grad_norm": 3.8727734088897705, + "learning_rate": 5.154782726442409e-06, + "loss": 0.9721, + "step": 6557 + }, + { + "epoch": 0.5055504162812211, + "grad_norm": 3.7539451122283936, + "learning_rate": 5.153534872724618e-06, + "loss": 0.9972, + "step": 6558 + }, + { + "epoch": 0.505627505396238, + "grad_norm": 3.6078336238861084, + "learning_rate": 5.152287009434723e-06, + "loss": 1.0208, + "step": 6559 + }, + { + "epoch": 0.505704594511255, + "grad_norm": 3.6347334384918213, + "learning_rate": 5.1510391366505204e-06, + "loss": 0.9033, + "step": 6560 + }, + { + "epoch": 0.505781683626272, + "grad_norm": 3.9092540740966797, + "learning_rate": 5.149791254449812e-06, + "loss": 0.9415, + "step": 6561 + }, + { + "epoch": 0.5058587727412889, + "grad_norm": 3.516451597213745, + "learning_rate": 5.148543362910393e-06, + "loss": 1.0037, + "step": 6562 + }, + { + "epoch": 0.5059358618563059, + "grad_norm": 4.396605968475342, + "learning_rate": 5.147295462110066e-06, + "loss": 1.0104, + "step": 6563 + }, + { + "epoch": 0.5060129509713228, + "grad_norm": 4.63103723526001, + "learning_rate": 5.14604755212663e-06, + "loss": 1.0168, + "step": 6564 + }, + { + "epoch": 0.5060900400863398, + "grad_norm": 3.5548949241638184, + "learning_rate": 5.144799633037884e-06, + "loss": 0.8663, + "step": 6565 + }, + { + "epoch": 0.5061671292013568, + "grad_norm": 3.333975315093994, + "learning_rate": 5.143551704921632e-06, + "loss": 0.8946, + "step": 6566 + }, + { + "epoch": 0.5062442183163737, + "grad_norm": 4.707441806793213, + "learning_rate": 5.142303767855674e-06, + "loss": 0.9074, + "step": 6567 + }, + { + "epoch": 0.5063213074313907, + "grad_norm": 3.944218397140503, + "learning_rate": 5.141055821917814e-06, + "loss": 0.9857, + "step": 6568 + }, + { + "epoch": 0.5063983965464076, + "grad_norm": 3.700016975402832, + "learning_rate": 5.139807867185853e-06, + "loss": 0.9898, + "step": 6569 + }, + { + "epoch": 0.5064754856614246, + "grad_norm": 3.7743771076202393, + "learning_rate": 5.138559903737596e-06, + "loss": 0.9928, + "step": 6570 + }, + { + "epoch": 0.5065525747764416, + "grad_norm": 3.386918067932129, + "learning_rate": 5.137311931650847e-06, + "loss": 0.9561, + "step": 6571 + }, + { + "epoch": 0.5066296638914586, + "grad_norm": 3.41721510887146, + "learning_rate": 5.136063951003409e-06, + "loss": 0.9306, + "step": 6572 + }, + { + "epoch": 0.5067067530064755, + "grad_norm": 3.741901397705078, + "learning_rate": 5.134815961873089e-06, + "loss": 1.0163, + "step": 6573 + }, + { + "epoch": 0.5067838421214924, + "grad_norm": 3.600267171859741, + "learning_rate": 5.133567964337693e-06, + "loss": 1.0534, + "step": 6574 + }, + { + "epoch": 0.5068609312365094, + "grad_norm": 3.748769760131836, + "learning_rate": 5.132319958475025e-06, + "loss": 0.8298, + "step": 6575 + }, + { + "epoch": 0.5069380203515264, + "grad_norm": 3.435581684112549, + "learning_rate": 5.131071944362893e-06, + "loss": 0.932, + "step": 6576 + }, + { + "epoch": 0.5070151094665434, + "grad_norm": 3.6183292865753174, + "learning_rate": 5.129823922079105e-06, + "loss": 0.9721, + "step": 6577 + }, + { + "epoch": 0.5070921985815603, + "grad_norm": 3.3050825595855713, + "learning_rate": 5.128575891701467e-06, + "loss": 0.9811, + "step": 6578 + }, + { + "epoch": 0.5071692876965772, + "grad_norm": 3.783656120300293, + "learning_rate": 5.127327853307788e-06, + "loss": 0.8849, + "step": 6579 + }, + { + "epoch": 0.5072463768115942, + "grad_norm": 3.649453639984131, + "learning_rate": 5.126079806975877e-06, + "loss": 1.0261, + "step": 6580 + }, + { + "epoch": 0.5073234659266112, + "grad_norm": 3.476015567779541, + "learning_rate": 5.124831752783543e-06, + "loss": 0.9128, + "step": 6581 + }, + { + "epoch": 0.5074005550416282, + "grad_norm": 3.546862840652466, + "learning_rate": 5.123583690808596e-06, + "loss": 1.0003, + "step": 6582 + }, + { + "epoch": 0.5074776441566451, + "grad_norm": 3.592933177947998, + "learning_rate": 5.122335621128844e-06, + "loss": 0.9616, + "step": 6583 + }, + { + "epoch": 0.507554733271662, + "grad_norm": 3.251722812652588, + "learning_rate": 5.121087543822103e-06, + "loss": 0.9076, + "step": 6584 + }, + { + "epoch": 0.507631822386679, + "grad_norm": 3.8498170375823975, + "learning_rate": 5.119839458966179e-06, + "loss": 0.9246, + "step": 6585 + }, + { + "epoch": 0.507708911501696, + "grad_norm": 3.848869800567627, + "learning_rate": 5.118591366638885e-06, + "loss": 0.9279, + "step": 6586 + }, + { + "epoch": 0.507786000616713, + "grad_norm": 3.7381491661071777, + "learning_rate": 5.117343266918035e-06, + "loss": 0.9364, + "step": 6587 + }, + { + "epoch": 0.5078630897317299, + "grad_norm": 3.8815040588378906, + "learning_rate": 5.116095159881438e-06, + "loss": 0.9837, + "step": 6588 + }, + { + "epoch": 0.5079401788467468, + "grad_norm": 3.340867519378662, + "learning_rate": 5.11484704560691e-06, + "loss": 0.9044, + "step": 6589 + }, + { + "epoch": 0.5080172679617638, + "grad_norm": 3.5740439891815186, + "learning_rate": 5.113598924172264e-06, + "loss": 0.9594, + "step": 6590 + }, + { + "epoch": 0.5080943570767807, + "grad_norm": 3.547769784927368, + "learning_rate": 5.112350795655313e-06, + "loss": 0.8794, + "step": 6591 + }, + { + "epoch": 0.5081714461917978, + "grad_norm": 3.2799148559570312, + "learning_rate": 5.1111026601338735e-06, + "loss": 0.9809, + "step": 6592 + }, + { + "epoch": 0.5082485353068147, + "grad_norm": 3.7307305335998535, + "learning_rate": 5.109854517685756e-06, + "loss": 0.9173, + "step": 6593 + }, + { + "epoch": 0.5083256244218316, + "grad_norm": 3.4130656719207764, + "learning_rate": 5.108606368388779e-06, + "loss": 0.9393, + "step": 6594 + }, + { + "epoch": 0.5084027135368486, + "grad_norm": 3.4056098461151123, + "learning_rate": 5.107358212320758e-06, + "loss": 0.958, + "step": 6595 + }, + { + "epoch": 0.5084798026518655, + "grad_norm": 3.6553897857666016, + "learning_rate": 5.106110049559507e-06, + "loss": 0.9729, + "step": 6596 + }, + { + "epoch": 0.5085568917668826, + "grad_norm": 4.16010046005249, + "learning_rate": 5.1048618801828454e-06, + "loss": 0.9938, + "step": 6597 + }, + { + "epoch": 0.5086339808818995, + "grad_norm": 3.6851933002471924, + "learning_rate": 5.1036137042685885e-06, + "loss": 0.9174, + "step": 6598 + }, + { + "epoch": 0.5087110699969164, + "grad_norm": 3.8315787315368652, + "learning_rate": 5.1023655218945534e-06, + "loss": 0.9586, + "step": 6599 + }, + { + "epoch": 0.5087881591119334, + "grad_norm": 3.3780720233917236, + "learning_rate": 5.101117333138558e-06, + "loss": 0.891, + "step": 6600 + }, + { + "epoch": 0.5088652482269503, + "grad_norm": 3.6677708625793457, + "learning_rate": 5.099869138078421e-06, + "loss": 0.924, + "step": 6601 + }, + { + "epoch": 0.5089423373419674, + "grad_norm": 3.382268190383911, + "learning_rate": 5.09862093679196e-06, + "loss": 0.9927, + "step": 6602 + }, + { + "epoch": 0.5090194264569843, + "grad_norm": 3.3687045574188232, + "learning_rate": 5.097372729356997e-06, + "loss": 0.9537, + "step": 6603 + }, + { + "epoch": 0.5090965155720012, + "grad_norm": 3.567380905151367, + "learning_rate": 5.096124515851344e-06, + "loss": 0.9901, + "step": 6604 + }, + { + "epoch": 0.5091736046870182, + "grad_norm": 3.5415704250335693, + "learning_rate": 5.094876296352829e-06, + "loss": 1.0561, + "step": 6605 + }, + { + "epoch": 0.5092506938020351, + "grad_norm": 4.186346530914307, + "learning_rate": 5.093628070939266e-06, + "loss": 1.0447, + "step": 6606 + }, + { + "epoch": 0.5093277829170522, + "grad_norm": 3.475534677505493, + "learning_rate": 5.0923798396884785e-06, + "loss": 0.9391, + "step": 6607 + }, + { + "epoch": 0.5094048720320691, + "grad_norm": 3.3865292072296143, + "learning_rate": 5.0911316026782865e-06, + "loss": 1.018, + "step": 6608 + }, + { + "epoch": 0.509481961147086, + "grad_norm": 3.4238085746765137, + "learning_rate": 5.089883359986512e-06, + "loss": 0.9044, + "step": 6609 + }, + { + "epoch": 0.509559050262103, + "grad_norm": 3.9409427642822266, + "learning_rate": 5.088635111690974e-06, + "loss": 0.9538, + "step": 6610 + }, + { + "epoch": 0.5096361393771199, + "grad_norm": 3.5488216876983643, + "learning_rate": 5.087386857869496e-06, + "loss": 0.9652, + "step": 6611 + }, + { + "epoch": 0.509713228492137, + "grad_norm": 3.679537534713745, + "learning_rate": 5.086138598599901e-06, + "loss": 0.9817, + "step": 6612 + }, + { + "epoch": 0.5097903176071539, + "grad_norm": 3.6628053188323975, + "learning_rate": 5.08489033396001e-06, + "loss": 1.0088, + "step": 6613 + }, + { + "epoch": 0.5098674067221708, + "grad_norm": 3.9006776809692383, + "learning_rate": 5.083642064027646e-06, + "loss": 0.9371, + "step": 6614 + }, + { + "epoch": 0.5099444958371878, + "grad_norm": 3.7066752910614014, + "learning_rate": 5.082393788880633e-06, + "loss": 0.8783, + "step": 6615 + }, + { + "epoch": 0.5100215849522047, + "grad_norm": 3.3197736740112305, + "learning_rate": 5.081145508596794e-06, + "loss": 0.9278, + "step": 6616 + }, + { + "epoch": 0.5100986740672218, + "grad_norm": 3.691556930541992, + "learning_rate": 5.079897223253953e-06, + "loss": 1.0941, + "step": 6617 + }, + { + "epoch": 0.5101757631822387, + "grad_norm": 3.5036165714263916, + "learning_rate": 5.078648932929933e-06, + "loss": 0.903, + "step": 6618 + }, + { + "epoch": 0.5102528522972556, + "grad_norm": 3.766944408416748, + "learning_rate": 5.077400637702561e-06, + "loss": 0.8997, + "step": 6619 + }, + { + "epoch": 0.5103299414122726, + "grad_norm": 3.8213517665863037, + "learning_rate": 5.076152337649658e-06, + "loss": 0.955, + "step": 6620 + }, + { + "epoch": 0.5104070305272895, + "grad_norm": 3.9807519912719727, + "learning_rate": 5.074904032849052e-06, + "loss": 0.9852, + "step": 6621 + }, + { + "epoch": 0.5104841196423066, + "grad_norm": 3.3743574619293213, + "learning_rate": 5.0736557233785685e-06, + "loss": 0.8994, + "step": 6622 + }, + { + "epoch": 0.5105612087573235, + "grad_norm": 3.691159963607788, + "learning_rate": 5.072407409316031e-06, + "loss": 1.0397, + "step": 6623 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 3.5302302837371826, + "learning_rate": 5.071159090739266e-06, + "loss": 0.8242, + "step": 6624 + }, + { + "epoch": 0.5107153869873574, + "grad_norm": 4.0092926025390625, + "learning_rate": 5.069910767726103e-06, + "loss": 0.9717, + "step": 6625 + }, + { + "epoch": 0.5107924761023743, + "grad_norm": 4.246231555938721, + "learning_rate": 5.068662440354362e-06, + "loss": 1.1298, + "step": 6626 + }, + { + "epoch": 0.5108695652173914, + "grad_norm": 3.4625141620635986, + "learning_rate": 5.067414108701876e-06, + "loss": 0.9316, + "step": 6627 + }, + { + "epoch": 0.5109466543324083, + "grad_norm": 3.577906847000122, + "learning_rate": 5.066165772846468e-06, + "loss": 0.9469, + "step": 6628 + }, + { + "epoch": 0.5110237434474252, + "grad_norm": 3.831422805786133, + "learning_rate": 5.064917432865968e-06, + "loss": 0.9549, + "step": 6629 + }, + { + "epoch": 0.5111008325624422, + "grad_norm": 3.276904344558716, + "learning_rate": 5.063669088838201e-06, + "loss": 0.8982, + "step": 6630 + }, + { + "epoch": 0.5111779216774591, + "grad_norm": 3.7430529594421387, + "learning_rate": 5.0624207408409964e-06, + "loss": 0.9587, + "step": 6631 + }, + { + "epoch": 0.5112550107924761, + "grad_norm": 3.5662450790405273, + "learning_rate": 5.061172388952184e-06, + "loss": 0.8223, + "step": 6632 + }, + { + "epoch": 0.5113320999074931, + "grad_norm": 4.065229892730713, + "learning_rate": 5.059924033249587e-06, + "loss": 0.9091, + "step": 6633 + }, + { + "epoch": 0.51140918902251, + "grad_norm": 3.51495623588562, + "learning_rate": 5.058675673811038e-06, + "loss": 0.9732, + "step": 6634 + }, + { + "epoch": 0.511486278137527, + "grad_norm": 3.6267058849334717, + "learning_rate": 5.057427310714366e-06, + "loss": 0.9541, + "step": 6635 + }, + { + "epoch": 0.5115633672525439, + "grad_norm": 3.579072952270508, + "learning_rate": 5.056178944037396e-06, + "loss": 0.9344, + "step": 6636 + }, + { + "epoch": 0.511640456367561, + "grad_norm": 3.7198750972747803, + "learning_rate": 5.054930573857961e-06, + "loss": 0.9487, + "step": 6637 + }, + { + "epoch": 0.5117175454825779, + "grad_norm": 3.6153461933135986, + "learning_rate": 5.05368220025389e-06, + "loss": 0.9901, + "step": 6638 + }, + { + "epoch": 0.5117946345975948, + "grad_norm": 3.899430513381958, + "learning_rate": 5.052433823303011e-06, + "loss": 0.9722, + "step": 6639 + }, + { + "epoch": 0.5118717237126118, + "grad_norm": 3.687140703201294, + "learning_rate": 5.0511854430831564e-06, + "loss": 0.9444, + "step": 6640 + }, + { + "epoch": 0.5119488128276287, + "grad_norm": 3.4240894317626953, + "learning_rate": 5.0499370596721516e-06, + "loss": 0.9509, + "step": 6641 + }, + { + "epoch": 0.5120259019426457, + "grad_norm": 3.548834800720215, + "learning_rate": 5.048688673147831e-06, + "loss": 0.8858, + "step": 6642 + }, + { + "epoch": 0.5121029910576627, + "grad_norm": 3.7235825061798096, + "learning_rate": 5.047440283588025e-06, + "loss": 0.899, + "step": 6643 + }, + { + "epoch": 0.5121800801726796, + "grad_norm": 3.849551200866699, + "learning_rate": 5.0461918910705625e-06, + "loss": 0.9537, + "step": 6644 + }, + { + "epoch": 0.5122571692876966, + "grad_norm": 3.7150797843933105, + "learning_rate": 5.044943495673273e-06, + "loss": 0.9883, + "step": 6645 + }, + { + "epoch": 0.5123342584027135, + "grad_norm": 4.125245571136475, + "learning_rate": 5.043695097473992e-06, + "loss": 0.7933, + "step": 6646 + }, + { + "epoch": 0.5124113475177305, + "grad_norm": 3.6767773628234863, + "learning_rate": 5.042446696550546e-06, + "loss": 0.8519, + "step": 6647 + }, + { + "epoch": 0.5124884366327475, + "grad_norm": 3.659404754638672, + "learning_rate": 5.041198292980768e-06, + "loss": 0.8724, + "step": 6648 + }, + { + "epoch": 0.5125655257477644, + "grad_norm": 3.8711514472961426, + "learning_rate": 5.03994988684249e-06, + "loss": 0.8896, + "step": 6649 + }, + { + "epoch": 0.5126426148627814, + "grad_norm": 4.328100681304932, + "learning_rate": 5.038701478213545e-06, + "loss": 0.8541, + "step": 6650 + }, + { + "epoch": 0.5127197039777983, + "grad_norm": 4.055181503295898, + "learning_rate": 5.037453067171763e-06, + "loss": 0.9379, + "step": 6651 + }, + { + "epoch": 0.5127967930928153, + "grad_norm": 3.8188493251800537, + "learning_rate": 5.036204653794975e-06, + "loss": 1.0316, + "step": 6652 + }, + { + "epoch": 0.5128738822078323, + "grad_norm": 3.723499059677124, + "learning_rate": 5.034956238161015e-06, + "loss": 1.0024, + "step": 6653 + }, + { + "epoch": 0.5129509713228492, + "grad_norm": 3.589024782180786, + "learning_rate": 5.033707820347715e-06, + "loss": 0.9118, + "step": 6654 + }, + { + "epoch": 0.5130280604378662, + "grad_norm": 4.008199691772461, + "learning_rate": 5.0324594004329065e-06, + "loss": 0.956, + "step": 6655 + }, + { + "epoch": 0.5131051495528831, + "grad_norm": 3.509976387023926, + "learning_rate": 5.031210978494423e-06, + "loss": 0.9386, + "step": 6656 + }, + { + "epoch": 0.5131822386679001, + "grad_norm": 3.630739212036133, + "learning_rate": 5.029962554610096e-06, + "loss": 0.9486, + "step": 6657 + }, + { + "epoch": 0.513259327782917, + "grad_norm": 3.7410812377929688, + "learning_rate": 5.0287141288577575e-06, + "loss": 0.933, + "step": 6658 + }, + { + "epoch": 0.513336416897934, + "grad_norm": 3.538902997970581, + "learning_rate": 5.027465701315245e-06, + "loss": 0.9184, + "step": 6659 + }, + { + "epoch": 0.513413506012951, + "grad_norm": 3.678654670715332, + "learning_rate": 5.026217272060386e-06, + "loss": 0.8957, + "step": 6660 + }, + { + "epoch": 0.5134905951279679, + "grad_norm": 3.5913567543029785, + "learning_rate": 5.024968841171016e-06, + "loss": 1.0199, + "step": 6661 + }, + { + "epoch": 0.5135676842429849, + "grad_norm": 3.574263334274292, + "learning_rate": 5.023720408724971e-06, + "loss": 0.9235, + "step": 6662 + }, + { + "epoch": 0.5136447733580018, + "grad_norm": 4.819427490234375, + "learning_rate": 5.022471974800077e-06, + "loss": 0.8857, + "step": 6663 + }, + { + "epoch": 0.5137218624730188, + "grad_norm": 3.4264743328094482, + "learning_rate": 5.021223539474175e-06, + "loss": 0.9493, + "step": 6664 + }, + { + "epoch": 0.5137989515880358, + "grad_norm": 4.047597885131836, + "learning_rate": 5.019975102825093e-06, + "loss": 0.9116, + "step": 6665 + }, + { + "epoch": 0.5138760407030527, + "grad_norm": 3.5010175704956055, + "learning_rate": 5.018726664930667e-06, + "loss": 0.9236, + "step": 6666 + }, + { + "epoch": 0.5139531298180697, + "grad_norm": 3.643869638442993, + "learning_rate": 5.017478225868733e-06, + "loss": 0.945, + "step": 6667 + }, + { + "epoch": 0.5140302189330866, + "grad_norm": 3.658156156539917, + "learning_rate": 5.0162297857171196e-06, + "loss": 0.8691, + "step": 6668 + }, + { + "epoch": 0.5141073080481036, + "grad_norm": 3.7067670822143555, + "learning_rate": 5.0149813445536634e-06, + "loss": 0.9861, + "step": 6669 + }, + { + "epoch": 0.5141843971631206, + "grad_norm": 3.7881758213043213, + "learning_rate": 5.0137329024561985e-06, + "loss": 0.9236, + "step": 6670 + }, + { + "epoch": 0.5142614862781375, + "grad_norm": 3.391571521759033, + "learning_rate": 5.012484459502558e-06, + "loss": 0.9774, + "step": 6671 + }, + { + "epoch": 0.5143385753931545, + "grad_norm": 3.8176701068878174, + "learning_rate": 5.011236015770577e-06, + "loss": 0.8668, + "step": 6672 + }, + { + "epoch": 0.5144156645081714, + "grad_norm": 3.639432191848755, + "learning_rate": 5.0099875713380875e-06, + "loss": 0.879, + "step": 6673 + }, + { + "epoch": 0.5144927536231884, + "grad_norm": 3.630164861679077, + "learning_rate": 5.008739126282924e-06, + "loss": 0.9275, + "step": 6674 + }, + { + "epoch": 0.5145698427382054, + "grad_norm": 3.4726369380950928, + "learning_rate": 5.007490680682924e-06, + "loss": 0.9402, + "step": 6675 + }, + { + "epoch": 0.5146469318532223, + "grad_norm": 4.002774715423584, + "learning_rate": 5.006242234615916e-06, + "loss": 0.9698, + "step": 6676 + }, + { + "epoch": 0.5147240209682393, + "grad_norm": 3.349799871444702, + "learning_rate": 5.004993788159739e-06, + "loss": 0.913, + "step": 6677 + }, + { + "epoch": 0.5148011100832562, + "grad_norm": 3.616485595703125, + "learning_rate": 5.003745341392225e-06, + "loss": 0.8706, + "step": 6678 + }, + { + "epoch": 0.5148781991982732, + "grad_norm": 3.556090831756592, + "learning_rate": 5.0024968943912055e-06, + "loss": 0.9315, + "step": 6679 + }, + { + "epoch": 0.5149552883132902, + "grad_norm": 3.7224857807159424, + "learning_rate": 5.001248447234521e-06, + "loss": 0.8919, + "step": 6680 + }, + { + "epoch": 0.5150323774283071, + "grad_norm": 3.654167890548706, + "learning_rate": 5e-06, + "loss": 0.8902, + "step": 6681 + }, + { + "epoch": 0.5151094665433241, + "grad_norm": 4.025610446929932, + "learning_rate": 4.99875155276548e-06, + "loss": 0.9718, + "step": 6682 + }, + { + "epoch": 0.515186555658341, + "grad_norm": 3.4189112186431885, + "learning_rate": 4.997503105608795e-06, + "loss": 0.9592, + "step": 6683 + }, + { + "epoch": 0.515263644773358, + "grad_norm": 3.5572469234466553, + "learning_rate": 4.996254658607778e-06, + "loss": 0.9258, + "step": 6684 + }, + { + "epoch": 0.515340733888375, + "grad_norm": 3.8865630626678467, + "learning_rate": 4.995006211840263e-06, + "loss": 0.9746, + "step": 6685 + }, + { + "epoch": 0.5154178230033919, + "grad_norm": 3.353334426879883, + "learning_rate": 4.993757765384085e-06, + "loss": 1.0067, + "step": 6686 + }, + { + "epoch": 0.5154949121184089, + "grad_norm": 3.3230345249176025, + "learning_rate": 4.992509319317078e-06, + "loss": 0.8983, + "step": 6687 + }, + { + "epoch": 0.5155720012334258, + "grad_norm": 3.8244221210479736, + "learning_rate": 4.991260873717077e-06, + "loss": 1.0921, + "step": 6688 + }, + { + "epoch": 0.5156490903484428, + "grad_norm": 3.557263135910034, + "learning_rate": 4.990012428661914e-06, + "loss": 1.0434, + "step": 6689 + }, + { + "epoch": 0.5157261794634598, + "grad_norm": 3.5607333183288574, + "learning_rate": 4.988763984229425e-06, + "loss": 1.0169, + "step": 6690 + }, + { + "epoch": 0.5158032685784767, + "grad_norm": 3.5967090129852295, + "learning_rate": 4.987515540497444e-06, + "loss": 1.0117, + "step": 6691 + }, + { + "epoch": 0.5158803576934937, + "grad_norm": 3.827690839767456, + "learning_rate": 4.986267097543803e-06, + "loss": 0.9017, + "step": 6692 + }, + { + "epoch": 0.5159574468085106, + "grad_norm": 3.67106294631958, + "learning_rate": 4.985018655446337e-06, + "loss": 0.9155, + "step": 6693 + }, + { + "epoch": 0.5160345359235275, + "grad_norm": 3.8216536045074463, + "learning_rate": 4.983770214282883e-06, + "loss": 0.9669, + "step": 6694 + }, + { + "epoch": 0.5161116250385446, + "grad_norm": 4.562589645385742, + "learning_rate": 4.9825217741312695e-06, + "loss": 0.9065, + "step": 6695 + }, + { + "epoch": 0.5161887141535615, + "grad_norm": 3.8225772380828857, + "learning_rate": 4.9812733350693335e-06, + "loss": 1.104, + "step": 6696 + }, + { + "epoch": 0.5162658032685785, + "grad_norm": 3.3218376636505127, + "learning_rate": 4.980024897174909e-06, + "loss": 0.8526, + "step": 6697 + }, + { + "epoch": 0.5163428923835954, + "grad_norm": 3.461397886276245, + "learning_rate": 4.978776460525827e-06, + "loss": 1.0473, + "step": 6698 + }, + { + "epoch": 0.5164199814986123, + "grad_norm": 4.374227523803711, + "learning_rate": 4.977528025199925e-06, + "loss": 1.0439, + "step": 6699 + }, + { + "epoch": 0.5164970706136294, + "grad_norm": 3.6719889640808105, + "learning_rate": 4.976279591275033e-06, + "loss": 0.9636, + "step": 6700 + }, + { + "epoch": 0.5165741597286463, + "grad_norm": 3.455756425857544, + "learning_rate": 4.975031158828985e-06, + "loss": 0.9633, + "step": 6701 + }, + { + "epoch": 0.5166512488436633, + "grad_norm": 3.604682445526123, + "learning_rate": 4.9737827279396165e-06, + "loss": 0.8877, + "step": 6702 + }, + { + "epoch": 0.5167283379586802, + "grad_norm": 4.052313327789307, + "learning_rate": 4.9725342986847575e-06, + "loss": 0.9772, + "step": 6703 + }, + { + "epoch": 0.5168054270736971, + "grad_norm": 3.812180757522583, + "learning_rate": 4.971285871142242e-06, + "loss": 0.9781, + "step": 6704 + }, + { + "epoch": 0.5168825161887142, + "grad_norm": 3.562392234802246, + "learning_rate": 4.9700374453899046e-06, + "loss": 0.918, + "step": 6705 + }, + { + "epoch": 0.5169596053037311, + "grad_norm": 3.3591978549957275, + "learning_rate": 4.968789021505578e-06, + "loss": 0.9033, + "step": 6706 + }, + { + "epoch": 0.5170366944187481, + "grad_norm": 3.2979397773742676, + "learning_rate": 4.967540599567094e-06, + "loss": 0.9739, + "step": 6707 + }, + { + "epoch": 0.517113783533765, + "grad_norm": 3.202559232711792, + "learning_rate": 4.9662921796522856e-06, + "loss": 0.7981, + "step": 6708 + }, + { + "epoch": 0.5171908726487819, + "grad_norm": 3.387112617492676, + "learning_rate": 4.965043761838986e-06, + "loss": 0.911, + "step": 6709 + }, + { + "epoch": 0.517267961763799, + "grad_norm": 3.931759834289551, + "learning_rate": 4.963795346205026e-06, + "loss": 1.0281, + "step": 6710 + }, + { + "epoch": 0.5173450508788159, + "grad_norm": 3.8739635944366455, + "learning_rate": 4.962546932828238e-06, + "loss": 1.0021, + "step": 6711 + }, + { + "epoch": 0.5174221399938329, + "grad_norm": 3.6043684482574463, + "learning_rate": 4.961298521786456e-06, + "loss": 0.9596, + "step": 6712 + }, + { + "epoch": 0.5174992291088498, + "grad_norm": 3.7522406578063965, + "learning_rate": 4.96005011315751e-06, + "loss": 0.9987, + "step": 6713 + }, + { + "epoch": 0.5175763182238667, + "grad_norm": 3.633314609527588, + "learning_rate": 4.958801707019234e-06, + "loss": 0.9147, + "step": 6714 + }, + { + "epoch": 0.5176534073388838, + "grad_norm": 3.7346205711364746, + "learning_rate": 4.957553303449456e-06, + "loss": 0.9952, + "step": 6715 + }, + { + "epoch": 0.5177304964539007, + "grad_norm": 3.443331003189087, + "learning_rate": 4.9563049025260105e-06, + "loss": 0.8854, + "step": 6716 + }, + { + "epoch": 0.5178075855689177, + "grad_norm": 3.816298723220825, + "learning_rate": 4.9550565043267285e-06, + "loss": 1.0634, + "step": 6717 + }, + { + "epoch": 0.5178846746839346, + "grad_norm": 3.5989532470703125, + "learning_rate": 4.95380810892944e-06, + "loss": 0.9299, + "step": 6718 + }, + { + "epoch": 0.5179617637989515, + "grad_norm": 3.5715746879577637, + "learning_rate": 4.952559716411976e-06, + "loss": 0.9129, + "step": 6719 + }, + { + "epoch": 0.5180388529139686, + "grad_norm": 3.993812322616577, + "learning_rate": 4.95131132685217e-06, + "loss": 0.9028, + "step": 6720 + }, + { + "epoch": 0.5181159420289855, + "grad_norm": 3.4312379360198975, + "learning_rate": 4.950062940327849e-06, + "loss": 1.039, + "step": 6721 + }, + { + "epoch": 0.5181930311440025, + "grad_norm": 3.6945204734802246, + "learning_rate": 4.948814556916845e-06, + "loss": 0.9212, + "step": 6722 + }, + { + "epoch": 0.5182701202590194, + "grad_norm": 3.5042591094970703, + "learning_rate": 4.94756617669699e-06, + "loss": 0.9631, + "step": 6723 + }, + { + "epoch": 0.5183472093740363, + "grad_norm": 3.6338624954223633, + "learning_rate": 4.946317799746111e-06, + "loss": 1.0292, + "step": 6724 + }, + { + "epoch": 0.5184242984890534, + "grad_norm": 3.668816566467285, + "learning_rate": 4.94506942614204e-06, + "loss": 0.9855, + "step": 6725 + }, + { + "epoch": 0.5185013876040703, + "grad_norm": 4.495701313018799, + "learning_rate": 4.9438210559626045e-06, + "loss": 0.9881, + "step": 6726 + }, + { + "epoch": 0.5185784767190873, + "grad_norm": 3.3641388416290283, + "learning_rate": 4.942572689285636e-06, + "loss": 0.9117, + "step": 6727 + }, + { + "epoch": 0.5186555658341042, + "grad_norm": 4.106291770935059, + "learning_rate": 4.941324326188963e-06, + "loss": 0.9193, + "step": 6728 + }, + { + "epoch": 0.5187326549491211, + "grad_norm": 3.9092233180999756, + "learning_rate": 4.940075966750414e-06, + "loss": 1.0645, + "step": 6729 + }, + { + "epoch": 0.5188097440641382, + "grad_norm": 4.035946846008301, + "learning_rate": 4.938827611047818e-06, + "loss": 1.0177, + "step": 6730 + }, + { + "epoch": 0.5188868331791551, + "grad_norm": 3.45704984664917, + "learning_rate": 4.937579259159004e-06, + "loss": 0.9508, + "step": 6731 + }, + { + "epoch": 0.5189639222941721, + "grad_norm": 3.8246238231658936, + "learning_rate": 4.9363309111618e-06, + "loss": 1.014, + "step": 6732 + }, + { + "epoch": 0.519041011409189, + "grad_norm": 3.2717719078063965, + "learning_rate": 4.935082567134033e-06, + "loss": 0.8324, + "step": 6733 + }, + { + "epoch": 0.5191181005242059, + "grad_norm": 3.4391167163848877, + "learning_rate": 4.933834227153533e-06, + "loss": 0.8627, + "step": 6734 + }, + { + "epoch": 0.519195189639223, + "grad_norm": 3.452841281890869, + "learning_rate": 4.9325858912981265e-06, + "loss": 1.0195, + "step": 6735 + }, + { + "epoch": 0.5192722787542399, + "grad_norm": 3.5897481441497803, + "learning_rate": 4.93133755964564e-06, + "loss": 0.8639, + "step": 6736 + }, + { + "epoch": 0.5193493678692569, + "grad_norm": 3.7452785968780518, + "learning_rate": 4.9300892322739e-06, + "loss": 0.9422, + "step": 6737 + }, + { + "epoch": 0.5194264569842738, + "grad_norm": 3.614225149154663, + "learning_rate": 4.928840909260735e-06, + "loss": 0.9332, + "step": 6738 + }, + { + "epoch": 0.5195035460992907, + "grad_norm": 3.500887870788574, + "learning_rate": 4.927592590683972e-06, + "loss": 0.9325, + "step": 6739 + }, + { + "epoch": 0.5195806352143078, + "grad_norm": 3.3096768856048584, + "learning_rate": 4.926344276621434e-06, + "loss": 0.8339, + "step": 6740 + }, + { + "epoch": 0.5196577243293247, + "grad_norm": 3.481545925140381, + "learning_rate": 4.925095967150949e-06, + "loss": 1.0082, + "step": 6741 + }, + { + "epoch": 0.5197348134443417, + "grad_norm": 3.6488254070281982, + "learning_rate": 4.923847662350344e-06, + "loss": 1.0195, + "step": 6742 + }, + { + "epoch": 0.5198119025593586, + "grad_norm": 3.6356041431427, + "learning_rate": 4.922599362297442e-06, + "loss": 0.9489, + "step": 6743 + }, + { + "epoch": 0.5198889916743756, + "grad_norm": 3.7208831310272217, + "learning_rate": 4.921351067070068e-06, + "loss": 0.9843, + "step": 6744 + }, + { + "epoch": 0.5199660807893925, + "grad_norm": 3.6727988719940186, + "learning_rate": 4.920102776746049e-06, + "loss": 0.9307, + "step": 6745 + }, + { + "epoch": 0.5200431699044095, + "grad_norm": 3.49745774269104, + "learning_rate": 4.918854491403208e-06, + "loss": 0.8772, + "step": 6746 + }, + { + "epoch": 0.5201202590194265, + "grad_norm": 3.6098008155822754, + "learning_rate": 4.91760621111937e-06, + "loss": 0.9431, + "step": 6747 + }, + { + "epoch": 0.5201973481344434, + "grad_norm": 3.857675552368164, + "learning_rate": 4.916357935972354e-06, + "loss": 0.9809, + "step": 6748 + }, + { + "epoch": 0.5202744372494604, + "grad_norm": 3.8687050342559814, + "learning_rate": 4.915109666039991e-06, + "loss": 1.0598, + "step": 6749 + }, + { + "epoch": 0.5203515263644773, + "grad_norm": 3.66666841506958, + "learning_rate": 4.9138614014001e-06, + "loss": 0.844, + "step": 6750 + }, + { + "epoch": 0.5204286154794943, + "grad_norm": 3.5708515644073486, + "learning_rate": 4.912613142130504e-06, + "loss": 1.031, + "step": 6751 + }, + { + "epoch": 0.5205057045945113, + "grad_norm": 3.5527634620666504, + "learning_rate": 4.911364888309027e-06, + "loss": 0.9202, + "step": 6752 + }, + { + "epoch": 0.5205827937095282, + "grad_norm": 3.893904447555542, + "learning_rate": 4.910116640013489e-06, + "loss": 0.8644, + "step": 6753 + }, + { + "epoch": 0.5206598828245452, + "grad_norm": 3.3729803562164307, + "learning_rate": 4.908868397321714e-06, + "loss": 0.848, + "step": 6754 + }, + { + "epoch": 0.5207369719395621, + "grad_norm": 3.5364203453063965, + "learning_rate": 4.907620160311522e-06, + "loss": 0.9323, + "step": 6755 + }, + { + "epoch": 0.5208140610545791, + "grad_norm": 3.7312333583831787, + "learning_rate": 4.906371929060734e-06, + "loss": 0.9715, + "step": 6756 + }, + { + "epoch": 0.5208911501695961, + "grad_norm": 3.8178627490997314, + "learning_rate": 4.905123703647173e-06, + "loss": 0.8637, + "step": 6757 + }, + { + "epoch": 0.520968239284613, + "grad_norm": 4.14602518081665, + "learning_rate": 4.903875484148656e-06, + "loss": 1.0387, + "step": 6758 + }, + { + "epoch": 0.52104532839963, + "grad_norm": 4.017736911773682, + "learning_rate": 4.902627270643006e-06, + "loss": 1.0268, + "step": 6759 + }, + { + "epoch": 0.5211224175146469, + "grad_norm": 3.643479347229004, + "learning_rate": 4.901379063208041e-06, + "loss": 1.0273, + "step": 6760 + }, + { + "epoch": 0.5211995066296639, + "grad_norm": 3.823141574859619, + "learning_rate": 4.90013086192158e-06, + "loss": 0.9976, + "step": 6761 + }, + { + "epoch": 0.5212765957446809, + "grad_norm": 3.5352299213409424, + "learning_rate": 4.898882666861444e-06, + "loss": 1.007, + "step": 6762 + }, + { + "epoch": 0.5213536848596978, + "grad_norm": 3.585045576095581, + "learning_rate": 4.897634478105448e-06, + "loss": 0.9049, + "step": 6763 + }, + { + "epoch": 0.5214307739747148, + "grad_norm": 3.5495879650115967, + "learning_rate": 4.896386295731413e-06, + "loss": 0.9165, + "step": 6764 + }, + { + "epoch": 0.5215078630897317, + "grad_norm": 3.4797205924987793, + "learning_rate": 4.895138119817156e-06, + "loss": 0.991, + "step": 6765 + }, + { + "epoch": 0.5215849522047487, + "grad_norm": 3.806225538253784, + "learning_rate": 4.8938899504404944e-06, + "loss": 0.867, + "step": 6766 + }, + { + "epoch": 0.5216620413197657, + "grad_norm": 3.340394973754883, + "learning_rate": 4.892641787679244e-06, + "loss": 0.8744, + "step": 6767 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 3.620096206665039, + "learning_rate": 4.891393631611223e-06, + "loss": 1.0585, + "step": 6768 + }, + { + "epoch": 0.5218162195497996, + "grad_norm": 3.8515403270721436, + "learning_rate": 4.8901454823142454e-06, + "loss": 1.0662, + "step": 6769 + }, + { + "epoch": 0.5218933086648165, + "grad_norm": 4.0987420082092285, + "learning_rate": 4.888897339866129e-06, + "loss": 1.0095, + "step": 6770 + }, + { + "epoch": 0.5219703977798335, + "grad_norm": 3.7818572521209717, + "learning_rate": 4.887649204344688e-06, + "loss": 0.9227, + "step": 6771 + }, + { + "epoch": 0.5220474868948505, + "grad_norm": 3.931004285812378, + "learning_rate": 4.886401075827737e-06, + "loss": 0.9917, + "step": 6772 + }, + { + "epoch": 0.5221245760098674, + "grad_norm": 3.2993662357330322, + "learning_rate": 4.8851529543930915e-06, + "loss": 0.7365, + "step": 6773 + }, + { + "epoch": 0.5222016651248844, + "grad_norm": 3.5455148220062256, + "learning_rate": 4.883904840118563e-06, + "loss": 0.9588, + "step": 6774 + }, + { + "epoch": 0.5222787542399013, + "grad_norm": 3.565535306930542, + "learning_rate": 4.882656733081967e-06, + "loss": 0.9395, + "step": 6775 + }, + { + "epoch": 0.5223558433549182, + "grad_norm": 4.117207050323486, + "learning_rate": 4.881408633361116e-06, + "loss": 1.1194, + "step": 6776 + }, + { + "epoch": 0.5224329324699353, + "grad_norm": 3.6578311920166016, + "learning_rate": 4.880160541033822e-06, + "loss": 0.9629, + "step": 6777 + }, + { + "epoch": 0.5225100215849522, + "grad_norm": 4.134820938110352, + "learning_rate": 4.878912456177898e-06, + "loss": 0.9406, + "step": 6778 + }, + { + "epoch": 0.5225871106999692, + "grad_norm": 3.462031841278076, + "learning_rate": 4.8776643788711565e-06, + "loss": 0.8621, + "step": 6779 + }, + { + "epoch": 0.5226641998149861, + "grad_norm": 3.252340793609619, + "learning_rate": 4.876416309191406e-06, + "loss": 0.9939, + "step": 6780 + }, + { + "epoch": 0.522741288930003, + "grad_norm": 3.3890349864959717, + "learning_rate": 4.8751682472164585e-06, + "loss": 0.9544, + "step": 6781 + }, + { + "epoch": 0.5228183780450201, + "grad_norm": 3.870720624923706, + "learning_rate": 4.873920193024125e-06, + "loss": 0.9931, + "step": 6782 + }, + { + "epoch": 0.522895467160037, + "grad_norm": 3.4139373302459717, + "learning_rate": 4.872672146692214e-06, + "loss": 0.8944, + "step": 6783 + }, + { + "epoch": 0.522972556275054, + "grad_norm": 4.145259380340576, + "learning_rate": 4.871424108298536e-06, + "loss": 1.0266, + "step": 6784 + }, + { + "epoch": 0.5230496453900709, + "grad_norm": 3.556744337081909, + "learning_rate": 4.8701760779208975e-06, + "loss": 0.8754, + "step": 6785 + }, + { + "epoch": 0.5231267345050878, + "grad_norm": 3.8212878704071045, + "learning_rate": 4.8689280556371084e-06, + "loss": 1.0068, + "step": 6786 + }, + { + "epoch": 0.5232038236201049, + "grad_norm": 4.2917399406433105, + "learning_rate": 4.867680041524977e-06, + "loss": 1.0567, + "step": 6787 + }, + { + "epoch": 0.5232809127351218, + "grad_norm": 3.264390230178833, + "learning_rate": 4.866432035662309e-06, + "loss": 0.9187, + "step": 6788 + }, + { + "epoch": 0.5233580018501388, + "grad_norm": 3.5728774070739746, + "learning_rate": 4.865184038126912e-06, + "loss": 0.9863, + "step": 6789 + }, + { + "epoch": 0.5234350909651557, + "grad_norm": 4.23891544342041, + "learning_rate": 4.863936048996593e-06, + "loss": 0.8834, + "step": 6790 + }, + { + "epoch": 0.5235121800801726, + "grad_norm": 3.400658130645752, + "learning_rate": 4.862688068349156e-06, + "loss": 0.8721, + "step": 6791 + }, + { + "epoch": 0.5235892691951897, + "grad_norm": 3.758239507675171, + "learning_rate": 4.861440096262404e-06, + "loss": 0.8671, + "step": 6792 + }, + { + "epoch": 0.5236663583102066, + "grad_norm": 3.4603350162506104, + "learning_rate": 4.860192132814148e-06, + "loss": 0.9106, + "step": 6793 + }, + { + "epoch": 0.5237434474252236, + "grad_norm": 3.423933506011963, + "learning_rate": 4.858944178082188e-06, + "loss": 0.9117, + "step": 6794 + }, + { + "epoch": 0.5238205365402405, + "grad_norm": 3.4197144508361816, + "learning_rate": 4.857696232144327e-06, + "loss": 0.9417, + "step": 6795 + }, + { + "epoch": 0.5238976256552574, + "grad_norm": 3.6277825832366943, + "learning_rate": 4.8564482950783685e-06, + "loss": 0.9014, + "step": 6796 + }, + { + "epoch": 0.5239747147702745, + "grad_norm": 3.313845157623291, + "learning_rate": 4.855200366962116e-06, + "loss": 1.0002, + "step": 6797 + }, + { + "epoch": 0.5240518038852914, + "grad_norm": 3.5861170291900635, + "learning_rate": 4.853952447873371e-06, + "loss": 0.9123, + "step": 6798 + }, + { + "epoch": 0.5241288930003084, + "grad_norm": 3.7218551635742188, + "learning_rate": 4.852704537889934e-06, + "loss": 0.988, + "step": 6799 + }, + { + "epoch": 0.5242059821153253, + "grad_norm": 3.6301844120025635, + "learning_rate": 4.851456637089607e-06, + "loss": 0.9492, + "step": 6800 + }, + { + "epoch": 0.5242830712303422, + "grad_norm": 3.789766788482666, + "learning_rate": 4.850208745550189e-06, + "loss": 0.8719, + "step": 6801 + }, + { + "epoch": 0.5243601603453593, + "grad_norm": 3.7619876861572266, + "learning_rate": 4.84896086334948e-06, + "loss": 1.0564, + "step": 6802 + }, + { + "epoch": 0.5244372494603762, + "grad_norm": 3.331724166870117, + "learning_rate": 4.8477129905652785e-06, + "loss": 0.9431, + "step": 6803 + }, + { + "epoch": 0.5245143385753932, + "grad_norm": 3.551481246948242, + "learning_rate": 4.8464651272753835e-06, + "loss": 0.9178, + "step": 6804 + }, + { + "epoch": 0.5245914276904101, + "grad_norm": 3.397059440612793, + "learning_rate": 4.845217273557593e-06, + "loss": 1.0281, + "step": 6805 + }, + { + "epoch": 0.524668516805427, + "grad_norm": 3.586930990219116, + "learning_rate": 4.843969429489703e-06, + "loss": 1.0194, + "step": 6806 + }, + { + "epoch": 0.5247456059204441, + "grad_norm": 3.493337392807007, + "learning_rate": 4.842721595149511e-06, + "loss": 0.9052, + "step": 6807 + }, + { + "epoch": 0.524822695035461, + "grad_norm": 3.607456684112549, + "learning_rate": 4.841473770614814e-06, + "loss": 0.8831, + "step": 6808 + }, + { + "epoch": 0.524899784150478, + "grad_norm": 4.256878852844238, + "learning_rate": 4.840225955963406e-06, + "loss": 0.9496, + "step": 6809 + }, + { + "epoch": 0.5249768732654949, + "grad_norm": 3.489366292953491, + "learning_rate": 4.838978151273082e-06, + "loss": 0.8825, + "step": 6810 + }, + { + "epoch": 0.5250539623805118, + "grad_norm": 3.237924814224243, + "learning_rate": 4.8377303566216365e-06, + "loss": 0.8377, + "step": 6811 + }, + { + "epoch": 0.5251310514955289, + "grad_norm": 3.6981184482574463, + "learning_rate": 4.836482572086862e-06, + "loss": 0.975, + "step": 6812 + }, + { + "epoch": 0.5252081406105458, + "grad_norm": 3.9798178672790527, + "learning_rate": 4.8352347977465535e-06, + "loss": 1.0573, + "step": 6813 + }, + { + "epoch": 0.5252852297255628, + "grad_norm": 3.7807118892669678, + "learning_rate": 4.833987033678501e-06, + "loss": 1.055, + "step": 6814 + }, + { + "epoch": 0.5253623188405797, + "grad_norm": 3.7995917797088623, + "learning_rate": 4.8327392799604974e-06, + "loss": 1.0315, + "step": 6815 + }, + { + "epoch": 0.5254394079555966, + "grad_norm": 3.570406675338745, + "learning_rate": 4.831491536670334e-06, + "loss": 0.8707, + "step": 6816 + }, + { + "epoch": 0.5255164970706137, + "grad_norm": 3.5016283988952637, + "learning_rate": 4.8302438038858005e-06, + "loss": 1.0007, + "step": 6817 + }, + { + "epoch": 0.5255935861856306, + "grad_norm": 3.539832830429077, + "learning_rate": 4.828996081684685e-06, + "loss": 0.8653, + "step": 6818 + }, + { + "epoch": 0.5256706753006476, + "grad_norm": 3.599923610687256, + "learning_rate": 4.82774837014478e-06, + "loss": 0.9648, + "step": 6819 + }, + { + "epoch": 0.5257477644156645, + "grad_norm": 3.7027664184570312, + "learning_rate": 4.826500669343872e-06, + "loss": 1.0022, + "step": 6820 + }, + { + "epoch": 0.5258248535306814, + "grad_norm": 3.6279308795928955, + "learning_rate": 4.8252529793597484e-06, + "loss": 0.978, + "step": 6821 + }, + { + "epoch": 0.5259019426456985, + "grad_norm": 3.6738011837005615, + "learning_rate": 4.824005300270196e-06, + "loss": 0.9233, + "step": 6822 + }, + { + "epoch": 0.5259790317607154, + "grad_norm": 3.7691490650177, + "learning_rate": 4.8227576321530015e-06, + "loss": 0.9322, + "step": 6823 + }, + { + "epoch": 0.5260561208757324, + "grad_norm": 3.266089916229248, + "learning_rate": 4.821509975085952e-06, + "loss": 0.8356, + "step": 6824 + }, + { + "epoch": 0.5261332099907493, + "grad_norm": 3.7823686599731445, + "learning_rate": 4.820262329146829e-06, + "loss": 0.9445, + "step": 6825 + }, + { + "epoch": 0.5262102991057662, + "grad_norm": 3.267396926879883, + "learning_rate": 4.81901469441342e-06, + "loss": 0.7796, + "step": 6826 + }, + { + "epoch": 0.5262873882207832, + "grad_norm": 3.59197998046875, + "learning_rate": 4.8177670709635085e-06, + "loss": 0.9793, + "step": 6827 + }, + { + "epoch": 0.5263644773358002, + "grad_norm": 3.956080913543701, + "learning_rate": 4.8165194588748745e-06, + "loss": 0.9847, + "step": 6828 + }, + { + "epoch": 0.5264415664508172, + "grad_norm": 3.319718837738037, + "learning_rate": 4.815271858225303e-06, + "loss": 0.7932, + "step": 6829 + }, + { + "epoch": 0.5265186555658341, + "grad_norm": 3.7214298248291016, + "learning_rate": 4.814024269092575e-06, + "loss": 0.9562, + "step": 6830 + }, + { + "epoch": 0.526595744680851, + "grad_norm": 3.7417917251586914, + "learning_rate": 4.812776691554469e-06, + "loss": 1.0291, + "step": 6831 + }, + { + "epoch": 0.526672833795868, + "grad_norm": 3.410249948501587, + "learning_rate": 4.811529125688768e-06, + "loss": 0.9492, + "step": 6832 + }, + { + "epoch": 0.526749922910885, + "grad_norm": 3.934293031692505, + "learning_rate": 4.8102815715732495e-06, + "loss": 1.0856, + "step": 6833 + }, + { + "epoch": 0.526827012025902, + "grad_norm": 3.7458932399749756, + "learning_rate": 4.809034029285691e-06, + "loss": 0.9264, + "step": 6834 + }, + { + "epoch": 0.5269041011409189, + "grad_norm": 3.63541579246521, + "learning_rate": 4.807786498903874e-06, + "loss": 0.9759, + "step": 6835 + }, + { + "epoch": 0.5269811902559358, + "grad_norm": 3.5854616165161133, + "learning_rate": 4.806538980505572e-06, + "loss": 1.0086, + "step": 6836 + }, + { + "epoch": 0.5270582793709528, + "grad_norm": 3.602339267730713, + "learning_rate": 4.805291474168561e-06, + "loss": 0.8689, + "step": 6837 + }, + { + "epoch": 0.5271353684859698, + "grad_norm": 3.346555471420288, + "learning_rate": 4.80404397997062e-06, + "loss": 0.867, + "step": 6838 + }, + { + "epoch": 0.5272124576009868, + "grad_norm": 3.9737861156463623, + "learning_rate": 4.802796497989523e-06, + "loss": 0.9963, + "step": 6839 + }, + { + "epoch": 0.5272895467160037, + "grad_norm": 3.378373146057129, + "learning_rate": 4.801549028303042e-06, + "loss": 0.8988, + "step": 6840 + }, + { + "epoch": 0.5273666358310206, + "grad_norm": 3.7955613136291504, + "learning_rate": 4.800301570988951e-06, + "loss": 0.9651, + "step": 6841 + }, + { + "epoch": 0.5274437249460376, + "grad_norm": 3.713027238845825, + "learning_rate": 4.799054126125023e-06, + "loss": 0.9965, + "step": 6842 + }, + { + "epoch": 0.5275208140610546, + "grad_norm": 3.291476011276245, + "learning_rate": 4.79780669378903e-06, + "loss": 0.9719, + "step": 6843 + }, + { + "epoch": 0.5275979031760716, + "grad_norm": 3.837628126144409, + "learning_rate": 4.796559274058741e-06, + "loss": 0.8665, + "step": 6844 + }, + { + "epoch": 0.5276749922910885, + "grad_norm": 3.562666177749634, + "learning_rate": 4.79531186701193e-06, + "loss": 0.9667, + "step": 6845 + }, + { + "epoch": 0.5277520814061054, + "grad_norm": 3.401890754699707, + "learning_rate": 4.7940644727263615e-06, + "loss": 0.8789, + "step": 6846 + }, + { + "epoch": 0.5278291705211224, + "grad_norm": 3.439560651779175, + "learning_rate": 4.7928170912798075e-06, + "loss": 0.8647, + "step": 6847 + }, + { + "epoch": 0.5279062596361394, + "grad_norm": 3.419785737991333, + "learning_rate": 4.791569722750036e-06, + "loss": 0.8197, + "step": 6848 + }, + { + "epoch": 0.5279833487511564, + "grad_norm": 3.7677369117736816, + "learning_rate": 4.790322367214812e-06, + "loss": 0.9698, + "step": 6849 + }, + { + "epoch": 0.5280604378661733, + "grad_norm": 3.788708448410034, + "learning_rate": 4.789075024751903e-06, + "loss": 0.9806, + "step": 6850 + }, + { + "epoch": 0.5281375269811902, + "grad_norm": 3.4838881492614746, + "learning_rate": 4.7878276954390726e-06, + "loss": 0.9545, + "step": 6851 + }, + { + "epoch": 0.5282146160962072, + "grad_norm": 3.561063528060913, + "learning_rate": 4.786580379354087e-06, + "loss": 0.9546, + "step": 6852 + }, + { + "epoch": 0.5282917052112241, + "grad_norm": 3.6301426887512207, + "learning_rate": 4.78533307657471e-06, + "loss": 0.9259, + "step": 6853 + }, + { + "epoch": 0.5283687943262412, + "grad_norm": 3.6689276695251465, + "learning_rate": 4.784085787178703e-06, + "loss": 0.9707, + "step": 6854 + }, + { + "epoch": 0.5284458834412581, + "grad_norm": 3.4558374881744385, + "learning_rate": 4.782838511243828e-06, + "loss": 0.8788, + "step": 6855 + }, + { + "epoch": 0.528522972556275, + "grad_norm": 3.636366367340088, + "learning_rate": 4.78159124884785e-06, + "loss": 0.9705, + "step": 6856 + }, + { + "epoch": 0.528600061671292, + "grad_norm": 3.362304925918579, + "learning_rate": 4.780344000068524e-06, + "loss": 0.898, + "step": 6857 + }, + { + "epoch": 0.528677150786309, + "grad_norm": 3.9811975955963135, + "learning_rate": 4.779096764983614e-06, + "loss": 0.9952, + "step": 6858 + }, + { + "epoch": 0.528754239901326, + "grad_norm": 3.3469631671905518, + "learning_rate": 4.777849543670875e-06, + "loss": 0.91, + "step": 6859 + }, + { + "epoch": 0.5288313290163429, + "grad_norm": 3.896716594696045, + "learning_rate": 4.776602336208066e-06, + "loss": 1.0635, + "step": 6860 + }, + { + "epoch": 0.5289084181313598, + "grad_norm": 3.27893328666687, + "learning_rate": 4.775355142672946e-06, + "loss": 0.9398, + "step": 6861 + }, + { + "epoch": 0.5289855072463768, + "grad_norm": 3.709341049194336, + "learning_rate": 4.774107963143269e-06, + "loss": 0.8442, + "step": 6862 + }, + { + "epoch": 0.5290625963613937, + "grad_norm": 4.007583141326904, + "learning_rate": 4.772860797696789e-06, + "loss": 0.9884, + "step": 6863 + }, + { + "epoch": 0.5291396854764108, + "grad_norm": 3.773080587387085, + "learning_rate": 4.771613646411264e-06, + "loss": 1.0555, + "step": 6864 + }, + { + "epoch": 0.5292167745914277, + "grad_norm": 3.6124513149261475, + "learning_rate": 4.770366509364444e-06, + "loss": 1.0263, + "step": 6865 + }, + { + "epoch": 0.5292938637064446, + "grad_norm": 3.6658780574798584, + "learning_rate": 4.7691193866340825e-06, + "loss": 0.9506, + "step": 6866 + }, + { + "epoch": 0.5293709528214616, + "grad_norm": 3.8184187412261963, + "learning_rate": 4.767872278297933e-06, + "loss": 0.9266, + "step": 6867 + }, + { + "epoch": 0.5294480419364785, + "grad_norm": 3.761213779449463, + "learning_rate": 4.766625184433744e-06, + "loss": 0.9132, + "step": 6868 + }, + { + "epoch": 0.5295251310514956, + "grad_norm": 3.682281970977783, + "learning_rate": 4.765378105119266e-06, + "loss": 0.9613, + "step": 6869 + }, + { + "epoch": 0.5296022201665125, + "grad_norm": 4.122997760772705, + "learning_rate": 4.7641310404322475e-06, + "loss": 1.1595, + "step": 6870 + }, + { + "epoch": 0.5296793092815294, + "grad_norm": 3.7581260204315186, + "learning_rate": 4.762883990450437e-06, + "loss": 0.8558, + "step": 6871 + }, + { + "epoch": 0.5297563983965464, + "grad_norm": 3.5212559700012207, + "learning_rate": 4.761636955251584e-06, + "loss": 0.9798, + "step": 6872 + }, + { + "epoch": 0.5298334875115633, + "grad_norm": 3.686304807662964, + "learning_rate": 4.760389934913429e-06, + "loss": 0.9631, + "step": 6873 + }, + { + "epoch": 0.5299105766265804, + "grad_norm": 3.6530697345733643, + "learning_rate": 4.759142929513722e-06, + "loss": 0.9296, + "step": 6874 + }, + { + "epoch": 0.5299876657415973, + "grad_norm": 3.5719711780548096, + "learning_rate": 4.757895939130206e-06, + "loss": 1.0083, + "step": 6875 + }, + { + "epoch": 0.5300647548566142, + "grad_norm": 4.652621746063232, + "learning_rate": 4.7566489638406245e-06, + "loss": 0.8969, + "step": 6876 + }, + { + "epoch": 0.5301418439716312, + "grad_norm": 3.799062490463257, + "learning_rate": 4.755402003722719e-06, + "loss": 1.0924, + "step": 6877 + }, + { + "epoch": 0.5302189330866481, + "grad_norm": 3.6467134952545166, + "learning_rate": 4.754155058854233e-06, + "loss": 1.0393, + "step": 6878 + }, + { + "epoch": 0.5302960222016652, + "grad_norm": 3.597630500793457, + "learning_rate": 4.752908129312905e-06, + "loss": 1.0003, + "step": 6879 + }, + { + "epoch": 0.5303731113166821, + "grad_norm": 3.513861894607544, + "learning_rate": 4.751661215176476e-06, + "loss": 0.9198, + "step": 6880 + }, + { + "epoch": 0.530450200431699, + "grad_norm": 3.668997049331665, + "learning_rate": 4.750414316522681e-06, + "loss": 0.9596, + "step": 6881 + }, + { + "epoch": 0.530527289546716, + "grad_norm": 3.9775052070617676, + "learning_rate": 4.7491674334292646e-06, + "loss": 1.0214, + "step": 6882 + }, + { + "epoch": 0.5306043786617329, + "grad_norm": 3.3533380031585693, + "learning_rate": 4.747920565973957e-06, + "loss": 0.9444, + "step": 6883 + }, + { + "epoch": 0.53068146777675, + "grad_norm": 3.5897982120513916, + "learning_rate": 4.746673714234498e-06, + "loss": 0.9681, + "step": 6884 + }, + { + "epoch": 0.5307585568917669, + "grad_norm": 3.429609537124634, + "learning_rate": 4.745426878288621e-06, + "loss": 0.9459, + "step": 6885 + }, + { + "epoch": 0.5308356460067838, + "grad_norm": 3.4824554920196533, + "learning_rate": 4.744180058214059e-06, + "loss": 1.0649, + "step": 6886 + }, + { + "epoch": 0.5309127351218008, + "grad_norm": 3.27522611618042, + "learning_rate": 4.742933254088547e-06, + "loss": 0.8855, + "step": 6887 + }, + { + "epoch": 0.5309898242368177, + "grad_norm": 3.366482973098755, + "learning_rate": 4.741686465989814e-06, + "loss": 0.8464, + "step": 6888 + }, + { + "epoch": 0.5310669133518348, + "grad_norm": 3.3331689834594727, + "learning_rate": 4.740439693995592e-06, + "loss": 0.8612, + "step": 6889 + }, + { + "epoch": 0.5311440024668517, + "grad_norm": 3.410470485687256, + "learning_rate": 4.739192938183611e-06, + "loss": 1.1055, + "step": 6890 + }, + { + "epoch": 0.5312210915818686, + "grad_norm": 3.858147144317627, + "learning_rate": 4.7379461986315995e-06, + "loss": 0.9747, + "step": 6891 + }, + { + "epoch": 0.5312981806968856, + "grad_norm": 3.707534074783325, + "learning_rate": 4.736699475417285e-06, + "loss": 0.8828, + "step": 6892 + }, + { + "epoch": 0.5313752698119025, + "grad_norm": 3.58382248878479, + "learning_rate": 4.735452768618395e-06, + "loss": 0.9151, + "step": 6893 + }, + { + "epoch": 0.5314523589269196, + "grad_norm": 3.5018043518066406, + "learning_rate": 4.734206078312654e-06, + "loss": 1.0009, + "step": 6894 + }, + { + "epoch": 0.5315294480419365, + "grad_norm": 4.095643520355225, + "learning_rate": 4.732959404577787e-06, + "loss": 0.9141, + "step": 6895 + }, + { + "epoch": 0.5316065371569534, + "grad_norm": 3.533647060394287, + "learning_rate": 4.731712747491521e-06, + "loss": 0.9161, + "step": 6896 + }, + { + "epoch": 0.5316836262719704, + "grad_norm": 3.72644305229187, + "learning_rate": 4.730466107131574e-06, + "loss": 0.9536, + "step": 6897 + }, + { + "epoch": 0.5317607153869873, + "grad_norm": 3.7895758152008057, + "learning_rate": 4.729219483575669e-06, + "loss": 0.9062, + "step": 6898 + }, + { + "epoch": 0.5318378045020044, + "grad_norm": 3.6426897048950195, + "learning_rate": 4.7279728769015266e-06, + "loss": 0.9667, + "step": 6899 + }, + { + "epoch": 0.5319148936170213, + "grad_norm": 3.507504463195801, + "learning_rate": 4.726726287186866e-06, + "loss": 0.8926, + "step": 6900 + }, + { + "epoch": 0.5319919827320382, + "grad_norm": 3.811716318130493, + "learning_rate": 4.725479714509408e-06, + "loss": 0.9849, + "step": 6901 + }, + { + "epoch": 0.5320690718470552, + "grad_norm": 3.6967577934265137, + "learning_rate": 4.724233158946866e-06, + "loss": 1.0016, + "step": 6902 + }, + { + "epoch": 0.5321461609620721, + "grad_norm": 3.3442699909210205, + "learning_rate": 4.722986620576959e-06, + "loss": 0.8726, + "step": 6903 + }, + { + "epoch": 0.5322232500770891, + "grad_norm": 3.455738067626953, + "learning_rate": 4.721740099477402e-06, + "loss": 0.9078, + "step": 6904 + }, + { + "epoch": 0.5323003391921061, + "grad_norm": 3.491821765899658, + "learning_rate": 4.720493595725908e-06, + "loss": 0.9688, + "step": 6905 + }, + { + "epoch": 0.532377428307123, + "grad_norm": 3.792142868041992, + "learning_rate": 4.719247109400192e-06, + "loss": 0.9775, + "step": 6906 + }, + { + "epoch": 0.53245451742214, + "grad_norm": 3.832749605178833, + "learning_rate": 4.718000640577963e-06, + "loss": 1.0054, + "step": 6907 + }, + { + "epoch": 0.5325316065371569, + "grad_norm": 3.693619728088379, + "learning_rate": 4.716754189336934e-06, + "loss": 0.9271, + "step": 6908 + }, + { + "epoch": 0.532608695652174, + "grad_norm": 3.5125207901000977, + "learning_rate": 4.715507755754815e-06, + "loss": 0.8987, + "step": 6909 + }, + { + "epoch": 0.5326857847671909, + "grad_norm": 3.387834310531616, + "learning_rate": 4.714261339909313e-06, + "loss": 0.9932, + "step": 6910 + }, + { + "epoch": 0.5327628738822078, + "grad_norm": 3.5012171268463135, + "learning_rate": 4.713014941878137e-06, + "loss": 0.8553, + "step": 6911 + }, + { + "epoch": 0.5328399629972248, + "grad_norm": 3.9803214073181152, + "learning_rate": 4.711768561738993e-06, + "loss": 1.0452, + "step": 6912 + }, + { + "epoch": 0.5329170521122417, + "grad_norm": 3.555570363998413, + "learning_rate": 4.710522199569586e-06, + "loss": 1.0523, + "step": 6913 + }, + { + "epoch": 0.5329941412272587, + "grad_norm": 3.6644721031188965, + "learning_rate": 4.7092758554476215e-06, + "loss": 0.9008, + "step": 6914 + }, + { + "epoch": 0.5330712303422757, + "grad_norm": 3.4861278533935547, + "learning_rate": 4.708029529450802e-06, + "loss": 0.9557, + "step": 6915 + }, + { + "epoch": 0.5331483194572927, + "grad_norm": 3.4730987548828125, + "learning_rate": 4.706783221656828e-06, + "loss": 0.9147, + "step": 6916 + }, + { + "epoch": 0.5332254085723096, + "grad_norm": 3.5856359004974365, + "learning_rate": 4.705536932143403e-06, + "loss": 0.8769, + "step": 6917 + }, + { + "epoch": 0.5333024976873265, + "grad_norm": 3.6524572372436523, + "learning_rate": 4.704290660988224e-06, + "loss": 0.9084, + "step": 6918 + }, + { + "epoch": 0.5333795868023435, + "grad_norm": 3.7509005069732666, + "learning_rate": 4.70304440826899e-06, + "loss": 0.9222, + "step": 6919 + }, + { + "epoch": 0.5334566759173605, + "grad_norm": 3.5913355350494385, + "learning_rate": 4.701798174063402e-06, + "loss": 0.9637, + "step": 6920 + }, + { + "epoch": 0.5335337650323775, + "grad_norm": 3.756228446960449, + "learning_rate": 4.700551958449151e-06, + "loss": 0.9755, + "step": 6921 + }, + { + "epoch": 0.5336108541473944, + "grad_norm": 3.908708333969116, + "learning_rate": 4.699305761503935e-06, + "loss": 1.0154, + "step": 6922 + }, + { + "epoch": 0.5336879432624113, + "grad_norm": 3.226095676422119, + "learning_rate": 4.698059583305447e-06, + "loss": 0.855, + "step": 6923 + }, + { + "epoch": 0.5337650323774283, + "grad_norm": 3.847050428390503, + "learning_rate": 4.696813423931381e-06, + "loss": 0.9876, + "step": 6924 + }, + { + "epoch": 0.5338421214924453, + "grad_norm": 3.6332948207855225, + "learning_rate": 4.695567283459426e-06, + "loss": 0.9547, + "step": 6925 + }, + { + "epoch": 0.5339192106074623, + "grad_norm": 3.968454122543335, + "learning_rate": 4.694321161967275e-06, + "loss": 0.9856, + "step": 6926 + }, + { + "epoch": 0.5339962997224792, + "grad_norm": 3.6489675045013428, + "learning_rate": 4.693075059532619e-06, + "loss": 0.9324, + "step": 6927 + }, + { + "epoch": 0.5340733888374961, + "grad_norm": 3.761967182159424, + "learning_rate": 4.6918289762331405e-06, + "loss": 0.9429, + "step": 6928 + }, + { + "epoch": 0.5341504779525131, + "grad_norm": 3.6253440380096436, + "learning_rate": 4.69058291214653e-06, + "loss": 0.9184, + "step": 6929 + }, + { + "epoch": 0.53422756706753, + "grad_norm": 3.8986105918884277, + "learning_rate": 4.689336867350474e-06, + "loss": 0.9953, + "step": 6930 + }, + { + "epoch": 0.5343046561825471, + "grad_norm": 3.430659055709839, + "learning_rate": 4.688090841922654e-06, + "loss": 0.9899, + "step": 6931 + }, + { + "epoch": 0.534381745297564, + "grad_norm": 3.6624832153320312, + "learning_rate": 4.686844835940754e-06, + "loss": 1.0142, + "step": 6932 + }, + { + "epoch": 0.5344588344125809, + "grad_norm": 3.478834390640259, + "learning_rate": 4.685598849482458e-06, + "loss": 0.9255, + "step": 6933 + }, + { + "epoch": 0.5345359235275979, + "grad_norm": 3.8827271461486816, + "learning_rate": 4.6843528826254444e-06, + "loss": 1.0197, + "step": 6934 + }, + { + "epoch": 0.5346130126426148, + "grad_norm": 3.9673526287078857, + "learning_rate": 4.683106935447395e-06, + "loss": 0.8611, + "step": 6935 + }, + { + "epoch": 0.5346901017576319, + "grad_norm": 3.5819478034973145, + "learning_rate": 4.681861008025985e-06, + "loss": 0.9893, + "step": 6936 + }, + { + "epoch": 0.5347671908726488, + "grad_norm": 3.0837767124176025, + "learning_rate": 4.680615100438894e-06, + "loss": 0.9102, + "step": 6937 + }, + { + "epoch": 0.5348442799876657, + "grad_norm": 3.779120922088623, + "learning_rate": 4.679369212763799e-06, + "loss": 0.9429, + "step": 6938 + }, + { + "epoch": 0.5349213691026827, + "grad_norm": 3.4904768466949463, + "learning_rate": 4.678123345078371e-06, + "loss": 0.9892, + "step": 6939 + }, + { + "epoch": 0.5349984582176996, + "grad_norm": 3.5319314002990723, + "learning_rate": 4.676877497460286e-06, + "loss": 0.9813, + "step": 6940 + }, + { + "epoch": 0.5350755473327167, + "grad_norm": 4.093774795532227, + "learning_rate": 4.675631669987217e-06, + "loss": 1.0125, + "step": 6941 + }, + { + "epoch": 0.5351526364477336, + "grad_norm": 3.4952986240386963, + "learning_rate": 4.674385862736832e-06, + "loss": 0.9641, + "step": 6942 + }, + { + "epoch": 0.5352297255627505, + "grad_norm": 3.800241470336914, + "learning_rate": 4.673140075786801e-06, + "loss": 0.9452, + "step": 6943 + }, + { + "epoch": 0.5353068146777675, + "grad_norm": 3.972257614135742, + "learning_rate": 4.671894309214796e-06, + "loss": 0.9489, + "step": 6944 + }, + { + "epoch": 0.5353839037927844, + "grad_norm": 3.972130060195923, + "learning_rate": 4.67064856309848e-06, + "loss": 1.0165, + "step": 6945 + }, + { + "epoch": 0.5354609929078015, + "grad_norm": 3.7796213626861572, + "learning_rate": 4.669402837515521e-06, + "loss": 0.8956, + "step": 6946 + }, + { + "epoch": 0.5355380820228184, + "grad_norm": 3.68583345413208, + "learning_rate": 4.668157132543582e-06, + "loss": 0.8976, + "step": 6947 + }, + { + "epoch": 0.5356151711378353, + "grad_norm": 3.53948974609375, + "learning_rate": 4.666911448260327e-06, + "loss": 0.9083, + "step": 6948 + }, + { + "epoch": 0.5356922602528523, + "grad_norm": 3.7999818325042725, + "learning_rate": 4.665665784743419e-06, + "loss": 1.0559, + "step": 6949 + }, + { + "epoch": 0.5357693493678692, + "grad_norm": 3.492060661315918, + "learning_rate": 4.664420142070517e-06, + "loss": 0.9844, + "step": 6950 + }, + { + "epoch": 0.5358464384828863, + "grad_norm": 3.376392364501953, + "learning_rate": 4.663174520319282e-06, + "loss": 1.0456, + "step": 6951 + }, + { + "epoch": 0.5359235275979032, + "grad_norm": 3.963566303253174, + "learning_rate": 4.661928919567371e-06, + "loss": 1.0061, + "step": 6952 + }, + { + "epoch": 0.5360006167129201, + "grad_norm": 3.267530679702759, + "learning_rate": 4.660683339892441e-06, + "loss": 0.8218, + "step": 6953 + }, + { + "epoch": 0.5360777058279371, + "grad_norm": 3.88205623626709, + "learning_rate": 4.659437781372147e-06, + "loss": 0.9725, + "step": 6954 + }, + { + "epoch": 0.536154794942954, + "grad_norm": 3.550692081451416, + "learning_rate": 4.658192244084143e-06, + "loss": 0.9525, + "step": 6955 + }, + { + "epoch": 0.5362318840579711, + "grad_norm": 3.2505733966827393, + "learning_rate": 4.656946728106083e-06, + "loss": 0.8847, + "step": 6956 + }, + { + "epoch": 0.536308973172988, + "grad_norm": 3.462864398956299, + "learning_rate": 4.6557012335156185e-06, + "loss": 0.8702, + "step": 6957 + }, + { + "epoch": 0.5363860622880049, + "grad_norm": 3.4686732292175293, + "learning_rate": 4.654455760390398e-06, + "loss": 0.8376, + "step": 6958 + }, + { + "epoch": 0.5364631514030219, + "grad_norm": 3.5081446170806885, + "learning_rate": 4.653210308808071e-06, + "loss": 0.8766, + "step": 6959 + }, + { + "epoch": 0.5365402405180388, + "grad_norm": 3.306342124938965, + "learning_rate": 4.651964878846285e-06, + "loss": 0.8244, + "step": 6960 + }, + { + "epoch": 0.5366173296330559, + "grad_norm": 4.375651836395264, + "learning_rate": 4.6507194705826866e-06, + "loss": 1.039, + "step": 6961 + }, + { + "epoch": 0.5366944187480728, + "grad_norm": 3.5302603244781494, + "learning_rate": 4.649474084094921e-06, + "loss": 0.9445, + "step": 6962 + }, + { + "epoch": 0.5367715078630897, + "grad_norm": 3.6997478008270264, + "learning_rate": 4.64822871946063e-06, + "loss": 0.9587, + "step": 6963 + }, + { + "epoch": 0.5368485969781067, + "grad_norm": 3.9736435413360596, + "learning_rate": 4.646983376757457e-06, + "loss": 0.955, + "step": 6964 + }, + { + "epoch": 0.5369256860931236, + "grad_norm": 3.3650596141815186, + "learning_rate": 4.645738056063042e-06, + "loss": 0.9348, + "step": 6965 + }, + { + "epoch": 0.5370027752081407, + "grad_norm": 3.6180570125579834, + "learning_rate": 4.6444927574550245e-06, + "loss": 0.7941, + "step": 6966 + }, + { + "epoch": 0.5370798643231576, + "grad_norm": 3.7963132858276367, + "learning_rate": 4.6432474810110415e-06, + "loss": 0.8803, + "step": 6967 + }, + { + "epoch": 0.5371569534381745, + "grad_norm": 3.3811287879943848, + "learning_rate": 4.642002226808732e-06, + "loss": 0.7674, + "step": 6968 + }, + { + "epoch": 0.5372340425531915, + "grad_norm": 3.9231021404266357, + "learning_rate": 4.640756994925726e-06, + "loss": 1.0332, + "step": 6969 + }, + { + "epoch": 0.5373111316682084, + "grad_norm": 3.508418321609497, + "learning_rate": 4.639511785439664e-06, + "loss": 0.9099, + "step": 6970 + }, + { + "epoch": 0.5373882207832255, + "grad_norm": 3.725053310394287, + "learning_rate": 4.638266598428175e-06, + "loss": 0.8434, + "step": 6971 + }, + { + "epoch": 0.5374653098982424, + "grad_norm": 3.9625444412231445, + "learning_rate": 4.63702143396889e-06, + "loss": 0.8459, + "step": 6972 + }, + { + "epoch": 0.5375423990132593, + "grad_norm": 3.8281447887420654, + "learning_rate": 4.635776292139437e-06, + "loss": 0.9814, + "step": 6973 + }, + { + "epoch": 0.5376194881282763, + "grad_norm": 4.015565872192383, + "learning_rate": 4.634531173017448e-06, + "loss": 1.0362, + "step": 6974 + }, + { + "epoch": 0.5376965772432932, + "grad_norm": 3.4385459423065186, + "learning_rate": 4.633286076680546e-06, + "loss": 0.8451, + "step": 6975 + }, + { + "epoch": 0.5377736663583103, + "grad_norm": 3.9049465656280518, + "learning_rate": 4.632041003206359e-06, + "loss": 0.7773, + "step": 6976 + }, + { + "epoch": 0.5378507554733272, + "grad_norm": 5.6488142013549805, + "learning_rate": 4.630795952672509e-06, + "loss": 0.9841, + "step": 6977 + }, + { + "epoch": 0.5379278445883441, + "grad_norm": 3.4264445304870605, + "learning_rate": 4.62955092515662e-06, + "loss": 0.9391, + "step": 6978 + }, + { + "epoch": 0.5380049337033611, + "grad_norm": 3.8330531120300293, + "learning_rate": 4.628305920736311e-06, + "loss": 1.0133, + "step": 6979 + }, + { + "epoch": 0.538082022818378, + "grad_norm": 3.940965414047241, + "learning_rate": 4.627060939489204e-06, + "loss": 0.959, + "step": 6980 + }, + { + "epoch": 0.538159111933395, + "grad_norm": 3.4086647033691406, + "learning_rate": 4.625815981492916e-06, + "loss": 0.95, + "step": 6981 + }, + { + "epoch": 0.538236201048412, + "grad_norm": 3.4563660621643066, + "learning_rate": 4.6245710468250626e-06, + "loss": 0.9469, + "step": 6982 + }, + { + "epoch": 0.5383132901634289, + "grad_norm": 3.675934076309204, + "learning_rate": 4.623326135563261e-06, + "loss": 0.8943, + "step": 6983 + }, + { + "epoch": 0.5383903792784459, + "grad_norm": 3.754777431488037, + "learning_rate": 4.622081247785123e-06, + "loss": 0.9712, + "step": 6984 + }, + { + "epoch": 0.5384674683934628, + "grad_norm": 3.8146581649780273, + "learning_rate": 4.620836383568262e-06, + "loss": 0.9867, + "step": 6985 + }, + { + "epoch": 0.5385445575084798, + "grad_norm": 3.5257890224456787, + "learning_rate": 4.61959154299029e-06, + "loss": 0.9331, + "step": 6986 + }, + { + "epoch": 0.5386216466234968, + "grad_norm": 3.52950119972229, + "learning_rate": 4.618346726128814e-06, + "loss": 0.8518, + "step": 6987 + }, + { + "epoch": 0.5386987357385137, + "grad_norm": 3.408184766769409, + "learning_rate": 4.6171019330614424e-06, + "loss": 0.9116, + "step": 6988 + }, + { + "epoch": 0.5387758248535307, + "grad_norm": 4.128994941711426, + "learning_rate": 4.615857163865784e-06, + "loss": 0.9861, + "step": 6989 + }, + { + "epoch": 0.5388529139685476, + "grad_norm": 3.886155366897583, + "learning_rate": 4.614612418619441e-06, + "loss": 1.0266, + "step": 6990 + }, + { + "epoch": 0.5389300030835646, + "grad_norm": 3.525916814804077, + "learning_rate": 4.613367697400017e-06, + "loss": 0.8755, + "step": 6991 + }, + { + "epoch": 0.5390070921985816, + "grad_norm": 3.6894612312316895, + "learning_rate": 4.612123000285116e-06, + "loss": 0.917, + "step": 6992 + }, + { + "epoch": 0.5390841813135985, + "grad_norm": 3.610119342803955, + "learning_rate": 4.610878327352336e-06, + "loss": 0.9675, + "step": 6993 + }, + { + "epoch": 0.5391612704286155, + "grad_norm": 3.5024073123931885, + "learning_rate": 4.6096336786792775e-06, + "loss": 0.9345, + "step": 6994 + }, + { + "epoch": 0.5392383595436324, + "grad_norm": 3.419611930847168, + "learning_rate": 4.608389054343537e-06, + "loss": 0.8538, + "step": 6995 + }, + { + "epoch": 0.5393154486586494, + "grad_norm": 3.4121594429016113, + "learning_rate": 4.607144454422711e-06, + "loss": 0.9487, + "step": 6996 + }, + { + "epoch": 0.5393925377736664, + "grad_norm": 3.849426746368408, + "learning_rate": 4.605899878994395e-06, + "loss": 0.9854, + "step": 6997 + }, + { + "epoch": 0.5394696268886833, + "grad_norm": 3.513972759246826, + "learning_rate": 4.604655328136177e-06, + "loss": 0.948, + "step": 6998 + }, + { + "epoch": 0.5395467160037003, + "grad_norm": 3.6121532917022705, + "learning_rate": 4.603410801925654e-06, + "loss": 1.0326, + "step": 6999 + }, + { + "epoch": 0.5396238051187172, + "grad_norm": 3.5565638542175293, + "learning_rate": 4.602166300440415e-06, + "loss": 0.8536, + "step": 7000 + }, + { + "epoch": 0.5397008942337342, + "grad_norm": 3.5659432411193848, + "learning_rate": 4.600921823758044e-06, + "loss": 0.9094, + "step": 7001 + }, + { + "epoch": 0.5397779833487512, + "grad_norm": 3.8643648624420166, + "learning_rate": 4.599677371956132e-06, + "loss": 0.9769, + "step": 7002 + }, + { + "epoch": 0.5398550724637681, + "grad_norm": 3.5835399627685547, + "learning_rate": 4.598432945112262e-06, + "loss": 0.8676, + "step": 7003 + }, + { + "epoch": 0.5399321615787851, + "grad_norm": 3.427264451980591, + "learning_rate": 4.597188543304017e-06, + "loss": 0.9429, + "step": 7004 + }, + { + "epoch": 0.540009250693802, + "grad_norm": 4.065280437469482, + "learning_rate": 4.595944166608982e-06, + "loss": 0.8582, + "step": 7005 + }, + { + "epoch": 0.540086339808819, + "grad_norm": 3.8755879402160645, + "learning_rate": 4.594699815104735e-06, + "loss": 0.9051, + "step": 7006 + }, + { + "epoch": 0.540163428923836, + "grad_norm": 3.609768867492676, + "learning_rate": 4.593455488868855e-06, + "loss": 0.9795, + "step": 7007 + }, + { + "epoch": 0.5402405180388529, + "grad_norm": 3.6061177253723145, + "learning_rate": 4.59221118797892e-06, + "loss": 0.8821, + "step": 7008 + }, + { + "epoch": 0.5403176071538699, + "grad_norm": 3.737497568130493, + "learning_rate": 4.590966912512505e-06, + "loss": 0.8979, + "step": 7009 + }, + { + "epoch": 0.5403946962688868, + "grad_norm": 3.619135618209839, + "learning_rate": 4.589722662547185e-06, + "loss": 0.8114, + "step": 7010 + }, + { + "epoch": 0.5404717853839038, + "grad_norm": 3.6571645736694336, + "learning_rate": 4.588478438160532e-06, + "loss": 0.9064, + "step": 7011 + }, + { + "epoch": 0.5405488744989208, + "grad_norm": 3.8513779640197754, + "learning_rate": 4.5872342394301175e-06, + "loss": 0.9858, + "step": 7012 + }, + { + "epoch": 0.5406259636139377, + "grad_norm": 3.6383817195892334, + "learning_rate": 4.58599006643351e-06, + "loss": 0.979, + "step": 7013 + }, + { + "epoch": 0.5407030527289547, + "grad_norm": 3.927966356277466, + "learning_rate": 4.584745919248275e-06, + "loss": 1.0223, + "step": 7014 + }, + { + "epoch": 0.5407801418439716, + "grad_norm": 3.733217716217041, + "learning_rate": 4.5835017979519844e-06, + "loss": 0.9701, + "step": 7015 + }, + { + "epoch": 0.5408572309589886, + "grad_norm": 3.937772035598755, + "learning_rate": 4.582257702622199e-06, + "loss": 0.9882, + "step": 7016 + }, + { + "epoch": 0.5409343200740055, + "grad_norm": 3.6561355590820312, + "learning_rate": 4.581013633336483e-06, + "loss": 0.8608, + "step": 7017 + }, + { + "epoch": 0.5410114091890225, + "grad_norm": 3.8480238914489746, + "learning_rate": 4.579769590172396e-06, + "loss": 0.9592, + "step": 7018 + }, + { + "epoch": 0.5410884983040395, + "grad_norm": 3.8090572357177734, + "learning_rate": 4.578525573207499e-06, + "loss": 0.9319, + "step": 7019 + }, + { + "epoch": 0.5411655874190564, + "grad_norm": 3.5534706115722656, + "learning_rate": 4.57728158251935e-06, + "loss": 0.8965, + "step": 7020 + }, + { + "epoch": 0.5412426765340734, + "grad_norm": 3.8051340579986572, + "learning_rate": 4.576037618185504e-06, + "loss": 1.0375, + "step": 7021 + }, + { + "epoch": 0.5413197656490903, + "grad_norm": 3.5272462368011475, + "learning_rate": 4.574793680283518e-06, + "loss": 0.8829, + "step": 7022 + }, + { + "epoch": 0.5413968547641073, + "grad_norm": 3.668534278869629, + "learning_rate": 4.573549768890944e-06, + "loss": 1.0591, + "step": 7023 + }, + { + "epoch": 0.5414739438791243, + "grad_norm": 3.377134084701538, + "learning_rate": 4.572305884085332e-06, + "loss": 0.9829, + "step": 7024 + }, + { + "epoch": 0.5415510329941412, + "grad_norm": 3.962786912918091, + "learning_rate": 4.5710620259442325e-06, + "loss": 1.0349, + "step": 7025 + }, + { + "epoch": 0.5416281221091582, + "grad_norm": 3.8737564086914062, + "learning_rate": 4.5698181945451966e-06, + "loss": 0.995, + "step": 7026 + }, + { + "epoch": 0.5417052112241751, + "grad_norm": 3.767915725708008, + "learning_rate": 4.568574389965766e-06, + "loss": 0.8897, + "step": 7027 + }, + { + "epoch": 0.5417823003391921, + "grad_norm": 3.9697983264923096, + "learning_rate": 4.567330612283488e-06, + "loss": 0.8637, + "step": 7028 + }, + { + "epoch": 0.5418593894542091, + "grad_norm": 3.8324038982391357, + "learning_rate": 4.566086861575908e-06, + "loss": 1.0064, + "step": 7029 + }, + { + "epoch": 0.541936478569226, + "grad_norm": 3.6003153324127197, + "learning_rate": 4.564843137920561e-06, + "loss": 0.9691, + "step": 7030 + }, + { + "epoch": 0.542013567684243, + "grad_norm": 3.5627217292785645, + "learning_rate": 4.563599441394994e-06, + "loss": 0.86, + "step": 7031 + }, + { + "epoch": 0.5420906567992599, + "grad_norm": 3.6470260620117188, + "learning_rate": 4.56235577207674e-06, + "loss": 0.9588, + "step": 7032 + }, + { + "epoch": 0.5421677459142769, + "grad_norm": 4.533912181854248, + "learning_rate": 4.561112130043337e-06, + "loss": 1.0559, + "step": 7033 + }, + { + "epoch": 0.5422448350292939, + "grad_norm": 3.7450127601623535, + "learning_rate": 4.5598685153723205e-06, + "loss": 0.9593, + "step": 7034 + }, + { + "epoch": 0.5423219241443108, + "grad_norm": 3.7098305225372314, + "learning_rate": 4.558624928141222e-06, + "loss": 1.0346, + "step": 7035 + }, + { + "epoch": 0.5423990132593278, + "grad_norm": 3.9631752967834473, + "learning_rate": 4.557381368427573e-06, + "loss": 0.9668, + "step": 7036 + }, + { + "epoch": 0.5424761023743447, + "grad_norm": 3.5435612201690674, + "learning_rate": 4.5561378363089035e-06, + "loss": 1.0405, + "step": 7037 + }, + { + "epoch": 0.5425531914893617, + "grad_norm": 3.9541807174682617, + "learning_rate": 4.554894331862741e-06, + "loss": 0.9185, + "step": 7038 + }, + { + "epoch": 0.5426302806043787, + "grad_norm": 3.6304094791412354, + "learning_rate": 4.553650855166612e-06, + "loss": 0.9789, + "step": 7039 + }, + { + "epoch": 0.5427073697193956, + "grad_norm": 3.823951005935669, + "learning_rate": 4.55240740629804e-06, + "loss": 0.9947, + "step": 7040 + }, + { + "epoch": 0.5427844588344126, + "grad_norm": 3.5936341285705566, + "learning_rate": 4.551163985334548e-06, + "loss": 0.971, + "step": 7041 + }, + { + "epoch": 0.5428615479494295, + "grad_norm": 3.330362319946289, + "learning_rate": 4.549920592353659e-06, + "loss": 0.8836, + "step": 7042 + }, + { + "epoch": 0.5429386370644464, + "grad_norm": 3.501664400100708, + "learning_rate": 4.548677227432888e-06, + "loss": 0.9594, + "step": 7043 + }, + { + "epoch": 0.5430157261794635, + "grad_norm": 3.7609310150146484, + "learning_rate": 4.547433890649754e-06, + "loss": 0.83, + "step": 7044 + }, + { + "epoch": 0.5430928152944804, + "grad_norm": 3.811150312423706, + "learning_rate": 4.546190582081774e-06, + "loss": 0.9383, + "step": 7045 + }, + { + "epoch": 0.5431699044094974, + "grad_norm": 3.409965991973877, + "learning_rate": 4.544947301806461e-06, + "loss": 0.894, + "step": 7046 + }, + { + "epoch": 0.5432469935245143, + "grad_norm": 3.6347696781158447, + "learning_rate": 4.543704049901326e-06, + "loss": 0.8651, + "step": 7047 + }, + { + "epoch": 0.5433240826395312, + "grad_norm": 3.6981849670410156, + "learning_rate": 4.542460826443881e-06, + "loss": 0.964, + "step": 7048 + }, + { + "epoch": 0.5434011717545483, + "grad_norm": 3.7254111766815186, + "learning_rate": 4.541217631511633e-06, + "loss": 0.9934, + "step": 7049 + }, + { + "epoch": 0.5434782608695652, + "grad_norm": 3.680558443069458, + "learning_rate": 4.5399744651820915e-06, + "loss": 0.7968, + "step": 7050 + }, + { + "epoch": 0.5435553499845822, + "grad_norm": 3.6431643962860107, + "learning_rate": 4.538731327532758e-06, + "loss": 0.9735, + "step": 7051 + }, + { + "epoch": 0.5436324390995991, + "grad_norm": 3.7536261081695557, + "learning_rate": 4.5374882186411375e-06, + "loss": 0.8981, + "step": 7052 + }, + { + "epoch": 0.543709528214616, + "grad_norm": 3.773322582244873, + "learning_rate": 4.536245138584731e-06, + "loss": 1.0596, + "step": 7053 + }, + { + "epoch": 0.5437866173296331, + "grad_norm": 3.827707052230835, + "learning_rate": 4.5350020874410385e-06, + "loss": 0.9203, + "step": 7054 + }, + { + "epoch": 0.54386370644465, + "grad_norm": 3.5610642433166504, + "learning_rate": 4.533759065287557e-06, + "loss": 0.9087, + "step": 7055 + }, + { + "epoch": 0.543940795559667, + "grad_norm": 3.6839845180511475, + "learning_rate": 4.5325160722017845e-06, + "loss": 0.8665, + "step": 7056 + }, + { + "epoch": 0.5440178846746839, + "grad_norm": 3.7331130504608154, + "learning_rate": 4.531273108261213e-06, + "loss": 0.8876, + "step": 7057 + }, + { + "epoch": 0.5440949737897008, + "grad_norm": 3.359189510345459, + "learning_rate": 4.530030173543334e-06, + "loss": 0.9535, + "step": 7058 + }, + { + "epoch": 0.5441720629047179, + "grad_norm": 3.495391368865967, + "learning_rate": 4.528787268125641e-06, + "loss": 0.8966, + "step": 7059 + }, + { + "epoch": 0.5442491520197348, + "grad_norm": 3.6998205184936523, + "learning_rate": 4.527544392085622e-06, + "loss": 0.9623, + "step": 7060 + }, + { + "epoch": 0.5443262411347518, + "grad_norm": 3.4941718578338623, + "learning_rate": 4.526301545500762e-06, + "loss": 1.0587, + "step": 7061 + }, + { + "epoch": 0.5444033302497687, + "grad_norm": 3.546320676803589, + "learning_rate": 4.5250587284485474e-06, + "loss": 0.9509, + "step": 7062 + }, + { + "epoch": 0.5444804193647856, + "grad_norm": 3.8419322967529297, + "learning_rate": 4.523815941006463e-06, + "loss": 0.9845, + "step": 7063 + }, + { + "epoch": 0.5445575084798027, + "grad_norm": 3.7636489868164062, + "learning_rate": 4.522573183251987e-06, + "loss": 1.0808, + "step": 7064 + }, + { + "epoch": 0.5446345975948196, + "grad_norm": 3.6135027408599854, + "learning_rate": 4.5213304552626e-06, + "loss": 0.9379, + "step": 7065 + }, + { + "epoch": 0.5447116867098366, + "grad_norm": 4.0867414474487305, + "learning_rate": 4.5200877571157815e-06, + "loss": 0.969, + "step": 7066 + }, + { + "epoch": 0.5447887758248535, + "grad_norm": 3.768380641937256, + "learning_rate": 4.518845088889004e-06, + "loss": 1.017, + "step": 7067 + }, + { + "epoch": 0.5448658649398704, + "grad_norm": 3.526736259460449, + "learning_rate": 4.517602450659746e-06, + "loss": 0.8962, + "step": 7068 + }, + { + "epoch": 0.5449429540548875, + "grad_norm": 3.6190271377563477, + "learning_rate": 4.516359842505475e-06, + "loss": 0.9257, + "step": 7069 + }, + { + "epoch": 0.5450200431699044, + "grad_norm": 3.3780858516693115, + "learning_rate": 4.515117264503662e-06, + "loss": 0.9142, + "step": 7070 + }, + { + "epoch": 0.5450971322849214, + "grad_norm": 3.3571228981018066, + "learning_rate": 4.513874716731778e-06, + "loss": 0.9285, + "step": 7071 + }, + { + "epoch": 0.5451742213999383, + "grad_norm": 3.818427801132202, + "learning_rate": 4.5126321992672866e-06, + "loss": 0.928, + "step": 7072 + }, + { + "epoch": 0.5452513105149552, + "grad_norm": 3.895630359649658, + "learning_rate": 4.511389712187654e-06, + "loss": 0.9358, + "step": 7073 + }, + { + "epoch": 0.5453283996299723, + "grad_norm": 3.877424955368042, + "learning_rate": 4.510147255570342e-06, + "loss": 1.0102, + "step": 7074 + }, + { + "epoch": 0.5454054887449892, + "grad_norm": 3.911123752593994, + "learning_rate": 4.508904829492812e-06, + "loss": 0.9505, + "step": 7075 + }, + { + "epoch": 0.5454825778600062, + "grad_norm": 3.7010762691497803, + "learning_rate": 4.507662434032522e-06, + "loss": 0.98, + "step": 7076 + }, + { + "epoch": 0.5455596669750231, + "grad_norm": 3.6135289669036865, + "learning_rate": 4.50642006926693e-06, + "loss": 0.9494, + "step": 7077 + }, + { + "epoch": 0.54563675609004, + "grad_norm": 3.883880138397217, + "learning_rate": 4.505177735273489e-06, + "loss": 0.9959, + "step": 7078 + }, + { + "epoch": 0.5457138452050571, + "grad_norm": 3.969799518585205, + "learning_rate": 4.503935432129656e-06, + "loss": 0.9272, + "step": 7079 + }, + { + "epoch": 0.545790934320074, + "grad_norm": 4.497955799102783, + "learning_rate": 4.5026931599128774e-06, + "loss": 0.9667, + "step": 7080 + }, + { + "epoch": 0.545868023435091, + "grad_norm": 3.585750102996826, + "learning_rate": 4.501450918700605e-06, + "loss": 0.9303, + "step": 7081 + }, + { + "epoch": 0.5459451125501079, + "grad_norm": 3.6245148181915283, + "learning_rate": 4.5002087085702865e-06, + "loss": 0.9388, + "step": 7082 + }, + { + "epoch": 0.5460222016651248, + "grad_norm": 3.6533985137939453, + "learning_rate": 4.498966529599365e-06, + "loss": 1.0059, + "step": 7083 + }, + { + "epoch": 0.5460992907801419, + "grad_norm": 3.751574754714966, + "learning_rate": 4.497724381865287e-06, + "loss": 0.9515, + "step": 7084 + }, + { + "epoch": 0.5461763798951588, + "grad_norm": 3.4590702056884766, + "learning_rate": 4.496482265445492e-06, + "loss": 0.9293, + "step": 7085 + }, + { + "epoch": 0.5462534690101758, + "grad_norm": 3.8915021419525146, + "learning_rate": 4.49524018041742e-06, + "loss": 1.051, + "step": 7086 + }, + { + "epoch": 0.5463305581251927, + "grad_norm": 4.0933990478515625, + "learning_rate": 4.493998126858508e-06, + "loss": 0.9782, + "step": 7087 + }, + { + "epoch": 0.5464076472402097, + "grad_norm": 3.402745485305786, + "learning_rate": 4.492756104846193e-06, + "loss": 0.9318, + "step": 7088 + }, + { + "epoch": 0.5464847363552267, + "grad_norm": 3.519670009613037, + "learning_rate": 4.491514114457907e-06, + "loss": 0.9108, + "step": 7089 + }, + { + "epoch": 0.5465618254702436, + "grad_norm": 3.736721992492676, + "learning_rate": 4.490272155771084e-06, + "loss": 0.8759, + "step": 7090 + }, + { + "epoch": 0.5466389145852606, + "grad_norm": 3.819697141647339, + "learning_rate": 4.489030228863151e-06, + "loss": 1.0047, + "step": 7091 + }, + { + "epoch": 0.5467160037002775, + "grad_norm": 3.8831934928894043, + "learning_rate": 4.487788333811536e-06, + "loss": 0.9225, + "step": 7092 + }, + { + "epoch": 0.5467930928152945, + "grad_norm": 3.722339153289795, + "learning_rate": 4.486546470693667e-06, + "loss": 0.8511, + "step": 7093 + }, + { + "epoch": 0.5468701819303114, + "grad_norm": 3.325698137283325, + "learning_rate": 4.4853046395869655e-06, + "loss": 0.8824, + "step": 7094 + }, + { + "epoch": 0.5469472710453284, + "grad_norm": 3.8227202892303467, + "learning_rate": 4.484062840568854e-06, + "loss": 0.918, + "step": 7095 + }, + { + "epoch": 0.5470243601603454, + "grad_norm": 3.414175033569336, + "learning_rate": 4.4828210737167535e-06, + "loss": 0.9069, + "step": 7096 + }, + { + "epoch": 0.5471014492753623, + "grad_norm": 3.996797800064087, + "learning_rate": 4.481579339108079e-06, + "loss": 1.0326, + "step": 7097 + }, + { + "epoch": 0.5471785383903793, + "grad_norm": 3.6179771423339844, + "learning_rate": 4.480337636820249e-06, + "loss": 0.9529, + "step": 7098 + }, + { + "epoch": 0.5472556275053962, + "grad_norm": 3.6945784091949463, + "learning_rate": 4.479095966930674e-06, + "loss": 1.1146, + "step": 7099 + }, + { + "epoch": 0.5473327166204132, + "grad_norm": 3.5141420364379883, + "learning_rate": 4.477854329516769e-06, + "loss": 0.8478, + "step": 7100 + }, + { + "epoch": 0.5474098057354302, + "grad_norm": 3.9102392196655273, + "learning_rate": 4.476612724655943e-06, + "loss": 0.9833, + "step": 7101 + }, + { + "epoch": 0.5474868948504471, + "grad_norm": 3.6566107273101807, + "learning_rate": 4.475371152425599e-06, + "loss": 1.0023, + "step": 7102 + }, + { + "epoch": 0.5475639839654641, + "grad_norm": 3.598893880844116, + "learning_rate": 4.47412961290315e-06, + "loss": 0.9257, + "step": 7103 + }, + { + "epoch": 0.547641073080481, + "grad_norm": 3.900968313217163, + "learning_rate": 4.472888106165995e-06, + "loss": 0.9892, + "step": 7104 + }, + { + "epoch": 0.547718162195498, + "grad_norm": 3.7693870067596436, + "learning_rate": 4.471646632291539e-06, + "loss": 0.9436, + "step": 7105 + }, + { + "epoch": 0.547795251310515, + "grad_norm": 3.838836193084717, + "learning_rate": 4.470405191357176e-06, + "loss": 0.9466, + "step": 7106 + }, + { + "epoch": 0.5478723404255319, + "grad_norm": 3.1654272079467773, + "learning_rate": 4.469163783440308e-06, + "loss": 0.823, + "step": 7107 + }, + { + "epoch": 0.5479494295405489, + "grad_norm": 3.3918416500091553, + "learning_rate": 4.467922408618329e-06, + "loss": 0.9723, + "step": 7108 + }, + { + "epoch": 0.5480265186555658, + "grad_norm": 4.015628814697266, + "learning_rate": 4.466681066968632e-06, + "loss": 0.9698, + "step": 7109 + }, + { + "epoch": 0.5481036077705828, + "grad_norm": 3.8677635192871094, + "learning_rate": 4.465439758568607e-06, + "loss": 0.9895, + "step": 7110 + }, + { + "epoch": 0.5481806968855998, + "grad_norm": 3.577852249145508, + "learning_rate": 4.464198483495647e-06, + "loss": 0.9398, + "step": 7111 + }, + { + "epoch": 0.5482577860006167, + "grad_norm": 3.617177724838257, + "learning_rate": 4.462957241827135e-06, + "loss": 0.9324, + "step": 7112 + }, + { + "epoch": 0.5483348751156337, + "grad_norm": 3.606205463409424, + "learning_rate": 4.461716033640457e-06, + "loss": 0.916, + "step": 7113 + }, + { + "epoch": 0.5484119642306506, + "grad_norm": 3.655104637145996, + "learning_rate": 4.460474859012998e-06, + "loss": 0.9226, + "step": 7114 + }, + { + "epoch": 0.5484890533456676, + "grad_norm": 3.676701068878174, + "learning_rate": 4.4592337180221365e-06, + "loss": 0.9754, + "step": 7115 + }, + { + "epoch": 0.5485661424606846, + "grad_norm": 3.2556674480438232, + "learning_rate": 4.457992610745252e-06, + "loss": 0.8332, + "step": 7116 + }, + { + "epoch": 0.5486432315757015, + "grad_norm": 3.65635347366333, + "learning_rate": 4.456751537259721e-06, + "loss": 0.9277, + "step": 7117 + }, + { + "epoch": 0.5487203206907185, + "grad_norm": 3.583652973175049, + "learning_rate": 4.455510497642918e-06, + "loss": 0.9282, + "step": 7118 + }, + { + "epoch": 0.5487974098057354, + "grad_norm": 3.441347599029541, + "learning_rate": 4.454269491972215e-06, + "loss": 0.8934, + "step": 7119 + }, + { + "epoch": 0.5488744989207524, + "grad_norm": 3.9320881366729736, + "learning_rate": 4.453028520324984e-06, + "loss": 0.9556, + "step": 7120 + }, + { + "epoch": 0.5489515880357694, + "grad_norm": 3.3631417751312256, + "learning_rate": 4.4517875827785905e-06, + "loss": 0.9234, + "step": 7121 + }, + { + "epoch": 0.5490286771507863, + "grad_norm": 3.7885899543762207, + "learning_rate": 4.450546679410403e-06, + "loss": 1.0157, + "step": 7122 + }, + { + "epoch": 0.5491057662658033, + "grad_norm": 3.6120402812957764, + "learning_rate": 4.449305810297784e-06, + "loss": 0.8965, + "step": 7123 + }, + { + "epoch": 0.5491828553808202, + "grad_norm": 3.5970468521118164, + "learning_rate": 4.448064975518096e-06, + "loss": 0.9291, + "step": 7124 + }, + { + "epoch": 0.5492599444958371, + "grad_norm": 3.6783077716827393, + "learning_rate": 4.446824175148699e-06, + "loss": 0.82, + "step": 7125 + }, + { + "epoch": 0.5493370336108542, + "grad_norm": 3.4367566108703613, + "learning_rate": 4.445583409266947e-06, + "loss": 1.0236, + "step": 7126 + }, + { + "epoch": 0.5494141227258711, + "grad_norm": 4.298835277557373, + "learning_rate": 4.444342677950201e-06, + "loss": 1.0732, + "step": 7127 + }, + { + "epoch": 0.5494912118408881, + "grad_norm": 3.573850631713867, + "learning_rate": 4.443101981275809e-06, + "loss": 0.9049, + "step": 7128 + }, + { + "epoch": 0.549568300955905, + "grad_norm": 3.9243526458740234, + "learning_rate": 4.441861319321125e-06, + "loss": 1.0487, + "step": 7129 + }, + { + "epoch": 0.549645390070922, + "grad_norm": 3.5670223236083984, + "learning_rate": 4.440620692163498e-06, + "loss": 0.9543, + "step": 7130 + }, + { + "epoch": 0.549722479185939, + "grad_norm": 5.046988010406494, + "learning_rate": 4.439380099880272e-06, + "loss": 1.1399, + "step": 7131 + }, + { + "epoch": 0.5497995683009559, + "grad_norm": 4.248262405395508, + "learning_rate": 4.438139542548795e-06, + "loss": 0.9838, + "step": 7132 + }, + { + "epoch": 0.5498766574159729, + "grad_norm": 3.758612632751465, + "learning_rate": 4.436899020246407e-06, + "loss": 0.8835, + "step": 7133 + }, + { + "epoch": 0.5499537465309898, + "grad_norm": 3.501319408416748, + "learning_rate": 4.435658533050448e-06, + "loss": 0.8992, + "step": 7134 + }, + { + "epoch": 0.5500308356460067, + "grad_norm": 3.7768168449401855, + "learning_rate": 4.434418081038256e-06, + "loss": 0.9216, + "step": 7135 + }, + { + "epoch": 0.5501079247610238, + "grad_norm": 3.8390257358551025, + "learning_rate": 4.43317766428717e-06, + "loss": 0.997, + "step": 7136 + }, + { + "epoch": 0.5501850138760407, + "grad_norm": 3.9735095500946045, + "learning_rate": 4.4319372828745185e-06, + "loss": 1.0097, + "step": 7137 + }, + { + "epoch": 0.5502621029910577, + "grad_norm": 3.6092162132263184, + "learning_rate": 4.430696936877638e-06, + "loss": 0.9526, + "step": 7138 + }, + { + "epoch": 0.5503391921060746, + "grad_norm": 3.396688461303711, + "learning_rate": 4.429456626373853e-06, + "loss": 0.8063, + "step": 7139 + }, + { + "epoch": 0.5504162812210915, + "grad_norm": 3.1804189682006836, + "learning_rate": 4.428216351440492e-06, + "loss": 0.733, + "step": 7140 + }, + { + "epoch": 0.5504933703361086, + "grad_norm": 3.653811454772949, + "learning_rate": 4.4269761121548815e-06, + "loss": 1.1142, + "step": 7141 + }, + { + "epoch": 0.5505704594511255, + "grad_norm": 3.4570720195770264, + "learning_rate": 4.425735908594342e-06, + "loss": 0.8599, + "step": 7142 + }, + { + "epoch": 0.5506475485661425, + "grad_norm": 3.407050609588623, + "learning_rate": 4.424495740836193e-06, + "loss": 0.8526, + "step": 7143 + }, + { + "epoch": 0.5507246376811594, + "grad_norm": 4.278409481048584, + "learning_rate": 4.4232556089577575e-06, + "loss": 1.0937, + "step": 7144 + }, + { + "epoch": 0.5508017267961763, + "grad_norm": 3.6288211345672607, + "learning_rate": 4.4220155130363445e-06, + "loss": 1.0142, + "step": 7145 + }, + { + "epoch": 0.5508788159111934, + "grad_norm": 4.041045188903809, + "learning_rate": 4.420775453149273e-06, + "loss": 0.955, + "step": 7146 + }, + { + "epoch": 0.5509559050262103, + "grad_norm": 3.636233329772949, + "learning_rate": 4.4195354293738484e-06, + "loss": 0.937, + "step": 7147 + }, + { + "epoch": 0.5510329941412273, + "grad_norm": 3.6226770877838135, + "learning_rate": 4.418295441787387e-06, + "loss": 0.9552, + "step": 7148 + }, + { + "epoch": 0.5511100832562442, + "grad_norm": 3.7020938396453857, + "learning_rate": 4.417055490467191e-06, + "loss": 1.002, + "step": 7149 + }, + { + "epoch": 0.5511871723712611, + "grad_norm": 3.8105432987213135, + "learning_rate": 4.4158155754905655e-06, + "loss": 0.9974, + "step": 7150 + }, + { + "epoch": 0.5512642614862782, + "grad_norm": 3.620668888092041, + "learning_rate": 4.414575696934814e-06, + "loss": 0.9996, + "step": 7151 + }, + { + "epoch": 0.5513413506012951, + "grad_norm": 3.2780075073242188, + "learning_rate": 4.413335854877237e-06, + "loss": 0.9061, + "step": 7152 + }, + { + "epoch": 0.5514184397163121, + "grad_norm": 3.7314953804016113, + "learning_rate": 4.41209604939513e-06, + "loss": 1.008, + "step": 7153 + }, + { + "epoch": 0.551495528831329, + "grad_norm": 3.6991615295410156, + "learning_rate": 4.4108562805657886e-06, + "loss": 0.9696, + "step": 7154 + }, + { + "epoch": 0.5515726179463459, + "grad_norm": 3.4623279571533203, + "learning_rate": 4.409616548466508e-06, + "loss": 0.8454, + "step": 7155 + }, + { + "epoch": 0.551649707061363, + "grad_norm": 3.6211092472076416, + "learning_rate": 4.408376853174578e-06, + "loss": 0.9226, + "step": 7156 + }, + { + "epoch": 0.5517267961763799, + "grad_norm": 3.806649684906006, + "learning_rate": 4.407137194767286e-06, + "loss": 0.9433, + "step": 7157 + }, + { + "epoch": 0.5518038852913969, + "grad_norm": 3.5186119079589844, + "learning_rate": 4.40589757332192e-06, + "loss": 0.9126, + "step": 7158 + }, + { + "epoch": 0.5518809744064138, + "grad_norm": 3.941894054412842, + "learning_rate": 4.404657988915764e-06, + "loss": 1.053, + "step": 7159 + }, + { + "epoch": 0.5519580635214307, + "grad_norm": 3.503800868988037, + "learning_rate": 4.4034184416260975e-06, + "loss": 0.8895, + "step": 7160 + }, + { + "epoch": 0.5520351526364478, + "grad_norm": 3.7842636108398438, + "learning_rate": 4.402178931530202e-06, + "loss": 0.9407, + "step": 7161 + }, + { + "epoch": 0.5521122417514647, + "grad_norm": 3.6619977951049805, + "learning_rate": 4.400939458705356e-06, + "loss": 0.9676, + "step": 7162 + }, + { + "epoch": 0.5521893308664817, + "grad_norm": 3.5588581562042236, + "learning_rate": 4.3997000232288295e-06, + "loss": 0.8884, + "step": 7163 + }, + { + "epoch": 0.5522664199814986, + "grad_norm": 3.8872478008270264, + "learning_rate": 4.398460625177899e-06, + "loss": 1.0666, + "step": 7164 + }, + { + "epoch": 0.5523435090965155, + "grad_norm": 3.6066267490386963, + "learning_rate": 4.397221264629833e-06, + "loss": 0.8131, + "step": 7165 + }, + { + "epoch": 0.5524205982115326, + "grad_norm": 4.402515411376953, + "learning_rate": 4.395981941661897e-06, + "loss": 0.9393, + "step": 7166 + }, + { + "epoch": 0.5524976873265495, + "grad_norm": 3.6887872219085693, + "learning_rate": 4.394742656351362e-06, + "loss": 0.8782, + "step": 7167 + }, + { + "epoch": 0.5525747764415665, + "grad_norm": 3.382742404937744, + "learning_rate": 4.393503408775485e-06, + "loss": 0.9317, + "step": 7168 + }, + { + "epoch": 0.5526518655565834, + "grad_norm": 3.7242555618286133, + "learning_rate": 4.39226419901153e-06, + "loss": 0.9919, + "step": 7169 + }, + { + "epoch": 0.5527289546716003, + "grad_norm": 3.5612120628356934, + "learning_rate": 4.391025027136756e-06, + "loss": 1.0148, + "step": 7170 + }, + { + "epoch": 0.5528060437866174, + "grad_norm": 3.597330331802368, + "learning_rate": 4.389785893228416e-06, + "loss": 0.9333, + "step": 7171 + }, + { + "epoch": 0.5528831329016343, + "grad_norm": 3.885812520980835, + "learning_rate": 4.388546797363767e-06, + "loss": 0.949, + "step": 7172 + }, + { + "epoch": 0.5529602220166513, + "grad_norm": 3.399912118911743, + "learning_rate": 4.387307739620057e-06, + "loss": 0.906, + "step": 7173 + }, + { + "epoch": 0.5530373111316682, + "grad_norm": 3.6478137969970703, + "learning_rate": 4.386068720074536e-06, + "loss": 0.9246, + "step": 7174 + }, + { + "epoch": 0.5531144002466851, + "grad_norm": 3.986933946609497, + "learning_rate": 4.384829738804452e-06, + "loss": 0.9272, + "step": 7175 + }, + { + "epoch": 0.5531914893617021, + "grad_norm": 3.6242878437042236, + "learning_rate": 4.383590795887046e-06, + "loss": 0.9373, + "step": 7176 + }, + { + "epoch": 0.5532685784767191, + "grad_norm": 3.7404351234436035, + "learning_rate": 4.382351891399562e-06, + "loss": 0.9936, + "step": 7177 + }, + { + "epoch": 0.5533456675917361, + "grad_norm": 3.527928590774536, + "learning_rate": 4.38111302541924e-06, + "loss": 0.947, + "step": 7178 + }, + { + "epoch": 0.553422756706753, + "grad_norm": 3.5108187198638916, + "learning_rate": 4.379874198023314e-06, + "loss": 0.8112, + "step": 7179 + }, + { + "epoch": 0.5534998458217699, + "grad_norm": 3.2686216831207275, + "learning_rate": 4.37863540928902e-06, + "loss": 0.8961, + "step": 7180 + }, + { + "epoch": 0.553576934936787, + "grad_norm": 3.910783290863037, + "learning_rate": 4.3773966592935914e-06, + "loss": 1.0448, + "step": 7181 + }, + { + "epoch": 0.5536540240518039, + "grad_norm": 3.619373083114624, + "learning_rate": 4.3761579481142555e-06, + "loss": 0.9193, + "step": 7182 + }, + { + "epoch": 0.5537311131668209, + "grad_norm": 3.356902837753296, + "learning_rate": 4.374919275828241e-06, + "loss": 0.8701, + "step": 7183 + }, + { + "epoch": 0.5538082022818378, + "grad_norm": 4.081925392150879, + "learning_rate": 4.373680642512773e-06, + "loss": 1.0359, + "step": 7184 + }, + { + "epoch": 0.5538852913968547, + "grad_norm": 3.6913883686065674, + "learning_rate": 4.372442048245071e-06, + "loss": 1.0076, + "step": 7185 + }, + { + "epoch": 0.5539623805118717, + "grad_norm": 3.53983736038208, + "learning_rate": 4.371203493102359e-06, + "loss": 0.9063, + "step": 7186 + }, + { + "epoch": 0.5540394696268887, + "grad_norm": 3.321823835372925, + "learning_rate": 4.369964977161853e-06, + "loss": 0.8075, + "step": 7187 + }, + { + "epoch": 0.5541165587419057, + "grad_norm": 3.6654202938079834, + "learning_rate": 4.368726500500765e-06, + "loss": 0.9791, + "step": 7188 + }, + { + "epoch": 0.5541936478569226, + "grad_norm": 3.718724012374878, + "learning_rate": 4.367488063196313e-06, + "loss": 0.9182, + "step": 7189 + }, + { + "epoch": 0.5542707369719395, + "grad_norm": 3.7318592071533203, + "learning_rate": 4.366249665325703e-06, + "loss": 0.9435, + "step": 7190 + }, + { + "epoch": 0.5543478260869565, + "grad_norm": 3.3900840282440186, + "learning_rate": 4.3650113069661426e-06, + "loss": 0.7534, + "step": 7191 + }, + { + "epoch": 0.5544249152019735, + "grad_norm": 3.4240376949310303, + "learning_rate": 4.36377298819484e-06, + "loss": 0.8855, + "step": 7192 + }, + { + "epoch": 0.5545020043169905, + "grad_norm": 4.262863636016846, + "learning_rate": 4.3625347090889975e-06, + "loss": 0.9735, + "step": 7193 + }, + { + "epoch": 0.5545790934320074, + "grad_norm": 4.42769193649292, + "learning_rate": 4.361296469725813e-06, + "loss": 1.1063, + "step": 7194 + }, + { + "epoch": 0.5546561825470243, + "grad_norm": 4.841814041137695, + "learning_rate": 4.360058270182486e-06, + "loss": 0.9777, + "step": 7195 + }, + { + "epoch": 0.5547332716620413, + "grad_norm": 3.457386016845703, + "learning_rate": 4.3588201105362125e-06, + "loss": 0.9072, + "step": 7196 + }, + { + "epoch": 0.5548103607770583, + "grad_norm": 3.6573784351348877, + "learning_rate": 4.357581990864183e-06, + "loss": 0.8965, + "step": 7197 + }, + { + "epoch": 0.5548874498920753, + "grad_norm": 3.5548505783081055, + "learning_rate": 4.3563439112435904e-06, + "loss": 0.7159, + "step": 7198 + }, + { + "epoch": 0.5549645390070922, + "grad_norm": 3.5277884006500244, + "learning_rate": 4.355105871751621e-06, + "loss": 1.0291, + "step": 7199 + }, + { + "epoch": 0.5550416281221091, + "grad_norm": 3.4423916339874268, + "learning_rate": 4.35386787246546e-06, + "loss": 0.8138, + "step": 7200 + }, + { + "epoch": 0.5551187172371261, + "grad_norm": 4.191361904144287, + "learning_rate": 4.352629913462292e-06, + "loss": 0.9909, + "step": 7201 + }, + { + "epoch": 0.555195806352143, + "grad_norm": 3.601893663406372, + "learning_rate": 4.351391994819294e-06, + "loss": 0.9646, + "step": 7202 + }, + { + "epoch": 0.5552728954671601, + "grad_norm": 3.6490609645843506, + "learning_rate": 4.350154116613648e-06, + "loss": 0.9988, + "step": 7203 + }, + { + "epoch": 0.555349984582177, + "grad_norm": 3.3986620903015137, + "learning_rate": 4.348916278922526e-06, + "loss": 0.9039, + "step": 7204 + }, + { + "epoch": 0.5554270736971939, + "grad_norm": 4.264974117279053, + "learning_rate": 4.347678481823102e-06, + "loss": 1.0964, + "step": 7205 + }, + { + "epoch": 0.5555041628122109, + "grad_norm": 4.056002140045166, + "learning_rate": 4.346440725392546e-06, + "loss": 1.044, + "step": 7206 + }, + { + "epoch": 0.5555812519272278, + "grad_norm": 3.6536853313446045, + "learning_rate": 4.345203009708027e-06, + "loss": 1.0003, + "step": 7207 + }, + { + "epoch": 0.5556583410422449, + "grad_norm": 3.690647840499878, + "learning_rate": 4.343965334846708e-06, + "loss": 0.9672, + "step": 7208 + }, + { + "epoch": 0.5557354301572618, + "grad_norm": 3.597975969314575, + "learning_rate": 4.3427277008857526e-06, + "loss": 0.9204, + "step": 7209 + }, + { + "epoch": 0.5558125192722787, + "grad_norm": 3.5312154293060303, + "learning_rate": 4.341490107902323e-06, + "loss": 0.9343, + "step": 7210 + }, + { + "epoch": 0.5558896083872957, + "grad_norm": 3.8047034740448, + "learning_rate": 4.340252555973572e-06, + "loss": 1.0683, + "step": 7211 + }, + { + "epoch": 0.5559666975023126, + "grad_norm": 3.7284018993377686, + "learning_rate": 4.339015045176659e-06, + "loss": 0.8455, + "step": 7212 + }, + { + "epoch": 0.5560437866173297, + "grad_norm": 4.019394397735596, + "learning_rate": 4.3377775755887335e-06, + "loss": 1.0309, + "step": 7213 + }, + { + "epoch": 0.5561208757323466, + "grad_norm": 4.077296733856201, + "learning_rate": 4.336540147286946e-06, + "loss": 1.0705, + "step": 7214 + }, + { + "epoch": 0.5561979648473635, + "grad_norm": 3.3389627933502197, + "learning_rate": 4.335302760348446e-06, + "loss": 0.8988, + "step": 7215 + }, + { + "epoch": 0.5562750539623805, + "grad_norm": 3.577312469482422, + "learning_rate": 4.334065414850375e-06, + "loss": 0.9091, + "step": 7216 + }, + { + "epoch": 0.5563521430773974, + "grad_norm": 3.472486734390259, + "learning_rate": 4.3328281108698765e-06, + "loss": 0.9721, + "step": 7217 + }, + { + "epoch": 0.5564292321924145, + "grad_norm": 4.345758438110352, + "learning_rate": 4.33159084848409e-06, + "loss": 1.0537, + "step": 7218 + }, + { + "epoch": 0.5565063213074314, + "grad_norm": 3.5897371768951416, + "learning_rate": 4.330353627770153e-06, + "loss": 0.8934, + "step": 7219 + }, + { + "epoch": 0.5565834104224483, + "grad_norm": 4.042248725891113, + "learning_rate": 4.329116448805198e-06, + "loss": 1.1139, + "step": 7220 + }, + { + "epoch": 0.5566604995374653, + "grad_norm": 4.091678619384766, + "learning_rate": 4.327879311666359e-06, + "loss": 0.9429, + "step": 7221 + }, + { + "epoch": 0.5567375886524822, + "grad_norm": 3.7830512523651123, + "learning_rate": 4.326642216430763e-06, + "loss": 1.0191, + "step": 7222 + }, + { + "epoch": 0.5568146777674993, + "grad_norm": 3.766197919845581, + "learning_rate": 4.325405163175539e-06, + "loss": 0.983, + "step": 7223 + }, + { + "epoch": 0.5568917668825162, + "grad_norm": 3.5098490715026855, + "learning_rate": 4.3241681519778074e-06, + "loss": 0.9068, + "step": 7224 + }, + { + "epoch": 0.5569688559975331, + "grad_norm": 3.6200573444366455, + "learning_rate": 4.322931182914692e-06, + "loss": 0.8358, + "step": 7225 + }, + { + "epoch": 0.5570459451125501, + "grad_norm": 3.716994285583496, + "learning_rate": 4.321694256063311e-06, + "loss": 1.0499, + "step": 7226 + }, + { + "epoch": 0.557123034227567, + "grad_norm": 3.880415439605713, + "learning_rate": 4.3204573715007795e-06, + "loss": 0.9322, + "step": 7227 + }, + { + "epoch": 0.5572001233425841, + "grad_norm": 3.300236940383911, + "learning_rate": 4.3192205293042105e-06, + "loss": 0.8236, + "step": 7228 + }, + { + "epoch": 0.557277212457601, + "grad_norm": 4.212109565734863, + "learning_rate": 4.317983729550718e-06, + "loss": 0.9395, + "step": 7229 + }, + { + "epoch": 0.5573543015726179, + "grad_norm": 3.5828144550323486, + "learning_rate": 4.316746972317406e-06, + "loss": 1.0149, + "step": 7230 + }, + { + "epoch": 0.5574313906876349, + "grad_norm": 3.6307485103607178, + "learning_rate": 4.315510257681381e-06, + "loss": 1.0208, + "step": 7231 + }, + { + "epoch": 0.5575084798026518, + "grad_norm": 3.832481622695923, + "learning_rate": 4.314273585719749e-06, + "loss": 1.0138, + "step": 7232 + }, + { + "epoch": 0.5575855689176689, + "grad_norm": 3.533522367477417, + "learning_rate": 4.313036956509605e-06, + "loss": 1.0133, + "step": 7233 + }, + { + "epoch": 0.5576626580326858, + "grad_norm": 3.565697193145752, + "learning_rate": 4.3118003701280504e-06, + "loss": 1.0029, + "step": 7234 + }, + { + "epoch": 0.5577397471477027, + "grad_norm": 3.788064956665039, + "learning_rate": 4.310563826652175e-06, + "loss": 0.9481, + "step": 7235 + }, + { + "epoch": 0.5578168362627197, + "grad_norm": 3.4550724029541016, + "learning_rate": 4.3093273261590785e-06, + "loss": 0.8953, + "step": 7236 + }, + { + "epoch": 0.5578939253777366, + "grad_norm": 3.3875648975372314, + "learning_rate": 4.308090868725844e-06, + "loss": 0.928, + "step": 7237 + }, + { + "epoch": 0.5579710144927537, + "grad_norm": 4.244081497192383, + "learning_rate": 4.306854454429561e-06, + "loss": 0.9824, + "step": 7238 + }, + { + "epoch": 0.5580481036077706, + "grad_norm": 3.3602986335754395, + "learning_rate": 4.3056180833473135e-06, + "loss": 0.812, + "step": 7239 + }, + { + "epoch": 0.5581251927227875, + "grad_norm": 3.6101772785186768, + "learning_rate": 4.304381755556182e-06, + "loss": 0.8871, + "step": 7240 + }, + { + "epoch": 0.5582022818378045, + "grad_norm": 3.709418535232544, + "learning_rate": 4.303145471133246e-06, + "loss": 0.9672, + "step": 7241 + }, + { + "epoch": 0.5582793709528214, + "grad_norm": 3.9773929119110107, + "learning_rate": 4.301909230155579e-06, + "loss": 0.9546, + "step": 7242 + }, + { + "epoch": 0.5583564600678385, + "grad_norm": 3.4679555892944336, + "learning_rate": 4.300673032700256e-06, + "loss": 0.8538, + "step": 7243 + }, + { + "epoch": 0.5584335491828554, + "grad_norm": 3.525493860244751, + "learning_rate": 4.2994368788443496e-06, + "loss": 0.9495, + "step": 7244 + }, + { + "epoch": 0.5585106382978723, + "grad_norm": 3.462528705596924, + "learning_rate": 4.298200768664924e-06, + "loss": 0.9793, + "step": 7245 + }, + { + "epoch": 0.5585877274128893, + "grad_norm": 3.402458906173706, + "learning_rate": 4.296964702239046e-06, + "loss": 0.8481, + "step": 7246 + }, + { + "epoch": 0.5586648165279062, + "grad_norm": 3.4688830375671387, + "learning_rate": 4.295728679643778e-06, + "loss": 0.898, + "step": 7247 + }, + { + "epoch": 0.5587419056429233, + "grad_norm": 3.5132532119750977, + "learning_rate": 4.2944927009561786e-06, + "loss": 0.9714, + "step": 7248 + }, + { + "epoch": 0.5588189947579402, + "grad_norm": 3.7128775119781494, + "learning_rate": 4.293256766253307e-06, + "loss": 1.0389, + "step": 7249 + }, + { + "epoch": 0.5588960838729571, + "grad_norm": 3.6854405403137207, + "learning_rate": 4.2920208756122136e-06, + "loss": 0.9308, + "step": 7250 + }, + { + "epoch": 0.5589731729879741, + "grad_norm": 3.7184994220733643, + "learning_rate": 4.290785029109953e-06, + "loss": 0.9581, + "step": 7251 + }, + { + "epoch": 0.559050262102991, + "grad_norm": 3.4944541454315186, + "learning_rate": 4.2895492268235725e-06, + "loss": 0.9439, + "step": 7252 + }, + { + "epoch": 0.559127351218008, + "grad_norm": 3.616788387298584, + "learning_rate": 4.288313468830119e-06, + "loss": 0.8744, + "step": 7253 + }, + { + "epoch": 0.559204440333025, + "grad_norm": 3.7554306983947754, + "learning_rate": 4.2870777552066336e-06, + "loss": 0.8796, + "step": 7254 + }, + { + "epoch": 0.559281529448042, + "grad_norm": 3.7401881217956543, + "learning_rate": 4.285842086030159e-06, + "loss": 0.8975, + "step": 7255 + }, + { + "epoch": 0.5593586185630589, + "grad_norm": 3.8018808364868164, + "learning_rate": 4.28460646137773e-06, + "loss": 0.827, + "step": 7256 + }, + { + "epoch": 0.5594357076780758, + "grad_norm": 3.6006662845611572, + "learning_rate": 4.283370881326384e-06, + "loss": 0.9216, + "step": 7257 + }, + { + "epoch": 0.5595127967930928, + "grad_norm": 3.953252077102661, + "learning_rate": 4.2821353459531525e-06, + "loss": 0.9649, + "step": 7258 + }, + { + "epoch": 0.5595898859081098, + "grad_norm": 3.724891185760498, + "learning_rate": 4.2808998553350625e-06, + "loss": 1.0039, + "step": 7259 + }, + { + "epoch": 0.5596669750231268, + "grad_norm": 3.795192241668701, + "learning_rate": 4.279664409549145e-06, + "loss": 0.9401, + "step": 7260 + }, + { + "epoch": 0.5597440641381437, + "grad_norm": 4.127142906188965, + "learning_rate": 4.2784290086724186e-06, + "loss": 0.9922, + "step": 7261 + }, + { + "epoch": 0.5598211532531606, + "grad_norm": 3.5862457752227783, + "learning_rate": 4.277193652781906e-06, + "loss": 0.9433, + "step": 7262 + }, + { + "epoch": 0.5598982423681776, + "grad_norm": 4.025807857513428, + "learning_rate": 4.275958341954628e-06, + "loss": 0.8679, + "step": 7263 + }, + { + "epoch": 0.5599753314831946, + "grad_norm": 3.7022628784179688, + "learning_rate": 4.274723076267596e-06, + "loss": 0.8451, + "step": 7264 + }, + { + "epoch": 0.5600524205982116, + "grad_norm": 3.6183743476867676, + "learning_rate": 4.273487855797823e-06, + "loss": 0.8063, + "step": 7265 + }, + { + "epoch": 0.5601295097132285, + "grad_norm": 3.7264175415039062, + "learning_rate": 4.272252680622321e-06, + "loss": 0.8843, + "step": 7266 + }, + { + "epoch": 0.5602065988282454, + "grad_norm": 3.512242317199707, + "learning_rate": 4.271017550818095e-06, + "loss": 0.925, + "step": 7267 + }, + { + "epoch": 0.5602836879432624, + "grad_norm": 3.9300243854522705, + "learning_rate": 4.269782466462149e-06, + "loss": 0.949, + "step": 7268 + }, + { + "epoch": 0.5603607770582794, + "grad_norm": 3.0627076625823975, + "learning_rate": 4.268547427631485e-06, + "loss": 0.8389, + "step": 7269 + }, + { + "epoch": 0.5604378661732964, + "grad_norm": 3.468635082244873, + "learning_rate": 4.267312434403099e-06, + "loss": 0.911, + "step": 7270 + }, + { + "epoch": 0.5605149552883133, + "grad_norm": 3.891932249069214, + "learning_rate": 4.26607748685399e-06, + "loss": 0.9965, + "step": 7271 + }, + { + "epoch": 0.5605920444033302, + "grad_norm": 3.7001166343688965, + "learning_rate": 4.2648425850611465e-06, + "loss": 0.8025, + "step": 7272 + }, + { + "epoch": 0.5606691335183472, + "grad_norm": 3.481006145477295, + "learning_rate": 4.2636077291015615e-06, + "loss": 0.8563, + "step": 7273 + }, + { + "epoch": 0.5607462226333642, + "grad_norm": 3.4894566535949707, + "learning_rate": 4.262372919052221e-06, + "loss": 0.9797, + "step": 7274 + }, + { + "epoch": 0.5608233117483812, + "grad_norm": 3.728573799133301, + "learning_rate": 4.2611381549901085e-06, + "loss": 0.8734, + "step": 7275 + }, + { + "epoch": 0.5609004008633981, + "grad_norm": 3.5050055980682373, + "learning_rate": 4.259903436992204e-06, + "loss": 0.8772, + "step": 7276 + }, + { + "epoch": 0.560977489978415, + "grad_norm": 3.7864038944244385, + "learning_rate": 4.258668765135489e-06, + "loss": 0.939, + "step": 7277 + }, + { + "epoch": 0.561054579093432, + "grad_norm": 3.818662405014038, + "learning_rate": 4.257434139496937e-06, + "loss": 1.0586, + "step": 7278 + }, + { + "epoch": 0.561131668208449, + "grad_norm": 4.216384410858154, + "learning_rate": 4.25619956015352e-06, + "loss": 1.0841, + "step": 7279 + }, + { + "epoch": 0.561208757323466, + "grad_norm": 3.877438545227051, + "learning_rate": 4.254965027182206e-06, + "loss": 0.8282, + "step": 7280 + }, + { + "epoch": 0.5612858464384829, + "grad_norm": 3.9610254764556885, + "learning_rate": 4.253730540659966e-06, + "loss": 1.0798, + "step": 7281 + }, + { + "epoch": 0.5613629355534998, + "grad_norm": 3.5456466674804688, + "learning_rate": 4.252496100663762e-06, + "loss": 0.8978, + "step": 7282 + }, + { + "epoch": 0.5614400246685168, + "grad_norm": 3.5786006450653076, + "learning_rate": 4.2512617072705546e-06, + "loss": 0.956, + "step": 7283 + }, + { + "epoch": 0.5615171137835338, + "grad_norm": 3.5269174575805664, + "learning_rate": 4.250027360557302e-06, + "loss": 0.8546, + "step": 7284 + }, + { + "epoch": 0.5615942028985508, + "grad_norm": 3.479971408843994, + "learning_rate": 4.248793060600959e-06, + "loss": 0.832, + "step": 7285 + }, + { + "epoch": 0.5616712920135677, + "grad_norm": 3.701209783554077, + "learning_rate": 4.247558807478478e-06, + "loss": 1.0244, + "step": 7286 + }, + { + "epoch": 0.5617483811285846, + "grad_norm": 3.449061393737793, + "learning_rate": 4.246324601266809e-06, + "loss": 0.8604, + "step": 7287 + }, + { + "epoch": 0.5618254702436016, + "grad_norm": 3.8695156574249268, + "learning_rate": 4.245090442042897e-06, + "loss": 1.0627, + "step": 7288 + }, + { + "epoch": 0.5619025593586185, + "grad_norm": 3.7182259559631348, + "learning_rate": 4.243856329883687e-06, + "loss": 0.9507, + "step": 7289 + }, + { + "epoch": 0.5619796484736356, + "grad_norm": 3.4913182258605957, + "learning_rate": 4.242622264866118e-06, + "loss": 0.9745, + "step": 7290 + }, + { + "epoch": 0.5620567375886525, + "grad_norm": 3.5762779712677, + "learning_rate": 4.241388247067128e-06, + "loss": 0.899, + "step": 7291 + }, + { + "epoch": 0.5621338267036694, + "grad_norm": 3.425102472305298, + "learning_rate": 4.240154276563652e-06, + "loss": 0.9087, + "step": 7292 + }, + { + "epoch": 0.5622109158186864, + "grad_norm": 3.7708728313446045, + "learning_rate": 4.238920353432623e-06, + "loss": 0.9664, + "step": 7293 + }, + { + "epoch": 0.5622880049337033, + "grad_norm": 3.508423089981079, + "learning_rate": 4.237686477750966e-06, + "loss": 0.9499, + "step": 7294 + }, + { + "epoch": 0.5623650940487204, + "grad_norm": 3.62243390083313, + "learning_rate": 4.23645264959561e-06, + "loss": 0.9453, + "step": 7295 + }, + { + "epoch": 0.5624421831637373, + "grad_norm": 3.7550387382507324, + "learning_rate": 4.235218869043477e-06, + "loss": 0.9756, + "step": 7296 + }, + { + "epoch": 0.5625192722787542, + "grad_norm": 3.69486927986145, + "learning_rate": 4.233985136171487e-06, + "loss": 0.9725, + "step": 7297 + }, + { + "epoch": 0.5625963613937712, + "grad_norm": 3.8165245056152344, + "learning_rate": 4.232751451056555e-06, + "loss": 0.917, + "step": 7298 + }, + { + "epoch": 0.5626734505087881, + "grad_norm": 3.5180726051330566, + "learning_rate": 4.231517813775597e-06, + "loss": 0.8487, + "step": 7299 + }, + { + "epoch": 0.5627505396238052, + "grad_norm": 3.76409912109375, + "learning_rate": 4.230284224405523e-06, + "loss": 0.9902, + "step": 7300 + }, + { + "epoch": 0.5628276287388221, + "grad_norm": 3.4409983158111572, + "learning_rate": 4.2290506830232415e-06, + "loss": 0.8749, + "step": 7301 + }, + { + "epoch": 0.562904717853839, + "grad_norm": 3.861907958984375, + "learning_rate": 4.227817189705657e-06, + "loss": 0.8812, + "step": 7302 + }, + { + "epoch": 0.562981806968856, + "grad_norm": 3.839994192123413, + "learning_rate": 4.226583744529672e-06, + "loss": 0.954, + "step": 7303 + }, + { + "epoch": 0.5630588960838729, + "grad_norm": 4.001551151275635, + "learning_rate": 4.225350347572185e-06, + "loss": 0.8083, + "step": 7304 + }, + { + "epoch": 0.56313598519889, + "grad_norm": 3.7375106811523438, + "learning_rate": 4.22411699891009e-06, + "loss": 1.0402, + "step": 7305 + }, + { + "epoch": 0.5632130743139069, + "grad_norm": 3.5613088607788086, + "learning_rate": 4.2228836986202845e-06, + "loss": 0.9585, + "step": 7306 + }, + { + "epoch": 0.5632901634289238, + "grad_norm": 3.4407291412353516, + "learning_rate": 4.221650446779653e-06, + "loss": 0.9678, + "step": 7307 + }, + { + "epoch": 0.5633672525439408, + "grad_norm": 3.501506805419922, + "learning_rate": 4.2204172434650866e-06, + "loss": 0.9946, + "step": 7308 + }, + { + "epoch": 0.5634443416589577, + "grad_norm": 3.91636061668396, + "learning_rate": 4.219184088753467e-06, + "loss": 1.0566, + "step": 7309 + }, + { + "epoch": 0.5635214307739748, + "grad_norm": 3.84093976020813, + "learning_rate": 4.217950982721675e-06, + "loss": 0.9006, + "step": 7310 + }, + { + "epoch": 0.5635985198889917, + "grad_norm": 3.702842950820923, + "learning_rate": 4.21671792544659e-06, + "loss": 0.9995, + "step": 7311 + }, + { + "epoch": 0.5636756090040086, + "grad_norm": 3.6409497261047363, + "learning_rate": 4.215484917005085e-06, + "loss": 0.9295, + "step": 7312 + }, + { + "epoch": 0.5637526981190256, + "grad_norm": 3.425840377807617, + "learning_rate": 4.2142519574740315e-06, + "loss": 0.8669, + "step": 7313 + }, + { + "epoch": 0.5638297872340425, + "grad_norm": 3.576141595840454, + "learning_rate": 4.2130190469303005e-06, + "loss": 0.9028, + "step": 7314 + }, + { + "epoch": 0.5639068763490596, + "grad_norm": 3.4973714351654053, + "learning_rate": 4.211786185450756e-06, + "loss": 0.8018, + "step": 7315 + }, + { + "epoch": 0.5639839654640765, + "grad_norm": 3.3027119636535645, + "learning_rate": 4.210553373112259e-06, + "loss": 0.8958, + "step": 7316 + }, + { + "epoch": 0.5640610545790934, + "grad_norm": 4.032995223999023, + "learning_rate": 4.209320609991672e-06, + "loss": 0.9982, + "step": 7317 + }, + { + "epoch": 0.5641381436941104, + "grad_norm": 3.718665838241577, + "learning_rate": 4.208087896165849e-06, + "loss": 0.9066, + "step": 7318 + }, + { + "epoch": 0.5642152328091273, + "grad_norm": 3.670677900314331, + "learning_rate": 4.206855231711645e-06, + "loss": 1.0318, + "step": 7319 + }, + { + "epoch": 0.5642923219241444, + "grad_norm": 3.2902212142944336, + "learning_rate": 4.205622616705909e-06, + "loss": 0.896, + "step": 7320 + }, + { + "epoch": 0.5643694110391613, + "grad_norm": 3.813958168029785, + "learning_rate": 4.204390051225488e-06, + "loss": 1.0047, + "step": 7321 + }, + { + "epoch": 0.5644465001541782, + "grad_norm": 3.3876993656158447, + "learning_rate": 4.203157535347229e-06, + "loss": 0.9258, + "step": 7322 + }, + { + "epoch": 0.5645235892691952, + "grad_norm": 3.70788311958313, + "learning_rate": 4.201925069147969e-06, + "loss": 0.962, + "step": 7323 + }, + { + "epoch": 0.5646006783842121, + "grad_norm": 3.678077459335327, + "learning_rate": 4.200692652704545e-06, + "loss": 1.0408, + "step": 7324 + }, + { + "epoch": 0.5646777674992292, + "grad_norm": 3.8763816356658936, + "learning_rate": 4.199460286093797e-06, + "loss": 0.9782, + "step": 7325 + }, + { + "epoch": 0.5647548566142461, + "grad_norm": 3.756704807281494, + "learning_rate": 4.198227969392556e-06, + "loss": 0.9667, + "step": 7326 + }, + { + "epoch": 0.564831945729263, + "grad_norm": 3.946401596069336, + "learning_rate": 4.196995702677646e-06, + "loss": 0.9345, + "step": 7327 + }, + { + "epoch": 0.56490903484428, + "grad_norm": 3.7415993213653564, + "learning_rate": 4.195763486025896e-06, + "loss": 0.9629, + "step": 7328 + }, + { + "epoch": 0.5649861239592969, + "grad_norm": 3.9311485290527344, + "learning_rate": 4.194531319514128e-06, + "loss": 0.9507, + "step": 7329 + }, + { + "epoch": 0.565063213074314, + "grad_norm": 3.924248456954956, + "learning_rate": 4.1932992032191595e-06, + "loss": 0.9141, + "step": 7330 + }, + { + "epoch": 0.5651403021893309, + "grad_norm": 3.478628396987915, + "learning_rate": 4.1920671372178075e-06, + "loss": 1.0346, + "step": 7331 + }, + { + "epoch": 0.5652173913043478, + "grad_norm": 3.4320709705352783, + "learning_rate": 4.190835121586887e-06, + "loss": 0.9594, + "step": 7332 + }, + { + "epoch": 0.5652944804193648, + "grad_norm": 3.83650541305542, + "learning_rate": 4.189603156403204e-06, + "loss": 0.9973, + "step": 7333 + }, + { + "epoch": 0.5653715695343817, + "grad_norm": 3.5375068187713623, + "learning_rate": 4.188371241743568e-06, + "loss": 0.9137, + "step": 7334 + }, + { + "epoch": 0.5654486586493987, + "grad_norm": 4.036672592163086, + "learning_rate": 4.187139377684781e-06, + "loss": 1.0529, + "step": 7335 + }, + { + "epoch": 0.5655257477644157, + "grad_norm": 3.483830213546753, + "learning_rate": 4.185907564303644e-06, + "loss": 0.8893, + "step": 7336 + }, + { + "epoch": 0.5656028368794326, + "grad_norm": 3.592637300491333, + "learning_rate": 4.184675801676955e-06, + "loss": 1.0526, + "step": 7337 + }, + { + "epoch": 0.5656799259944496, + "grad_norm": 3.425968647003174, + "learning_rate": 4.183444089881506e-06, + "loss": 0.9941, + "step": 7338 + }, + { + "epoch": 0.5657570151094665, + "grad_norm": 3.730457305908203, + "learning_rate": 4.182212428994088e-06, + "loss": 0.9906, + "step": 7339 + }, + { + "epoch": 0.5658341042244835, + "grad_norm": 3.9241623878479004, + "learning_rate": 4.1809808190914925e-06, + "loss": 0.9277, + "step": 7340 + }, + { + "epoch": 0.5659111933395005, + "grad_norm": 3.7960057258605957, + "learning_rate": 4.1797492602504985e-06, + "loss": 0.8966, + "step": 7341 + }, + { + "epoch": 0.5659882824545174, + "grad_norm": 3.747955083847046, + "learning_rate": 4.178517752547891e-06, + "loss": 1.0132, + "step": 7342 + }, + { + "epoch": 0.5660653715695344, + "grad_norm": 3.649674654006958, + "learning_rate": 4.177286296060448e-06, + "loss": 0.8899, + "step": 7343 + }, + { + "epoch": 0.5661424606845513, + "grad_norm": 3.396299123764038, + "learning_rate": 4.176054890864942e-06, + "loss": 0.9728, + "step": 7344 + }, + { + "epoch": 0.5662195497995683, + "grad_norm": 3.8877921104431152, + "learning_rate": 4.174823537038147e-06, + "loss": 1.0016, + "step": 7345 + }, + { + "epoch": 0.5662966389145853, + "grad_norm": 3.6778106689453125, + "learning_rate": 4.173592234656831e-06, + "loss": 0.9368, + "step": 7346 + }, + { + "epoch": 0.5663737280296022, + "grad_norm": 3.6797902584075928, + "learning_rate": 4.172360983797757e-06, + "loss": 0.9625, + "step": 7347 + }, + { + "epoch": 0.5664508171446192, + "grad_norm": 4.3902268409729, + "learning_rate": 4.171129784537692e-06, + "loss": 0.961, + "step": 7348 + }, + { + "epoch": 0.5665279062596361, + "grad_norm": 3.7198588848114014, + "learning_rate": 4.16989863695339e-06, + "loss": 0.9612, + "step": 7349 + }, + { + "epoch": 0.5666049953746531, + "grad_norm": 3.431483507156372, + "learning_rate": 4.16866754112161e-06, + "loss": 0.8565, + "step": 7350 + }, + { + "epoch": 0.5666820844896701, + "grad_norm": 3.6897926330566406, + "learning_rate": 4.167436497119103e-06, + "loss": 0.9245, + "step": 7351 + }, + { + "epoch": 0.566759173604687, + "grad_norm": 3.8588991165161133, + "learning_rate": 4.166205505022618e-06, + "loss": 1.04, + "step": 7352 + }, + { + "epoch": 0.566836262719704, + "grad_norm": 3.7585813999176025, + "learning_rate": 4.1649745649089015e-06, + "loss": 0.8811, + "step": 7353 + }, + { + "epoch": 0.5669133518347209, + "grad_norm": 3.642169713973999, + "learning_rate": 4.163743676854697e-06, + "loss": 0.8462, + "step": 7354 + }, + { + "epoch": 0.5669904409497379, + "grad_norm": 3.2136752605438232, + "learning_rate": 4.162512840936742e-06, + "loss": 0.9009, + "step": 7355 + }, + { + "epoch": 0.5670675300647549, + "grad_norm": 3.7039544582366943, + "learning_rate": 4.161282057231776e-06, + "loss": 0.9187, + "step": 7356 + }, + { + "epoch": 0.5671446191797718, + "grad_norm": 3.5421409606933594, + "learning_rate": 4.160051325816528e-06, + "loss": 0.9465, + "step": 7357 + }, + { + "epoch": 0.5672217082947888, + "grad_norm": 3.6218063831329346, + "learning_rate": 4.158820646767729e-06, + "loss": 1.0368, + "step": 7358 + }, + { + "epoch": 0.5672987974098057, + "grad_norm": 3.5446629524230957, + "learning_rate": 4.157590020162108e-06, + "loss": 1.0172, + "step": 7359 + }, + { + "epoch": 0.5673758865248227, + "grad_norm": 3.736348867416382, + "learning_rate": 4.156359446076385e-06, + "loss": 0.9261, + "step": 7360 + }, + { + "epoch": 0.5674529756398397, + "grad_norm": 3.8830409049987793, + "learning_rate": 4.1551289245872815e-06, + "loss": 1.0677, + "step": 7361 + }, + { + "epoch": 0.5675300647548566, + "grad_norm": 3.8011181354522705, + "learning_rate": 4.153898455771514e-06, + "loss": 0.9841, + "step": 7362 + }, + { + "epoch": 0.5676071538698736, + "grad_norm": 3.9767227172851562, + "learning_rate": 4.152668039705796e-06, + "loss": 1.0165, + "step": 7363 + }, + { + "epoch": 0.5676842429848905, + "grad_norm": 3.4617295265197754, + "learning_rate": 4.151437676466836e-06, + "loss": 0.9734, + "step": 7364 + }, + { + "epoch": 0.5677613320999075, + "grad_norm": 3.2695651054382324, + "learning_rate": 4.150207366131344e-06, + "loss": 0.9571, + "step": 7365 + }, + { + "epoch": 0.5678384212149244, + "grad_norm": 3.5925192832946777, + "learning_rate": 4.14897710877602e-06, + "loss": 0.9581, + "step": 7366 + }, + { + "epoch": 0.5679155103299414, + "grad_norm": 3.435485363006592, + "learning_rate": 4.147746904477567e-06, + "loss": 0.9586, + "step": 7367 + }, + { + "epoch": 0.5679925994449584, + "grad_norm": 3.665407657623291, + "learning_rate": 4.146516753312677e-06, + "loss": 1.023, + "step": 7368 + }, + { + "epoch": 0.5680696885599753, + "grad_norm": 3.7188949584960938, + "learning_rate": 4.145286655358051e-06, + "loss": 0.9351, + "step": 7369 + }, + { + "epoch": 0.5681467776749923, + "grad_norm": 4.338947296142578, + "learning_rate": 4.144056610690375e-06, + "loss": 0.9218, + "step": 7370 + }, + { + "epoch": 0.5682238667900092, + "grad_norm": 4.026747703552246, + "learning_rate": 4.142826619386334e-06, + "loss": 1.0557, + "step": 7371 + }, + { + "epoch": 0.5683009559050262, + "grad_norm": 3.3105926513671875, + "learning_rate": 4.141596681522616e-06, + "loss": 0.8553, + "step": 7372 + }, + { + "epoch": 0.5683780450200432, + "grad_norm": 3.3676323890686035, + "learning_rate": 4.140366797175899e-06, + "loss": 0.8005, + "step": 7373 + }, + { + "epoch": 0.5684551341350601, + "grad_norm": 3.641796588897705, + "learning_rate": 4.13913696642286e-06, + "loss": 0.8849, + "step": 7374 + }, + { + "epoch": 0.5685322232500771, + "grad_norm": 3.88008975982666, + "learning_rate": 4.137907189340172e-06, + "loss": 0.9257, + "step": 7375 + }, + { + "epoch": 0.568609312365094, + "grad_norm": 3.845065116882324, + "learning_rate": 4.136677466004506e-06, + "loss": 0.9235, + "step": 7376 + }, + { + "epoch": 0.568686401480111, + "grad_norm": 3.5287108421325684, + "learning_rate": 4.13544779649253e-06, + "loss": 0.9357, + "step": 7377 + }, + { + "epoch": 0.568763490595128, + "grad_norm": 3.5272388458251953, + "learning_rate": 4.1342181808809046e-06, + "loss": 0.9282, + "step": 7378 + }, + { + "epoch": 0.5688405797101449, + "grad_norm": 4.141164779663086, + "learning_rate": 4.132988619246291e-06, + "loss": 0.8634, + "step": 7379 + }, + { + "epoch": 0.5689176688251619, + "grad_norm": 3.388904571533203, + "learning_rate": 4.131759111665349e-06, + "loss": 0.9187, + "step": 7380 + }, + { + "epoch": 0.5689947579401788, + "grad_norm": 3.592479944229126, + "learning_rate": 4.130529658214728e-06, + "loss": 0.9531, + "step": 7381 + }, + { + "epoch": 0.5690718470551958, + "grad_norm": 3.7211825847625732, + "learning_rate": 4.1293002589710795e-06, + "loss": 1.0478, + "step": 7382 + }, + { + "epoch": 0.5691489361702128, + "grad_norm": 3.6857645511627197, + "learning_rate": 4.128070914011052e-06, + "loss": 1.0107, + "step": 7383 + }, + { + "epoch": 0.5692260252852297, + "grad_norm": 3.7964882850646973, + "learning_rate": 4.126841623411286e-06, + "loss": 1.0194, + "step": 7384 + }, + { + "epoch": 0.5693031144002467, + "grad_norm": 3.608358860015869, + "learning_rate": 4.125612387248423e-06, + "loss": 0.8752, + "step": 7385 + }, + { + "epoch": 0.5693802035152636, + "grad_norm": 3.4516360759735107, + "learning_rate": 4.124383205599099e-06, + "loss": 0.9234, + "step": 7386 + }, + { + "epoch": 0.5694572926302806, + "grad_norm": 3.3848817348480225, + "learning_rate": 4.123154078539946e-06, + "loss": 0.8775, + "step": 7387 + }, + { + "epoch": 0.5695343817452976, + "grad_norm": 3.8858206272125244, + "learning_rate": 4.121925006147597e-06, + "loss": 1.0166, + "step": 7388 + }, + { + "epoch": 0.5696114708603145, + "grad_norm": 3.7127907276153564, + "learning_rate": 4.120695988498674e-06, + "loss": 0.9005, + "step": 7389 + }, + { + "epoch": 0.5696885599753315, + "grad_norm": 3.9486067295074463, + "learning_rate": 4.119467025669803e-06, + "loss": 0.9782, + "step": 7390 + }, + { + "epoch": 0.5697656490903484, + "grad_norm": 3.396491765975952, + "learning_rate": 4.118238117737604e-06, + "loss": 0.8566, + "step": 7391 + }, + { + "epoch": 0.5698427382053654, + "grad_norm": 3.7514426708221436, + "learning_rate": 4.1170092647786895e-06, + "loss": 0.9061, + "step": 7392 + }, + { + "epoch": 0.5699198273203824, + "grad_norm": 3.4897749423980713, + "learning_rate": 4.115780466869676e-06, + "loss": 0.9237, + "step": 7393 + }, + { + "epoch": 0.5699969164353993, + "grad_norm": 3.4681146144866943, + "learning_rate": 4.11455172408717e-06, + "loss": 0.9261, + "step": 7394 + }, + { + "epoch": 0.5700740055504163, + "grad_norm": 3.301769733428955, + "learning_rate": 4.113323036507778e-06, + "loss": 0.8158, + "step": 7395 + }, + { + "epoch": 0.5701510946654332, + "grad_norm": 3.7004668712615967, + "learning_rate": 4.112094404208102e-06, + "loss": 0.8965, + "step": 7396 + }, + { + "epoch": 0.5702281837804501, + "grad_norm": 3.7279725074768066, + "learning_rate": 4.110865827264742e-06, + "loss": 0.9718, + "step": 7397 + }, + { + "epoch": 0.5703052728954672, + "grad_norm": 3.387369394302368, + "learning_rate": 4.109637305754293e-06, + "loss": 0.9405, + "step": 7398 + }, + { + "epoch": 0.5703823620104841, + "grad_norm": 3.686414957046509, + "learning_rate": 4.108408839753346e-06, + "loss": 0.8854, + "step": 7399 + }, + { + "epoch": 0.5704594511255011, + "grad_norm": 3.8577775955200195, + "learning_rate": 4.107180429338491e-06, + "loss": 0.9723, + "step": 7400 + }, + { + "epoch": 0.570536540240518, + "grad_norm": 3.784442901611328, + "learning_rate": 4.105952074586311e-06, + "loss": 0.8432, + "step": 7401 + }, + { + "epoch": 0.570613629355535, + "grad_norm": 4.02522611618042, + "learning_rate": 4.10472377557339e-06, + "loss": 0.9356, + "step": 7402 + }, + { + "epoch": 0.570690718470552, + "grad_norm": 4.0557966232299805, + "learning_rate": 4.103495532376304e-06, + "loss": 0.9714, + "step": 7403 + }, + { + "epoch": 0.5707678075855689, + "grad_norm": 4.144707679748535, + "learning_rate": 4.1022673450716295e-06, + "loss": 1.0333, + "step": 7404 + }, + { + "epoch": 0.5708448967005859, + "grad_norm": 3.7137889862060547, + "learning_rate": 4.101039213735935e-06, + "loss": 1.0662, + "step": 7405 + }, + { + "epoch": 0.5709219858156028, + "grad_norm": 3.3066835403442383, + "learning_rate": 4.09981113844579e-06, + "loss": 0.7372, + "step": 7406 + }, + { + "epoch": 0.5709990749306197, + "grad_norm": 3.355921983718872, + "learning_rate": 4.098583119277759e-06, + "loss": 0.9921, + "step": 7407 + }, + { + "epoch": 0.5710761640456368, + "grad_norm": 3.732572555541992, + "learning_rate": 4.097355156308402e-06, + "loss": 0.9918, + "step": 7408 + }, + { + "epoch": 0.5711532531606537, + "grad_norm": 3.7717831134796143, + "learning_rate": 4.096127249614276e-06, + "loss": 0.8853, + "step": 7409 + }, + { + "epoch": 0.5712303422756707, + "grad_norm": 3.5494415760040283, + "learning_rate": 4.094899399271935e-06, + "loss": 1.0293, + "step": 7410 + }, + { + "epoch": 0.5713074313906876, + "grad_norm": 3.6082026958465576, + "learning_rate": 4.093671605357928e-06, + "loss": 0.9877, + "step": 7411 + }, + { + "epoch": 0.5713845205057045, + "grad_norm": 3.492135524749756, + "learning_rate": 4.092443867948801e-06, + "loss": 0.8388, + "step": 7412 + }, + { + "epoch": 0.5714616096207216, + "grad_norm": 3.4625587463378906, + "learning_rate": 4.0912161871211e-06, + "loss": 0.8672, + "step": 7413 + }, + { + "epoch": 0.5715386987357385, + "grad_norm": 3.4896044731140137, + "learning_rate": 4.089988562951363e-06, + "loss": 0.933, + "step": 7414 + }, + { + "epoch": 0.5716157878507555, + "grad_norm": 3.792921781539917, + "learning_rate": 4.088760995516127e-06, + "loss": 0.9755, + "step": 7415 + }, + { + "epoch": 0.5716928769657724, + "grad_norm": 4.106998443603516, + "learning_rate": 4.0875334848919225e-06, + "loss": 0.9181, + "step": 7416 + }, + { + "epoch": 0.5717699660807893, + "grad_norm": 3.3308520317077637, + "learning_rate": 4.08630603115528e-06, + "loss": 0.9823, + "step": 7417 + }, + { + "epoch": 0.5718470551958064, + "grad_norm": 3.626404285430908, + "learning_rate": 4.085078634382724e-06, + "loss": 0.924, + "step": 7418 + }, + { + "epoch": 0.5719241443108233, + "grad_norm": 3.646789312362671, + "learning_rate": 4.083851294650776e-06, + "loss": 0.9783, + "step": 7419 + }, + { + "epoch": 0.5720012334258403, + "grad_norm": 3.3674442768096924, + "learning_rate": 4.0826240120359565e-06, + "loss": 0.8714, + "step": 7420 + }, + { + "epoch": 0.5720783225408572, + "grad_norm": 3.333601474761963, + "learning_rate": 4.081396786614777e-06, + "loss": 0.9889, + "step": 7421 + }, + { + "epoch": 0.5721554116558741, + "grad_norm": 3.5855748653411865, + "learning_rate": 4.080169618463751e-06, + "loss": 0.8852, + "step": 7422 + }, + { + "epoch": 0.5722325007708912, + "grad_norm": 3.490710496902466, + "learning_rate": 4.078942507659386e-06, + "loss": 0.9468, + "step": 7423 + }, + { + "epoch": 0.5723095898859081, + "grad_norm": 3.4587180614471436, + "learning_rate": 4.0777154542781846e-06, + "loss": 0.9592, + "step": 7424 + }, + { + "epoch": 0.5723866790009251, + "grad_norm": 3.637395143508911, + "learning_rate": 4.076488458396649e-06, + "loss": 0.9181, + "step": 7425 + }, + { + "epoch": 0.572463768115942, + "grad_norm": 3.4306821823120117, + "learning_rate": 4.075261520091273e-06, + "loss": 0.9677, + "step": 7426 + }, + { + "epoch": 0.572540857230959, + "grad_norm": 3.6101748943328857, + "learning_rate": 4.074034639438553e-06, + "loss": 0.9951, + "step": 7427 + }, + { + "epoch": 0.572617946345976, + "grad_norm": 3.5071911811828613, + "learning_rate": 4.072807816514978e-06, + "loss": 0.878, + "step": 7428 + }, + { + "epoch": 0.5726950354609929, + "grad_norm": 3.8580219745635986, + "learning_rate": 4.071581051397033e-06, + "loss": 0.8271, + "step": 7429 + }, + { + "epoch": 0.5727721245760099, + "grad_norm": 3.526630401611328, + "learning_rate": 4.070354344161201e-06, + "loss": 0.9959, + "step": 7430 + }, + { + "epoch": 0.5728492136910268, + "grad_norm": 3.5120689868927, + "learning_rate": 4.069127694883962e-06, + "loss": 0.8585, + "step": 7431 + }, + { + "epoch": 0.5729263028060438, + "grad_norm": 3.920745611190796, + "learning_rate": 4.067901103641788e-06, + "loss": 0.9903, + "step": 7432 + }, + { + "epoch": 0.5730033919210608, + "grad_norm": 3.5974879264831543, + "learning_rate": 4.066674570511156e-06, + "loss": 0.9851, + "step": 7433 + }, + { + "epoch": 0.5730804810360777, + "grad_norm": 3.3706092834472656, + "learning_rate": 4.065448095568527e-06, + "loss": 0.9094, + "step": 7434 + }, + { + "epoch": 0.5731575701510947, + "grad_norm": 3.542107582092285, + "learning_rate": 4.064221678890371e-06, + "loss": 0.9867, + "step": 7435 + }, + { + "epoch": 0.5732346592661116, + "grad_norm": 3.5891056060791016, + "learning_rate": 4.062995320553147e-06, + "loss": 0.8284, + "step": 7436 + }, + { + "epoch": 0.5733117483811286, + "grad_norm": 3.375009059906006, + "learning_rate": 4.061769020633311e-06, + "loss": 0.86, + "step": 7437 + }, + { + "epoch": 0.5733888374961456, + "grad_norm": 3.44054913520813, + "learning_rate": 4.060542779207317e-06, + "loss": 0.9128, + "step": 7438 + }, + { + "epoch": 0.5734659266111625, + "grad_norm": 3.8439524173736572, + "learning_rate": 4.059316596351617e-06, + "loss": 1.0774, + "step": 7439 + }, + { + "epoch": 0.5735430157261795, + "grad_norm": 4.025179862976074, + "learning_rate": 4.058090472142654e-06, + "loss": 1.0527, + "step": 7440 + }, + { + "epoch": 0.5736201048411964, + "grad_norm": 3.771923780441284, + "learning_rate": 4.056864406656872e-06, + "loss": 0.9199, + "step": 7441 + }, + { + "epoch": 0.5736971939562134, + "grad_norm": 3.6226768493652344, + "learning_rate": 4.05563839997071e-06, + "loss": 0.8523, + "step": 7442 + }, + { + "epoch": 0.5737742830712304, + "grad_norm": 3.7793591022491455, + "learning_rate": 4.054412452160601e-06, + "loss": 0.9109, + "step": 7443 + }, + { + "epoch": 0.5738513721862473, + "grad_norm": 3.322064161300659, + "learning_rate": 4.053186563302981e-06, + "loss": 0.8126, + "step": 7444 + }, + { + "epoch": 0.5739284613012643, + "grad_norm": 3.742952585220337, + "learning_rate": 4.051960733474273e-06, + "loss": 1.0401, + "step": 7445 + }, + { + "epoch": 0.5740055504162812, + "grad_norm": 3.36472487449646, + "learning_rate": 4.0507349627509045e-06, + "loss": 0.7989, + "step": 7446 + }, + { + "epoch": 0.5740826395312982, + "grad_norm": 3.582036018371582, + "learning_rate": 4.049509251209295e-06, + "loss": 0.9565, + "step": 7447 + }, + { + "epoch": 0.5741597286463151, + "grad_norm": 3.877749443054199, + "learning_rate": 4.048283598925859e-06, + "loss": 0.8715, + "step": 7448 + }, + { + "epoch": 0.5742368177613321, + "grad_norm": 3.445472002029419, + "learning_rate": 4.0470580059770125e-06, + "loss": 0.9293, + "step": 7449 + }, + { + "epoch": 0.5743139068763491, + "grad_norm": 3.6132113933563232, + "learning_rate": 4.045832472439165e-06, + "loss": 1.0043, + "step": 7450 + }, + { + "epoch": 0.574390995991366, + "grad_norm": 3.4551825523376465, + "learning_rate": 4.04460699838872e-06, + "loss": 0.8898, + "step": 7451 + }, + { + "epoch": 0.574468085106383, + "grad_norm": 3.322658061981201, + "learning_rate": 4.0433815839020815e-06, + "loss": 0.8621, + "step": 7452 + }, + { + "epoch": 0.5745451742214, + "grad_norm": 3.8116767406463623, + "learning_rate": 4.042156229055645e-06, + "loss": 0.9973, + "step": 7453 + }, + { + "epoch": 0.5746222633364169, + "grad_norm": 3.5855953693389893, + "learning_rate": 4.040930933925808e-06, + "loss": 1.0915, + "step": 7454 + }, + { + "epoch": 0.5746993524514339, + "grad_norm": 3.5874369144439697, + "learning_rate": 4.039705698588961e-06, + "loss": 1.0041, + "step": 7455 + }, + { + "epoch": 0.5747764415664508, + "grad_norm": 4.597201824188232, + "learning_rate": 4.038480523121488e-06, + "loss": 0.851, + "step": 7456 + }, + { + "epoch": 0.5748535306814678, + "grad_norm": 3.7259809970855713, + "learning_rate": 4.037255407599775e-06, + "loss": 0.9158, + "step": 7457 + }, + { + "epoch": 0.5749306197964847, + "grad_norm": 3.8748040199279785, + "learning_rate": 4.0360303521002014e-06, + "loss": 0.9733, + "step": 7458 + }, + { + "epoch": 0.5750077089115017, + "grad_norm": 3.888749599456787, + "learning_rate": 4.034805356699144e-06, + "loss": 1.0054, + "step": 7459 + }, + { + "epoch": 0.5750847980265187, + "grad_norm": 3.4167816638946533, + "learning_rate": 4.033580421472973e-06, + "loss": 0.8625, + "step": 7460 + }, + { + "epoch": 0.5751618871415356, + "grad_norm": 4.041205406188965, + "learning_rate": 4.032355546498057e-06, + "loss": 0.9047, + "step": 7461 + }, + { + "epoch": 0.5752389762565526, + "grad_norm": 4.094997882843018, + "learning_rate": 4.031130731850762e-06, + "loss": 0.9403, + "step": 7462 + }, + { + "epoch": 0.5753160653715695, + "grad_norm": 3.3104913234710693, + "learning_rate": 4.029905977607448e-06, + "loss": 0.8705, + "step": 7463 + }, + { + "epoch": 0.5753931544865865, + "grad_norm": 3.7642409801483154, + "learning_rate": 4.028681283844471e-06, + "loss": 0.961, + "step": 7464 + }, + { + "epoch": 0.5754702436016035, + "grad_norm": 3.346750497817993, + "learning_rate": 4.027456650638187e-06, + "loss": 0.8146, + "step": 7465 + }, + { + "epoch": 0.5755473327166204, + "grad_norm": 3.714411735534668, + "learning_rate": 4.026232078064942e-06, + "loss": 1.0303, + "step": 7466 + }, + { + "epoch": 0.5756244218316374, + "grad_norm": 3.7148983478546143, + "learning_rate": 4.025007566201085e-06, + "loss": 0.8392, + "step": 7467 + }, + { + "epoch": 0.5757015109466543, + "grad_norm": 3.490588426589966, + "learning_rate": 4.023783115122956e-06, + "loss": 0.9568, + "step": 7468 + }, + { + "epoch": 0.5757786000616713, + "grad_norm": 3.651613235473633, + "learning_rate": 4.0225587249068945e-06, + "loss": 0.972, + "step": 7469 + }, + { + "epoch": 0.5758556891766883, + "grad_norm": 3.942833185195923, + "learning_rate": 4.021334395629234e-06, + "loss": 1.0118, + "step": 7470 + }, + { + "epoch": 0.5759327782917052, + "grad_norm": 3.5396273136138916, + "learning_rate": 4.020110127366304e-06, + "loss": 0.8425, + "step": 7471 + }, + { + "epoch": 0.5760098674067222, + "grad_norm": 3.949531078338623, + "learning_rate": 4.018885920194434e-06, + "loss": 0.9779, + "step": 7472 + }, + { + "epoch": 0.5760869565217391, + "grad_norm": 4.047253131866455, + "learning_rate": 4.0176617741899456e-06, + "loss": 1.018, + "step": 7473 + }, + { + "epoch": 0.576164045636756, + "grad_norm": 4.037176609039307, + "learning_rate": 4.016437689429158e-06, + "loss": 0.949, + "step": 7474 + }, + { + "epoch": 0.5762411347517731, + "grad_norm": 3.5076513290405273, + "learning_rate": 4.015213665988385e-06, + "loss": 0.9106, + "step": 7475 + }, + { + "epoch": 0.57631822386679, + "grad_norm": 3.6117470264434814, + "learning_rate": 4.013989703943942e-06, + "loss": 0.9441, + "step": 7476 + }, + { + "epoch": 0.576395312981807, + "grad_norm": 3.7086315155029297, + "learning_rate": 4.012765803372132e-06, + "loss": 0.9981, + "step": 7477 + }, + { + "epoch": 0.5764724020968239, + "grad_norm": 3.609908103942871, + "learning_rate": 4.011541964349261e-06, + "loss": 0.9198, + "step": 7478 + }, + { + "epoch": 0.5765494912118408, + "grad_norm": 3.6467490196228027, + "learning_rate": 4.010318186951631e-06, + "loss": 0.9531, + "step": 7479 + }, + { + "epoch": 0.5766265803268579, + "grad_norm": 3.7244629859924316, + "learning_rate": 4.009094471255536e-06, + "loss": 0.9511, + "step": 7480 + }, + { + "epoch": 0.5767036694418748, + "grad_norm": 4.267176628112793, + "learning_rate": 4.007870817337268e-06, + "loss": 0.9585, + "step": 7481 + }, + { + "epoch": 0.5767807585568918, + "grad_norm": 4.043630599975586, + "learning_rate": 4.006647225273116e-06, + "loss": 0.9615, + "step": 7482 + }, + { + "epoch": 0.5768578476719087, + "grad_norm": 3.869731903076172, + "learning_rate": 4.005423695139366e-06, + "loss": 0.9452, + "step": 7483 + }, + { + "epoch": 0.5769349367869256, + "grad_norm": 3.4697086811065674, + "learning_rate": 4.004200227012296e-06, + "loss": 0.7982, + "step": 7484 + }, + { + "epoch": 0.5770120259019427, + "grad_norm": 3.798818826675415, + "learning_rate": 4.002976820968186e-06, + "loss": 0.9925, + "step": 7485 + }, + { + "epoch": 0.5770891150169596, + "grad_norm": 3.5721566677093506, + "learning_rate": 4.001753477083306e-06, + "loss": 0.8914, + "step": 7486 + }, + { + "epoch": 0.5771662041319766, + "grad_norm": 3.3828439712524414, + "learning_rate": 4.000530195433928e-06, + "loss": 0.8027, + "step": 7487 + }, + { + "epoch": 0.5772432932469935, + "grad_norm": 3.4918408393859863, + "learning_rate": 3.999306976096315e-06, + "loss": 0.8907, + "step": 7488 + }, + { + "epoch": 0.5773203823620104, + "grad_norm": 3.524773359298706, + "learning_rate": 3.9980838191467296e-06, + "loss": 0.9068, + "step": 7489 + }, + { + "epoch": 0.5773974714770275, + "grad_norm": 3.529926300048828, + "learning_rate": 3.996860724661429e-06, + "loss": 0.9297, + "step": 7490 + }, + { + "epoch": 0.5774745605920444, + "grad_norm": 3.6954150199890137, + "learning_rate": 3.995637692716666e-06, + "loss": 0.9739, + "step": 7491 + }, + { + "epoch": 0.5775516497070614, + "grad_norm": 3.9414443969726562, + "learning_rate": 3.994414723388693e-06, + "loss": 1.0595, + "step": 7492 + }, + { + "epoch": 0.5776287388220783, + "grad_norm": 3.7672884464263916, + "learning_rate": 3.993191816753753e-06, + "loss": 0.9773, + "step": 7493 + }, + { + "epoch": 0.5777058279370952, + "grad_norm": 3.520125389099121, + "learning_rate": 3.991968972888088e-06, + "loss": 0.8697, + "step": 7494 + }, + { + "epoch": 0.5777829170521123, + "grad_norm": 3.5860490798950195, + "learning_rate": 3.990746191867938e-06, + "loss": 0.9923, + "step": 7495 + }, + { + "epoch": 0.5778600061671292, + "grad_norm": 3.428400993347168, + "learning_rate": 3.989523473769536e-06, + "loss": 0.7466, + "step": 7496 + }, + { + "epoch": 0.5779370952821462, + "grad_norm": 3.6904683113098145, + "learning_rate": 3.98830081866911e-06, + "loss": 1.0291, + "step": 7497 + }, + { + "epoch": 0.5780141843971631, + "grad_norm": 3.687467336654663, + "learning_rate": 3.987078226642891e-06, + "loss": 0.8533, + "step": 7498 + }, + { + "epoch": 0.57809127351218, + "grad_norm": 3.687129020690918, + "learning_rate": 3.985855697767097e-06, + "loss": 0.9885, + "step": 7499 + }, + { + "epoch": 0.5781683626271971, + "grad_norm": 3.5919816493988037, + "learning_rate": 3.984633232117948e-06, + "loss": 1.0096, + "step": 7500 + }, + { + "epoch": 0.578245451742214, + "grad_norm": 3.3104560375213623, + "learning_rate": 3.983410829771656e-06, + "loss": 0.9715, + "step": 7501 + }, + { + "epoch": 0.578322540857231, + "grad_norm": 3.539083957672119, + "learning_rate": 3.982188490804437e-06, + "loss": 0.8584, + "step": 7502 + }, + { + "epoch": 0.5783996299722479, + "grad_norm": 3.4675793647766113, + "learning_rate": 3.980966215292493e-06, + "loss": 0.903, + "step": 7503 + }, + { + "epoch": 0.5784767190872648, + "grad_norm": 3.8223822116851807, + "learning_rate": 3.979744003312027e-06, + "loss": 0.9112, + "step": 7504 + }, + { + "epoch": 0.5785538082022819, + "grad_norm": 3.458359956741333, + "learning_rate": 3.97852185493924e-06, + "loss": 1.0016, + "step": 7505 + }, + { + "epoch": 0.5786308973172988, + "grad_norm": 3.756317138671875, + "learning_rate": 3.977299770250324e-06, + "loss": 0.9623, + "step": 7506 + }, + { + "epoch": 0.5787079864323158, + "grad_norm": 3.434952735900879, + "learning_rate": 3.976077749321472e-06, + "loss": 0.8595, + "step": 7507 + }, + { + "epoch": 0.5787850755473327, + "grad_norm": 3.5256009101867676, + "learning_rate": 3.974855792228868e-06, + "loss": 0.9876, + "step": 7508 + }, + { + "epoch": 0.5788621646623496, + "grad_norm": 3.5869874954223633, + "learning_rate": 3.973633899048696e-06, + "loss": 0.9544, + "step": 7509 + }, + { + "epoch": 0.5789392537773667, + "grad_norm": 3.856430768966675, + "learning_rate": 3.9724120698571354e-06, + "loss": 1.058, + "step": 7510 + }, + { + "epoch": 0.5790163428923836, + "grad_norm": 3.683361053466797, + "learning_rate": 3.971190304730359e-06, + "loss": 0.9864, + "step": 7511 + }, + { + "epoch": 0.5790934320074006, + "grad_norm": 3.582198143005371, + "learning_rate": 3.969968603744539e-06, + "loss": 0.8897, + "step": 7512 + }, + { + "epoch": 0.5791705211224175, + "grad_norm": 3.6801340579986572, + "learning_rate": 3.968746966975844e-06, + "loss": 0.9485, + "step": 7513 + }, + { + "epoch": 0.5792476102374344, + "grad_norm": 3.6165666580200195, + "learning_rate": 3.967525394500432e-06, + "loss": 0.9006, + "step": 7514 + }, + { + "epoch": 0.5793246993524515, + "grad_norm": 3.775045394897461, + "learning_rate": 3.966303886394465e-06, + "loss": 0.9171, + "step": 7515 + }, + { + "epoch": 0.5794017884674684, + "grad_norm": 3.5905048847198486, + "learning_rate": 3.965082442734098e-06, + "loss": 0.8586, + "step": 7516 + }, + { + "epoch": 0.5794788775824854, + "grad_norm": 3.667304277420044, + "learning_rate": 3.96386106359548e-06, + "loss": 0.9203, + "step": 7517 + }, + { + "epoch": 0.5795559666975023, + "grad_norm": 3.824692726135254, + "learning_rate": 3.96263974905476e-06, + "loss": 0.8806, + "step": 7518 + }, + { + "epoch": 0.5796330558125192, + "grad_norm": 3.4772675037384033, + "learning_rate": 3.961418499188076e-06, + "loss": 0.8067, + "step": 7519 + }, + { + "epoch": 0.5797101449275363, + "grad_norm": 3.65657377243042, + "learning_rate": 3.960197314071571e-06, + "loss": 1.0266, + "step": 7520 + }, + { + "epoch": 0.5797872340425532, + "grad_norm": 3.3273978233337402, + "learning_rate": 3.9589761937813795e-06, + "loss": 0.82, + "step": 7521 + }, + { + "epoch": 0.5798643231575702, + "grad_norm": 3.5966200828552246, + "learning_rate": 3.957755138393629e-06, + "loss": 0.8728, + "step": 7522 + }, + { + "epoch": 0.5799414122725871, + "grad_norm": 3.8184077739715576, + "learning_rate": 3.9565341479844476e-06, + "loss": 1.0214, + "step": 7523 + }, + { + "epoch": 0.580018501387604, + "grad_norm": 3.5819761753082275, + "learning_rate": 3.9553132226299595e-06, + "loss": 0.8721, + "step": 7524 + }, + { + "epoch": 0.580095590502621, + "grad_norm": 3.768620491027832, + "learning_rate": 3.95409236240628e-06, + "loss": 0.905, + "step": 7525 + }, + { + "epoch": 0.580172679617638, + "grad_norm": 3.6173248291015625, + "learning_rate": 3.952871567389525e-06, + "loss": 0.8692, + "step": 7526 + }, + { + "epoch": 0.580249768732655, + "grad_norm": 3.3437893390655518, + "learning_rate": 3.951650837655805e-06, + "loss": 0.982, + "step": 7527 + }, + { + "epoch": 0.5803268578476719, + "grad_norm": 3.5120673179626465, + "learning_rate": 3.9504301732812255e-06, + "loss": 0.9568, + "step": 7528 + }, + { + "epoch": 0.5804039469626888, + "grad_norm": 3.586928606033325, + "learning_rate": 3.949209574341889e-06, + "loss": 0.9897, + "step": 7529 + }, + { + "epoch": 0.5804810360777058, + "grad_norm": 3.88667631149292, + "learning_rate": 3.947989040913893e-06, + "loss": 1.0405, + "step": 7530 + }, + { + "epoch": 0.5805581251927228, + "grad_norm": 3.989518165588379, + "learning_rate": 3.946768573073332e-06, + "loss": 0.9536, + "step": 7531 + }, + { + "epoch": 0.5806352143077398, + "grad_norm": 3.595759630203247, + "learning_rate": 3.945548170896296e-06, + "loss": 0.9909, + "step": 7532 + }, + { + "epoch": 0.5807123034227567, + "grad_norm": 3.8122918605804443, + "learning_rate": 3.9443278344588696e-06, + "loss": 0.9124, + "step": 7533 + }, + { + "epoch": 0.5807893925377736, + "grad_norm": 3.4445278644561768, + "learning_rate": 3.943107563837135e-06, + "loss": 0.862, + "step": 7534 + }, + { + "epoch": 0.5808664816527906, + "grad_norm": 3.818312168121338, + "learning_rate": 3.941887359107172e-06, + "loss": 0.9938, + "step": 7535 + }, + { + "epoch": 0.5809435707678076, + "grad_norm": 3.6092066764831543, + "learning_rate": 3.94066722034505e-06, + "loss": 0.9801, + "step": 7536 + }, + { + "epoch": 0.5810206598828246, + "grad_norm": 3.58729887008667, + "learning_rate": 3.939447147626842e-06, + "loss": 0.8662, + "step": 7537 + }, + { + "epoch": 0.5810977489978415, + "grad_norm": 4.0023298263549805, + "learning_rate": 3.93822714102861e-06, + "loss": 0.929, + "step": 7538 + }, + { + "epoch": 0.5811748381128584, + "grad_norm": 3.671865940093994, + "learning_rate": 3.937007200626417e-06, + "loss": 1.0063, + "step": 7539 + }, + { + "epoch": 0.5812519272278754, + "grad_norm": 3.9772789478302, + "learning_rate": 3.935787326496321e-06, + "loss": 1.1299, + "step": 7540 + }, + { + "epoch": 0.5813290163428924, + "grad_norm": 3.3404088020324707, + "learning_rate": 3.934567518714372e-06, + "loss": 0.9902, + "step": 7541 + }, + { + "epoch": 0.5814061054579094, + "grad_norm": 3.876286506652832, + "learning_rate": 3.9333477773566204e-06, + "loss": 0.9492, + "step": 7542 + }, + { + "epoch": 0.5814831945729263, + "grad_norm": 3.982120990753174, + "learning_rate": 3.932128102499111e-06, + "loss": 1.0037, + "step": 7543 + }, + { + "epoch": 0.5815602836879432, + "grad_norm": 4.225841999053955, + "learning_rate": 3.930908494217884e-06, + "loss": 0.8912, + "step": 7544 + }, + { + "epoch": 0.5816373728029602, + "grad_norm": 3.5947415828704834, + "learning_rate": 3.929688952588974e-06, + "loss": 1.0022, + "step": 7545 + }, + { + "epoch": 0.5817144619179772, + "grad_norm": 3.5156421661376953, + "learning_rate": 3.928469477688415e-06, + "loss": 0.8517, + "step": 7546 + }, + { + "epoch": 0.5817915510329942, + "grad_norm": 3.9697585105895996, + "learning_rate": 3.927250069592236e-06, + "loss": 0.868, + "step": 7547 + }, + { + "epoch": 0.5818686401480111, + "grad_norm": 3.4268386363983154, + "learning_rate": 3.926030728376458e-06, + "loss": 0.8795, + "step": 7548 + }, + { + "epoch": 0.581945729263028, + "grad_norm": 3.7398383617401123, + "learning_rate": 3.924811454117101e-06, + "loss": 0.963, + "step": 7549 + }, + { + "epoch": 0.582022818378045, + "grad_norm": 3.5979716777801514, + "learning_rate": 3.923592246890183e-06, + "loss": 0.9222, + "step": 7550 + }, + { + "epoch": 0.582099907493062, + "grad_norm": 3.304446220397949, + "learning_rate": 3.922373106771713e-06, + "loss": 0.895, + "step": 7551 + }, + { + "epoch": 0.582176996608079, + "grad_norm": 3.7752010822296143, + "learning_rate": 3.921154033837699e-06, + "loss": 1.0166, + "step": 7552 + }, + { + "epoch": 0.5822540857230959, + "grad_norm": 3.516181707382202, + "learning_rate": 3.919935028164143e-06, + "loss": 0.9529, + "step": 7553 + }, + { + "epoch": 0.5823311748381128, + "grad_norm": 3.4951820373535156, + "learning_rate": 3.9187160898270435e-06, + "loss": 0.9276, + "step": 7554 + }, + { + "epoch": 0.5824082639531298, + "grad_norm": 3.369300365447998, + "learning_rate": 3.917497218902398e-06, + "loss": 0.9608, + "step": 7555 + }, + { + "epoch": 0.5824853530681467, + "grad_norm": 3.4424049854278564, + "learning_rate": 3.916278415466193e-06, + "loss": 0.9437, + "step": 7556 + }, + { + "epoch": 0.5825624421831638, + "grad_norm": 3.40697979927063, + "learning_rate": 3.9150596795944155e-06, + "loss": 0.9566, + "step": 7557 + }, + { + "epoch": 0.5826395312981807, + "grad_norm": 3.608034133911133, + "learning_rate": 3.913841011363049e-06, + "loss": 0.9437, + "step": 7558 + }, + { + "epoch": 0.5827166204131976, + "grad_norm": 3.5082850456237793, + "learning_rate": 3.912622410848069e-06, + "loss": 0.8977, + "step": 7559 + }, + { + "epoch": 0.5827937095282146, + "grad_norm": 3.802832841873169, + "learning_rate": 3.91140387812545e-06, + "loss": 0.9256, + "step": 7560 + }, + { + "epoch": 0.5828707986432315, + "grad_norm": 3.8759782314300537, + "learning_rate": 3.910185413271162e-06, + "loss": 0.8519, + "step": 7561 + }, + { + "epoch": 0.5829478877582486, + "grad_norm": 3.485323429107666, + "learning_rate": 3.908967016361169e-06, + "loss": 0.9075, + "step": 7562 + }, + { + "epoch": 0.5830249768732655, + "grad_norm": 3.791802167892456, + "learning_rate": 3.907748687471431e-06, + "loss": 0.9889, + "step": 7563 + }, + { + "epoch": 0.5831020659882824, + "grad_norm": 3.474576950073242, + "learning_rate": 3.906530426677907e-06, + "loss": 0.8918, + "step": 7564 + }, + { + "epoch": 0.5831791551032994, + "grad_norm": 3.3780479431152344, + "learning_rate": 3.905312234056547e-06, + "loss": 0.8903, + "step": 7565 + }, + { + "epoch": 0.5832562442183163, + "grad_norm": 3.6616179943084717, + "learning_rate": 3.904094109683301e-06, + "loss": 0.9756, + "step": 7566 + }, + { + "epoch": 0.5833333333333334, + "grad_norm": 3.564548969268799, + "learning_rate": 3.902876053634109e-06, + "loss": 0.8836, + "step": 7567 + }, + { + "epoch": 0.5834104224483503, + "grad_norm": 3.944483518600464, + "learning_rate": 3.901658065984913e-06, + "loss": 0.9277, + "step": 7568 + }, + { + "epoch": 0.5834875115633672, + "grad_norm": 3.700523614883423, + "learning_rate": 3.90044014681165e-06, + "loss": 1.0303, + "step": 7569 + }, + { + "epoch": 0.5835646006783842, + "grad_norm": 3.6363394260406494, + "learning_rate": 3.899222296190248e-06, + "loss": 0.8375, + "step": 7570 + }, + { + "epoch": 0.5836416897934011, + "grad_norm": 3.8112072944641113, + "learning_rate": 3.898004514196635e-06, + "loss": 0.9404, + "step": 7571 + }, + { + "epoch": 0.5837187789084182, + "grad_norm": 3.8142549991607666, + "learning_rate": 3.896786800906734e-06, + "loss": 0.908, + "step": 7572 + }, + { + "epoch": 0.5837958680234351, + "grad_norm": 3.6068294048309326, + "learning_rate": 3.8955691563964605e-06, + "loss": 0.8951, + "step": 7573 + }, + { + "epoch": 0.583872957138452, + "grad_norm": 3.8355910778045654, + "learning_rate": 3.894351580741731e-06, + "loss": 0.8517, + "step": 7574 + }, + { + "epoch": 0.583950046253469, + "grad_norm": 3.838613986968994, + "learning_rate": 3.893134074018454e-06, + "loss": 1.0072, + "step": 7575 + }, + { + "epoch": 0.5840271353684859, + "grad_norm": 3.381627321243286, + "learning_rate": 3.891916636302535e-06, + "loss": 0.9339, + "step": 7576 + }, + { + "epoch": 0.584104224483503, + "grad_norm": 3.5409560203552246, + "learning_rate": 3.890699267669876e-06, + "loss": 0.8285, + "step": 7577 + }, + { + "epoch": 0.5841813135985199, + "grad_norm": 3.678004741668701, + "learning_rate": 3.889481968196371e-06, + "loss": 0.9596, + "step": 7578 + }, + { + "epoch": 0.5842584027135368, + "grad_norm": 4.06223726272583, + "learning_rate": 3.888264737957913e-06, + "loss": 1.0516, + "step": 7579 + }, + { + "epoch": 0.5843354918285538, + "grad_norm": 3.5083158016204834, + "learning_rate": 3.887047577030393e-06, + "loss": 0.8042, + "step": 7580 + }, + { + "epoch": 0.5844125809435707, + "grad_norm": 3.717420816421509, + "learning_rate": 3.8858304854896906e-06, + "loss": 1.0194, + "step": 7581 + }, + { + "epoch": 0.5844896700585878, + "grad_norm": 3.8681986331939697, + "learning_rate": 3.884613463411687e-06, + "loss": 0.9029, + "step": 7582 + }, + { + "epoch": 0.5845667591736047, + "grad_norm": 3.5388240814208984, + "learning_rate": 3.883396510872259e-06, + "loss": 0.9348, + "step": 7583 + }, + { + "epoch": 0.5846438482886216, + "grad_norm": 3.749166488647461, + "learning_rate": 3.882179627947273e-06, + "loss": 0.9825, + "step": 7584 + }, + { + "epoch": 0.5847209374036386, + "grad_norm": 3.7191250324249268, + "learning_rate": 3.8809628147126e-06, + "loss": 0.9845, + "step": 7585 + }, + { + "epoch": 0.5847980265186555, + "grad_norm": 3.772263526916504, + "learning_rate": 3.879746071244099e-06, + "loss": 0.8341, + "step": 7586 + }, + { + "epoch": 0.5848751156336726, + "grad_norm": 3.8493549823760986, + "learning_rate": 3.8785293976176285e-06, + "loss": 1.0723, + "step": 7587 + }, + { + "epoch": 0.5849522047486895, + "grad_norm": 3.944638967514038, + "learning_rate": 3.877312793909043e-06, + "loss": 1.0129, + "step": 7588 + }, + { + "epoch": 0.5850292938637064, + "grad_norm": 3.4591333866119385, + "learning_rate": 3.876096260194188e-06, + "loss": 1.0135, + "step": 7589 + }, + { + "epoch": 0.5851063829787234, + "grad_norm": 3.324209451675415, + "learning_rate": 3.87487979654891e-06, + "loss": 0.8952, + "step": 7590 + }, + { + "epoch": 0.5851834720937403, + "grad_norm": 3.500072956085205, + "learning_rate": 3.873663403049051e-06, + "loss": 0.8838, + "step": 7591 + }, + { + "epoch": 0.5852605612087574, + "grad_norm": 3.5885183811187744, + "learning_rate": 3.872447079770446e-06, + "loss": 0.9321, + "step": 7592 + }, + { + "epoch": 0.5853376503237743, + "grad_norm": 3.4161670207977295, + "learning_rate": 3.871230826788925e-06, + "loss": 0.9088, + "step": 7593 + }, + { + "epoch": 0.5854147394387912, + "grad_norm": 3.7749154567718506, + "learning_rate": 3.870014644180315e-06, + "loss": 0.9423, + "step": 7594 + }, + { + "epoch": 0.5854918285538082, + "grad_norm": 3.5398154258728027, + "learning_rate": 3.868798532020442e-06, + "loss": 0.932, + "step": 7595 + }, + { + "epoch": 0.5855689176688251, + "grad_norm": 3.844788074493408, + "learning_rate": 3.86758249038512e-06, + "loss": 0.9811, + "step": 7596 + }, + { + "epoch": 0.5856460067838422, + "grad_norm": 3.943168878555298, + "learning_rate": 3.866366519350165e-06, + "loss": 0.9965, + "step": 7597 + }, + { + "epoch": 0.5857230958988591, + "grad_norm": 3.5133891105651855, + "learning_rate": 3.865150618991388e-06, + "loss": 0.9619, + "step": 7598 + }, + { + "epoch": 0.5858001850138761, + "grad_norm": 3.773076295852661, + "learning_rate": 3.8639347893845905e-06, + "loss": 0.8272, + "step": 7599 + }, + { + "epoch": 0.585877274128893, + "grad_norm": 3.4386439323425293, + "learning_rate": 3.862719030605575e-06, + "loss": 0.921, + "step": 7600 + }, + { + "epoch": 0.5859543632439099, + "grad_norm": 3.826698064804077, + "learning_rate": 3.861503342730139e-06, + "loss": 0.8279, + "step": 7601 + }, + { + "epoch": 0.586031452358927, + "grad_norm": 3.6429896354675293, + "learning_rate": 3.860287725834072e-06, + "loss": 0.8615, + "step": 7602 + }, + { + "epoch": 0.5861085414739439, + "grad_norm": 3.78666353225708, + "learning_rate": 3.859072179993164e-06, + "loss": 0.9863, + "step": 7603 + }, + { + "epoch": 0.5861856305889609, + "grad_norm": 3.275872230529785, + "learning_rate": 3.857856705283195e-06, + "loss": 0.7179, + "step": 7604 + }, + { + "epoch": 0.5862627197039778, + "grad_norm": 3.5281994342803955, + "learning_rate": 3.856641301779946e-06, + "loss": 0.9711, + "step": 7605 + }, + { + "epoch": 0.5863398088189947, + "grad_norm": 3.61147141456604, + "learning_rate": 3.855425969559191e-06, + "loss": 0.9409, + "step": 7606 + }, + { + "epoch": 0.5864168979340117, + "grad_norm": 3.7258195877075195, + "learning_rate": 3.854210708696697e-06, + "loss": 0.8085, + "step": 7607 + }, + { + "epoch": 0.5864939870490287, + "grad_norm": 3.8481357097625732, + "learning_rate": 3.852995519268231e-06, + "loss": 0.9489, + "step": 7608 + }, + { + "epoch": 0.5865710761640457, + "grad_norm": 4.056955337524414, + "learning_rate": 3.851780401349557e-06, + "loss": 1.0348, + "step": 7609 + }, + { + "epoch": 0.5866481652790626, + "grad_norm": 3.392282485961914, + "learning_rate": 3.8505653550164255e-06, + "loss": 0.8725, + "step": 7610 + }, + { + "epoch": 0.5867252543940795, + "grad_norm": 3.6491785049438477, + "learning_rate": 3.849350380344591e-06, + "loss": 0.8837, + "step": 7611 + }, + { + "epoch": 0.5868023435090965, + "grad_norm": 3.8450396060943604, + "learning_rate": 3.8481354774098025e-06, + "loss": 0.9088, + "step": 7612 + }, + { + "epoch": 0.5868794326241135, + "grad_norm": 3.7339627742767334, + "learning_rate": 3.8469206462878e-06, + "loss": 0.9026, + "step": 7613 + }, + { + "epoch": 0.5869565217391305, + "grad_norm": 3.736210346221924, + "learning_rate": 3.845705887054324e-06, + "loss": 0.8872, + "step": 7614 + }, + { + "epoch": 0.5870336108541474, + "grad_norm": 3.851383924484253, + "learning_rate": 3.844491199785107e-06, + "loss": 0.9875, + "step": 7615 + }, + { + "epoch": 0.5871106999691643, + "grad_norm": 3.5837175846099854, + "learning_rate": 3.843276584555879e-06, + "loss": 0.9964, + "step": 7616 + }, + { + "epoch": 0.5871877890841813, + "grad_norm": 3.548447847366333, + "learning_rate": 3.842062041442366e-06, + "loss": 0.9743, + "step": 7617 + }, + { + "epoch": 0.5872648781991983, + "grad_norm": 3.630624532699585, + "learning_rate": 3.8408475705202876e-06, + "loss": 0.9896, + "step": 7618 + }, + { + "epoch": 0.5873419673142153, + "grad_norm": 3.6596896648406982, + "learning_rate": 3.839633171865359e-06, + "loss": 0.8792, + "step": 7619 + }, + { + "epoch": 0.5874190564292322, + "grad_norm": 3.846327066421509, + "learning_rate": 3.838418845553293e-06, + "loss": 0.9892, + "step": 7620 + }, + { + "epoch": 0.5874961455442491, + "grad_norm": 3.916472911834717, + "learning_rate": 3.837204591659795e-06, + "loss": 1.0246, + "step": 7621 + }, + { + "epoch": 0.5875732346592661, + "grad_norm": 3.52048397064209, + "learning_rate": 3.83599041026057e-06, + "loss": 0.7953, + "step": 7622 + }, + { + "epoch": 0.5876503237742831, + "grad_norm": 3.525165557861328, + "learning_rate": 3.834776301431314e-06, + "loss": 0.8311, + "step": 7623 + }, + { + "epoch": 0.5877274128893001, + "grad_norm": 3.754812240600586, + "learning_rate": 3.83356226524772e-06, + "loss": 0.9588, + "step": 7624 + }, + { + "epoch": 0.587804502004317, + "grad_norm": 3.5467734336853027, + "learning_rate": 3.832348301785479e-06, + "loss": 0.9716, + "step": 7625 + }, + { + "epoch": 0.5878815911193339, + "grad_norm": 3.7867307662963867, + "learning_rate": 3.831134411120273e-06, + "loss": 1.0345, + "step": 7626 + }, + { + "epoch": 0.5879586802343509, + "grad_norm": 3.4668374061584473, + "learning_rate": 3.8299205933277814e-06, + "loss": 0.8672, + "step": 7627 + }, + { + "epoch": 0.5880357693493679, + "grad_norm": 3.5994906425476074, + "learning_rate": 3.8287068484836835e-06, + "loss": 0.8797, + "step": 7628 + }, + { + "epoch": 0.5881128584643849, + "grad_norm": 3.47133731842041, + "learning_rate": 3.827493176663645e-06, + "loss": 0.8707, + "step": 7629 + }, + { + "epoch": 0.5881899475794018, + "grad_norm": 3.8516485691070557, + "learning_rate": 3.826279577943335e-06, + "loss": 0.9896, + "step": 7630 + }, + { + "epoch": 0.5882670366944187, + "grad_norm": 3.4910974502563477, + "learning_rate": 3.825066052398416e-06, + "loss": 0.9694, + "step": 7631 + }, + { + "epoch": 0.5883441258094357, + "grad_norm": 3.6999757289886475, + "learning_rate": 3.823852600104542e-06, + "loss": 0.8824, + "step": 7632 + }, + { + "epoch": 0.5884212149244527, + "grad_norm": 3.632878541946411, + "learning_rate": 3.822639221137368e-06, + "loss": 0.9755, + "step": 7633 + }, + { + "epoch": 0.5884983040394697, + "grad_norm": 3.42018461227417, + "learning_rate": 3.821425915572537e-06, + "loss": 0.902, + "step": 7634 + }, + { + "epoch": 0.5885753931544866, + "grad_norm": 3.5439507961273193, + "learning_rate": 3.8202126834857e-06, + "loss": 0.8284, + "step": 7635 + }, + { + "epoch": 0.5886524822695035, + "grad_norm": 3.7671468257904053, + "learning_rate": 3.818999524952491e-06, + "loss": 0.9682, + "step": 7636 + }, + { + "epoch": 0.5887295713845205, + "grad_norm": 3.287444829940796, + "learning_rate": 3.817786440048545e-06, + "loss": 0.9903, + "step": 7637 + }, + { + "epoch": 0.5888066604995374, + "grad_norm": 3.516890525817871, + "learning_rate": 3.8165734288494925e-06, + "loss": 0.9282, + "step": 7638 + }, + { + "epoch": 0.5888837496145545, + "grad_norm": 3.4296483993530273, + "learning_rate": 3.815360491430956e-06, + "loss": 0.8815, + "step": 7639 + }, + { + "epoch": 0.5889608387295714, + "grad_norm": 3.8724493980407715, + "learning_rate": 3.8141476278685596e-06, + "loss": 0.8815, + "step": 7640 + }, + { + "epoch": 0.5890379278445883, + "grad_norm": 3.6292412281036377, + "learning_rate": 3.812934838237915e-06, + "loss": 0.9961, + "step": 7641 + }, + { + "epoch": 0.5891150169596053, + "grad_norm": 3.47171688079834, + "learning_rate": 3.811722122614636e-06, + "loss": 0.9523, + "step": 7642 + }, + { + "epoch": 0.5891921060746222, + "grad_norm": 3.357323408126831, + "learning_rate": 3.8105094810743286e-06, + "loss": 0.959, + "step": 7643 + }, + { + "epoch": 0.5892691951896393, + "grad_norm": 3.610004425048828, + "learning_rate": 3.809296913692594e-06, + "loss": 0.9088, + "step": 7644 + }, + { + "epoch": 0.5893462843046562, + "grad_norm": 3.6334939002990723, + "learning_rate": 3.80808442054503e-06, + "loss": 0.9915, + "step": 7645 + }, + { + "epoch": 0.5894233734196731, + "grad_norm": 3.537346601486206, + "learning_rate": 3.80687200170723e-06, + "loss": 0.8867, + "step": 7646 + }, + { + "epoch": 0.5895004625346901, + "grad_norm": 3.5367846488952637, + "learning_rate": 3.805659657254781e-06, + "loss": 0.9774, + "step": 7647 + }, + { + "epoch": 0.589577551649707, + "grad_norm": 3.7989237308502197, + "learning_rate": 3.8044473872632663e-06, + "loss": 0.959, + "step": 7648 + }, + { + "epoch": 0.5896546407647241, + "grad_norm": 3.7498884201049805, + "learning_rate": 3.8032351918082665e-06, + "loss": 0.9014, + "step": 7649 + }, + { + "epoch": 0.589731729879741, + "grad_norm": 3.9480056762695312, + "learning_rate": 3.8020230709653527e-06, + "loss": 0.9617, + "step": 7650 + }, + { + "epoch": 0.5898088189947579, + "grad_norm": 3.7743465900421143, + "learning_rate": 3.800811024810097e-06, + "loss": 1.0255, + "step": 7651 + }, + { + "epoch": 0.5898859081097749, + "grad_norm": 3.489729166030884, + "learning_rate": 3.7995990534180627e-06, + "loss": 0.929, + "step": 7652 + }, + { + "epoch": 0.5899629972247918, + "grad_norm": 4.14895486831665, + "learning_rate": 3.7983871568648095e-06, + "loss": 1.0033, + "step": 7653 + }, + { + "epoch": 0.5900400863398089, + "grad_norm": 3.2822515964508057, + "learning_rate": 3.7971753352258955e-06, + "loss": 0.8825, + "step": 7654 + }, + { + "epoch": 0.5901171754548258, + "grad_norm": 3.469475507736206, + "learning_rate": 3.7959635885768686e-06, + "loss": 0.8867, + "step": 7655 + }, + { + "epoch": 0.5901942645698427, + "grad_norm": 3.7009308338165283, + "learning_rate": 3.7947519169932754e-06, + "loss": 0.9416, + "step": 7656 + }, + { + "epoch": 0.5902713536848597, + "grad_norm": 3.662630319595337, + "learning_rate": 3.79354032055066e-06, + "loss": 0.9328, + "step": 7657 + }, + { + "epoch": 0.5903484427998766, + "grad_norm": 3.5414748191833496, + "learning_rate": 3.7923287993245556e-06, + "loss": 0.8743, + "step": 7658 + }, + { + "epoch": 0.5904255319148937, + "grad_norm": 3.310471773147583, + "learning_rate": 3.791117353390496e-06, + "loss": 0.7666, + "step": 7659 + }, + { + "epoch": 0.5905026210299106, + "grad_norm": 3.43023943901062, + "learning_rate": 3.78990598282401e-06, + "loss": 0.8936, + "step": 7660 + }, + { + "epoch": 0.5905797101449275, + "grad_norm": 3.8442037105560303, + "learning_rate": 3.788694687700617e-06, + "loss": 1.0593, + "step": 7661 + }, + { + "epoch": 0.5906567992599445, + "grad_norm": 3.430696487426758, + "learning_rate": 3.787483468095838e-06, + "loss": 0.8401, + "step": 7662 + }, + { + "epoch": 0.5907338883749614, + "grad_norm": 3.727947950363159, + "learning_rate": 3.786272324085184e-06, + "loss": 0.9685, + "step": 7663 + }, + { + "epoch": 0.5908109774899785, + "grad_norm": 3.676292657852173, + "learning_rate": 3.7850612557441648e-06, + "loss": 1.0102, + "step": 7664 + }, + { + "epoch": 0.5908880666049954, + "grad_norm": 3.6820626258850098, + "learning_rate": 3.783850263148284e-06, + "loss": 0.9545, + "step": 7665 + }, + { + "epoch": 0.5909651557200123, + "grad_norm": 3.582075357437134, + "learning_rate": 3.7826393463730403e-06, + "loss": 0.8736, + "step": 7666 + }, + { + "epoch": 0.5910422448350293, + "grad_norm": 3.6667416095733643, + "learning_rate": 3.7814285054939285e-06, + "loss": 0.9364, + "step": 7667 + }, + { + "epoch": 0.5911193339500462, + "grad_norm": 3.8910152912139893, + "learning_rate": 3.7802177405864392e-06, + "loss": 1.0272, + "step": 7668 + }, + { + "epoch": 0.5911964230650633, + "grad_norm": 3.327561378479004, + "learning_rate": 3.779007051726056e-06, + "loss": 0.7072, + "step": 7669 + }, + { + "epoch": 0.5912735121800802, + "grad_norm": 3.4671108722686768, + "learning_rate": 3.77779643898826e-06, + "loss": 0.8911, + "step": 7670 + }, + { + "epoch": 0.5913506012950971, + "grad_norm": 3.4955384731292725, + "learning_rate": 3.7765859024485246e-06, + "loss": 0.7711, + "step": 7671 + }, + { + "epoch": 0.5914276904101141, + "grad_norm": 3.6591098308563232, + "learning_rate": 3.7753754421823225e-06, + "loss": 0.9811, + "step": 7672 + }, + { + "epoch": 0.591504779525131, + "grad_norm": 3.4825854301452637, + "learning_rate": 3.7741650582651195e-06, + "loss": 0.8656, + "step": 7673 + }, + { + "epoch": 0.5915818686401481, + "grad_norm": 3.4312667846679688, + "learning_rate": 3.7729547507723764e-06, + "loss": 0.8741, + "step": 7674 + }, + { + "epoch": 0.591658957755165, + "grad_norm": 3.5050833225250244, + "learning_rate": 3.771744519779548e-06, + "loss": 0.8984, + "step": 7675 + }, + { + "epoch": 0.5917360468701819, + "grad_norm": 3.687978982925415, + "learning_rate": 3.7705343653620894e-06, + "loss": 0.8962, + "step": 7676 + }, + { + "epoch": 0.5918131359851989, + "grad_norm": 3.417243480682373, + "learning_rate": 3.769324287595445e-06, + "loss": 0.922, + "step": 7677 + }, + { + "epoch": 0.5918902251002158, + "grad_norm": 3.6329598426818848, + "learning_rate": 3.7681142865550555e-06, + "loss": 0.9152, + "step": 7678 + }, + { + "epoch": 0.5919673142152329, + "grad_norm": 3.985581398010254, + "learning_rate": 3.766904362316362e-06, + "loss": 0.9494, + "step": 7679 + }, + { + "epoch": 0.5920444033302498, + "grad_norm": 3.8809006214141846, + "learning_rate": 3.765694514954796e-06, + "loss": 0.9822, + "step": 7680 + }, + { + "epoch": 0.5921214924452667, + "grad_norm": 3.394606828689575, + "learning_rate": 3.7644847445457826e-06, + "loss": 0.8432, + "step": 7681 + }, + { + "epoch": 0.5921985815602837, + "grad_norm": 3.865122079849243, + "learning_rate": 3.7632750511647478e-06, + "loss": 0.9003, + "step": 7682 + }, + { + "epoch": 0.5922756706753006, + "grad_norm": 3.693389892578125, + "learning_rate": 3.7620654348871083e-06, + "loss": 0.9377, + "step": 7683 + }, + { + "epoch": 0.5923527597903177, + "grad_norm": 3.6535701751708984, + "learning_rate": 3.760855895788277e-06, + "loss": 0.9298, + "step": 7684 + }, + { + "epoch": 0.5924298489053346, + "grad_norm": 3.484705686569214, + "learning_rate": 3.759646433943662e-06, + "loss": 0.9926, + "step": 7685 + }, + { + "epoch": 0.5925069380203515, + "grad_norm": 3.9103033542633057, + "learning_rate": 3.7584370494286697e-06, + "loss": 1.0298, + "step": 7686 + }, + { + "epoch": 0.5925840271353685, + "grad_norm": 3.3555185794830322, + "learning_rate": 3.7572277423186964e-06, + "loss": 0.8897, + "step": 7687 + }, + { + "epoch": 0.5926611162503854, + "grad_norm": 3.7804808616638184, + "learning_rate": 3.7560185126891375e-06, + "loss": 0.9413, + "step": 7688 + }, + { + "epoch": 0.5927382053654024, + "grad_norm": 3.7043962478637695, + "learning_rate": 3.75480936061538e-06, + "loss": 0.8945, + "step": 7689 + }, + { + "epoch": 0.5928152944804194, + "grad_norm": 3.398715019226074, + "learning_rate": 3.753600286172811e-06, + "loss": 0.8635, + "step": 7690 + }, + { + "epoch": 0.5928923835954363, + "grad_norm": 3.560133934020996, + "learning_rate": 3.7523912894368093e-06, + "loss": 0.9516, + "step": 7691 + }, + { + "epoch": 0.5929694727104533, + "grad_norm": 3.709573745727539, + "learning_rate": 3.751182370482748e-06, + "loss": 0.9026, + "step": 7692 + }, + { + "epoch": 0.5930465618254702, + "grad_norm": 3.8817639350891113, + "learning_rate": 3.7499735293859985e-06, + "loss": 0.9197, + "step": 7693 + }, + { + "epoch": 0.5931236509404872, + "grad_norm": 3.9092764854431152, + "learning_rate": 3.7487647662219263e-06, + "loss": 0.8946, + "step": 7694 + }, + { + "epoch": 0.5932007400555042, + "grad_norm": 3.4209654331207275, + "learning_rate": 3.7475560810658896e-06, + "loss": 0.8795, + "step": 7695 + }, + { + "epoch": 0.5932778291705211, + "grad_norm": 3.7244107723236084, + "learning_rate": 3.746347473993245e-06, + "loss": 0.9669, + "step": 7696 + }, + { + "epoch": 0.5933549182855381, + "grad_norm": 3.8219058513641357, + "learning_rate": 3.745138945079343e-06, + "loss": 0.9342, + "step": 7697 + }, + { + "epoch": 0.593432007400555, + "grad_norm": 3.733274459838867, + "learning_rate": 3.7439304943995274e-06, + "loss": 1.05, + "step": 7698 + }, + { + "epoch": 0.593509096515572, + "grad_norm": 3.3344218730926514, + "learning_rate": 3.742722122029142e-06, + "loss": 0.8988, + "step": 7699 + }, + { + "epoch": 0.593586185630589, + "grad_norm": 3.70858097076416, + "learning_rate": 3.741513828043519e-06, + "loss": 0.9501, + "step": 7700 + }, + { + "epoch": 0.5936632747456059, + "grad_norm": 3.8456056118011475, + "learning_rate": 3.7403056125179916e-06, + "loss": 0.9357, + "step": 7701 + }, + { + "epoch": 0.5937403638606229, + "grad_norm": 3.785587787628174, + "learning_rate": 3.739097475527885e-06, + "loss": 0.9546, + "step": 7702 + }, + { + "epoch": 0.5938174529756398, + "grad_norm": 3.5768582820892334, + "learning_rate": 3.7378894171485203e-06, + "loss": 0.8708, + "step": 7703 + }, + { + "epoch": 0.5938945420906568, + "grad_norm": 3.9242618083953857, + "learning_rate": 3.736681437455214e-06, + "loss": 1.0719, + "step": 7704 + }, + { + "epoch": 0.5939716312056738, + "grad_norm": 3.8821098804473877, + "learning_rate": 3.7354735365232777e-06, + "loss": 0.914, + "step": 7705 + }, + { + "epoch": 0.5940487203206907, + "grad_norm": 3.5671024322509766, + "learning_rate": 3.7342657144280162e-06, + "loss": 1.0382, + "step": 7706 + }, + { + "epoch": 0.5941258094357077, + "grad_norm": 4.059629917144775, + "learning_rate": 3.7330579712447324e-06, + "loss": 0.9346, + "step": 7707 + }, + { + "epoch": 0.5942028985507246, + "grad_norm": 3.79659366607666, + "learning_rate": 3.7318503070487235e-06, + "loss": 0.9125, + "step": 7708 + }, + { + "epoch": 0.5942799876657416, + "grad_norm": 3.4974241256713867, + "learning_rate": 3.7306427219152786e-06, + "loss": 0.9265, + "step": 7709 + }, + { + "epoch": 0.5943570767807586, + "grad_norm": 3.5367648601531982, + "learning_rate": 3.7294352159196865e-06, + "loss": 0.9146, + "step": 7710 + }, + { + "epoch": 0.5944341658957755, + "grad_norm": 3.3127148151397705, + "learning_rate": 3.7282277891372287e-06, + "loss": 0.9294, + "step": 7711 + }, + { + "epoch": 0.5945112550107925, + "grad_norm": 3.6922547817230225, + "learning_rate": 3.72702044164318e-06, + "loss": 0.9764, + "step": 7712 + }, + { + "epoch": 0.5945883441258094, + "grad_norm": 3.6928598880767822, + "learning_rate": 3.7258131735128156e-06, + "loss": 1.0221, + "step": 7713 + }, + { + "epoch": 0.5946654332408264, + "grad_norm": 3.2286767959594727, + "learning_rate": 3.7246059848213996e-06, + "loss": 0.736, + "step": 7714 + }, + { + "epoch": 0.5947425223558434, + "grad_norm": 3.4887712001800537, + "learning_rate": 3.7233988756441953e-06, + "loss": 0.9037, + "step": 7715 + }, + { + "epoch": 0.5948196114708603, + "grad_norm": 4.033304691314697, + "learning_rate": 3.72219184605646e-06, + "loss": 0.956, + "step": 7716 + }, + { + "epoch": 0.5948967005858773, + "grad_norm": 3.5219640731811523, + "learning_rate": 3.720984896133444e-06, + "loss": 0.9298, + "step": 7717 + }, + { + "epoch": 0.5949737897008942, + "grad_norm": 3.6129660606384277, + "learning_rate": 3.719778025950397e-06, + "loss": 0.9154, + "step": 7718 + }, + { + "epoch": 0.5950508788159112, + "grad_norm": 3.6359000205993652, + "learning_rate": 3.7185712355825577e-06, + "loss": 0.9319, + "step": 7719 + }, + { + "epoch": 0.5951279679309281, + "grad_norm": 3.6915714740753174, + "learning_rate": 3.717364525105166e-06, + "loss": 0.899, + "step": 7720 + }, + { + "epoch": 0.5952050570459451, + "grad_norm": 3.943981409072876, + "learning_rate": 3.716157894593454e-06, + "loss": 1.0859, + "step": 7721 + }, + { + "epoch": 0.5952821461609621, + "grad_norm": 3.356869697570801, + "learning_rate": 3.714951344122647e-06, + "loss": 0.8368, + "step": 7722 + }, + { + "epoch": 0.595359235275979, + "grad_norm": 3.6155312061309814, + "learning_rate": 3.7137448737679677e-06, + "loss": 0.8818, + "step": 7723 + }, + { + "epoch": 0.595436324390996, + "grad_norm": 3.962568759918213, + "learning_rate": 3.712538483604634e-06, + "loss": 0.93, + "step": 7724 + }, + { + "epoch": 0.595513413506013, + "grad_norm": 3.563605785369873, + "learning_rate": 3.7113321737078587e-06, + "loss": 0.872, + "step": 7725 + }, + { + "epoch": 0.5955905026210299, + "grad_norm": 3.954345226287842, + "learning_rate": 3.710125944152849e-06, + "loss": 0.9366, + "step": 7726 + }, + { + "epoch": 0.5956675917360469, + "grad_norm": 3.7304627895355225, + "learning_rate": 3.7089197950148054e-06, + "loss": 0.9616, + "step": 7727 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 3.6339216232299805, + "learning_rate": 3.7077137263689266e-06, + "loss": 0.8768, + "step": 7728 + }, + { + "epoch": 0.5958217699660808, + "grad_norm": 3.9956276416778564, + "learning_rate": 3.706507738290403e-06, + "loss": 1.0666, + "step": 7729 + }, + { + "epoch": 0.5958988590810977, + "grad_norm": 3.52579402923584, + "learning_rate": 3.705301830854423e-06, + "loss": 0.9173, + "step": 7730 + }, + { + "epoch": 0.5959759481961147, + "grad_norm": 3.9058704376220703, + "learning_rate": 3.7040960041361696e-06, + "loss": 1.0185, + "step": 7731 + }, + { + "epoch": 0.5960530373111317, + "grad_norm": 4.03030252456665, + "learning_rate": 3.7028902582108174e-06, + "loss": 1.025, + "step": 7732 + }, + { + "epoch": 0.5961301264261486, + "grad_norm": 4.030737400054932, + "learning_rate": 3.7016845931535395e-06, + "loss": 0.9925, + "step": 7733 + }, + { + "epoch": 0.5962072155411656, + "grad_norm": 3.7005774974823, + "learning_rate": 3.7004790090395043e-06, + "loss": 0.9436, + "step": 7734 + }, + { + "epoch": 0.5962843046561825, + "grad_norm": 3.9587509632110596, + "learning_rate": 3.699273505943871e-06, + "loss": 1.0326, + "step": 7735 + }, + { + "epoch": 0.5963613937711995, + "grad_norm": 3.745950937271118, + "learning_rate": 3.6980680839418e-06, + "loss": 0.8556, + "step": 7736 + }, + { + "epoch": 0.5964384828862165, + "grad_norm": 3.4118099212646484, + "learning_rate": 3.696862743108439e-06, + "loss": 0.9266, + "step": 7737 + }, + { + "epoch": 0.5965155720012334, + "grad_norm": 3.6588337421417236, + "learning_rate": 3.6956574835189374e-06, + "loss": 0.9452, + "step": 7738 + }, + { + "epoch": 0.5965926611162504, + "grad_norm": 3.32086443901062, + "learning_rate": 3.694452305248437e-06, + "loss": 0.9146, + "step": 7739 + }, + { + "epoch": 0.5966697502312673, + "grad_norm": 3.491138458251953, + "learning_rate": 3.693247208372074e-06, + "loss": 0.7812, + "step": 7740 + }, + { + "epoch": 0.5967468393462843, + "grad_norm": 3.2790908813476562, + "learning_rate": 3.6920421929649786e-06, + "loss": 0.7925, + "step": 7741 + }, + { + "epoch": 0.5968239284613013, + "grad_norm": 3.902376174926758, + "learning_rate": 3.690837259102279e-06, + "loss": 0.991, + "step": 7742 + }, + { + "epoch": 0.5969010175763182, + "grad_norm": 3.6180334091186523, + "learning_rate": 3.689632406859096e-06, + "loss": 0.8429, + "step": 7743 + }, + { + "epoch": 0.5969781066913352, + "grad_norm": 4.053879737854004, + "learning_rate": 3.688427636310545e-06, + "loss": 1.1251, + "step": 7744 + }, + { + "epoch": 0.5970551958063521, + "grad_norm": 4.163644790649414, + "learning_rate": 3.68722294753174e-06, + "loss": 0.9068, + "step": 7745 + }, + { + "epoch": 0.597132284921369, + "grad_norm": 3.3807566165924072, + "learning_rate": 3.6860183405977833e-06, + "loss": 0.8411, + "step": 7746 + }, + { + "epoch": 0.5972093740363861, + "grad_norm": 3.3274459838867188, + "learning_rate": 3.6848138155837786e-06, + "loss": 0.7565, + "step": 7747 + }, + { + "epoch": 0.597286463151403, + "grad_norm": 3.35614013671875, + "learning_rate": 3.683609372564821e-06, + "loss": 0.7392, + "step": 7748 + }, + { + "epoch": 0.59736355226642, + "grad_norm": 4.015716075897217, + "learning_rate": 3.6824050116160002e-06, + "loss": 1.0421, + "step": 7749 + }, + { + "epoch": 0.5974406413814369, + "grad_norm": 3.730281114578247, + "learning_rate": 3.681200732812405e-06, + "loss": 1.0248, + "step": 7750 + }, + { + "epoch": 0.5975177304964538, + "grad_norm": 3.4438388347625732, + "learning_rate": 3.6799965362291123e-06, + "loss": 0.8048, + "step": 7751 + }, + { + "epoch": 0.5975948196114709, + "grad_norm": 3.696259021759033, + "learning_rate": 3.678792421941199e-06, + "loss": 0.9177, + "step": 7752 + }, + { + "epoch": 0.5976719087264878, + "grad_norm": 3.7113966941833496, + "learning_rate": 3.677588390023736e-06, + "loss": 0.9895, + "step": 7753 + }, + { + "epoch": 0.5977489978415048, + "grad_norm": 3.3123090267181396, + "learning_rate": 3.6763844405517877e-06, + "loss": 0.8952, + "step": 7754 + }, + { + "epoch": 0.5978260869565217, + "grad_norm": 3.5116400718688965, + "learning_rate": 3.6751805736004148e-06, + "loss": 0.9271, + "step": 7755 + }, + { + "epoch": 0.5979031760715386, + "grad_norm": 3.612499237060547, + "learning_rate": 3.673976789244672e-06, + "loss": 0.8571, + "step": 7756 + }, + { + "epoch": 0.5979802651865557, + "grad_norm": 3.808100938796997, + "learning_rate": 3.672773087559608e-06, + "loss": 0.9372, + "step": 7757 + }, + { + "epoch": 0.5980573543015726, + "grad_norm": 3.6125247478485107, + "learning_rate": 3.6715694686202686e-06, + "loss": 0.93, + "step": 7758 + }, + { + "epoch": 0.5981344434165896, + "grad_norm": 3.831362724304199, + "learning_rate": 3.6703659325016927e-06, + "loss": 0.8422, + "step": 7759 + }, + { + "epoch": 0.5982115325316065, + "grad_norm": 3.4651618003845215, + "learning_rate": 3.669162479278914e-06, + "loss": 0.9065, + "step": 7760 + }, + { + "epoch": 0.5982886216466234, + "grad_norm": 3.6976826190948486, + "learning_rate": 3.667959109026963e-06, + "loss": 0.8673, + "step": 7761 + }, + { + "epoch": 0.5983657107616405, + "grad_norm": 3.865349292755127, + "learning_rate": 3.6667558218208617e-06, + "loss": 0.988, + "step": 7762 + }, + { + "epoch": 0.5984427998766574, + "grad_norm": 3.4592785835266113, + "learning_rate": 3.665552617735629e-06, + "loss": 0.9485, + "step": 7763 + }, + { + "epoch": 0.5985198889916744, + "grad_norm": 3.4235899448394775, + "learning_rate": 3.6643494968462822e-06, + "loss": 0.867, + "step": 7764 + }, + { + "epoch": 0.5985969781066913, + "grad_norm": 3.647528648376465, + "learning_rate": 3.663146459227824e-06, + "loss": 0.851, + "step": 7765 + }, + { + "epoch": 0.5986740672217082, + "grad_norm": 3.5159714221954346, + "learning_rate": 3.6619435049552633e-06, + "loss": 0.908, + "step": 7766 + }, + { + "epoch": 0.5987511563367253, + "grad_norm": 3.558000326156616, + "learning_rate": 3.66074063410359e-06, + "loss": 0.9187, + "step": 7767 + }, + { + "epoch": 0.5988282454517422, + "grad_norm": 3.671823263168335, + "learning_rate": 3.659537846747806e-06, + "loss": 0.9016, + "step": 7768 + }, + { + "epoch": 0.5989053345667592, + "grad_norm": 3.4551210403442383, + "learning_rate": 3.658335142962894e-06, + "loss": 0.8888, + "step": 7769 + }, + { + "epoch": 0.5989824236817761, + "grad_norm": 3.604067087173462, + "learning_rate": 3.657132522823837e-06, + "loss": 0.9862, + "step": 7770 + }, + { + "epoch": 0.5990595127967931, + "grad_norm": 3.8700289726257324, + "learning_rate": 3.655929986405613e-06, + "loss": 0.963, + "step": 7771 + }, + { + "epoch": 0.5991366019118101, + "grad_norm": 3.566358804702759, + "learning_rate": 3.6547275337831922e-06, + "loss": 0.8178, + "step": 7772 + }, + { + "epoch": 0.599213691026827, + "grad_norm": 3.576695203781128, + "learning_rate": 3.653525165031543e-06, + "loss": 0.9241, + "step": 7773 + }, + { + "epoch": 0.599290780141844, + "grad_norm": 3.5884556770324707, + "learning_rate": 3.6523228802256264e-06, + "loss": 0.8936, + "step": 7774 + }, + { + "epoch": 0.5993678692568609, + "grad_norm": 3.7305357456207275, + "learning_rate": 3.651120679440398e-06, + "loss": 0.8829, + "step": 7775 + }, + { + "epoch": 0.599444958371878, + "grad_norm": 3.6430912017822266, + "learning_rate": 3.6499185627508098e-06, + "loss": 0.9241, + "step": 7776 + }, + { + "epoch": 0.5995220474868949, + "grad_norm": 3.6668894290924072, + "learning_rate": 3.648716530231806e-06, + "loss": 0.8658, + "step": 7777 + }, + { + "epoch": 0.5995991366019118, + "grad_norm": 3.5166971683502197, + "learning_rate": 3.6475145819583285e-06, + "loss": 0.9614, + "step": 7778 + }, + { + "epoch": 0.5996762257169288, + "grad_norm": 3.9668848514556885, + "learning_rate": 3.6463127180053125e-06, + "loss": 1.0337, + "step": 7779 + }, + { + "epoch": 0.5997533148319457, + "grad_norm": 3.5384409427642822, + "learning_rate": 3.6451109384476875e-06, + "loss": 0.9416, + "step": 7780 + }, + { + "epoch": 0.5998304039469627, + "grad_norm": 3.4631733894348145, + "learning_rate": 3.6439092433603775e-06, + "loss": 0.819, + "step": 7781 + }, + { + "epoch": 0.5999074930619797, + "grad_norm": 3.906912326812744, + "learning_rate": 3.642707632818304e-06, + "loss": 0.908, + "step": 7782 + }, + { + "epoch": 0.5999845821769966, + "grad_norm": 3.743074893951416, + "learning_rate": 3.641506106896379e-06, + "loss": 0.9959, + "step": 7783 + }, + { + "epoch": 0.6000616712920136, + "grad_norm": 3.657150983810425, + "learning_rate": 3.640304665669514e-06, + "loss": 0.9171, + "step": 7784 + }, + { + "epoch": 0.6001387604070305, + "grad_norm": 3.718385934829712, + "learning_rate": 3.639103309212609e-06, + "loss": 0.8975, + "step": 7785 + }, + { + "epoch": 0.6002158495220475, + "grad_norm": 3.5247554779052734, + "learning_rate": 3.6379020376005646e-06, + "loss": 0.8734, + "step": 7786 + }, + { + "epoch": 0.6002929386370645, + "grad_norm": 3.818239688873291, + "learning_rate": 3.6367008509082757e-06, + "loss": 0.9179, + "step": 7787 + }, + { + "epoch": 0.6003700277520814, + "grad_norm": 4.0803728103637695, + "learning_rate": 3.6354997492106258e-06, + "loss": 0.9927, + "step": 7788 + }, + { + "epoch": 0.6004471168670984, + "grad_norm": 3.478654384613037, + "learning_rate": 3.634298732582501e-06, + "loss": 0.9289, + "step": 7789 + }, + { + "epoch": 0.6005242059821153, + "grad_norm": 3.6446428298950195, + "learning_rate": 3.6330978010987767e-06, + "loss": 0.9966, + "step": 7790 + }, + { + "epoch": 0.6006012950971323, + "grad_norm": 3.805497407913208, + "learning_rate": 3.6318969548343246e-06, + "loss": 1.0771, + "step": 7791 + }, + { + "epoch": 0.6006783842121493, + "grad_norm": 3.4235615730285645, + "learning_rate": 3.6306961938640116e-06, + "loss": 0.9165, + "step": 7792 + }, + { + "epoch": 0.6007554733271662, + "grad_norm": 3.7066256999969482, + "learning_rate": 3.6294955182627007e-06, + "loss": 0.8412, + "step": 7793 + }, + { + "epoch": 0.6008325624421832, + "grad_norm": 3.6571426391601562, + "learning_rate": 3.628294928105245e-06, + "loss": 0.8972, + "step": 7794 + }, + { + "epoch": 0.6009096515572001, + "grad_norm": 3.8981006145477295, + "learning_rate": 3.6270944234664972e-06, + "loss": 0.8919, + "step": 7795 + }, + { + "epoch": 0.6009867406722171, + "grad_norm": 3.7625627517700195, + "learning_rate": 3.6258940044213004e-06, + "loss": 0.8221, + "step": 7796 + }, + { + "epoch": 0.601063829787234, + "grad_norm": 4.007872104644775, + "learning_rate": 3.6246936710444957e-06, + "loss": 0.9362, + "step": 7797 + }, + { + "epoch": 0.601140918902251, + "grad_norm": 3.732297420501709, + "learning_rate": 3.623493423410919e-06, + "loss": 0.8811, + "step": 7798 + }, + { + "epoch": 0.601218008017268, + "grad_norm": 3.483485698699951, + "learning_rate": 3.6222932615953977e-06, + "loss": 0.9207, + "step": 7799 + }, + { + "epoch": 0.6012950971322849, + "grad_norm": 3.7129147052764893, + "learning_rate": 3.6210931856727547e-06, + "loss": 0.8513, + "step": 7800 + }, + { + "epoch": 0.6013721862473019, + "grad_norm": 3.39215087890625, + "learning_rate": 3.619893195717813e-06, + "loss": 0.8216, + "step": 7801 + }, + { + "epoch": 0.6014492753623188, + "grad_norm": 3.39719295501709, + "learning_rate": 3.6186932918053806e-06, + "loss": 1.0014, + "step": 7802 + }, + { + "epoch": 0.6015263644773358, + "grad_norm": 3.573181629180908, + "learning_rate": 3.6174934740102672e-06, + "loss": 0.864, + "step": 7803 + }, + { + "epoch": 0.6016034535923528, + "grad_norm": 3.7137699127197266, + "learning_rate": 3.6162937424072775e-06, + "loss": 1.0308, + "step": 7804 + }, + { + "epoch": 0.6016805427073697, + "grad_norm": 3.913328170776367, + "learning_rate": 3.6150940970712045e-06, + "loss": 0.9982, + "step": 7805 + }, + { + "epoch": 0.6017576318223867, + "grad_norm": 3.644329071044922, + "learning_rate": 3.6138945380768442e-06, + "loss": 0.9262, + "step": 7806 + }, + { + "epoch": 0.6018347209374036, + "grad_norm": 3.936444044113159, + "learning_rate": 3.612695065498979e-06, + "loss": 0.8839, + "step": 7807 + }, + { + "epoch": 0.6019118100524206, + "grad_norm": 3.725987434387207, + "learning_rate": 3.6114956794123913e-06, + "loss": 0.9187, + "step": 7808 + }, + { + "epoch": 0.6019888991674376, + "grad_norm": 3.515115261077881, + "learning_rate": 3.6102963798918576e-06, + "loss": 0.9055, + "step": 7809 + }, + { + "epoch": 0.6020659882824545, + "grad_norm": 3.660930871963501, + "learning_rate": 3.6090971670121468e-06, + "loss": 0.8665, + "step": 7810 + }, + { + "epoch": 0.6021430773974715, + "grad_norm": 3.7818009853363037, + "learning_rate": 3.6078980408480217e-06, + "loss": 1.0293, + "step": 7811 + }, + { + "epoch": 0.6022201665124884, + "grad_norm": 4.332160949707031, + "learning_rate": 3.606699001474246e-06, + "loss": 1.1111, + "step": 7812 + }, + { + "epoch": 0.6022972556275054, + "grad_norm": 3.5895073413848877, + "learning_rate": 3.6055000489655725e-06, + "loss": 0.8604, + "step": 7813 + }, + { + "epoch": 0.6023743447425224, + "grad_norm": 3.754880666732788, + "learning_rate": 3.6043011833967465e-06, + "loss": 0.9298, + "step": 7814 + }, + { + "epoch": 0.6024514338575393, + "grad_norm": 4.089043140411377, + "learning_rate": 3.6031024048425146e-06, + "loss": 1.0692, + "step": 7815 + }, + { + "epoch": 0.6025285229725563, + "grad_norm": 3.798560619354248, + "learning_rate": 3.6019037133776136e-06, + "loss": 0.918, + "step": 7816 + }, + { + "epoch": 0.6026056120875732, + "grad_norm": 3.821998357772827, + "learning_rate": 3.6007051090767737e-06, + "loss": 0.9856, + "step": 7817 + }, + { + "epoch": 0.6026827012025902, + "grad_norm": 3.517061948776245, + "learning_rate": 3.5995065920147233e-06, + "loss": 0.8683, + "step": 7818 + }, + { + "epoch": 0.6027597903176072, + "grad_norm": 3.6391801834106445, + "learning_rate": 3.598308162266185e-06, + "loss": 0.8445, + "step": 7819 + }, + { + "epoch": 0.6028368794326241, + "grad_norm": 3.6682894229888916, + "learning_rate": 3.5971098199058716e-06, + "loss": 0.9821, + "step": 7820 + }, + { + "epoch": 0.6029139685476411, + "grad_norm": 4.120725154876709, + "learning_rate": 3.5959115650084954e-06, + "loss": 0.8732, + "step": 7821 + }, + { + "epoch": 0.602991057662658, + "grad_norm": 3.8292622566223145, + "learning_rate": 3.5947133976487623e-06, + "loss": 0.986, + "step": 7822 + }, + { + "epoch": 0.603068146777675, + "grad_norm": 3.57987904548645, + "learning_rate": 3.5935153179013692e-06, + "loss": 0.8983, + "step": 7823 + }, + { + "epoch": 0.603145235892692, + "grad_norm": 3.4487192630767822, + "learning_rate": 3.592317325841014e-06, + "loss": 0.8779, + "step": 7824 + }, + { + "epoch": 0.6032223250077089, + "grad_norm": 3.598322629928589, + "learning_rate": 3.5911194215423805e-06, + "loss": 0.9429, + "step": 7825 + }, + { + "epoch": 0.6032994141227259, + "grad_norm": 3.9131698608398438, + "learning_rate": 3.5899216050801548e-06, + "loss": 1.0573, + "step": 7826 + }, + { + "epoch": 0.6033765032377428, + "grad_norm": 3.5165915489196777, + "learning_rate": 3.5887238765290155e-06, + "loss": 0.8065, + "step": 7827 + }, + { + "epoch": 0.6034535923527597, + "grad_norm": 3.4802615642547607, + "learning_rate": 3.5875262359636316e-06, + "loss": 0.9431, + "step": 7828 + }, + { + "epoch": 0.6035306814677768, + "grad_norm": 4.010217666625977, + "learning_rate": 3.586328683458672e-06, + "loss": 0.9236, + "step": 7829 + }, + { + "epoch": 0.6036077705827937, + "grad_norm": 3.4742398262023926, + "learning_rate": 3.5851312190887975e-06, + "loss": 0.9148, + "step": 7830 + }, + { + "epoch": 0.6036848596978107, + "grad_norm": 3.8777101039886475, + "learning_rate": 3.583933842928664e-06, + "loss": 0.9534, + "step": 7831 + }, + { + "epoch": 0.6037619488128276, + "grad_norm": 3.97666335105896, + "learning_rate": 3.5827365550529215e-06, + "loss": 1.0859, + "step": 7832 + }, + { + "epoch": 0.6038390379278445, + "grad_norm": 3.1657111644744873, + "learning_rate": 3.581539355536214e-06, + "loss": 0.9061, + "step": 7833 + }, + { + "epoch": 0.6039161270428616, + "grad_norm": 3.550740957260132, + "learning_rate": 3.580342244453181e-06, + "loss": 0.9728, + "step": 7834 + }, + { + "epoch": 0.6039932161578785, + "grad_norm": 3.700817108154297, + "learning_rate": 3.5791452218784572e-06, + "loss": 0.9626, + "step": 7835 + }, + { + "epoch": 0.6040703052728955, + "grad_norm": 3.693549633026123, + "learning_rate": 3.57794828788667e-06, + "loss": 1.0247, + "step": 7836 + }, + { + "epoch": 0.6041473943879124, + "grad_norm": 3.4788200855255127, + "learning_rate": 3.5767514425524413e-06, + "loss": 0.9126, + "step": 7837 + }, + { + "epoch": 0.6042244835029293, + "grad_norm": 3.526183843612671, + "learning_rate": 3.57555468595039e-06, + "loss": 1.0105, + "step": 7838 + }, + { + "epoch": 0.6043015726179464, + "grad_norm": 3.612081289291382, + "learning_rate": 3.5743580181551265e-06, + "loss": 0.9198, + "step": 7839 + }, + { + "epoch": 0.6043786617329633, + "grad_norm": 3.6444640159606934, + "learning_rate": 3.5731614392412557e-06, + "loss": 0.9105, + "step": 7840 + }, + { + "epoch": 0.6044557508479803, + "grad_norm": 3.4978742599487305, + "learning_rate": 3.571964949283381e-06, + "loss": 0.8368, + "step": 7841 + }, + { + "epoch": 0.6045328399629972, + "grad_norm": 3.5534303188323975, + "learning_rate": 3.5707685483560948e-06, + "loss": 0.9524, + "step": 7842 + }, + { + "epoch": 0.6046099290780141, + "grad_norm": 3.6037681102752686, + "learning_rate": 3.569572236533988e-06, + "loss": 0.9408, + "step": 7843 + }, + { + "epoch": 0.6046870181930312, + "grad_norm": 3.651366949081421, + "learning_rate": 3.5683760138916433e-06, + "loss": 0.8935, + "step": 7844 + }, + { + "epoch": 0.6047641073080481, + "grad_norm": 3.578256368637085, + "learning_rate": 3.5671798805036396e-06, + "loss": 0.9498, + "step": 7845 + }, + { + "epoch": 0.6048411964230651, + "grad_norm": 4.041118621826172, + "learning_rate": 3.5659838364445505e-06, + "loss": 0.9394, + "step": 7846 + }, + { + "epoch": 0.604918285538082, + "grad_norm": 3.86403489112854, + "learning_rate": 3.564787881788941e-06, + "loss": 0.9925, + "step": 7847 + }, + { + "epoch": 0.6049953746530989, + "grad_norm": 3.811384916305542, + "learning_rate": 3.5635920166113735e-06, + "loss": 0.9659, + "step": 7848 + }, + { + "epoch": 0.605072463768116, + "grad_norm": 3.6318352222442627, + "learning_rate": 3.5623962409864066e-06, + "loss": 0.8375, + "step": 7849 + }, + { + "epoch": 0.6051495528831329, + "grad_norm": 3.6698193550109863, + "learning_rate": 3.5612005549885865e-06, + "loss": 1.0244, + "step": 7850 + }, + { + "epoch": 0.6052266419981499, + "grad_norm": 3.5718917846679688, + "learning_rate": 3.5600049586924607e-06, + "loss": 0.9182, + "step": 7851 + }, + { + "epoch": 0.6053037311131668, + "grad_norm": 3.9225900173187256, + "learning_rate": 3.5588094521725687e-06, + "loss": 0.9336, + "step": 7852 + }, + { + "epoch": 0.6053808202281837, + "grad_norm": 3.1916608810424805, + "learning_rate": 3.5576140355034415e-06, + "loss": 0.8681, + "step": 7853 + }, + { + "epoch": 0.6054579093432008, + "grad_norm": 3.703646659851074, + "learning_rate": 3.5564187087596116e-06, + "loss": 0.9635, + "step": 7854 + }, + { + "epoch": 0.6055349984582177, + "grad_norm": 3.636467218399048, + "learning_rate": 3.5552234720155963e-06, + "loss": 0.8925, + "step": 7855 + }, + { + "epoch": 0.6056120875732347, + "grad_norm": 3.733719825744629, + "learning_rate": 3.554028325345914e-06, + "loss": 0.9168, + "step": 7856 + }, + { + "epoch": 0.6056891766882516, + "grad_norm": 3.497403860092163, + "learning_rate": 3.552833268825078e-06, + "loss": 0.9535, + "step": 7857 + }, + { + "epoch": 0.6057662658032685, + "grad_norm": 4.8122382164001465, + "learning_rate": 3.5516383025275925e-06, + "loss": 0.9782, + "step": 7858 + }, + { + "epoch": 0.6058433549182856, + "grad_norm": 3.9689764976501465, + "learning_rate": 3.5504434265279587e-06, + "loss": 1.006, + "step": 7859 + }, + { + "epoch": 0.6059204440333025, + "grad_norm": 3.636197566986084, + "learning_rate": 3.5492486409006684e-06, + "loss": 0.9264, + "step": 7860 + }, + { + "epoch": 0.6059975331483195, + "grad_norm": 4.189083099365234, + "learning_rate": 3.5480539457202127e-06, + "loss": 0.9417, + "step": 7861 + }, + { + "epoch": 0.6060746222633364, + "grad_norm": 3.744331121444702, + "learning_rate": 3.546859341061073e-06, + "loss": 0.8417, + "step": 7862 + }, + { + "epoch": 0.6061517113783533, + "grad_norm": 3.3087058067321777, + "learning_rate": 3.545664826997727e-06, + "loss": 0.911, + "step": 7863 + }, + { + "epoch": 0.6062288004933704, + "grad_norm": 3.681582450866699, + "learning_rate": 3.5444704036046485e-06, + "loss": 0.9636, + "step": 7864 + }, + { + "epoch": 0.6063058896083873, + "grad_norm": 3.7215073108673096, + "learning_rate": 3.543276070956301e-06, + "loss": 0.9945, + "step": 7865 + }, + { + "epoch": 0.6063829787234043, + "grad_norm": 3.6413660049438477, + "learning_rate": 3.542081829127145e-06, + "loss": 0.8434, + "step": 7866 + }, + { + "epoch": 0.6064600678384212, + "grad_norm": 3.857893705368042, + "learning_rate": 3.540887678191638e-06, + "loss": 0.9055, + "step": 7867 + }, + { + "epoch": 0.6065371569534381, + "grad_norm": 3.426194906234741, + "learning_rate": 3.539693618224226e-06, + "loss": 0.9556, + "step": 7868 + }, + { + "epoch": 0.6066142460684552, + "grad_norm": 3.5567467212677, + "learning_rate": 3.538499649299354e-06, + "loss": 0.9462, + "step": 7869 + }, + { + "epoch": 0.6066913351834721, + "grad_norm": 3.7717530727386475, + "learning_rate": 3.5373057714914607e-06, + "loss": 0.9073, + "step": 7870 + }, + { + "epoch": 0.6067684242984891, + "grad_norm": 4.004831790924072, + "learning_rate": 3.5361119848749755e-06, + "loss": 0.9405, + "step": 7871 + }, + { + "epoch": 0.606845513413506, + "grad_norm": 3.8351056575775146, + "learning_rate": 3.5349182895243274e-06, + "loss": 1.0261, + "step": 7872 + }, + { + "epoch": 0.6069226025285229, + "grad_norm": 3.8197968006134033, + "learning_rate": 3.533724685513935e-06, + "loss": 0.9427, + "step": 7873 + }, + { + "epoch": 0.60699969164354, + "grad_norm": 3.7887625694274902, + "learning_rate": 3.5325311729182155e-06, + "loss": 1.1297, + "step": 7874 + }, + { + "epoch": 0.6070767807585569, + "grad_norm": 3.8279473781585693, + "learning_rate": 3.5313377518115772e-06, + "loss": 0.9428, + "step": 7875 + }, + { + "epoch": 0.6071538698735739, + "grad_norm": 3.926093101501465, + "learning_rate": 3.530144422268423e-06, + "loss": 0.8763, + "step": 7876 + }, + { + "epoch": 0.6072309589885908, + "grad_norm": 3.941603183746338, + "learning_rate": 3.528951184363151e-06, + "loss": 1.0133, + "step": 7877 + }, + { + "epoch": 0.6073080481036077, + "grad_norm": 3.6375105381011963, + "learning_rate": 3.5277580381701553e-06, + "loss": 0.9153, + "step": 7878 + }, + { + "epoch": 0.6073851372186247, + "grad_norm": 3.378063678741455, + "learning_rate": 3.52656498376382e-06, + "loss": 0.8914, + "step": 7879 + }, + { + "epoch": 0.6074622263336417, + "grad_norm": 3.7920594215393066, + "learning_rate": 3.5253720212185284e-06, + "loss": 1.0532, + "step": 7880 + }, + { + "epoch": 0.6075393154486587, + "grad_norm": 3.6664042472839355, + "learning_rate": 3.524179150608652e-06, + "loss": 0.7833, + "step": 7881 + }, + { + "epoch": 0.6076164045636756, + "grad_norm": 3.3724348545074463, + "learning_rate": 3.5229863720085623e-06, + "loss": 0.9102, + "step": 7882 + }, + { + "epoch": 0.6076934936786925, + "grad_norm": 3.686537504196167, + "learning_rate": 3.521793685492624e-06, + "loss": 0.9696, + "step": 7883 + }, + { + "epoch": 0.6077705827937095, + "grad_norm": 3.747305154800415, + "learning_rate": 3.5206010911351924e-06, + "loss": 1.0284, + "step": 7884 + }, + { + "epoch": 0.6078476719087265, + "grad_norm": 3.655407667160034, + "learning_rate": 3.5194085890106204e-06, + "loss": 0.9406, + "step": 7885 + }, + { + "epoch": 0.6079247610237435, + "grad_norm": 3.6293342113494873, + "learning_rate": 3.518216179193257e-06, + "loss": 0.9043, + "step": 7886 + }, + { + "epoch": 0.6080018501387604, + "grad_norm": 3.54872465133667, + "learning_rate": 3.517023861757438e-06, + "loss": 1.0235, + "step": 7887 + }, + { + "epoch": 0.6080789392537773, + "grad_norm": 4.1367106437683105, + "learning_rate": 3.5158316367775016e-06, + "loss": 0.968, + "step": 7888 + }, + { + "epoch": 0.6081560283687943, + "grad_norm": 3.3810555934906006, + "learning_rate": 3.514639504327776e-06, + "loss": 0.8467, + "step": 7889 + }, + { + "epoch": 0.6082331174838113, + "grad_norm": 3.417717456817627, + "learning_rate": 3.5134474644825847e-06, + "loss": 0.8455, + "step": 7890 + }, + { + "epoch": 0.6083102065988283, + "grad_norm": 3.642557144165039, + "learning_rate": 3.512255517316245e-06, + "loss": 0.9056, + "step": 7891 + }, + { + "epoch": 0.6083872957138452, + "grad_norm": 3.8719959259033203, + "learning_rate": 3.5110636629030674e-06, + "loss": 0.9462, + "step": 7892 + }, + { + "epoch": 0.6084643848288621, + "grad_norm": 3.6329193115234375, + "learning_rate": 3.509871901317359e-06, + "loss": 0.9395, + "step": 7893 + }, + { + "epoch": 0.6085414739438791, + "grad_norm": 4.278444290161133, + "learning_rate": 3.5086802326334217e-06, + "loss": 0.946, + "step": 7894 + }, + { + "epoch": 0.6086185630588961, + "grad_norm": 3.6533453464508057, + "learning_rate": 3.507488656925547e-06, + "loss": 0.9943, + "step": 7895 + }, + { + "epoch": 0.6086956521739131, + "grad_norm": 3.3873438835144043, + "learning_rate": 3.5062971742680244e-06, + "loss": 0.9275, + "step": 7896 + }, + { + "epoch": 0.60877274128893, + "grad_norm": 3.647336006164551, + "learning_rate": 3.5051057847351377e-06, + "loss": 0.8534, + "step": 7897 + }, + { + "epoch": 0.6088498304039469, + "grad_norm": 3.4682047367095947, + "learning_rate": 3.503914488401163e-06, + "loss": 0.9414, + "step": 7898 + }, + { + "epoch": 0.6089269195189639, + "grad_norm": 3.890171527862549, + "learning_rate": 3.5027232853403705e-06, + "loss": 1.0898, + "step": 7899 + }, + { + "epoch": 0.6090040086339809, + "grad_norm": 3.491349220275879, + "learning_rate": 3.501532175627026e-06, + "loss": 0.9413, + "step": 7900 + }, + { + "epoch": 0.6090810977489979, + "grad_norm": 3.580474376678467, + "learning_rate": 3.5003411593353913e-06, + "loss": 0.9499, + "step": 7901 + }, + { + "epoch": 0.6091581868640148, + "grad_norm": 3.8808116912841797, + "learning_rate": 3.4991502365397177e-06, + "loss": 0.9255, + "step": 7902 + }, + { + "epoch": 0.6092352759790317, + "grad_norm": 3.8387744426727295, + "learning_rate": 3.4979594073142535e-06, + "loss": 0.7857, + "step": 7903 + }, + { + "epoch": 0.6093123650940487, + "grad_norm": 3.8449535369873047, + "learning_rate": 3.4967686717332426e-06, + "loss": 0.851, + "step": 7904 + }, + { + "epoch": 0.6093894542090657, + "grad_norm": 4.096821308135986, + "learning_rate": 3.495578029870918e-06, + "loss": 0.9903, + "step": 7905 + }, + { + "epoch": 0.6094665433240827, + "grad_norm": 3.537116050720215, + "learning_rate": 3.4943874818015115e-06, + "loss": 0.9367, + "step": 7906 + }, + { + "epoch": 0.6095436324390996, + "grad_norm": 3.9148404598236084, + "learning_rate": 3.493197027599249e-06, + "loss": 1.0676, + "step": 7907 + }, + { + "epoch": 0.6096207215541165, + "grad_norm": 3.719594717025757, + "learning_rate": 3.492006667338347e-06, + "loss": 0.8925, + "step": 7908 + }, + { + "epoch": 0.6096978106691335, + "grad_norm": 3.5892961025238037, + "learning_rate": 3.49081640109302e-06, + "loss": 0.9494, + "step": 7909 + }, + { + "epoch": 0.6097748997841504, + "grad_norm": 3.9413259029388428, + "learning_rate": 3.489626228937473e-06, + "loss": 0.8696, + "step": 7910 + }, + { + "epoch": 0.6098519888991675, + "grad_norm": 3.645066499710083, + "learning_rate": 3.4884361509459088e-06, + "loss": 0.9745, + "step": 7911 + }, + { + "epoch": 0.6099290780141844, + "grad_norm": 3.4567646980285645, + "learning_rate": 3.4872461671925227e-06, + "loss": 0.721, + "step": 7912 + }, + { + "epoch": 0.6100061671292013, + "grad_norm": 3.5239381790161133, + "learning_rate": 3.486056277751502e-06, + "loss": 0.9016, + "step": 7913 + }, + { + "epoch": 0.6100832562442183, + "grad_norm": 3.2868807315826416, + "learning_rate": 3.484866482697032e-06, + "loss": 0.888, + "step": 7914 + }, + { + "epoch": 0.6101603453592352, + "grad_norm": 3.2013051509857178, + "learning_rate": 3.4836767821032902e-06, + "loss": 0.8686, + "step": 7915 + }, + { + "epoch": 0.6102374344742523, + "grad_norm": 3.507582426071167, + "learning_rate": 3.4824871760444465e-06, + "loss": 1.027, + "step": 7916 + }, + { + "epoch": 0.6103145235892692, + "grad_norm": 3.7498128414154053, + "learning_rate": 3.48129766459467e-06, + "loss": 1.0387, + "step": 7917 + }, + { + "epoch": 0.6103916127042861, + "grad_norm": 3.7105958461761475, + "learning_rate": 3.480108247828117e-06, + "loss": 0.9652, + "step": 7918 + }, + { + "epoch": 0.6104687018193031, + "grad_norm": 3.8917605876922607, + "learning_rate": 3.478918925818943e-06, + "loss": 0.9192, + "step": 7919 + }, + { + "epoch": 0.61054579093432, + "grad_norm": 3.6685686111450195, + "learning_rate": 3.4777296986412972e-06, + "loss": 0.9294, + "step": 7920 + }, + { + "epoch": 0.6106228800493371, + "grad_norm": 3.4759345054626465, + "learning_rate": 3.4765405663693196e-06, + "loss": 0.8923, + "step": 7921 + }, + { + "epoch": 0.610699969164354, + "grad_norm": 3.711207151412964, + "learning_rate": 3.475351529077148e-06, + "loss": 0.9937, + "step": 7922 + }, + { + "epoch": 0.6107770582793709, + "grad_norm": 3.594850540161133, + "learning_rate": 3.474162586838913e-06, + "loss": 0.8816, + "step": 7923 + }, + { + "epoch": 0.6108541473943879, + "grad_norm": 3.8033182621002197, + "learning_rate": 3.4729737397287365e-06, + "loss": 0.9382, + "step": 7924 + }, + { + "epoch": 0.6109312365094048, + "grad_norm": 3.380199432373047, + "learning_rate": 3.4717849878207387e-06, + "loss": 0.8568, + "step": 7925 + }, + { + "epoch": 0.6110083256244219, + "grad_norm": 3.973435640335083, + "learning_rate": 3.4705963311890334e-06, + "loss": 0.8793, + "step": 7926 + }, + { + "epoch": 0.6110854147394388, + "grad_norm": 3.6724209785461426, + "learning_rate": 3.469407769907725e-06, + "loss": 0.8936, + "step": 7927 + }, + { + "epoch": 0.6111625038544557, + "grad_norm": 3.7241382598876953, + "learning_rate": 3.468219304050916e-06, + "loss": 0.9815, + "step": 7928 + }, + { + "epoch": 0.6112395929694727, + "grad_norm": 3.8722872734069824, + "learning_rate": 3.4670309336926988e-06, + "loss": 1.0309, + "step": 7929 + }, + { + "epoch": 0.6113166820844896, + "grad_norm": 3.580594301223755, + "learning_rate": 3.4658426589071637e-06, + "loss": 0.834, + "step": 7930 + }, + { + "epoch": 0.6113937711995067, + "grad_norm": 3.637439012527466, + "learning_rate": 3.4646544797683946e-06, + "loss": 0.9892, + "step": 7931 + }, + { + "epoch": 0.6114708603145236, + "grad_norm": 3.7209787368774414, + "learning_rate": 3.4634663963504654e-06, + "loss": 0.8565, + "step": 7932 + }, + { + "epoch": 0.6115479494295405, + "grad_norm": 3.676300525665283, + "learning_rate": 3.4622784087274487e-06, + "loss": 1.0424, + "step": 7933 + }, + { + "epoch": 0.6116250385445575, + "grad_norm": 4.965175151824951, + "learning_rate": 3.46109051697341e-06, + "loss": 0.9192, + "step": 7934 + }, + { + "epoch": 0.6117021276595744, + "grad_norm": 3.5655887126922607, + "learning_rate": 3.459902721162406e-06, + "loss": 0.958, + "step": 7935 + }, + { + "epoch": 0.6117792167745915, + "grad_norm": 3.9590542316436768, + "learning_rate": 3.4587150213684917e-06, + "loss": 0.9491, + "step": 7936 + }, + { + "epoch": 0.6118563058896084, + "grad_norm": 3.773998498916626, + "learning_rate": 3.4575274176657134e-06, + "loss": 0.9361, + "step": 7937 + }, + { + "epoch": 0.6119333950046254, + "grad_norm": 3.76638126373291, + "learning_rate": 3.4563399101281116e-06, + "loss": 0.9906, + "step": 7938 + }, + { + "epoch": 0.6120104841196423, + "grad_norm": 3.668475389480591, + "learning_rate": 3.4551524988297224e-06, + "loss": 0.782, + "step": 7939 + }, + { + "epoch": 0.6120875732346592, + "grad_norm": 3.6993038654327393, + "learning_rate": 3.453965183844573e-06, + "loss": 0.9687, + "step": 7940 + }, + { + "epoch": 0.6121646623496763, + "grad_norm": 4.174620628356934, + "learning_rate": 3.4527779652466876e-06, + "loss": 0.9987, + "step": 7941 + }, + { + "epoch": 0.6122417514646932, + "grad_norm": 3.559088945388794, + "learning_rate": 3.451590843110083e-06, + "loss": 1.0611, + "step": 7942 + }, + { + "epoch": 0.6123188405797102, + "grad_norm": 3.860713243484497, + "learning_rate": 3.4504038175087697e-06, + "loss": 0.9319, + "step": 7943 + }, + { + "epoch": 0.6123959296947271, + "grad_norm": 3.864760160446167, + "learning_rate": 3.449216888516751e-06, + "loss": 0.8688, + "step": 7944 + }, + { + "epoch": 0.612473018809744, + "grad_norm": 3.620938539505005, + "learning_rate": 3.448030056208029e-06, + "loss": 0.8719, + "step": 7945 + }, + { + "epoch": 0.6125501079247611, + "grad_norm": 3.6368861198425293, + "learning_rate": 3.4468433206565964e-06, + "loss": 0.9333, + "step": 7946 + }, + { + "epoch": 0.612627197039778, + "grad_norm": 3.713548421859741, + "learning_rate": 3.445656681936438e-06, + "loss": 1.0769, + "step": 7947 + }, + { + "epoch": 0.612704286154795, + "grad_norm": 3.9157612323760986, + "learning_rate": 3.4444701401215343e-06, + "loss": 0.8697, + "step": 7948 + }, + { + "epoch": 0.6127813752698119, + "grad_norm": 3.3834121227264404, + "learning_rate": 3.443283695285863e-06, + "loss": 0.7926, + "step": 7949 + }, + { + "epoch": 0.6128584643848288, + "grad_norm": 3.601344108581543, + "learning_rate": 3.4420973475033894e-06, + "loss": 0.922, + "step": 7950 + }, + { + "epoch": 0.6129355534998459, + "grad_norm": 3.9263248443603516, + "learning_rate": 3.4409110968480773e-06, + "loss": 0.8751, + "step": 7951 + }, + { + "epoch": 0.6130126426148628, + "grad_norm": 3.377624988555908, + "learning_rate": 3.439724943393885e-06, + "loss": 0.8985, + "step": 7952 + }, + { + "epoch": 0.6130897317298798, + "grad_norm": 3.6270108222961426, + "learning_rate": 3.438538887214761e-06, + "loss": 0.965, + "step": 7953 + }, + { + "epoch": 0.6131668208448967, + "grad_norm": 3.6317126750946045, + "learning_rate": 3.4373529283846498e-06, + "loss": 0.8884, + "step": 7954 + }, + { + "epoch": 0.6132439099599136, + "grad_norm": 3.744412422180176, + "learning_rate": 3.4361670669774917e-06, + "loss": 0.9421, + "step": 7955 + }, + { + "epoch": 0.6133209990749307, + "grad_norm": 3.9327218532562256, + "learning_rate": 3.4349813030672165e-06, + "loss": 0.894, + "step": 7956 + }, + { + "epoch": 0.6133980881899476, + "grad_norm": 3.5057458877563477, + "learning_rate": 3.4337956367277524e-06, + "loss": 0.9483, + "step": 7957 + }, + { + "epoch": 0.6134751773049646, + "grad_norm": 3.859243869781494, + "learning_rate": 3.432610068033018e-06, + "loss": 1.0009, + "step": 7958 + }, + { + "epoch": 0.6135522664199815, + "grad_norm": 3.938803195953369, + "learning_rate": 3.4314245970569283e-06, + "loss": 0.8712, + "step": 7959 + }, + { + "epoch": 0.6136293555349984, + "grad_norm": 3.743238687515259, + "learning_rate": 3.4302392238733916e-06, + "loss": 0.9356, + "step": 7960 + }, + { + "epoch": 0.6137064446500154, + "grad_norm": 3.5383241176605225, + "learning_rate": 3.429053948556309e-06, + "loss": 1.0357, + "step": 7961 + }, + { + "epoch": 0.6137835337650324, + "grad_norm": 3.5702669620513916, + "learning_rate": 3.4278687711795754e-06, + "loss": 0.9484, + "step": 7962 + }, + { + "epoch": 0.6138606228800494, + "grad_norm": 3.585568904876709, + "learning_rate": 3.4266836918170844e-06, + "loss": 0.8646, + "step": 7963 + }, + { + "epoch": 0.6139377119950663, + "grad_norm": 3.544301748275757, + "learning_rate": 3.425498710542714e-06, + "loss": 0.9274, + "step": 7964 + }, + { + "epoch": 0.6140148011100832, + "grad_norm": 3.528738498687744, + "learning_rate": 3.424313827430347e-06, + "loss": 0.8791, + "step": 7965 + }, + { + "epoch": 0.6140918902251002, + "grad_norm": 3.8694934844970703, + "learning_rate": 3.42312904255385e-06, + "loss": 0.9614, + "step": 7966 + }, + { + "epoch": 0.6141689793401172, + "grad_norm": 3.590520143508911, + "learning_rate": 3.4219443559870906e-06, + "loss": 0.9202, + "step": 7967 + }, + { + "epoch": 0.6142460684551342, + "grad_norm": 3.699113607406616, + "learning_rate": 3.4207597678039293e-06, + "loss": 0.9423, + "step": 7968 + }, + { + "epoch": 0.6143231575701511, + "grad_norm": 3.463752508163452, + "learning_rate": 3.4195752780782156e-06, + "loss": 0.9054, + "step": 7969 + }, + { + "epoch": 0.614400246685168, + "grad_norm": 3.7661490440368652, + "learning_rate": 3.4183908868837988e-06, + "loss": 0.8484, + "step": 7970 + }, + { + "epoch": 0.614477335800185, + "grad_norm": 3.7998509407043457, + "learning_rate": 3.4172065942945194e-06, + "loss": 0.9527, + "step": 7971 + }, + { + "epoch": 0.614554424915202, + "grad_norm": 3.6133172512054443, + "learning_rate": 3.4160224003842103e-06, + "loss": 0.9766, + "step": 7972 + }, + { + "epoch": 0.614631514030219, + "grad_norm": 3.8746337890625, + "learning_rate": 3.414838305226701e-06, + "loss": 0.9468, + "step": 7973 + }, + { + "epoch": 0.6147086031452359, + "grad_norm": 3.8318541049957275, + "learning_rate": 3.4136543088958147e-06, + "loss": 0.9815, + "step": 7974 + }, + { + "epoch": 0.6147856922602528, + "grad_norm": 3.35244083404541, + "learning_rate": 3.4124704114653653e-06, + "loss": 0.83, + "step": 7975 + }, + { + "epoch": 0.6148627813752698, + "grad_norm": 3.994019031524658, + "learning_rate": 3.4112866130091648e-06, + "loss": 0.915, + "step": 7976 + }, + { + "epoch": 0.6149398704902868, + "grad_norm": 3.6324615478515625, + "learning_rate": 3.4101029136010144e-06, + "loss": 0.8744, + "step": 7977 + }, + { + "epoch": 0.6150169596053038, + "grad_norm": 3.4820597171783447, + "learning_rate": 3.4089193133147136e-06, + "loss": 0.8885, + "step": 7978 + }, + { + "epoch": 0.6150940487203207, + "grad_norm": 3.644369602203369, + "learning_rate": 3.4077358122240532e-06, + "loss": 0.996, + "step": 7979 + }, + { + "epoch": 0.6151711378353376, + "grad_norm": 3.64266300201416, + "learning_rate": 3.4065524104028185e-06, + "loss": 0.9692, + "step": 7980 + }, + { + "epoch": 0.6152482269503546, + "grad_norm": 3.786377191543579, + "learning_rate": 3.405369107924788e-06, + "loss": 0.9424, + "step": 7981 + }, + { + "epoch": 0.6153253160653716, + "grad_norm": 3.296741008758545, + "learning_rate": 3.404185904863736e-06, + "loss": 0.8991, + "step": 7982 + }, + { + "epoch": 0.6154024051803886, + "grad_norm": 3.8608269691467285, + "learning_rate": 3.4030028012934264e-06, + "loss": 0.9511, + "step": 7983 + }, + { + "epoch": 0.6154794942954055, + "grad_norm": 3.641268730163574, + "learning_rate": 3.401819797287621e-06, + "loss": 0.9326, + "step": 7984 + }, + { + "epoch": 0.6155565834104224, + "grad_norm": 3.7189748287200928, + "learning_rate": 3.400636892920076e-06, + "loss": 0.901, + "step": 7985 + }, + { + "epoch": 0.6156336725254394, + "grad_norm": 3.6073148250579834, + "learning_rate": 3.3994540882645353e-06, + "loss": 0.9221, + "step": 7986 + }, + { + "epoch": 0.6157107616404563, + "grad_norm": 3.328263282775879, + "learning_rate": 3.3982713833947447e-06, + "loss": 0.8374, + "step": 7987 + }, + { + "epoch": 0.6157878507554734, + "grad_norm": 3.7613234519958496, + "learning_rate": 3.397088778384434e-06, + "loss": 0.8323, + "step": 7988 + }, + { + "epoch": 0.6158649398704903, + "grad_norm": 3.8894762992858887, + "learning_rate": 3.39590627330734e-06, + "loss": 0.9505, + "step": 7989 + }, + { + "epoch": 0.6159420289855072, + "grad_norm": 3.8909053802490234, + "learning_rate": 3.3947238682371803e-06, + "loss": 0.9096, + "step": 7990 + }, + { + "epoch": 0.6160191181005242, + "grad_norm": 3.90527081489563, + "learning_rate": 3.3935415632476736e-06, + "loss": 0.9503, + "step": 7991 + }, + { + "epoch": 0.6160962072155411, + "grad_norm": 3.739124298095703, + "learning_rate": 3.392359358412532e-06, + "loss": 0.9485, + "step": 7992 + }, + { + "epoch": 0.6161732963305582, + "grad_norm": 3.6810646057128906, + "learning_rate": 3.3911772538054565e-06, + "loss": 0.9624, + "step": 7993 + }, + { + "epoch": 0.6162503854455751, + "grad_norm": 3.561396598815918, + "learning_rate": 3.3899952495001486e-06, + "loss": 0.9049, + "step": 7994 + }, + { + "epoch": 0.616327474560592, + "grad_norm": 4.142645835876465, + "learning_rate": 3.3888133455702964e-06, + "loss": 0.9669, + "step": 7995 + }, + { + "epoch": 0.616404563675609, + "grad_norm": 3.6202030181884766, + "learning_rate": 3.387631542089589e-06, + "loss": 0.9981, + "step": 7996 + }, + { + "epoch": 0.616481652790626, + "grad_norm": 3.5489907264709473, + "learning_rate": 3.3864498391317047e-06, + "loss": 0.9013, + "step": 7997 + }, + { + "epoch": 0.616558741905643, + "grad_norm": 3.5619547367095947, + "learning_rate": 3.385268236770315e-06, + "loss": 0.8911, + "step": 7998 + }, + { + "epoch": 0.6166358310206599, + "grad_norm": 3.5192930698394775, + "learning_rate": 3.3840867350790872e-06, + "loss": 0.9838, + "step": 7999 + }, + { + "epoch": 0.6167129201356768, + "grad_norm": 3.7306177616119385, + "learning_rate": 3.3829053341316843e-06, + "loss": 0.9779, + "step": 8000 + }, + { + "epoch": 0.6167900092506938, + "grad_norm": 4.072969436645508, + "learning_rate": 3.3817240340017575e-06, + "loss": 0.9314, + "step": 8001 + }, + { + "epoch": 0.6168670983657107, + "grad_norm": 4.617973804473877, + "learning_rate": 3.3805428347629554e-06, + "loss": 0.9557, + "step": 8002 + }, + { + "epoch": 0.6169441874807278, + "grad_norm": 3.65148663520813, + "learning_rate": 3.379361736488922e-06, + "loss": 0.9209, + "step": 8003 + }, + { + "epoch": 0.6170212765957447, + "grad_norm": 3.826519727706909, + "learning_rate": 3.3781807392532893e-06, + "loss": 1.0016, + "step": 8004 + }, + { + "epoch": 0.6170983657107616, + "grad_norm": 3.791816473007202, + "learning_rate": 3.376999843129689e-06, + "loss": 0.8873, + "step": 8005 + }, + { + "epoch": 0.6171754548257786, + "grad_norm": 3.338088274002075, + "learning_rate": 3.375819048191742e-06, + "loss": 0.9135, + "step": 8006 + }, + { + "epoch": 0.6172525439407955, + "grad_norm": 3.48136043548584, + "learning_rate": 3.3746383545130646e-06, + "loss": 0.8923, + "step": 8007 + }, + { + "epoch": 0.6173296330558126, + "grad_norm": 3.3619744777679443, + "learning_rate": 3.37345776216727e-06, + "loss": 0.9164, + "step": 8008 + }, + { + "epoch": 0.6174067221708295, + "grad_norm": 3.594543218612671, + "learning_rate": 3.3722772712279583e-06, + "loss": 0.9498, + "step": 8009 + }, + { + "epoch": 0.6174838112858464, + "grad_norm": 3.30615496635437, + "learning_rate": 3.3710968817687284e-06, + "loss": 0.8533, + "step": 8010 + }, + { + "epoch": 0.6175609004008634, + "grad_norm": 3.645437479019165, + "learning_rate": 3.369916593863173e-06, + "loss": 0.9808, + "step": 8011 + }, + { + "epoch": 0.6176379895158803, + "grad_norm": 3.909817695617676, + "learning_rate": 3.3687364075848738e-06, + "loss": 1.0065, + "step": 8012 + }, + { + "epoch": 0.6177150786308974, + "grad_norm": 4.033278465270996, + "learning_rate": 3.3675563230074126e-06, + "loss": 0.9278, + "step": 8013 + }, + { + "epoch": 0.6177921677459143, + "grad_norm": 3.6392765045166016, + "learning_rate": 3.366376340204359e-06, + "loss": 0.8868, + "step": 8014 + }, + { + "epoch": 0.6178692568609312, + "grad_norm": 3.7995078563690186, + "learning_rate": 3.3651964592492805e-06, + "loss": 1.0029, + "step": 8015 + }, + { + "epoch": 0.6179463459759482, + "grad_norm": 4.048414707183838, + "learning_rate": 3.3640166802157357e-06, + "loss": 0.9189, + "step": 8016 + }, + { + "epoch": 0.6180234350909651, + "grad_norm": 3.517993927001953, + "learning_rate": 3.362837003177278e-06, + "loss": 0.9468, + "step": 8017 + }, + { + "epoch": 0.6181005242059822, + "grad_norm": 3.915440559387207, + "learning_rate": 3.3616574282074533e-06, + "loss": 0.9799, + "step": 8018 + }, + { + "epoch": 0.6181776133209991, + "grad_norm": 3.5532066822052, + "learning_rate": 3.360477955379804e-06, + "loss": 0.8911, + "step": 8019 + }, + { + "epoch": 0.618254702436016, + "grad_norm": 3.4006614685058594, + "learning_rate": 3.3592985847678617e-06, + "loss": 0.9145, + "step": 8020 + }, + { + "epoch": 0.618331791551033, + "grad_norm": 3.6829326152801514, + "learning_rate": 3.3581193164451555e-06, + "loss": 1.0141, + "step": 8021 + }, + { + "epoch": 0.6184088806660499, + "grad_norm": 3.5860280990600586, + "learning_rate": 3.3569401504852073e-06, + "loss": 0.8548, + "step": 8022 + }, + { + "epoch": 0.618485969781067, + "grad_norm": 3.718271255493164, + "learning_rate": 3.35576108696153e-06, + "loss": 0.9893, + "step": 8023 + }, + { + "epoch": 0.6185630588960839, + "grad_norm": 3.679704189300537, + "learning_rate": 3.354582125947634e-06, + "loss": 0.9285, + "step": 8024 + }, + { + "epoch": 0.6186401480111008, + "grad_norm": 3.5176236629486084, + "learning_rate": 3.3534032675170205e-06, + "loss": 0.9224, + "step": 8025 + }, + { + "epoch": 0.6187172371261178, + "grad_norm": 3.7090749740600586, + "learning_rate": 3.3522245117431845e-06, + "loss": 0.8834, + "step": 8026 + }, + { + "epoch": 0.6187943262411347, + "grad_norm": 3.7500243186950684, + "learning_rate": 3.3510458586996173e-06, + "loss": 1.0048, + "step": 8027 + }, + { + "epoch": 0.6188714153561518, + "grad_norm": 3.5407137870788574, + "learning_rate": 3.3498673084598e-06, + "loss": 1.0076, + "step": 8028 + }, + { + "epoch": 0.6189485044711687, + "grad_norm": 3.900679588317871, + "learning_rate": 3.348688861097209e-06, + "loss": 0.9704, + "step": 8029 + }, + { + "epoch": 0.6190255935861856, + "grad_norm": 3.705470323562622, + "learning_rate": 3.347510516685317e-06, + "loss": 0.9472, + "step": 8030 + }, + { + "epoch": 0.6191026827012026, + "grad_norm": 3.6836581230163574, + "learning_rate": 3.3463322752975846e-06, + "loss": 0.8369, + "step": 8031 + }, + { + "epoch": 0.6191797718162195, + "grad_norm": 3.5624961853027344, + "learning_rate": 3.34515413700747e-06, + "loss": 0.9787, + "step": 8032 + }, + { + "epoch": 0.6192568609312366, + "grad_norm": 3.8100168704986572, + "learning_rate": 3.3439761018884233e-06, + "loss": 1.0565, + "step": 8033 + }, + { + "epoch": 0.6193339500462535, + "grad_norm": 3.6078031063079834, + "learning_rate": 3.342798170013892e-06, + "loss": 0.8859, + "step": 8034 + }, + { + "epoch": 0.6194110391612704, + "grad_norm": 3.7805142402648926, + "learning_rate": 3.3416203414573113e-06, + "loss": 0.9629, + "step": 8035 + }, + { + "epoch": 0.6194881282762874, + "grad_norm": 3.473383903503418, + "learning_rate": 3.3404426162921123e-06, + "loss": 0.8921, + "step": 8036 + }, + { + "epoch": 0.6195652173913043, + "grad_norm": 3.560927629470825, + "learning_rate": 3.3392649945917235e-06, + "loss": 0.8671, + "step": 8037 + }, + { + "epoch": 0.6196423065063213, + "grad_norm": 3.80720591545105, + "learning_rate": 3.338087476429559e-06, + "loss": 0.9247, + "step": 8038 + }, + { + "epoch": 0.6197193956213383, + "grad_norm": 3.5851659774780273, + "learning_rate": 3.3369100618790336e-06, + "loss": 0.9808, + "step": 8039 + }, + { + "epoch": 0.6197964847363552, + "grad_norm": 3.476499319076538, + "learning_rate": 3.335732751013553e-06, + "loss": 0.8289, + "step": 8040 + }, + { + "epoch": 0.6198735738513722, + "grad_norm": 3.651381731033325, + "learning_rate": 3.334555543906516e-06, + "loss": 0.8949, + "step": 8041 + }, + { + "epoch": 0.6199506629663891, + "grad_norm": 3.6143252849578857, + "learning_rate": 3.333378440631315e-06, + "loss": 0.8938, + "step": 8042 + }, + { + "epoch": 0.6200277520814061, + "grad_norm": 3.81123685836792, + "learning_rate": 3.3322014412613364e-06, + "loss": 0.8147, + "step": 8043 + }, + { + "epoch": 0.6201048411964231, + "grad_norm": 3.5519094467163086, + "learning_rate": 3.3310245458699595e-06, + "loss": 0.8491, + "step": 8044 + }, + { + "epoch": 0.62018193031144, + "grad_norm": 3.531566619873047, + "learning_rate": 3.3298477545305595e-06, + "loss": 0.8893, + "step": 8045 + }, + { + "epoch": 0.620259019426457, + "grad_norm": 3.8642385005950928, + "learning_rate": 3.3286710673165012e-06, + "loss": 0.9373, + "step": 8046 + }, + { + "epoch": 0.6203361085414739, + "grad_norm": 4.416684627532959, + "learning_rate": 3.3274944843011463e-06, + "loss": 1.0649, + "step": 8047 + }, + { + "epoch": 0.620413197656491, + "grad_norm": 3.7867462635040283, + "learning_rate": 3.3263180055578493e-06, + "loss": 0.9491, + "step": 8048 + }, + { + "epoch": 0.6204902867715079, + "grad_norm": 3.4176316261291504, + "learning_rate": 3.325141631159954e-06, + "loss": 0.9123, + "step": 8049 + }, + { + "epoch": 0.6205673758865248, + "grad_norm": 3.9313580989837646, + "learning_rate": 3.3239653611808054e-06, + "loss": 0.938, + "step": 8050 + }, + { + "epoch": 0.6206444650015418, + "grad_norm": 3.718086004257202, + "learning_rate": 3.3227891956937366e-06, + "loss": 0.9519, + "step": 8051 + }, + { + "epoch": 0.6207215541165587, + "grad_norm": 3.9227771759033203, + "learning_rate": 3.321613134772073e-06, + "loss": 1.0329, + "step": 8052 + }, + { + "epoch": 0.6207986432315757, + "grad_norm": 3.459362506866455, + "learning_rate": 3.32043717848914e-06, + "loss": 0.92, + "step": 8053 + }, + { + "epoch": 0.6208757323465927, + "grad_norm": 3.933849811553955, + "learning_rate": 3.3192613269182483e-06, + "loss": 0.9814, + "step": 8054 + }, + { + "epoch": 0.6209528214616096, + "grad_norm": 4.124648094177246, + "learning_rate": 3.3180855801327083e-06, + "loss": 0.979, + "step": 8055 + }, + { + "epoch": 0.6210299105766266, + "grad_norm": 3.7071919441223145, + "learning_rate": 3.3169099382058233e-06, + "loss": 0.9179, + "step": 8056 + }, + { + "epoch": 0.6211069996916435, + "grad_norm": 3.726698398590088, + "learning_rate": 3.315734401210885e-06, + "loss": 0.9425, + "step": 8057 + }, + { + "epoch": 0.6211840888066605, + "grad_norm": 3.652276039123535, + "learning_rate": 3.3145589692211837e-06, + "loss": 0.9226, + "step": 8058 + }, + { + "epoch": 0.6212611779216775, + "grad_norm": 3.5816662311553955, + "learning_rate": 3.313383642310003e-06, + "loss": 0.7326, + "step": 8059 + }, + { + "epoch": 0.6213382670366944, + "grad_norm": 3.948348045349121, + "learning_rate": 3.3122084205506153e-06, + "loss": 0.9797, + "step": 8060 + }, + { + "epoch": 0.6214153561517114, + "grad_norm": 3.8742499351501465, + "learning_rate": 3.3110333040162934e-06, + "loss": 1.0117, + "step": 8061 + }, + { + "epoch": 0.6214924452667283, + "grad_norm": 3.3892946243286133, + "learning_rate": 3.309858292780296e-06, + "loss": 0.8751, + "step": 8062 + }, + { + "epoch": 0.6215695343817453, + "grad_norm": 3.410844087600708, + "learning_rate": 3.308683386915881e-06, + "loss": 0.8663, + "step": 8063 + }, + { + "epoch": 0.6216466234967623, + "grad_norm": 3.326235055923462, + "learning_rate": 3.3075085864962975e-06, + "loss": 0.8682, + "step": 8064 + }, + { + "epoch": 0.6217237126117792, + "grad_norm": 4.0448079109191895, + "learning_rate": 3.306333891594787e-06, + "loss": 0.8579, + "step": 8065 + }, + { + "epoch": 0.6218008017267962, + "grad_norm": 3.6627490520477295, + "learning_rate": 3.3051593022845867e-06, + "loss": 0.9246, + "step": 8066 + }, + { + "epoch": 0.6218778908418131, + "grad_norm": 3.7059576511383057, + "learning_rate": 3.3039848186389278e-06, + "loss": 0.7995, + "step": 8067 + }, + { + "epoch": 0.6219549799568301, + "grad_norm": 3.518906831741333, + "learning_rate": 3.3028104407310302e-06, + "loss": 0.9744, + "step": 8068 + }, + { + "epoch": 0.622032069071847, + "grad_norm": 3.5456724166870117, + "learning_rate": 3.3016361686341115e-06, + "loss": 0.9983, + "step": 8069 + }, + { + "epoch": 0.622109158186864, + "grad_norm": 4.1023101806640625, + "learning_rate": 3.3004620024213824e-06, + "loss": 0.9888, + "step": 8070 + }, + { + "epoch": 0.622186247301881, + "grad_norm": 3.6297264099121094, + "learning_rate": 3.2992879421660447e-06, + "loss": 0.9784, + "step": 8071 + }, + { + "epoch": 0.6222633364168979, + "grad_norm": 3.9697625637054443, + "learning_rate": 3.2981139879412965e-06, + "loss": 1.038, + "step": 8072 + }, + { + "epoch": 0.6223404255319149, + "grad_norm": 3.5402143001556396, + "learning_rate": 3.296940139820326e-06, + "loss": 0.8862, + "step": 8073 + }, + { + "epoch": 0.6224175146469318, + "grad_norm": 3.9144885540008545, + "learning_rate": 3.2957663978763172e-06, + "loss": 0.9882, + "step": 8074 + }, + { + "epoch": 0.6224946037619488, + "grad_norm": 3.5432751178741455, + "learning_rate": 3.294592762182448e-06, + "loss": 0.9141, + "step": 8075 + }, + { + "epoch": 0.6225716928769658, + "grad_norm": 3.5189805030822754, + "learning_rate": 3.2934192328118866e-06, + "loss": 0.7741, + "step": 8076 + }, + { + "epoch": 0.6226487819919827, + "grad_norm": 3.9740869998931885, + "learning_rate": 3.292245809837796e-06, + "loss": 0.9351, + "step": 8077 + }, + { + "epoch": 0.6227258711069997, + "grad_norm": 3.5222132205963135, + "learning_rate": 3.2910724933333365e-06, + "loss": 0.9749, + "step": 8078 + }, + { + "epoch": 0.6228029602220166, + "grad_norm": 4.026155948638916, + "learning_rate": 3.289899283371657e-06, + "loss": 0.9626, + "step": 8079 + }, + { + "epoch": 0.6228800493370336, + "grad_norm": 3.8821022510528564, + "learning_rate": 3.2887261800258985e-06, + "loss": 1.0371, + "step": 8080 + }, + { + "epoch": 0.6229571384520506, + "grad_norm": 3.676262855529785, + "learning_rate": 3.287553183369201e-06, + "loss": 1.0328, + "step": 8081 + }, + { + "epoch": 0.6230342275670675, + "grad_norm": 3.552816152572632, + "learning_rate": 3.2863802934746938e-06, + "loss": 0.8604, + "step": 8082 + }, + { + "epoch": 0.6231113166820845, + "grad_norm": 4.076422214508057, + "learning_rate": 3.285207510415499e-06, + "loss": 0.8237, + "step": 8083 + }, + { + "epoch": 0.6231884057971014, + "grad_norm": 3.790148973464966, + "learning_rate": 3.2840348342647357e-06, + "loss": 0.8949, + "step": 8084 + }, + { + "epoch": 0.6232654949121184, + "grad_norm": 3.885585308074951, + "learning_rate": 3.2828622650955133e-06, + "loss": 0.9832, + "step": 8085 + }, + { + "epoch": 0.6233425840271354, + "grad_norm": 4.014956474304199, + "learning_rate": 3.281689802980934e-06, + "loss": 0.9888, + "step": 8086 + }, + { + "epoch": 0.6234196731421523, + "grad_norm": 3.65321946144104, + "learning_rate": 3.280517447994097e-06, + "loss": 0.9624, + "step": 8087 + }, + { + "epoch": 0.6234967622571693, + "grad_norm": 3.9095733165740967, + "learning_rate": 3.279345200208093e-06, + "loss": 0.8794, + "step": 8088 + }, + { + "epoch": 0.6235738513721862, + "grad_norm": 3.463289737701416, + "learning_rate": 3.2781730596960027e-06, + "loss": 0.9902, + "step": 8089 + }, + { + "epoch": 0.6236509404872032, + "grad_norm": 4.12209939956665, + "learning_rate": 3.277001026530906e-06, + "loss": 0.8898, + "step": 8090 + }, + { + "epoch": 0.6237280296022202, + "grad_norm": 3.618441104888916, + "learning_rate": 3.27582910078587e-06, + "loss": 0.9217, + "step": 8091 + }, + { + "epoch": 0.6238051187172371, + "grad_norm": 3.7258079051971436, + "learning_rate": 3.2746572825339607e-06, + "loss": 0.9826, + "step": 8092 + }, + { + "epoch": 0.6238822078322541, + "grad_norm": 3.7279441356658936, + "learning_rate": 3.273485571848234e-06, + "loss": 0.9179, + "step": 8093 + }, + { + "epoch": 0.623959296947271, + "grad_norm": 3.8348352909088135, + "learning_rate": 3.27231396880174e-06, + "loss": 1.006, + "step": 8094 + }, + { + "epoch": 0.624036386062288, + "grad_norm": 3.53607177734375, + "learning_rate": 3.2711424734675212e-06, + "loss": 0.8691, + "step": 8095 + }, + { + "epoch": 0.624113475177305, + "grad_norm": 3.5375266075134277, + "learning_rate": 3.2699710859186175e-06, + "loss": 0.9688, + "step": 8096 + }, + { + "epoch": 0.6241905642923219, + "grad_norm": 4.027289390563965, + "learning_rate": 3.2687998062280546e-06, + "loss": 1.0257, + "step": 8097 + }, + { + "epoch": 0.6242676534073389, + "grad_norm": 3.652118682861328, + "learning_rate": 3.2676286344688575e-06, + "loss": 0.923, + "step": 8098 + }, + { + "epoch": 0.6243447425223558, + "grad_norm": 4.014166355133057, + "learning_rate": 3.2664575707140446e-06, + "loss": 0.951, + "step": 8099 + }, + { + "epoch": 0.6244218316373727, + "grad_norm": 3.562201738357544, + "learning_rate": 3.265286615036622e-06, + "loss": 0.9807, + "step": 8100 + }, + { + "epoch": 0.6244989207523898, + "grad_norm": 3.267332077026367, + "learning_rate": 3.2641157675095965e-06, + "loss": 0.9209, + "step": 8101 + }, + { + "epoch": 0.6245760098674067, + "grad_norm": 3.832848310470581, + "learning_rate": 3.2629450282059603e-06, + "loss": 0.8897, + "step": 8102 + }, + { + "epoch": 0.6246530989824237, + "grad_norm": 3.6828267574310303, + "learning_rate": 3.2617743971987063e-06, + "loss": 0.8706, + "step": 8103 + }, + { + "epoch": 0.6247301880974406, + "grad_norm": 3.771867275238037, + "learning_rate": 3.260603874560817e-06, + "loss": 0.9786, + "step": 8104 + }, + { + "epoch": 0.6248072772124575, + "grad_norm": 3.621521234512329, + "learning_rate": 3.2594334603652668e-06, + "loss": 0.9209, + "step": 8105 + }, + { + "epoch": 0.6248843663274746, + "grad_norm": 3.851837635040283, + "learning_rate": 3.2582631546850254e-06, + "loss": 0.893, + "step": 8106 + }, + { + "epoch": 0.6249614554424915, + "grad_norm": 4.190458297729492, + "learning_rate": 3.2570929575930563e-06, + "loss": 0.9945, + "step": 8107 + }, + { + "epoch": 0.6250385445575085, + "grad_norm": 3.653280019760132, + "learning_rate": 3.2559228691623134e-06, + "loss": 0.9732, + "step": 8108 + }, + { + "epoch": 0.6251156336725254, + "grad_norm": 3.965890645980835, + "learning_rate": 3.254752889465749e-06, + "loss": 0.9156, + "step": 8109 + }, + { + "epoch": 0.6251927227875425, + "grad_norm": 3.464805841445923, + "learning_rate": 3.2535830185763006e-06, + "loss": 0.7856, + "step": 8110 + }, + { + "epoch": 0.6252698119025594, + "grad_norm": 3.8827240467071533, + "learning_rate": 3.252413256566907e-06, + "loss": 0.9146, + "step": 8111 + }, + { + "epoch": 0.6253469010175763, + "grad_norm": 3.4386796951293945, + "learning_rate": 3.2512436035104968e-06, + "loss": 0.9379, + "step": 8112 + }, + { + "epoch": 0.6254239901325933, + "grad_norm": 3.7330002784729004, + "learning_rate": 3.2500740594799895e-06, + "loss": 0.942, + "step": 8113 + }, + { + "epoch": 0.6255010792476102, + "grad_norm": 3.6872072219848633, + "learning_rate": 3.2489046245483016e-06, + "loss": 0.8975, + "step": 8114 + }, + { + "epoch": 0.6255781683626273, + "grad_norm": 3.910029888153076, + "learning_rate": 3.2477352987883426e-06, + "loss": 1.0826, + "step": 8115 + }, + { + "epoch": 0.6256552574776442, + "grad_norm": 4.114757537841797, + "learning_rate": 3.246566082273011e-06, + "loss": 0.9402, + "step": 8116 + }, + { + "epoch": 0.6257323465926611, + "grad_norm": 3.61517071723938, + "learning_rate": 3.245396975075203e-06, + "loss": 0.9002, + "step": 8117 + }, + { + "epoch": 0.6258094357076781, + "grad_norm": 3.6216084957122803, + "learning_rate": 3.2442279772678076e-06, + "loss": 0.9916, + "step": 8118 + }, + { + "epoch": 0.625886524822695, + "grad_norm": 3.727811336517334, + "learning_rate": 3.2430590889237027e-06, + "loss": 1.0077, + "step": 8119 + }, + { + "epoch": 0.625963613937712, + "grad_norm": 3.697965145111084, + "learning_rate": 3.241890310115766e-06, + "loss": 1.0235, + "step": 8120 + }, + { + "epoch": 0.626040703052729, + "grad_norm": 3.661677122116089, + "learning_rate": 3.2407216409168597e-06, + "loss": 0.8098, + "step": 8121 + }, + { + "epoch": 0.6261177921677459, + "grad_norm": 3.530116558074951, + "learning_rate": 3.2395530813998506e-06, + "loss": 0.904, + "step": 8122 + }, + { + "epoch": 0.6261948812827629, + "grad_norm": 3.739652633666992, + "learning_rate": 3.2383846316375876e-06, + "loss": 0.9422, + "step": 8123 + }, + { + "epoch": 0.6262719703977798, + "grad_norm": 4.194034576416016, + "learning_rate": 3.2372162917029205e-06, + "loss": 1.0163, + "step": 8124 + }, + { + "epoch": 0.6263490595127968, + "grad_norm": 3.3283534049987793, + "learning_rate": 3.236048061668688e-06, + "loss": 0.7624, + "step": 8125 + }, + { + "epoch": 0.6264261486278138, + "grad_norm": 3.7306947708129883, + "learning_rate": 3.2348799416077225e-06, + "loss": 0.9407, + "step": 8126 + }, + { + "epoch": 0.6265032377428307, + "grad_norm": 3.8820812702178955, + "learning_rate": 3.2337119315928513e-06, + "loss": 0.9529, + "step": 8127 + }, + { + "epoch": 0.6265803268578477, + "grad_norm": 3.882148027420044, + "learning_rate": 3.2325440316968927e-06, + "loss": 0.93, + "step": 8128 + }, + { + "epoch": 0.6266574159728646, + "grad_norm": 3.439239025115967, + "learning_rate": 3.2313762419926597e-06, + "loss": 0.9563, + "step": 8129 + }, + { + "epoch": 0.6267345050878816, + "grad_norm": 3.5904295444488525, + "learning_rate": 3.2302085625529596e-06, + "loss": 1.0631, + "step": 8130 + }, + { + "epoch": 0.6268115942028986, + "grad_norm": 3.6750895977020264, + "learning_rate": 3.2290409934505884e-06, + "loss": 0.9154, + "step": 8131 + }, + { + "epoch": 0.6268886833179155, + "grad_norm": 3.750882625579834, + "learning_rate": 3.227873534758339e-06, + "loss": 0.8514, + "step": 8132 + }, + { + "epoch": 0.6269657724329325, + "grad_norm": 3.87109637260437, + "learning_rate": 3.226706186548998e-06, + "loss": 0.9765, + "step": 8133 + }, + { + "epoch": 0.6270428615479494, + "grad_norm": 4.06139612197876, + "learning_rate": 3.22553894889534e-06, + "loss": 0.9913, + "step": 8134 + }, + { + "epoch": 0.6271199506629664, + "grad_norm": 3.7100861072540283, + "learning_rate": 3.2243718218701393e-06, + "loss": 1.0381, + "step": 8135 + }, + { + "epoch": 0.6271970397779834, + "grad_norm": 3.1716091632843018, + "learning_rate": 3.2232048055461608e-06, + "loss": 0.8383, + "step": 8136 + }, + { + "epoch": 0.6272741288930003, + "grad_norm": 3.715208053588867, + "learning_rate": 3.2220378999961588e-06, + "loss": 0.943, + "step": 8137 + }, + { + "epoch": 0.6273512180080173, + "grad_norm": 3.427281618118286, + "learning_rate": 3.2208711052928867e-06, + "loss": 0.9138, + "step": 8138 + }, + { + "epoch": 0.6274283071230342, + "grad_norm": 3.5145509243011475, + "learning_rate": 3.219704421509085e-06, + "loss": 0.8756, + "step": 8139 + }, + { + "epoch": 0.6275053962380512, + "grad_norm": 4.032465934753418, + "learning_rate": 3.218537848717493e-06, + "loss": 0.9441, + "step": 8140 + }, + { + "epoch": 0.6275824853530682, + "grad_norm": 3.699963331222534, + "learning_rate": 3.2173713869908406e-06, + "loss": 0.9135, + "step": 8141 + }, + { + "epoch": 0.6276595744680851, + "grad_norm": 3.3463034629821777, + "learning_rate": 3.2162050364018484e-06, + "loss": 0.8761, + "step": 8142 + }, + { + "epoch": 0.6277366635831021, + "grad_norm": 3.821621894836426, + "learning_rate": 3.215038797023234e-06, + "loss": 0.9123, + "step": 8143 + }, + { + "epoch": 0.627813752698119, + "grad_norm": 3.723275899887085, + "learning_rate": 3.2138726689277067e-06, + "loss": 0.9333, + "step": 8144 + }, + { + "epoch": 0.627890841813136, + "grad_norm": 3.7927725315093994, + "learning_rate": 3.2127066521879674e-06, + "loss": 0.9102, + "step": 8145 + }, + { + "epoch": 0.627967930928153, + "grad_norm": 3.668217182159424, + "learning_rate": 3.211540746876711e-06, + "loss": 0.9221, + "step": 8146 + }, + { + "epoch": 0.6280450200431699, + "grad_norm": 3.8914999961853027, + "learning_rate": 3.2103749530666283e-06, + "loss": 0.9604, + "step": 8147 + }, + { + "epoch": 0.6281221091581869, + "grad_norm": 3.947474718093872, + "learning_rate": 3.2092092708303973e-06, + "loss": 0.9765, + "step": 8148 + }, + { + "epoch": 0.6281991982732038, + "grad_norm": 3.575840711593628, + "learning_rate": 3.2080437002406943e-06, + "loss": 0.9536, + "step": 8149 + }, + { + "epoch": 0.6282762873882208, + "grad_norm": 3.5878658294677734, + "learning_rate": 3.206878241370185e-06, + "loss": 0.9539, + "step": 8150 + }, + { + "epoch": 0.6283533765032377, + "grad_norm": 4.496673107147217, + "learning_rate": 3.2057128942915306e-06, + "loss": 0.9298, + "step": 8151 + }, + { + "epoch": 0.6284304656182547, + "grad_norm": 3.7084543704986572, + "learning_rate": 3.204547659077385e-06, + "loss": 0.9288, + "step": 8152 + }, + { + "epoch": 0.6285075547332717, + "grad_norm": 3.8590118885040283, + "learning_rate": 3.2033825358003936e-06, + "loss": 0.9057, + "step": 8153 + }, + { + "epoch": 0.6285846438482886, + "grad_norm": 3.7039713859558105, + "learning_rate": 3.2022175245331954e-06, + "loss": 0.9582, + "step": 8154 + }, + { + "epoch": 0.6286617329633056, + "grad_norm": 3.4202165603637695, + "learning_rate": 3.2010526253484246e-06, + "loss": 0.8993, + "step": 8155 + }, + { + "epoch": 0.6287388220783225, + "grad_norm": 3.575178861618042, + "learning_rate": 3.199887838318705e-06, + "loss": 0.8471, + "step": 8156 + }, + { + "epoch": 0.6288159111933395, + "grad_norm": 3.737819194793701, + "learning_rate": 3.1987231635166565e-06, + "loss": 0.8481, + "step": 8157 + }, + { + "epoch": 0.6288930003083565, + "grad_norm": 3.638150930404663, + "learning_rate": 3.197558601014889e-06, + "loss": 0.9176, + "step": 8158 + }, + { + "epoch": 0.6289700894233734, + "grad_norm": 3.68680477142334, + "learning_rate": 3.1963941508860076e-06, + "loss": 0.9781, + "step": 8159 + }, + { + "epoch": 0.6290471785383904, + "grad_norm": 3.491912841796875, + "learning_rate": 3.1952298132026107e-06, + "loss": 0.9208, + "step": 8160 + }, + { + "epoch": 0.6291242676534073, + "grad_norm": 3.932940721511841, + "learning_rate": 3.194065588037286e-06, + "loss": 0.9005, + "step": 8161 + }, + { + "epoch": 0.6292013567684243, + "grad_norm": 3.7449350357055664, + "learning_rate": 3.1929014754626197e-06, + "loss": 1.0379, + "step": 8162 + }, + { + "epoch": 0.6292784458834413, + "grad_norm": 3.840050220489502, + "learning_rate": 3.191737475551188e-06, + "loss": 0.9366, + "step": 8163 + }, + { + "epoch": 0.6293555349984582, + "grad_norm": 3.678901195526123, + "learning_rate": 3.1905735883755582e-06, + "loss": 0.8937, + "step": 8164 + }, + { + "epoch": 0.6294326241134752, + "grad_norm": 4.070024013519287, + "learning_rate": 3.1894098140082943e-06, + "loss": 1.0219, + "step": 8165 + }, + { + "epoch": 0.6295097132284921, + "grad_norm": 3.705054759979248, + "learning_rate": 3.18824615252195e-06, + "loss": 1.0407, + "step": 8166 + }, + { + "epoch": 0.629586802343509, + "grad_norm": 3.3765339851379395, + "learning_rate": 3.1870826039890766e-06, + "loss": 0.7974, + "step": 8167 + }, + { + "epoch": 0.6296638914585261, + "grad_norm": 3.9743378162384033, + "learning_rate": 3.1859191684822132e-06, + "loss": 0.9537, + "step": 8168 + }, + { + "epoch": 0.629740980573543, + "grad_norm": 3.8855860233306885, + "learning_rate": 3.1847558460738936e-06, + "loss": 0.9006, + "step": 8169 + }, + { + "epoch": 0.62981806968856, + "grad_norm": 3.418191909790039, + "learning_rate": 3.1835926368366465e-06, + "loss": 0.8984, + "step": 8170 + }, + { + "epoch": 0.6298951588035769, + "grad_norm": 3.7673134803771973, + "learning_rate": 3.1824295408429907e-06, + "loss": 0.9385, + "step": 8171 + }, + { + "epoch": 0.6299722479185939, + "grad_norm": 3.6283762454986572, + "learning_rate": 3.181266558165439e-06, + "loss": 0.94, + "step": 8172 + }, + { + "epoch": 0.6300493370336109, + "grad_norm": 3.8887081146240234, + "learning_rate": 3.180103688876499e-06, + "loss": 1.0551, + "step": 8173 + }, + { + "epoch": 0.6301264261486278, + "grad_norm": 4.249144554138184, + "learning_rate": 3.1789409330486683e-06, + "loss": 1.1189, + "step": 8174 + }, + { + "epoch": 0.6302035152636448, + "grad_norm": 3.9418768882751465, + "learning_rate": 3.1777782907544392e-06, + "loss": 0.9109, + "step": 8175 + }, + { + "epoch": 0.6302806043786617, + "grad_norm": 3.891338348388672, + "learning_rate": 3.176615762066295e-06, + "loss": 0.9539, + "step": 8176 + }, + { + "epoch": 0.6303576934936787, + "grad_norm": 3.6028475761413574, + "learning_rate": 3.175453347056715e-06, + "loss": 0.8756, + "step": 8177 + }, + { + "epoch": 0.6304347826086957, + "grad_norm": 3.6023495197296143, + "learning_rate": 3.17429104579817e-06, + "loss": 0.9605, + "step": 8178 + }, + { + "epoch": 0.6305118717237126, + "grad_norm": 4.147401332855225, + "learning_rate": 3.1731288583631214e-06, + "loss": 0.9424, + "step": 8179 + }, + { + "epoch": 0.6305889608387296, + "grad_norm": 3.7113969326019287, + "learning_rate": 3.1719667848240276e-06, + "loss": 0.9408, + "step": 8180 + }, + { + "epoch": 0.6306660499537465, + "grad_norm": 3.5659947395324707, + "learning_rate": 3.1708048252533376e-06, + "loss": 0.9135, + "step": 8181 + }, + { + "epoch": 0.6307431390687634, + "grad_norm": 3.6994526386260986, + "learning_rate": 3.1696429797234918e-06, + "loss": 0.8658, + "step": 8182 + }, + { + "epoch": 0.6308202281837805, + "grad_norm": 3.7129364013671875, + "learning_rate": 3.168481248306927e-06, + "loss": 0.9827, + "step": 8183 + }, + { + "epoch": 0.6308973172987974, + "grad_norm": 3.838097095489502, + "learning_rate": 3.1673196310760723e-06, + "loss": 0.9963, + "step": 8184 + }, + { + "epoch": 0.6309744064138144, + "grad_norm": 4.0744242668151855, + "learning_rate": 3.166158128103345e-06, + "loss": 0.9455, + "step": 8185 + }, + { + "epoch": 0.6310514955288313, + "grad_norm": 3.745927572250366, + "learning_rate": 3.164996739461162e-06, + "loss": 0.8731, + "step": 8186 + }, + { + "epoch": 0.6311285846438482, + "grad_norm": 3.8844759464263916, + "learning_rate": 3.163835465221927e-06, + "loss": 0.8394, + "step": 8187 + }, + { + "epoch": 0.6312056737588653, + "grad_norm": 3.549527406692505, + "learning_rate": 3.162674305458042e-06, + "loss": 0.9107, + "step": 8188 + }, + { + "epoch": 0.6312827628738822, + "grad_norm": 3.6786959171295166, + "learning_rate": 3.1615132602418986e-06, + "loss": 0.928, + "step": 8189 + }, + { + "epoch": 0.6313598519888992, + "grad_norm": 3.693917751312256, + "learning_rate": 3.160352329645881e-06, + "loss": 1.0224, + "step": 8190 + }, + { + "epoch": 0.6314369411039161, + "grad_norm": 3.7990047931671143, + "learning_rate": 3.159191513742368e-06, + "loss": 0.8727, + "step": 8191 + }, + { + "epoch": 0.631514030218933, + "grad_norm": 3.8932435512542725, + "learning_rate": 3.158030812603731e-06, + "loss": 0.9425, + "step": 8192 + }, + { + "epoch": 0.6315911193339501, + "grad_norm": 3.7320001125335693, + "learning_rate": 3.156870226302332e-06, + "loss": 0.804, + "step": 8193 + }, + { + "epoch": 0.631668208448967, + "grad_norm": 3.685913562774658, + "learning_rate": 3.155709754910529e-06, + "loss": 0.9927, + "step": 8194 + }, + { + "epoch": 0.631745297563984, + "grad_norm": 4.240044593811035, + "learning_rate": 3.1545493985006713e-06, + "loss": 1.0915, + "step": 8195 + }, + { + "epoch": 0.6318223866790009, + "grad_norm": 4.133537292480469, + "learning_rate": 3.1533891571451002e-06, + "loss": 0.9907, + "step": 8196 + }, + { + "epoch": 0.6318994757940178, + "grad_norm": 3.4915730953216553, + "learning_rate": 3.152229030916152e-06, + "loss": 0.8439, + "step": 8197 + }, + { + "epoch": 0.6319765649090349, + "grad_norm": 3.9785425662994385, + "learning_rate": 3.1510690198861533e-06, + "loss": 0.9674, + "step": 8198 + }, + { + "epoch": 0.6320536540240518, + "grad_norm": 3.388223886489868, + "learning_rate": 3.149909124127425e-06, + "loss": 0.8956, + "step": 8199 + }, + { + "epoch": 0.6321307431390688, + "grad_norm": 3.3853883743286133, + "learning_rate": 3.148749343712282e-06, + "loss": 1.0348, + "step": 8200 + }, + { + "epoch": 0.6322078322540857, + "grad_norm": 3.7054595947265625, + "learning_rate": 3.1475896787130287e-06, + "loss": 0.9161, + "step": 8201 + }, + { + "epoch": 0.6322849213691026, + "grad_norm": 4.188024997711182, + "learning_rate": 3.146430129201965e-06, + "loss": 1.0198, + "step": 8202 + }, + { + "epoch": 0.6323620104841197, + "grad_norm": 3.5970795154571533, + "learning_rate": 3.1452706952513836e-06, + "loss": 1.0027, + "step": 8203 + }, + { + "epoch": 0.6324390995991366, + "grad_norm": 4.296309947967529, + "learning_rate": 3.144111376933568e-06, + "loss": 0.9452, + "step": 8204 + }, + { + "epoch": 0.6325161887141536, + "grad_norm": 3.8872005939483643, + "learning_rate": 3.142952174320797e-06, + "loss": 0.985, + "step": 8205 + }, + { + "epoch": 0.6325932778291705, + "grad_norm": 3.649055004119873, + "learning_rate": 3.1417930874853386e-06, + "loss": 0.8932, + "step": 8206 + }, + { + "epoch": 0.6326703669441874, + "grad_norm": 3.9435665607452393, + "learning_rate": 3.1406341164994574e-06, + "loss": 0.9035, + "step": 8207 + }, + { + "epoch": 0.6327474560592045, + "grad_norm": 3.8601176738739014, + "learning_rate": 3.1394752614354106e-06, + "loss": 0.8653, + "step": 8208 + }, + { + "epoch": 0.6328245451742214, + "grad_norm": 3.7992372512817383, + "learning_rate": 3.1383165223654444e-06, + "loss": 0.9261, + "step": 8209 + }, + { + "epoch": 0.6329016342892384, + "grad_norm": 3.477597236633301, + "learning_rate": 3.137157899361799e-06, + "loss": 0.9783, + "step": 8210 + }, + { + "epoch": 0.6329787234042553, + "grad_norm": 3.3875021934509277, + "learning_rate": 3.1359993924967124e-06, + "loss": 0.8615, + "step": 8211 + }, + { + "epoch": 0.6330558125192722, + "grad_norm": 3.7047359943389893, + "learning_rate": 3.1348410018424115e-06, + "loss": 1.0487, + "step": 8212 + }, + { + "epoch": 0.6331329016342893, + "grad_norm": 3.5324010848999023, + "learning_rate": 3.1336827274711124e-06, + "loss": 0.8742, + "step": 8213 + }, + { + "epoch": 0.6332099907493062, + "grad_norm": 3.7448694705963135, + "learning_rate": 3.1325245694550292e-06, + "loss": 0.9452, + "step": 8214 + }, + { + "epoch": 0.6332870798643232, + "grad_norm": 4.035982608795166, + "learning_rate": 3.1313665278663686e-06, + "loss": 1.0678, + "step": 8215 + }, + { + "epoch": 0.6333641689793401, + "grad_norm": 3.4959230422973633, + "learning_rate": 3.130208602777326e-06, + "loss": 0.8715, + "step": 8216 + }, + { + "epoch": 0.633441258094357, + "grad_norm": 4.09905481338501, + "learning_rate": 3.1290507942600936e-06, + "loss": 0.9712, + "step": 8217 + }, + { + "epoch": 0.633518347209374, + "grad_norm": 4.000631332397461, + "learning_rate": 3.1278931023868543e-06, + "loss": 0.8744, + "step": 8218 + }, + { + "epoch": 0.633595436324391, + "grad_norm": 3.4521965980529785, + "learning_rate": 3.126735527229784e-06, + "loss": 0.8514, + "step": 8219 + }, + { + "epoch": 0.633672525439408, + "grad_norm": 3.4077889919281006, + "learning_rate": 3.125578068861051e-06, + "loss": 0.9584, + "step": 8220 + }, + { + "epoch": 0.6337496145544249, + "grad_norm": 3.76359224319458, + "learning_rate": 3.124420727352819e-06, + "loss": 0.8964, + "step": 8221 + }, + { + "epoch": 0.6338267036694418, + "grad_norm": 3.4744224548339844, + "learning_rate": 3.1232635027772397e-06, + "loss": 0.9265, + "step": 8222 + }, + { + "epoch": 0.6339037927844589, + "grad_norm": 3.7400758266448975, + "learning_rate": 3.122106395206462e-06, + "loss": 0.9967, + "step": 8223 + }, + { + "epoch": 0.6339808818994758, + "grad_norm": 3.5406832695007324, + "learning_rate": 3.1209494047126233e-06, + "loss": 0.9603, + "step": 8224 + }, + { + "epoch": 0.6340579710144928, + "grad_norm": 3.479562520980835, + "learning_rate": 3.119792531367858e-06, + "loss": 0.8891, + "step": 8225 + }, + { + "epoch": 0.6341350601295097, + "grad_norm": 3.7099175453186035, + "learning_rate": 3.1186357752442915e-06, + "loss": 0.9093, + "step": 8226 + }, + { + "epoch": 0.6342121492445266, + "grad_norm": 4.117136478424072, + "learning_rate": 3.1174791364140394e-06, + "loss": 0.8646, + "step": 8227 + }, + { + "epoch": 0.6342892383595437, + "grad_norm": 3.6180074214935303, + "learning_rate": 3.1163226149492133e-06, + "loss": 0.7977, + "step": 8228 + }, + { + "epoch": 0.6343663274745606, + "grad_norm": 3.70717191696167, + "learning_rate": 3.1151662109219173e-06, + "loss": 0.8866, + "step": 8229 + }, + { + "epoch": 0.6344434165895776, + "grad_norm": 3.557631492614746, + "learning_rate": 3.1140099244042454e-06, + "loss": 0.8787, + "step": 8230 + }, + { + "epoch": 0.6345205057045945, + "grad_norm": 3.4483120441436768, + "learning_rate": 3.1128537554682868e-06, + "loss": 0.8503, + "step": 8231 + }, + { + "epoch": 0.6345975948196114, + "grad_norm": 3.401564121246338, + "learning_rate": 3.111697704186124e-06, + "loss": 0.8023, + "step": 8232 + }, + { + "epoch": 0.6346746839346284, + "grad_norm": 3.990302562713623, + "learning_rate": 3.110541770629828e-06, + "loss": 0.9897, + "step": 8233 + }, + { + "epoch": 0.6347517730496454, + "grad_norm": 3.496260166168213, + "learning_rate": 3.109385954871469e-06, + "loss": 1.0002, + "step": 8234 + }, + { + "epoch": 0.6348288621646624, + "grad_norm": 3.497117757797241, + "learning_rate": 3.108230256983102e-06, + "loss": 0.9513, + "step": 8235 + }, + { + "epoch": 0.6349059512796793, + "grad_norm": 3.9657294750213623, + "learning_rate": 3.107074677036781e-06, + "loss": 0.96, + "step": 8236 + }, + { + "epoch": 0.6349830403946962, + "grad_norm": 3.5663013458251953, + "learning_rate": 3.1059192151045507e-06, + "loss": 0.8036, + "step": 8237 + }, + { + "epoch": 0.6350601295097132, + "grad_norm": 3.7324275970458984, + "learning_rate": 3.104763871258447e-06, + "loss": 0.971, + "step": 8238 + }, + { + "epoch": 0.6351372186247302, + "grad_norm": 4.242516994476318, + "learning_rate": 3.1036086455705006e-06, + "loss": 1.0418, + "step": 8239 + }, + { + "epoch": 0.6352143077397472, + "grad_norm": 3.5605618953704834, + "learning_rate": 3.102453538112734e-06, + "loss": 0.9253, + "step": 8240 + }, + { + "epoch": 0.6352913968547641, + "grad_norm": 3.748979330062866, + "learning_rate": 3.1012985489571613e-06, + "loss": 0.972, + "step": 8241 + }, + { + "epoch": 0.635368485969781, + "grad_norm": 3.898329734802246, + "learning_rate": 3.10014367817579e-06, + "loss": 1.0175, + "step": 8242 + }, + { + "epoch": 0.635445575084798, + "grad_norm": 3.484175682067871, + "learning_rate": 3.098988925840621e-06, + "loss": 0.8839, + "step": 8243 + }, + { + "epoch": 0.635522664199815, + "grad_norm": 3.634960412979126, + "learning_rate": 3.097834292023647e-06, + "loss": 0.8899, + "step": 8244 + }, + { + "epoch": 0.635599753314832, + "grad_norm": 3.7136380672454834, + "learning_rate": 3.096679776796854e-06, + "loss": 0.9849, + "step": 8245 + }, + { + "epoch": 0.6356768424298489, + "grad_norm": 3.5262043476104736, + "learning_rate": 3.0955253802322183e-06, + "loss": 0.8174, + "step": 8246 + }, + { + "epoch": 0.6357539315448658, + "grad_norm": 3.480539321899414, + "learning_rate": 3.094371102401712e-06, + "loss": 0.8963, + "step": 8247 + }, + { + "epoch": 0.6358310206598828, + "grad_norm": 3.8594253063201904, + "learning_rate": 3.0932169433772986e-06, + "loss": 0.9647, + "step": 8248 + }, + { + "epoch": 0.6359081097748998, + "grad_norm": 3.4248909950256348, + "learning_rate": 3.0920629032309323e-06, + "loss": 0.8798, + "step": 8249 + }, + { + "epoch": 0.6359851988899168, + "grad_norm": 3.841808319091797, + "learning_rate": 3.090908982034563e-06, + "loss": 0.9655, + "step": 8250 + }, + { + "epoch": 0.6360622880049337, + "grad_norm": 3.5504653453826904, + "learning_rate": 3.0897551798601312e-06, + "loss": 0.9077, + "step": 8251 + }, + { + "epoch": 0.6361393771199506, + "grad_norm": 3.5525784492492676, + "learning_rate": 3.0886014967795696e-06, + "loss": 0.8604, + "step": 8252 + }, + { + "epoch": 0.6362164662349676, + "grad_norm": 3.50411057472229, + "learning_rate": 3.087447932864807e-06, + "loss": 0.9307, + "step": 8253 + }, + { + "epoch": 0.6362935553499846, + "grad_norm": 3.3735544681549072, + "learning_rate": 3.086294488187758e-06, + "loss": 0.8722, + "step": 8254 + }, + { + "epoch": 0.6363706444650016, + "grad_norm": 3.8651130199432373, + "learning_rate": 3.0851411628203378e-06, + "loss": 0.9636, + "step": 8255 + }, + { + "epoch": 0.6364477335800185, + "grad_norm": 4.263221263885498, + "learning_rate": 3.083987956834449e-06, + "loss": 0.946, + "step": 8256 + }, + { + "epoch": 0.6365248226950354, + "grad_norm": 3.6470301151275635, + "learning_rate": 3.082834870301987e-06, + "loss": 0.9204, + "step": 8257 + }, + { + "epoch": 0.6366019118100524, + "grad_norm": 3.6749472618103027, + "learning_rate": 3.081681903294843e-06, + "loss": 1.0162, + "step": 8258 + }, + { + "epoch": 0.6366790009250693, + "grad_norm": 3.7581326961517334, + "learning_rate": 3.080529055884896e-06, + "loss": 0.8681, + "step": 8259 + }, + { + "epoch": 0.6367560900400864, + "grad_norm": 3.6538708209991455, + "learning_rate": 3.0793763281440225e-06, + "loss": 0.9529, + "step": 8260 + }, + { + "epoch": 0.6368331791551033, + "grad_norm": 3.6864404678344727, + "learning_rate": 3.0782237201440863e-06, + "loss": 1.0218, + "step": 8261 + }, + { + "epoch": 0.6369102682701202, + "grad_norm": 3.453056573867798, + "learning_rate": 3.077071231956948e-06, + "loss": 0.8975, + "step": 8262 + }, + { + "epoch": 0.6369873573851372, + "grad_norm": 3.943230390548706, + "learning_rate": 3.07591886365446e-06, + "loss": 0.9366, + "step": 8263 + }, + { + "epoch": 0.6370644465001541, + "grad_norm": 3.7501060962677, + "learning_rate": 3.0747666153084656e-06, + "loss": 0.9883, + "step": 8264 + }, + { + "epoch": 0.6371415356151712, + "grad_norm": 3.9778292179107666, + "learning_rate": 3.0736144869908015e-06, + "loss": 0.9615, + "step": 8265 + }, + { + "epoch": 0.6372186247301881, + "grad_norm": 3.7227420806884766, + "learning_rate": 3.072462478773298e-06, + "loss": 0.9409, + "step": 8266 + }, + { + "epoch": 0.637295713845205, + "grad_norm": 3.480952501296997, + "learning_rate": 3.071310590727775e-06, + "loss": 0.8985, + "step": 8267 + }, + { + "epoch": 0.637372802960222, + "grad_norm": 3.7210822105407715, + "learning_rate": 3.0701588229260478e-06, + "loss": 0.93, + "step": 8268 + }, + { + "epoch": 0.6374498920752389, + "grad_norm": 3.6173181533813477, + "learning_rate": 3.0690071754399236e-06, + "loss": 0.9362, + "step": 8269 + }, + { + "epoch": 0.637526981190256, + "grad_norm": 3.7527718544006348, + "learning_rate": 3.0678556483412005e-06, + "loss": 0.9783, + "step": 8270 + }, + { + "epoch": 0.6376040703052729, + "grad_norm": 3.4101197719573975, + "learning_rate": 3.066704241701672e-06, + "loss": 0.8431, + "step": 8271 + }, + { + "epoch": 0.6376811594202898, + "grad_norm": 3.5629935264587402, + "learning_rate": 3.06555295559312e-06, + "loss": 0.9556, + "step": 8272 + }, + { + "epoch": 0.6377582485353068, + "grad_norm": 3.466757297515869, + "learning_rate": 3.0644017900873225e-06, + "loss": 0.7801, + "step": 8273 + }, + { + "epoch": 0.6378353376503237, + "grad_norm": 3.4312379360198975, + "learning_rate": 3.06325074525605e-06, + "loss": 0.8572, + "step": 8274 + }, + { + "epoch": 0.6379124267653408, + "grad_norm": 3.8068385124206543, + "learning_rate": 3.0620998211710617e-06, + "loss": 0.9986, + "step": 8275 + }, + { + "epoch": 0.6379895158803577, + "grad_norm": 3.604086399078369, + "learning_rate": 3.0609490179041124e-06, + "loss": 0.9039, + "step": 8276 + }, + { + "epoch": 0.6380666049953746, + "grad_norm": 4.103906631469727, + "learning_rate": 3.059798335526951e-06, + "loss": 0.9925, + "step": 8277 + }, + { + "epoch": 0.6381436941103916, + "grad_norm": 3.9479622840881348, + "learning_rate": 3.0586477741113134e-06, + "loss": 0.931, + "step": 8278 + }, + { + "epoch": 0.6382207832254085, + "grad_norm": 3.976950168609619, + "learning_rate": 3.0574973337289327e-06, + "loss": 0.8651, + "step": 8279 + }, + { + "epoch": 0.6382978723404256, + "grad_norm": 3.6093039512634277, + "learning_rate": 3.0563470144515337e-06, + "loss": 0.9052, + "step": 8280 + }, + { + "epoch": 0.6383749614554425, + "grad_norm": 3.744586229324341, + "learning_rate": 3.0551968163508303e-06, + "loss": 0.9146, + "step": 8281 + }, + { + "epoch": 0.6384520505704595, + "grad_norm": 3.3914260864257812, + "learning_rate": 3.054046739498535e-06, + "loss": 0.8381, + "step": 8282 + }, + { + "epoch": 0.6385291396854764, + "grad_norm": 3.5384232997894287, + "learning_rate": 3.052896783966346e-06, + "loss": 0.8195, + "step": 8283 + }, + { + "epoch": 0.6386062288004933, + "grad_norm": 3.407332420349121, + "learning_rate": 3.051746949825958e-06, + "loss": 0.9074, + "step": 8284 + }, + { + "epoch": 0.6386833179155104, + "grad_norm": 4.286930084228516, + "learning_rate": 3.0505972371490584e-06, + "loss": 1.0354, + "step": 8285 + }, + { + "epoch": 0.6387604070305273, + "grad_norm": 3.8700149059295654, + "learning_rate": 3.0494476460073237e-06, + "loss": 0.9878, + "step": 8286 + }, + { + "epoch": 0.6388374961455443, + "grad_norm": 4.059021472930908, + "learning_rate": 3.048298176472426e-06, + "loss": 0.9244, + "step": 8287 + }, + { + "epoch": 0.6389145852605612, + "grad_norm": 3.820525646209717, + "learning_rate": 3.04714882861603e-06, + "loss": 1.104, + "step": 8288 + }, + { + "epoch": 0.6389916743755781, + "grad_norm": 3.6468265056610107, + "learning_rate": 3.04599960250979e-06, + "loss": 0.9207, + "step": 8289 + }, + { + "epoch": 0.6390687634905952, + "grad_norm": 3.7343029975891113, + "learning_rate": 3.044850498225354e-06, + "loss": 0.8921, + "step": 8290 + }, + { + "epoch": 0.6391458526056121, + "grad_norm": 3.6490302085876465, + "learning_rate": 3.0437015158343654e-06, + "loss": 0.9089, + "step": 8291 + }, + { + "epoch": 0.6392229417206291, + "grad_norm": 3.772552490234375, + "learning_rate": 3.0425526554084526e-06, + "loss": 0.9522, + "step": 8292 + }, + { + "epoch": 0.639300030835646, + "grad_norm": 3.4605162143707275, + "learning_rate": 3.0414039170192466e-06, + "loss": 0.8253, + "step": 8293 + }, + { + "epoch": 0.6393771199506629, + "grad_norm": 3.6937575340270996, + "learning_rate": 3.0402553007383607e-06, + "loss": 0.7973, + "step": 8294 + }, + { + "epoch": 0.63945420906568, + "grad_norm": 3.3988912105560303, + "learning_rate": 3.0391068066374073e-06, + "loss": 0.8901, + "step": 8295 + }, + { + "epoch": 0.6395312981806969, + "grad_norm": 3.475071668624878, + "learning_rate": 3.03795843478799e-06, + "loss": 0.8047, + "step": 8296 + }, + { + "epoch": 0.6396083872957139, + "grad_norm": 3.693103075027466, + "learning_rate": 3.0368101852617017e-06, + "loss": 0.926, + "step": 8297 + }, + { + "epoch": 0.6396854764107308, + "grad_norm": 3.585578203201294, + "learning_rate": 3.03566205813013e-06, + "loss": 0.9834, + "step": 8298 + }, + { + "epoch": 0.6397625655257477, + "grad_norm": 3.4358348846435547, + "learning_rate": 3.034514053464854e-06, + "loss": 0.8158, + "step": 8299 + }, + { + "epoch": 0.6398396546407648, + "grad_norm": 3.7729804515838623, + "learning_rate": 3.0333661713374506e-06, + "loss": 0.8983, + "step": 8300 + }, + { + "epoch": 0.6399167437557817, + "grad_norm": 3.628103256225586, + "learning_rate": 3.0322184118194797e-06, + "loss": 0.8904, + "step": 8301 + }, + { + "epoch": 0.6399938328707987, + "grad_norm": 3.9679834842681885, + "learning_rate": 3.0310707749824987e-06, + "loss": 0.894, + "step": 8302 + }, + { + "epoch": 0.6400709219858156, + "grad_norm": 3.610292911529541, + "learning_rate": 3.0299232608980587e-06, + "loss": 0.9125, + "step": 8303 + }, + { + "epoch": 0.6401480111008325, + "grad_norm": 3.601590394973755, + "learning_rate": 3.0287758696377e-06, + "loss": 0.8971, + "step": 8304 + }, + { + "epoch": 0.6402251002158496, + "grad_norm": 3.709563732147217, + "learning_rate": 3.0276286012729563e-06, + "loss": 0.9295, + "step": 8305 + }, + { + "epoch": 0.6403021893308665, + "grad_norm": 3.6046383380889893, + "learning_rate": 3.0264814558753543e-06, + "loss": 0.9875, + "step": 8306 + }, + { + "epoch": 0.6403792784458835, + "grad_norm": 3.460169792175293, + "learning_rate": 3.0253344335164114e-06, + "loss": 0.9901, + "step": 8307 + }, + { + "epoch": 0.6404563675609004, + "grad_norm": 3.704393148422241, + "learning_rate": 3.0241875342676413e-06, + "loss": 0.9067, + "step": 8308 + }, + { + "epoch": 0.6405334566759173, + "grad_norm": 3.8875725269317627, + "learning_rate": 3.023040758200544e-06, + "loss": 0.9543, + "step": 8309 + }, + { + "epoch": 0.6406105457909343, + "grad_norm": 3.9328582286834717, + "learning_rate": 3.0218941053866167e-06, + "loss": 1.0593, + "step": 8310 + }, + { + "epoch": 0.6406876349059513, + "grad_norm": 4.249375343322754, + "learning_rate": 3.020747575897348e-06, + "loss": 0.9223, + "step": 8311 + }, + { + "epoch": 0.6407647240209683, + "grad_norm": 3.789259672164917, + "learning_rate": 3.019601169804216e-06, + "loss": 0.9254, + "step": 8312 + }, + { + "epoch": 0.6408418131359852, + "grad_norm": 3.5472848415374756, + "learning_rate": 3.0184548871786946e-06, + "loss": 0.9175, + "step": 8313 + }, + { + "epoch": 0.6409189022510021, + "grad_norm": 3.7821688652038574, + "learning_rate": 3.0173087280922493e-06, + "loss": 0.8734, + "step": 8314 + }, + { + "epoch": 0.6409959913660191, + "grad_norm": 3.4091010093688965, + "learning_rate": 3.0161626926163344e-06, + "loss": 0.9446, + "step": 8315 + }, + { + "epoch": 0.6410730804810361, + "grad_norm": 3.822620391845703, + "learning_rate": 3.015016780822402e-06, + "loss": 0.9002, + "step": 8316 + }, + { + "epoch": 0.6411501695960531, + "grad_norm": 3.6239142417907715, + "learning_rate": 3.013870992781894e-06, + "loss": 0.8363, + "step": 8317 + }, + { + "epoch": 0.64122725871107, + "grad_norm": 3.812986373901367, + "learning_rate": 3.0127253285662415e-06, + "loss": 0.9568, + "step": 8318 + }, + { + "epoch": 0.6413043478260869, + "grad_norm": 3.996857166290283, + "learning_rate": 3.0115797882468733e-06, + "loss": 0.863, + "step": 8319 + }, + { + "epoch": 0.6413814369411039, + "grad_norm": 3.7889609336853027, + "learning_rate": 3.0104343718952065e-06, + "loss": 1.0847, + "step": 8320 + }, + { + "epoch": 0.6414585260561209, + "grad_norm": 3.7491159439086914, + "learning_rate": 3.0092890795826524e-06, + "loss": 0.8731, + "step": 8321 + }, + { + "epoch": 0.6415356151711379, + "grad_norm": 3.6667377948760986, + "learning_rate": 3.0081439113806145e-06, + "loss": 0.8165, + "step": 8322 + }, + { + "epoch": 0.6416127042861548, + "grad_norm": 4.124176502227783, + "learning_rate": 3.0069988673604866e-06, + "loss": 1.0136, + "step": 8323 + }, + { + "epoch": 0.6416897934011717, + "grad_norm": 4.588274955749512, + "learning_rate": 3.0058539475936577e-06, + "loss": 1.0123, + "step": 8324 + }, + { + "epoch": 0.6417668825161887, + "grad_norm": 3.610478401184082, + "learning_rate": 3.0047091521515083e-06, + "loss": 0.868, + "step": 8325 + }, + { + "epoch": 0.6418439716312057, + "grad_norm": 3.607025384902954, + "learning_rate": 3.0035644811054083e-06, + "loss": 0.8376, + "step": 8326 + }, + { + "epoch": 0.6419210607462227, + "grad_norm": 3.4604544639587402, + "learning_rate": 3.002419934526723e-06, + "loss": 0.898, + "step": 8327 + }, + { + "epoch": 0.6419981498612396, + "grad_norm": 3.819643497467041, + "learning_rate": 3.00127551248681e-06, + "loss": 0.9074, + "step": 8328 + }, + { + "epoch": 0.6420752389762565, + "grad_norm": 3.4127678871154785, + "learning_rate": 3.000131215057016e-06, + "loss": 0.818, + "step": 8329 + }, + { + "epoch": 0.6421523280912735, + "grad_norm": 3.94305157661438, + "learning_rate": 2.998987042308685e-06, + "loss": 1.0063, + "step": 8330 + }, + { + "epoch": 0.6422294172062905, + "grad_norm": 3.653914213180542, + "learning_rate": 2.9978429943131466e-06, + "loss": 0.9659, + "step": 8331 + }, + { + "epoch": 0.6423065063213075, + "grad_norm": 3.779825210571289, + "learning_rate": 2.9966990711417283e-06, + "loss": 0.875, + "step": 8332 + }, + { + "epoch": 0.6423835954363244, + "grad_norm": 3.4283056259155273, + "learning_rate": 2.9955552728657483e-06, + "loss": 0.8167, + "step": 8333 + }, + { + "epoch": 0.6424606845513413, + "grad_norm": 3.8111422061920166, + "learning_rate": 2.994411599556515e-06, + "loss": 0.8542, + "step": 8334 + }, + { + "epoch": 0.6425377736663583, + "grad_norm": 3.65975284576416, + "learning_rate": 2.993268051285331e-06, + "loss": 0.8386, + "step": 8335 + }, + { + "epoch": 0.6426148627813753, + "grad_norm": 3.7688958644866943, + "learning_rate": 2.9921246281234924e-06, + "loss": 0.9334, + "step": 8336 + }, + { + "epoch": 0.6426919518963923, + "grad_norm": 3.5940749645233154, + "learning_rate": 2.990981330142283e-06, + "loss": 0.9051, + "step": 8337 + }, + { + "epoch": 0.6427690410114092, + "grad_norm": 3.8755249977111816, + "learning_rate": 2.989838157412982e-06, + "loss": 0.9065, + "step": 8338 + }, + { + "epoch": 0.6428461301264261, + "grad_norm": 4.170990943908691, + "learning_rate": 2.9886951100068628e-06, + "loss": 1.0524, + "step": 8339 + }, + { + "epoch": 0.6429232192414431, + "grad_norm": 3.61470890045166, + "learning_rate": 2.9875521879951852e-06, + "loss": 0.951, + "step": 8340 + }, + { + "epoch": 0.64300030835646, + "grad_norm": 3.7371344566345215, + "learning_rate": 2.9864093914492076e-06, + "loss": 0.9648, + "step": 8341 + }, + { + "epoch": 0.6430773974714771, + "grad_norm": 3.93182635307312, + "learning_rate": 2.9852667204401742e-06, + "loss": 1.0087, + "step": 8342 + }, + { + "epoch": 0.643154486586494, + "grad_norm": 3.8426945209503174, + "learning_rate": 2.9841241750393257e-06, + "loss": 0.9239, + "step": 8343 + }, + { + "epoch": 0.6432315757015109, + "grad_norm": 3.605802297592163, + "learning_rate": 2.9829817553178945e-06, + "loss": 0.9312, + "step": 8344 + }, + { + "epoch": 0.6433086648165279, + "grad_norm": 3.347144603729248, + "learning_rate": 2.981839461347105e-06, + "loss": 0.8394, + "step": 8345 + }, + { + "epoch": 0.6433857539315448, + "grad_norm": 3.462236166000366, + "learning_rate": 2.980697293198174e-06, + "loss": 0.9387, + "step": 8346 + }, + { + "epoch": 0.6434628430465619, + "grad_norm": 3.7107419967651367, + "learning_rate": 2.9795552509423075e-06, + "loss": 0.7882, + "step": 8347 + }, + { + "epoch": 0.6435399321615788, + "grad_norm": 3.59287166595459, + "learning_rate": 2.9784133346507077e-06, + "loss": 0.9505, + "step": 8348 + }, + { + "epoch": 0.6436170212765957, + "grad_norm": 3.7674858570098877, + "learning_rate": 2.9772715443945653e-06, + "loss": 0.9572, + "step": 8349 + }, + { + "epoch": 0.6436941103916127, + "grad_norm": 4.125513076782227, + "learning_rate": 2.9761298802450667e-06, + "loss": 0.9329, + "step": 8350 + }, + { + "epoch": 0.6437711995066296, + "grad_norm": 3.9114468097686768, + "learning_rate": 2.974988342273388e-06, + "loss": 0.8998, + "step": 8351 + }, + { + "epoch": 0.6438482886216467, + "grad_norm": 3.4808576107025146, + "learning_rate": 2.9738469305506985e-06, + "loss": 0.865, + "step": 8352 + }, + { + "epoch": 0.6439253777366636, + "grad_norm": 3.3427348136901855, + "learning_rate": 2.972705645148159e-06, + "loss": 0.9336, + "step": 8353 + }, + { + "epoch": 0.6440024668516805, + "grad_norm": 3.8578758239746094, + "learning_rate": 2.9715644861369235e-06, + "loss": 0.9053, + "step": 8354 + }, + { + "epoch": 0.6440795559666975, + "grad_norm": 3.955080986022949, + "learning_rate": 2.970423453588136e-06, + "loss": 1.0019, + "step": 8355 + }, + { + "epoch": 0.6441566450817144, + "grad_norm": 3.818481922149658, + "learning_rate": 2.969282547572936e-06, + "loss": 0.9418, + "step": 8356 + }, + { + "epoch": 0.6442337341967315, + "grad_norm": 3.748859405517578, + "learning_rate": 2.9681417681624502e-06, + "loss": 1.0381, + "step": 8357 + }, + { + "epoch": 0.6443108233117484, + "grad_norm": 4.283618450164795, + "learning_rate": 2.967001115427802e-06, + "loss": 1.0513, + "step": 8358 + }, + { + "epoch": 0.6443879124267653, + "grad_norm": 3.595768690109253, + "learning_rate": 2.965860589440106e-06, + "loss": 0.8752, + "step": 8359 + }, + { + "epoch": 0.6444650015417823, + "grad_norm": 3.731633424758911, + "learning_rate": 2.964720190270467e-06, + "loss": 0.9551, + "step": 8360 + }, + { + "epoch": 0.6445420906567992, + "grad_norm": 3.895328998565674, + "learning_rate": 2.9635799179899822e-06, + "loss": 0.8959, + "step": 8361 + }, + { + "epoch": 0.6446191797718163, + "grad_norm": 3.839773654937744, + "learning_rate": 2.962439772669744e-06, + "loss": 0.9535, + "step": 8362 + }, + { + "epoch": 0.6446962688868332, + "grad_norm": 3.9109835624694824, + "learning_rate": 2.961299754380832e-06, + "loss": 0.8728, + "step": 8363 + }, + { + "epoch": 0.6447733580018501, + "grad_norm": 3.788618803024292, + "learning_rate": 2.960159863194322e-06, + "loss": 0.9667, + "step": 8364 + }, + { + "epoch": 0.6448504471168671, + "grad_norm": 4.191736698150635, + "learning_rate": 2.9590200991812804e-06, + "loss": 1.0588, + "step": 8365 + }, + { + "epoch": 0.644927536231884, + "grad_norm": 3.4580607414245605, + "learning_rate": 2.9578804624127644e-06, + "loss": 0.849, + "step": 8366 + }, + { + "epoch": 0.6450046253469011, + "grad_norm": 3.6437692642211914, + "learning_rate": 2.9567409529598267e-06, + "loss": 0.9661, + "step": 8367 + }, + { + "epoch": 0.645081714461918, + "grad_norm": 3.486619472503662, + "learning_rate": 2.9556015708935064e-06, + "loss": 0.855, + "step": 8368 + }, + { + "epoch": 0.6451588035769349, + "grad_norm": 3.5905685424804688, + "learning_rate": 2.9544623162848406e-06, + "loss": 0.9604, + "step": 8369 + }, + { + "epoch": 0.6452358926919519, + "grad_norm": 3.5381009578704834, + "learning_rate": 2.9533231892048557e-06, + "loss": 0.8643, + "step": 8370 + }, + { + "epoch": 0.6453129818069688, + "grad_norm": 3.4007678031921387, + "learning_rate": 2.9521841897245694e-06, + "loss": 0.8136, + "step": 8371 + }, + { + "epoch": 0.6453900709219859, + "grad_norm": 3.962644338607788, + "learning_rate": 2.9510453179149923e-06, + "loss": 0.9401, + "step": 8372 + }, + { + "epoch": 0.6454671600370028, + "grad_norm": 3.902945041656494, + "learning_rate": 2.94990657384713e-06, + "loss": 0.8761, + "step": 8373 + }, + { + "epoch": 0.6455442491520197, + "grad_norm": 3.3986709117889404, + "learning_rate": 2.9487679575919733e-06, + "loss": 0.9076, + "step": 8374 + }, + { + "epoch": 0.6456213382670367, + "grad_norm": 3.4570724964141846, + "learning_rate": 2.947629469220511e-06, + "loss": 0.8969, + "step": 8375 + }, + { + "epoch": 0.6456984273820536, + "grad_norm": 3.3817062377929688, + "learning_rate": 2.9464911088037225e-06, + "loss": 0.7891, + "step": 8376 + }, + { + "epoch": 0.6457755164970707, + "grad_norm": 4.3606696128845215, + "learning_rate": 2.945352876412577e-06, + "loss": 0.8679, + "step": 8377 + }, + { + "epoch": 0.6458526056120876, + "grad_norm": 3.8586175441741943, + "learning_rate": 2.94421477211804e-06, + "loss": 0.9431, + "step": 8378 + }, + { + "epoch": 0.6459296947271045, + "grad_norm": 3.5619122982025146, + "learning_rate": 2.9430767959910633e-06, + "loss": 0.8947, + "step": 8379 + }, + { + "epoch": 0.6460067838421215, + "grad_norm": 3.578460693359375, + "learning_rate": 2.941938948102595e-06, + "loss": 0.9447, + "step": 8380 + }, + { + "epoch": 0.6460838729571384, + "grad_norm": 3.719597101211548, + "learning_rate": 2.9408012285235753e-06, + "loss": 0.9453, + "step": 8381 + }, + { + "epoch": 0.6461609620721555, + "grad_norm": 3.9067883491516113, + "learning_rate": 2.939663637324934e-06, + "loss": 0.9575, + "step": 8382 + }, + { + "epoch": 0.6462380511871724, + "grad_norm": 3.6373391151428223, + "learning_rate": 2.9385261745775932e-06, + "loss": 0.8788, + "step": 8383 + }, + { + "epoch": 0.6463151403021893, + "grad_norm": 3.917569160461426, + "learning_rate": 2.93738884035247e-06, + "loss": 0.9704, + "step": 8384 + }, + { + "epoch": 0.6463922294172063, + "grad_norm": 3.8104026317596436, + "learning_rate": 2.9362516347204695e-06, + "loss": 0.9048, + "step": 8385 + }, + { + "epoch": 0.6464693185322232, + "grad_norm": 3.8119072914123535, + "learning_rate": 2.9351145577524908e-06, + "loss": 0.9322, + "step": 8386 + }, + { + "epoch": 0.6465464076472403, + "grad_norm": 3.7479817867279053, + "learning_rate": 2.9339776095194245e-06, + "loss": 0.9542, + "step": 8387 + }, + { + "epoch": 0.6466234967622572, + "grad_norm": 3.774230718612671, + "learning_rate": 2.9328407900921555e-06, + "loss": 0.9974, + "step": 8388 + }, + { + "epoch": 0.6467005858772741, + "grad_norm": 3.979708671569824, + "learning_rate": 2.931704099541557e-06, + "loss": 1.0159, + "step": 8389 + }, + { + "epoch": 0.6467776749922911, + "grad_norm": 3.6764190196990967, + "learning_rate": 2.9305675379384945e-06, + "loss": 0.9333, + "step": 8390 + }, + { + "epoch": 0.646854764107308, + "grad_norm": 3.608494997024536, + "learning_rate": 2.9294311053538306e-06, + "loss": 0.8975, + "step": 8391 + }, + { + "epoch": 0.646931853222325, + "grad_norm": 3.679772138595581, + "learning_rate": 2.9282948018584116e-06, + "loss": 0.8043, + "step": 8392 + }, + { + "epoch": 0.647008942337342, + "grad_norm": 3.3427774906158447, + "learning_rate": 2.9271586275230823e-06, + "loss": 0.7306, + "step": 8393 + }, + { + "epoch": 0.6470860314523589, + "grad_norm": 3.9158496856689453, + "learning_rate": 2.926022582418678e-06, + "loss": 0.7937, + "step": 8394 + }, + { + "epoch": 0.6471631205673759, + "grad_norm": 3.52773380279541, + "learning_rate": 2.9248866666160236e-06, + "loss": 0.9314, + "step": 8395 + }, + { + "epoch": 0.6472402096823928, + "grad_norm": 4.079388618469238, + "learning_rate": 2.9237508801859394e-06, + "loss": 1.0586, + "step": 8396 + }, + { + "epoch": 0.6473172987974098, + "grad_norm": 3.9858343601226807, + "learning_rate": 2.922615223199233e-06, + "loss": 0.9373, + "step": 8397 + }, + { + "epoch": 0.6473943879124268, + "grad_norm": 3.436227798461914, + "learning_rate": 2.921479695726709e-06, + "loss": 0.9468, + "step": 8398 + }, + { + "epoch": 0.6474714770274437, + "grad_norm": 3.483386278152466, + "learning_rate": 2.9203442978391618e-06, + "loss": 0.9141, + "step": 8399 + }, + { + "epoch": 0.6475485661424607, + "grad_norm": 3.5396289825439453, + "learning_rate": 2.9192090296073755e-06, + "loss": 0.93, + "step": 8400 + }, + { + "epoch": 0.6476256552574776, + "grad_norm": 3.383061170578003, + "learning_rate": 2.91807389110213e-06, + "loss": 0.8128, + "step": 8401 + }, + { + "epoch": 0.6477027443724946, + "grad_norm": 3.429288864135742, + "learning_rate": 2.916938882394194e-06, + "loss": 0.8793, + "step": 8402 + }, + { + "epoch": 0.6477798334875116, + "grad_norm": 3.536487340927124, + "learning_rate": 2.9158040035543334e-06, + "loss": 0.9317, + "step": 8403 + }, + { + "epoch": 0.6478569226025285, + "grad_norm": 3.7233903408050537, + "learning_rate": 2.914669254653296e-06, + "loss": 0.8154, + "step": 8404 + }, + { + "epoch": 0.6479340117175455, + "grad_norm": 3.469984292984009, + "learning_rate": 2.9135346357618304e-06, + "loss": 0.9265, + "step": 8405 + }, + { + "epoch": 0.6480111008325624, + "grad_norm": 3.6804935932159424, + "learning_rate": 2.912400146950675e-06, + "loss": 1.0209, + "step": 8406 + }, + { + "epoch": 0.6480881899475794, + "grad_norm": 3.8416061401367188, + "learning_rate": 2.9112657882905573e-06, + "loss": 0.9045, + "step": 8407 + }, + { + "epoch": 0.6481652790625964, + "grad_norm": 3.884570360183716, + "learning_rate": 2.910131559852201e-06, + "loss": 0.8565, + "step": 8408 + }, + { + "epoch": 0.6482423681776133, + "grad_norm": 3.519493341445923, + "learning_rate": 2.9089974617063188e-06, + "loss": 0.9198, + "step": 8409 + }, + { + "epoch": 0.6483194572926303, + "grad_norm": 4.712925434112549, + "learning_rate": 2.9078634939236146e-06, + "loss": 1.0443, + "step": 8410 + }, + { + "epoch": 0.6483965464076472, + "grad_norm": 6.0556559562683105, + "learning_rate": 2.9067296565747856e-06, + "loss": 0.8174, + "step": 8411 + }, + { + "epoch": 0.6484736355226642, + "grad_norm": 3.844569206237793, + "learning_rate": 2.905595949730521e-06, + "loss": 1.0147, + "step": 8412 + }, + { + "epoch": 0.6485507246376812, + "grad_norm": 3.9868366718292236, + "learning_rate": 2.9044623734615018e-06, + "loss": 0.9485, + "step": 8413 + }, + { + "epoch": 0.6486278137526981, + "grad_norm": 3.9000837802886963, + "learning_rate": 2.903328927838403e-06, + "loss": 1.0351, + "step": 8414 + }, + { + "epoch": 0.6487049028677151, + "grad_norm": 3.564354419708252, + "learning_rate": 2.902195612931884e-06, + "loss": 0.892, + "step": 8415 + }, + { + "epoch": 0.648781991982732, + "grad_norm": 3.669095277786255, + "learning_rate": 2.901062428812604e-06, + "loss": 0.9128, + "step": 8416 + }, + { + "epoch": 0.648859081097749, + "grad_norm": 3.598200798034668, + "learning_rate": 2.8999293755512113e-06, + "loss": 0.8725, + "step": 8417 + }, + { + "epoch": 0.648936170212766, + "grad_norm": 3.7269628047943115, + "learning_rate": 2.8987964532183454e-06, + "loss": 0.8105, + "step": 8418 + }, + { + "epoch": 0.6490132593277829, + "grad_norm": 3.622880458831787, + "learning_rate": 2.897663661884638e-06, + "loss": 0.8715, + "step": 8419 + }, + { + "epoch": 0.6490903484427999, + "grad_norm": 3.9800972938537598, + "learning_rate": 2.896531001620715e-06, + "loss": 0.9407, + "step": 8420 + }, + { + "epoch": 0.6491674375578168, + "grad_norm": 4.113578796386719, + "learning_rate": 2.8953984724971874e-06, + "loss": 0.9153, + "step": 8421 + }, + { + "epoch": 0.6492445266728338, + "grad_norm": 4.180689811706543, + "learning_rate": 2.8942660745846657e-06, + "loss": 0.8857, + "step": 8422 + }, + { + "epoch": 0.6493216157878507, + "grad_norm": 4.276498794555664, + "learning_rate": 2.8931338079537487e-06, + "loss": 0.9398, + "step": 8423 + }, + { + "epoch": 0.6493987049028677, + "grad_norm": 3.5343918800354004, + "learning_rate": 2.892001672675026e-06, + "loss": 0.9164, + "step": 8424 + }, + { + "epoch": 0.6494757940178847, + "grad_norm": 3.7870051860809326, + "learning_rate": 2.890869668819084e-06, + "loss": 1.0146, + "step": 8425 + }, + { + "epoch": 0.6495528831329016, + "grad_norm": 3.560620069503784, + "learning_rate": 2.8897377964564922e-06, + "loss": 0.8416, + "step": 8426 + }, + { + "epoch": 0.6496299722479186, + "grad_norm": 3.742684841156006, + "learning_rate": 2.88860605565782e-06, + "loss": 0.9449, + "step": 8427 + }, + { + "epoch": 0.6497070613629355, + "grad_norm": 3.494896411895752, + "learning_rate": 2.887474446493625e-06, + "loss": 0.8439, + "step": 8428 + }, + { + "epoch": 0.6497841504779525, + "grad_norm": 3.8634705543518066, + "learning_rate": 2.886342969034457e-06, + "loss": 0.9453, + "step": 8429 + }, + { + "epoch": 0.6498612395929695, + "grad_norm": 3.891278028488159, + "learning_rate": 2.8852116233508604e-06, + "loss": 1.0619, + "step": 8430 + }, + { + "epoch": 0.6499383287079864, + "grad_norm": 3.782010793685913, + "learning_rate": 2.8840804095133617e-06, + "loss": 1.0296, + "step": 8431 + }, + { + "epoch": 0.6500154178230034, + "grad_norm": 3.684227466583252, + "learning_rate": 2.8829493275924935e-06, + "loss": 0.9614, + "step": 8432 + }, + { + "epoch": 0.6500925069380203, + "grad_norm": 3.9234118461608887, + "learning_rate": 2.8818183776587694e-06, + "loss": 1.0703, + "step": 8433 + }, + { + "epoch": 0.6501695960530373, + "grad_norm": 3.863373279571533, + "learning_rate": 2.8806875597827e-06, + "loss": 0.9689, + "step": 8434 + }, + { + "epoch": 0.6502466851680543, + "grad_norm": 3.6144754886627197, + "learning_rate": 2.8795568740347868e-06, + "loss": 0.8714, + "step": 8435 + }, + { + "epoch": 0.6503237742830712, + "grad_norm": 4.281971454620361, + "learning_rate": 2.878426320485518e-06, + "loss": 0.9231, + "step": 8436 + }, + { + "epoch": 0.6504008633980882, + "grad_norm": 4.0023040771484375, + "learning_rate": 2.87729589920538e-06, + "loss": 0.9989, + "step": 8437 + }, + { + "epoch": 0.6504779525131051, + "grad_norm": 3.5423178672790527, + "learning_rate": 2.8761656102648495e-06, + "loss": 0.9044, + "step": 8438 + }, + { + "epoch": 0.650555041628122, + "grad_norm": 3.6947436332702637, + "learning_rate": 2.875035453734394e-06, + "loss": 0.9198, + "step": 8439 + }, + { + "epoch": 0.6506321307431391, + "grad_norm": 3.7778401374816895, + "learning_rate": 2.8739054296844736e-06, + "loss": 0.9065, + "step": 8440 + }, + { + "epoch": 0.650709219858156, + "grad_norm": 3.7728586196899414, + "learning_rate": 2.8727755381855373e-06, + "loss": 0.9014, + "step": 8441 + }, + { + "epoch": 0.650786308973173, + "grad_norm": 3.6431329250335693, + "learning_rate": 2.8716457793080277e-06, + "loss": 0.9221, + "step": 8442 + }, + { + "epoch": 0.6508633980881899, + "grad_norm": 3.698190927505493, + "learning_rate": 2.8705161531223814e-06, + "loss": 0.9884, + "step": 8443 + }, + { + "epoch": 0.6509404872032069, + "grad_norm": 4.110282897949219, + "learning_rate": 2.8693866596990237e-06, + "loss": 0.9542, + "step": 8444 + }, + { + "epoch": 0.6510175763182239, + "grad_norm": 4.268514156341553, + "learning_rate": 2.868257299108374e-06, + "loss": 1.0497, + "step": 8445 + }, + { + "epoch": 0.6510946654332408, + "grad_norm": 3.500208616256714, + "learning_rate": 2.8671280714208427e-06, + "loss": 0.8821, + "step": 8446 + }, + { + "epoch": 0.6511717545482578, + "grad_norm": 3.6203670501708984, + "learning_rate": 2.8659989767068277e-06, + "loss": 0.8401, + "step": 8447 + }, + { + "epoch": 0.6512488436632747, + "grad_norm": 3.4511590003967285, + "learning_rate": 2.8648700150367236e-06, + "loss": 0.8388, + "step": 8448 + }, + { + "epoch": 0.6513259327782918, + "grad_norm": 3.800976514816284, + "learning_rate": 2.863741186480917e-06, + "loss": 0.977, + "step": 8449 + }, + { + "epoch": 0.6514030218933087, + "grad_norm": 3.96077561378479, + "learning_rate": 2.862612491109783e-06, + "loss": 1.0604, + "step": 8450 + }, + { + "epoch": 0.6514801110083256, + "grad_norm": 3.6005845069885254, + "learning_rate": 2.861483928993692e-06, + "loss": 1.0056, + "step": 8451 + }, + { + "epoch": 0.6515572001233426, + "grad_norm": 3.862231492996216, + "learning_rate": 2.8603555002030005e-06, + "loss": 0.9447, + "step": 8452 + }, + { + "epoch": 0.6516342892383595, + "grad_norm": 3.786400318145752, + "learning_rate": 2.8592272048080623e-06, + "loss": 0.9996, + "step": 8453 + }, + { + "epoch": 0.6517113783533766, + "grad_norm": 4.21661901473999, + "learning_rate": 2.8580990428792205e-06, + "loss": 0.9962, + "step": 8454 + }, + { + "epoch": 0.6517884674683935, + "grad_norm": 3.853390693664551, + "learning_rate": 2.85697101448681e-06, + "loss": 0.9301, + "step": 8455 + }, + { + "epoch": 0.6518655565834104, + "grad_norm": 3.9790003299713135, + "learning_rate": 2.855843119701158e-06, + "loss": 0.8526, + "step": 8456 + }, + { + "epoch": 0.6519426456984274, + "grad_norm": 4.153457164764404, + "learning_rate": 2.8547153585925842e-06, + "loss": 0.9439, + "step": 8457 + }, + { + "epoch": 0.6520197348134443, + "grad_norm": 3.5101563930511475, + "learning_rate": 2.853587731231395e-06, + "loss": 0.8871, + "step": 8458 + }, + { + "epoch": 0.6520968239284614, + "grad_norm": 3.815025806427002, + "learning_rate": 2.8524602376878952e-06, + "loss": 0.9554, + "step": 8459 + }, + { + "epoch": 0.6521739130434783, + "grad_norm": 3.934506893157959, + "learning_rate": 2.8513328780323763e-06, + "loss": 0.8709, + "step": 8460 + }, + { + "epoch": 0.6522510021584952, + "grad_norm": 3.9895248413085938, + "learning_rate": 2.850205652335124e-06, + "loss": 0.976, + "step": 8461 + }, + { + "epoch": 0.6523280912735122, + "grad_norm": 3.5615084171295166, + "learning_rate": 2.8490785606664177e-06, + "loss": 0.8848, + "step": 8462 + }, + { + "epoch": 0.6524051803885291, + "grad_norm": 3.748917579650879, + "learning_rate": 2.8479516030965215e-06, + "loss": 0.8879, + "step": 8463 + }, + { + "epoch": 0.6524822695035462, + "grad_norm": 3.557553768157959, + "learning_rate": 2.8468247796956967e-06, + "loss": 0.8766, + "step": 8464 + }, + { + "epoch": 0.6525593586185631, + "grad_norm": 3.534771203994751, + "learning_rate": 2.845698090534196e-06, + "loss": 0.9342, + "step": 8465 + }, + { + "epoch": 0.65263644773358, + "grad_norm": 3.2332229614257812, + "learning_rate": 2.8445715356822605e-06, + "loss": 0.7536, + "step": 8466 + }, + { + "epoch": 0.652713536848597, + "grad_norm": 3.904291868209839, + "learning_rate": 2.8434451152101273e-06, + "loss": 0.9559, + "step": 8467 + }, + { + "epoch": 0.6527906259636139, + "grad_norm": 3.66575288772583, + "learning_rate": 2.8423188291880243e-06, + "loss": 1.0307, + "step": 8468 + }, + { + "epoch": 0.652867715078631, + "grad_norm": 3.674485206604004, + "learning_rate": 2.8411926776861653e-06, + "loss": 0.9073, + "step": 8469 + }, + { + "epoch": 0.6529448041936479, + "grad_norm": 3.7279632091522217, + "learning_rate": 2.840066660774762e-06, + "loss": 0.9714, + "step": 8470 + }, + { + "epoch": 0.6530218933086648, + "grad_norm": 3.9103338718414307, + "learning_rate": 2.838940778524015e-06, + "loss": 0.8716, + "step": 8471 + }, + { + "epoch": 0.6530989824236818, + "grad_norm": 3.741309404373169, + "learning_rate": 2.8378150310041197e-06, + "loss": 0.8169, + "step": 8472 + }, + { + "epoch": 0.6531760715386987, + "grad_norm": 3.7075369358062744, + "learning_rate": 2.83668941828526e-06, + "loss": 0.9703, + "step": 8473 + }, + { + "epoch": 0.6532531606537157, + "grad_norm": 4.015072822570801, + "learning_rate": 2.835563940437609e-06, + "loss": 0.975, + "step": 8474 + }, + { + "epoch": 0.6533302497687327, + "grad_norm": 4.320268630981445, + "learning_rate": 2.8344385975313366e-06, + "loss": 0.9171, + "step": 8475 + }, + { + "epoch": 0.6534073388837496, + "grad_norm": 3.988584518432617, + "learning_rate": 2.8333133896365993e-06, + "loss": 0.9586, + "step": 8476 + }, + { + "epoch": 0.6534844279987666, + "grad_norm": 3.7726614475250244, + "learning_rate": 2.8321883168235555e-06, + "loss": 0.8503, + "step": 8477 + }, + { + "epoch": 0.6535615171137835, + "grad_norm": 3.7435741424560547, + "learning_rate": 2.8310633791623403e-06, + "loss": 0.9676, + "step": 8478 + }, + { + "epoch": 0.6536386062288005, + "grad_norm": 3.893402099609375, + "learning_rate": 2.8299385767230904e-06, + "loss": 0.8198, + "step": 8479 + }, + { + "epoch": 0.6537156953438175, + "grad_norm": 3.577685594558716, + "learning_rate": 2.8288139095759315e-06, + "loss": 0.9281, + "step": 8480 + }, + { + "epoch": 0.6537927844588344, + "grad_norm": 3.464526891708374, + "learning_rate": 2.8276893777909796e-06, + "loss": 0.8223, + "step": 8481 + }, + { + "epoch": 0.6538698735738514, + "grad_norm": 3.8149683475494385, + "learning_rate": 2.826564981438345e-06, + "loss": 0.9708, + "step": 8482 + }, + { + "epoch": 0.6539469626888683, + "grad_norm": 3.6663248538970947, + "learning_rate": 2.825440720588129e-06, + "loss": 0.8898, + "step": 8483 + }, + { + "epoch": 0.6540240518038853, + "grad_norm": 3.7101478576660156, + "learning_rate": 2.82431659531042e-06, + "loss": 0.9334, + "step": 8484 + }, + { + "epoch": 0.6541011409189023, + "grad_norm": 3.7153666019439697, + "learning_rate": 2.823192605675303e-06, + "loss": 0.8837, + "step": 8485 + }, + { + "epoch": 0.6541782300339192, + "grad_norm": 3.713289260864258, + "learning_rate": 2.822068751752853e-06, + "loss": 0.8889, + "step": 8486 + }, + { + "epoch": 0.6542553191489362, + "grad_norm": 3.4118247032165527, + "learning_rate": 2.820945033613136e-06, + "loss": 0.9104, + "step": 8487 + }, + { + "epoch": 0.6543324082639531, + "grad_norm": 3.6996970176696777, + "learning_rate": 2.8198214513262132e-06, + "loss": 0.8539, + "step": 8488 + }, + { + "epoch": 0.6544094973789701, + "grad_norm": 3.8898186683654785, + "learning_rate": 2.8186980049621292e-06, + "loss": 0.9048, + "step": 8489 + }, + { + "epoch": 0.654486586493987, + "grad_norm": 3.663559675216675, + "learning_rate": 2.8175746945909277e-06, + "loss": 0.798, + "step": 8490 + }, + { + "epoch": 0.654563675609004, + "grad_norm": 4.062438488006592, + "learning_rate": 2.816451520282641e-06, + "loss": 0.8531, + "step": 8491 + }, + { + "epoch": 0.654640764724021, + "grad_norm": 3.281128168106079, + "learning_rate": 2.8153284821072925e-06, + "loss": 0.8421, + "step": 8492 + }, + { + "epoch": 0.6547178538390379, + "grad_norm": 3.5460658073425293, + "learning_rate": 2.814205580134899e-06, + "loss": 0.9501, + "step": 8493 + }, + { + "epoch": 0.6547949429540549, + "grad_norm": 3.5473179817199707, + "learning_rate": 2.813082814435469e-06, + "loss": 0.8635, + "step": 8494 + }, + { + "epoch": 0.6548720320690719, + "grad_norm": 3.5406644344329834, + "learning_rate": 2.8119601850789966e-06, + "loss": 0.8653, + "step": 8495 + }, + { + "epoch": 0.6549491211840888, + "grad_norm": 4.069020748138428, + "learning_rate": 2.810837692135475e-06, + "loss": 0.9973, + "step": 8496 + }, + { + "epoch": 0.6550262102991058, + "grad_norm": 3.7306931018829346, + "learning_rate": 2.809715335674885e-06, + "loss": 1.004, + "step": 8497 + }, + { + "epoch": 0.6551032994141227, + "grad_norm": 4.0841288566589355, + "learning_rate": 2.8085931157672007e-06, + "loss": 0.9853, + "step": 8498 + }, + { + "epoch": 0.6551803885291397, + "grad_norm": 3.389622211456299, + "learning_rate": 2.807471032482387e-06, + "loss": 0.9221, + "step": 8499 + }, + { + "epoch": 0.6552574776441566, + "grad_norm": 3.7449147701263428, + "learning_rate": 2.8063490858903987e-06, + "loss": 0.9093, + "step": 8500 + }, + { + "epoch": 0.6553345667591736, + "grad_norm": 3.7935571670532227, + "learning_rate": 2.805227276061182e-06, + "loss": 0.9671, + "step": 8501 + }, + { + "epoch": 0.6554116558741906, + "grad_norm": 3.392277717590332, + "learning_rate": 2.8041056030646787e-06, + "loss": 0.9458, + "step": 8502 + }, + { + "epoch": 0.6554887449892075, + "grad_norm": 4.1073150634765625, + "learning_rate": 2.8029840669708176e-06, + "loss": 0.9082, + "step": 8503 + }, + { + "epoch": 0.6555658341042245, + "grad_norm": 3.53625750541687, + "learning_rate": 2.8018626678495215e-06, + "loss": 0.9127, + "step": 8504 + }, + { + "epoch": 0.6556429232192414, + "grad_norm": 3.6561434268951416, + "learning_rate": 2.8007414057707054e-06, + "loss": 0.9273, + "step": 8505 + }, + { + "epoch": 0.6557200123342584, + "grad_norm": 3.346360683441162, + "learning_rate": 2.7996202808042706e-06, + "loss": 0.8784, + "step": 8506 + }, + { + "epoch": 0.6557971014492754, + "grad_norm": 3.969512462615967, + "learning_rate": 2.7984992930201156e-06, + "loss": 0.8833, + "step": 8507 + }, + { + "epoch": 0.6558741905642923, + "grad_norm": 3.466351270675659, + "learning_rate": 2.7973784424881273e-06, + "loss": 0.8353, + "step": 8508 + }, + { + "epoch": 0.6559512796793093, + "grad_norm": 3.9077634811401367, + "learning_rate": 2.7962577292781856e-06, + "loss": 0.9578, + "step": 8509 + }, + { + "epoch": 0.6560283687943262, + "grad_norm": 4.4450225830078125, + "learning_rate": 2.795137153460163e-06, + "loss": 1.0021, + "step": 8510 + }, + { + "epoch": 0.6561054579093432, + "grad_norm": 3.7983291149139404, + "learning_rate": 2.794016715103918e-06, + "loss": 0.9893, + "step": 8511 + }, + { + "epoch": 0.6561825470243602, + "grad_norm": 3.427762985229492, + "learning_rate": 2.7928964142793048e-06, + "loss": 0.8019, + "step": 8512 + }, + { + "epoch": 0.6562596361393771, + "grad_norm": 4.073552131652832, + "learning_rate": 2.7917762510561698e-06, + "loss": 0.9104, + "step": 8513 + }, + { + "epoch": 0.6563367252543941, + "grad_norm": 3.4051129817962646, + "learning_rate": 2.790656225504349e-06, + "loss": 0.8763, + "step": 8514 + }, + { + "epoch": 0.656413814369411, + "grad_norm": 3.3330910205841064, + "learning_rate": 2.789536337693672e-06, + "loss": 0.8136, + "step": 8515 + }, + { + "epoch": 0.656490903484428, + "grad_norm": 3.8815250396728516, + "learning_rate": 2.7884165876939537e-06, + "loss": 1.0265, + "step": 8516 + }, + { + "epoch": 0.656567992599445, + "grad_norm": 3.9009790420532227, + "learning_rate": 2.787296975575008e-06, + "loss": 0.9191, + "step": 8517 + }, + { + "epoch": 0.6566450817144619, + "grad_norm": 3.823681354522705, + "learning_rate": 2.786177501406635e-06, + "loss": 1.0026, + "step": 8518 + }, + { + "epoch": 0.6567221708294789, + "grad_norm": 3.889641761779785, + "learning_rate": 2.7850581652586296e-06, + "loss": 0.8824, + "step": 8519 + }, + { + "epoch": 0.6567992599444958, + "grad_norm": 3.67506742477417, + "learning_rate": 2.7839389672007753e-06, + "loss": 0.9501, + "step": 8520 + }, + { + "epoch": 0.6568763490595128, + "grad_norm": 4.033994197845459, + "learning_rate": 2.7828199073028504e-06, + "loss": 0.9688, + "step": 8521 + }, + { + "epoch": 0.6569534381745298, + "grad_norm": 3.685767889022827, + "learning_rate": 2.78170098563462e-06, + "loss": 1.0076, + "step": 8522 + }, + { + "epoch": 0.6570305272895467, + "grad_norm": 3.811617374420166, + "learning_rate": 2.7805822022658452e-06, + "loss": 0.8409, + "step": 8523 + }, + { + "epoch": 0.6571076164045637, + "grad_norm": 3.2844462394714355, + "learning_rate": 2.7794635572662755e-06, + "loss": 0.8843, + "step": 8524 + }, + { + "epoch": 0.6571847055195806, + "grad_norm": 3.5882797241210938, + "learning_rate": 2.778345050705654e-06, + "loss": 0.8763, + "step": 8525 + }, + { + "epoch": 0.6572617946345976, + "grad_norm": 4.07551383972168, + "learning_rate": 2.7772266826537103e-06, + "loss": 0.9604, + "step": 8526 + }, + { + "epoch": 0.6573388837496146, + "grad_norm": 3.5869812965393066, + "learning_rate": 2.7761084531801707e-06, + "loss": 1.0136, + "step": 8527 + }, + { + "epoch": 0.6574159728646315, + "grad_norm": 3.7623794078826904, + "learning_rate": 2.774990362354752e-06, + "loss": 0.9086, + "step": 8528 + }, + { + "epoch": 0.6574930619796485, + "grad_norm": 3.60953950881958, + "learning_rate": 2.7738724102471603e-06, + "loss": 0.9353, + "step": 8529 + }, + { + "epoch": 0.6575701510946654, + "grad_norm": 3.6601200103759766, + "learning_rate": 2.772754596927094e-06, + "loss": 0.9573, + "step": 8530 + }, + { + "epoch": 0.6576472402096823, + "grad_norm": 3.794666051864624, + "learning_rate": 2.771636922464246e-06, + "loss": 0.8832, + "step": 8531 + }, + { + "epoch": 0.6577243293246994, + "grad_norm": 3.9697749614715576, + "learning_rate": 2.7705193869282924e-06, + "loss": 1.0734, + "step": 8532 + }, + { + "epoch": 0.6578014184397163, + "grad_norm": 3.8079090118408203, + "learning_rate": 2.7694019903889087e-06, + "loss": 0.969, + "step": 8533 + }, + { + "epoch": 0.6578785075547333, + "grad_norm": 4.480514049530029, + "learning_rate": 2.768284732915758e-06, + "loss": 1.0543, + "step": 8534 + }, + { + "epoch": 0.6579555966697502, + "grad_norm": 3.4868109226226807, + "learning_rate": 2.767167614578496e-06, + "loss": 0.9419, + "step": 8535 + }, + { + "epoch": 0.6580326857847671, + "grad_norm": 3.8815977573394775, + "learning_rate": 2.7660506354467708e-06, + "loss": 0.9569, + "step": 8536 + }, + { + "epoch": 0.6581097748997842, + "grad_norm": 3.8802473545074463, + "learning_rate": 2.7649337955902167e-06, + "loss": 0.9196, + "step": 8537 + }, + { + "epoch": 0.6581868640148011, + "grad_norm": 3.837989330291748, + "learning_rate": 2.763817095078465e-06, + "loss": 0.9033, + "step": 8538 + }, + { + "epoch": 0.6582639531298181, + "grad_norm": 3.4889914989471436, + "learning_rate": 2.762700533981136e-06, + "loss": 0.8692, + "step": 8539 + }, + { + "epoch": 0.658341042244835, + "grad_norm": 3.921881914138794, + "learning_rate": 2.761584112367841e-06, + "loss": 0.9108, + "step": 8540 + }, + { + "epoch": 0.6584181313598519, + "grad_norm": 3.6214163303375244, + "learning_rate": 2.7604678303081833e-06, + "loss": 0.9357, + "step": 8541 + }, + { + "epoch": 0.658495220474869, + "grad_norm": 3.711642265319824, + "learning_rate": 2.7593516878717604e-06, + "loss": 0.8258, + "step": 8542 + }, + { + "epoch": 0.6585723095898859, + "grad_norm": 3.5625925064086914, + "learning_rate": 2.758235685128152e-06, + "loss": 0.9205, + "step": 8543 + }, + { + "epoch": 0.6586493987049029, + "grad_norm": 3.9220776557922363, + "learning_rate": 2.75711982214694e-06, + "loss": 0.9736, + "step": 8544 + }, + { + "epoch": 0.6587264878199198, + "grad_norm": 3.7191176414489746, + "learning_rate": 2.7560040989976894e-06, + "loss": 0.8053, + "step": 8545 + }, + { + "epoch": 0.6588035769349367, + "grad_norm": 3.6026618480682373, + "learning_rate": 2.7548885157499617e-06, + "loss": 0.9155, + "step": 8546 + }, + { + "epoch": 0.6588806660499538, + "grad_norm": 3.873753786087036, + "learning_rate": 2.7537730724733092e-06, + "loss": 0.9101, + "step": 8547 + }, + { + "epoch": 0.6589577551649707, + "grad_norm": 3.89041805267334, + "learning_rate": 2.7526577692372704e-06, + "loss": 0.783, + "step": 8548 + }, + { + "epoch": 0.6590348442799877, + "grad_norm": 3.7555131912231445, + "learning_rate": 2.75154260611138e-06, + "loss": 1.0063, + "step": 8549 + }, + { + "epoch": 0.6591119333950046, + "grad_norm": 3.4990248680114746, + "learning_rate": 2.750427583165164e-06, + "loss": 0.9204, + "step": 8550 + }, + { + "epoch": 0.6591890225100215, + "grad_norm": 3.4794487953186035, + "learning_rate": 2.7493127004681365e-06, + "loss": 0.9723, + "step": 8551 + }, + { + "epoch": 0.6592661116250386, + "grad_norm": 3.5538179874420166, + "learning_rate": 2.7481979580898056e-06, + "loss": 0.8747, + "step": 8552 + }, + { + "epoch": 0.6593432007400555, + "grad_norm": 3.8357620239257812, + "learning_rate": 2.7470833560996717e-06, + "loss": 1.0114, + "step": 8553 + }, + { + "epoch": 0.6594202898550725, + "grad_norm": 3.8175249099731445, + "learning_rate": 2.7459688945672204e-06, + "loss": 0.9458, + "step": 8554 + }, + { + "epoch": 0.6594973789700894, + "grad_norm": 3.8220913410186768, + "learning_rate": 2.744854573561935e-06, + "loss": 0.9557, + "step": 8555 + }, + { + "epoch": 0.6595744680851063, + "grad_norm": 4.2890944480896, + "learning_rate": 2.7437403931532867e-06, + "loss": 1.043, + "step": 8556 + }, + { + "epoch": 0.6596515572001234, + "grad_norm": 4.034401893615723, + "learning_rate": 2.7426263534107394e-06, + "loss": 0.9287, + "step": 8557 + }, + { + "epoch": 0.6597286463151403, + "grad_norm": 3.621591091156006, + "learning_rate": 2.7415124544037497e-06, + "loss": 0.968, + "step": 8558 + }, + { + "epoch": 0.6598057354301573, + "grad_norm": 3.7988715171813965, + "learning_rate": 2.74039869620176e-06, + "loss": 0.9679, + "step": 8559 + }, + { + "epoch": 0.6598828245451742, + "grad_norm": 3.6728246212005615, + "learning_rate": 2.7392850788742088e-06, + "loss": 0.8546, + "step": 8560 + }, + { + "epoch": 0.6599599136601911, + "grad_norm": 3.391083002090454, + "learning_rate": 2.7381716024905236e-06, + "loss": 0.8998, + "step": 8561 + }, + { + "epoch": 0.6600370027752082, + "grad_norm": 3.6413748264312744, + "learning_rate": 2.7370582671201253e-06, + "loss": 1.021, + "step": 8562 + }, + { + "epoch": 0.6601140918902251, + "grad_norm": 3.9035537242889404, + "learning_rate": 2.7359450728324254e-06, + "loss": 0.9325, + "step": 8563 + }, + { + "epoch": 0.6601911810052421, + "grad_norm": 3.603443145751953, + "learning_rate": 2.734832019696821e-06, + "loss": 0.943, + "step": 8564 + }, + { + "epoch": 0.660268270120259, + "grad_norm": 4.327446937561035, + "learning_rate": 2.7337191077827107e-06, + "loss": 0.94, + "step": 8565 + }, + { + "epoch": 0.6603453592352759, + "grad_norm": 3.7741518020629883, + "learning_rate": 2.732606337159476e-06, + "loss": 0.8859, + "step": 8566 + }, + { + "epoch": 0.660422448350293, + "grad_norm": 3.900160074234009, + "learning_rate": 2.731493707896493e-06, + "loss": 0.8494, + "step": 8567 + }, + { + "epoch": 0.6604995374653099, + "grad_norm": 4.106858253479004, + "learning_rate": 2.730381220063131e-06, + "loss": 0.9732, + "step": 8568 + }, + { + "epoch": 0.6605766265803269, + "grad_norm": 3.8731374740600586, + "learning_rate": 2.729268873728743e-06, + "loss": 0.9365, + "step": 8569 + }, + { + "epoch": 0.6606537156953438, + "grad_norm": 4.0173659324646, + "learning_rate": 2.7281566689626804e-06, + "loss": 0.9892, + "step": 8570 + }, + { + "epoch": 0.6607308048103607, + "grad_norm": 3.8889834880828857, + "learning_rate": 2.7270446058342836e-06, + "loss": 0.9164, + "step": 8571 + }, + { + "epoch": 0.6608078939253778, + "grad_norm": 4.212670803070068, + "learning_rate": 2.7259326844128832e-06, + "loss": 1.0294, + "step": 8572 + }, + { + "epoch": 0.6608849830403947, + "grad_norm": 4.06730318069458, + "learning_rate": 2.724820904767804e-06, + "loss": 0.9905, + "step": 8573 + }, + { + "epoch": 0.6609620721554117, + "grad_norm": 3.9518141746520996, + "learning_rate": 2.7237092669683562e-06, + "loss": 0.9677, + "step": 8574 + }, + { + "epoch": 0.6610391612704286, + "grad_norm": 3.7986531257629395, + "learning_rate": 2.722597771083847e-06, + "loss": 0.8483, + "step": 8575 + }, + { + "epoch": 0.6611162503854455, + "grad_norm": 4.189377784729004, + "learning_rate": 2.721486417183571e-06, + "loss": 1.0454, + "step": 8576 + }, + { + "epoch": 0.6611933395004626, + "grad_norm": 3.7273128032684326, + "learning_rate": 2.7203752053368167e-06, + "loss": 0.928, + "step": 8577 + }, + { + "epoch": 0.6612704286154795, + "grad_norm": 3.6438515186309814, + "learning_rate": 2.7192641356128613e-06, + "loss": 0.9048, + "step": 8578 + }, + { + "epoch": 0.6613475177304965, + "grad_norm": 3.6332359313964844, + "learning_rate": 2.718153208080977e-06, + "loss": 0.9138, + "step": 8579 + }, + { + "epoch": 0.6614246068455134, + "grad_norm": 3.474804162979126, + "learning_rate": 2.7170424228104207e-06, + "loss": 0.8285, + "step": 8580 + }, + { + "epoch": 0.6615016959605303, + "grad_norm": 3.8621301651000977, + "learning_rate": 2.7159317798704452e-06, + "loss": 0.9538, + "step": 8581 + }, + { + "epoch": 0.6615787850755473, + "grad_norm": 4.121014595031738, + "learning_rate": 2.7148212793302938e-06, + "loss": 0.9663, + "step": 8582 + }, + { + "epoch": 0.6616558741905643, + "grad_norm": 3.514923572540283, + "learning_rate": 2.7137109212592006e-06, + "loss": 0.8936, + "step": 8583 + }, + { + "epoch": 0.6617329633055813, + "grad_norm": 3.3887276649475098, + "learning_rate": 2.712600705726392e-06, + "loss": 0.9403, + "step": 8584 + }, + { + "epoch": 0.6618100524205982, + "grad_norm": 3.4407477378845215, + "learning_rate": 2.7114906328010803e-06, + "loss": 0.9276, + "step": 8585 + }, + { + "epoch": 0.6618871415356151, + "grad_norm": 3.4507927894592285, + "learning_rate": 2.7103807025524764e-06, + "loss": 0.7437, + "step": 8586 + }, + { + "epoch": 0.6619642306506321, + "grad_norm": 3.491410255432129, + "learning_rate": 2.7092709150497764e-06, + "loss": 0.8576, + "step": 8587 + }, + { + "epoch": 0.6620413197656491, + "grad_norm": 3.461198091506958, + "learning_rate": 2.708161270362171e-06, + "loss": 0.8797, + "step": 8588 + }, + { + "epoch": 0.6621184088806661, + "grad_norm": 3.8753464221954346, + "learning_rate": 2.7070517685588417e-06, + "loss": 0.8896, + "step": 8589 + }, + { + "epoch": 0.662195497995683, + "grad_norm": 3.9158358573913574, + "learning_rate": 2.70594240970896e-06, + "loss": 0.8949, + "step": 8590 + }, + { + "epoch": 0.6622725871106999, + "grad_norm": 12.948868751525879, + "learning_rate": 2.7048331938816863e-06, + "loss": 0.8589, + "step": 8591 + }, + { + "epoch": 0.6623496762257169, + "grad_norm": 3.7209019660949707, + "learning_rate": 2.7037241211461752e-06, + "loss": 0.9651, + "step": 8592 + }, + { + "epoch": 0.6624267653407339, + "grad_norm": 3.9368491172790527, + "learning_rate": 2.7026151915715735e-06, + "loss": 0.9401, + "step": 8593 + }, + { + "epoch": 0.6625038544557509, + "grad_norm": 3.6502487659454346, + "learning_rate": 2.7015064052270156e-06, + "loss": 0.9531, + "step": 8594 + }, + { + "epoch": 0.6625809435707678, + "grad_norm": 3.4706952571868896, + "learning_rate": 2.700397762181631e-06, + "loss": 0.8791, + "step": 8595 + }, + { + "epoch": 0.6626580326857847, + "grad_norm": 3.8239970207214355, + "learning_rate": 2.6992892625045343e-06, + "loss": 1.0303, + "step": 8596 + }, + { + "epoch": 0.6627351218008017, + "grad_norm": 3.561873435974121, + "learning_rate": 2.698180906264837e-06, + "loss": 0.87, + "step": 8597 + }, + { + "epoch": 0.6628122109158187, + "grad_norm": 3.5032331943511963, + "learning_rate": 2.697072693531637e-06, + "loss": 0.9174, + "step": 8598 + }, + { + "epoch": 0.6628893000308357, + "grad_norm": 3.578794479370117, + "learning_rate": 2.695964624374029e-06, + "loss": 0.926, + "step": 8599 + }, + { + "epoch": 0.6629663891458526, + "grad_norm": 3.583988904953003, + "learning_rate": 2.6948566988610938e-06, + "loss": 0.8603, + "step": 8600 + }, + { + "epoch": 0.6630434782608695, + "grad_norm": 3.555171489715576, + "learning_rate": 2.693748917061906e-06, + "loss": 0.8882, + "step": 8601 + }, + { + "epoch": 0.6631205673758865, + "grad_norm": 3.73492431640625, + "learning_rate": 2.692641279045527e-06, + "loss": 0.961, + "step": 8602 + }, + { + "epoch": 0.6631976564909035, + "grad_norm": 3.6243600845336914, + "learning_rate": 2.691533784881014e-06, + "loss": 0.8199, + "step": 8603 + }, + { + "epoch": 0.6632747456059205, + "grad_norm": 3.813415288925171, + "learning_rate": 2.6904264346374135e-06, + "loss": 0.8965, + "step": 8604 + }, + { + "epoch": 0.6633518347209374, + "grad_norm": 3.7679946422576904, + "learning_rate": 2.6893192283837636e-06, + "loss": 0.9742, + "step": 8605 + }, + { + "epoch": 0.6634289238359543, + "grad_norm": 3.7962825298309326, + "learning_rate": 2.688212166189095e-06, + "loss": 0.9346, + "step": 8606 + }, + { + "epoch": 0.6635060129509713, + "grad_norm": 3.616288900375366, + "learning_rate": 2.6871052481224217e-06, + "loss": 0.9487, + "step": 8607 + }, + { + "epoch": 0.6635831020659883, + "grad_norm": 3.7229197025299072, + "learning_rate": 2.685998474252758e-06, + "loss": 0.8765, + "step": 8608 + }, + { + "epoch": 0.6636601911810053, + "grad_norm": 3.9187211990356445, + "learning_rate": 2.684891844649103e-06, + "loss": 0.9206, + "step": 8609 + }, + { + "epoch": 0.6637372802960222, + "grad_norm": 3.438185453414917, + "learning_rate": 2.6837853593804554e-06, + "loss": 0.871, + "step": 8610 + }, + { + "epoch": 0.6638143694110391, + "grad_norm": 3.9521586894989014, + "learning_rate": 2.682679018515792e-06, + "loss": 0.9515, + "step": 8611 + }, + { + "epoch": 0.6638914585260561, + "grad_norm": 3.899041175842285, + "learning_rate": 2.68157282212409e-06, + "loss": 0.9129, + "step": 8612 + }, + { + "epoch": 0.663968547641073, + "grad_norm": 3.531867265701294, + "learning_rate": 2.680466770274316e-06, + "loss": 0.8427, + "step": 8613 + }, + { + "epoch": 0.6640456367560901, + "grad_norm": 3.4896748065948486, + "learning_rate": 2.679360863035425e-06, + "loss": 0.7873, + "step": 8614 + }, + { + "epoch": 0.664122725871107, + "grad_norm": 3.5182361602783203, + "learning_rate": 2.678255100476366e-06, + "loss": 0.8737, + "step": 8615 + }, + { + "epoch": 0.6641998149861239, + "grad_norm": 3.8621270656585693, + "learning_rate": 2.6771494826660782e-06, + "loss": 0.9002, + "step": 8616 + }, + { + "epoch": 0.6642769041011409, + "grad_norm": 3.783743143081665, + "learning_rate": 2.6760440096734875e-06, + "loss": 0.9574, + "step": 8617 + }, + { + "epoch": 0.6643539932161578, + "grad_norm": 3.6670119762420654, + "learning_rate": 2.674938681567517e-06, + "loss": 0.824, + "step": 8618 + }, + { + "epoch": 0.6644310823311749, + "grad_norm": 3.6787564754486084, + "learning_rate": 2.6738334984170785e-06, + "loss": 0.9541, + "step": 8619 + }, + { + "epoch": 0.6645081714461918, + "grad_norm": 3.8661928176879883, + "learning_rate": 2.672728460291073e-06, + "loss": 0.9713, + "step": 8620 + }, + { + "epoch": 0.6645852605612088, + "grad_norm": 5.067914009094238, + "learning_rate": 2.671623567258398e-06, + "loss": 1.0446, + "step": 8621 + }, + { + "epoch": 0.6646623496762257, + "grad_norm": 3.7516796588897705, + "learning_rate": 2.6705188193879316e-06, + "loss": 0.8746, + "step": 8622 + }, + { + "epoch": 0.6647394387912426, + "grad_norm": 3.7899248600006104, + "learning_rate": 2.669414216748552e-06, + "loss": 0.9415, + "step": 8623 + }, + { + "epoch": 0.6648165279062597, + "grad_norm": 3.813930034637451, + "learning_rate": 2.6683097594091257e-06, + "loss": 1.0627, + "step": 8624 + }, + { + "epoch": 0.6648936170212766, + "grad_norm": 4.484930992126465, + "learning_rate": 2.6672054474385102e-06, + "loss": 1.0192, + "step": 8625 + }, + { + "epoch": 0.6649707061362936, + "grad_norm": 3.5336852073669434, + "learning_rate": 2.666101280905553e-06, + "loss": 0.8844, + "step": 8626 + }, + { + "epoch": 0.6650477952513105, + "grad_norm": 3.831902503967285, + "learning_rate": 2.664997259879095e-06, + "loss": 0.8492, + "step": 8627 + }, + { + "epoch": 0.6651248843663274, + "grad_norm": 3.4217164516448975, + "learning_rate": 2.663893384427963e-06, + "loss": 0.9379, + "step": 8628 + }, + { + "epoch": 0.6652019734813445, + "grad_norm": 3.7807199954986572, + "learning_rate": 2.66278965462098e-06, + "loss": 0.9947, + "step": 8629 + }, + { + "epoch": 0.6652790625963614, + "grad_norm": 4.046280384063721, + "learning_rate": 2.661686070526956e-06, + "loss": 0.9283, + "step": 8630 + }, + { + "epoch": 0.6653561517113784, + "grad_norm": 3.804257392883301, + "learning_rate": 2.6605826322146954e-06, + "loss": 1.0612, + "step": 8631 + }, + { + "epoch": 0.6654332408263953, + "grad_norm": 3.5956528186798096, + "learning_rate": 2.659479339752994e-06, + "loss": 0.8869, + "step": 8632 + }, + { + "epoch": 0.6655103299414122, + "grad_norm": 3.916138172149658, + "learning_rate": 2.6583761932106323e-06, + "loss": 0.8914, + "step": 8633 + }, + { + "epoch": 0.6655874190564293, + "grad_norm": 4.105411052703857, + "learning_rate": 2.6572731926563867e-06, + "loss": 0.961, + "step": 8634 + }, + { + "epoch": 0.6656645081714462, + "grad_norm": 3.324625015258789, + "learning_rate": 2.6561703381590244e-06, + "loss": 0.8609, + "step": 8635 + }, + { + "epoch": 0.6657415972864632, + "grad_norm": 3.7240700721740723, + "learning_rate": 2.6550676297873023e-06, + "loss": 0.8107, + "step": 8636 + }, + { + "epoch": 0.6658186864014801, + "grad_norm": 3.6989731788635254, + "learning_rate": 2.6539650676099687e-06, + "loss": 0.8776, + "step": 8637 + }, + { + "epoch": 0.665895775516497, + "grad_norm": 3.833456039428711, + "learning_rate": 2.652862651695765e-06, + "loss": 0.9177, + "step": 8638 + }, + { + "epoch": 0.6659728646315141, + "grad_norm": 3.728043794631958, + "learning_rate": 2.651760382113417e-06, + "loss": 0.9688, + "step": 8639 + }, + { + "epoch": 0.666049953746531, + "grad_norm": 3.813891649246216, + "learning_rate": 2.6506582589316463e-06, + "loss": 0.9131, + "step": 8640 + }, + { + "epoch": 0.666127042861548, + "grad_norm": 3.5366103649139404, + "learning_rate": 2.649556282219167e-06, + "loss": 0.9076, + "step": 8641 + }, + { + "epoch": 0.6662041319765649, + "grad_norm": 3.920154333114624, + "learning_rate": 2.64845445204468e-06, + "loss": 0.9324, + "step": 8642 + }, + { + "epoch": 0.6662812210915818, + "grad_norm": 4.236634731292725, + "learning_rate": 2.64735276847688e-06, + "loss": 0.8984, + "step": 8643 + }, + { + "epoch": 0.6663583102065989, + "grad_norm": 3.766432762145996, + "learning_rate": 2.646251231584449e-06, + "loss": 0.9886, + "step": 8644 + }, + { + "epoch": 0.6664353993216158, + "grad_norm": 3.6179211139678955, + "learning_rate": 2.6451498414360633e-06, + "loss": 0.7535, + "step": 8645 + }, + { + "epoch": 0.6665124884366328, + "grad_norm": 3.5795180797576904, + "learning_rate": 2.644048598100388e-06, + "loss": 0.8096, + "step": 8646 + }, + { + "epoch": 0.6665895775516497, + "grad_norm": 3.80401873588562, + "learning_rate": 2.642947501646082e-06, + "loss": 0.9594, + "step": 8647 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 3.3994295597076416, + "learning_rate": 2.6418465521417914e-06, + "loss": 0.7756, + "step": 8648 + }, + { + "epoch": 0.6667437557816837, + "grad_norm": 3.6575098037719727, + "learning_rate": 2.6407457496561573e-06, + "loss": 0.9767, + "step": 8649 + }, + { + "epoch": 0.6668208448967006, + "grad_norm": 3.2171177864074707, + "learning_rate": 2.6396450942578046e-06, + "loss": 0.8447, + "step": 8650 + }, + { + "epoch": 0.6668979340117176, + "grad_norm": 3.508512496948242, + "learning_rate": 2.638544586015356e-06, + "loss": 0.9398, + "step": 8651 + }, + { + "epoch": 0.6669750231267345, + "grad_norm": 3.8532354831695557, + "learning_rate": 2.6374442249974214e-06, + "loss": 0.988, + "step": 8652 + }, + { + "epoch": 0.6670521122417514, + "grad_norm": 4.1069416999816895, + "learning_rate": 2.6363440112726037e-06, + "loss": 0.9103, + "step": 8653 + }, + { + "epoch": 0.6671292013567685, + "grad_norm": 3.5245847702026367, + "learning_rate": 2.6352439449094952e-06, + "loss": 0.9636, + "step": 8654 + }, + { + "epoch": 0.6672062904717854, + "grad_norm": 3.4307544231414795, + "learning_rate": 2.6341440259766792e-06, + "loss": 0.8636, + "step": 8655 + }, + { + "epoch": 0.6672833795868024, + "grad_norm": 3.6783254146575928, + "learning_rate": 2.6330442545427303e-06, + "loss": 0.8899, + "step": 8656 + }, + { + "epoch": 0.6673604687018193, + "grad_norm": 3.7773871421813965, + "learning_rate": 2.6319446306762136e-06, + "loss": 0.9547, + "step": 8657 + }, + { + "epoch": 0.6674375578168362, + "grad_norm": 3.804189920425415, + "learning_rate": 2.6308451544456863e-06, + "loss": 0.9195, + "step": 8658 + }, + { + "epoch": 0.6675146469318533, + "grad_norm": 3.798102855682373, + "learning_rate": 2.6297458259196915e-06, + "loss": 0.9327, + "step": 8659 + }, + { + "epoch": 0.6675917360468702, + "grad_norm": 4.101744174957275, + "learning_rate": 2.628646645166769e-06, + "loss": 1.0034, + "step": 8660 + }, + { + "epoch": 0.6676688251618872, + "grad_norm": 3.816171884536743, + "learning_rate": 2.6275476122554467e-06, + "loss": 1.0159, + "step": 8661 + }, + { + "epoch": 0.6677459142769041, + "grad_norm": 3.46830677986145, + "learning_rate": 2.6264487272542427e-06, + "loss": 0.9187, + "step": 8662 + }, + { + "epoch": 0.667823003391921, + "grad_norm": 3.302978515625, + "learning_rate": 2.625349990231668e-06, + "loss": 0.942, + "step": 8663 + }, + { + "epoch": 0.667900092506938, + "grad_norm": 3.929262399673462, + "learning_rate": 2.624251401256225e-06, + "loss": 0.986, + "step": 8664 + }, + { + "epoch": 0.667977181621955, + "grad_norm": 4.437450408935547, + "learning_rate": 2.623152960396401e-06, + "loss": 0.8384, + "step": 8665 + }, + { + "epoch": 0.668054270736972, + "grad_norm": 3.7824137210845947, + "learning_rate": 2.622054667720679e-06, + "loss": 0.9588, + "step": 8666 + }, + { + "epoch": 0.6681313598519889, + "grad_norm": 3.94724702835083, + "learning_rate": 2.6209565232975342e-06, + "loss": 0.875, + "step": 8667 + }, + { + "epoch": 0.6682084489670058, + "grad_norm": 3.6828274726867676, + "learning_rate": 2.619858527195428e-06, + "loss": 0.914, + "step": 8668 + }, + { + "epoch": 0.6682855380820228, + "grad_norm": 3.615562677383423, + "learning_rate": 2.618760679482817e-06, + "loss": 0.9406, + "step": 8669 + }, + { + "epoch": 0.6683626271970398, + "grad_norm": 3.8135929107666016, + "learning_rate": 2.617662980228144e-06, + "loss": 0.8714, + "step": 8670 + }, + { + "epoch": 0.6684397163120568, + "grad_norm": 3.3155691623687744, + "learning_rate": 2.6165654294998442e-06, + "loss": 0.7839, + "step": 8671 + }, + { + "epoch": 0.6685168054270737, + "grad_norm": 3.6126980781555176, + "learning_rate": 2.6154680273663468e-06, + "loss": 0.8381, + "step": 8672 + }, + { + "epoch": 0.6685938945420906, + "grad_norm": 4.058097839355469, + "learning_rate": 2.614370773896069e-06, + "loss": 1.006, + "step": 8673 + }, + { + "epoch": 0.6686709836571076, + "grad_norm": 3.903541088104248, + "learning_rate": 2.613273669157417e-06, + "loss": 0.8644, + "step": 8674 + }, + { + "epoch": 0.6687480727721246, + "grad_norm": 3.7401607036590576, + "learning_rate": 2.612176713218792e-06, + "loss": 0.9988, + "step": 8675 + }, + { + "epoch": 0.6688251618871416, + "grad_norm": 3.917459011077881, + "learning_rate": 2.61107990614858e-06, + "loss": 0.9585, + "step": 8676 + }, + { + "epoch": 0.6689022510021585, + "grad_norm": 4.191850185394287, + "learning_rate": 2.6099832480151642e-06, + "loss": 0.931, + "step": 8677 + }, + { + "epoch": 0.6689793401171754, + "grad_norm": 3.9755399227142334, + "learning_rate": 2.6088867388869143e-06, + "loss": 1.0736, + "step": 8678 + }, + { + "epoch": 0.6690564292321924, + "grad_norm": 3.440880298614502, + "learning_rate": 2.6077903788321924e-06, + "loss": 1.0059, + "step": 8679 + }, + { + "epoch": 0.6691335183472094, + "grad_norm": 3.5236377716064453, + "learning_rate": 2.606694167919353e-06, + "loss": 0.9046, + "step": 8680 + }, + { + "epoch": 0.6692106074622264, + "grad_norm": 3.7371933460235596, + "learning_rate": 2.6055981062167348e-06, + "loss": 0.9506, + "step": 8681 + }, + { + "epoch": 0.6692876965772433, + "grad_norm": 3.5741524696350098, + "learning_rate": 2.6045021937926738e-06, + "loss": 0.8337, + "step": 8682 + }, + { + "epoch": 0.6693647856922602, + "grad_norm": 3.534313678741455, + "learning_rate": 2.6034064307154944e-06, + "loss": 0.8608, + "step": 8683 + }, + { + "epoch": 0.6694418748072772, + "grad_norm": 4.026943206787109, + "learning_rate": 2.6023108170535115e-06, + "loss": 0.909, + "step": 8684 + }, + { + "epoch": 0.6695189639222942, + "grad_norm": 3.7044715881347656, + "learning_rate": 2.6012153528750316e-06, + "loss": 0.9678, + "step": 8685 + }, + { + "epoch": 0.6695960530373112, + "grad_norm": 3.3647007942199707, + "learning_rate": 2.6001200382483525e-06, + "loss": 0.7775, + "step": 8686 + }, + { + "epoch": 0.6696731421523281, + "grad_norm": 3.695740222930908, + "learning_rate": 2.599024873241758e-06, + "loss": 0.9464, + "step": 8687 + }, + { + "epoch": 0.669750231267345, + "grad_norm": 3.711057662963867, + "learning_rate": 2.5979298579235276e-06, + "loss": 0.9555, + "step": 8688 + }, + { + "epoch": 0.669827320382362, + "grad_norm": 3.6604957580566406, + "learning_rate": 2.5968349923619297e-06, + "loss": 0.9571, + "step": 8689 + }, + { + "epoch": 0.669904409497379, + "grad_norm": 3.690217971801758, + "learning_rate": 2.5957402766252238e-06, + "loss": 0.8819, + "step": 8690 + }, + { + "epoch": 0.669981498612396, + "grad_norm": 3.983525514602661, + "learning_rate": 2.594645710781661e-06, + "loss": 0.9634, + "step": 8691 + }, + { + "epoch": 0.6700585877274129, + "grad_norm": 3.4786341190338135, + "learning_rate": 2.59355129489948e-06, + "loss": 0.9043, + "step": 8692 + }, + { + "epoch": 0.6701356768424298, + "grad_norm": 3.7870969772338867, + "learning_rate": 2.592457029046912e-06, + "loss": 0.8991, + "step": 8693 + }, + { + "epoch": 0.6702127659574468, + "grad_norm": 3.683763027191162, + "learning_rate": 2.5913629132921784e-06, + "loss": 0.8989, + "step": 8694 + }, + { + "epoch": 0.6702898550724637, + "grad_norm": 3.569939136505127, + "learning_rate": 2.5902689477034936e-06, + "loss": 0.9178, + "step": 8695 + }, + { + "epoch": 0.6703669441874808, + "grad_norm": 3.9141666889190674, + "learning_rate": 2.589175132349059e-06, + "loss": 0.9937, + "step": 8696 + }, + { + "epoch": 0.6704440333024977, + "grad_norm": 3.4919772148132324, + "learning_rate": 2.588081467297069e-06, + "loss": 0.9049, + "step": 8697 + }, + { + "epoch": 0.6705211224175146, + "grad_norm": 3.8324713706970215, + "learning_rate": 2.5869879526157073e-06, + "loss": 1.0839, + "step": 8698 + }, + { + "epoch": 0.6705982115325316, + "grad_norm": 4.001489639282227, + "learning_rate": 2.58589458837315e-06, + "loss": 0.9274, + "step": 8699 + }, + { + "epoch": 0.6706753006475485, + "grad_norm": 4.185301780700684, + "learning_rate": 2.584801374637562e-06, + "loss": 0.9855, + "step": 8700 + }, + { + "epoch": 0.6707523897625656, + "grad_norm": 3.5428168773651123, + "learning_rate": 2.583708311477102e-06, + "loss": 0.9486, + "step": 8701 + }, + { + "epoch": 0.6708294788775825, + "grad_norm": 3.9238369464874268, + "learning_rate": 2.582615398959912e-06, + "loss": 0.9111, + "step": 8702 + }, + { + "epoch": 0.6709065679925994, + "grad_norm": 4.060305595397949, + "learning_rate": 2.5815226371541325e-06, + "loss": 0.9043, + "step": 8703 + }, + { + "epoch": 0.6709836571076164, + "grad_norm": 3.543778896331787, + "learning_rate": 2.5804300261278903e-06, + "loss": 0.8829, + "step": 8704 + }, + { + "epoch": 0.6710607462226333, + "grad_norm": 3.358534336090088, + "learning_rate": 2.579337565949305e-06, + "loss": 0.9352, + "step": 8705 + }, + { + "epoch": 0.6711378353376504, + "grad_norm": 4.072521686553955, + "learning_rate": 2.578245256686488e-06, + "loss": 0.9195, + "step": 8706 + }, + { + "epoch": 0.6712149244526673, + "grad_norm": 3.970937728881836, + "learning_rate": 2.577153098407534e-06, + "loss": 0.878, + "step": 8707 + }, + { + "epoch": 0.6712920135676842, + "grad_norm": 3.6512842178344727, + "learning_rate": 2.576061091180536e-06, + "loss": 0.8582, + "step": 8708 + }, + { + "epoch": 0.6713691026827012, + "grad_norm": 4.31977653503418, + "learning_rate": 2.5749692350735756e-06, + "loss": 0.8524, + "step": 8709 + }, + { + "epoch": 0.6714461917977181, + "grad_norm": 3.4686012268066406, + "learning_rate": 2.5738775301547238e-06, + "loss": 0.8528, + "step": 8710 + }, + { + "epoch": 0.6715232809127352, + "grad_norm": 3.668100595474243, + "learning_rate": 2.572785976492043e-06, + "loss": 0.8513, + "step": 8711 + }, + { + "epoch": 0.6716003700277521, + "grad_norm": 3.792353868484497, + "learning_rate": 2.5716945741535864e-06, + "loss": 0.7951, + "step": 8712 + }, + { + "epoch": 0.671677459142769, + "grad_norm": 3.5469858646392822, + "learning_rate": 2.570603323207396e-06, + "loss": 0.868, + "step": 8713 + }, + { + "epoch": 0.671754548257786, + "grad_norm": 3.5623743534088135, + "learning_rate": 2.5695122237215053e-06, + "loss": 0.7898, + "step": 8714 + }, + { + "epoch": 0.6718316373728029, + "grad_norm": 5.158936023712158, + "learning_rate": 2.56842127576394e-06, + "loss": 0.8926, + "step": 8715 + }, + { + "epoch": 0.67190872648782, + "grad_norm": 3.4081263542175293, + "learning_rate": 2.5673304794027143e-06, + "loss": 0.8877, + "step": 8716 + }, + { + "epoch": 0.6719858156028369, + "grad_norm": 3.6705238819122314, + "learning_rate": 2.566239834705837e-06, + "loss": 0.8883, + "step": 8717 + }, + { + "epoch": 0.6720629047178538, + "grad_norm": 3.546715497970581, + "learning_rate": 2.5651493417412986e-06, + "loss": 0.7861, + "step": 8718 + }, + { + "epoch": 0.6721399938328708, + "grad_norm": 3.700578212738037, + "learning_rate": 2.5640590005770883e-06, + "loss": 0.867, + "step": 8719 + }, + { + "epoch": 0.6722170829478877, + "grad_norm": 4.1538472175598145, + "learning_rate": 2.5629688112811835e-06, + "loss": 0.8547, + "step": 8720 + }, + { + "epoch": 0.6722941720629048, + "grad_norm": 3.4912803173065186, + "learning_rate": 2.561878773921551e-06, + "loss": 0.7669, + "step": 8721 + }, + { + "epoch": 0.6723712611779217, + "grad_norm": 3.5746119022369385, + "learning_rate": 2.56078888856615e-06, + "loss": 0.87, + "step": 8722 + }, + { + "epoch": 0.6724483502929386, + "grad_norm": 4.2662811279296875, + "learning_rate": 2.559699155282931e-06, + "loss": 0.9485, + "step": 8723 + }, + { + "epoch": 0.6725254394079556, + "grad_norm": 4.090740203857422, + "learning_rate": 2.558609574139829e-06, + "loss": 0.8954, + "step": 8724 + }, + { + "epoch": 0.6726025285229725, + "grad_norm": 3.631227970123291, + "learning_rate": 2.557520145204776e-06, + "loss": 0.9711, + "step": 8725 + }, + { + "epoch": 0.6726796176379896, + "grad_norm": 4.057615756988525, + "learning_rate": 2.5564308685456916e-06, + "loss": 1.0421, + "step": 8726 + }, + { + "epoch": 0.6727567067530065, + "grad_norm": 3.319061040878296, + "learning_rate": 2.555341744230487e-06, + "loss": 0.8149, + "step": 8727 + }, + { + "epoch": 0.6728337958680234, + "grad_norm": 3.737069606781006, + "learning_rate": 2.5542527723270653e-06, + "loss": 0.9622, + "step": 8728 + }, + { + "epoch": 0.6729108849830404, + "grad_norm": 4.169541358947754, + "learning_rate": 2.553163952903315e-06, + "loss": 0.9304, + "step": 8729 + }, + { + "epoch": 0.6729879740980573, + "grad_norm": 3.955720901489258, + "learning_rate": 2.552075286027119e-06, + "loss": 0.9517, + "step": 8730 + }, + { + "epoch": 0.6730650632130744, + "grad_norm": 3.720717191696167, + "learning_rate": 2.5509867717663516e-06, + "loss": 0.8208, + "step": 8731 + }, + { + "epoch": 0.6731421523280913, + "grad_norm": 4.123862266540527, + "learning_rate": 2.5498984101888747e-06, + "loss": 0.8568, + "step": 8732 + }, + { + "epoch": 0.6732192414431082, + "grad_norm": 4.052290439605713, + "learning_rate": 2.5488102013625425e-06, + "loss": 1.0482, + "step": 8733 + }, + { + "epoch": 0.6732963305581252, + "grad_norm": 3.5889370441436768, + "learning_rate": 2.5477221453552015e-06, + "loss": 0.9129, + "step": 8734 + }, + { + "epoch": 0.6733734196731421, + "grad_norm": 4.095925331115723, + "learning_rate": 2.5466342422346813e-06, + "loss": 0.9051, + "step": 8735 + }, + { + "epoch": 0.6734505087881592, + "grad_norm": 3.5070693492889404, + "learning_rate": 2.5455464920688105e-06, + "loss": 0.894, + "step": 8736 + }, + { + "epoch": 0.6735275979031761, + "grad_norm": 3.590304136276245, + "learning_rate": 2.5444588949254035e-06, + "loss": 0.9069, + "step": 8737 + }, + { + "epoch": 0.673604687018193, + "grad_norm": 3.7862966060638428, + "learning_rate": 2.5433714508722674e-06, + "loss": 0.9632, + "step": 8738 + }, + { + "epoch": 0.67368177613321, + "grad_norm": 3.6480305194854736, + "learning_rate": 2.5422841599771995e-06, + "loss": 0.9864, + "step": 8739 + }, + { + "epoch": 0.6737588652482269, + "grad_norm": 3.975153923034668, + "learning_rate": 2.541197022307984e-06, + "loss": 0.8769, + "step": 8740 + }, + { + "epoch": 0.673835954363244, + "grad_norm": 3.6856942176818848, + "learning_rate": 2.5401100379323994e-06, + "loss": 0.9895, + "step": 8741 + }, + { + "epoch": 0.6739130434782609, + "grad_norm": 3.8510794639587402, + "learning_rate": 2.539023206918212e-06, + "loss": 0.842, + "step": 8742 + }, + { + "epoch": 0.6739901325932778, + "grad_norm": 3.5220234394073486, + "learning_rate": 2.537936529333186e-06, + "loss": 0.8066, + "step": 8743 + }, + { + "epoch": 0.6740672217082948, + "grad_norm": 3.950100898742676, + "learning_rate": 2.536850005245064e-06, + "loss": 1.1013, + "step": 8744 + }, + { + "epoch": 0.6741443108233117, + "grad_norm": 3.5776264667510986, + "learning_rate": 2.535763634721587e-06, + "loss": 0.9417, + "step": 8745 + }, + { + "epoch": 0.6742213999383287, + "grad_norm": 3.58642578125, + "learning_rate": 2.5346774178304847e-06, + "loss": 0.8659, + "step": 8746 + }, + { + "epoch": 0.6742984890533457, + "grad_norm": 4.12030029296875, + "learning_rate": 2.5335913546394776e-06, + "loss": 0.9124, + "step": 8747 + }, + { + "epoch": 0.6743755781683626, + "grad_norm": 3.6494553089141846, + "learning_rate": 2.532505445216275e-06, + "loss": 0.9372, + "step": 8748 + }, + { + "epoch": 0.6744526672833796, + "grad_norm": 3.9090404510498047, + "learning_rate": 2.5314196896285804e-06, + "loss": 0.9739, + "step": 8749 + }, + { + "epoch": 0.6745297563983965, + "grad_norm": 3.881211280822754, + "learning_rate": 2.530334087944082e-06, + "loss": 0.935, + "step": 8750 + }, + { + "epoch": 0.6746068455134135, + "grad_norm": 3.7291910648345947, + "learning_rate": 2.5292486402304617e-06, + "loss": 0.83, + "step": 8751 + }, + { + "epoch": 0.6746839346284305, + "grad_norm": 3.3748013973236084, + "learning_rate": 2.5281633465553923e-06, + "loss": 0.8803, + "step": 8752 + }, + { + "epoch": 0.6747610237434474, + "grad_norm": 3.430265188217163, + "learning_rate": 2.527078206986537e-06, + "loss": 0.9526, + "step": 8753 + }, + { + "epoch": 0.6748381128584644, + "grad_norm": 3.7419652938842773, + "learning_rate": 2.525993221591549e-06, + "loss": 0.8539, + "step": 8754 + }, + { + "epoch": 0.6749152019734813, + "grad_norm": 3.45969557762146, + "learning_rate": 2.524908390438069e-06, + "loss": 0.9154, + "step": 8755 + }, + { + "epoch": 0.6749922910884983, + "grad_norm": 4.311490535736084, + "learning_rate": 2.5238237135937314e-06, + "loss": 0.9163, + "step": 8756 + }, + { + "epoch": 0.6750693802035153, + "grad_norm": 3.4476709365844727, + "learning_rate": 2.5227391911261617e-06, + "loss": 0.7989, + "step": 8757 + }, + { + "epoch": 0.6751464693185322, + "grad_norm": 3.477334976196289, + "learning_rate": 2.5216548231029726e-06, + "loss": 0.8753, + "step": 8758 + }, + { + "epoch": 0.6752235584335492, + "grad_norm": 3.5356128215789795, + "learning_rate": 2.5205706095917703e-06, + "loss": 0.9253, + "step": 8759 + }, + { + "epoch": 0.6753006475485661, + "grad_norm": 3.273407459259033, + "learning_rate": 2.5194865506601507e-06, + "loss": 0.9407, + "step": 8760 + }, + { + "epoch": 0.6753777366635831, + "grad_norm": 3.790802001953125, + "learning_rate": 2.518402646375696e-06, + "loss": 0.9698, + "step": 8761 + }, + { + "epoch": 0.6754548257786, + "grad_norm": 3.5114316940307617, + "learning_rate": 2.5173188968059836e-06, + "loss": 0.9552, + "step": 8762 + }, + { + "epoch": 0.675531914893617, + "grad_norm": 3.583873987197876, + "learning_rate": 2.5162353020185798e-06, + "loss": 0.9253, + "step": 8763 + }, + { + "epoch": 0.675609004008634, + "grad_norm": 3.8350162506103516, + "learning_rate": 2.5151518620810413e-06, + "loss": 0.8797, + "step": 8764 + }, + { + "epoch": 0.6756860931236509, + "grad_norm": 3.6162362098693848, + "learning_rate": 2.514068577060917e-06, + "loss": 0.9368, + "step": 8765 + }, + { + "epoch": 0.6757631822386679, + "grad_norm": 3.375894546508789, + "learning_rate": 2.51298544702574e-06, + "loss": 0.8251, + "step": 8766 + }, + { + "epoch": 0.6758402713536849, + "grad_norm": 3.4712395668029785, + "learning_rate": 2.5119024720430396e-06, + "loss": 0.8507, + "step": 8767 + }, + { + "epoch": 0.6759173604687018, + "grad_norm": 3.914985179901123, + "learning_rate": 2.5108196521803336e-06, + "loss": 0.9589, + "step": 8768 + }, + { + "epoch": 0.6759944495837188, + "grad_norm": 4.005273342132568, + "learning_rate": 2.50973698750513e-06, + "loss": 0.8717, + "step": 8769 + }, + { + "epoch": 0.6760715386987357, + "grad_norm": 3.7250683307647705, + "learning_rate": 2.508654478084929e-06, + "loss": 0.9127, + "step": 8770 + }, + { + "epoch": 0.6761486278137527, + "grad_norm": 3.753918170928955, + "learning_rate": 2.5075721239872196e-06, + "loss": 0.8688, + "step": 8771 + }, + { + "epoch": 0.6762257169287696, + "grad_norm": 3.8328804969787598, + "learning_rate": 2.5064899252794783e-06, + "loss": 0.9741, + "step": 8772 + }, + { + "epoch": 0.6763028060437866, + "grad_norm": 3.605774402618408, + "learning_rate": 2.5054078820291754e-06, + "loss": 0.9508, + "step": 8773 + }, + { + "epoch": 0.6763798951588036, + "grad_norm": 3.709643840789795, + "learning_rate": 2.5043259943037714e-06, + "loss": 0.9305, + "step": 8774 + }, + { + "epoch": 0.6764569842738205, + "grad_norm": 3.6249752044677734, + "learning_rate": 2.5032442621707164e-06, + "loss": 0.9045, + "step": 8775 + }, + { + "epoch": 0.6765340733888375, + "grad_norm": 3.642636299133301, + "learning_rate": 2.5021626856974524e-06, + "loss": 0.9249, + "step": 8776 + }, + { + "epoch": 0.6766111625038544, + "grad_norm": 3.822664976119995, + "learning_rate": 2.501081264951406e-06, + "loss": 1.0165, + "step": 8777 + }, + { + "epoch": 0.6766882516188714, + "grad_norm": 3.8768982887268066, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.9671, + "step": 8778 + }, + { + "epoch": 0.6767653407338884, + "grad_norm": 3.6057324409484863, + "learning_rate": 2.498918890910648e-06, + "loss": 0.8731, + "step": 8779 + }, + { + "epoch": 0.6768424298489053, + "grad_norm": 3.6541295051574707, + "learning_rate": 2.497837937750749e-06, + "loss": 0.9921, + "step": 8780 + }, + { + "epoch": 0.6769195189639223, + "grad_norm": 3.908982038497925, + "learning_rate": 2.496757140587695e-06, + "loss": 0.9808, + "step": 8781 + }, + { + "epoch": 0.6769966080789392, + "grad_norm": 3.8420374393463135, + "learning_rate": 2.495676499488871e-06, + "loss": 1.0192, + "step": 8782 + }, + { + "epoch": 0.6770736971939562, + "grad_norm": 3.714198350906372, + "learning_rate": 2.494596014521645e-06, + "loss": 0.9404, + "step": 8783 + }, + { + "epoch": 0.6771507863089732, + "grad_norm": 3.6355252265930176, + "learning_rate": 2.4935156857533816e-06, + "loss": 0.9077, + "step": 8784 + }, + { + "epoch": 0.6772278754239901, + "grad_norm": 3.312899112701416, + "learning_rate": 2.4924355132514337e-06, + "loss": 0.8434, + "step": 8785 + }, + { + "epoch": 0.6773049645390071, + "grad_norm": 3.788271188735962, + "learning_rate": 2.491355497083145e-06, + "loss": 0.909, + "step": 8786 + }, + { + "epoch": 0.677382053654024, + "grad_norm": 3.835744619369507, + "learning_rate": 2.490275637315848e-06, + "loss": 0.9578, + "step": 8787 + }, + { + "epoch": 0.677459142769041, + "grad_norm": 3.8753020763397217, + "learning_rate": 2.489195934016867e-06, + "loss": 1.0083, + "step": 8788 + }, + { + "epoch": 0.677536231884058, + "grad_norm": 3.9721779823303223, + "learning_rate": 2.488116387253516e-06, + "loss": 0.9055, + "step": 8789 + }, + { + "epoch": 0.6776133209990749, + "grad_norm": 4.252837181091309, + "learning_rate": 2.487036997093098e-06, + "loss": 0.9204, + "step": 8790 + }, + { + "epoch": 0.6776904101140919, + "grad_norm": 3.932285785675049, + "learning_rate": 2.485957763602911e-06, + "loss": 0.9707, + "step": 8791 + }, + { + "epoch": 0.6777674992291088, + "grad_norm": 3.8516898155212402, + "learning_rate": 2.4848786868502344e-06, + "loss": 0.9746, + "step": 8792 + }, + { + "epoch": 0.6778445883441259, + "grad_norm": 3.652773141860962, + "learning_rate": 2.483799766902346e-06, + "loss": 0.8415, + "step": 8793 + }, + { + "epoch": 0.6779216774591428, + "grad_norm": 3.671022891998291, + "learning_rate": 2.48272100382651e-06, + "loss": 0.9036, + "step": 8794 + }, + { + "epoch": 0.6779987665741597, + "grad_norm": 3.8974409103393555, + "learning_rate": 2.481642397689982e-06, + "loss": 1.0436, + "step": 8795 + }, + { + "epoch": 0.6780758556891767, + "grad_norm": 4.345669269561768, + "learning_rate": 2.4805639485600087e-06, + "loss": 1.0052, + "step": 8796 + }, + { + "epoch": 0.6781529448041936, + "grad_norm": 3.7597620487213135, + "learning_rate": 2.4794856565038256e-06, + "loss": 0.8635, + "step": 8797 + }, + { + "epoch": 0.6782300339192107, + "grad_norm": 3.701038360595703, + "learning_rate": 2.478407521588656e-06, + "loss": 0.8971, + "step": 8798 + }, + { + "epoch": 0.6783071230342276, + "grad_norm": 3.521311044692993, + "learning_rate": 2.4773295438817178e-06, + "loss": 0.9003, + "step": 8799 + }, + { + "epoch": 0.6783842121492445, + "grad_norm": 3.864536762237549, + "learning_rate": 2.476251723450217e-06, + "loss": 1.0305, + "step": 8800 + }, + { + "epoch": 0.6784613012642615, + "grad_norm": 3.7340474128723145, + "learning_rate": 2.475174060361351e-06, + "loss": 0.991, + "step": 8801 + }, + { + "epoch": 0.6785383903792784, + "grad_norm": 3.8703043460845947, + "learning_rate": 2.4740965546823076e-06, + "loss": 0.9927, + "step": 8802 + }, + { + "epoch": 0.6786154794942955, + "grad_norm": 3.6807234287261963, + "learning_rate": 2.47301920648026e-06, + "loss": 0.9202, + "step": 8803 + }, + { + "epoch": 0.6786925686093124, + "grad_norm": 3.687746047973633, + "learning_rate": 2.471942015822377e-06, + "loss": 0.9262, + "step": 8804 + }, + { + "epoch": 0.6787696577243293, + "grad_norm": 3.667069435119629, + "learning_rate": 2.4708649827758167e-06, + "loss": 0.9589, + "step": 8805 + }, + { + "epoch": 0.6788467468393463, + "grad_norm": 3.750629425048828, + "learning_rate": 2.4697881074077263e-06, + "loss": 0.8988, + "step": 8806 + }, + { + "epoch": 0.6789238359543632, + "grad_norm": 3.8485107421875, + "learning_rate": 2.468711389785242e-06, + "loss": 1.0432, + "step": 8807 + }, + { + "epoch": 0.6790009250693803, + "grad_norm": 3.7426390647888184, + "learning_rate": 2.4676348299754956e-06, + "loss": 0.9872, + "step": 8808 + }, + { + "epoch": 0.6790780141843972, + "grad_norm": 3.8060035705566406, + "learning_rate": 2.4665584280456002e-06, + "loss": 0.9285, + "step": 8809 + }, + { + "epoch": 0.6791551032994141, + "grad_norm": 3.9451937675476074, + "learning_rate": 2.4654821840626653e-06, + "loss": 1.0148, + "step": 8810 + }, + { + "epoch": 0.6792321924144311, + "grad_norm": 3.2711398601531982, + "learning_rate": 2.4644060980937894e-06, + "loss": 0.8548, + "step": 8811 + }, + { + "epoch": 0.679309281529448, + "grad_norm": 3.3944523334503174, + "learning_rate": 2.4633301702060613e-06, + "loss": 0.889, + "step": 8812 + }, + { + "epoch": 0.679386370644465, + "grad_norm": 3.9511239528656006, + "learning_rate": 2.4622544004665617e-06, + "loss": 0.8868, + "step": 8813 + }, + { + "epoch": 0.679463459759482, + "grad_norm": 3.569582462310791, + "learning_rate": 2.4611787889423546e-06, + "loss": 0.8662, + "step": 8814 + }, + { + "epoch": 0.6795405488744989, + "grad_norm": 3.395447254180908, + "learning_rate": 2.460103335700501e-06, + "loss": 0.8403, + "step": 8815 + }, + { + "epoch": 0.6796176379895159, + "grad_norm": 3.699881076812744, + "learning_rate": 2.45902804080805e-06, + "loss": 0.8871, + "step": 8816 + }, + { + "epoch": 0.6796947271045328, + "grad_norm": 3.846745491027832, + "learning_rate": 2.457952904332041e-06, + "loss": 0.8642, + "step": 8817 + }, + { + "epoch": 0.6797718162195499, + "grad_norm": 3.8669497966766357, + "learning_rate": 2.4568779263395026e-06, + "loss": 0.9709, + "step": 8818 + }, + { + "epoch": 0.6798489053345668, + "grad_norm": 3.6306495666503906, + "learning_rate": 2.455803106897456e-06, + "loss": 0.9367, + "step": 8819 + }, + { + "epoch": 0.6799259944495837, + "grad_norm": 3.4192559719085693, + "learning_rate": 2.454728446072907e-06, + "loss": 0.7778, + "step": 8820 + }, + { + "epoch": 0.6800030835646007, + "grad_norm": 3.654353141784668, + "learning_rate": 2.4536539439328575e-06, + "loss": 0.9357, + "step": 8821 + }, + { + "epoch": 0.6800801726796176, + "grad_norm": 3.7927420139312744, + "learning_rate": 2.4525796005442963e-06, + "loss": 0.9209, + "step": 8822 + }, + { + "epoch": 0.6801572617946346, + "grad_norm": 3.8942689895629883, + "learning_rate": 2.451505415974204e-06, + "loss": 0.824, + "step": 8823 + }, + { + "epoch": 0.6802343509096516, + "grad_norm": 4.133578777313232, + "learning_rate": 2.4504313902895523e-06, + "loss": 0.9132, + "step": 8824 + }, + { + "epoch": 0.6803114400246685, + "grad_norm": 4.291555404663086, + "learning_rate": 2.449357523557296e-06, + "loss": 0.9479, + "step": 8825 + }, + { + "epoch": 0.6803885291396855, + "grad_norm": 3.8000755310058594, + "learning_rate": 2.4482838158443883e-06, + "loss": 0.8301, + "step": 8826 + }, + { + "epoch": 0.6804656182547024, + "grad_norm": 3.7706243991851807, + "learning_rate": 2.4472102672177693e-06, + "loss": 1.0852, + "step": 8827 + }, + { + "epoch": 0.6805427073697194, + "grad_norm": 3.8600974082946777, + "learning_rate": 2.4461368777443683e-06, + "loss": 0.9142, + "step": 8828 + }, + { + "epoch": 0.6806197964847364, + "grad_norm": 3.7134435176849365, + "learning_rate": 2.4450636474911072e-06, + "loss": 0.9089, + "step": 8829 + }, + { + "epoch": 0.6806968855997533, + "grad_norm": 3.886906862258911, + "learning_rate": 2.4439905765248944e-06, + "loss": 0.9055, + "step": 8830 + }, + { + "epoch": 0.6807739747147703, + "grad_norm": 3.8458032608032227, + "learning_rate": 2.442917664912632e-06, + "loss": 0.9037, + "step": 8831 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 3.480123281478882, + "learning_rate": 2.441844912721209e-06, + "loss": 0.8902, + "step": 8832 + }, + { + "epoch": 0.6809281529448042, + "grad_norm": 3.9139695167541504, + "learning_rate": 2.4407723200175064e-06, + "loss": 0.9043, + "step": 8833 + }, + { + "epoch": 0.6810052420598212, + "grad_norm": 3.7381532192230225, + "learning_rate": 2.439699886868398e-06, + "loss": 1.0108, + "step": 8834 + }, + { + "epoch": 0.6810823311748381, + "grad_norm": 3.5520753860473633, + "learning_rate": 2.438627613340739e-06, + "loss": 0.9035, + "step": 8835 + }, + { + "epoch": 0.6811594202898551, + "grad_norm": 4.716245651245117, + "learning_rate": 2.4375554995013827e-06, + "loss": 0.9881, + "step": 8836 + }, + { + "epoch": 0.681236509404872, + "grad_norm": 3.765180826187134, + "learning_rate": 2.43648354541717e-06, + "loss": 0.9127, + "step": 8837 + }, + { + "epoch": 0.681313598519889, + "grad_norm": 3.5906150341033936, + "learning_rate": 2.435411751154931e-06, + "loss": 0.9026, + "step": 8838 + }, + { + "epoch": 0.681390687634906, + "grad_norm": 3.9749667644500732, + "learning_rate": 2.4343401167814895e-06, + "loss": 0.8457, + "step": 8839 + }, + { + "epoch": 0.6814677767499229, + "grad_norm": 4.670644760131836, + "learning_rate": 2.4332686423636516e-06, + "loss": 0.9388, + "step": 8840 + }, + { + "epoch": 0.6815448658649399, + "grad_norm": 3.8460373878479004, + "learning_rate": 2.43219732796822e-06, + "loss": 0.8584, + "step": 8841 + }, + { + "epoch": 0.6816219549799568, + "grad_norm": 3.915990114212036, + "learning_rate": 2.431126173661986e-06, + "loss": 0.9734, + "step": 8842 + }, + { + "epoch": 0.6816990440949738, + "grad_norm": 3.3147318363189697, + "learning_rate": 2.430055179511731e-06, + "loss": 0.8453, + "step": 8843 + }, + { + "epoch": 0.6817761332099908, + "grad_norm": 3.620931386947632, + "learning_rate": 2.4289843455842255e-06, + "loss": 0.8603, + "step": 8844 + }, + { + "epoch": 0.6818532223250077, + "grad_norm": 5.175318717956543, + "learning_rate": 2.4279136719462326e-06, + "loss": 0.8268, + "step": 8845 + }, + { + "epoch": 0.6819303114400247, + "grad_norm": 3.6000447273254395, + "learning_rate": 2.4268431586644985e-06, + "loss": 0.8702, + "step": 8846 + }, + { + "epoch": 0.6820074005550416, + "grad_norm": 3.783622980117798, + "learning_rate": 2.4257728058057673e-06, + "loss": 0.97, + "step": 8847 + }, + { + "epoch": 0.6820844896700586, + "grad_norm": 3.600386381149292, + "learning_rate": 2.424702613436769e-06, + "loss": 1.039, + "step": 8848 + }, + { + "epoch": 0.6821615787850756, + "grad_norm": 3.5093846321105957, + "learning_rate": 2.423632581624225e-06, + "loss": 0.9421, + "step": 8849 + }, + { + "epoch": 0.6822386679000925, + "grad_norm": 3.684954881668091, + "learning_rate": 2.422562710434848e-06, + "loss": 0.8655, + "step": 8850 + }, + { + "epoch": 0.6823157570151095, + "grad_norm": 3.958432912826538, + "learning_rate": 2.4214929999353354e-06, + "loss": 0.8486, + "step": 8851 + }, + { + "epoch": 0.6823928461301264, + "grad_norm": 4.018614768981934, + "learning_rate": 2.4204234501923797e-06, + "loss": 0.9712, + "step": 8852 + }, + { + "epoch": 0.6824699352451434, + "grad_norm": 3.787519931793213, + "learning_rate": 2.419354061272662e-06, + "loss": 0.9451, + "step": 8853 + }, + { + "epoch": 0.6825470243601603, + "grad_norm": 3.5393474102020264, + "learning_rate": 2.4182848332428534e-06, + "loss": 0.9476, + "step": 8854 + }, + { + "epoch": 0.6826241134751773, + "grad_norm": 3.7970054149627686, + "learning_rate": 2.417215766169614e-06, + "loss": 0.9967, + "step": 8855 + }, + { + "epoch": 0.6827012025901943, + "grad_norm": 3.7069692611694336, + "learning_rate": 2.4161468601195966e-06, + "loss": 0.9612, + "step": 8856 + }, + { + "epoch": 0.6827782917052112, + "grad_norm": 3.631960868835449, + "learning_rate": 2.4150781151594386e-06, + "loss": 0.9315, + "step": 8857 + }, + { + "epoch": 0.6828553808202282, + "grad_norm": 3.616215229034424, + "learning_rate": 2.4140095313557734e-06, + "loss": 0.9053, + "step": 8858 + }, + { + "epoch": 0.6829324699352451, + "grad_norm": 3.7601921558380127, + "learning_rate": 2.4129411087752196e-06, + "loss": 1.0008, + "step": 8859 + }, + { + "epoch": 0.6830095590502621, + "grad_norm": 3.73475980758667, + "learning_rate": 2.4118728474843894e-06, + "loss": 0.9158, + "step": 8860 + }, + { + "epoch": 0.6830866481652791, + "grad_norm": 3.5904901027679443, + "learning_rate": 2.4108047475498847e-06, + "loss": 0.9244, + "step": 8861 + }, + { + "epoch": 0.683163737280296, + "grad_norm": 3.4571268558502197, + "learning_rate": 2.4097368090382922e-06, + "loss": 0.8297, + "step": 8862 + }, + { + "epoch": 0.683240826395313, + "grad_norm": 3.620143413543701, + "learning_rate": 2.4086690320161947e-06, + "loss": 0.8506, + "step": 8863 + }, + { + "epoch": 0.6833179155103299, + "grad_norm": 3.556610107421875, + "learning_rate": 2.407601416550162e-06, + "loss": 0.9505, + "step": 8864 + }, + { + "epoch": 0.6833950046253469, + "grad_norm": 3.6465303897857666, + "learning_rate": 2.4065339627067545e-06, + "loss": 0.9715, + "step": 8865 + }, + { + "epoch": 0.6834720937403639, + "grad_norm": 3.4857358932495117, + "learning_rate": 2.4054666705525227e-06, + "loss": 0.7897, + "step": 8866 + }, + { + "epoch": 0.6835491828553808, + "grad_norm": 3.4617760181427, + "learning_rate": 2.404399540154008e-06, + "loss": 0.8728, + "step": 8867 + }, + { + "epoch": 0.6836262719703978, + "grad_norm": 3.6965367794036865, + "learning_rate": 2.403332571577738e-06, + "loss": 0.8499, + "step": 8868 + }, + { + "epoch": 0.6837033610854147, + "grad_norm": 4.105483055114746, + "learning_rate": 2.402265764890233e-06, + "loss": 0.9697, + "step": 8869 + }, + { + "epoch": 0.6837804502004317, + "grad_norm": 4.040141582489014, + "learning_rate": 2.4011991201580036e-06, + "loss": 0.8711, + "step": 8870 + }, + { + "epoch": 0.6838575393154487, + "grad_norm": 3.5217995643615723, + "learning_rate": 2.4001326374475496e-06, + "loss": 0.8271, + "step": 8871 + }, + { + "epoch": 0.6839346284304656, + "grad_norm": 3.501316785812378, + "learning_rate": 2.3990663168253627e-06, + "loss": 0.9758, + "step": 8872 + }, + { + "epoch": 0.6840117175454826, + "grad_norm": 3.6326067447662354, + "learning_rate": 2.398000158357918e-06, + "loss": 0.909, + "step": 8873 + }, + { + "epoch": 0.6840888066604995, + "grad_norm": 3.510653257369995, + "learning_rate": 2.396934162111686e-06, + "loss": 0.9052, + "step": 8874 + }, + { + "epoch": 0.6841658957755165, + "grad_norm": 3.3554039001464844, + "learning_rate": 2.395868328153129e-06, + "loss": 0.9057, + "step": 8875 + }, + { + "epoch": 0.6842429848905335, + "grad_norm": 3.7715797424316406, + "learning_rate": 2.3948026565486963e-06, + "loss": 0.9396, + "step": 8876 + }, + { + "epoch": 0.6843200740055504, + "grad_norm": 3.4584174156188965, + "learning_rate": 2.3937371473648237e-06, + "loss": 0.8345, + "step": 8877 + }, + { + "epoch": 0.6843971631205674, + "grad_norm": 3.387113571166992, + "learning_rate": 2.3926718006679416e-06, + "loss": 0.8837, + "step": 8878 + }, + { + "epoch": 0.6844742522355843, + "grad_norm": 3.546274423599243, + "learning_rate": 2.391606616524469e-06, + "loss": 0.8913, + "step": 8879 + }, + { + "epoch": 0.6845513413506013, + "grad_norm": 3.3328139781951904, + "learning_rate": 2.390541595000815e-06, + "loss": 0.8801, + "step": 8880 + }, + { + "epoch": 0.6846284304656183, + "grad_norm": 3.5848608016967773, + "learning_rate": 2.3894767361633775e-06, + "loss": 0.8967, + "step": 8881 + }, + { + "epoch": 0.6847055195806352, + "grad_norm": 3.5277299880981445, + "learning_rate": 2.388412040078547e-06, + "loss": 0.8776, + "step": 8882 + }, + { + "epoch": 0.6847826086956522, + "grad_norm": 3.7416014671325684, + "learning_rate": 2.387347506812698e-06, + "loss": 0.811, + "step": 8883 + }, + { + "epoch": 0.6848596978106691, + "grad_norm": 3.776992082595825, + "learning_rate": 2.3862831364322013e-06, + "loss": 0.966, + "step": 8884 + }, + { + "epoch": 0.684936786925686, + "grad_norm": 3.511543035507202, + "learning_rate": 2.385218929003414e-06, + "loss": 0.9115, + "step": 8885 + }, + { + "epoch": 0.6850138760407031, + "grad_norm": 3.6545498371124268, + "learning_rate": 2.3841548845926844e-06, + "loss": 0.9297, + "step": 8886 + }, + { + "epoch": 0.68509096515572, + "grad_norm": 4.169460773468018, + "learning_rate": 2.3830910032663514e-06, + "loss": 0.9614, + "step": 8887 + }, + { + "epoch": 0.685168054270737, + "grad_norm": 3.843756675720215, + "learning_rate": 2.3820272850907393e-06, + "loss": 0.8832, + "step": 8888 + }, + { + "epoch": 0.6852451433857539, + "grad_norm": 3.783128499984741, + "learning_rate": 2.380963730132167e-06, + "loss": 0.8474, + "step": 8889 + }, + { + "epoch": 0.6853222325007708, + "grad_norm": 3.7089338302612305, + "learning_rate": 2.3799003384569417e-06, + "loss": 0.941, + "step": 8890 + }, + { + "epoch": 0.6853993216157879, + "grad_norm": 3.5771591663360596, + "learning_rate": 2.3788371101313614e-06, + "loss": 0.9073, + "step": 8891 + }, + { + "epoch": 0.6854764107308048, + "grad_norm": 3.6812937259674072, + "learning_rate": 2.3777740452217106e-06, + "loss": 0.9433, + "step": 8892 + }, + { + "epoch": 0.6855534998458218, + "grad_norm": 3.610495090484619, + "learning_rate": 2.3767111437942703e-06, + "loss": 0.9288, + "step": 8893 + }, + { + "epoch": 0.6856305889608387, + "grad_norm": 3.8256382942199707, + "learning_rate": 2.375648405915301e-06, + "loss": 0.8738, + "step": 8894 + }, + { + "epoch": 0.6857076780758556, + "grad_norm": 3.9262545108795166, + "learning_rate": 2.374585831651062e-06, + "loss": 0.8454, + "step": 8895 + }, + { + "epoch": 0.6857847671908727, + "grad_norm": 3.498192071914673, + "learning_rate": 2.3735234210677994e-06, + "loss": 0.8577, + "step": 8896 + }, + { + "epoch": 0.6858618563058896, + "grad_norm": 3.9072468280792236, + "learning_rate": 2.3724611742317483e-06, + "loss": 0.906, + "step": 8897 + }, + { + "epoch": 0.6859389454209066, + "grad_norm": 3.8992998600006104, + "learning_rate": 2.3713990912091367e-06, + "loss": 0.8959, + "step": 8898 + }, + { + "epoch": 0.6860160345359235, + "grad_norm": 3.6476802825927734, + "learning_rate": 2.3703371720661754e-06, + "loss": 0.8936, + "step": 8899 + }, + { + "epoch": 0.6860931236509404, + "grad_norm": 3.575178384780884, + "learning_rate": 2.369275416869073e-06, + "loss": 0.9295, + "step": 8900 + }, + { + "epoch": 0.6861702127659575, + "grad_norm": 4.027018070220947, + "learning_rate": 2.3682138256840224e-06, + "loss": 0.9805, + "step": 8901 + }, + { + "epoch": 0.6862473018809744, + "grad_norm": 3.9200379848480225, + "learning_rate": 2.3671523985772105e-06, + "loss": 0.9032, + "step": 8902 + }, + { + "epoch": 0.6863243909959914, + "grad_norm": 3.6151862144470215, + "learning_rate": 2.36609113561481e-06, + "loss": 0.8806, + "step": 8903 + }, + { + "epoch": 0.6864014801110083, + "grad_norm": 4.015349388122559, + "learning_rate": 2.365030036862988e-06, + "loss": 0.9448, + "step": 8904 + }, + { + "epoch": 0.6864785692260252, + "grad_norm": 3.450439691543579, + "learning_rate": 2.363969102387894e-06, + "loss": 0.9339, + "step": 8905 + }, + { + "epoch": 0.6865556583410423, + "grad_norm": 3.489480972290039, + "learning_rate": 2.3629083322556744e-06, + "loss": 0.8266, + "step": 8906 + }, + { + "epoch": 0.6866327474560592, + "grad_norm": 3.841921091079712, + "learning_rate": 2.361847726532463e-06, + "loss": 0.9075, + "step": 8907 + }, + { + "epoch": 0.6867098365710762, + "grad_norm": 3.6356115341186523, + "learning_rate": 2.3607872852843814e-06, + "loss": 0.9278, + "step": 8908 + }, + { + "epoch": 0.6867869256860931, + "grad_norm": 3.600667715072632, + "learning_rate": 2.3597270085775466e-06, + "loss": 0.9311, + "step": 8909 + }, + { + "epoch": 0.68686401480111, + "grad_norm": 3.5631704330444336, + "learning_rate": 2.3586668964780562e-06, + "loss": 0.8496, + "step": 8910 + }, + { + "epoch": 0.6869411039161271, + "grad_norm": 3.3869268894195557, + "learning_rate": 2.3576069490520046e-06, + "loss": 0.94, + "step": 8911 + }, + { + "epoch": 0.687018193031144, + "grad_norm": 4.510827541351318, + "learning_rate": 2.3565471663654752e-06, + "loss": 0.8451, + "step": 8912 + }, + { + "epoch": 0.687095282146161, + "grad_norm": 3.5590405464172363, + "learning_rate": 2.355487548484539e-06, + "loss": 0.9092, + "step": 8913 + }, + { + "epoch": 0.6871723712611779, + "grad_norm": 3.764672040939331, + "learning_rate": 2.3544280954752573e-06, + "loss": 0.9336, + "step": 8914 + }, + { + "epoch": 0.6872494603761948, + "grad_norm": 3.8435845375061035, + "learning_rate": 2.3533688074036847e-06, + "loss": 0.9875, + "step": 8915 + }, + { + "epoch": 0.6873265494912119, + "grad_norm": 3.645751714706421, + "learning_rate": 2.3523096843358573e-06, + "loss": 0.8304, + "step": 8916 + }, + { + "epoch": 0.6874036386062288, + "grad_norm": 3.7767014503479004, + "learning_rate": 2.3512507263378086e-06, + "loss": 0.8513, + "step": 8917 + }, + { + "epoch": 0.6874807277212458, + "grad_norm": 3.934950590133667, + "learning_rate": 2.3501919334755584e-06, + "loss": 0.8667, + "step": 8918 + }, + { + "epoch": 0.6875578168362627, + "grad_norm": 3.6691348552703857, + "learning_rate": 2.3491333058151185e-06, + "loss": 1.0102, + "step": 8919 + }, + { + "epoch": 0.6876349059512796, + "grad_norm": 3.4543542861938477, + "learning_rate": 2.3480748434224872e-06, + "loss": 0.8902, + "step": 8920 + }, + { + "epoch": 0.6877119950662967, + "grad_norm": 4.049129486083984, + "learning_rate": 2.347016546363654e-06, + "loss": 0.821, + "step": 8921 + }, + { + "epoch": 0.6877890841813136, + "grad_norm": 3.604243755340576, + "learning_rate": 2.3459584147046e-06, + "loss": 0.8626, + "step": 8922 + }, + { + "epoch": 0.6878661732963306, + "grad_norm": 3.946105480194092, + "learning_rate": 2.344900448511293e-06, + "loss": 0.9558, + "step": 8923 + }, + { + "epoch": 0.6879432624113475, + "grad_norm": 3.4957942962646484, + "learning_rate": 2.3438426478496936e-06, + "loss": 0.8554, + "step": 8924 + }, + { + "epoch": 0.6880203515263644, + "grad_norm": 3.9918699264526367, + "learning_rate": 2.3427850127857464e-06, + "loss": 1.0795, + "step": 8925 + }, + { + "epoch": 0.6880974406413815, + "grad_norm": 3.549349069595337, + "learning_rate": 2.341727543385392e-06, + "loss": 0.8196, + "step": 8926 + }, + { + "epoch": 0.6881745297563984, + "grad_norm": 3.535104751586914, + "learning_rate": 2.3406702397145574e-06, + "loss": 0.8999, + "step": 8927 + }, + { + "epoch": 0.6882516188714154, + "grad_norm": 3.535996913909912, + "learning_rate": 2.3396131018391606e-06, + "loss": 0.8936, + "step": 8928 + }, + { + "epoch": 0.6883287079864323, + "grad_norm": 4.0836262702941895, + "learning_rate": 2.3385561298251087e-06, + "loss": 0.9796, + "step": 8929 + }, + { + "epoch": 0.6884057971014492, + "grad_norm": 3.8373196125030518, + "learning_rate": 2.337499323738299e-06, + "loss": 0.9182, + "step": 8930 + }, + { + "epoch": 0.6884828862164662, + "grad_norm": 3.3470261096954346, + "learning_rate": 2.3364426836446163e-06, + "loss": 0.8113, + "step": 8931 + }, + { + "epoch": 0.6885599753314832, + "grad_norm": 3.9608960151672363, + "learning_rate": 2.3353862096099365e-06, + "loss": 0.9545, + "step": 8932 + }, + { + "epoch": 0.6886370644465002, + "grad_norm": 3.7849230766296387, + "learning_rate": 2.3343299017001263e-06, + "loss": 0.8956, + "step": 8933 + }, + { + "epoch": 0.6887141535615171, + "grad_norm": 4.301094055175781, + "learning_rate": 2.333273759981041e-06, + "loss": 0.9049, + "step": 8934 + }, + { + "epoch": 0.688791242676534, + "grad_norm": 3.382276773452759, + "learning_rate": 2.3322177845185278e-06, + "loss": 0.8616, + "step": 8935 + }, + { + "epoch": 0.688868331791551, + "grad_norm": 3.7838985919952393, + "learning_rate": 2.3311619753784167e-06, + "loss": 0.9545, + "step": 8936 + }, + { + "epoch": 0.688945420906568, + "grad_norm": 3.419304609298706, + "learning_rate": 2.3301063326265338e-06, + "loss": 0.9138, + "step": 8937 + }, + { + "epoch": 0.689022510021585, + "grad_norm": 3.7308456897735596, + "learning_rate": 2.329050856328694e-06, + "loss": 0.8699, + "step": 8938 + }, + { + "epoch": 0.6890995991366019, + "grad_norm": 3.8383610248565674, + "learning_rate": 2.3279955465506995e-06, + "loss": 0.9504, + "step": 8939 + }, + { + "epoch": 0.6891766882516188, + "grad_norm": 3.7427611351013184, + "learning_rate": 2.3269404033583443e-06, + "loss": 0.9492, + "step": 8940 + }, + { + "epoch": 0.6892537773666358, + "grad_norm": 3.5722649097442627, + "learning_rate": 2.3258854268174125e-06, + "loss": 0.9321, + "step": 8941 + }, + { + "epoch": 0.6893308664816528, + "grad_norm": 4.43306827545166, + "learning_rate": 2.324830616993673e-06, + "loss": 1.0846, + "step": 8942 + }, + { + "epoch": 0.6894079555966698, + "grad_norm": 3.7848153114318848, + "learning_rate": 2.3237759739528893e-06, + "loss": 0.8663, + "step": 8943 + }, + { + "epoch": 0.6894850447116867, + "grad_norm": 4.198482513427734, + "learning_rate": 2.3227214977608136e-06, + "loss": 0.9388, + "step": 8944 + }, + { + "epoch": 0.6895621338267036, + "grad_norm": 3.4994449615478516, + "learning_rate": 2.321667188483186e-06, + "loss": 0.8521, + "step": 8945 + }, + { + "epoch": 0.6896392229417206, + "grad_norm": 3.8813703060150146, + "learning_rate": 2.3206130461857403e-06, + "loss": 1.072, + "step": 8946 + }, + { + "epoch": 0.6897163120567376, + "grad_norm": 3.928469181060791, + "learning_rate": 2.3195590709341915e-06, + "loss": 0.8955, + "step": 8947 + }, + { + "epoch": 0.6897934011717546, + "grad_norm": 3.67775821685791, + "learning_rate": 2.3185052627942528e-06, + "loss": 0.9373, + "step": 8948 + }, + { + "epoch": 0.6898704902867715, + "grad_norm": 3.703199863433838, + "learning_rate": 2.317451621831623e-06, + "loss": 0.9561, + "step": 8949 + }, + { + "epoch": 0.6899475794017884, + "grad_norm": 3.4708447456359863, + "learning_rate": 2.3163981481119913e-06, + "loss": 0.8132, + "step": 8950 + }, + { + "epoch": 0.6900246685168054, + "grad_norm": 3.7089951038360596, + "learning_rate": 2.3153448417010367e-06, + "loss": 0.9367, + "step": 8951 + }, + { + "epoch": 0.6901017576318224, + "grad_norm": 3.567253351211548, + "learning_rate": 2.314291702664428e-06, + "loss": 0.9247, + "step": 8952 + }, + { + "epoch": 0.6901788467468394, + "grad_norm": 4.459877967834473, + "learning_rate": 2.3132387310678212e-06, + "loss": 0.9534, + "step": 8953 + }, + { + "epoch": 0.6902559358618563, + "grad_norm": 3.3123741149902344, + "learning_rate": 2.3121859269768637e-06, + "loss": 0.8037, + "step": 8954 + }, + { + "epoch": 0.6903330249768732, + "grad_norm": 3.7758708000183105, + "learning_rate": 2.3111332904571933e-06, + "loss": 0.8445, + "step": 8955 + }, + { + "epoch": 0.6904101140918902, + "grad_norm": 3.962944507598877, + "learning_rate": 2.3100808215744365e-06, + "loss": 0.9568, + "step": 8956 + }, + { + "epoch": 0.6904872032069072, + "grad_norm": 3.808398723602295, + "learning_rate": 2.3090285203942104e-06, + "loss": 0.9115, + "step": 8957 + }, + { + "epoch": 0.6905642923219242, + "grad_norm": 3.7572476863861084, + "learning_rate": 2.3079763869821176e-06, + "loss": 0.9322, + "step": 8958 + }, + { + "epoch": 0.6906413814369411, + "grad_norm": 3.5704262256622314, + "learning_rate": 2.3069244214037546e-06, + "loss": 0.8228, + "step": 8959 + }, + { + "epoch": 0.690718470551958, + "grad_norm": 3.9301810264587402, + "learning_rate": 2.3058726237247063e-06, + "loss": 0.903, + "step": 8960 + }, + { + "epoch": 0.690795559666975, + "grad_norm": 3.7818596363067627, + "learning_rate": 2.3048209940105465e-06, + "loss": 0.8574, + "step": 8961 + }, + { + "epoch": 0.690872648781992, + "grad_norm": 3.3645167350769043, + "learning_rate": 2.3037695323268394e-06, + "loss": 0.9248, + "step": 8962 + }, + { + "epoch": 0.690949737897009, + "grad_norm": 3.7038087844848633, + "learning_rate": 2.3027182387391373e-06, + "loss": 0.8641, + "step": 8963 + }, + { + "epoch": 0.6910268270120259, + "grad_norm": 3.863682985305786, + "learning_rate": 2.3016671133129837e-06, + "loss": 0.9759, + "step": 8964 + }, + { + "epoch": 0.6911039161270429, + "grad_norm": 4.024723529815674, + "learning_rate": 2.3006161561139105e-06, + "loss": 0.8978, + "step": 8965 + }, + { + "epoch": 0.6911810052420598, + "grad_norm": 3.979031562805176, + "learning_rate": 2.29956536720744e-06, + "loss": 0.95, + "step": 8966 + }, + { + "epoch": 0.6912580943570767, + "grad_norm": 4.345090389251709, + "learning_rate": 2.298514746659084e-06, + "loss": 1.0485, + "step": 8967 + }, + { + "epoch": 0.6913351834720938, + "grad_norm": 3.6718671321868896, + "learning_rate": 2.2974642945343405e-06, + "loss": 0.9435, + "step": 8968 + }, + { + "epoch": 0.6914122725871107, + "grad_norm": 4.286196231842041, + "learning_rate": 2.2964140108987015e-06, + "loss": 0.9947, + "step": 8969 + }, + { + "epoch": 0.6914893617021277, + "grad_norm": 3.392043352127075, + "learning_rate": 2.2953638958176465e-06, + "loss": 0.7898, + "step": 8970 + }, + { + "epoch": 0.6915664508171446, + "grad_norm": 3.7056403160095215, + "learning_rate": 2.2943139493566456e-06, + "loss": 1.0118, + "step": 8971 + }, + { + "epoch": 0.6916435399321615, + "grad_norm": 3.6872429847717285, + "learning_rate": 2.293264171581158e-06, + "loss": 0.999, + "step": 8972 + }, + { + "epoch": 0.6917206290471786, + "grad_norm": 3.7803478240966797, + "learning_rate": 2.2922145625566293e-06, + "loss": 0.9195, + "step": 8973 + }, + { + "epoch": 0.6917977181621955, + "grad_norm": 4.061209201812744, + "learning_rate": 2.2911651223484987e-06, + "loss": 0.9921, + "step": 8974 + }, + { + "epoch": 0.6918748072772125, + "grad_norm": 3.8408148288726807, + "learning_rate": 2.2901158510221934e-06, + "loss": 0.9547, + "step": 8975 + }, + { + "epoch": 0.6919518963922294, + "grad_norm": 4.113273620605469, + "learning_rate": 2.2890667486431296e-06, + "loss": 0.9756, + "step": 8976 + }, + { + "epoch": 0.6920289855072463, + "grad_norm": 3.689815044403076, + "learning_rate": 2.288017815276714e-06, + "loss": 0.8866, + "step": 8977 + }, + { + "epoch": 0.6921060746222634, + "grad_norm": 3.9249446392059326, + "learning_rate": 2.2869690509883435e-06, + "loss": 0.9308, + "step": 8978 + }, + { + "epoch": 0.6921831637372803, + "grad_norm": 3.9732306003570557, + "learning_rate": 2.2859204558433994e-06, + "loss": 0.8548, + "step": 8979 + }, + { + "epoch": 0.6922602528522973, + "grad_norm": 3.4670419692993164, + "learning_rate": 2.284872029907259e-06, + "loss": 0.9897, + "step": 8980 + }, + { + "epoch": 0.6923373419673142, + "grad_norm": 3.4217910766601562, + "learning_rate": 2.283823773245285e-06, + "loss": 0.8297, + "step": 8981 + }, + { + "epoch": 0.6924144310823311, + "grad_norm": 3.647320032119751, + "learning_rate": 2.2827756859228316e-06, + "loss": 0.8505, + "step": 8982 + }, + { + "epoch": 0.6924915201973482, + "grad_norm": 3.502896785736084, + "learning_rate": 2.281727768005243e-06, + "loss": 0.8633, + "step": 8983 + }, + { + "epoch": 0.6925686093123651, + "grad_norm": 3.6835694313049316, + "learning_rate": 2.2806800195578484e-06, + "loss": 0.9993, + "step": 8984 + }, + { + "epoch": 0.6926456984273821, + "grad_norm": 3.4930477142333984, + "learning_rate": 2.2796324406459703e-06, + "loss": 0.7549, + "step": 8985 + }, + { + "epoch": 0.692722787542399, + "grad_norm": 3.864713668823242, + "learning_rate": 2.2785850313349207e-06, + "loss": 0.9116, + "step": 8986 + }, + { + "epoch": 0.6927998766574159, + "grad_norm": 3.770353317260742, + "learning_rate": 2.277537791689999e-06, + "loss": 0.9246, + "step": 8987 + }, + { + "epoch": 0.692876965772433, + "grad_norm": 3.649230718612671, + "learning_rate": 2.276490721776497e-06, + "loss": 0.9455, + "step": 8988 + }, + { + "epoch": 0.6929540548874499, + "grad_norm": 3.8266777992248535, + "learning_rate": 2.2754438216596946e-06, + "loss": 0.9737, + "step": 8989 + }, + { + "epoch": 0.6930311440024669, + "grad_norm": 3.5182337760925293, + "learning_rate": 2.2743970914048575e-06, + "loss": 0.9442, + "step": 8990 + }, + { + "epoch": 0.6931082331174838, + "grad_norm": 4.085440158843994, + "learning_rate": 2.273350531077245e-06, + "loss": 1.0016, + "step": 8991 + }, + { + "epoch": 0.6931853222325007, + "grad_norm": 3.814785957336426, + "learning_rate": 2.2723041407421055e-06, + "loss": 0.9506, + "step": 8992 + }, + { + "epoch": 0.6932624113475178, + "grad_norm": 3.8445160388946533, + "learning_rate": 2.2712579204646755e-06, + "loss": 0.9151, + "step": 8993 + }, + { + "epoch": 0.6933395004625347, + "grad_norm": 3.886320114135742, + "learning_rate": 2.270211870310184e-06, + "loss": 0.9457, + "step": 8994 + }, + { + "epoch": 0.6934165895775517, + "grad_norm": 4.0595011711120605, + "learning_rate": 2.269165990343842e-06, + "loss": 0.9247, + "step": 8995 + }, + { + "epoch": 0.6934936786925686, + "grad_norm": 3.6124463081359863, + "learning_rate": 2.2681202806308572e-06, + "loss": 0.8495, + "step": 8996 + }, + { + "epoch": 0.6935707678075855, + "grad_norm": 3.549788236618042, + "learning_rate": 2.2670747412364243e-06, + "loss": 0.8262, + "step": 8997 + }, + { + "epoch": 0.6936478569226026, + "grad_norm": 3.6086161136627197, + "learning_rate": 2.266029372225727e-06, + "loss": 0.9215, + "step": 8998 + }, + { + "epoch": 0.6937249460376195, + "grad_norm": 3.8933184146881104, + "learning_rate": 2.2649841736639387e-06, + "loss": 0.8623, + "step": 8999 + }, + { + "epoch": 0.6938020351526365, + "grad_norm": 3.9083821773529053, + "learning_rate": 2.2639391456162246e-06, + "loss": 1.0115, + "step": 9000 + }, + { + "epoch": 0.6938791242676534, + "grad_norm": 3.4845428466796875, + "learning_rate": 2.262894288147732e-06, + "loss": 0.8953, + "step": 9001 + }, + { + "epoch": 0.6939562133826703, + "grad_norm": 3.57886004447937, + "learning_rate": 2.2618496013236046e-06, + "loss": 0.8971, + "step": 9002 + }, + { + "epoch": 0.6940333024976874, + "grad_norm": 3.5042362213134766, + "learning_rate": 2.2608050852089738e-06, + "loss": 0.8637, + "step": 9003 + }, + { + "epoch": 0.6941103916127043, + "grad_norm": 3.460904598236084, + "learning_rate": 2.2597607398689588e-06, + "loss": 0.8448, + "step": 9004 + }, + { + "epoch": 0.6941874807277213, + "grad_norm": 3.7049005031585693, + "learning_rate": 2.2587165653686714e-06, + "loss": 0.9066, + "step": 9005 + }, + { + "epoch": 0.6942645698427382, + "grad_norm": 3.4977948665618896, + "learning_rate": 2.257672561773207e-06, + "loss": 0.8732, + "step": 9006 + }, + { + "epoch": 0.6943416589577551, + "grad_norm": 4.0038676261901855, + "learning_rate": 2.256628729147654e-06, + "loss": 0.9104, + "step": 9007 + }, + { + "epoch": 0.6944187480727722, + "grad_norm": 3.465859889984131, + "learning_rate": 2.255585067557093e-06, + "loss": 0.8565, + "step": 9008 + }, + { + "epoch": 0.6944958371877891, + "grad_norm": 3.844653606414795, + "learning_rate": 2.254541577066589e-06, + "loss": 0.949, + "step": 9009 + }, + { + "epoch": 0.6945729263028061, + "grad_norm": 3.570842742919922, + "learning_rate": 2.2534982577412013e-06, + "loss": 0.9655, + "step": 9010 + }, + { + "epoch": 0.694650015417823, + "grad_norm": 3.9422519207000732, + "learning_rate": 2.2524551096459703e-06, + "loss": 0.9421, + "step": 9011 + }, + { + "epoch": 0.6947271045328399, + "grad_norm": 3.6970112323760986, + "learning_rate": 2.251412132845933e-06, + "loss": 0.9201, + "step": 9012 + }, + { + "epoch": 0.694804193647857, + "grad_norm": 3.5936856269836426, + "learning_rate": 2.2503693274061145e-06, + "loss": 0.8951, + "step": 9013 + }, + { + "epoch": 0.6948812827628739, + "grad_norm": 3.511066436767578, + "learning_rate": 2.2493266933915274e-06, + "loss": 0.845, + "step": 9014 + }, + { + "epoch": 0.6949583718778909, + "grad_norm": 3.7108609676361084, + "learning_rate": 2.2482842308671766e-06, + "loss": 0.97, + "step": 9015 + }, + { + "epoch": 0.6950354609929078, + "grad_norm": 3.6902565956115723, + "learning_rate": 2.247241939898051e-06, + "loss": 0.959, + "step": 9016 + }, + { + "epoch": 0.6951125501079247, + "grad_norm": 3.712344169616699, + "learning_rate": 2.2461998205491335e-06, + "loss": 0.9824, + "step": 9017 + }, + { + "epoch": 0.6951896392229417, + "grad_norm": 3.7560031414031982, + "learning_rate": 2.2451578728853944e-06, + "loss": 0.8713, + "step": 9018 + }, + { + "epoch": 0.6952667283379587, + "grad_norm": 3.6209821701049805, + "learning_rate": 2.244116096971795e-06, + "loss": 0.8807, + "step": 9019 + }, + { + "epoch": 0.6953438174529757, + "grad_norm": 3.638458490371704, + "learning_rate": 2.2430744928732854e-06, + "loss": 0.8903, + "step": 9020 + }, + { + "epoch": 0.6954209065679926, + "grad_norm": 3.5852041244506836, + "learning_rate": 2.242033060654801e-06, + "loss": 0.9105, + "step": 9021 + }, + { + "epoch": 0.6954979956830095, + "grad_norm": 3.6906344890594482, + "learning_rate": 2.240991800381271e-06, + "loss": 0.9511, + "step": 9022 + }, + { + "epoch": 0.6955750847980265, + "grad_norm": 3.609318971633911, + "learning_rate": 2.2399507121176127e-06, + "loss": 0.9687, + "step": 9023 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 3.8320932388305664, + "learning_rate": 2.238909795928734e-06, + "loss": 0.9885, + "step": 9024 + }, + { + "epoch": 0.6957292630280605, + "grad_norm": 3.6839687824249268, + "learning_rate": 2.2378690518795284e-06, + "loss": 0.8995, + "step": 9025 + }, + { + "epoch": 0.6958063521430774, + "grad_norm": 4.18240213394165, + "learning_rate": 2.2368284800348845e-06, + "loss": 0.9525, + "step": 9026 + }, + { + "epoch": 0.6958834412580943, + "grad_norm": 3.770507574081421, + "learning_rate": 2.2357880804596714e-06, + "loss": 0.9616, + "step": 9027 + }, + { + "epoch": 0.6959605303731113, + "grad_norm": 3.8755156993865967, + "learning_rate": 2.2347478532187562e-06, + "loss": 0.8957, + "step": 9028 + }, + { + "epoch": 0.6960376194881283, + "grad_norm": 3.3811306953430176, + "learning_rate": 2.2337077983769902e-06, + "loss": 0.8254, + "step": 9029 + }, + { + "epoch": 0.6961147086031453, + "grad_norm": 3.608715295791626, + "learning_rate": 2.232667915999216e-06, + "loss": 0.8726, + "step": 9030 + }, + { + "epoch": 0.6961917977181622, + "grad_norm": 3.5603222846984863, + "learning_rate": 2.2316282061502666e-06, + "loss": 0.9169, + "step": 9031 + }, + { + "epoch": 0.6962688868331791, + "grad_norm": 3.8038291931152344, + "learning_rate": 2.230588668894959e-06, + "loss": 0.874, + "step": 9032 + }, + { + "epoch": 0.6963459759481961, + "grad_norm": 3.717751979827881, + "learning_rate": 2.229549304298105e-06, + "loss": 0.9489, + "step": 9033 + }, + { + "epoch": 0.696423065063213, + "grad_norm": 4.030141830444336, + "learning_rate": 2.228510112424503e-06, + "loss": 0.9211, + "step": 9034 + }, + { + "epoch": 0.6965001541782301, + "grad_norm": 4.214062213897705, + "learning_rate": 2.2274710933389423e-06, + "loss": 0.9463, + "step": 9035 + }, + { + "epoch": 0.696577243293247, + "grad_norm": 3.714287519454956, + "learning_rate": 2.226432247106199e-06, + "loss": 0.9016, + "step": 9036 + }, + { + "epoch": 0.6966543324082639, + "grad_norm": 3.579857110977173, + "learning_rate": 2.225393573791042e-06, + "loss": 0.8215, + "step": 9037 + }, + { + "epoch": 0.6967314215232809, + "grad_norm": 3.9160430431365967, + "learning_rate": 2.2243550734582243e-06, + "loss": 0.8289, + "step": 9038 + }, + { + "epoch": 0.6968085106382979, + "grad_norm": 3.4613399505615234, + "learning_rate": 2.223316746172492e-06, + "loss": 0.8294, + "step": 9039 + }, + { + "epoch": 0.6968855997533149, + "grad_norm": 3.514765977859497, + "learning_rate": 2.22227859199858e-06, + "loss": 0.9109, + "step": 9040 + }, + { + "epoch": 0.6969626888683318, + "grad_norm": 3.923628807067871, + "learning_rate": 2.2212406110012113e-06, + "loss": 0.9976, + "step": 9041 + }, + { + "epoch": 0.6970397779833487, + "grad_norm": 3.940729856491089, + "learning_rate": 2.2202028032451013e-06, + "loss": 0.9827, + "step": 9042 + }, + { + "epoch": 0.6971168670983657, + "grad_norm": 3.6197614669799805, + "learning_rate": 2.2191651687949473e-06, + "loss": 0.991, + "step": 9043 + }, + { + "epoch": 0.6971939562133826, + "grad_norm": 3.794126510620117, + "learning_rate": 2.218127707715443e-06, + "loss": 0.9555, + "step": 9044 + }, + { + "epoch": 0.6972710453283997, + "grad_norm": 3.6639597415924072, + "learning_rate": 2.2170904200712684e-06, + "loss": 0.968, + "step": 9045 + }, + { + "epoch": 0.6973481344434166, + "grad_norm": 3.9340016841888428, + "learning_rate": 2.216053305927093e-06, + "loss": 0.9202, + "step": 9046 + }, + { + "epoch": 0.6974252235584335, + "grad_norm": 3.675588846206665, + "learning_rate": 2.2150163653475752e-06, + "loss": 0.7814, + "step": 9047 + }, + { + "epoch": 0.6975023126734505, + "grad_norm": 3.601801633834839, + "learning_rate": 2.2139795983973654e-06, + "loss": 0.8856, + "step": 9048 + }, + { + "epoch": 0.6975794017884674, + "grad_norm": 3.440999746322632, + "learning_rate": 2.2129430051410967e-06, + "loss": 0.9017, + "step": 9049 + }, + { + "epoch": 0.6976564909034845, + "grad_norm": 3.5719244480133057, + "learning_rate": 2.211906585643397e-06, + "loss": 0.8295, + "step": 9050 + }, + { + "epoch": 0.6977335800185014, + "grad_norm": 4.056939601898193, + "learning_rate": 2.210870339968881e-06, + "loss": 0.9759, + "step": 9051 + }, + { + "epoch": 0.6978106691335183, + "grad_norm": 3.9863460063934326, + "learning_rate": 2.2098342681821555e-06, + "loss": 0.9698, + "step": 9052 + }, + { + "epoch": 0.6978877582485353, + "grad_norm": 3.962463140487671, + "learning_rate": 2.2087983703478118e-06, + "loss": 0.9996, + "step": 9053 + }, + { + "epoch": 0.6979648473635522, + "grad_norm": 3.7303974628448486, + "learning_rate": 2.207762646530434e-06, + "loss": 0.9972, + "step": 9054 + }, + { + "epoch": 0.6980419364785693, + "grad_norm": 3.459636688232422, + "learning_rate": 2.2067270967945936e-06, + "loss": 0.8566, + "step": 9055 + }, + { + "epoch": 0.6981190255935862, + "grad_norm": 3.582228183746338, + "learning_rate": 2.2056917212048522e-06, + "loss": 0.9435, + "step": 9056 + }, + { + "epoch": 0.6981961147086031, + "grad_norm": 3.553546905517578, + "learning_rate": 2.2046565198257595e-06, + "loss": 0.9016, + "step": 9057 + }, + { + "epoch": 0.6982732038236201, + "grad_norm": 3.933868646621704, + "learning_rate": 2.203621492721858e-06, + "loss": 0.9479, + "step": 9058 + }, + { + "epoch": 0.698350292938637, + "grad_norm": 3.995857000350952, + "learning_rate": 2.2025866399576713e-06, + "loss": 0.98, + "step": 9059 + }, + { + "epoch": 0.6984273820536541, + "grad_norm": 3.842726469039917, + "learning_rate": 2.2015519615977193e-06, + "loss": 0.9618, + "step": 9060 + }, + { + "epoch": 0.698504471168671, + "grad_norm": 3.7340540885925293, + "learning_rate": 2.2005174577065085e-06, + "loss": 0.9015, + "step": 9061 + }, + { + "epoch": 0.6985815602836879, + "grad_norm": 3.7171812057495117, + "learning_rate": 2.1994831283485363e-06, + "loss": 0.9996, + "step": 9062 + }, + { + "epoch": 0.6986586493987049, + "grad_norm": 3.576258420944214, + "learning_rate": 2.198448973588288e-06, + "loss": 0.8816, + "step": 9063 + }, + { + "epoch": 0.6987357385137218, + "grad_norm": 3.6319639682769775, + "learning_rate": 2.197414993490235e-06, + "loss": 0.9423, + "step": 9064 + }, + { + "epoch": 0.6988128276287389, + "grad_norm": 3.287196159362793, + "learning_rate": 2.1963811881188423e-06, + "loss": 0.8847, + "step": 9065 + }, + { + "epoch": 0.6988899167437558, + "grad_norm": 3.3368561267852783, + "learning_rate": 2.1953475575385618e-06, + "loss": 0.8119, + "step": 9066 + }, + { + "epoch": 0.6989670058587727, + "grad_norm": 3.6596055030822754, + "learning_rate": 2.1943141018138357e-06, + "loss": 1.0, + "step": 9067 + }, + { + "epoch": 0.6990440949737897, + "grad_norm": 3.594658851623535, + "learning_rate": 2.1932808210090963e-06, + "loss": 0.867, + "step": 9068 + }, + { + "epoch": 0.6991211840888066, + "grad_norm": 4.087553977966309, + "learning_rate": 2.1922477151887595e-06, + "loss": 0.915, + "step": 9069 + }, + { + "epoch": 0.6991982732038237, + "grad_norm": 3.692249059677124, + "learning_rate": 2.1912147844172354e-06, + "loss": 0.8633, + "step": 9070 + }, + { + "epoch": 0.6992753623188406, + "grad_norm": 3.7603821754455566, + "learning_rate": 2.1901820287589227e-06, + "loss": 0.8733, + "step": 9071 + }, + { + "epoch": 0.6993524514338575, + "grad_norm": 3.858159303665161, + "learning_rate": 2.189149448278208e-06, + "loss": 0.7904, + "step": 9072 + }, + { + "epoch": 0.6994295405488745, + "grad_norm": 3.873694896697998, + "learning_rate": 2.188117043039468e-06, + "loss": 0.9063, + "step": 9073 + }, + { + "epoch": 0.6995066296638914, + "grad_norm": 3.4330713748931885, + "learning_rate": 2.187084813107069e-06, + "loss": 0.9582, + "step": 9074 + }, + { + "epoch": 0.6995837187789085, + "grad_norm": 3.4164810180664062, + "learning_rate": 2.186052758545361e-06, + "loss": 0.8979, + "step": 9075 + }, + { + "epoch": 0.6996608078939254, + "grad_norm": 3.9505198001861572, + "learning_rate": 2.18502087941869e-06, + "loss": 0.9866, + "step": 9076 + }, + { + "epoch": 0.6997378970089423, + "grad_norm": 3.860844373703003, + "learning_rate": 2.183989175791388e-06, + "loss": 0.9133, + "step": 9077 + }, + { + "epoch": 0.6998149861239593, + "grad_norm": 3.6044540405273438, + "learning_rate": 2.1829576477277765e-06, + "loss": 0.9124, + "step": 9078 + }, + { + "epoch": 0.6998920752389762, + "grad_norm": 3.7315444946289062, + "learning_rate": 2.181926295292167e-06, + "loss": 0.9567, + "step": 9079 + }, + { + "epoch": 0.6999691643539933, + "grad_norm": 3.6717987060546875, + "learning_rate": 2.180895118548857e-06, + "loss": 0.9217, + "step": 9080 + }, + { + "epoch": 0.7000462534690102, + "grad_norm": 4.249053478240967, + "learning_rate": 2.1798641175621354e-06, + "loss": 1.0054, + "step": 9081 + }, + { + "epoch": 0.7001233425840271, + "grad_norm": 3.520657539367676, + "learning_rate": 2.17883329239628e-06, + "loss": 0.8778, + "step": 9082 + }, + { + "epoch": 0.7002004316990441, + "grad_norm": 3.7700884342193604, + "learning_rate": 2.177802643115558e-06, + "loss": 0.9893, + "step": 9083 + }, + { + "epoch": 0.700277520814061, + "grad_norm": 3.6206729412078857, + "learning_rate": 2.1767721697842244e-06, + "loss": 0.8638, + "step": 9084 + }, + { + "epoch": 0.700354609929078, + "grad_norm": 3.6218421459198, + "learning_rate": 2.1757418724665263e-06, + "loss": 0.9983, + "step": 9085 + }, + { + "epoch": 0.700431699044095, + "grad_norm": 3.8477892875671387, + "learning_rate": 2.1747117512266928e-06, + "loss": 1.0336, + "step": 9086 + }, + { + "epoch": 0.7005087881591119, + "grad_norm": 3.7004141807556152, + "learning_rate": 2.1736818061289492e-06, + "loss": 0.8383, + "step": 9087 + }, + { + "epoch": 0.7005858772741289, + "grad_norm": 3.667736291885376, + "learning_rate": 2.1726520372375076e-06, + "loss": 0.9925, + "step": 9088 + }, + { + "epoch": 0.7006629663891458, + "grad_norm": 3.574789047241211, + "learning_rate": 2.1716224446165678e-06, + "loss": 0.7904, + "step": 9089 + }, + { + "epoch": 0.7007400555041629, + "grad_norm": 3.753038167953491, + "learning_rate": 2.1705930283303222e-06, + "loss": 0.8674, + "step": 9090 + }, + { + "epoch": 0.7008171446191798, + "grad_norm": 3.9094438552856445, + "learning_rate": 2.1695637884429456e-06, + "loss": 0.9338, + "step": 9091 + }, + { + "epoch": 0.7008942337341967, + "grad_norm": 3.6531260013580322, + "learning_rate": 2.1685347250186073e-06, + "loss": 0.9804, + "step": 9092 + }, + { + "epoch": 0.7009713228492137, + "grad_norm": 3.6628949642181396, + "learning_rate": 2.1675058381214647e-06, + "loss": 0.9991, + "step": 9093 + }, + { + "epoch": 0.7010484119642306, + "grad_norm": 3.8137435913085938, + "learning_rate": 2.166477127815663e-06, + "loss": 0.8553, + "step": 9094 + }, + { + "epoch": 0.7011255010792476, + "grad_norm": 3.7563507556915283, + "learning_rate": 2.1654485941653374e-06, + "loss": 0.8153, + "step": 9095 + }, + { + "epoch": 0.7012025901942646, + "grad_norm": 3.9121322631835938, + "learning_rate": 2.164420237234611e-06, + "loss": 0.9299, + "step": 9096 + }, + { + "epoch": 0.7012796793092815, + "grad_norm": 3.449474811553955, + "learning_rate": 2.163392057087597e-06, + "loss": 0.9556, + "step": 9097 + }, + { + "epoch": 0.7013567684242985, + "grad_norm": 3.6520328521728516, + "learning_rate": 2.1623640537883977e-06, + "loss": 0.8858, + "step": 9098 + }, + { + "epoch": 0.7014338575393154, + "grad_norm": 3.4059646129608154, + "learning_rate": 2.1613362274011025e-06, + "loss": 0.8044, + "step": 9099 + }, + { + "epoch": 0.7015109466543324, + "grad_norm": 3.759131669998169, + "learning_rate": 2.1603085779897935e-06, + "loss": 0.8184, + "step": 9100 + }, + { + "epoch": 0.7015880357693494, + "grad_norm": 3.839765787124634, + "learning_rate": 2.1592811056185363e-06, + "loss": 0.9364, + "step": 9101 + }, + { + "epoch": 0.7016651248843663, + "grad_norm": 3.4155290126800537, + "learning_rate": 2.1582538103513896e-06, + "loss": 0.835, + "step": 9102 + }, + { + "epoch": 0.7017422139993833, + "grad_norm": 3.8770790100097656, + "learning_rate": 2.1572266922524e-06, + "loss": 1.0204, + "step": 9103 + }, + { + "epoch": 0.7018193031144002, + "grad_norm": 3.4858505725860596, + "learning_rate": 2.1561997513856027e-06, + "loss": 0.9334, + "step": 9104 + }, + { + "epoch": 0.7018963922294172, + "grad_norm": 3.834233283996582, + "learning_rate": 2.155172987815024e-06, + "loss": 0.9136, + "step": 9105 + }, + { + "epoch": 0.7019734813444342, + "grad_norm": 4.0034050941467285, + "learning_rate": 2.1541464016046747e-06, + "loss": 1.0584, + "step": 9106 + }, + { + "epoch": 0.7020505704594511, + "grad_norm": 3.6104769706726074, + "learning_rate": 2.153119992818558e-06, + "loss": 0.9131, + "step": 9107 + }, + { + "epoch": 0.7021276595744681, + "grad_norm": 3.7609310150146484, + "learning_rate": 2.152093761520665e-06, + "loss": 0.9231, + "step": 9108 + }, + { + "epoch": 0.702204748689485, + "grad_norm": 3.517526388168335, + "learning_rate": 2.151067707774977e-06, + "loss": 0.9278, + "step": 9109 + }, + { + "epoch": 0.702281837804502, + "grad_norm": 3.942485809326172, + "learning_rate": 2.150041831645462e-06, + "loss": 1.0325, + "step": 9110 + }, + { + "epoch": 0.702358926919519, + "grad_norm": 3.776834487915039, + "learning_rate": 2.14901613319608e-06, + "loss": 0.9948, + "step": 9111 + }, + { + "epoch": 0.7024360160345359, + "grad_norm": 3.855884552001953, + "learning_rate": 2.147990612490775e-06, + "loss": 1.0399, + "step": 9112 + }, + { + "epoch": 0.7025131051495529, + "grad_norm": 4.162666320800781, + "learning_rate": 2.1469652695934847e-06, + "loss": 0.8753, + "step": 9113 + }, + { + "epoch": 0.7025901942645698, + "grad_norm": 3.925414800643921, + "learning_rate": 2.145940104568133e-06, + "loss": 0.9324, + "step": 9114 + }, + { + "epoch": 0.7026672833795868, + "grad_norm": 3.718440294265747, + "learning_rate": 2.1449151174786343e-06, + "loss": 0.8593, + "step": 9115 + }, + { + "epoch": 0.7027443724946038, + "grad_norm": 3.670015335083008, + "learning_rate": 2.143890308388893e-06, + "loss": 0.7962, + "step": 9116 + }, + { + "epoch": 0.7028214616096207, + "grad_norm": 3.8344802856445312, + "learning_rate": 2.142865677362797e-06, + "loss": 0.9722, + "step": 9117 + }, + { + "epoch": 0.7028985507246377, + "grad_norm": 3.786147117614746, + "learning_rate": 2.141841224464229e-06, + "loss": 0.8701, + "step": 9118 + }, + { + "epoch": 0.7029756398396546, + "grad_norm": 3.831787109375, + "learning_rate": 2.1408169497570576e-06, + "loss": 0.8794, + "step": 9119 + }, + { + "epoch": 0.7030527289546716, + "grad_norm": 3.9900169372558594, + "learning_rate": 2.139792853305141e-06, + "loss": 1.055, + "step": 9120 + }, + { + "epoch": 0.7031298180696886, + "grad_norm": 3.954350233078003, + "learning_rate": 2.1387689351723266e-06, + "loss": 0.952, + "step": 9121 + }, + { + "epoch": 0.7032069071847055, + "grad_norm": 3.8101086616516113, + "learning_rate": 2.1377451954224526e-06, + "loss": 0.94, + "step": 9122 + }, + { + "epoch": 0.7032839962997225, + "grad_norm": 4.078210353851318, + "learning_rate": 2.13672163411934e-06, + "loss": 0.9399, + "step": 9123 + }, + { + "epoch": 0.7033610854147394, + "grad_norm": 3.5562384128570557, + "learning_rate": 2.1356982513268034e-06, + "loss": 0.806, + "step": 9124 + }, + { + "epoch": 0.7034381745297564, + "grad_norm": 3.3538687229156494, + "learning_rate": 2.134675047108647e-06, + "loss": 0.8706, + "step": 9125 + }, + { + "epoch": 0.7035152636447733, + "grad_norm": 4.081210136413574, + "learning_rate": 2.133652021528661e-06, + "loss": 0.8758, + "step": 9126 + }, + { + "epoch": 0.7035923527597903, + "grad_norm": 4.063713550567627, + "learning_rate": 2.1326291746506283e-06, + "loss": 1.0671, + "step": 9127 + }, + { + "epoch": 0.7036694418748073, + "grad_norm": 3.4613239765167236, + "learning_rate": 2.131606506538314e-06, + "loss": 0.8831, + "step": 9128 + }, + { + "epoch": 0.7037465309898242, + "grad_norm": 3.84372878074646, + "learning_rate": 2.1305840172554786e-06, + "loss": 1.0199, + "step": 9129 + }, + { + "epoch": 0.7038236201048412, + "grad_norm": 3.5245513916015625, + "learning_rate": 2.1295617068658685e-06, + "loss": 0.8048, + "step": 9130 + }, + { + "epoch": 0.7039007092198581, + "grad_norm": 3.294621706008911, + "learning_rate": 2.1285395754332198e-06, + "loss": 0.8333, + "step": 9131 + }, + { + "epoch": 0.7039777983348752, + "grad_norm": 3.9054811000823975, + "learning_rate": 2.1275176230212568e-06, + "loss": 0.8569, + "step": 9132 + }, + { + "epoch": 0.7040548874498921, + "grad_norm": 3.4768083095550537, + "learning_rate": 2.1264958496936945e-06, + "loss": 0.8138, + "step": 9133 + }, + { + "epoch": 0.704131976564909, + "grad_norm": 3.709878921508789, + "learning_rate": 2.125474255514232e-06, + "loss": 0.925, + "step": 9134 + }, + { + "epoch": 0.704209065679926, + "grad_norm": 3.6700382232666016, + "learning_rate": 2.1244528405465626e-06, + "loss": 0.9451, + "step": 9135 + }, + { + "epoch": 0.7042861547949429, + "grad_norm": 3.370394468307495, + "learning_rate": 2.123431604854365e-06, + "loss": 0.8803, + "step": 9136 + }, + { + "epoch": 0.70436324390996, + "grad_norm": 3.7478532791137695, + "learning_rate": 2.1224105485013096e-06, + "loss": 0.8755, + "step": 9137 + }, + { + "epoch": 0.7044403330249769, + "grad_norm": 3.732043981552124, + "learning_rate": 2.121389671551054e-06, + "loss": 0.9241, + "step": 9138 + }, + { + "epoch": 0.7045174221399938, + "grad_norm": 3.701848268508911, + "learning_rate": 2.120368974067242e-06, + "loss": 0.9204, + "step": 9139 + }, + { + "epoch": 0.7045945112550108, + "grad_norm": 3.8429481983184814, + "learning_rate": 2.11934845611351e-06, + "loss": 1.0072, + "step": 9140 + }, + { + "epoch": 0.7046716003700277, + "grad_norm": 3.9448812007904053, + "learning_rate": 2.1183281177534828e-06, + "loss": 0.945, + "step": 9141 + }, + { + "epoch": 0.7047486894850448, + "grad_norm": 3.5992908477783203, + "learning_rate": 2.1173079590507735e-06, + "loss": 0.9431, + "step": 9142 + }, + { + "epoch": 0.7048257786000617, + "grad_norm": 3.7120702266693115, + "learning_rate": 2.1162879800689845e-06, + "loss": 0.7797, + "step": 9143 + }, + { + "epoch": 0.7049028677150786, + "grad_norm": 3.8240060806274414, + "learning_rate": 2.1152681808717033e-06, + "loss": 0.9258, + "step": 9144 + }, + { + "epoch": 0.7049799568300956, + "grad_norm": 3.821179151535034, + "learning_rate": 2.1142485615225104e-06, + "loss": 0.869, + "step": 9145 + }, + { + "epoch": 0.7050570459451125, + "grad_norm": 4.0263142585754395, + "learning_rate": 2.1132291220849743e-06, + "loss": 0.9052, + "step": 9146 + }, + { + "epoch": 0.7051341350601296, + "grad_norm": 3.7679524421691895, + "learning_rate": 2.1122098626226512e-06, + "loss": 1.0384, + "step": 9147 + }, + { + "epoch": 0.7052112241751465, + "grad_norm": 3.6580440998077393, + "learning_rate": 2.111190783199089e-06, + "loss": 0.9052, + "step": 9148 + }, + { + "epoch": 0.7052883132901634, + "grad_norm": 3.7239935398101807, + "learning_rate": 2.110171883877818e-06, + "loss": 0.9718, + "step": 9149 + }, + { + "epoch": 0.7053654024051804, + "grad_norm": 4.641191482543945, + "learning_rate": 2.1091531647223632e-06, + "loss": 1.015, + "step": 9150 + }, + { + "epoch": 0.7054424915201973, + "grad_norm": 3.7391200065612793, + "learning_rate": 2.1081346257962367e-06, + "loss": 0.9439, + "step": 9151 + }, + { + "epoch": 0.7055195806352144, + "grad_norm": 3.7103328704833984, + "learning_rate": 2.1071162671629386e-06, + "loss": 0.9708, + "step": 9152 + }, + { + "epoch": 0.7055966697502313, + "grad_norm": 3.7474148273468018, + "learning_rate": 2.1060980888859608e-06, + "loss": 0.9439, + "step": 9153 + }, + { + "epoch": 0.7056737588652482, + "grad_norm": 3.799514055252075, + "learning_rate": 2.1050800910287774e-06, + "loss": 0.961, + "step": 9154 + }, + { + "epoch": 0.7057508479802652, + "grad_norm": 3.7274961471557617, + "learning_rate": 2.1040622736548566e-06, + "loss": 0.871, + "step": 9155 + }, + { + "epoch": 0.7058279370952821, + "grad_norm": 4.0381669998168945, + "learning_rate": 2.1030446368276547e-06, + "loss": 0.8929, + "step": 9156 + }, + { + "epoch": 0.7059050262102992, + "grad_norm": 3.871047019958496, + "learning_rate": 2.102027180610616e-06, + "loss": 0.9771, + "step": 9157 + }, + { + "epoch": 0.7059821153253161, + "grad_norm": 3.7191429138183594, + "learning_rate": 2.1010099050671745e-06, + "loss": 0.9002, + "step": 9158 + }, + { + "epoch": 0.706059204440333, + "grad_norm": 3.723458766937256, + "learning_rate": 2.099992810260752e-06, + "loss": 0.9915, + "step": 9159 + }, + { + "epoch": 0.70613629355535, + "grad_norm": 4.22058629989624, + "learning_rate": 2.098975896254757e-06, + "loss": 1.0003, + "step": 9160 + }, + { + "epoch": 0.7062133826703669, + "grad_norm": 3.7381343841552734, + "learning_rate": 2.0979591631125896e-06, + "loss": 0.9054, + "step": 9161 + }, + { + "epoch": 0.706290471785384, + "grad_norm": 3.518643379211426, + "learning_rate": 2.096942610897639e-06, + "loss": 0.9081, + "step": 9162 + }, + { + "epoch": 0.7063675609004009, + "grad_norm": 3.9908714294433594, + "learning_rate": 2.0959262396732804e-06, + "loss": 0.9465, + "step": 9163 + }, + { + "epoch": 0.7064446500154178, + "grad_norm": 3.8830153942108154, + "learning_rate": 2.094910049502882e-06, + "loss": 0.8796, + "step": 9164 + }, + { + "epoch": 0.7065217391304348, + "grad_norm": 3.6630706787109375, + "learning_rate": 2.093894040449795e-06, + "loss": 0.8835, + "step": 9165 + }, + { + "epoch": 0.7065988282454517, + "grad_norm": 3.9855260848999023, + "learning_rate": 2.0928782125773636e-06, + "loss": 1.0637, + "step": 9166 + }, + { + "epoch": 0.7066759173604688, + "grad_norm": 3.887150764465332, + "learning_rate": 2.091862565948919e-06, + "loss": 0.9717, + "step": 9167 + }, + { + "epoch": 0.7067530064754857, + "grad_norm": 3.4647603034973145, + "learning_rate": 2.0908471006277816e-06, + "loss": 0.92, + "step": 9168 + }, + { + "epoch": 0.7068300955905026, + "grad_norm": 3.9277379512786865, + "learning_rate": 2.089831816677261e-06, + "loss": 0.9405, + "step": 9169 + }, + { + "epoch": 0.7069071847055196, + "grad_norm": 3.964707136154175, + "learning_rate": 2.088816714160656e-06, + "loss": 1.0286, + "step": 9170 + }, + { + "epoch": 0.7069842738205365, + "grad_norm": 3.6983249187469482, + "learning_rate": 2.0878017931412493e-06, + "loss": 1.0521, + "step": 9171 + }, + { + "epoch": 0.7070613629355536, + "grad_norm": 3.797630548477173, + "learning_rate": 2.0867870536823185e-06, + "loss": 0.9313, + "step": 9172 + }, + { + "epoch": 0.7071384520505705, + "grad_norm": 3.392491340637207, + "learning_rate": 2.0857724958471273e-06, + "loss": 0.742, + "step": 9173 + }, + { + "epoch": 0.7072155411655874, + "grad_norm": 3.4460275173187256, + "learning_rate": 2.0847581196989277e-06, + "loss": 0.8896, + "step": 9174 + }, + { + "epoch": 0.7072926302806044, + "grad_norm": 4.643573760986328, + "learning_rate": 2.0837439253009623e-06, + "loss": 0.9268, + "step": 9175 + }, + { + "epoch": 0.7073697193956213, + "grad_norm": 3.4909539222717285, + "learning_rate": 2.0827299127164574e-06, + "loss": 0.9444, + "step": 9176 + }, + { + "epoch": 0.7074468085106383, + "grad_norm": 3.7072372436523438, + "learning_rate": 2.0817160820086342e-06, + "loss": 0.9454, + "step": 9177 + }, + { + "epoch": 0.7075238976256553, + "grad_norm": 3.618695020675659, + "learning_rate": 2.080702433240699e-06, + "loss": 0.9146, + "step": 9178 + }, + { + "epoch": 0.7076009867406722, + "grad_norm": 3.594953775405884, + "learning_rate": 2.079688966475847e-06, + "loss": 0.9275, + "step": 9179 + }, + { + "epoch": 0.7076780758556892, + "grad_norm": 3.649266004562378, + "learning_rate": 2.078675681777264e-06, + "loss": 0.8821, + "step": 9180 + }, + { + "epoch": 0.7077551649707061, + "grad_norm": 3.623678207397461, + "learning_rate": 2.077662579208124e-06, + "loss": 0.9072, + "step": 9181 + }, + { + "epoch": 0.7078322540857231, + "grad_norm": 3.6807639598846436, + "learning_rate": 2.0766496588315853e-06, + "loss": 0.9662, + "step": 9182 + }, + { + "epoch": 0.7079093432007401, + "grad_norm": 3.5517539978027344, + "learning_rate": 2.0756369207107997e-06, + "loss": 0.8944, + "step": 9183 + }, + { + "epoch": 0.707986432315757, + "grad_norm": 3.9906022548675537, + "learning_rate": 2.0746243649089065e-06, + "loss": 0.8717, + "step": 9184 + }, + { + "epoch": 0.708063521430774, + "grad_norm": 3.7278897762298584, + "learning_rate": 2.0736119914890335e-06, + "loss": 0.9515, + "step": 9185 + }, + { + "epoch": 0.7081406105457909, + "grad_norm": 4.302172660827637, + "learning_rate": 2.072599800514296e-06, + "loss": 0.9035, + "step": 9186 + }, + { + "epoch": 0.7082176996608079, + "grad_norm": 4.022290229797363, + "learning_rate": 2.0715877920478e-06, + "loss": 0.8397, + "step": 9187 + }, + { + "epoch": 0.7082947887758249, + "grad_norm": 3.2679741382598877, + "learning_rate": 2.0705759661526387e-06, + "loss": 0.7864, + "step": 9188 + }, + { + "epoch": 0.7083718778908418, + "grad_norm": 3.836318254470825, + "learning_rate": 2.069564322891894e-06, + "loss": 0.9187, + "step": 9189 + }, + { + "epoch": 0.7084489670058588, + "grad_norm": 4.069839000701904, + "learning_rate": 2.068552862328637e-06, + "loss": 1.0155, + "step": 9190 + }, + { + "epoch": 0.7085260561208757, + "grad_norm": 3.7692060470581055, + "learning_rate": 2.067541584525927e-06, + "loss": 0.8143, + "step": 9191 + }, + { + "epoch": 0.7086031452358927, + "grad_norm": 3.5922646522521973, + "learning_rate": 2.0665304895468114e-06, + "loss": 0.8584, + "step": 9192 + }, + { + "epoch": 0.7086802343509097, + "grad_norm": 3.649376630783081, + "learning_rate": 2.065519577454326e-06, + "loss": 0.9411, + "step": 9193 + }, + { + "epoch": 0.7087573234659266, + "grad_norm": 3.5520145893096924, + "learning_rate": 2.0645088483114974e-06, + "loss": 0.8421, + "step": 9194 + }, + { + "epoch": 0.7088344125809436, + "grad_norm": 4.203880310058594, + "learning_rate": 2.0634983021813385e-06, + "loss": 0.9849, + "step": 9195 + }, + { + "epoch": 0.7089115016959605, + "grad_norm": 3.6634862422943115, + "learning_rate": 2.062487939126854e-06, + "loss": 0.9275, + "step": 9196 + }, + { + "epoch": 0.7089885908109775, + "grad_norm": 3.71562123298645, + "learning_rate": 2.0614777592110306e-06, + "loss": 0.8916, + "step": 9197 + }, + { + "epoch": 0.7090656799259945, + "grad_norm": 3.9342494010925293, + "learning_rate": 2.06046776249685e-06, + "loss": 0.8933, + "step": 9198 + }, + { + "epoch": 0.7091427690410114, + "grad_norm": 3.345992088317871, + "learning_rate": 2.0594579490472803e-06, + "loss": 0.8077, + "step": 9199 + }, + { + "epoch": 0.7092198581560284, + "grad_norm": 3.8320610523223877, + "learning_rate": 2.058448318925278e-06, + "loss": 0.9362, + "step": 9200 + }, + { + "epoch": 0.7092969472710453, + "grad_norm": 3.6400604248046875, + "learning_rate": 2.0574388721937905e-06, + "loss": 0.8823, + "step": 9201 + }, + { + "epoch": 0.7093740363860623, + "grad_norm": 3.9985427856445312, + "learning_rate": 2.056429608915747e-06, + "loss": 1.0019, + "step": 9202 + }, + { + "epoch": 0.7094511255010792, + "grad_norm": 3.3883984088897705, + "learning_rate": 2.0554205291540724e-06, + "loss": 0.8021, + "step": 9203 + }, + { + "epoch": 0.7095282146160962, + "grad_norm": 3.7378809452056885, + "learning_rate": 2.0544116329716773e-06, + "loss": 0.8293, + "step": 9204 + }, + { + "epoch": 0.7096053037311132, + "grad_norm": 3.601414918899536, + "learning_rate": 2.0534029204314613e-06, + "loss": 0.8926, + "step": 9205 + }, + { + "epoch": 0.7096823928461301, + "grad_norm": 3.2823150157928467, + "learning_rate": 2.052394391596313e-06, + "loss": 0.7918, + "step": 9206 + }, + { + "epoch": 0.7097594819611471, + "grad_norm": 4.111335277557373, + "learning_rate": 2.0513860465291097e-06, + "loss": 0.8922, + "step": 9207 + }, + { + "epoch": 0.709836571076164, + "grad_norm": 3.9760074615478516, + "learning_rate": 2.0503778852927134e-06, + "loss": 0.9042, + "step": 9208 + }, + { + "epoch": 0.709913660191181, + "grad_norm": 3.659054756164551, + "learning_rate": 2.0493699079499797e-06, + "loss": 0.8971, + "step": 9209 + }, + { + "epoch": 0.709990749306198, + "grad_norm": 3.8829121589660645, + "learning_rate": 2.04836211456375e-06, + "loss": 0.8574, + "step": 9210 + }, + { + "epoch": 0.7100678384212149, + "grad_norm": 3.769808292388916, + "learning_rate": 2.0473545051968557e-06, + "loss": 0.777, + "step": 9211 + }, + { + "epoch": 0.7101449275362319, + "grad_norm": 3.7595462799072266, + "learning_rate": 2.0463470799121177e-06, + "loss": 0.8837, + "step": 9212 + }, + { + "epoch": 0.7102220166512488, + "grad_norm": 3.825510025024414, + "learning_rate": 2.04533983877234e-06, + "loss": 0.9327, + "step": 9213 + }, + { + "epoch": 0.7102991057662658, + "grad_norm": 3.5481746196746826, + "learning_rate": 2.0443327818403213e-06, + "loss": 0.9789, + "step": 9214 + }, + { + "epoch": 0.7103761948812828, + "grad_norm": 3.591843366622925, + "learning_rate": 2.0433259091788453e-06, + "loss": 0.9067, + "step": 9215 + }, + { + "epoch": 0.7104532839962997, + "grad_norm": 4.088891983032227, + "learning_rate": 2.042319220850686e-06, + "loss": 0.9353, + "step": 9216 + }, + { + "epoch": 0.7105303731113167, + "grad_norm": 3.6502561569213867, + "learning_rate": 2.0413127169186053e-06, + "loss": 0.9367, + "step": 9217 + }, + { + "epoch": 0.7106074622263336, + "grad_norm": 3.281320810317993, + "learning_rate": 2.0403063974453547e-06, + "loss": 0.8194, + "step": 9218 + }, + { + "epoch": 0.7106845513413506, + "grad_norm": 3.4343175888061523, + "learning_rate": 2.03930026249367e-06, + "loss": 0.7826, + "step": 9219 + }, + { + "epoch": 0.7107616404563676, + "grad_norm": 3.537015438079834, + "learning_rate": 2.03829431212628e-06, + "loss": 0.9891, + "step": 9220 + }, + { + "epoch": 0.7108387295713845, + "grad_norm": 3.6004459857940674, + "learning_rate": 2.0372885464059004e-06, + "loss": 0.9879, + "step": 9221 + }, + { + "epoch": 0.7109158186864015, + "grad_norm": 3.736265182495117, + "learning_rate": 2.036282965395236e-06, + "loss": 0.8828, + "step": 9222 + }, + { + "epoch": 0.7109929078014184, + "grad_norm": 3.9425337314605713, + "learning_rate": 2.035277569156981e-06, + "loss": 1.03, + "step": 9223 + }, + { + "epoch": 0.7110699969164354, + "grad_norm": 3.6420586109161377, + "learning_rate": 2.0342723577538125e-06, + "loss": 1.0069, + "step": 9224 + }, + { + "epoch": 0.7111470860314524, + "grad_norm": 3.814761161804199, + "learning_rate": 2.0332673312484037e-06, + "loss": 1.0226, + "step": 9225 + }, + { + "epoch": 0.7112241751464693, + "grad_norm": 3.7190582752227783, + "learning_rate": 2.0322624897034102e-06, + "loss": 0.8863, + "step": 9226 + }, + { + "epoch": 0.7113012642614863, + "grad_norm": 3.9217827320098877, + "learning_rate": 2.0312578331814812e-06, + "loss": 0.9611, + "step": 9227 + }, + { + "epoch": 0.7113783533765032, + "grad_norm": 3.8105664253234863, + "learning_rate": 2.030253361745251e-06, + "loss": 0.9657, + "step": 9228 + }, + { + "epoch": 0.7114554424915202, + "grad_norm": 3.696140766143799, + "learning_rate": 2.0292490754573425e-06, + "loss": 0.9549, + "step": 9229 + }, + { + "epoch": 0.7115325316065372, + "grad_norm": 3.7637147903442383, + "learning_rate": 2.0282449743803684e-06, + "loss": 0.9717, + "step": 9230 + }, + { + "epoch": 0.7116096207215541, + "grad_norm": 3.541837692260742, + "learning_rate": 2.0272410585769284e-06, + "loss": 0.8398, + "step": 9231 + }, + { + "epoch": 0.7116867098365711, + "grad_norm": 3.7558746337890625, + "learning_rate": 2.0262373281096133e-06, + "loss": 0.9397, + "step": 9232 + }, + { + "epoch": 0.711763798951588, + "grad_norm": 3.951739549636841, + "learning_rate": 2.025233783041e-06, + "loss": 1.0248, + "step": 9233 + }, + { + "epoch": 0.711840888066605, + "grad_norm": 3.583740472793579, + "learning_rate": 2.0242304234336525e-06, + "loss": 0.9207, + "step": 9234 + }, + { + "epoch": 0.711917977181622, + "grad_norm": 3.8319308757781982, + "learning_rate": 2.0232272493501253e-06, + "loss": 0.9674, + "step": 9235 + }, + { + "epoch": 0.7119950662966389, + "grad_norm": 3.4977939128875732, + "learning_rate": 2.022224260852963e-06, + "loss": 0.8959, + "step": 9236 + }, + { + "epoch": 0.7120721554116559, + "grad_norm": 3.4424352645874023, + "learning_rate": 2.021221458004695e-06, + "loss": 0.8286, + "step": 9237 + }, + { + "epoch": 0.7121492445266728, + "grad_norm": 3.8004579544067383, + "learning_rate": 2.020218840867842e-06, + "loss": 1.0063, + "step": 9238 + }, + { + "epoch": 0.7122263336416897, + "grad_norm": 3.6114730834960938, + "learning_rate": 2.019216409504913e-06, + "loss": 0.8109, + "step": 9239 + }, + { + "epoch": 0.7123034227567068, + "grad_norm": 3.892796516418457, + "learning_rate": 2.018214163978402e-06, + "loss": 1.0058, + "step": 9240 + }, + { + "epoch": 0.7123805118717237, + "grad_norm": 3.875223159790039, + "learning_rate": 2.0172121043507943e-06, + "loss": 0.9591, + "step": 9241 + }, + { + "epoch": 0.7124576009867407, + "grad_norm": 4.0388665199279785, + "learning_rate": 2.016210230684564e-06, + "loss": 0.9484, + "step": 9242 + }, + { + "epoch": 0.7125346901017576, + "grad_norm": 3.8614680767059326, + "learning_rate": 2.015208543042172e-06, + "loss": 1.0, + "step": 9243 + }, + { + "epoch": 0.7126117792167745, + "grad_norm": 3.793128252029419, + "learning_rate": 2.0142070414860704e-06, + "loss": 0.9818, + "step": 9244 + }, + { + "epoch": 0.7126888683317916, + "grad_norm": 3.489015817642212, + "learning_rate": 2.0132057260786943e-06, + "loss": 0.9183, + "step": 9245 + }, + { + "epoch": 0.7127659574468085, + "grad_norm": 3.5036089420318604, + "learning_rate": 2.012204596882472e-06, + "loss": 0.9126, + "step": 9246 + }, + { + "epoch": 0.7128430465618255, + "grad_norm": 3.7784531116485596, + "learning_rate": 2.011203653959819e-06, + "loss": 0.902, + "step": 9247 + }, + { + "epoch": 0.7129201356768424, + "grad_norm": 3.950711488723755, + "learning_rate": 2.0102028973731393e-06, + "loss": 0.9668, + "step": 9248 + }, + { + "epoch": 0.7129972247918593, + "grad_norm": 3.538501501083374, + "learning_rate": 2.0092023271848254e-06, + "loss": 0.9317, + "step": 9249 + }, + { + "epoch": 0.7130743139068764, + "grad_norm": 3.5862019062042236, + "learning_rate": 2.008201943457255e-06, + "loss": 0.8727, + "step": 9250 + }, + { + "epoch": 0.7131514030218933, + "grad_norm": 3.7320566177368164, + "learning_rate": 2.007201746252799e-06, + "loss": 1.0035, + "step": 9251 + }, + { + "epoch": 0.7132284921369103, + "grad_norm": 3.5628163814544678, + "learning_rate": 2.0062017356338136e-06, + "loss": 0.8501, + "step": 9252 + }, + { + "epoch": 0.7133055812519272, + "grad_norm": 3.4357900619506836, + "learning_rate": 2.0052019116626446e-06, + "loss": 0.867, + "step": 9253 + }, + { + "epoch": 0.7133826703669441, + "grad_norm": 3.7662131786346436, + "learning_rate": 2.0042022744016264e-06, + "loss": 1.0013, + "step": 9254 + }, + { + "epoch": 0.7134597594819612, + "grad_norm": 3.847144842147827, + "learning_rate": 2.0032028239130824e-06, + "loss": 1.0047, + "step": 9255 + }, + { + "epoch": 0.7135368485969781, + "grad_norm": 3.619068145751953, + "learning_rate": 2.00220356025932e-06, + "loss": 0.8234, + "step": 9256 + }, + { + "epoch": 0.7136139377119951, + "grad_norm": 3.6342787742614746, + "learning_rate": 2.001204483502639e-06, + "loss": 0.8697, + "step": 9257 + }, + { + "epoch": 0.713691026827012, + "grad_norm": 3.920424222946167, + "learning_rate": 2.000205593705328e-06, + "loss": 1.0292, + "step": 9258 + }, + { + "epoch": 0.7137681159420289, + "grad_norm": 3.7946994304656982, + "learning_rate": 1.9992068909296607e-06, + "loss": 0.8509, + "step": 9259 + }, + { + "epoch": 0.713845205057046, + "grad_norm": 4.217837810516357, + "learning_rate": 1.998208375237905e-06, + "loss": 0.9054, + "step": 9260 + }, + { + "epoch": 0.7139222941720629, + "grad_norm": 4.02250862121582, + "learning_rate": 1.9972100466923083e-06, + "loss": 0.8969, + "step": 9261 + }, + { + "epoch": 0.7139993832870799, + "grad_norm": 3.8427062034606934, + "learning_rate": 1.996211905355114e-06, + "loss": 0.8616, + "step": 9262 + }, + { + "epoch": 0.7140764724020968, + "grad_norm": 3.860753059387207, + "learning_rate": 1.9952139512885497e-06, + "loss": 1.0435, + "step": 9263 + }, + { + "epoch": 0.7141535615171137, + "grad_norm": 3.5810599327087402, + "learning_rate": 1.9942161845548334e-06, + "loss": 0.8754, + "step": 9264 + }, + { + "epoch": 0.7142306506321308, + "grad_norm": 3.545610189437866, + "learning_rate": 1.993218605216171e-06, + "loss": 0.8673, + "step": 9265 + }, + { + "epoch": 0.7143077397471477, + "grad_norm": 3.7623043060302734, + "learning_rate": 1.9922212133347575e-06, + "loss": 1.0008, + "step": 9266 + }, + { + "epoch": 0.7143848288621647, + "grad_norm": 3.5907156467437744, + "learning_rate": 1.991224008972772e-06, + "loss": 0.9327, + "step": 9267 + }, + { + "epoch": 0.7144619179771816, + "grad_norm": 3.638535261154175, + "learning_rate": 1.9902269921923867e-06, + "loss": 0.924, + "step": 9268 + }, + { + "epoch": 0.7145390070921985, + "grad_norm": 3.881423234939575, + "learning_rate": 1.9892301630557604e-06, + "loss": 0.9654, + "step": 9269 + }, + { + "epoch": 0.7146160962072156, + "grad_norm": 3.544396162033081, + "learning_rate": 1.9882335216250402e-06, + "loss": 0.8691, + "step": 9270 + }, + { + "epoch": 0.7146931853222325, + "grad_norm": 3.863416910171509, + "learning_rate": 1.987237067962363e-06, + "loss": 0.8315, + "step": 9271 + }, + { + "epoch": 0.7147702744372495, + "grad_norm": 3.7197678089141846, + "learning_rate": 1.9862408021298503e-06, + "loss": 0.9305, + "step": 9272 + }, + { + "epoch": 0.7148473635522664, + "grad_norm": 3.4291746616363525, + "learning_rate": 1.9852447241896122e-06, + "loss": 0.8485, + "step": 9273 + }, + { + "epoch": 0.7149244526672833, + "grad_norm": 3.7952208518981934, + "learning_rate": 1.984248834203754e-06, + "loss": 0.8727, + "step": 9274 + }, + { + "epoch": 0.7150015417823004, + "grad_norm": 3.9852194786071777, + "learning_rate": 1.9832531322343617e-06, + "loss": 0.9672, + "step": 9275 + }, + { + "epoch": 0.7150786308973173, + "grad_norm": 3.7083189487457275, + "learning_rate": 1.982257618343515e-06, + "loss": 0.8642, + "step": 9276 + }, + { + "epoch": 0.7151557200123343, + "grad_norm": 3.6518466472625732, + "learning_rate": 1.981262292593274e-06, + "loss": 0.8827, + "step": 9277 + }, + { + "epoch": 0.7152328091273512, + "grad_norm": 3.622223138809204, + "learning_rate": 1.9802671550456948e-06, + "loss": 0.8975, + "step": 9278 + }, + { + "epoch": 0.7153098982423681, + "grad_norm": 3.709808349609375, + "learning_rate": 1.979272205762819e-06, + "loss": 0.9319, + "step": 9279 + }, + { + "epoch": 0.7153869873573852, + "grad_norm": 3.7233026027679443, + "learning_rate": 1.978277444806676e-06, + "loss": 0.855, + "step": 9280 + }, + { + "epoch": 0.7154640764724021, + "grad_norm": 3.8098905086517334, + "learning_rate": 1.9772828722392866e-06, + "loss": 0.895, + "step": 9281 + }, + { + "epoch": 0.7155411655874191, + "grad_norm": 3.943906545639038, + "learning_rate": 1.9762884881226535e-06, + "loss": 0.8825, + "step": 9282 + }, + { + "epoch": 0.715618254702436, + "grad_norm": 3.8374009132385254, + "learning_rate": 1.9752942925187725e-06, + "loss": 0.7061, + "step": 9283 + }, + { + "epoch": 0.7156953438174529, + "grad_norm": 3.700810194015503, + "learning_rate": 1.974300285489627e-06, + "loss": 0.9142, + "step": 9284 + }, + { + "epoch": 0.71577243293247, + "grad_norm": 3.885406494140625, + "learning_rate": 1.9733064670971886e-06, + "loss": 0.9404, + "step": 9285 + }, + { + "epoch": 0.7158495220474869, + "grad_norm": 3.8215103149414062, + "learning_rate": 1.972312837403416e-06, + "loss": 0.9591, + "step": 9286 + }, + { + "epoch": 0.7159266111625039, + "grad_norm": 3.734654426574707, + "learning_rate": 1.971319396470259e-06, + "loss": 0.9052, + "step": 9287 + }, + { + "epoch": 0.7160037002775208, + "grad_norm": 3.6638259887695312, + "learning_rate": 1.97032614435965e-06, + "loss": 1.0389, + "step": 9288 + }, + { + "epoch": 0.7160807893925377, + "grad_norm": 3.528590440750122, + "learning_rate": 1.969333081133515e-06, + "loss": 0.8955, + "step": 9289 + }, + { + "epoch": 0.7161578785075547, + "grad_norm": 3.670738458633423, + "learning_rate": 1.9683402068537654e-06, + "loss": 1.0466, + "step": 9290 + }, + { + "epoch": 0.7162349676225717, + "grad_norm": 3.703134775161743, + "learning_rate": 1.9673475215823035e-06, + "loss": 0.9594, + "step": 9291 + }, + { + "epoch": 0.7163120567375887, + "grad_norm": 4.054429054260254, + "learning_rate": 1.966355025381018e-06, + "loss": 0.9606, + "step": 9292 + }, + { + "epoch": 0.7163891458526056, + "grad_norm": 3.620304584503174, + "learning_rate": 1.965362718311784e-06, + "loss": 0.8857, + "step": 9293 + }, + { + "epoch": 0.7164662349676225, + "grad_norm": 3.4398069381713867, + "learning_rate": 1.9643706004364675e-06, + "loss": 0.7762, + "step": 9294 + }, + { + "epoch": 0.7165433240826395, + "grad_norm": 3.881047487258911, + "learning_rate": 1.9633786718169217e-06, + "loss": 0.9335, + "step": 9295 + }, + { + "epoch": 0.7166204131976565, + "grad_norm": 3.718986749649048, + "learning_rate": 1.9623869325149893e-06, + "loss": 0.8663, + "step": 9296 + }, + { + "epoch": 0.7166975023126735, + "grad_norm": 3.492274522781372, + "learning_rate": 1.9613953825925008e-06, + "loss": 0.9198, + "step": 9297 + }, + { + "epoch": 0.7167745914276904, + "grad_norm": 3.6691811084747314, + "learning_rate": 1.960404022111271e-06, + "loss": 0.8376, + "step": 9298 + }, + { + "epoch": 0.7168516805427073, + "grad_norm": 3.32060170173645, + "learning_rate": 1.9594128511331082e-06, + "loss": 0.7495, + "step": 9299 + }, + { + "epoch": 0.7169287696577243, + "grad_norm": 3.6289753913879395, + "learning_rate": 1.958421869719807e-06, + "loss": 0.9179, + "step": 9300 + }, + { + "epoch": 0.7170058587727413, + "grad_norm": 3.9919261932373047, + "learning_rate": 1.957431077933149e-06, + "loss": 0.8687, + "step": 9301 + }, + { + "epoch": 0.7170829478877583, + "grad_norm": 4.032829761505127, + "learning_rate": 1.9564404758349055e-06, + "loss": 0.9729, + "step": 9302 + }, + { + "epoch": 0.7171600370027752, + "grad_norm": 3.717048406600952, + "learning_rate": 1.955450063486837e-06, + "loss": 0.9274, + "step": 9303 + }, + { + "epoch": 0.7172371261177922, + "grad_norm": 3.426722288131714, + "learning_rate": 1.954459840950687e-06, + "loss": 0.8553, + "step": 9304 + }, + { + "epoch": 0.7173142152328091, + "grad_norm": 3.5215723514556885, + "learning_rate": 1.9534698082881926e-06, + "loss": 0.9, + "step": 9305 + }, + { + "epoch": 0.717391304347826, + "grad_norm": 3.6140124797821045, + "learning_rate": 1.9524799655610776e-06, + "loss": 0.8674, + "step": 9306 + }, + { + "epoch": 0.7174683934628431, + "grad_norm": 3.1654090881347656, + "learning_rate": 1.951490312831053e-06, + "loss": 0.7923, + "step": 9307 + }, + { + "epoch": 0.71754548257786, + "grad_norm": 3.4995226860046387, + "learning_rate": 1.9505008501598204e-06, + "loss": 0.9468, + "step": 9308 + }, + { + "epoch": 0.717622571692877, + "grad_norm": 3.9290666580200195, + "learning_rate": 1.949511577609065e-06, + "loss": 0.8838, + "step": 9309 + }, + { + "epoch": 0.7176996608078939, + "grad_norm": 4.191568851470947, + "learning_rate": 1.948522495240463e-06, + "loss": 0.8792, + "step": 9310 + }, + { + "epoch": 0.7177767499229109, + "grad_norm": 4.151352882385254, + "learning_rate": 1.94753360311568e-06, + "loss": 0.9023, + "step": 9311 + }, + { + "epoch": 0.7178538390379279, + "grad_norm": 3.6860830783843994, + "learning_rate": 1.946544901296367e-06, + "loss": 0.9006, + "step": 9312 + }, + { + "epoch": 0.7179309281529448, + "grad_norm": 3.961071491241455, + "learning_rate": 1.945556389844166e-06, + "loss": 0.8171, + "step": 9313 + }, + { + "epoch": 0.7180080172679618, + "grad_norm": 3.5512242317199707, + "learning_rate": 1.9445680688207065e-06, + "loss": 1.0231, + "step": 9314 + }, + { + "epoch": 0.7180851063829787, + "grad_norm": 4.143185615539551, + "learning_rate": 1.943579938287601e-06, + "loss": 0.9645, + "step": 9315 + }, + { + "epoch": 0.7181621954979956, + "grad_norm": 3.999950885772705, + "learning_rate": 1.942591998306457e-06, + "loss": 0.9124, + "step": 9316 + }, + { + "epoch": 0.7182392846130127, + "grad_norm": 3.6654207706451416, + "learning_rate": 1.941604248938867e-06, + "loss": 0.8595, + "step": 9317 + }, + { + "epoch": 0.7183163737280296, + "grad_norm": 4.063483238220215, + "learning_rate": 1.9406166902464128e-06, + "loss": 0.9909, + "step": 9318 + }, + { + "epoch": 0.7183934628430466, + "grad_norm": 3.7724883556365967, + "learning_rate": 1.9396293222906626e-06, + "loss": 0.9458, + "step": 9319 + }, + { + "epoch": 0.7184705519580635, + "grad_norm": 3.940067768096924, + "learning_rate": 1.9386421451331737e-06, + "loss": 0.9825, + "step": 9320 + }, + { + "epoch": 0.7185476410730804, + "grad_norm": 3.5647549629211426, + "learning_rate": 1.9376551588354924e-06, + "loss": 0.9541, + "step": 9321 + }, + { + "epoch": 0.7186247301880975, + "grad_norm": 3.541923761367798, + "learning_rate": 1.936668363459152e-06, + "loss": 0.9573, + "step": 9322 + }, + { + "epoch": 0.7187018193031144, + "grad_norm": 3.8034353256225586, + "learning_rate": 1.9356817590656734e-06, + "loss": 0.7969, + "step": 9323 + }, + { + "epoch": 0.7187789084181314, + "grad_norm": 3.5387001037597656, + "learning_rate": 1.934695345716568e-06, + "loss": 0.8029, + "step": 9324 + }, + { + "epoch": 0.7188559975331483, + "grad_norm": 3.763958215713501, + "learning_rate": 1.933709123473331e-06, + "loss": 0.8748, + "step": 9325 + }, + { + "epoch": 0.7189330866481652, + "grad_norm": 3.617594003677368, + "learning_rate": 1.9327230923974487e-06, + "loss": 0.8477, + "step": 9326 + }, + { + "epoch": 0.7190101757631823, + "grad_norm": 3.5570931434631348, + "learning_rate": 1.931737252550396e-06, + "loss": 0.8856, + "step": 9327 + }, + { + "epoch": 0.7190872648781992, + "grad_norm": 3.4663596153259277, + "learning_rate": 1.9307516039936354e-06, + "loss": 0.874, + "step": 9328 + }, + { + "epoch": 0.7191643539932162, + "grad_norm": 3.960660696029663, + "learning_rate": 1.9297661467886177e-06, + "loss": 0.9692, + "step": 9329 + }, + { + "epoch": 0.7192414431082331, + "grad_norm": 3.4713823795318604, + "learning_rate": 1.928780880996777e-06, + "loss": 0.8727, + "step": 9330 + }, + { + "epoch": 0.71931853222325, + "grad_norm": 3.68634295463562, + "learning_rate": 1.9277958066795426e-06, + "loss": 0.8905, + "step": 9331 + }, + { + "epoch": 0.7193956213382671, + "grad_norm": 3.8657889366149902, + "learning_rate": 1.9268109238983287e-06, + "loss": 0.9138, + "step": 9332 + }, + { + "epoch": 0.719472710453284, + "grad_norm": 3.750303268432617, + "learning_rate": 1.925826232714537e-06, + "loss": 0.9105, + "step": 9333 + }, + { + "epoch": 0.719549799568301, + "grad_norm": 3.9123423099517822, + "learning_rate": 1.924841733189558e-06, + "loss": 0.8826, + "step": 9334 + }, + { + "epoch": 0.7196268886833179, + "grad_norm": 3.7364614009857178, + "learning_rate": 1.923857425384772e-06, + "loss": 0.9367, + "step": 9335 + }, + { + "epoch": 0.7197039777983348, + "grad_norm": 4.046642303466797, + "learning_rate": 1.922873309361542e-06, + "loss": 1.0428, + "step": 9336 + }, + { + "epoch": 0.7197810669133519, + "grad_norm": 3.7967915534973145, + "learning_rate": 1.921889385181225e-06, + "loss": 0.9271, + "step": 9337 + }, + { + "epoch": 0.7198581560283688, + "grad_norm": 4.126679420471191, + "learning_rate": 1.9209056529051617e-06, + "loss": 0.9439, + "step": 9338 + }, + { + "epoch": 0.7199352451433858, + "grad_norm": 3.514425039291382, + "learning_rate": 1.9199221125946847e-06, + "loss": 0.8919, + "step": 9339 + }, + { + "epoch": 0.7200123342584027, + "grad_norm": 3.4573843479156494, + "learning_rate": 1.9189387643111135e-06, + "loss": 0.8665, + "step": 9340 + }, + { + "epoch": 0.7200894233734196, + "grad_norm": 4.3528242111206055, + "learning_rate": 1.9179556081157513e-06, + "loss": 0.9908, + "step": 9341 + }, + { + "epoch": 0.7201665124884367, + "grad_norm": 3.473971366882324, + "learning_rate": 1.9169726440698945e-06, + "loss": 0.8575, + "step": 9342 + }, + { + "epoch": 0.7202436016034536, + "grad_norm": 3.958357334136963, + "learning_rate": 1.9159898722348264e-06, + "loss": 0.9238, + "step": 9343 + }, + { + "epoch": 0.7203206907184706, + "grad_norm": 3.5286219120025635, + "learning_rate": 1.9150072926718166e-06, + "loss": 0.9294, + "step": 9344 + }, + { + "epoch": 0.7203977798334875, + "grad_norm": 3.8079938888549805, + "learning_rate": 1.914024905442127e-06, + "loss": 0.9186, + "step": 9345 + }, + { + "epoch": 0.7204748689485044, + "grad_norm": 3.7631888389587402, + "learning_rate": 1.9130427106069993e-06, + "loss": 0.9976, + "step": 9346 + }, + { + "epoch": 0.7205519580635215, + "grad_norm": 3.6607329845428467, + "learning_rate": 1.912060708227671e-06, + "loss": 0.9171, + "step": 9347 + }, + { + "epoch": 0.7206290471785384, + "grad_norm": 3.550433874130249, + "learning_rate": 1.911078898365365e-06, + "loss": 0.8487, + "step": 9348 + }, + { + "epoch": 0.7207061362935554, + "grad_norm": 3.631561040878296, + "learning_rate": 1.9100972810812918e-06, + "loss": 0.9102, + "step": 9349 + }, + { + "epoch": 0.7207832254085723, + "grad_norm": 4.037540435791016, + "learning_rate": 1.90911585643665e-06, + "loss": 0.9158, + "step": 9350 + }, + { + "epoch": 0.7208603145235892, + "grad_norm": 4.127115249633789, + "learning_rate": 1.908134624492628e-06, + "loss": 0.9296, + "step": 9351 + }, + { + "epoch": 0.7209374036386063, + "grad_norm": 4.0273613929748535, + "learning_rate": 1.9071535853103978e-06, + "loss": 0.9744, + "step": 9352 + }, + { + "epoch": 0.7210144927536232, + "grad_norm": 4.10391092300415, + "learning_rate": 1.9061727389511226e-06, + "loss": 0.8842, + "step": 9353 + }, + { + "epoch": 0.7210915818686402, + "grad_norm": 3.6492302417755127, + "learning_rate": 1.9051920854759543e-06, + "loss": 0.9429, + "step": 9354 + }, + { + "epoch": 0.7211686709836571, + "grad_norm": 3.974447011947632, + "learning_rate": 1.9042116249460307e-06, + "loss": 0.9477, + "step": 9355 + }, + { + "epoch": 0.721245760098674, + "grad_norm": 3.8912482261657715, + "learning_rate": 1.903231357422481e-06, + "loss": 0.8826, + "step": 9356 + }, + { + "epoch": 0.721322849213691, + "grad_norm": 3.689361810684204, + "learning_rate": 1.9022512829664153e-06, + "loss": 0.9352, + "step": 9357 + }, + { + "epoch": 0.721399938328708, + "grad_norm": 3.682281732559204, + "learning_rate": 1.9012714016389388e-06, + "loss": 0.8895, + "step": 9358 + }, + { + "epoch": 0.721477027443725, + "grad_norm": 3.9238123893737793, + "learning_rate": 1.9002917135011413e-06, + "loss": 0.9899, + "step": 9359 + }, + { + "epoch": 0.7215541165587419, + "grad_norm": 3.783236503601074, + "learning_rate": 1.899312218614102e-06, + "loss": 0.9622, + "step": 9360 + }, + { + "epoch": 0.7216312056737588, + "grad_norm": 3.7680461406707764, + "learning_rate": 1.898332917038887e-06, + "loss": 0.9219, + "step": 9361 + }, + { + "epoch": 0.7217082947887759, + "grad_norm": 3.3629398345947266, + "learning_rate": 1.8973538088365507e-06, + "loss": 0.8667, + "step": 9362 + }, + { + "epoch": 0.7217853839037928, + "grad_norm": 3.6810696125030518, + "learning_rate": 1.8963748940681349e-06, + "loss": 1.0191, + "step": 9363 + }, + { + "epoch": 0.7218624730188098, + "grad_norm": 4.012876987457275, + "learning_rate": 1.8953961727946706e-06, + "loss": 0.8726, + "step": 9364 + }, + { + "epoch": 0.7219395621338267, + "grad_norm": 3.692685842514038, + "learning_rate": 1.8944176450771761e-06, + "loss": 0.8478, + "step": 9365 + }, + { + "epoch": 0.7220166512488436, + "grad_norm": 3.4944231510162354, + "learning_rate": 1.893439310976659e-06, + "loss": 0.8727, + "step": 9366 + }, + { + "epoch": 0.7220937403638606, + "grad_norm": 3.8247053623199463, + "learning_rate": 1.8924611705541095e-06, + "loss": 0.9374, + "step": 9367 + }, + { + "epoch": 0.7221708294788776, + "grad_norm": 4.079497814178467, + "learning_rate": 1.8914832238705117e-06, + "loss": 0.9631, + "step": 9368 + }, + { + "epoch": 0.7222479185938946, + "grad_norm": 4.07694149017334, + "learning_rate": 1.8905054709868354e-06, + "loss": 1.0642, + "step": 9369 + }, + { + "epoch": 0.7223250077089115, + "grad_norm": 3.5892646312713623, + "learning_rate": 1.8895279119640387e-06, + "loss": 0.9717, + "step": 9370 + }, + { + "epoch": 0.7224020968239284, + "grad_norm": 3.906541347503662, + "learning_rate": 1.8885505468630673e-06, + "loss": 0.9333, + "step": 9371 + }, + { + "epoch": 0.7224791859389454, + "grad_norm": 4.1830830574035645, + "learning_rate": 1.887573375744856e-06, + "loss": 0.993, + "step": 9372 + }, + { + "epoch": 0.7225562750539624, + "grad_norm": 3.726923942565918, + "learning_rate": 1.8865963986703234e-06, + "loss": 0.9537, + "step": 9373 + }, + { + "epoch": 0.7226333641689794, + "grad_norm": 3.74369215965271, + "learning_rate": 1.885619615700381e-06, + "loss": 0.9731, + "step": 9374 + }, + { + "epoch": 0.7227104532839963, + "grad_norm": 4.148539066314697, + "learning_rate": 1.8846430268959253e-06, + "loss": 1.0761, + "step": 9375 + }, + { + "epoch": 0.7227875423990132, + "grad_norm": 3.4325528144836426, + "learning_rate": 1.883666632317842e-06, + "loss": 0.846, + "step": 9376 + }, + { + "epoch": 0.7228646315140302, + "grad_norm": 3.434077024459839, + "learning_rate": 1.8826904320270068e-06, + "loss": 0.7253, + "step": 9377 + }, + { + "epoch": 0.7229417206290472, + "grad_norm": 3.7879035472869873, + "learning_rate": 1.8817144260842757e-06, + "loss": 0.8692, + "step": 9378 + }, + { + "epoch": 0.7230188097440642, + "grad_norm": 3.6962223052978516, + "learning_rate": 1.8807386145505002e-06, + "loss": 0.8621, + "step": 9379 + }, + { + "epoch": 0.7230958988590811, + "grad_norm": 3.504754066467285, + "learning_rate": 1.8797629974865172e-06, + "loss": 0.8153, + "step": 9380 + }, + { + "epoch": 0.723172987974098, + "grad_norm": 3.4584434032440186, + "learning_rate": 1.8787875749531509e-06, + "loss": 0.9573, + "step": 9381 + }, + { + "epoch": 0.723250077089115, + "grad_norm": 3.89050555229187, + "learning_rate": 1.8778123470112141e-06, + "loss": 0.8954, + "step": 9382 + }, + { + "epoch": 0.723327166204132, + "grad_norm": 3.5211403369903564, + "learning_rate": 1.8768373137215096e-06, + "loss": 0.8079, + "step": 9383 + }, + { + "epoch": 0.723404255319149, + "grad_norm": 3.81539249420166, + "learning_rate": 1.8758624751448213e-06, + "loss": 0.9524, + "step": 9384 + }, + { + "epoch": 0.7234813444341659, + "grad_norm": 3.4943387508392334, + "learning_rate": 1.8748878313419271e-06, + "loss": 0.8294, + "step": 9385 + }, + { + "epoch": 0.7235584335491828, + "grad_norm": 3.9146976470947266, + "learning_rate": 1.873913382373591e-06, + "loss": 0.9008, + "step": 9386 + }, + { + "epoch": 0.7236355226641998, + "grad_norm": 3.7792716026306152, + "learning_rate": 1.872939128300566e-06, + "loss": 1.0432, + "step": 9387 + }, + { + "epoch": 0.7237126117792168, + "grad_norm": 4.022826671600342, + "learning_rate": 1.8719650691835917e-06, + "loss": 0.9682, + "step": 9388 + }, + { + "epoch": 0.7237897008942338, + "grad_norm": 3.6135926246643066, + "learning_rate": 1.8709912050833933e-06, + "loss": 0.8809, + "step": 9389 + }, + { + "epoch": 0.7238667900092507, + "grad_norm": 3.3770127296447754, + "learning_rate": 1.8700175360606882e-06, + "loss": 0.8576, + "step": 9390 + }, + { + "epoch": 0.7239438791242676, + "grad_norm": 3.8401501178741455, + "learning_rate": 1.869044062176179e-06, + "loss": 0.975, + "step": 9391 + }, + { + "epoch": 0.7240209682392846, + "grad_norm": 3.574070453643799, + "learning_rate": 1.8680707834905565e-06, + "loss": 0.8732, + "step": 9392 + }, + { + "epoch": 0.7240980573543015, + "grad_norm": 3.733696699142456, + "learning_rate": 1.8670977000645018e-06, + "loss": 1.0079, + "step": 9393 + }, + { + "epoch": 0.7241751464693186, + "grad_norm": 3.893634557723999, + "learning_rate": 1.8661248119586784e-06, + "loss": 0.9188, + "step": 9394 + }, + { + "epoch": 0.7242522355843355, + "grad_norm": 3.5450663566589355, + "learning_rate": 1.865152119233742e-06, + "loss": 0.9227, + "step": 9395 + }, + { + "epoch": 0.7243293246993524, + "grad_norm": 4.02310848236084, + "learning_rate": 1.864179621950335e-06, + "loss": 0.8968, + "step": 9396 + }, + { + "epoch": 0.7244064138143694, + "grad_norm": 3.913698434829712, + "learning_rate": 1.8632073201690882e-06, + "loss": 0.8523, + "step": 9397 + }, + { + "epoch": 0.7244835029293863, + "grad_norm": 3.8382716178894043, + "learning_rate": 1.8622352139506184e-06, + "loss": 0.8597, + "step": 9398 + }, + { + "epoch": 0.7245605920444034, + "grad_norm": 4.016849040985107, + "learning_rate": 1.8612633033555345e-06, + "loss": 0.7716, + "step": 9399 + }, + { + "epoch": 0.7246376811594203, + "grad_norm": 3.819669485092163, + "learning_rate": 1.8602915884444257e-06, + "loss": 0.9032, + "step": 9400 + }, + { + "epoch": 0.7247147702744372, + "grad_norm": 4.067957878112793, + "learning_rate": 1.859320069277875e-06, + "loss": 0.9601, + "step": 9401 + }, + { + "epoch": 0.7247918593894542, + "grad_norm": 3.471204996109009, + "learning_rate": 1.8583487459164528e-06, + "loss": 0.9013, + "step": 9402 + }, + { + "epoch": 0.7248689485044711, + "grad_norm": 3.366598606109619, + "learning_rate": 1.8573776184207148e-06, + "loss": 0.9391, + "step": 9403 + }, + { + "epoch": 0.7249460376194882, + "grad_norm": 3.657421827316284, + "learning_rate": 1.8564066868512082e-06, + "loss": 0.8154, + "step": 9404 + }, + { + "epoch": 0.7250231267345051, + "grad_norm": 3.919306755065918, + "learning_rate": 1.8554359512684617e-06, + "loss": 0.9312, + "step": 9405 + }, + { + "epoch": 0.725100215849522, + "grad_norm": 3.4179515838623047, + "learning_rate": 1.8544654117329958e-06, + "loss": 0.8741, + "step": 9406 + }, + { + "epoch": 0.725177304964539, + "grad_norm": 3.9625890254974365, + "learning_rate": 1.8534950683053215e-06, + "loss": 0.8815, + "step": 9407 + }, + { + "epoch": 0.7252543940795559, + "grad_norm": 3.6159121990203857, + "learning_rate": 1.8525249210459345e-06, + "loss": 0.8737, + "step": 9408 + }, + { + "epoch": 0.725331483194573, + "grad_norm": 3.8844573497772217, + "learning_rate": 1.8515549700153185e-06, + "loss": 0.8763, + "step": 9409 + }, + { + "epoch": 0.7254085723095899, + "grad_norm": 3.9960150718688965, + "learning_rate": 1.8505852152739423e-06, + "loss": 1.0845, + "step": 9410 + }, + { + "epoch": 0.7254856614246068, + "grad_norm": 3.7577638626098633, + "learning_rate": 1.849615656882267e-06, + "loss": 1.0138, + "step": 9411 + }, + { + "epoch": 0.7255627505396238, + "grad_norm": 3.612480401992798, + "learning_rate": 1.8486462949007388e-06, + "loss": 0.9291, + "step": 9412 + }, + { + "epoch": 0.7256398396546407, + "grad_norm": 3.4609127044677734, + "learning_rate": 1.8476771293897932e-06, + "loss": 0.9554, + "step": 9413 + }, + { + "epoch": 0.7257169287696578, + "grad_norm": 3.4932973384857178, + "learning_rate": 1.846708160409854e-06, + "loss": 0.96, + "step": 9414 + }, + { + "epoch": 0.7257940178846747, + "grad_norm": 3.711120367050171, + "learning_rate": 1.8457393880213282e-06, + "loss": 0.9477, + "step": 9415 + }, + { + "epoch": 0.7258711069996916, + "grad_norm": 3.978654384613037, + "learning_rate": 1.8447708122846148e-06, + "loss": 0.9769, + "step": 9416 + }, + { + "epoch": 0.7259481961147086, + "grad_norm": 3.9434118270874023, + "learning_rate": 1.8438024332601002e-06, + "loss": 0.9147, + "step": 9417 + }, + { + "epoch": 0.7260252852297255, + "grad_norm": 3.865705966949463, + "learning_rate": 1.8428342510081571e-06, + "loss": 0.9856, + "step": 9418 + }, + { + "epoch": 0.7261023743447426, + "grad_norm": 3.8399107456207275, + "learning_rate": 1.8418662655891472e-06, + "loss": 0.9694, + "step": 9419 + }, + { + "epoch": 0.7261794634597595, + "grad_norm": 3.621166467666626, + "learning_rate": 1.8408984770634209e-06, + "loss": 0.9042, + "step": 9420 + }, + { + "epoch": 0.7262565525747764, + "grad_norm": 3.5802974700927734, + "learning_rate": 1.8399308854913118e-06, + "loss": 0.8997, + "step": 9421 + }, + { + "epoch": 0.7263336416897934, + "grad_norm": 3.3973002433776855, + "learning_rate": 1.8389634909331449e-06, + "loss": 0.8711, + "step": 9422 + }, + { + "epoch": 0.7264107308048103, + "grad_norm": 3.4591875076293945, + "learning_rate": 1.8379962934492335e-06, + "loss": 0.8588, + "step": 9423 + }, + { + "epoch": 0.7264878199198274, + "grad_norm": 3.682715654373169, + "learning_rate": 1.8370292930998768e-06, + "loss": 0.859, + "step": 9424 + }, + { + "epoch": 0.7265649090348443, + "grad_norm": 3.5783348083496094, + "learning_rate": 1.8360624899453638e-06, + "loss": 0.7877, + "step": 9425 + }, + { + "epoch": 0.7266419981498612, + "grad_norm": 3.5001380443573, + "learning_rate": 1.8350958840459665e-06, + "loss": 0.8595, + "step": 9426 + }, + { + "epoch": 0.7267190872648782, + "grad_norm": 3.755657434463501, + "learning_rate": 1.8341294754619487e-06, + "loss": 0.9371, + "step": 9427 + }, + { + "epoch": 0.7267961763798951, + "grad_norm": 4.00222635269165, + "learning_rate": 1.8331632642535623e-06, + "loss": 0.9562, + "step": 9428 + }, + { + "epoch": 0.7268732654949122, + "grad_norm": 3.519622802734375, + "learning_rate": 1.8321972504810448e-06, + "loss": 0.9217, + "step": 9429 + }, + { + "epoch": 0.7269503546099291, + "grad_norm": 3.830078363418579, + "learning_rate": 1.8312314342046222e-06, + "loss": 0.9358, + "step": 9430 + }, + { + "epoch": 0.727027443724946, + "grad_norm": 3.737502336502075, + "learning_rate": 1.8302658154845099e-06, + "loss": 0.9166, + "step": 9431 + }, + { + "epoch": 0.727104532839963, + "grad_norm": 3.694373369216919, + "learning_rate": 1.8293003943809062e-06, + "loss": 0.8676, + "step": 9432 + }, + { + "epoch": 0.7271816219549799, + "grad_norm": 3.707840919494629, + "learning_rate": 1.828335170954001e-06, + "loss": 0.8953, + "step": 9433 + }, + { + "epoch": 0.727258711069997, + "grad_norm": 3.8381030559539795, + "learning_rate": 1.8273701452639713e-06, + "loss": 0.9618, + "step": 9434 + }, + { + "epoch": 0.7273358001850139, + "grad_norm": 3.7728703022003174, + "learning_rate": 1.8264053173709817e-06, + "loss": 0.8564, + "step": 9435 + }, + { + "epoch": 0.7274128893000308, + "grad_norm": 3.9917378425598145, + "learning_rate": 1.825440687335186e-06, + "loss": 1.0076, + "step": 9436 + }, + { + "epoch": 0.7274899784150478, + "grad_norm": 3.8551135063171387, + "learning_rate": 1.82447625521672e-06, + "loss": 0.9929, + "step": 9437 + }, + { + "epoch": 0.7275670675300647, + "grad_norm": 3.5121772289276123, + "learning_rate": 1.8235120210757134e-06, + "loss": 0.8955, + "step": 9438 + }, + { + "epoch": 0.7276441566450818, + "grad_norm": 4.1116509437561035, + "learning_rate": 1.8225479849722804e-06, + "loss": 0.9207, + "step": 9439 + }, + { + "epoch": 0.7277212457600987, + "grad_norm": 3.6008291244506836, + "learning_rate": 1.8215841469665247e-06, + "loss": 0.9125, + "step": 9440 + }, + { + "epoch": 0.7277983348751156, + "grad_norm": 3.791553020477295, + "learning_rate": 1.8206205071185373e-06, + "loss": 0.8539, + "step": 9441 + }, + { + "epoch": 0.7278754239901326, + "grad_norm": 3.53488826751709, + "learning_rate": 1.8196570654883932e-06, + "loss": 0.8545, + "step": 9442 + }, + { + "epoch": 0.7279525131051495, + "grad_norm": 3.721651554107666, + "learning_rate": 1.8186938221361594e-06, + "loss": 1.0581, + "step": 9443 + }, + { + "epoch": 0.7280296022201665, + "grad_norm": 3.7198684215545654, + "learning_rate": 1.8177307771218894e-06, + "loss": 0.8781, + "step": 9444 + }, + { + "epoch": 0.7281066913351835, + "grad_norm": 3.6171069145202637, + "learning_rate": 1.8167679305056247e-06, + "loss": 0.9443, + "step": 9445 + }, + { + "epoch": 0.7281837804502004, + "grad_norm": 3.718351125717163, + "learning_rate": 1.8158052823473927e-06, + "loss": 0.9222, + "step": 9446 + }, + { + "epoch": 0.7282608695652174, + "grad_norm": 3.887049913406372, + "learning_rate": 1.8148428327072114e-06, + "loss": 0.9062, + "step": 9447 + }, + { + "epoch": 0.7283379586802343, + "grad_norm": 3.688441514968872, + "learning_rate": 1.8138805816450815e-06, + "loss": 0.9262, + "step": 9448 + }, + { + "epoch": 0.7284150477952513, + "grad_norm": 3.61232852935791, + "learning_rate": 1.812918529220996e-06, + "loss": 0.8991, + "step": 9449 + }, + { + "epoch": 0.7284921369102683, + "grad_norm": 3.5513792037963867, + "learning_rate": 1.8119566754949324e-06, + "loss": 0.8965, + "step": 9450 + }, + { + "epoch": 0.7285692260252852, + "grad_norm": 3.7241334915161133, + "learning_rate": 1.8109950205268624e-06, + "loss": 0.8641, + "step": 9451 + }, + { + "epoch": 0.7286463151403022, + "grad_norm": 3.3493247032165527, + "learning_rate": 1.8100335643767347e-06, + "loss": 0.9665, + "step": 9452 + }, + { + "epoch": 0.7287234042553191, + "grad_norm": 3.7719924449920654, + "learning_rate": 1.809072307104493e-06, + "loss": 0.8762, + "step": 9453 + }, + { + "epoch": 0.7288004933703361, + "grad_norm": 3.6483349800109863, + "learning_rate": 1.8081112487700665e-06, + "loss": 0.8928, + "step": 9454 + }, + { + "epoch": 0.7288775824853531, + "grad_norm": 3.8290278911590576, + "learning_rate": 1.8071503894333725e-06, + "loss": 0.9746, + "step": 9455 + }, + { + "epoch": 0.72895467160037, + "grad_norm": 3.715212345123291, + "learning_rate": 1.8061897291543157e-06, + "loss": 0.8306, + "step": 9456 + }, + { + "epoch": 0.729031760715387, + "grad_norm": 3.781547784805298, + "learning_rate": 1.8052292679927896e-06, + "loss": 0.8701, + "step": 9457 + }, + { + "epoch": 0.7291088498304039, + "grad_norm": 3.933978796005249, + "learning_rate": 1.804269006008671e-06, + "loss": 0.9048, + "step": 9458 + }, + { + "epoch": 0.7291859389454209, + "grad_norm": 3.5607521533966064, + "learning_rate": 1.803308943261829e-06, + "loss": 0.861, + "step": 9459 + }, + { + "epoch": 0.7292630280604379, + "grad_norm": 3.4771389961242676, + "learning_rate": 1.802349079812118e-06, + "loss": 0.9354, + "step": 9460 + }, + { + "epoch": 0.7293401171754548, + "grad_norm": 3.6316308975219727, + "learning_rate": 1.8013894157193807e-06, + "loss": 1.0234, + "step": 9461 + }, + { + "epoch": 0.7294172062904718, + "grad_norm": 3.486682176589966, + "learning_rate": 1.8004299510434493e-06, + "loss": 0.8481, + "step": 9462 + }, + { + "epoch": 0.7294942954054887, + "grad_norm": 3.479135036468506, + "learning_rate": 1.7994706858441375e-06, + "loss": 0.9336, + "step": 9463 + }, + { + "epoch": 0.7295713845205057, + "grad_norm": 3.551004409790039, + "learning_rate": 1.7985116201812524e-06, + "loss": 0.937, + "step": 9464 + }, + { + "epoch": 0.7296484736355227, + "grad_norm": 3.698634386062622, + "learning_rate": 1.7975527541145865e-06, + "loss": 0.9793, + "step": 9465 + }, + { + "epoch": 0.7297255627505396, + "grad_norm": 3.525210380554199, + "learning_rate": 1.7965940877039211e-06, + "loss": 1.0082, + "step": 9466 + }, + { + "epoch": 0.7298026518655566, + "grad_norm": 3.7876412868499756, + "learning_rate": 1.7956356210090236e-06, + "loss": 0.8984, + "step": 9467 + }, + { + "epoch": 0.7298797409805735, + "grad_norm": 4.059507369995117, + "learning_rate": 1.7946773540896506e-06, + "loss": 0.9558, + "step": 9468 + }, + { + "epoch": 0.7299568300955905, + "grad_norm": 3.954590320587158, + "learning_rate": 1.793719287005542e-06, + "loss": 1.0237, + "step": 9469 + }, + { + "epoch": 0.7300339192106075, + "grad_norm": 4.025545597076416, + "learning_rate": 1.7927614198164306e-06, + "loss": 1.0038, + "step": 9470 + }, + { + "epoch": 0.7301110083256244, + "grad_norm": 3.4767184257507324, + "learning_rate": 1.7918037525820336e-06, + "loss": 0.832, + "step": 9471 + }, + { + "epoch": 0.7301880974406414, + "grad_norm": 3.732330322265625, + "learning_rate": 1.7908462853620568e-06, + "loss": 0.9538, + "step": 9472 + }, + { + "epoch": 0.7302651865556583, + "grad_norm": 3.6224257946014404, + "learning_rate": 1.7898890182161954e-06, + "loss": 0.9717, + "step": 9473 + }, + { + "epoch": 0.7303422756706753, + "grad_norm": 3.483696460723877, + "learning_rate": 1.7889319512041264e-06, + "loss": 0.8152, + "step": 9474 + }, + { + "epoch": 0.7304193647856922, + "grad_norm": 3.9409186840057373, + "learning_rate": 1.7879750843855197e-06, + "loss": 0.9242, + "step": 9475 + }, + { + "epoch": 0.7304964539007093, + "grad_norm": 3.357489585876465, + "learning_rate": 1.7870184178200312e-06, + "loss": 0.9181, + "step": 9476 + }, + { + "epoch": 0.7305735430157262, + "grad_norm": 3.646585702896118, + "learning_rate": 1.7860619515673034e-06, + "loss": 0.9368, + "step": 9477 + }, + { + "epoch": 0.7306506321307431, + "grad_norm": 3.7453439235687256, + "learning_rate": 1.7851056856869681e-06, + "loss": 0.9319, + "step": 9478 + }, + { + "epoch": 0.7307277212457601, + "grad_norm": 3.725774049758911, + "learning_rate": 1.7841496202386437e-06, + "loss": 0.8849, + "step": 9479 + }, + { + "epoch": 0.730804810360777, + "grad_norm": 3.687467098236084, + "learning_rate": 1.7831937552819345e-06, + "loss": 0.9166, + "step": 9480 + }, + { + "epoch": 0.7308818994757941, + "grad_norm": 3.666307210922241, + "learning_rate": 1.7822380908764336e-06, + "loss": 0.768, + "step": 9481 + }, + { + "epoch": 0.730958988590811, + "grad_norm": 3.638786554336548, + "learning_rate": 1.781282627081723e-06, + "loss": 0.8445, + "step": 9482 + }, + { + "epoch": 0.7310360777058279, + "grad_norm": 3.7186498641967773, + "learning_rate": 1.7803273639573704e-06, + "loss": 0.9877, + "step": 9483 + }, + { + "epoch": 0.7311131668208449, + "grad_norm": 4.096242904663086, + "learning_rate": 1.7793723015629333e-06, + "loss": 0.9338, + "step": 9484 + }, + { + "epoch": 0.7311902559358618, + "grad_norm": 3.3239645957946777, + "learning_rate": 1.7784174399579513e-06, + "loss": 0.7036, + "step": 9485 + }, + { + "epoch": 0.7312673450508789, + "grad_norm": 3.5666675567626953, + "learning_rate": 1.7774627792019567e-06, + "loss": 0.8775, + "step": 9486 + }, + { + "epoch": 0.7313444341658958, + "grad_norm": 3.7199814319610596, + "learning_rate": 1.7765083193544679e-06, + "loss": 0.974, + "step": 9487 + }, + { + "epoch": 0.7314215232809127, + "grad_norm": 3.733372688293457, + "learning_rate": 1.775554060474991e-06, + "loss": 0.9032, + "step": 9488 + }, + { + "epoch": 0.7314986123959297, + "grad_norm": 4.118587493896484, + "learning_rate": 1.7746000026230198e-06, + "loss": 0.8837, + "step": 9489 + }, + { + "epoch": 0.7315757015109466, + "grad_norm": 3.630469560623169, + "learning_rate": 1.7736461458580324e-06, + "loss": 0.8297, + "step": 9490 + }, + { + "epoch": 0.7316527906259637, + "grad_norm": 3.6379334926605225, + "learning_rate": 1.7726924902394976e-06, + "loss": 0.7553, + "step": 9491 + }, + { + "epoch": 0.7317298797409806, + "grad_norm": 3.688769817352295, + "learning_rate": 1.7717390358268716e-06, + "loss": 0.9635, + "step": 9492 + }, + { + "epoch": 0.7318069688559975, + "grad_norm": 3.543370485305786, + "learning_rate": 1.7707857826795971e-06, + "loss": 0.8589, + "step": 9493 + }, + { + "epoch": 0.7318840579710145, + "grad_norm": 3.616456985473633, + "learning_rate": 1.7698327308571045e-06, + "loss": 0.8856, + "step": 9494 + }, + { + "epoch": 0.7319611470860314, + "grad_norm": 3.9555208683013916, + "learning_rate": 1.7688798804188118e-06, + "loss": 0.9327, + "step": 9495 + }, + { + "epoch": 0.7320382362010485, + "grad_norm": 3.7948670387268066, + "learning_rate": 1.767927231424124e-06, + "loss": 0.9202, + "step": 9496 + }, + { + "epoch": 0.7321153253160654, + "grad_norm": 3.7444069385528564, + "learning_rate": 1.766974783932434e-06, + "loss": 0.8917, + "step": 9497 + }, + { + "epoch": 0.7321924144310823, + "grad_norm": 3.439579486846924, + "learning_rate": 1.766022538003122e-06, + "loss": 0.8429, + "step": 9498 + }, + { + "epoch": 0.7322695035460993, + "grad_norm": 3.7228710651397705, + "learning_rate": 1.7650704936955577e-06, + "loss": 0.8607, + "step": 9499 + }, + { + "epoch": 0.7323465926611162, + "grad_norm": 3.5966405868530273, + "learning_rate": 1.7641186510690916e-06, + "loss": 0.8669, + "step": 9500 + }, + { + "epoch": 0.7324236817761333, + "grad_norm": 3.7146801948547363, + "learning_rate": 1.763167010183069e-06, + "loss": 0.8685, + "step": 9501 + }, + { + "epoch": 0.7325007708911502, + "grad_norm": 3.474900484085083, + "learning_rate": 1.7622155710968187e-06, + "loss": 0.8428, + "step": 9502 + }, + { + "epoch": 0.7325778600061671, + "grad_norm": 4.253158092498779, + "learning_rate": 1.761264333869659e-06, + "loss": 0.925, + "step": 9503 + }, + { + "epoch": 0.7326549491211841, + "grad_norm": 4.116722106933594, + "learning_rate": 1.7603132985608945e-06, + "loss": 0.8248, + "step": 9504 + }, + { + "epoch": 0.732732038236201, + "grad_norm": 4.452220916748047, + "learning_rate": 1.7593624652298186e-06, + "loss": 1.0207, + "step": 9505 + }, + { + "epoch": 0.7328091273512181, + "grad_norm": 3.5835022926330566, + "learning_rate": 1.7584118339357076e-06, + "loss": 0.8986, + "step": 9506 + }, + { + "epoch": 0.732886216466235, + "grad_norm": 3.83974289894104, + "learning_rate": 1.7574614047378297e-06, + "loss": 0.9099, + "step": 9507 + }, + { + "epoch": 0.7329633055812519, + "grad_norm": 3.6427395343780518, + "learning_rate": 1.7565111776954401e-06, + "loss": 0.9213, + "step": 9508 + }, + { + "epoch": 0.7330403946962689, + "grad_norm": 3.9767303466796875, + "learning_rate": 1.7555611528677803e-06, + "loss": 0.9611, + "step": 9509 + }, + { + "epoch": 0.7331174838112858, + "grad_norm": 3.73007869720459, + "learning_rate": 1.7546113303140806e-06, + "loss": 0.9217, + "step": 9510 + }, + { + "epoch": 0.7331945729263029, + "grad_norm": 3.5436551570892334, + "learning_rate": 1.7536617100935544e-06, + "loss": 0.8044, + "step": 9511 + }, + { + "epoch": 0.7332716620413198, + "grad_norm": 3.655442237854004, + "learning_rate": 1.7527122922654077e-06, + "loss": 0.8905, + "step": 9512 + }, + { + "epoch": 0.7333487511563367, + "grad_norm": 3.6758716106414795, + "learning_rate": 1.751763076888831e-06, + "loss": 0.9612, + "step": 9513 + }, + { + "epoch": 0.7334258402713537, + "grad_norm": 3.906611204147339, + "learning_rate": 1.7508140640230037e-06, + "loss": 0.9355, + "step": 9514 + }, + { + "epoch": 0.7335029293863706, + "grad_norm": 3.8882009983062744, + "learning_rate": 1.7498652537270916e-06, + "loss": 0.8655, + "step": 9515 + }, + { + "epoch": 0.7335800185013877, + "grad_norm": 3.8022232055664062, + "learning_rate": 1.7489166460602496e-06, + "loss": 0.9754, + "step": 9516 + }, + { + "epoch": 0.7336571076164046, + "grad_norm": 3.5740768909454346, + "learning_rate": 1.7479682410816156e-06, + "loss": 0.9293, + "step": 9517 + }, + { + "epoch": 0.7337341967314215, + "grad_norm": 3.5046918392181396, + "learning_rate": 1.7470200388503184e-06, + "loss": 0.9062, + "step": 9518 + }, + { + "epoch": 0.7338112858464385, + "grad_norm": 3.434080123901367, + "learning_rate": 1.7460720394254748e-06, + "loss": 0.8451, + "step": 9519 + }, + { + "epoch": 0.7338883749614554, + "grad_norm": 3.992218255996704, + "learning_rate": 1.7451242428661868e-06, + "loss": 0.9591, + "step": 9520 + }, + { + "epoch": 0.7339654640764725, + "grad_norm": 3.7113566398620605, + "learning_rate": 1.7441766492315465e-06, + "loss": 0.9268, + "step": 9521 + }, + { + "epoch": 0.7340425531914894, + "grad_norm": 3.7934823036193848, + "learning_rate": 1.7432292585806277e-06, + "loss": 0.8757, + "step": 9522 + }, + { + "epoch": 0.7341196423065063, + "grad_norm": 3.7156882286071777, + "learning_rate": 1.742282070972498e-06, + "loss": 0.9287, + "step": 9523 + }, + { + "epoch": 0.7341967314215233, + "grad_norm": 3.4963877201080322, + "learning_rate": 1.7413350864662088e-06, + "loss": 0.8622, + "step": 9524 + }, + { + "epoch": 0.7342738205365402, + "grad_norm": 3.512899160385132, + "learning_rate": 1.7403883051207997e-06, + "loss": 0.8646, + "step": 9525 + }, + { + "epoch": 0.7343509096515572, + "grad_norm": 3.7643868923187256, + "learning_rate": 1.739441726995298e-06, + "loss": 0.9348, + "step": 9526 + }, + { + "epoch": 0.7344279987665742, + "grad_norm": 3.798492431640625, + "learning_rate": 1.7384953521487191e-06, + "loss": 0.9234, + "step": 9527 + }, + { + "epoch": 0.7345050878815911, + "grad_norm": 3.6906044483184814, + "learning_rate": 1.737549180640062e-06, + "loss": 0.983, + "step": 9528 + }, + { + "epoch": 0.7345821769966081, + "grad_norm": 3.6873204708099365, + "learning_rate": 1.7366032125283167e-06, + "loss": 0.9359, + "step": 9529 + }, + { + "epoch": 0.734659266111625, + "grad_norm": 3.9488134384155273, + "learning_rate": 1.7356574478724593e-06, + "loss": 0.9906, + "step": 9530 + }, + { + "epoch": 0.734736355226642, + "grad_norm": 3.8538029193878174, + "learning_rate": 1.7347118867314538e-06, + "loss": 0.9494, + "step": 9531 + }, + { + "epoch": 0.734813444341659, + "grad_norm": 3.5995075702667236, + "learning_rate": 1.7337665291642524e-06, + "loss": 0.8652, + "step": 9532 + }, + { + "epoch": 0.7348905334566759, + "grad_norm": 3.970221519470215, + "learning_rate": 1.7328213752297902e-06, + "loss": 0.9299, + "step": 9533 + }, + { + "epoch": 0.7349676225716929, + "grad_norm": 3.456852674484253, + "learning_rate": 1.7318764249869934e-06, + "loss": 0.9381, + "step": 9534 + }, + { + "epoch": 0.7350447116867098, + "grad_norm": 3.492973566055298, + "learning_rate": 1.730931678494776e-06, + "loss": 0.9255, + "step": 9535 + }, + { + "epoch": 0.7351218008017268, + "grad_norm": 3.6652021408081055, + "learning_rate": 1.7299871358120373e-06, + "loss": 0.9341, + "step": 9536 + }, + { + "epoch": 0.7351988899167438, + "grad_norm": 3.6004602909088135, + "learning_rate": 1.729042796997667e-06, + "loss": 0.9393, + "step": 9537 + }, + { + "epoch": 0.7352759790317607, + "grad_norm": 3.624760150909424, + "learning_rate": 1.7280986621105355e-06, + "loss": 0.9113, + "step": 9538 + }, + { + "epoch": 0.7353530681467777, + "grad_norm": 3.834989309310913, + "learning_rate": 1.7271547312095055e-06, + "loss": 0.9726, + "step": 9539 + }, + { + "epoch": 0.7354301572617946, + "grad_norm": 3.8131542205810547, + "learning_rate": 1.7262110043534285e-06, + "loss": 0.9446, + "step": 9540 + }, + { + "epoch": 0.7355072463768116, + "grad_norm": 3.5170748233795166, + "learning_rate": 1.7252674816011405e-06, + "loss": 0.8592, + "step": 9541 + }, + { + "epoch": 0.7355843354918286, + "grad_norm": 3.7116787433624268, + "learning_rate": 1.7243241630114665e-06, + "loss": 0.9016, + "step": 9542 + }, + { + "epoch": 0.7356614246068455, + "grad_norm": 3.888503313064575, + "learning_rate": 1.723381048643214e-06, + "loss": 0.938, + "step": 9543 + }, + { + "epoch": 0.7357385137218625, + "grad_norm": 3.5734593868255615, + "learning_rate": 1.722438138555183e-06, + "loss": 0.8522, + "step": 9544 + }, + { + "epoch": 0.7358156028368794, + "grad_norm": 3.421564817428589, + "learning_rate": 1.7214954328061588e-06, + "loss": 0.9318, + "step": 9545 + }, + { + "epoch": 0.7358926919518964, + "grad_norm": 3.7123959064483643, + "learning_rate": 1.7205529314549153e-06, + "loss": 0.9123, + "step": 9546 + }, + { + "epoch": 0.7359697810669134, + "grad_norm": 3.348281145095825, + "learning_rate": 1.7196106345602126e-06, + "loss": 0.8201, + "step": 9547 + }, + { + "epoch": 0.7360468701819303, + "grad_norm": 4.12083625793457, + "learning_rate": 1.7186685421807964e-06, + "loss": 0.9747, + "step": 9548 + }, + { + "epoch": 0.7361239592969473, + "grad_norm": 3.7178947925567627, + "learning_rate": 1.7177266543754018e-06, + "loss": 0.9056, + "step": 9549 + }, + { + "epoch": 0.7362010484119642, + "grad_norm": 4.042630672454834, + "learning_rate": 1.7167849712027507e-06, + "loss": 0.9108, + "step": 9550 + }, + { + "epoch": 0.7362781375269812, + "grad_norm": 3.405878782272339, + "learning_rate": 1.7158434927215528e-06, + "loss": 0.8586, + "step": 9551 + }, + { + "epoch": 0.7363552266419982, + "grad_norm": 3.563534736633301, + "learning_rate": 1.7149022189905041e-06, + "loss": 0.8012, + "step": 9552 + }, + { + "epoch": 0.7364323157570151, + "grad_norm": 3.4462010860443115, + "learning_rate": 1.7139611500682896e-06, + "loss": 0.874, + "step": 9553 + }, + { + "epoch": 0.7365094048720321, + "grad_norm": 3.5920019149780273, + "learning_rate": 1.713020286013577e-06, + "loss": 0.8469, + "step": 9554 + }, + { + "epoch": 0.736586493987049, + "grad_norm": 3.872889995574951, + "learning_rate": 1.7120796268850254e-06, + "loss": 0.8842, + "step": 9555 + }, + { + "epoch": 0.736663583102066, + "grad_norm": 4.169905185699463, + "learning_rate": 1.7111391727412807e-06, + "loss": 1.0117, + "step": 9556 + }, + { + "epoch": 0.736740672217083, + "grad_norm": 4.358822345733643, + "learning_rate": 1.7101989236409754e-06, + "loss": 0.9126, + "step": 9557 + }, + { + "epoch": 0.7368177613320999, + "grad_norm": 3.7961339950561523, + "learning_rate": 1.7092588796427306e-06, + "loss": 1.0046, + "step": 9558 + }, + { + "epoch": 0.7368948504471169, + "grad_norm": 3.6711761951446533, + "learning_rate": 1.7083190408051498e-06, + "loss": 0.8484, + "step": 9559 + }, + { + "epoch": 0.7369719395621338, + "grad_norm": 3.702301502227783, + "learning_rate": 1.7073794071868283e-06, + "loss": 0.8271, + "step": 9560 + }, + { + "epoch": 0.7370490286771508, + "grad_norm": 3.758284330368042, + "learning_rate": 1.7064399788463482e-06, + "loss": 1.0023, + "step": 9561 + }, + { + "epoch": 0.7371261177921677, + "grad_norm": 3.5002236366271973, + "learning_rate": 1.7055007558422776e-06, + "loss": 0.8328, + "step": 9562 + }, + { + "epoch": 0.7372032069071847, + "grad_norm": 3.4732120037078857, + "learning_rate": 1.7045617382331726e-06, + "loss": 0.794, + "step": 9563 + }, + { + "epoch": 0.7372802960222017, + "grad_norm": 4.162208080291748, + "learning_rate": 1.7036229260775766e-06, + "loss": 1.033, + "step": 9564 + }, + { + "epoch": 0.7373573851372186, + "grad_norm": 3.8932626247406006, + "learning_rate": 1.7026843194340176e-06, + "loss": 0.9436, + "step": 9565 + }, + { + "epoch": 0.7374344742522356, + "grad_norm": 3.9444522857666016, + "learning_rate": 1.7017459183610142e-06, + "loss": 0.9912, + "step": 9566 + }, + { + "epoch": 0.7375115633672525, + "grad_norm": 3.844452381134033, + "learning_rate": 1.70080772291707e-06, + "loss": 0.8048, + "step": 9567 + }, + { + "epoch": 0.7375886524822695, + "grad_norm": 3.734955072402954, + "learning_rate": 1.6998697331606777e-06, + "loss": 0.9294, + "step": 9568 + }, + { + "epoch": 0.7376657415972865, + "grad_norm": 3.940300226211548, + "learning_rate": 1.6989319491503175e-06, + "loss": 0.9772, + "step": 9569 + }, + { + "epoch": 0.7377428307123034, + "grad_norm": 3.8375890254974365, + "learning_rate": 1.697994370944452e-06, + "loss": 0.9545, + "step": 9570 + }, + { + "epoch": 0.7378199198273204, + "grad_norm": 3.75097393989563, + "learning_rate": 1.6970569986015356e-06, + "loss": 0.9672, + "step": 9571 + }, + { + "epoch": 0.7378970089423373, + "grad_norm": 3.56791353225708, + "learning_rate": 1.6961198321800092e-06, + "loss": 0.9334, + "step": 9572 + }, + { + "epoch": 0.7379740980573543, + "grad_norm": 3.3650386333465576, + "learning_rate": 1.6951828717383e-06, + "loss": 0.8871, + "step": 9573 + }, + { + "epoch": 0.7380511871723713, + "grad_norm": 3.7570347785949707, + "learning_rate": 1.694246117334823e-06, + "loss": 0.9818, + "step": 9574 + }, + { + "epoch": 0.7381282762873882, + "grad_norm": 3.496140241622925, + "learning_rate": 1.6933095690279805e-06, + "loss": 0.907, + "step": 9575 + }, + { + "epoch": 0.7382053654024052, + "grad_norm": 3.67641019821167, + "learning_rate": 1.6923732268761594e-06, + "loss": 0.8191, + "step": 9576 + }, + { + "epoch": 0.7382824545174221, + "grad_norm": 3.6642329692840576, + "learning_rate": 1.691437090937737e-06, + "loss": 0.8536, + "step": 9577 + }, + { + "epoch": 0.738359543632439, + "grad_norm": 3.9450578689575195, + "learning_rate": 1.6905011612710764e-06, + "loss": 0.8556, + "step": 9578 + }, + { + "epoch": 0.7384366327474561, + "grad_norm": 4.003316402435303, + "learning_rate": 1.6895654379345282e-06, + "loss": 0.9955, + "step": 9579 + }, + { + "epoch": 0.738513721862473, + "grad_norm": 3.8521080017089844, + "learning_rate": 1.6886299209864316e-06, + "loss": 0.888, + "step": 9580 + }, + { + "epoch": 0.73859081097749, + "grad_norm": 3.9132065773010254, + "learning_rate": 1.687694610485107e-06, + "loss": 0.9501, + "step": 9581 + }, + { + "epoch": 0.7386679000925069, + "grad_norm": 3.6591548919677734, + "learning_rate": 1.6867595064888693e-06, + "loss": 0.8745, + "step": 9582 + }, + { + "epoch": 0.7387449892075239, + "grad_norm": 4.419125556945801, + "learning_rate": 1.685824609056015e-06, + "loss": 0.9704, + "step": 9583 + }, + { + "epoch": 0.7388220783225409, + "grad_norm": 3.558927297592163, + "learning_rate": 1.6848899182448347e-06, + "loss": 0.9493, + "step": 9584 + }, + { + "epoch": 0.7388991674375578, + "grad_norm": 3.7360217571258545, + "learning_rate": 1.6839554341135973e-06, + "loss": 0.9127, + "step": 9585 + }, + { + "epoch": 0.7389762565525748, + "grad_norm": 4.130924701690674, + "learning_rate": 1.683021156720564e-06, + "loss": 0.8786, + "step": 9586 + }, + { + "epoch": 0.7390533456675917, + "grad_norm": 3.717590808868408, + "learning_rate": 1.6820870861239824e-06, + "loss": 0.8723, + "step": 9587 + }, + { + "epoch": 0.7391304347826086, + "grad_norm": 3.745699405670166, + "learning_rate": 1.6811532223820875e-06, + "loss": 0.8268, + "step": 9588 + }, + { + "epoch": 0.7392075238976257, + "grad_norm": 3.7815754413604736, + "learning_rate": 1.6802195655531e-06, + "loss": 0.9505, + "step": 9589 + }, + { + "epoch": 0.7392846130126426, + "grad_norm": 4.17587947845459, + "learning_rate": 1.6792861156952312e-06, + "loss": 0.9202, + "step": 9590 + }, + { + "epoch": 0.7393617021276596, + "grad_norm": 3.6216659545898438, + "learning_rate": 1.6783528728666725e-06, + "loss": 0.8253, + "step": 9591 + }, + { + "epoch": 0.7394387912426765, + "grad_norm": 3.885458469390869, + "learning_rate": 1.677419837125609e-06, + "loss": 1.0538, + "step": 9592 + }, + { + "epoch": 0.7395158803576934, + "grad_norm": 3.8355486392974854, + "learning_rate": 1.676487008530211e-06, + "loss": 0.9182, + "step": 9593 + }, + { + "epoch": 0.7395929694727105, + "grad_norm": 4.003592014312744, + "learning_rate": 1.6755543871386342e-06, + "loss": 0.9035, + "step": 9594 + }, + { + "epoch": 0.7396700585877274, + "grad_norm": 3.761218309402466, + "learning_rate": 1.6746219730090262e-06, + "loss": 0.8466, + "step": 9595 + }, + { + "epoch": 0.7397471477027444, + "grad_norm": 3.5276873111724854, + "learning_rate": 1.6736897661995132e-06, + "loss": 0.8705, + "step": 9596 + }, + { + "epoch": 0.7398242368177613, + "grad_norm": 3.678708791732788, + "learning_rate": 1.6727577667682165e-06, + "loss": 0.8528, + "step": 9597 + }, + { + "epoch": 0.7399013259327782, + "grad_norm": 3.7199313640594482, + "learning_rate": 1.6718259747732407e-06, + "loss": 0.9636, + "step": 9598 + }, + { + "epoch": 0.7399784150477953, + "grad_norm": 3.733581304550171, + "learning_rate": 1.6708943902726783e-06, + "loss": 0.9271, + "step": 9599 + }, + { + "epoch": 0.7400555041628122, + "grad_norm": 3.6299502849578857, + "learning_rate": 1.6699630133246087e-06, + "loss": 0.9428, + "step": 9600 + }, + { + "epoch": 0.7401325932778292, + "grad_norm": 3.8488755226135254, + "learning_rate": 1.6690318439871e-06, + "loss": 0.7749, + "step": 9601 + }, + { + "epoch": 0.7402096823928461, + "grad_norm": 3.60003399848938, + "learning_rate": 1.6681008823182027e-06, + "loss": 0.9667, + "step": 9602 + }, + { + "epoch": 0.740286771507863, + "grad_norm": 3.882335662841797, + "learning_rate": 1.6671701283759596e-06, + "loss": 0.9634, + "step": 9603 + }, + { + "epoch": 0.7403638606228801, + "grad_norm": 3.8450510501861572, + "learning_rate": 1.6662395822183975e-06, + "loss": 0.7925, + "step": 9604 + }, + { + "epoch": 0.740440949737897, + "grad_norm": 3.650480031967163, + "learning_rate": 1.6653092439035312e-06, + "loss": 0.8782, + "step": 9605 + }, + { + "epoch": 0.740518038852914, + "grad_norm": 3.5009539127349854, + "learning_rate": 1.6643791134893644e-06, + "loss": 0.952, + "step": 9606 + }, + { + "epoch": 0.7405951279679309, + "grad_norm": 3.5242550373077393, + "learning_rate": 1.6634491910338829e-06, + "loss": 0.9471, + "step": 9607 + }, + { + "epoch": 0.7406722170829478, + "grad_norm": 3.7609641551971436, + "learning_rate": 1.6625194765950636e-06, + "loss": 0.8637, + "step": 9608 + }, + { + "epoch": 0.7407493061979649, + "grad_norm": 3.836843729019165, + "learning_rate": 1.6615899702308696e-06, + "loss": 0.9317, + "step": 9609 + }, + { + "epoch": 0.7408263953129818, + "grad_norm": 4.039276599884033, + "learning_rate": 1.6606606719992513e-06, + "loss": 0.9741, + "step": 9610 + }, + { + "epoch": 0.7409034844279988, + "grad_norm": 3.873875379562378, + "learning_rate": 1.6597315819581449e-06, + "loss": 0.8736, + "step": 9611 + }, + { + "epoch": 0.7409805735430157, + "grad_norm": 3.878174066543579, + "learning_rate": 1.6588027001654765e-06, + "loss": 0.9261, + "step": 9612 + }, + { + "epoch": 0.7410576626580326, + "grad_norm": 3.606624126434326, + "learning_rate": 1.6578740266791532e-06, + "loss": 0.8731, + "step": 9613 + }, + { + "epoch": 0.7411347517730497, + "grad_norm": 3.7116219997406006, + "learning_rate": 1.6569455615570757e-06, + "loss": 0.8865, + "step": 9614 + }, + { + "epoch": 0.7412118408880666, + "grad_norm": 3.811680316925049, + "learning_rate": 1.6560173048571277e-06, + "loss": 0.938, + "step": 9615 + }, + { + "epoch": 0.7412889300030836, + "grad_norm": 3.9601168632507324, + "learning_rate": 1.6550892566371823e-06, + "loss": 0.9875, + "step": 9616 + }, + { + "epoch": 0.7413660191181005, + "grad_norm": 3.370882034301758, + "learning_rate": 1.654161416955099e-06, + "loss": 0.878, + "step": 9617 + }, + { + "epoch": 0.7414431082331174, + "grad_norm": 4.08239221572876, + "learning_rate": 1.653233785868722e-06, + "loss": 1.0293, + "step": 9618 + }, + { + "epoch": 0.7415201973481345, + "grad_norm": 3.562781572341919, + "learning_rate": 1.6523063634358844e-06, + "loss": 0.8308, + "step": 9619 + }, + { + "epoch": 0.7415972864631514, + "grad_norm": 3.801961898803711, + "learning_rate": 1.6513791497144071e-06, + "loss": 0.9176, + "step": 9620 + }, + { + "epoch": 0.7416743755781684, + "grad_norm": 3.9425201416015625, + "learning_rate": 1.6504521447620969e-06, + "loss": 0.9052, + "step": 9621 + }, + { + "epoch": 0.7417514646931853, + "grad_norm": 3.5563175678253174, + "learning_rate": 1.649525348636748e-06, + "loss": 1.0543, + "step": 9622 + }, + { + "epoch": 0.7418285538082022, + "grad_norm": 3.3809213638305664, + "learning_rate": 1.6485987613961423e-06, + "loss": 0.8341, + "step": 9623 + }, + { + "epoch": 0.7419056429232193, + "grad_norm": 3.588660717010498, + "learning_rate": 1.6476723830980451e-06, + "loss": 0.7934, + "step": 9624 + }, + { + "epoch": 0.7419827320382362, + "grad_norm": 3.640718460083008, + "learning_rate": 1.6467462138002126e-06, + "loss": 0.9354, + "step": 9625 + }, + { + "epoch": 0.7420598211532532, + "grad_norm": 3.681979179382324, + "learning_rate": 1.6458202535603867e-06, + "loss": 0.9655, + "step": 9626 + }, + { + "epoch": 0.7421369102682701, + "grad_norm": 4.223269939422607, + "learning_rate": 1.6448945024362962e-06, + "loss": 0.965, + "step": 9627 + }, + { + "epoch": 0.742213999383287, + "grad_norm": 3.7559590339660645, + "learning_rate": 1.6439689604856568e-06, + "loss": 0.9417, + "step": 9628 + }, + { + "epoch": 0.742291088498304, + "grad_norm": 3.631321668624878, + "learning_rate": 1.6430436277661715e-06, + "loss": 0.8356, + "step": 9629 + }, + { + "epoch": 0.742368177613321, + "grad_norm": 3.696200370788574, + "learning_rate": 1.6421185043355304e-06, + "loss": 0.9243, + "step": 9630 + }, + { + "epoch": 0.742445266728338, + "grad_norm": 3.6229546070098877, + "learning_rate": 1.6411935902514086e-06, + "loss": 0.8577, + "step": 9631 + }, + { + "epoch": 0.7425223558433549, + "grad_norm": 3.96211314201355, + "learning_rate": 1.6402688855714733e-06, + "loss": 0.9646, + "step": 9632 + }, + { + "epoch": 0.7425994449583718, + "grad_norm": 3.6700477600097656, + "learning_rate": 1.6393443903533707e-06, + "loss": 0.9983, + "step": 9633 + }, + { + "epoch": 0.7426765340733888, + "grad_norm": 4.209187030792236, + "learning_rate": 1.6384201046547399e-06, + "loss": 1.0566, + "step": 9634 + }, + { + "epoch": 0.7427536231884058, + "grad_norm": 3.9726881980895996, + "learning_rate": 1.6374960285332053e-06, + "loss": 1.0066, + "step": 9635 + }, + { + "epoch": 0.7428307123034228, + "grad_norm": 3.917154550552368, + "learning_rate": 1.6365721620463786e-06, + "loss": 0.9755, + "step": 9636 + }, + { + "epoch": 0.7429078014184397, + "grad_norm": 4.186873435974121, + "learning_rate": 1.6356485052518578e-06, + "loss": 0.9928, + "step": 9637 + }, + { + "epoch": 0.7429848905334566, + "grad_norm": 3.5882840156555176, + "learning_rate": 1.6347250582072305e-06, + "loss": 0.8609, + "step": 9638 + }, + { + "epoch": 0.7430619796484736, + "grad_norm": 3.6521730422973633, + "learning_rate": 1.6338018209700647e-06, + "loss": 0.9443, + "step": 9639 + }, + { + "epoch": 0.7431390687634906, + "grad_norm": 3.2607851028442383, + "learning_rate": 1.6328787935979207e-06, + "loss": 0.8569, + "step": 9640 + }, + { + "epoch": 0.7432161578785076, + "grad_norm": 3.7790210247039795, + "learning_rate": 1.6319559761483461e-06, + "loss": 0.8764, + "step": 9641 + }, + { + "epoch": 0.7432932469935245, + "grad_norm": 3.771569013595581, + "learning_rate": 1.631033368678872e-06, + "loss": 0.9934, + "step": 9642 + }, + { + "epoch": 0.7433703361085414, + "grad_norm": 3.3746719360351562, + "learning_rate": 1.6301109712470214e-06, + "loss": 0.7936, + "step": 9643 + }, + { + "epoch": 0.7434474252235584, + "grad_norm": 3.9737672805786133, + "learning_rate": 1.6291887839102966e-06, + "loss": 0.9538, + "step": 9644 + }, + { + "epoch": 0.7435245143385754, + "grad_norm": 4.225729465484619, + "learning_rate": 1.6282668067261935e-06, + "loss": 0.8667, + "step": 9645 + }, + { + "epoch": 0.7436016034535924, + "grad_norm": 3.5801405906677246, + "learning_rate": 1.6273450397521922e-06, + "loss": 0.8745, + "step": 9646 + }, + { + "epoch": 0.7436786925686093, + "grad_norm": 3.57384991645813, + "learning_rate": 1.6264234830457603e-06, + "loss": 0.8146, + "step": 9647 + }, + { + "epoch": 0.7437557816836263, + "grad_norm": 3.640131711959839, + "learning_rate": 1.625502136664352e-06, + "loss": 0.9404, + "step": 9648 + }, + { + "epoch": 0.7438328707986432, + "grad_norm": 3.9445488452911377, + "learning_rate": 1.62458100066541e-06, + "loss": 1.0291, + "step": 9649 + }, + { + "epoch": 0.7439099599136602, + "grad_norm": 3.508561134338379, + "learning_rate": 1.6236600751063597e-06, + "loss": 0.8162, + "step": 9650 + }, + { + "epoch": 0.7439870490286772, + "grad_norm": 3.63757586479187, + "learning_rate": 1.6227393600446168e-06, + "loss": 0.8668, + "step": 9651 + }, + { + "epoch": 0.7440641381436941, + "grad_norm": 3.6220133304595947, + "learning_rate": 1.6218188555375836e-06, + "loss": 0.8303, + "step": 9652 + }, + { + "epoch": 0.7441412272587111, + "grad_norm": 3.732778310775757, + "learning_rate": 1.6208985616426488e-06, + "loss": 0.9131, + "step": 9653 + }, + { + "epoch": 0.744218316373728, + "grad_norm": 3.7289443016052246, + "learning_rate": 1.619978478417189e-06, + "loss": 1.0164, + "step": 9654 + }, + { + "epoch": 0.744295405488745, + "grad_norm": 3.5976722240448, + "learning_rate": 1.6190586059185642e-06, + "loss": 0.8991, + "step": 9655 + }, + { + "epoch": 0.744372494603762, + "grad_norm": 3.97445011138916, + "learning_rate": 1.618138944204125e-06, + "loss": 0.9376, + "step": 9656 + }, + { + "epoch": 0.7444495837187789, + "grad_norm": 3.541771173477173, + "learning_rate": 1.617219493331208e-06, + "loss": 0.929, + "step": 9657 + }, + { + "epoch": 0.7445266728337959, + "grad_norm": 3.4124934673309326, + "learning_rate": 1.6163002533571348e-06, + "loss": 0.8519, + "step": 9658 + }, + { + "epoch": 0.7446037619488128, + "grad_norm": 3.9390342235565186, + "learning_rate": 1.6153812243392169e-06, + "loss": 0.8352, + "step": 9659 + }, + { + "epoch": 0.7446808510638298, + "grad_norm": 3.7517385482788086, + "learning_rate": 1.6144624063347514e-06, + "loss": 1.0166, + "step": 9660 + }, + { + "epoch": 0.7447579401788468, + "grad_norm": 3.8233649730682373, + "learning_rate": 1.6135437994010195e-06, + "loss": 0.8967, + "step": 9661 + }, + { + "epoch": 0.7448350292938637, + "grad_norm": 4.343902111053467, + "learning_rate": 1.6126254035952926e-06, + "loss": 0.9621, + "step": 9662 + }, + { + "epoch": 0.7449121184088807, + "grad_norm": 3.828294038772583, + "learning_rate": 1.6117072189748285e-06, + "loss": 0.9517, + "step": 9663 + }, + { + "epoch": 0.7449892075238976, + "grad_norm": 4.02465295791626, + "learning_rate": 1.6107892455968704e-06, + "loss": 0.9339, + "step": 9664 + }, + { + "epoch": 0.7450662966389145, + "grad_norm": 3.764190673828125, + "learning_rate": 1.6098714835186512e-06, + "loss": 0.9529, + "step": 9665 + }, + { + "epoch": 0.7451433857539316, + "grad_norm": 3.6371004581451416, + "learning_rate": 1.6089539327973857e-06, + "loss": 0.8936, + "step": 9666 + }, + { + "epoch": 0.7452204748689485, + "grad_norm": 3.513137102127075, + "learning_rate": 1.6080365934902798e-06, + "loss": 0.9337, + "step": 9667 + }, + { + "epoch": 0.7452975639839655, + "grad_norm": 4.115825653076172, + "learning_rate": 1.6071194656545246e-06, + "loss": 0.9739, + "step": 9668 + }, + { + "epoch": 0.7453746530989824, + "grad_norm": 4.290143966674805, + "learning_rate": 1.6062025493472988e-06, + "loss": 1.0652, + "step": 9669 + }, + { + "epoch": 0.7454517422139993, + "grad_norm": 3.6070821285247803, + "learning_rate": 1.6052858446257674e-06, + "loss": 0.866, + "step": 9670 + }, + { + "epoch": 0.7455288313290164, + "grad_norm": 3.7635834217071533, + "learning_rate": 1.6043693515470833e-06, + "loss": 0.8835, + "step": 9671 + }, + { + "epoch": 0.7456059204440333, + "grad_norm": 3.815763473510742, + "learning_rate": 1.6034530701683804e-06, + "loss": 0.8308, + "step": 9672 + }, + { + "epoch": 0.7456830095590503, + "grad_norm": 3.4556825160980225, + "learning_rate": 1.6025370005467889e-06, + "loss": 0.8331, + "step": 9673 + }, + { + "epoch": 0.7457600986740672, + "grad_norm": 3.467095375061035, + "learning_rate": 1.6016211427394196e-06, + "loss": 0.7999, + "step": 9674 + }, + { + "epoch": 0.7458371877890841, + "grad_norm": 4.090667724609375, + "learning_rate": 1.600705496803373e-06, + "loss": 0.8447, + "step": 9675 + }, + { + "epoch": 0.7459142769041012, + "grad_norm": 3.751192092895508, + "learning_rate": 1.5997900627957318e-06, + "loss": 0.9856, + "step": 9676 + }, + { + "epoch": 0.7459913660191181, + "grad_norm": 3.7970805168151855, + "learning_rate": 1.5988748407735698e-06, + "loss": 0.8911, + "step": 9677 + }, + { + "epoch": 0.7460684551341351, + "grad_norm": 3.705199956893921, + "learning_rate": 1.597959830793947e-06, + "loss": 0.8308, + "step": 9678 + }, + { + "epoch": 0.746145544249152, + "grad_norm": 3.4573512077331543, + "learning_rate": 1.5970450329139087e-06, + "loss": 0.9452, + "step": 9679 + }, + { + "epoch": 0.7462226333641689, + "grad_norm": 3.6191141605377197, + "learning_rate": 1.5961304471904897e-06, + "loss": 0.953, + "step": 9680 + }, + { + "epoch": 0.746299722479186, + "grad_norm": 4.017669677734375, + "learning_rate": 1.595216073680707e-06, + "loss": 0.9276, + "step": 9681 + }, + { + "epoch": 0.7463768115942029, + "grad_norm": 3.8973758220672607, + "learning_rate": 1.5943019124415687e-06, + "loss": 0.8892, + "step": 9682 + }, + { + "epoch": 0.7464539007092199, + "grad_norm": 3.952500104904175, + "learning_rate": 1.593387963530067e-06, + "loss": 0.9615, + "step": 9683 + }, + { + "epoch": 0.7465309898242368, + "grad_norm": 3.6438992023468018, + "learning_rate": 1.5924742270031823e-06, + "loss": 0.8843, + "step": 9684 + }, + { + "epoch": 0.7466080789392537, + "grad_norm": 3.6509311199188232, + "learning_rate": 1.591560702917882e-06, + "loss": 0.9018, + "step": 9685 + }, + { + "epoch": 0.7466851680542708, + "grad_norm": 4.112860202789307, + "learning_rate": 1.5906473913311204e-06, + "loss": 0.9559, + "step": 9686 + }, + { + "epoch": 0.7467622571692877, + "grad_norm": 3.7320613861083984, + "learning_rate": 1.589734292299835e-06, + "loss": 0.9533, + "step": 9687 + }, + { + "epoch": 0.7468393462843047, + "grad_norm": 3.6934304237365723, + "learning_rate": 1.5888214058809543e-06, + "loss": 0.9533, + "step": 9688 + }, + { + "epoch": 0.7469164353993216, + "grad_norm": 3.7583377361297607, + "learning_rate": 1.587908732131392e-06, + "loss": 0.8712, + "step": 9689 + }, + { + "epoch": 0.7469935245143385, + "grad_norm": 3.579685926437378, + "learning_rate": 1.5869962711080483e-06, + "loss": 0.9753, + "step": 9690 + }, + { + "epoch": 0.7470706136293556, + "grad_norm": 3.7619788646698, + "learning_rate": 1.5860840228678126e-06, + "loss": 0.9095, + "step": 9691 + }, + { + "epoch": 0.7471477027443725, + "grad_norm": 3.7893290519714355, + "learning_rate": 1.5851719874675552e-06, + "loss": 0.9442, + "step": 9692 + }, + { + "epoch": 0.7472247918593895, + "grad_norm": 3.8225247859954834, + "learning_rate": 1.5842601649641388e-06, + "loss": 0.8569, + "step": 9693 + }, + { + "epoch": 0.7473018809744064, + "grad_norm": 3.59032940864563, + "learning_rate": 1.58334855541441e-06, + "loss": 0.9241, + "step": 9694 + }, + { + "epoch": 0.7473789700894233, + "grad_norm": 3.8772151470184326, + "learning_rate": 1.5824371588752042e-06, + "loss": 0.8766, + "step": 9695 + }, + { + "epoch": 0.7474560592044404, + "grad_norm": 3.8452553749084473, + "learning_rate": 1.5815259754033407e-06, + "loss": 0.9639, + "step": 9696 + }, + { + "epoch": 0.7475331483194573, + "grad_norm": 4.228387355804443, + "learning_rate": 1.5806150050556302e-06, + "loss": 1.0822, + "step": 9697 + }, + { + "epoch": 0.7476102374344743, + "grad_norm": 3.8701577186584473, + "learning_rate": 1.579704247888863e-06, + "loss": 0.9458, + "step": 9698 + }, + { + "epoch": 0.7476873265494912, + "grad_norm": 4.092501640319824, + "learning_rate": 1.5787937039598217e-06, + "loss": 0.9963, + "step": 9699 + }, + { + "epoch": 0.7477644156645081, + "grad_norm": 3.4330146312713623, + "learning_rate": 1.5778833733252735e-06, + "loss": 0.889, + "step": 9700 + }, + { + "epoch": 0.7478415047795252, + "grad_norm": 3.8023056983947754, + "learning_rate": 1.5769732560419742e-06, + "loss": 0.8818, + "step": 9701 + }, + { + "epoch": 0.7479185938945421, + "grad_norm": 3.714808464050293, + "learning_rate": 1.5760633521666652e-06, + "loss": 1.0705, + "step": 9702 + }, + { + "epoch": 0.7479956830095591, + "grad_norm": 4.097212314605713, + "learning_rate": 1.5751536617560715e-06, + "loss": 0.8879, + "step": 9703 + }, + { + "epoch": 0.748072772124576, + "grad_norm": 3.7691214084625244, + "learning_rate": 1.57424418486691e-06, + "loss": 1.0334, + "step": 9704 + }, + { + "epoch": 0.7481498612395929, + "grad_norm": 3.9305317401885986, + "learning_rate": 1.5733349215558802e-06, + "loss": 1.0065, + "step": 9705 + }, + { + "epoch": 0.74822695035461, + "grad_norm": 4.348329067230225, + "learning_rate": 1.5724258718796714e-06, + "loss": 1.1138, + "step": 9706 + }, + { + "epoch": 0.7483040394696269, + "grad_norm": 4.216433048248291, + "learning_rate": 1.5715170358949572e-06, + "loss": 1.008, + "step": 9707 + }, + { + "epoch": 0.7483811285846439, + "grad_norm": 3.6003470420837402, + "learning_rate": 1.5706084136584016e-06, + "loss": 0.8693, + "step": 9708 + }, + { + "epoch": 0.7484582176996608, + "grad_norm": 4.085456848144531, + "learning_rate": 1.5697000052266475e-06, + "loss": 0.8981, + "step": 9709 + }, + { + "epoch": 0.7485353068146777, + "grad_norm": 3.8469676971435547, + "learning_rate": 1.5687918106563326e-06, + "loss": 0.838, + "step": 9710 + }, + { + "epoch": 0.7486123959296948, + "grad_norm": 3.8500454425811768, + "learning_rate": 1.5678838300040783e-06, + "loss": 0.9659, + "step": 9711 + }, + { + "epoch": 0.7486894850447117, + "grad_norm": 4.028532028198242, + "learning_rate": 1.5669760633264908e-06, + "loss": 0.9956, + "step": 9712 + }, + { + "epoch": 0.7487665741597287, + "grad_norm": 3.699516534805298, + "learning_rate": 1.5660685106801677e-06, + "loss": 0.9479, + "step": 9713 + }, + { + "epoch": 0.7488436632747456, + "grad_norm": 3.5519814491271973, + "learning_rate": 1.5651611721216865e-06, + "loss": 0.8591, + "step": 9714 + }, + { + "epoch": 0.7489207523897625, + "grad_norm": 3.9628407955169678, + "learning_rate": 1.5642540477076169e-06, + "loss": 1.0351, + "step": 9715 + }, + { + "epoch": 0.7489978415047795, + "grad_norm": 3.8148396015167236, + "learning_rate": 1.5633471374945113e-06, + "loss": 1.0393, + "step": 9716 + }, + { + "epoch": 0.7490749306197965, + "grad_norm": 4.2555999755859375, + "learning_rate": 1.5624404415389166e-06, + "loss": 0.8896, + "step": 9717 + }, + { + "epoch": 0.7491520197348135, + "grad_norm": 3.7964389324188232, + "learning_rate": 1.5615339598973544e-06, + "loss": 0.9671, + "step": 9718 + }, + { + "epoch": 0.7492291088498304, + "grad_norm": 3.563861608505249, + "learning_rate": 1.560627692626342e-06, + "loss": 0.9568, + "step": 9719 + }, + { + "epoch": 0.7493061979648473, + "grad_norm": 4.131255626678467, + "learning_rate": 1.55972163978238e-06, + "loss": 0.8972, + "step": 9720 + }, + { + "epoch": 0.7493832870798643, + "grad_norm": 3.94917893409729, + "learning_rate": 1.5588158014219563e-06, + "loss": 0.9817, + "step": 9721 + }, + { + "epoch": 0.7494603761948813, + "grad_norm": 3.44014835357666, + "learning_rate": 1.5579101776015443e-06, + "loss": 0.8567, + "step": 9722 + }, + { + "epoch": 0.7495374653098983, + "grad_norm": 3.9482109546661377, + "learning_rate": 1.557004768377608e-06, + "loss": 0.9041, + "step": 9723 + }, + { + "epoch": 0.7496145544249152, + "grad_norm": 3.8712923526763916, + "learning_rate": 1.556099573806591e-06, + "loss": 0.9819, + "step": 9724 + }, + { + "epoch": 0.7496916435399321, + "grad_norm": 3.6934661865234375, + "learning_rate": 1.5551945939449287e-06, + "loss": 0.9745, + "step": 9725 + }, + { + "epoch": 0.7497687326549491, + "grad_norm": 3.556561231613159, + "learning_rate": 1.5542898288490426e-06, + "loss": 0.9324, + "step": 9726 + }, + { + "epoch": 0.7498458217699661, + "grad_norm": 3.59233021736145, + "learning_rate": 1.5533852785753401e-06, + "loss": 0.8831, + "step": 9727 + }, + { + "epoch": 0.7499229108849831, + "grad_norm": 3.663569450378418, + "learning_rate": 1.5524809431802162e-06, + "loss": 1.0045, + "step": 9728 + }, + { + "epoch": 0.75, + "grad_norm": 3.9396040439605713, + "learning_rate": 1.551576822720049e-06, + "loss": 0.9329, + "step": 9729 + }, + { + "epoch": 0.7500770891150169, + "grad_norm": 3.6608591079711914, + "learning_rate": 1.550672917251207e-06, + "loss": 0.8829, + "step": 9730 + }, + { + "epoch": 0.7501541782300339, + "grad_norm": 3.617097854614258, + "learning_rate": 1.5497692268300442e-06, + "loss": 0.9165, + "step": 9731 + }, + { + "epoch": 0.7502312673450509, + "grad_norm": 3.6227474212646484, + "learning_rate": 1.5488657515129001e-06, + "loss": 0.8429, + "step": 9732 + }, + { + "epoch": 0.7503083564600679, + "grad_norm": 3.4037904739379883, + "learning_rate": 1.5479624913561037e-06, + "loss": 0.8714, + "step": 9733 + }, + { + "epoch": 0.7503854455750848, + "grad_norm": 4.304060459136963, + "learning_rate": 1.5470594464159682e-06, + "loss": 0.9096, + "step": 9734 + }, + { + "epoch": 0.7504625346901017, + "grad_norm": 3.9946959018707275, + "learning_rate": 1.5461566167487918e-06, + "loss": 0.9868, + "step": 9735 + }, + { + "epoch": 0.7505396238051187, + "grad_norm": 3.8413825035095215, + "learning_rate": 1.5452540024108625e-06, + "loss": 0.9049, + "step": 9736 + }, + { + "epoch": 0.7506167129201357, + "grad_norm": 3.6537039279937744, + "learning_rate": 1.5443516034584533e-06, + "loss": 0.9031, + "step": 9737 + }, + { + "epoch": 0.7506938020351527, + "grad_norm": 3.7991750240325928, + "learning_rate": 1.5434494199478245e-06, + "loss": 0.8957, + "step": 9738 + }, + { + "epoch": 0.7507708911501696, + "grad_norm": 3.8140668869018555, + "learning_rate": 1.5425474519352234e-06, + "loss": 0.9202, + "step": 9739 + }, + { + "epoch": 0.7508479802651865, + "grad_norm": 3.7266762256622314, + "learning_rate": 1.5416456994768813e-06, + "loss": 0.919, + "step": 9740 + }, + { + "epoch": 0.7509250693802035, + "grad_norm": 3.538825511932373, + "learning_rate": 1.5407441626290181e-06, + "loss": 0.872, + "step": 9741 + }, + { + "epoch": 0.7510021584952205, + "grad_norm": 3.42117977142334, + "learning_rate": 1.5398428414478407e-06, + "loss": 0.7402, + "step": 9742 + }, + { + "epoch": 0.7510792476102375, + "grad_norm": 3.397956132888794, + "learning_rate": 1.5389417359895415e-06, + "loss": 0.8549, + "step": 9743 + }, + { + "epoch": 0.7511563367252544, + "grad_norm": 3.6643259525299072, + "learning_rate": 1.5380408463102998e-06, + "loss": 0.8128, + "step": 9744 + }, + { + "epoch": 0.7512334258402713, + "grad_norm": 3.688371419906616, + "learning_rate": 1.5371401724662826e-06, + "loss": 0.8901, + "step": 9745 + }, + { + "epoch": 0.7513105149552883, + "grad_norm": 3.9233813285827637, + "learning_rate": 1.5362397145136398e-06, + "loss": 0.9399, + "step": 9746 + }, + { + "epoch": 0.7513876040703052, + "grad_norm": 3.741755247116089, + "learning_rate": 1.5353394725085113e-06, + "loss": 0.8973, + "step": 9747 + }, + { + "epoch": 0.7514646931853223, + "grad_norm": 3.6673974990844727, + "learning_rate": 1.5344394465070234e-06, + "loss": 0.8966, + "step": 9748 + }, + { + "epoch": 0.7515417823003392, + "grad_norm": 3.5368854999542236, + "learning_rate": 1.5335396365652865e-06, + "loss": 0.9082, + "step": 9749 + }, + { + "epoch": 0.7516188714153561, + "grad_norm": 3.6051862239837646, + "learning_rate": 1.5326400427394023e-06, + "loss": 0.9469, + "step": 9750 + }, + { + "epoch": 0.7516959605303731, + "grad_norm": 3.7850236892700195, + "learning_rate": 1.5317406650854515e-06, + "loss": 0.8775, + "step": 9751 + }, + { + "epoch": 0.75177304964539, + "grad_norm": 3.5623207092285156, + "learning_rate": 1.5308415036595076e-06, + "loss": 0.799, + "step": 9752 + }, + { + "epoch": 0.7518501387604071, + "grad_norm": 4.15452241897583, + "learning_rate": 1.5299425585176292e-06, + "loss": 0.9851, + "step": 9753 + }, + { + "epoch": 0.751927227875424, + "grad_norm": 3.781522274017334, + "learning_rate": 1.52904382971586e-06, + "loss": 1.033, + "step": 9754 + }, + { + "epoch": 0.7520043169904409, + "grad_norm": 3.5940423011779785, + "learning_rate": 1.528145317310231e-06, + "loss": 0.9551, + "step": 9755 + }, + { + "epoch": 0.7520814061054579, + "grad_norm": 3.518636465072632, + "learning_rate": 1.527247021356763e-06, + "loss": 0.8302, + "step": 9756 + }, + { + "epoch": 0.7521584952204748, + "grad_norm": 3.90459942817688, + "learning_rate": 1.5263489419114552e-06, + "loss": 0.9807, + "step": 9757 + }, + { + "epoch": 0.7522355843354919, + "grad_norm": 3.519850492477417, + "learning_rate": 1.5254510790303e-06, + "loss": 0.9089, + "step": 9758 + }, + { + "epoch": 0.7523126734505088, + "grad_norm": 3.474034547805786, + "learning_rate": 1.5245534327692751e-06, + "loss": 0.8635, + "step": 9759 + }, + { + "epoch": 0.7523897625655257, + "grad_norm": 3.7472171783447266, + "learning_rate": 1.5236560031843445e-06, + "loss": 0.8809, + "step": 9760 + }, + { + "epoch": 0.7524668516805427, + "grad_norm": 3.900770902633667, + "learning_rate": 1.5227587903314568e-06, + "loss": 0.9003, + "step": 9761 + }, + { + "epoch": 0.7525439407955596, + "grad_norm": 3.990259885787964, + "learning_rate": 1.5218617942665497e-06, + "loss": 0.8674, + "step": 9762 + }, + { + "epoch": 0.7526210299105767, + "grad_norm": 3.8502142429351807, + "learning_rate": 1.5209650150455462e-06, + "loss": 0.8873, + "step": 9763 + }, + { + "epoch": 0.7526981190255936, + "grad_norm": 3.540637969970703, + "learning_rate": 1.5200684527243552e-06, + "loss": 0.8397, + "step": 9764 + }, + { + "epoch": 0.7527752081406105, + "grad_norm": 3.576173782348633, + "learning_rate": 1.5191721073588755e-06, + "loss": 0.9391, + "step": 9765 + }, + { + "epoch": 0.7528522972556275, + "grad_norm": 3.8144445419311523, + "learning_rate": 1.518275979004985e-06, + "loss": 0.9871, + "step": 9766 + }, + { + "epoch": 0.7529293863706444, + "grad_norm": 4.009185314178467, + "learning_rate": 1.517380067718555e-06, + "loss": 0.9879, + "step": 9767 + }, + { + "epoch": 0.7530064754856615, + "grad_norm": 3.6188137531280518, + "learning_rate": 1.5164843735554408e-06, + "loss": 0.8921, + "step": 9768 + }, + { + "epoch": 0.7530835646006784, + "grad_norm": 3.5967025756835938, + "learning_rate": 1.5155888965714843e-06, + "loss": 0.8747, + "step": 9769 + }, + { + "epoch": 0.7531606537156953, + "grad_norm": 3.705009937286377, + "learning_rate": 1.514693636822514e-06, + "loss": 0.8078, + "step": 9770 + }, + { + "epoch": 0.7532377428307123, + "grad_norm": 3.469721794128418, + "learning_rate": 1.5137985943643463e-06, + "loss": 0.8209, + "step": 9771 + }, + { + "epoch": 0.7533148319457292, + "grad_norm": 3.523388385772705, + "learning_rate": 1.5129037692527794e-06, + "loss": 0.8536, + "step": 9772 + }, + { + "epoch": 0.7533919210607463, + "grad_norm": 4.049753665924072, + "learning_rate": 1.5120091615436016e-06, + "loss": 0.8941, + "step": 9773 + }, + { + "epoch": 0.7534690101757632, + "grad_norm": 3.7952821254730225, + "learning_rate": 1.5111147712925884e-06, + "loss": 0.884, + "step": 9774 + }, + { + "epoch": 0.7535460992907801, + "grad_norm": 3.82209849357605, + "learning_rate": 1.5102205985554992e-06, + "loss": 0.7733, + "step": 9775 + }, + { + "epoch": 0.7536231884057971, + "grad_norm": 3.5352354049682617, + "learning_rate": 1.5093266433880837e-06, + "loss": 0.8167, + "step": 9776 + }, + { + "epoch": 0.753700277520814, + "grad_norm": 3.5197014808654785, + "learning_rate": 1.5084329058460716e-06, + "loss": 0.9545, + "step": 9777 + }, + { + "epoch": 0.7537773666358311, + "grad_norm": 3.3797638416290283, + "learning_rate": 1.5075393859851844e-06, + "loss": 0.7952, + "step": 9778 + }, + { + "epoch": 0.753854455750848, + "grad_norm": 3.590395927429199, + "learning_rate": 1.5066460838611292e-06, + "loss": 0.8798, + "step": 9779 + }, + { + "epoch": 0.7539315448658649, + "grad_norm": 3.962991952896118, + "learning_rate": 1.505752999529597e-06, + "loss": 0.8392, + "step": 9780 + }, + { + "epoch": 0.7540086339808819, + "grad_norm": 3.7387423515319824, + "learning_rate": 1.5048601330462693e-06, + "loss": 0.9752, + "step": 9781 + }, + { + "epoch": 0.7540857230958988, + "grad_norm": 3.6760013103485107, + "learning_rate": 1.5039674844668112e-06, + "loss": 0.8867, + "step": 9782 + }, + { + "epoch": 0.7541628122109159, + "grad_norm": 3.807915449142456, + "learning_rate": 1.503075053846873e-06, + "loss": 0.9201, + "step": 9783 + }, + { + "epoch": 0.7542399013259328, + "grad_norm": 3.836941957473755, + "learning_rate": 1.5021828412420942e-06, + "loss": 0.9706, + "step": 9784 + }, + { + "epoch": 0.7543169904409497, + "grad_norm": 3.9428610801696777, + "learning_rate": 1.501290846708099e-06, + "loss": 0.893, + "step": 9785 + }, + { + "epoch": 0.7543940795559667, + "grad_norm": 3.6269469261169434, + "learning_rate": 1.5003990703004994e-06, + "loss": 0.8229, + "step": 9786 + }, + { + "epoch": 0.7544711686709836, + "grad_norm": 3.6933364868164062, + "learning_rate": 1.4995075120748948e-06, + "loss": 0.945, + "step": 9787 + }, + { + "epoch": 0.7545482577860007, + "grad_norm": 3.697035312652588, + "learning_rate": 1.498616172086866e-06, + "loss": 0.9209, + "step": 9788 + }, + { + "epoch": 0.7546253469010176, + "grad_norm": 3.5978927612304688, + "learning_rate": 1.4977250503919839e-06, + "loss": 0.961, + "step": 9789 + }, + { + "epoch": 0.7547024360160345, + "grad_norm": 3.69553804397583, + "learning_rate": 1.4968341470458064e-06, + "loss": 0.7702, + "step": 9790 + }, + { + "epoch": 0.7547795251310515, + "grad_norm": 3.7523858547210693, + "learning_rate": 1.495943462103877e-06, + "loss": 1.0264, + "step": 9791 + }, + { + "epoch": 0.7548566142460684, + "grad_norm": 3.5115909576416016, + "learning_rate": 1.495052995621724e-06, + "loss": 0.8715, + "step": 9792 + }, + { + "epoch": 0.7549337033610855, + "grad_norm": 4.038724422454834, + "learning_rate": 1.4941627476548665e-06, + "loss": 0.9478, + "step": 9793 + }, + { + "epoch": 0.7550107924761024, + "grad_norm": 3.764338970184326, + "learning_rate": 1.4932727182588025e-06, + "loss": 0.9636, + "step": 9794 + }, + { + "epoch": 0.7550878815911193, + "grad_norm": 3.6738407611846924, + "learning_rate": 1.4923829074890222e-06, + "loss": 0.9713, + "step": 9795 + }, + { + "epoch": 0.7551649707061363, + "grad_norm": 3.553886890411377, + "learning_rate": 1.491493315401002e-06, + "loss": 0.8905, + "step": 9796 + }, + { + "epoch": 0.7552420598211532, + "grad_norm": 3.702172040939331, + "learning_rate": 1.4906039420502022e-06, + "loss": 0.906, + "step": 9797 + }, + { + "epoch": 0.7553191489361702, + "grad_norm": 3.901486396789551, + "learning_rate": 1.4897147874920726e-06, + "loss": 0.8935, + "step": 9798 + }, + { + "epoch": 0.7553962380511872, + "grad_norm": 3.898955821990967, + "learning_rate": 1.4888258517820442e-06, + "loss": 0.83, + "step": 9799 + }, + { + "epoch": 0.7554733271662041, + "grad_norm": 3.40311598777771, + "learning_rate": 1.4879371349755395e-06, + "loss": 0.79, + "step": 9800 + }, + { + "epoch": 0.7555504162812211, + "grad_norm": 3.6599347591400146, + "learning_rate": 1.4870486371279647e-06, + "loss": 0.9212, + "step": 9801 + }, + { + "epoch": 0.755627505396238, + "grad_norm": 3.6006016731262207, + "learning_rate": 1.486160358294713e-06, + "loss": 0.914, + "step": 9802 + }, + { + "epoch": 0.755704594511255, + "grad_norm": 3.814122438430786, + "learning_rate": 1.4852722985311647e-06, + "loss": 0.965, + "step": 9803 + }, + { + "epoch": 0.755781683626272, + "grad_norm": 3.5150370597839355, + "learning_rate": 1.4843844578926863e-06, + "loss": 0.8573, + "step": 9804 + }, + { + "epoch": 0.7558587727412889, + "grad_norm": 3.7043817043304443, + "learning_rate": 1.483496836434627e-06, + "loss": 0.9301, + "step": 9805 + }, + { + "epoch": 0.7559358618563059, + "grad_norm": 3.6749911308288574, + "learning_rate": 1.4826094342123282e-06, + "loss": 0.8818, + "step": 9806 + }, + { + "epoch": 0.7560129509713228, + "grad_norm": 4.301516532897949, + "learning_rate": 1.4817222512811146e-06, + "loss": 0.9437, + "step": 9807 + }, + { + "epoch": 0.7560900400863398, + "grad_norm": 4.242541313171387, + "learning_rate": 1.4808352876962984e-06, + "loss": 0.9117, + "step": 9808 + }, + { + "epoch": 0.7561671292013568, + "grad_norm": 3.7353222370147705, + "learning_rate": 1.4799485435131745e-06, + "loss": 0.8453, + "step": 9809 + }, + { + "epoch": 0.7562442183163737, + "grad_norm": 3.8031511306762695, + "learning_rate": 1.4790620187870275e-06, + "loss": 0.9015, + "step": 9810 + }, + { + "epoch": 0.7563213074313907, + "grad_norm": 4.096917629241943, + "learning_rate": 1.478175713573129e-06, + "loss": 0.951, + "step": 9811 + }, + { + "epoch": 0.7563983965464076, + "grad_norm": 3.349189043045044, + "learning_rate": 1.477289627926734e-06, + "loss": 0.8813, + "step": 9812 + }, + { + "epoch": 0.7564754856614246, + "grad_norm": 3.9046249389648438, + "learning_rate": 1.476403761903088e-06, + "loss": 0.8709, + "step": 9813 + }, + { + "epoch": 0.7565525747764416, + "grad_norm": 4.114388465881348, + "learning_rate": 1.4755181155574166e-06, + "loss": 0.9669, + "step": 9814 + }, + { + "epoch": 0.7566296638914586, + "grad_norm": 3.496314287185669, + "learning_rate": 1.4746326889449375e-06, + "loss": 0.9647, + "step": 9815 + }, + { + "epoch": 0.7567067530064755, + "grad_norm": 3.8805534839630127, + "learning_rate": 1.4737474821208513e-06, + "loss": 0.8806, + "step": 9816 + }, + { + "epoch": 0.7567838421214924, + "grad_norm": 3.560910224914551, + "learning_rate": 1.472862495140347e-06, + "loss": 0.7968, + "step": 9817 + }, + { + "epoch": 0.7568609312365094, + "grad_norm": 3.6736807823181152, + "learning_rate": 1.4719777280585983e-06, + "loss": 0.8541, + "step": 9818 + }, + { + "epoch": 0.7569380203515264, + "grad_norm": 4.000894069671631, + "learning_rate": 1.4710931809307677e-06, + "loss": 0.9703, + "step": 9819 + }, + { + "epoch": 0.7570151094665434, + "grad_norm": 4.015716075897217, + "learning_rate": 1.4702088538119996e-06, + "loss": 1.0352, + "step": 9820 + }, + { + "epoch": 0.7570921985815603, + "grad_norm": 3.6512205600738525, + "learning_rate": 1.4693247467574273e-06, + "loss": 0.9386, + "step": 9821 + }, + { + "epoch": 0.7571692876965772, + "grad_norm": 3.792320728302002, + "learning_rate": 1.4684408598221722e-06, + "loss": 0.9188, + "step": 9822 + }, + { + "epoch": 0.7572463768115942, + "grad_norm": 3.727431297302246, + "learning_rate": 1.4675571930613385e-06, + "loss": 0.8135, + "step": 9823 + }, + { + "epoch": 0.7573234659266112, + "grad_norm": 3.9126195907592773, + "learning_rate": 1.4666737465300202e-06, + "loss": 0.958, + "step": 9824 + }, + { + "epoch": 0.7574005550416282, + "grad_norm": 3.749931573867798, + "learning_rate": 1.4657905202832928e-06, + "loss": 0.8214, + "step": 9825 + }, + { + "epoch": 0.7574776441566451, + "grad_norm": 3.8023717403411865, + "learning_rate": 1.4649075143762225e-06, + "loss": 0.9376, + "step": 9826 + }, + { + "epoch": 0.757554733271662, + "grad_norm": 3.8662075996398926, + "learning_rate": 1.4640247288638603e-06, + "loss": 0.9735, + "step": 9827 + }, + { + "epoch": 0.757631822386679, + "grad_norm": 3.9173572063446045, + "learning_rate": 1.463142163801242e-06, + "loss": 0.9421, + "step": 9828 + }, + { + "epoch": 0.757708911501696, + "grad_norm": 3.737765312194824, + "learning_rate": 1.4622598192433928e-06, + "loss": 0.9159, + "step": 9829 + }, + { + "epoch": 0.757786000616713, + "grad_norm": 3.5803210735321045, + "learning_rate": 1.461377695245323e-06, + "loss": 0.8864, + "step": 9830 + }, + { + "epoch": 0.7578630897317299, + "grad_norm": 3.9155960083007812, + "learning_rate": 1.460495791862025e-06, + "loss": 0.9516, + "step": 9831 + }, + { + "epoch": 0.7579401788467468, + "grad_norm": 3.639129161834717, + "learning_rate": 1.4596141091484828e-06, + "loss": 0.8667, + "step": 9832 + }, + { + "epoch": 0.7580172679617638, + "grad_norm": 3.9074134826660156, + "learning_rate": 1.4587326471596647e-06, + "loss": 0.9008, + "step": 9833 + }, + { + "epoch": 0.7580943570767807, + "grad_norm": 3.6987130641937256, + "learning_rate": 1.4578514059505256e-06, + "loss": 0.9457, + "step": 9834 + }, + { + "epoch": 0.7581714461917978, + "grad_norm": 3.8588478565216064, + "learning_rate": 1.4569703855760076e-06, + "loss": 0.9084, + "step": 9835 + }, + { + "epoch": 0.7582485353068147, + "grad_norm": 3.9179983139038086, + "learning_rate": 1.4560895860910345e-06, + "loss": 0.9484, + "step": 9836 + }, + { + "epoch": 0.7583256244218316, + "grad_norm": 3.7824649810791016, + "learning_rate": 1.4552090075505215e-06, + "loss": 0.945, + "step": 9837 + }, + { + "epoch": 0.7584027135368486, + "grad_norm": 3.568527936935425, + "learning_rate": 1.454328650009368e-06, + "loss": 0.9535, + "step": 9838 + }, + { + "epoch": 0.7584798026518655, + "grad_norm": 3.9821395874023438, + "learning_rate": 1.4534485135224597e-06, + "loss": 0.8473, + "step": 9839 + }, + { + "epoch": 0.7585568917668826, + "grad_norm": 3.9586429595947266, + "learning_rate": 1.452568598144668e-06, + "loss": 0.9471, + "step": 9840 + }, + { + "epoch": 0.7586339808818995, + "grad_norm": 4.391334533691406, + "learning_rate": 1.4516889039308535e-06, + "loss": 1.0125, + "step": 9841 + }, + { + "epoch": 0.7587110699969164, + "grad_norm": 3.699589252471924, + "learning_rate": 1.4508094309358573e-06, + "loss": 0.8814, + "step": 9842 + }, + { + "epoch": 0.7587881591119334, + "grad_norm": 4.033127784729004, + "learning_rate": 1.4499301792145109e-06, + "loss": 0.9453, + "step": 9843 + }, + { + "epoch": 0.7588652482269503, + "grad_norm": 3.849324941635132, + "learning_rate": 1.449051148821632e-06, + "loss": 0.8697, + "step": 9844 + }, + { + "epoch": 0.7589423373419674, + "grad_norm": 3.5151302814483643, + "learning_rate": 1.4481723398120228e-06, + "loss": 0.9318, + "step": 9845 + }, + { + "epoch": 0.7590194264569843, + "grad_norm": 3.9107677936553955, + "learning_rate": 1.4472937522404744e-06, + "loss": 0.9946, + "step": 9846 + }, + { + "epoch": 0.7590965155720012, + "grad_norm": 3.9752719402313232, + "learning_rate": 1.4464153861617597e-06, + "loss": 0.8786, + "step": 9847 + }, + { + "epoch": 0.7591736046870182, + "grad_norm": 3.470489978790283, + "learning_rate": 1.4455372416306407e-06, + "loss": 0.8099, + "step": 9848 + }, + { + "epoch": 0.7592506938020351, + "grad_norm": 4.009558200836182, + "learning_rate": 1.4446593187018637e-06, + "loss": 0.9382, + "step": 9849 + }, + { + "epoch": 0.7593277829170522, + "grad_norm": 4.106441020965576, + "learning_rate": 1.4437816174301684e-06, + "loss": 0.9847, + "step": 9850 + }, + { + "epoch": 0.7594048720320691, + "grad_norm": 3.834660768508911, + "learning_rate": 1.442904137870269e-06, + "loss": 0.8793, + "step": 9851 + }, + { + "epoch": 0.759481961147086, + "grad_norm": 3.583991050720215, + "learning_rate": 1.4420268800768744e-06, + "loss": 0.8814, + "step": 9852 + }, + { + "epoch": 0.759559050262103, + "grad_norm": 3.8111629486083984, + "learning_rate": 1.4411498441046761e-06, + "loss": 0.9838, + "step": 9853 + }, + { + "epoch": 0.7596361393771199, + "grad_norm": 3.4147655963897705, + "learning_rate": 1.4402730300083534e-06, + "loss": 0.8173, + "step": 9854 + }, + { + "epoch": 0.759713228492137, + "grad_norm": 3.8176896572113037, + "learning_rate": 1.4393964378425712e-06, + "loss": 0.8702, + "step": 9855 + }, + { + "epoch": 0.7597903176071539, + "grad_norm": 3.6424262523651123, + "learning_rate": 1.438520067661982e-06, + "loss": 0.7725, + "step": 9856 + }, + { + "epoch": 0.7598674067221708, + "grad_norm": 3.9817745685577393, + "learning_rate": 1.4376439195212194e-06, + "loss": 0.9865, + "step": 9857 + }, + { + "epoch": 0.7599444958371878, + "grad_norm": 3.7620887756347656, + "learning_rate": 1.4367679934749085e-06, + "loss": 1.0232, + "step": 9858 + }, + { + "epoch": 0.7600215849522047, + "grad_norm": 3.7138636112213135, + "learning_rate": 1.4358922895776584e-06, + "loss": 0.9584, + "step": 9859 + }, + { + "epoch": 0.7600986740672218, + "grad_norm": 3.80833101272583, + "learning_rate": 1.4350168078840653e-06, + "loss": 0.9049, + "step": 9860 + }, + { + "epoch": 0.7601757631822387, + "grad_norm": 3.6317269802093506, + "learning_rate": 1.4341415484487126e-06, + "loss": 0.8555, + "step": 9861 + }, + { + "epoch": 0.7602528522972556, + "grad_norm": 3.839651107788086, + "learning_rate": 1.4332665113261645e-06, + "loss": 0.8839, + "step": 9862 + }, + { + "epoch": 0.7603299414122726, + "grad_norm": 4.206575870513916, + "learning_rate": 1.4323916965709766e-06, + "loss": 0.9231, + "step": 9863 + }, + { + "epoch": 0.7604070305272895, + "grad_norm": 3.6243884563446045, + "learning_rate": 1.4315171042376897e-06, + "loss": 0.953, + "step": 9864 + }, + { + "epoch": 0.7604841196423066, + "grad_norm": 3.7419850826263428, + "learning_rate": 1.43064273438083e-06, + "loss": 1.0172, + "step": 9865 + }, + { + "epoch": 0.7605612087573235, + "grad_norm": 3.832564353942871, + "learning_rate": 1.4297685870549088e-06, + "loss": 0.8665, + "step": 9866 + }, + { + "epoch": 0.7606382978723404, + "grad_norm": 3.582911491394043, + "learning_rate": 1.4288946623144272e-06, + "loss": 0.8848, + "step": 9867 + }, + { + "epoch": 0.7607153869873574, + "grad_norm": 3.7686104774475098, + "learning_rate": 1.4280209602138673e-06, + "loss": 0.932, + "step": 9868 + }, + { + "epoch": 0.7607924761023743, + "grad_norm": 3.6485867500305176, + "learning_rate": 1.4271474808077e-06, + "loss": 0.9416, + "step": 9869 + }, + { + "epoch": 0.7608695652173914, + "grad_norm": 3.6767213344573975, + "learning_rate": 1.4262742241503836e-06, + "loss": 0.9576, + "step": 9870 + }, + { + "epoch": 0.7609466543324083, + "grad_norm": 3.4594292640686035, + "learning_rate": 1.42540119029636e-06, + "loss": 0.8465, + "step": 9871 + }, + { + "epoch": 0.7610237434474252, + "grad_norm": 3.935392141342163, + "learning_rate": 1.4245283793000608e-06, + "loss": 0.9516, + "step": 9872 + }, + { + "epoch": 0.7611008325624422, + "grad_norm": 3.8447792530059814, + "learning_rate": 1.4236557912158977e-06, + "loss": 0.8472, + "step": 9873 + }, + { + "epoch": 0.7611779216774591, + "grad_norm": 3.4628243446350098, + "learning_rate": 1.4227834260982732e-06, + "loss": 0.861, + "step": 9874 + }, + { + "epoch": 0.7612550107924761, + "grad_norm": 3.9336140155792236, + "learning_rate": 1.4219112840015759e-06, + "loss": 0.9094, + "step": 9875 + }, + { + "epoch": 0.7613320999074931, + "grad_norm": 4.2745041847229, + "learning_rate": 1.421039364980178e-06, + "loss": 0.9124, + "step": 9876 + }, + { + "epoch": 0.76140918902251, + "grad_norm": 4.0710625648498535, + "learning_rate": 1.4201676690884403e-06, + "loss": 0.7956, + "step": 9877 + }, + { + "epoch": 0.761486278137527, + "grad_norm": 3.9371089935302734, + "learning_rate": 1.4192961963807094e-06, + "loss": 0.9185, + "step": 9878 + }, + { + "epoch": 0.7615633672525439, + "grad_norm": 3.549879789352417, + "learning_rate": 1.4184249469113138e-06, + "loss": 0.8869, + "step": 9879 + }, + { + "epoch": 0.761640456367561, + "grad_norm": 3.896291494369507, + "learning_rate": 1.417553920734574e-06, + "loss": 0.8783, + "step": 9880 + }, + { + "epoch": 0.7617175454825779, + "grad_norm": 3.623753070831299, + "learning_rate": 1.4166831179047923e-06, + "loss": 0.8513, + "step": 9881 + }, + { + "epoch": 0.7617946345975948, + "grad_norm": 3.9215145111083984, + "learning_rate": 1.4158125384762606e-06, + "loss": 0.8208, + "step": 9882 + }, + { + "epoch": 0.7618717237126118, + "grad_norm": 3.590507745742798, + "learning_rate": 1.4149421825032556e-06, + "loss": 0.918, + "step": 9883 + }, + { + "epoch": 0.7619488128276287, + "grad_norm": 4.319372177124023, + "learning_rate": 1.4140720500400363e-06, + "loss": 0.9689, + "step": 9884 + }, + { + "epoch": 0.7620259019426457, + "grad_norm": 3.8288538455963135, + "learning_rate": 1.4132021411408526e-06, + "loss": 0.9748, + "step": 9885 + }, + { + "epoch": 0.7621029910576627, + "grad_norm": 4.058238506317139, + "learning_rate": 1.4123324558599389e-06, + "loss": 1.0233, + "step": 9886 + }, + { + "epoch": 0.7621800801726796, + "grad_norm": 3.779918670654297, + "learning_rate": 1.4114629942515156e-06, + "loss": 0.8905, + "step": 9887 + }, + { + "epoch": 0.7622571692876966, + "grad_norm": 3.943232774734497, + "learning_rate": 1.410593756369789e-06, + "loss": 0.8821, + "step": 9888 + }, + { + "epoch": 0.7623342584027135, + "grad_norm": 3.8813998699188232, + "learning_rate": 1.4097247422689537e-06, + "loss": 0.9586, + "step": 9889 + }, + { + "epoch": 0.7624113475177305, + "grad_norm": 4.069021224975586, + "learning_rate": 1.408855952003184e-06, + "loss": 0.9016, + "step": 9890 + }, + { + "epoch": 0.7624884366327475, + "grad_norm": 3.904691457748413, + "learning_rate": 1.4079873856266468e-06, + "loss": 0.8317, + "step": 9891 + }, + { + "epoch": 0.7625655257477644, + "grad_norm": 3.729013681411743, + "learning_rate": 1.4071190431934934e-06, + "loss": 1.0032, + "step": 9892 + }, + { + "epoch": 0.7626426148627814, + "grad_norm": 3.8653934001922607, + "learning_rate": 1.4062509247578586e-06, + "loss": 0.9653, + "step": 9893 + }, + { + "epoch": 0.7627197039777983, + "grad_norm": 3.8940982818603516, + "learning_rate": 1.405383030373867e-06, + "loss": 1.0486, + "step": 9894 + }, + { + "epoch": 0.7627967930928153, + "grad_norm": 3.855316162109375, + "learning_rate": 1.4045153600956257e-06, + "loss": 1.0069, + "step": 9895 + }, + { + "epoch": 0.7628738822078323, + "grad_norm": 4.0627827644348145, + "learning_rate": 1.4036479139772309e-06, + "loss": 0.9452, + "step": 9896 + }, + { + "epoch": 0.7629509713228492, + "grad_norm": 3.806206464767456, + "learning_rate": 1.402780692072762e-06, + "loss": 0.8991, + "step": 9897 + }, + { + "epoch": 0.7630280604378662, + "grad_norm": 4.070652484893799, + "learning_rate": 1.4019136944362882e-06, + "loss": 0.9424, + "step": 9898 + }, + { + "epoch": 0.7631051495528831, + "grad_norm": 3.583237409591675, + "learning_rate": 1.401046921121859e-06, + "loss": 0.9428, + "step": 9899 + }, + { + "epoch": 0.7631822386679001, + "grad_norm": 3.9483799934387207, + "learning_rate": 1.400180372183515e-06, + "loss": 1.017, + "step": 9900 + }, + { + "epoch": 0.763259327782917, + "grad_norm": 3.897564172744751, + "learning_rate": 1.3993140476752808e-06, + "loss": 0.8972, + "step": 9901 + }, + { + "epoch": 0.763336416897934, + "grad_norm": 3.622753620147705, + "learning_rate": 1.3984479476511676e-06, + "loss": 0.925, + "step": 9902 + }, + { + "epoch": 0.763413506012951, + "grad_norm": 3.872459888458252, + "learning_rate": 1.3975820721651718e-06, + "loss": 0.9007, + "step": 9903 + }, + { + "epoch": 0.7634905951279679, + "grad_norm": 3.8231959342956543, + "learning_rate": 1.3967164212712774e-06, + "loss": 0.8777, + "step": 9904 + }, + { + "epoch": 0.7635676842429849, + "grad_norm": 3.6052405834198, + "learning_rate": 1.3958509950234516e-06, + "loss": 0.926, + "step": 9905 + }, + { + "epoch": 0.7636447733580018, + "grad_norm": 4.091972351074219, + "learning_rate": 1.3949857934756495e-06, + "loss": 0.9525, + "step": 9906 + }, + { + "epoch": 0.7637218624730188, + "grad_norm": 3.3216168880462646, + "learning_rate": 1.394120816681812e-06, + "loss": 0.8179, + "step": 9907 + }, + { + "epoch": 0.7637989515880358, + "grad_norm": 3.8997809886932373, + "learning_rate": 1.3932560646958665e-06, + "loss": 0.9877, + "step": 9908 + }, + { + "epoch": 0.7638760407030527, + "grad_norm": 3.5005760192871094, + "learning_rate": 1.3923915375717272e-06, + "loss": 0.9585, + "step": 9909 + }, + { + "epoch": 0.7639531298180697, + "grad_norm": 3.4733173847198486, + "learning_rate": 1.3915272353632896e-06, + "loss": 0.873, + "step": 9910 + }, + { + "epoch": 0.7640302189330866, + "grad_norm": 3.862825393676758, + "learning_rate": 1.39066315812444e-06, + "loss": 0.8944, + "step": 9911 + }, + { + "epoch": 0.7641073080481036, + "grad_norm": 3.6997947692871094, + "learning_rate": 1.3897993059090492e-06, + "loss": 0.9553, + "step": 9912 + }, + { + "epoch": 0.7641843971631206, + "grad_norm": 4.415987491607666, + "learning_rate": 1.388935678770974e-06, + "loss": 0.8898, + "step": 9913 + }, + { + "epoch": 0.7642614862781375, + "grad_norm": 3.8351826667785645, + "learning_rate": 1.3880722767640575e-06, + "loss": 1.0199, + "step": 9914 + }, + { + "epoch": 0.7643385753931545, + "grad_norm": 3.8005855083465576, + "learning_rate": 1.387209099942129e-06, + "loss": 0.8509, + "step": 9915 + }, + { + "epoch": 0.7644156645081714, + "grad_norm": 3.49656343460083, + "learning_rate": 1.3863461483590008e-06, + "loss": 0.8576, + "step": 9916 + }, + { + "epoch": 0.7644927536231884, + "grad_norm": 3.7017486095428467, + "learning_rate": 1.3854834220684743e-06, + "loss": 0.976, + "step": 9917 + }, + { + "epoch": 0.7645698427382054, + "grad_norm": 3.82146954536438, + "learning_rate": 1.3846209211243366e-06, + "loss": 0.8766, + "step": 9918 + }, + { + "epoch": 0.7646469318532223, + "grad_norm": 3.6849117279052734, + "learning_rate": 1.3837586455803599e-06, + "loss": 0.9259, + "step": 9919 + }, + { + "epoch": 0.7647240209682393, + "grad_norm": 3.7132599353790283, + "learning_rate": 1.382896595490304e-06, + "loss": 0.853, + "step": 9920 + }, + { + "epoch": 0.7648011100832562, + "grad_norm": 4.089300632476807, + "learning_rate": 1.3820347709079103e-06, + "loss": 0.9161, + "step": 9921 + }, + { + "epoch": 0.7648781991982732, + "grad_norm": 3.6763312816619873, + "learning_rate": 1.3811731718869108e-06, + "loss": 0.9985, + "step": 9922 + }, + { + "epoch": 0.7649552883132902, + "grad_norm": 3.901153326034546, + "learning_rate": 1.3803117984810221e-06, + "loss": 0.9706, + "step": 9923 + }, + { + "epoch": 0.7650323774283071, + "grad_norm": 3.6680846214294434, + "learning_rate": 1.3794506507439454e-06, + "loss": 0.9672, + "step": 9924 + }, + { + "epoch": 0.7651094665433241, + "grad_norm": 3.704328775405884, + "learning_rate": 1.3785897287293693e-06, + "loss": 0.9395, + "step": 9925 + }, + { + "epoch": 0.765186555658341, + "grad_norm": 3.6774888038635254, + "learning_rate": 1.3777290324909698e-06, + "loss": 1.0224, + "step": 9926 + }, + { + "epoch": 0.765263644773358, + "grad_norm": 3.7571513652801514, + "learning_rate": 1.376868562082403e-06, + "loss": 1.0518, + "step": 9927 + }, + { + "epoch": 0.765340733888375, + "grad_norm": 3.6580474376678467, + "learning_rate": 1.3760083175573168e-06, + "loss": 0.8153, + "step": 9928 + }, + { + "epoch": 0.7654178230033919, + "grad_norm": 4.089507579803467, + "learning_rate": 1.3751482989693433e-06, + "loss": 0.9439, + "step": 9929 + }, + { + "epoch": 0.7654949121184089, + "grad_norm": 3.566601037979126, + "learning_rate": 1.374288506372099e-06, + "loss": 0.8285, + "step": 9930 + }, + { + "epoch": 0.7655720012334258, + "grad_norm": 3.4016520977020264, + "learning_rate": 1.3734289398191902e-06, + "loss": 0.8906, + "step": 9931 + }, + { + "epoch": 0.7656490903484428, + "grad_norm": 3.8981709480285645, + "learning_rate": 1.372569599364203e-06, + "loss": 1.0273, + "step": 9932 + }, + { + "epoch": 0.7657261794634598, + "grad_norm": 4.178238868713379, + "learning_rate": 1.3717104850607144e-06, + "loss": 1.0948, + "step": 9933 + }, + { + "epoch": 0.7658032685784767, + "grad_norm": 3.904507875442505, + "learning_rate": 1.3708515969622854e-06, + "loss": 1.0076, + "step": 9934 + }, + { + "epoch": 0.7658803576934937, + "grad_norm": 3.62880802154541, + "learning_rate": 1.369992935122464e-06, + "loss": 0.9251, + "step": 9935 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 3.961493968963623, + "learning_rate": 1.369134499594782e-06, + "loss": 0.9443, + "step": 9936 + }, + { + "epoch": 0.7660345359235275, + "grad_norm": 3.5568950176239014, + "learning_rate": 1.3682762904327613e-06, + "loss": 0.9349, + "step": 9937 + }, + { + "epoch": 0.7661116250385446, + "grad_norm": 4.020711898803711, + "learning_rate": 1.3674183076899016e-06, + "loss": 0.8839, + "step": 9938 + }, + { + "epoch": 0.7661887141535615, + "grad_norm": 4.319362163543701, + "learning_rate": 1.3665605514196984e-06, + "loss": 1.0175, + "step": 9939 + }, + { + "epoch": 0.7662658032685785, + "grad_norm": 3.667689085006714, + "learning_rate": 1.3657030216756263e-06, + "loss": 0.8792, + "step": 9940 + }, + { + "epoch": 0.7663428923835954, + "grad_norm": 3.632988214492798, + "learning_rate": 1.3648457185111502e-06, + "loss": 0.8438, + "step": 9941 + }, + { + "epoch": 0.7664199814986123, + "grad_norm": 3.730689525604248, + "learning_rate": 1.363988641979715e-06, + "loss": 0.9797, + "step": 9942 + }, + { + "epoch": 0.7664970706136294, + "grad_norm": 3.5583460330963135, + "learning_rate": 1.3631317921347564e-06, + "loss": 0.7986, + "step": 9943 + }, + { + "epoch": 0.7665741597286463, + "grad_norm": 3.7527616024017334, + "learning_rate": 1.3622751690296947e-06, + "loss": 0.9162, + "step": 9944 + }, + { + "epoch": 0.7666512488436633, + "grad_norm": 3.690645694732666, + "learning_rate": 1.3614187727179368e-06, + "loss": 0.9612, + "step": 9945 + }, + { + "epoch": 0.7667283379586802, + "grad_norm": 3.8504841327667236, + "learning_rate": 1.3605626032528746e-06, + "loss": 0.8196, + "step": 9946 + }, + { + "epoch": 0.7668054270736971, + "grad_norm": 3.3687868118286133, + "learning_rate": 1.3597066606878834e-06, + "loss": 0.8102, + "step": 9947 + }, + { + "epoch": 0.7668825161887142, + "grad_norm": 3.635887622833252, + "learning_rate": 1.3588509450763281e-06, + "loss": 0.8814, + "step": 9948 + }, + { + "epoch": 0.7669596053037311, + "grad_norm": 3.819847822189331, + "learning_rate": 1.3579954564715587e-06, + "loss": 0.8737, + "step": 9949 + }, + { + "epoch": 0.7670366944187481, + "grad_norm": 3.8392279148101807, + "learning_rate": 1.3571401949269103e-06, + "loss": 0.9532, + "step": 9950 + }, + { + "epoch": 0.767113783533765, + "grad_norm": 3.4588663578033447, + "learning_rate": 1.356285160495704e-06, + "loss": 0.7837, + "step": 9951 + }, + { + "epoch": 0.7671908726487819, + "grad_norm": 3.9829211235046387, + "learning_rate": 1.3554303532312475e-06, + "loss": 0.9798, + "step": 9952 + }, + { + "epoch": 0.767267961763799, + "grad_norm": 3.6021299362182617, + "learning_rate": 1.354575773186832e-06, + "loss": 0.8786, + "step": 9953 + }, + { + "epoch": 0.7673450508788159, + "grad_norm": 3.810025453567505, + "learning_rate": 1.353721420415736e-06, + "loss": 0.9356, + "step": 9954 + }, + { + "epoch": 0.7674221399938329, + "grad_norm": 3.6363253593444824, + "learning_rate": 1.3528672949712257e-06, + "loss": 0.9352, + "step": 9955 + }, + { + "epoch": 0.7674992291088498, + "grad_norm": 3.648545980453491, + "learning_rate": 1.3520133969065502e-06, + "loss": 0.8348, + "step": 9956 + }, + { + "epoch": 0.7675763182238667, + "grad_norm": 3.741356611251831, + "learning_rate": 1.3511597262749476e-06, + "loss": 0.8785, + "step": 9957 + }, + { + "epoch": 0.7676534073388838, + "grad_norm": 3.7994656562805176, + "learning_rate": 1.3503062831296372e-06, + "loss": 0.9241, + "step": 9958 + }, + { + "epoch": 0.7677304964539007, + "grad_norm": 3.677051544189453, + "learning_rate": 1.349453067523827e-06, + "loss": 0.9047, + "step": 9959 + }, + { + "epoch": 0.7678075855689177, + "grad_norm": 3.7097387313842773, + "learning_rate": 1.3486000795107118e-06, + "loss": 0.9048, + "step": 9960 + }, + { + "epoch": 0.7678846746839346, + "grad_norm": 3.4744412899017334, + "learning_rate": 1.3477473191434703e-06, + "loss": 0.939, + "step": 9961 + }, + { + "epoch": 0.7679617637989515, + "grad_norm": 4.095146179199219, + "learning_rate": 1.346894786475268e-06, + "loss": 1.0014, + "step": 9962 + }, + { + "epoch": 0.7680388529139686, + "grad_norm": 3.7162740230560303, + "learning_rate": 1.3460424815592577e-06, + "loss": 0.8576, + "step": 9963 + }, + { + "epoch": 0.7681159420289855, + "grad_norm": 3.45137095451355, + "learning_rate": 1.3451904044485725e-06, + "loss": 0.8469, + "step": 9964 + }, + { + "epoch": 0.7681930311440025, + "grad_norm": 3.4185662269592285, + "learning_rate": 1.3443385551963373e-06, + "loss": 0.741, + "step": 9965 + }, + { + "epoch": 0.7682701202590194, + "grad_norm": 3.571277618408203, + "learning_rate": 1.3434869338556594e-06, + "loss": 0.9274, + "step": 9966 + }, + { + "epoch": 0.7683472093740363, + "grad_norm": 3.5546271800994873, + "learning_rate": 1.3426355404796337e-06, + "loss": 0.8564, + "step": 9967 + }, + { + "epoch": 0.7684242984890534, + "grad_norm": 3.800027370452881, + "learning_rate": 1.341784375121342e-06, + "loss": 0.9594, + "step": 9968 + }, + { + "epoch": 0.7685013876040703, + "grad_norm": 3.928795337677002, + "learning_rate": 1.3409334378338461e-06, + "loss": 0.9045, + "step": 9969 + }, + { + "epoch": 0.7685784767190873, + "grad_norm": 3.6231064796447754, + "learning_rate": 1.3400827286702001e-06, + "loss": 0.9071, + "step": 9970 + }, + { + "epoch": 0.7686555658341042, + "grad_norm": 3.7267343997955322, + "learning_rate": 1.3392322476834402e-06, + "loss": 0.9177, + "step": 9971 + }, + { + "epoch": 0.7687326549491211, + "grad_norm": 3.9702486991882324, + "learning_rate": 1.3383819949265908e-06, + "loss": 0.9202, + "step": 9972 + }, + { + "epoch": 0.7688097440641382, + "grad_norm": 3.364682912826538, + "learning_rate": 1.3375319704526595e-06, + "loss": 0.8446, + "step": 9973 + }, + { + "epoch": 0.7688868331791551, + "grad_norm": 4.229642868041992, + "learning_rate": 1.336682174314643e-06, + "loss": 1.0238, + "step": 9974 + }, + { + "epoch": 0.7689639222941721, + "grad_norm": 3.9624364376068115, + "learning_rate": 1.3358326065655187e-06, + "loss": 1.0482, + "step": 9975 + }, + { + "epoch": 0.769041011409189, + "grad_norm": 3.632380485534668, + "learning_rate": 1.334983267258254e-06, + "loss": 0.8454, + "step": 9976 + }, + { + "epoch": 0.7691181005242059, + "grad_norm": 3.768838882446289, + "learning_rate": 1.334134156445801e-06, + "loss": 0.8613, + "step": 9977 + }, + { + "epoch": 0.769195189639223, + "grad_norm": 3.4363794326782227, + "learning_rate": 1.3332852741810975e-06, + "loss": 0.8998, + "step": 9978 + }, + { + "epoch": 0.7692722787542399, + "grad_norm": 3.6031389236450195, + "learning_rate": 1.332436620517068e-06, + "loss": 0.8144, + "step": 9979 + }, + { + "epoch": 0.7693493678692569, + "grad_norm": 3.714672327041626, + "learning_rate": 1.331588195506619e-06, + "loss": 1.0322, + "step": 9980 + }, + { + "epoch": 0.7694264569842738, + "grad_norm": 3.482741117477417, + "learning_rate": 1.330739999202647e-06, + "loss": 0.8334, + "step": 9981 + }, + { + "epoch": 0.7695035460992907, + "grad_norm": 4.036420822143555, + "learning_rate": 1.3298920316580304e-06, + "loss": 0.9349, + "step": 9982 + }, + { + "epoch": 0.7695806352143078, + "grad_norm": 4.182904243469238, + "learning_rate": 1.3290442929256415e-06, + "loss": 0.8901, + "step": 9983 + }, + { + "epoch": 0.7696577243293247, + "grad_norm": 3.8596582412719727, + "learning_rate": 1.3281967830583264e-06, + "loss": 0.9771, + "step": 9984 + }, + { + "epoch": 0.7697348134443417, + "grad_norm": 3.6666831970214844, + "learning_rate": 1.3273495021089255e-06, + "loss": 0.8917, + "step": 9985 + }, + { + "epoch": 0.7698119025593586, + "grad_norm": 3.531444787979126, + "learning_rate": 1.326502450130262e-06, + "loss": 0.8601, + "step": 9986 + }, + { + "epoch": 0.7698889916743756, + "grad_norm": 4.0722551345825195, + "learning_rate": 1.3256556271751454e-06, + "loss": 1.0251, + "step": 9987 + }, + { + "epoch": 0.7699660807893925, + "grad_norm": 3.8301305770874023, + "learning_rate": 1.3248090332963697e-06, + "loss": 0.9047, + "step": 9988 + }, + { + "epoch": 0.7700431699044095, + "grad_norm": 3.6400532722473145, + "learning_rate": 1.323962668546719e-06, + "loss": 0.995, + "step": 9989 + }, + { + "epoch": 0.7701202590194265, + "grad_norm": 4.178936958312988, + "learning_rate": 1.3231165329789546e-06, + "loss": 0.9048, + "step": 9990 + }, + { + "epoch": 0.7701973481344434, + "grad_norm": 4.014671325683594, + "learning_rate": 1.3222706266458323e-06, + "loss": 0.9091, + "step": 9991 + }, + { + "epoch": 0.7702744372494604, + "grad_norm": 4.306642055511475, + "learning_rate": 1.3214249496000887e-06, + "loss": 1.0158, + "step": 9992 + }, + { + "epoch": 0.7703515263644773, + "grad_norm": 3.796880006790161, + "learning_rate": 1.3205795018944473e-06, + "loss": 0.9725, + "step": 9993 + }, + { + "epoch": 0.7704286154794943, + "grad_norm": 3.244234800338745, + "learning_rate": 1.3197342835816196e-06, + "loss": 0.7616, + "step": 9994 + }, + { + "epoch": 0.7705057045945113, + "grad_norm": 3.749753713607788, + "learning_rate": 1.3188892947142973e-06, + "loss": 0.825, + "step": 9995 + }, + { + "epoch": 0.7705827937095282, + "grad_norm": 3.7566635608673096, + "learning_rate": 1.3180445353451621e-06, + "loss": 0.91, + "step": 9996 + }, + { + "epoch": 0.7706598828245452, + "grad_norm": 3.844532012939453, + "learning_rate": 1.3172000055268814e-06, + "loss": 0.96, + "step": 9997 + }, + { + "epoch": 0.7707369719395621, + "grad_norm": 3.6308090686798096, + "learning_rate": 1.3163557053121062e-06, + "loss": 0.9925, + "step": 9998 + }, + { + "epoch": 0.7708140610545791, + "grad_norm": 3.9852662086486816, + "learning_rate": 1.3155116347534746e-06, + "loss": 1.0156, + "step": 9999 + }, + { + "epoch": 0.7708911501695961, + "grad_norm": 3.719912052154541, + "learning_rate": 1.3146677939036118e-06, + "loss": 0.9678, + "step": 10000 + }, + { + "epoch": 0.770968239284613, + "grad_norm": 3.3894362449645996, + "learning_rate": 1.3138241828151238e-06, + "loss": 0.8484, + "step": 10001 + }, + { + "epoch": 0.77104532839963, + "grad_norm": 3.582899808883667, + "learning_rate": 1.3129808015406064e-06, + "loss": 0.7916, + "step": 10002 + }, + { + "epoch": 0.7711224175146469, + "grad_norm": 3.8180840015411377, + "learning_rate": 1.312137650132641e-06, + "loss": 0.9036, + "step": 10003 + }, + { + "epoch": 0.7711995066296639, + "grad_norm": 3.5141336917877197, + "learning_rate": 1.3112947286437927e-06, + "loss": 0.8419, + "step": 10004 + }, + { + "epoch": 0.7712765957446809, + "grad_norm": 3.5062694549560547, + "learning_rate": 1.3104520371266155e-06, + "loss": 0.8971, + "step": 10005 + }, + { + "epoch": 0.7713536848596978, + "grad_norm": 3.4877374172210693, + "learning_rate": 1.3096095756336442e-06, + "loss": 0.9091, + "step": 10006 + }, + { + "epoch": 0.7714307739747148, + "grad_norm": 4.043210029602051, + "learning_rate": 1.308767344217402e-06, + "loss": 1.0359, + "step": 10007 + }, + { + "epoch": 0.7715078630897317, + "grad_norm": 4.2239990234375, + "learning_rate": 1.307925342930399e-06, + "loss": 0.9755, + "step": 10008 + }, + { + "epoch": 0.7715849522047487, + "grad_norm": 3.9452037811279297, + "learning_rate": 1.3070835718251284e-06, + "loss": 0.963, + "step": 10009 + }, + { + "epoch": 0.7716620413197657, + "grad_norm": 3.688035249710083, + "learning_rate": 1.306242030954072e-06, + "loss": 0.9624, + "step": 10010 + }, + { + "epoch": 0.7717391304347826, + "grad_norm": 3.4075801372528076, + "learning_rate": 1.3054007203696955e-06, + "loss": 0.8649, + "step": 10011 + }, + { + "epoch": 0.7718162195497996, + "grad_norm": 3.897404909133911, + "learning_rate": 1.3045596401244477e-06, + "loss": 0.8981, + "step": 10012 + }, + { + "epoch": 0.7718933086648165, + "grad_norm": 3.891972303390503, + "learning_rate": 1.303718790270767e-06, + "loss": 0.9878, + "step": 10013 + }, + { + "epoch": 0.7719703977798335, + "grad_norm": 3.599642515182495, + "learning_rate": 1.3028781708610766e-06, + "loss": 0.8243, + "step": 10014 + }, + { + "epoch": 0.7720474868948505, + "grad_norm": 3.8693175315856934, + "learning_rate": 1.3020377819477843e-06, + "loss": 0.9088, + "step": 10015 + }, + { + "epoch": 0.7721245760098674, + "grad_norm": 3.6031887531280518, + "learning_rate": 1.3011976235832852e-06, + "loss": 0.8812, + "step": 10016 + }, + { + "epoch": 0.7722016651248844, + "grad_norm": 3.770428419113159, + "learning_rate": 1.3003576958199565e-06, + "loss": 1.026, + "step": 10017 + }, + { + "epoch": 0.7722787542399013, + "grad_norm": 3.2302563190460205, + "learning_rate": 1.2995179987101648e-06, + "loss": 0.7229, + "step": 10018 + }, + { + "epoch": 0.7723558433549182, + "grad_norm": 3.593198537826538, + "learning_rate": 1.29867853230626e-06, + "loss": 0.7879, + "step": 10019 + }, + { + "epoch": 0.7724329324699353, + "grad_norm": 4.028049945831299, + "learning_rate": 1.297839296660579e-06, + "loss": 0.8327, + "step": 10020 + }, + { + "epoch": 0.7725100215849522, + "grad_norm": 3.5654919147491455, + "learning_rate": 1.2970002918254443e-06, + "loss": 0.9508, + "step": 10021 + }, + { + "epoch": 0.7725871106999692, + "grad_norm": 3.5768983364105225, + "learning_rate": 1.2961615178531644e-06, + "loss": 0.8049, + "step": 10022 + }, + { + "epoch": 0.7726641998149861, + "grad_norm": 3.564749240875244, + "learning_rate": 1.29532297479603e-06, + "loss": 0.8298, + "step": 10023 + }, + { + "epoch": 0.772741288930003, + "grad_norm": 3.7964117527008057, + "learning_rate": 1.2944846627063208e-06, + "loss": 0.8518, + "step": 10024 + }, + { + "epoch": 0.7728183780450201, + "grad_norm": 3.8929343223571777, + "learning_rate": 1.2936465816363014e-06, + "loss": 0.8663, + "step": 10025 + }, + { + "epoch": 0.772895467160037, + "grad_norm": 4.378154277801514, + "learning_rate": 1.2928087316382225e-06, + "loss": 0.9825, + "step": 10026 + }, + { + "epoch": 0.772972556275054, + "grad_norm": 3.699422836303711, + "learning_rate": 1.2919711127643186e-06, + "loss": 0.939, + "step": 10027 + }, + { + "epoch": 0.7730496453900709, + "grad_norm": 3.876960277557373, + "learning_rate": 1.2911337250668115e-06, + "loss": 0.9534, + "step": 10028 + }, + { + "epoch": 0.7731267345050878, + "grad_norm": 4.108030319213867, + "learning_rate": 1.290296568597908e-06, + "loss": 0.9403, + "step": 10029 + }, + { + "epoch": 0.7732038236201049, + "grad_norm": 3.5450022220611572, + "learning_rate": 1.2894596434098006e-06, + "loss": 0.9481, + "step": 10030 + }, + { + "epoch": 0.7732809127351218, + "grad_norm": 3.4901676177978516, + "learning_rate": 1.2886229495546687e-06, + "loss": 0.8106, + "step": 10031 + }, + { + "epoch": 0.7733580018501388, + "grad_norm": 4.202146053314209, + "learning_rate": 1.2877864870846724e-06, + "loss": 0.8711, + "step": 10032 + }, + { + "epoch": 0.7734350909651557, + "grad_norm": 3.778503656387329, + "learning_rate": 1.2869502560519626e-06, + "loss": 1.053, + "step": 10033 + }, + { + "epoch": 0.7735121800801726, + "grad_norm": 3.5146751403808594, + "learning_rate": 1.2861142565086737e-06, + "loss": 0.7681, + "step": 10034 + }, + { + "epoch": 0.7735892691951897, + "grad_norm": 3.5477395057678223, + "learning_rate": 1.2852784885069265e-06, + "loss": 0.9164, + "step": 10035 + }, + { + "epoch": 0.7736663583102066, + "grad_norm": 3.6575543880462646, + "learning_rate": 1.284442952098826e-06, + "loss": 0.8941, + "step": 10036 + }, + { + "epoch": 0.7737434474252236, + "grad_norm": 3.6682982444763184, + "learning_rate": 1.2836076473364662e-06, + "loss": 0.824, + "step": 10037 + }, + { + "epoch": 0.7738205365402405, + "grad_norm": 3.7592432498931885, + "learning_rate": 1.2827725742719205e-06, + "loss": 0.9658, + "step": 10038 + }, + { + "epoch": 0.7738976256552574, + "grad_norm": 3.7127466201782227, + "learning_rate": 1.2819377329572525e-06, + "loss": 0.8969, + "step": 10039 + }, + { + "epoch": 0.7739747147702745, + "grad_norm": 3.9338138103485107, + "learning_rate": 1.2811031234445103e-06, + "loss": 1.0264, + "step": 10040 + }, + { + "epoch": 0.7740518038852914, + "grad_norm": 3.961344003677368, + "learning_rate": 1.2802687457857277e-06, + "loss": 0.9453, + "step": 10041 + }, + { + "epoch": 0.7741288930003084, + "grad_norm": 3.9601597785949707, + "learning_rate": 1.2794346000329256e-06, + "loss": 0.9532, + "step": 10042 + }, + { + "epoch": 0.7742059821153253, + "grad_norm": 3.8072171211242676, + "learning_rate": 1.278600686238105e-06, + "loss": 1.0072, + "step": 10043 + }, + { + "epoch": 0.7742830712303422, + "grad_norm": 3.795827865600586, + "learning_rate": 1.2777670044532586e-06, + "loss": 0.875, + "step": 10044 + }, + { + "epoch": 0.7743601603453593, + "grad_norm": 3.6865553855895996, + "learning_rate": 1.2769335547303613e-06, + "loss": 0.9439, + "step": 10045 + }, + { + "epoch": 0.7744372494603762, + "grad_norm": 3.5264663696289062, + "learning_rate": 1.2761003371213743e-06, + "loss": 0.942, + "step": 10046 + }, + { + "epoch": 0.7745143385753932, + "grad_norm": 3.729469060897827, + "learning_rate": 1.2752673516782448e-06, + "loss": 0.9807, + "step": 10047 + }, + { + "epoch": 0.7745914276904101, + "grad_norm": 3.5642600059509277, + "learning_rate": 1.2744345984529066e-06, + "loss": 0.9168, + "step": 10048 + }, + { + "epoch": 0.774668516805427, + "grad_norm": 3.684962749481201, + "learning_rate": 1.2736020774972746e-06, + "loss": 0.9065, + "step": 10049 + }, + { + "epoch": 0.7747456059204441, + "grad_norm": 3.697866916656494, + "learning_rate": 1.2727697888632534e-06, + "loss": 0.8338, + "step": 10050 + }, + { + "epoch": 0.774822695035461, + "grad_norm": 4.074203968048096, + "learning_rate": 1.271937732602732e-06, + "loss": 0.868, + "step": 10051 + }, + { + "epoch": 0.774899784150478, + "grad_norm": 3.6111466884613037, + "learning_rate": 1.2711059087675853e-06, + "loss": 0.9527, + "step": 10052 + }, + { + "epoch": 0.7749768732654949, + "grad_norm": 3.5815255641937256, + "learning_rate": 1.2702743174096737e-06, + "loss": 0.9145, + "step": 10053 + }, + { + "epoch": 0.7750539623805118, + "grad_norm": 3.476964235305786, + "learning_rate": 1.2694429585808404e-06, + "loss": 0.8759, + "step": 10054 + }, + { + "epoch": 0.7751310514955289, + "grad_norm": 3.8654966354370117, + "learning_rate": 1.2686118323329178e-06, + "loss": 0.9899, + "step": 10055 + }, + { + "epoch": 0.7752081406105458, + "grad_norm": 3.7267918586730957, + "learning_rate": 1.267780938717722e-06, + "loss": 0.9701, + "step": 10056 + }, + { + "epoch": 0.7752852297255628, + "grad_norm": 4.117705821990967, + "learning_rate": 1.2669502777870546e-06, + "loss": 0.8462, + "step": 10057 + }, + { + "epoch": 0.7753623188405797, + "grad_norm": 3.576138496398926, + "learning_rate": 1.266119849592704e-06, + "loss": 0.9018, + "step": 10058 + }, + { + "epoch": 0.7754394079555966, + "grad_norm": 5.330030918121338, + "learning_rate": 1.2652896541864435e-06, + "loss": 0.8455, + "step": 10059 + }, + { + "epoch": 0.7755164970706137, + "grad_norm": 3.8612890243530273, + "learning_rate": 1.264459691620029e-06, + "loss": 0.9188, + "step": 10060 + }, + { + "epoch": 0.7755935861856306, + "grad_norm": 4.6686787605285645, + "learning_rate": 1.2636299619452064e-06, + "loss": 0.9712, + "step": 10061 + }, + { + "epoch": 0.7756706753006476, + "grad_norm": 4.158293724060059, + "learning_rate": 1.2628004652137044e-06, + "loss": 0.8994, + "step": 10062 + }, + { + "epoch": 0.7757477644156645, + "grad_norm": 3.727849245071411, + "learning_rate": 1.2619712014772378e-06, + "loss": 0.8811, + "step": 10063 + }, + { + "epoch": 0.7758248535306814, + "grad_norm": 3.6993374824523926, + "learning_rate": 1.2611421707875083e-06, + "loss": 0.848, + "step": 10064 + }, + { + "epoch": 0.7759019426456985, + "grad_norm": 3.298645496368408, + "learning_rate": 1.2603133731961992e-06, + "loss": 0.8555, + "step": 10065 + }, + { + "epoch": 0.7759790317607154, + "grad_norm": 3.6150310039520264, + "learning_rate": 1.2594848087549826e-06, + "loss": 0.9416, + "step": 10066 + }, + { + "epoch": 0.7760561208757324, + "grad_norm": 3.694746732711792, + "learning_rate": 1.2586564775155158e-06, + "loss": 0.8568, + "step": 10067 + }, + { + "epoch": 0.7761332099907493, + "grad_norm": 4.369163990020752, + "learning_rate": 1.257828379529441e-06, + "loss": 1.0183, + "step": 10068 + }, + { + "epoch": 0.7762102991057662, + "grad_norm": 3.4145572185516357, + "learning_rate": 1.257000514848385e-06, + "loss": 0.874, + "step": 10069 + }, + { + "epoch": 0.7762873882207832, + "grad_norm": 4.185209274291992, + "learning_rate": 1.2561728835239633e-06, + "loss": 0.8839, + "step": 10070 + }, + { + "epoch": 0.7763644773358002, + "grad_norm": 3.814661979675293, + "learning_rate": 1.255345485607769e-06, + "loss": 0.9982, + "step": 10071 + }, + { + "epoch": 0.7764415664508172, + "grad_norm": 3.9218826293945312, + "learning_rate": 1.2545183211513918e-06, + "loss": 0.933, + "step": 10072 + }, + { + "epoch": 0.7765186555658341, + "grad_norm": 4.283771514892578, + "learning_rate": 1.2536913902063985e-06, + "loss": 0.9881, + "step": 10073 + }, + { + "epoch": 0.776595744680851, + "grad_norm": 3.5074286460876465, + "learning_rate": 1.252864692824346e-06, + "loss": 0.9492, + "step": 10074 + }, + { + "epoch": 0.776672833795868, + "grad_norm": 3.372987747192383, + "learning_rate": 1.2520382290567717e-06, + "loss": 0.8843, + "step": 10075 + }, + { + "epoch": 0.776749922910885, + "grad_norm": 3.437117099761963, + "learning_rate": 1.2512119989552023e-06, + "loss": 0.7762, + "step": 10076 + }, + { + "epoch": 0.776827012025902, + "grad_norm": 3.731052875518799, + "learning_rate": 1.2503860025711494e-06, + "loss": 1.0258, + "step": 10077 + }, + { + "epoch": 0.7769041011409189, + "grad_norm": 3.422943592071533, + "learning_rate": 1.2495602399561096e-06, + "loss": 0.7939, + "step": 10078 + }, + { + "epoch": 0.7769811902559358, + "grad_norm": 3.4586517810821533, + "learning_rate": 1.248734711161566e-06, + "loss": 0.7924, + "step": 10079 + }, + { + "epoch": 0.7770582793709528, + "grad_norm": 3.3821561336517334, + "learning_rate": 1.2479094162389838e-06, + "loss": 0.8364, + "step": 10080 + }, + { + "epoch": 0.7771353684859698, + "grad_norm": 4.26823091506958, + "learning_rate": 1.2470843552398166e-06, + "loss": 0.9628, + "step": 10081 + }, + { + "epoch": 0.7772124576009868, + "grad_norm": 3.7963852882385254, + "learning_rate": 1.2462595282155032e-06, + "loss": 0.9598, + "step": 10082 + }, + { + "epoch": 0.7772895467160037, + "grad_norm": 3.440056800842285, + "learning_rate": 1.2454349352174666e-06, + "loss": 0.887, + "step": 10083 + }, + { + "epoch": 0.7773666358310206, + "grad_norm": 3.769310235977173, + "learning_rate": 1.2446105762971167e-06, + "loss": 0.9397, + "step": 10084 + }, + { + "epoch": 0.7774437249460376, + "grad_norm": 3.7799243927001953, + "learning_rate": 1.2437864515058495e-06, + "loss": 0.9923, + "step": 10085 + }, + { + "epoch": 0.7775208140610546, + "grad_norm": 3.681706428527832, + "learning_rate": 1.2429625608950412e-06, + "loss": 0.9673, + "step": 10086 + }, + { + "epoch": 0.7775979031760716, + "grad_norm": 3.8595070838928223, + "learning_rate": 1.2421389045160593e-06, + "loss": 0.8813, + "step": 10087 + }, + { + "epoch": 0.7776749922910885, + "grad_norm": 4.8416361808776855, + "learning_rate": 1.2413154824202545e-06, + "loss": 0.9058, + "step": 10088 + }, + { + "epoch": 0.7777520814061054, + "grad_norm": 3.877955198287964, + "learning_rate": 1.240492294658962e-06, + "loss": 0.9176, + "step": 10089 + }, + { + "epoch": 0.7778291705211224, + "grad_norm": 3.9161739349365234, + "learning_rate": 1.239669341283506e-06, + "loss": 0.9904, + "step": 10090 + }, + { + "epoch": 0.7779062596361394, + "grad_norm": 3.6513936519622803, + "learning_rate": 1.2388466223451895e-06, + "loss": 0.8185, + "step": 10091 + }, + { + "epoch": 0.7779833487511564, + "grad_norm": 3.919912099838257, + "learning_rate": 1.2380241378953067e-06, + "loss": 0.8561, + "step": 10092 + }, + { + "epoch": 0.7780604378661733, + "grad_norm": 3.2892303466796875, + "learning_rate": 1.237201887985135e-06, + "loss": 0.902, + "step": 10093 + }, + { + "epoch": 0.7781375269811902, + "grad_norm": 3.583991527557373, + "learning_rate": 1.2363798726659377e-06, + "loss": 0.8996, + "step": 10094 + }, + { + "epoch": 0.7782146160962072, + "grad_norm": 3.9452333450317383, + "learning_rate": 1.235558091988963e-06, + "loss": 1.0222, + "step": 10095 + }, + { + "epoch": 0.7782917052112241, + "grad_norm": 3.6943013668060303, + "learning_rate": 1.234736546005446e-06, + "loss": 0.9208, + "step": 10096 + }, + { + "epoch": 0.7783687943262412, + "grad_norm": 3.9956469535827637, + "learning_rate": 1.2339152347666033e-06, + "loss": 1.0566, + "step": 10097 + }, + { + "epoch": 0.7784458834412581, + "grad_norm": 3.802483558654785, + "learning_rate": 1.2330941583236406e-06, + "loss": 0.9223, + "step": 10098 + }, + { + "epoch": 0.778522972556275, + "grad_norm": 3.4962236881256104, + "learning_rate": 1.2322733167277479e-06, + "loss": 0.8389, + "step": 10099 + }, + { + "epoch": 0.778600061671292, + "grad_norm": 3.899533271789551, + "learning_rate": 1.2314527100301005e-06, + "loss": 1.0017, + "step": 10100 + }, + { + "epoch": 0.778677150786309, + "grad_norm": 3.7576115131378174, + "learning_rate": 1.2306323382818596e-06, + "loss": 0.9071, + "step": 10101 + }, + { + "epoch": 0.778754239901326, + "grad_norm": 3.894066333770752, + "learning_rate": 1.2298122015341696e-06, + "loss": 1.0241, + "step": 10102 + }, + { + "epoch": 0.7788313290163429, + "grad_norm": 3.69673490524292, + "learning_rate": 1.2289922998381625e-06, + "loss": 0.9332, + "step": 10103 + }, + { + "epoch": 0.7789084181313598, + "grad_norm": 4.03447151184082, + "learning_rate": 1.2281726332449544e-06, + "loss": 1.017, + "step": 10104 + }, + { + "epoch": 0.7789855072463768, + "grad_norm": 3.7014174461364746, + "learning_rate": 1.2273532018056482e-06, + "loss": 0.9177, + "step": 10105 + }, + { + "epoch": 0.7790625963613937, + "grad_norm": 4.122037410736084, + "learning_rate": 1.226534005571331e-06, + "loss": 0.918, + "step": 10106 + }, + { + "epoch": 0.7791396854764108, + "grad_norm": 3.817849636077881, + "learning_rate": 1.2257150445930765e-06, + "loss": 0.8933, + "step": 10107 + }, + { + "epoch": 0.7792167745914277, + "grad_norm": 3.191657304763794, + "learning_rate": 1.2248963189219398e-06, + "loss": 0.7782, + "step": 10108 + }, + { + "epoch": 0.7792938637064446, + "grad_norm": 3.7507643699645996, + "learning_rate": 1.224077828608966e-06, + "loss": 0.9169, + "step": 10109 + }, + { + "epoch": 0.7793709528214616, + "grad_norm": 3.5642547607421875, + "learning_rate": 1.223259573705184e-06, + "loss": 0.9119, + "step": 10110 + }, + { + "epoch": 0.7794480419364785, + "grad_norm": 3.764009952545166, + "learning_rate": 1.2224415542616069e-06, + "loss": 0.8835, + "step": 10111 + }, + { + "epoch": 0.7795251310514956, + "grad_norm": 3.5799670219421387, + "learning_rate": 1.2216237703292361e-06, + "loss": 0.9183, + "step": 10112 + }, + { + "epoch": 0.7796022201665125, + "grad_norm": 3.8919787406921387, + "learning_rate": 1.2208062219590527e-06, + "loss": 0.9795, + "step": 10113 + }, + { + "epoch": 0.7796793092815294, + "grad_norm": 3.9707539081573486, + "learning_rate": 1.2199889092020289e-06, + "loss": 1.004, + "step": 10114 + }, + { + "epoch": 0.7797563983965464, + "grad_norm": 3.72212815284729, + "learning_rate": 1.2191718321091178e-06, + "loss": 0.9145, + "step": 10115 + }, + { + "epoch": 0.7798334875115633, + "grad_norm": 3.555389642715454, + "learning_rate": 1.2183549907312624e-06, + "loss": 0.8632, + "step": 10116 + }, + { + "epoch": 0.7799105766265804, + "grad_norm": 3.579590320587158, + "learning_rate": 1.2175383851193901e-06, + "loss": 0.8866, + "step": 10117 + }, + { + "epoch": 0.7799876657415973, + "grad_norm": 4.378693103790283, + "learning_rate": 1.2167220153244075e-06, + "loss": 0.8158, + "step": 10118 + }, + { + "epoch": 0.7800647548566142, + "grad_norm": 3.398848056793213, + "learning_rate": 1.2159058813972135e-06, + "loss": 0.9214, + "step": 10119 + }, + { + "epoch": 0.7801418439716312, + "grad_norm": 3.9123952388763428, + "learning_rate": 1.2150899833886892e-06, + "loss": 0.8256, + "step": 10120 + }, + { + "epoch": 0.7802189330866481, + "grad_norm": 4.100687026977539, + "learning_rate": 1.214274321349702e-06, + "loss": 0.9506, + "step": 10121 + }, + { + "epoch": 0.7802960222016652, + "grad_norm": 3.491703748703003, + "learning_rate": 1.2134588953311056e-06, + "loss": 0.8823, + "step": 10122 + }, + { + "epoch": 0.7803731113166821, + "grad_norm": 3.760547637939453, + "learning_rate": 1.2126437053837348e-06, + "loss": 0.9935, + "step": 10123 + }, + { + "epoch": 0.780450200431699, + "grad_norm": 3.883286237716675, + "learning_rate": 1.2118287515584132e-06, + "loss": 0.8716, + "step": 10124 + }, + { + "epoch": 0.780527289546716, + "grad_norm": 3.697366714477539, + "learning_rate": 1.2110140339059501e-06, + "loss": 0.8956, + "step": 10125 + }, + { + "epoch": 0.7806043786617329, + "grad_norm": 3.8138182163238525, + "learning_rate": 1.2101995524771376e-06, + "loss": 0.8893, + "step": 10126 + }, + { + "epoch": 0.78068146777675, + "grad_norm": 3.8224334716796875, + "learning_rate": 1.2093853073227574e-06, + "loss": 0.9656, + "step": 10127 + }, + { + "epoch": 0.7807585568917669, + "grad_norm": 3.8924286365509033, + "learning_rate": 1.2085712984935693e-06, + "loss": 0.8809, + "step": 10128 + }, + { + "epoch": 0.7808356460067838, + "grad_norm": 3.684929847717285, + "learning_rate": 1.2077575260403247e-06, + "loss": 1.0676, + "step": 10129 + }, + { + "epoch": 0.7809127351218008, + "grad_norm": 3.5521864891052246, + "learning_rate": 1.2069439900137575e-06, + "loss": 0.839, + "step": 10130 + }, + { + "epoch": 0.7809898242368177, + "grad_norm": 3.7578213214874268, + "learning_rate": 1.2061306904645875e-06, + "loss": 0.9186, + "step": 10131 + }, + { + "epoch": 0.7810669133518348, + "grad_norm": 3.4750614166259766, + "learning_rate": 1.2053176274435209e-06, + "loss": 0.86, + "step": 10132 + }, + { + "epoch": 0.7811440024668517, + "grad_norm": 3.816967487335205, + "learning_rate": 1.2045048010012477e-06, + "loss": 0.9405, + "step": 10133 + }, + { + "epoch": 0.7812210915818686, + "grad_norm": 4.11097526550293, + "learning_rate": 1.2036922111884414e-06, + "loss": 1.0416, + "step": 10134 + }, + { + "epoch": 0.7812981806968856, + "grad_norm": 3.8455522060394287, + "learning_rate": 1.2028798580557644e-06, + "loss": 0.9472, + "step": 10135 + }, + { + "epoch": 0.7813752698119025, + "grad_norm": 3.7609121799468994, + "learning_rate": 1.2020677416538623e-06, + "loss": 0.9292, + "step": 10136 + }, + { + "epoch": 0.7814523589269196, + "grad_norm": 3.555689573287964, + "learning_rate": 1.201255862033367e-06, + "loss": 0.9046, + "step": 10137 + }, + { + "epoch": 0.7815294480419365, + "grad_norm": 3.5667762756347656, + "learning_rate": 1.2004442192448956e-06, + "loss": 0.8375, + "step": 10138 + }, + { + "epoch": 0.7816065371569534, + "grad_norm": 3.609121322631836, + "learning_rate": 1.1996328133390472e-06, + "loss": 0.9084, + "step": 10139 + }, + { + "epoch": 0.7816836262719704, + "grad_norm": 3.747819423675537, + "learning_rate": 1.1988216443664102e-06, + "loss": 0.8834, + "step": 10140 + }, + { + "epoch": 0.7817607153869873, + "grad_norm": 3.649038314819336, + "learning_rate": 1.198010712377557e-06, + "loss": 0.9357, + "step": 10141 + }, + { + "epoch": 0.7818378045020044, + "grad_norm": 3.519754409790039, + "learning_rate": 1.1972000174230452e-06, + "loss": 0.8994, + "step": 10142 + }, + { + "epoch": 0.7819148936170213, + "grad_norm": 3.7660558223724365, + "learning_rate": 1.1963895595534164e-06, + "loss": 0.8964, + "step": 10143 + }, + { + "epoch": 0.7819919827320382, + "grad_norm": 3.712606430053711, + "learning_rate": 1.195579338819201e-06, + "loss": 0.8096, + "step": 10144 + }, + { + "epoch": 0.7820690718470552, + "grad_norm": 3.9107108116149902, + "learning_rate": 1.194769355270909e-06, + "loss": 0.998, + "step": 10145 + }, + { + "epoch": 0.7821461609620721, + "grad_norm": 3.800839424133301, + "learning_rate": 1.1939596089590394e-06, + "loss": 0.8565, + "step": 10146 + }, + { + "epoch": 0.7822232500770891, + "grad_norm": 3.84592342376709, + "learning_rate": 1.1931500999340768e-06, + "loss": 0.9408, + "step": 10147 + }, + { + "epoch": 0.7823003391921061, + "grad_norm": 3.9664783477783203, + "learning_rate": 1.1923408282464888e-06, + "loss": 1.0395, + "step": 10148 + }, + { + "epoch": 0.782377428307123, + "grad_norm": 3.7878577709198, + "learning_rate": 1.1915317939467314e-06, + "loss": 0.964, + "step": 10149 + }, + { + "epoch": 0.78245451742214, + "grad_norm": 3.499857187271118, + "learning_rate": 1.1907229970852407e-06, + "loss": 0.8254, + "step": 10150 + }, + { + "epoch": 0.7825316065371569, + "grad_norm": 3.702730894088745, + "learning_rate": 1.1899144377124422e-06, + "loss": 0.9191, + "step": 10151 + }, + { + "epoch": 0.782608695652174, + "grad_norm": 3.7440195083618164, + "learning_rate": 1.1891061158787459e-06, + "loss": 0.8737, + "step": 10152 + }, + { + "epoch": 0.7826857847671909, + "grad_norm": 3.973545551300049, + "learning_rate": 1.1882980316345461e-06, + "loss": 1.0105, + "step": 10153 + }, + { + "epoch": 0.7827628738822078, + "grad_norm": 3.627835273742676, + "learning_rate": 1.1874901850302223e-06, + "loss": 0.8726, + "step": 10154 + }, + { + "epoch": 0.7828399629972248, + "grad_norm": 4.042776107788086, + "learning_rate": 1.1866825761161417e-06, + "loss": 0.9184, + "step": 10155 + }, + { + "epoch": 0.7829170521122417, + "grad_norm": 3.9499711990356445, + "learning_rate": 1.1858752049426513e-06, + "loss": 1.001, + "step": 10156 + }, + { + "epoch": 0.7829941412272587, + "grad_norm": 3.8737709522247314, + "learning_rate": 1.1850680715600883e-06, + "loss": 0.9999, + "step": 10157 + }, + { + "epoch": 0.7830712303422757, + "grad_norm": 3.580709934234619, + "learning_rate": 1.1842611760187722e-06, + "loss": 0.8882, + "step": 10158 + }, + { + "epoch": 0.7831483194572927, + "grad_norm": 3.639634847640991, + "learning_rate": 1.1834545183690105e-06, + "loss": 0.8136, + "step": 10159 + }, + { + "epoch": 0.7832254085723096, + "grad_norm": 3.59987735748291, + "learning_rate": 1.1826480986610928e-06, + "loss": 0.8607, + "step": 10160 + }, + { + "epoch": 0.7833024976873265, + "grad_norm": 3.7638700008392334, + "learning_rate": 1.1818419169452954e-06, + "loss": 0.836, + "step": 10161 + }, + { + "epoch": 0.7833795868023435, + "grad_norm": 3.5380563735961914, + "learning_rate": 1.1810359732718795e-06, + "loss": 0.847, + "step": 10162 + }, + { + "epoch": 0.7834566759173605, + "grad_norm": 4.049480438232422, + "learning_rate": 1.1802302676910926e-06, + "loss": 1.0036, + "step": 10163 + }, + { + "epoch": 0.7835337650323775, + "grad_norm": 3.6710615158081055, + "learning_rate": 1.1794248002531644e-06, + "loss": 0.8508, + "step": 10164 + }, + { + "epoch": 0.7836108541473944, + "grad_norm": 3.6866486072540283, + "learning_rate": 1.1786195710083149e-06, + "loss": 0.8394, + "step": 10165 + }, + { + "epoch": 0.7836879432624113, + "grad_norm": 4.017769813537598, + "learning_rate": 1.1778145800067419e-06, + "loss": 0.9146, + "step": 10166 + }, + { + "epoch": 0.7837650323774283, + "grad_norm": 3.859618902206421, + "learning_rate": 1.177009827298634e-06, + "loss": 0.9376, + "step": 10167 + }, + { + "epoch": 0.7838421214924453, + "grad_norm": 3.7508604526519775, + "learning_rate": 1.1762053129341643e-06, + "loss": 0.8773, + "step": 10168 + }, + { + "epoch": 0.7839192106074623, + "grad_norm": 4.094524383544922, + "learning_rate": 1.175401036963489e-06, + "loss": 0.9331, + "step": 10169 + }, + { + "epoch": 0.7839962997224792, + "grad_norm": 3.7034811973571777, + "learning_rate": 1.1745969994367524e-06, + "loss": 0.7125, + "step": 10170 + }, + { + "epoch": 0.7840733888374961, + "grad_norm": 3.425140857696533, + "learning_rate": 1.1737932004040792e-06, + "loss": 0.8507, + "step": 10171 + }, + { + "epoch": 0.7841504779525131, + "grad_norm": 3.6695098876953125, + "learning_rate": 1.1729896399155831e-06, + "loss": 0.96, + "step": 10172 + }, + { + "epoch": 0.78422756706753, + "grad_norm": 3.393888235092163, + "learning_rate": 1.1721863180213627e-06, + "loss": 0.7526, + "step": 10173 + }, + { + "epoch": 0.7843046561825471, + "grad_norm": 3.838538408279419, + "learning_rate": 1.171383234771501e-06, + "loss": 0.9423, + "step": 10174 + }, + { + "epoch": 0.784381745297564, + "grad_norm": 3.5390851497650146, + "learning_rate": 1.1705803902160668e-06, + "loss": 0.9341, + "step": 10175 + }, + { + "epoch": 0.7844588344125809, + "grad_norm": 3.473173141479492, + "learning_rate": 1.1697777844051105e-06, + "loss": 0.903, + "step": 10176 + }, + { + "epoch": 0.7845359235275979, + "grad_norm": 4.052275657653809, + "learning_rate": 1.1689754173886725e-06, + "loss": 0.9877, + "step": 10177 + }, + { + "epoch": 0.7846130126426148, + "grad_norm": 3.684760808944702, + "learning_rate": 1.1681732892167757e-06, + "loss": 0.9197, + "step": 10178 + }, + { + "epoch": 0.7846901017576319, + "grad_norm": 3.7572269439697266, + "learning_rate": 1.1673713999394287e-06, + "loss": 0.9295, + "step": 10179 + }, + { + "epoch": 0.7847671908726488, + "grad_norm": 3.714064359664917, + "learning_rate": 1.1665697496066253e-06, + "loss": 0.8446, + "step": 10180 + }, + { + "epoch": 0.7848442799876657, + "grad_norm": 3.450218439102173, + "learning_rate": 1.1657683382683454e-06, + "loss": 0.9163, + "step": 10181 + }, + { + "epoch": 0.7849213691026827, + "grad_norm": 3.6925554275512695, + "learning_rate": 1.1649671659745504e-06, + "loss": 0.9408, + "step": 10182 + }, + { + "epoch": 0.7849984582176996, + "grad_norm": 4.052911758422852, + "learning_rate": 1.1641662327751907e-06, + "loss": 0.9494, + "step": 10183 + }, + { + "epoch": 0.7850755473327167, + "grad_norm": 4.215100288391113, + "learning_rate": 1.1633655387201998e-06, + "loss": 0.8627, + "step": 10184 + }, + { + "epoch": 0.7851526364477336, + "grad_norm": 3.8449103832244873, + "learning_rate": 1.162565083859497e-06, + "loss": 0.9384, + "step": 10185 + }, + { + "epoch": 0.7852297255627505, + "grad_norm": 3.831585645675659, + "learning_rate": 1.1617648682429882e-06, + "loss": 0.8777, + "step": 10186 + }, + { + "epoch": 0.7853068146777675, + "grad_norm": 3.922975540161133, + "learning_rate": 1.1609648919205596e-06, + "loss": 0.9134, + "step": 10187 + }, + { + "epoch": 0.7853839037927844, + "grad_norm": 3.6187238693237305, + "learning_rate": 1.1601651549420873e-06, + "loss": 0.9355, + "step": 10188 + }, + { + "epoch": 0.7854609929078015, + "grad_norm": 3.704779624938965, + "learning_rate": 1.1593656573574302e-06, + "loss": 0.9219, + "step": 10189 + }, + { + "epoch": 0.7855380820228184, + "grad_norm": 4.0944719314575195, + "learning_rate": 1.1585663992164336e-06, + "loss": 0.9159, + "step": 10190 + }, + { + "epoch": 0.7856151711378353, + "grad_norm": 3.6618051528930664, + "learning_rate": 1.1577673805689266e-06, + "loss": 0.8657, + "step": 10191 + }, + { + "epoch": 0.7856922602528523, + "grad_norm": 3.9331610202789307, + "learning_rate": 1.1569686014647253e-06, + "loss": 0.9718, + "step": 10192 + }, + { + "epoch": 0.7857693493678692, + "grad_norm": 3.8769805431365967, + "learning_rate": 1.156170061953627e-06, + "loss": 0.846, + "step": 10193 + }, + { + "epoch": 0.7858464384828863, + "grad_norm": 3.4525961875915527, + "learning_rate": 1.1553717620854176e-06, + "loss": 1.0036, + "step": 10194 + }, + { + "epoch": 0.7859235275979032, + "grad_norm": 3.376194715499878, + "learning_rate": 1.1545737019098668e-06, + "loss": 0.8905, + "step": 10195 + }, + { + "epoch": 0.7860006167129201, + "grad_norm": 3.572995185852051, + "learning_rate": 1.1537758814767298e-06, + "loss": 0.8374, + "step": 10196 + }, + { + "epoch": 0.7860777058279371, + "grad_norm": 3.6022350788116455, + "learning_rate": 1.1529783008357476e-06, + "loss": 0.9317, + "step": 10197 + }, + { + "epoch": 0.786154794942954, + "grad_norm": 3.5814099311828613, + "learning_rate": 1.152180960036643e-06, + "loss": 0.8554, + "step": 10198 + }, + { + "epoch": 0.7862318840579711, + "grad_norm": 3.6974618434906006, + "learning_rate": 1.151383859129127e-06, + "loss": 0.7929, + "step": 10199 + }, + { + "epoch": 0.786308973172988, + "grad_norm": 3.654067039489746, + "learning_rate": 1.1505869981628953e-06, + "loss": 0.8319, + "step": 10200 + }, + { + "epoch": 0.7863860622880049, + "grad_norm": 3.9007177352905273, + "learning_rate": 1.1497903771876272e-06, + "loss": 1.0742, + "step": 10201 + }, + { + "epoch": 0.7864631514030219, + "grad_norm": 4.104714870452881, + "learning_rate": 1.1489939962529884e-06, + "loss": 0.9199, + "step": 10202 + }, + { + "epoch": 0.7865402405180388, + "grad_norm": 3.8968212604522705, + "learning_rate": 1.148197855408631e-06, + "loss": 0.9078, + "step": 10203 + }, + { + "epoch": 0.7866173296330559, + "grad_norm": 4.120811939239502, + "learning_rate": 1.1474019547041848e-06, + "loss": 0.9516, + "step": 10204 + }, + { + "epoch": 0.7866944187480728, + "grad_norm": 3.669375419616699, + "learning_rate": 1.1466062941892754e-06, + "loss": 0.9964, + "step": 10205 + }, + { + "epoch": 0.7867715078630897, + "grad_norm": 3.683593988418579, + "learning_rate": 1.145810873913506e-06, + "loss": 0.8877, + "step": 10206 + }, + { + "epoch": 0.7868485969781067, + "grad_norm": 3.3355050086975098, + "learning_rate": 1.1450156939264689e-06, + "loss": 0.8196, + "step": 10207 + }, + { + "epoch": 0.7869256860931236, + "grad_norm": 3.5134360790252686, + "learning_rate": 1.144220754277736e-06, + "loss": 0.9914, + "step": 10208 + }, + { + "epoch": 0.7870027752081407, + "grad_norm": 4.333958625793457, + "learning_rate": 1.1434260550168702e-06, + "loss": 1.0483, + "step": 10209 + }, + { + "epoch": 0.7870798643231576, + "grad_norm": 3.63576340675354, + "learning_rate": 1.1426315961934159e-06, + "loss": 0.848, + "step": 10210 + }, + { + "epoch": 0.7871569534381745, + "grad_norm": 3.60412859916687, + "learning_rate": 1.1418373778569036e-06, + "loss": 0.8695, + "step": 10211 + }, + { + "epoch": 0.7872340425531915, + "grad_norm": 3.5103275775909424, + "learning_rate": 1.1410434000568487e-06, + "loss": 0.932, + "step": 10212 + }, + { + "epoch": 0.7873111316682084, + "grad_norm": 4.256019115447998, + "learning_rate": 1.1402496628427534e-06, + "loss": 0.9959, + "step": 10213 + }, + { + "epoch": 0.7873882207832255, + "grad_norm": 3.469325304031372, + "learning_rate": 1.1394561662641e-06, + "loss": 0.8449, + "step": 10214 + }, + { + "epoch": 0.7874653098982424, + "grad_norm": 3.975343942642212, + "learning_rate": 1.1386629103703606e-06, + "loss": 0.9372, + "step": 10215 + }, + { + "epoch": 0.7875423990132593, + "grad_norm": 3.5165517330169678, + "learning_rate": 1.13786989521099e-06, + "loss": 0.9793, + "step": 10216 + }, + { + "epoch": 0.7876194881282763, + "grad_norm": 3.727750062942505, + "learning_rate": 1.1370771208354291e-06, + "loss": 0.9962, + "step": 10217 + }, + { + "epoch": 0.7876965772432932, + "grad_norm": 3.6379904747009277, + "learning_rate": 1.1362845872931044e-06, + "loss": 0.8594, + "step": 10218 + }, + { + "epoch": 0.7877736663583103, + "grad_norm": 3.5502896308898926, + "learning_rate": 1.1354922946334241e-06, + "loss": 0.8649, + "step": 10219 + }, + { + "epoch": 0.7878507554733272, + "grad_norm": 3.6985366344451904, + "learning_rate": 1.1347002429057835e-06, + "loss": 0.9357, + "step": 10220 + }, + { + "epoch": 0.7879278445883441, + "grad_norm": 3.5729541778564453, + "learning_rate": 1.1339084321595644e-06, + "loss": 0.892, + "step": 10221 + }, + { + "epoch": 0.7880049337033611, + "grad_norm": 3.6966254711151123, + "learning_rate": 1.133116862444132e-06, + "loss": 0.9705, + "step": 10222 + }, + { + "epoch": 0.788082022818378, + "grad_norm": 3.8931562900543213, + "learning_rate": 1.1323255338088368e-06, + "loss": 0.8877, + "step": 10223 + }, + { + "epoch": 0.788159111933395, + "grad_norm": 3.7230522632598877, + "learning_rate": 1.131534446303012e-06, + "loss": 0.9154, + "step": 10224 + }, + { + "epoch": 0.788236201048412, + "grad_norm": 3.777541399002075, + "learning_rate": 1.1307435999759796e-06, + "loss": 0.8524, + "step": 10225 + }, + { + "epoch": 0.7883132901634289, + "grad_norm": 3.533332586288452, + "learning_rate": 1.1299529948770443e-06, + "loss": 0.9151, + "step": 10226 + }, + { + "epoch": 0.7883903792784459, + "grad_norm": 3.7804453372955322, + "learning_rate": 1.129162631055496e-06, + "loss": 0.8325, + "step": 10227 + }, + { + "epoch": 0.7884674683934628, + "grad_norm": 3.4261248111724854, + "learning_rate": 1.1283725085606101e-06, + "loss": 0.8707, + "step": 10228 + }, + { + "epoch": 0.7885445575084798, + "grad_norm": 3.9265434741973877, + "learning_rate": 1.1275826274416485e-06, + "loss": 0.9651, + "step": 10229 + }, + { + "epoch": 0.7886216466234968, + "grad_norm": 3.6662166118621826, + "learning_rate": 1.1267929877478522e-06, + "loss": 0.9031, + "step": 10230 + }, + { + "epoch": 0.7886987357385137, + "grad_norm": 3.6205952167510986, + "learning_rate": 1.1260035895284538e-06, + "loss": 0.935, + "step": 10231 + }, + { + "epoch": 0.7887758248535307, + "grad_norm": 3.919410467147827, + "learning_rate": 1.1252144328326676e-06, + "loss": 0.8459, + "step": 10232 + }, + { + "epoch": 0.7888529139685476, + "grad_norm": 3.570185422897339, + "learning_rate": 1.124425517709693e-06, + "loss": 0.846, + "step": 10233 + }, + { + "epoch": 0.7889300030835646, + "grad_norm": 3.781740427017212, + "learning_rate": 1.123636844208717e-06, + "loss": 0.8971, + "step": 10234 + }, + { + "epoch": 0.7890070921985816, + "grad_norm": 3.7370405197143555, + "learning_rate": 1.1228484123789064e-06, + "loss": 0.958, + "step": 10235 + }, + { + "epoch": 0.7890841813135985, + "grad_norm": 3.855154037475586, + "learning_rate": 1.1220602222694166e-06, + "loss": 0.9874, + "step": 10236 + }, + { + "epoch": 0.7891612704286155, + "grad_norm": 3.9142956733703613, + "learning_rate": 1.1212722739293875e-06, + "loss": 0.903, + "step": 10237 + }, + { + "epoch": 0.7892383595436324, + "grad_norm": 4.107082366943359, + "learning_rate": 1.120484567407944e-06, + "loss": 1.0201, + "step": 10238 + }, + { + "epoch": 0.7893154486586494, + "grad_norm": 3.537972927093506, + "learning_rate": 1.1196971027541953e-06, + "loss": 0.8097, + "step": 10239 + }, + { + "epoch": 0.7893925377736664, + "grad_norm": 3.690303087234497, + "learning_rate": 1.1189098800172365e-06, + "loss": 0.9472, + "step": 10240 + }, + { + "epoch": 0.7894696268886833, + "grad_norm": 4.062992572784424, + "learning_rate": 1.1181228992461451e-06, + "loss": 0.8588, + "step": 10241 + }, + { + "epoch": 0.7895467160037003, + "grad_norm": 3.6081042289733887, + "learning_rate": 1.1173361604899857e-06, + "loss": 0.8476, + "step": 10242 + }, + { + "epoch": 0.7896238051187172, + "grad_norm": 3.469123601913452, + "learning_rate": 1.1165496637978086e-06, + "loss": 0.8059, + "step": 10243 + }, + { + "epoch": 0.7897008942337342, + "grad_norm": 3.6479201316833496, + "learning_rate": 1.1157634092186464e-06, + "loss": 0.8624, + "step": 10244 + }, + { + "epoch": 0.7897779833487512, + "grad_norm": 3.684077024459839, + "learning_rate": 1.1149773968015205e-06, + "loss": 0.9347, + "step": 10245 + }, + { + "epoch": 0.7898550724637681, + "grad_norm": 4.0022382736206055, + "learning_rate": 1.114191626595431e-06, + "loss": 0.9736, + "step": 10246 + }, + { + "epoch": 0.7899321615787851, + "grad_norm": 3.8564565181732178, + "learning_rate": 1.1134060986493688e-06, + "loss": 0.834, + "step": 10247 + }, + { + "epoch": 0.790009250693802, + "grad_norm": 3.95080828666687, + "learning_rate": 1.1126208130123056e-06, + "loss": 0.8655, + "step": 10248 + }, + { + "epoch": 0.790086339808819, + "grad_norm": 3.4541101455688477, + "learning_rate": 1.1118357697332027e-06, + "loss": 0.8184, + "step": 10249 + }, + { + "epoch": 0.790163428923836, + "grad_norm": 3.7209436893463135, + "learning_rate": 1.1110509688610038e-06, + "loss": 0.8719, + "step": 10250 + }, + { + "epoch": 0.7902405180388529, + "grad_norm": 3.6410865783691406, + "learning_rate": 1.1102664104446342e-06, + "loss": 0.9483, + "step": 10251 + }, + { + "epoch": 0.7903176071538699, + "grad_norm": 3.3770508766174316, + "learning_rate": 1.1094820945330088e-06, + "loss": 0.7337, + "step": 10252 + }, + { + "epoch": 0.7903946962688868, + "grad_norm": 3.8358688354492188, + "learning_rate": 1.1086980211750247e-06, + "loss": 0.8971, + "step": 10253 + }, + { + "epoch": 0.7904717853839038, + "grad_norm": 3.432882308959961, + "learning_rate": 1.1079141904195662e-06, + "loss": 0.8461, + "step": 10254 + }, + { + "epoch": 0.7905488744989208, + "grad_norm": 4.271652698516846, + "learning_rate": 1.107130602315501e-06, + "loss": 1.0068, + "step": 10255 + }, + { + "epoch": 0.7906259636139377, + "grad_norm": 3.7254769802093506, + "learning_rate": 1.1063472569116802e-06, + "loss": 0.8175, + "step": 10256 + }, + { + "epoch": 0.7907030527289547, + "grad_norm": 3.4626779556274414, + "learning_rate": 1.1055641542569418e-06, + "loss": 0.7748, + "step": 10257 + }, + { + "epoch": 0.7907801418439716, + "grad_norm": 4.178032398223877, + "learning_rate": 1.1047812944001084e-06, + "loss": 0.9467, + "step": 10258 + }, + { + "epoch": 0.7908572309589886, + "grad_norm": 3.711949348449707, + "learning_rate": 1.103998677389988e-06, + "loss": 0.9139, + "step": 10259 + }, + { + "epoch": 0.7909343200740055, + "grad_norm": 4.120387554168701, + "learning_rate": 1.1032163032753717e-06, + "loss": 0.9809, + "step": 10260 + }, + { + "epoch": 0.7910114091890225, + "grad_norm": 3.4602792263031006, + "learning_rate": 1.1024341721050385e-06, + "loss": 0.9068, + "step": 10261 + }, + { + "epoch": 0.7910884983040395, + "grad_norm": 3.6511247158050537, + "learning_rate": 1.1016522839277471e-06, + "loss": 0.8453, + "step": 10262 + }, + { + "epoch": 0.7911655874190564, + "grad_norm": 4.048055648803711, + "learning_rate": 1.1008706387922457e-06, + "loss": 1.0161, + "step": 10263 + }, + { + "epoch": 0.7912426765340734, + "grad_norm": 3.4524850845336914, + "learning_rate": 1.100089236747266e-06, + "loss": 0.8793, + "step": 10264 + }, + { + "epoch": 0.7913197656490903, + "grad_norm": 3.5564897060394287, + "learning_rate": 1.0993080778415245e-06, + "loss": 0.8971, + "step": 10265 + }, + { + "epoch": 0.7913968547641073, + "grad_norm": 4.036877155303955, + "learning_rate": 1.0985271621237231e-06, + "loss": 0.91, + "step": 10266 + }, + { + "epoch": 0.7914739438791243, + "grad_norm": 3.743246078491211, + "learning_rate": 1.0977464896425461e-06, + "loss": 0.9205, + "step": 10267 + }, + { + "epoch": 0.7915510329941412, + "grad_norm": 3.4432597160339355, + "learning_rate": 1.0969660604466648e-06, + "loss": 0.8664, + "step": 10268 + }, + { + "epoch": 0.7916281221091582, + "grad_norm": 3.663029670715332, + "learning_rate": 1.0961858745847348e-06, + "loss": 0.9574, + "step": 10269 + }, + { + "epoch": 0.7917052112241751, + "grad_norm": 3.6875574588775635, + "learning_rate": 1.0954059321053978e-06, + "loss": 0.859, + "step": 10270 + }, + { + "epoch": 0.7917823003391921, + "grad_norm": 3.6623551845550537, + "learning_rate": 1.0946262330572798e-06, + "loss": 0.8892, + "step": 10271 + }, + { + "epoch": 0.7918593894542091, + "grad_norm": 3.574294328689575, + "learning_rate": 1.0938467774889883e-06, + "loss": 0.9423, + "step": 10272 + }, + { + "epoch": 0.791936478569226, + "grad_norm": 3.773029088973999, + "learning_rate": 1.0930675654491197e-06, + "loss": 0.8767, + "step": 10273 + }, + { + "epoch": 0.792013567684243, + "grad_norm": 3.5403318405151367, + "learning_rate": 1.092288596986254e-06, + "loss": 0.962, + "step": 10274 + }, + { + "epoch": 0.7920906567992599, + "grad_norm": 3.787517786026001, + "learning_rate": 1.0915098721489553e-06, + "loss": 0.9419, + "step": 10275 + }, + { + "epoch": 0.7921677459142769, + "grad_norm": 3.657186508178711, + "learning_rate": 1.0907313909857737e-06, + "loss": 0.9917, + "step": 10276 + }, + { + "epoch": 0.7922448350292939, + "grad_norm": 3.8778676986694336, + "learning_rate": 1.0899531535452452e-06, + "loss": 0.8329, + "step": 10277 + }, + { + "epoch": 0.7923219241443108, + "grad_norm": 4.028141975402832, + "learning_rate": 1.0891751598758849e-06, + "loss": 0.9051, + "step": 10278 + }, + { + "epoch": 0.7923990132593278, + "grad_norm": 3.770726442337036, + "learning_rate": 1.0883974100261985e-06, + "loss": 0.8382, + "step": 10279 + }, + { + "epoch": 0.7924761023743447, + "grad_norm": 3.7787578105926514, + "learning_rate": 1.0876199040446754e-06, + "loss": 0.8499, + "step": 10280 + }, + { + "epoch": 0.7925531914893617, + "grad_norm": 3.5443973541259766, + "learning_rate": 1.0868426419797883e-06, + "loss": 0.9412, + "step": 10281 + }, + { + "epoch": 0.7926302806043787, + "grad_norm": 3.392566680908203, + "learning_rate": 1.0860656238799971e-06, + "loss": 0.8903, + "step": 10282 + }, + { + "epoch": 0.7927073697193956, + "grad_norm": 3.8018105030059814, + "learning_rate": 1.0852888497937424e-06, + "loss": 0.8888, + "step": 10283 + }, + { + "epoch": 0.7927844588344126, + "grad_norm": 3.710118293762207, + "learning_rate": 1.0845123197694528e-06, + "loss": 1.0176, + "step": 10284 + }, + { + "epoch": 0.7928615479494295, + "grad_norm": 3.820878028869629, + "learning_rate": 1.0837360338555414e-06, + "loss": 0.9367, + "step": 10285 + }, + { + "epoch": 0.7929386370644464, + "grad_norm": 4.015862941741943, + "learning_rate": 1.0829599921004054e-06, + "loss": 0.8729, + "step": 10286 + }, + { + "epoch": 0.7930157261794635, + "grad_norm": 3.3819711208343506, + "learning_rate": 1.0821841945524265e-06, + "loss": 0.8165, + "step": 10287 + }, + { + "epoch": 0.7930928152944804, + "grad_norm": 4.338066101074219, + "learning_rate": 1.081408641259974e-06, + "loss": 1.0214, + "step": 10288 + }, + { + "epoch": 0.7931699044094974, + "grad_norm": 3.5678458213806152, + "learning_rate": 1.0806333322713964e-06, + "loss": 0.8869, + "step": 10289 + }, + { + "epoch": 0.7932469935245143, + "grad_norm": 4.043673515319824, + "learning_rate": 1.0798582676350316e-06, + "loss": 0.9494, + "step": 10290 + }, + { + "epoch": 0.7933240826395312, + "grad_norm": 3.7308008670806885, + "learning_rate": 1.0790834473992013e-06, + "loss": 0.7882, + "step": 10291 + }, + { + "epoch": 0.7934011717545483, + "grad_norm": 4.306490421295166, + "learning_rate": 1.0783088716122102e-06, + "loss": 0.995, + "step": 10292 + }, + { + "epoch": 0.7934782608695652, + "grad_norm": 3.6858410835266113, + "learning_rate": 1.0775345403223509e-06, + "loss": 0.8796, + "step": 10293 + }, + { + "epoch": 0.7935553499845822, + "grad_norm": 3.7026453018188477, + "learning_rate": 1.0767604535778976e-06, + "loss": 0.9955, + "step": 10294 + }, + { + "epoch": 0.7936324390995991, + "grad_norm": 3.596752882003784, + "learning_rate": 1.075986611427111e-06, + "loss": 0.8235, + "step": 10295 + }, + { + "epoch": 0.793709528214616, + "grad_norm": 3.608722448348999, + "learning_rate": 1.0752130139182364e-06, + "loss": 0.8598, + "step": 10296 + }, + { + "epoch": 0.7937866173296331, + "grad_norm": 3.686950922012329, + "learning_rate": 1.0744396610995033e-06, + "loss": 0.8529, + "step": 10297 + }, + { + "epoch": 0.79386370644465, + "grad_norm": 4.59524393081665, + "learning_rate": 1.0736665530191276e-06, + "loss": 1.1376, + "step": 10298 + }, + { + "epoch": 0.793940795559667, + "grad_norm": 3.7717888355255127, + "learning_rate": 1.072893689725306e-06, + "loss": 0.8147, + "step": 10299 + }, + { + "epoch": 0.7940178846746839, + "grad_norm": 3.5233025550842285, + "learning_rate": 1.072121071266224e-06, + "loss": 0.8799, + "step": 10300 + }, + { + "epoch": 0.7940949737897008, + "grad_norm": 3.7410104274749756, + "learning_rate": 1.07134869769005e-06, + "loss": 0.9286, + "step": 10301 + }, + { + "epoch": 0.7941720629047179, + "grad_norm": 3.774850606918335, + "learning_rate": 1.0705765690449376e-06, + "loss": 0.9598, + "step": 10302 + }, + { + "epoch": 0.7942491520197348, + "grad_norm": 4.010867595672607, + "learning_rate": 1.0698046853790268e-06, + "loss": 0.9681, + "step": 10303 + }, + { + "epoch": 0.7943262411347518, + "grad_norm": 3.8371970653533936, + "learning_rate": 1.0690330467404375e-06, + "loss": 0.9313, + "step": 10304 + }, + { + "epoch": 0.7944033302497687, + "grad_norm": 3.689708709716797, + "learning_rate": 1.0682616531772782e-06, + "loss": 0.9279, + "step": 10305 + }, + { + "epoch": 0.7944804193647856, + "grad_norm": 3.87109112739563, + "learning_rate": 1.0674905047376423e-06, + "loss": 0.8764, + "step": 10306 + }, + { + "epoch": 0.7945575084798027, + "grad_norm": 3.5622525215148926, + "learning_rate": 1.0667196014696062e-06, + "loss": 0.9274, + "step": 10307 + }, + { + "epoch": 0.7946345975948196, + "grad_norm": 3.48157000541687, + "learning_rate": 1.0659489434212323e-06, + "loss": 0.8604, + "step": 10308 + }, + { + "epoch": 0.7947116867098366, + "grad_norm": 3.320413112640381, + "learning_rate": 1.0651785306405683e-06, + "loss": 0.803, + "step": 10309 + }, + { + "epoch": 0.7947887758248535, + "grad_norm": 3.5544357299804688, + "learning_rate": 1.064408363175642e-06, + "loss": 0.9412, + "step": 10310 + }, + { + "epoch": 0.7948658649398704, + "grad_norm": 3.2090461254119873, + "learning_rate": 1.0636384410744716e-06, + "loss": 0.8046, + "step": 10311 + }, + { + "epoch": 0.7949429540548875, + "grad_norm": 3.8545830249786377, + "learning_rate": 1.0628687643850572e-06, + "loss": 0.9692, + "step": 10312 + }, + { + "epoch": 0.7950200431699044, + "grad_norm": 3.9732425212860107, + "learning_rate": 1.0620993331553848e-06, + "loss": 0.8421, + "step": 10313 + }, + { + "epoch": 0.7950971322849214, + "grad_norm": 3.6110129356384277, + "learning_rate": 1.0613301474334254e-06, + "loss": 0.943, + "step": 10314 + }, + { + "epoch": 0.7951742213999383, + "grad_norm": 3.558922529220581, + "learning_rate": 1.0605612072671305e-06, + "loss": 0.8885, + "step": 10315 + }, + { + "epoch": 0.7952513105149552, + "grad_norm": 3.5651447772979736, + "learning_rate": 1.0597925127044423e-06, + "loss": 0.8734, + "step": 10316 + }, + { + "epoch": 0.7953283996299723, + "grad_norm": 3.470515251159668, + "learning_rate": 1.0590240637932835e-06, + "loss": 0.7586, + "step": 10317 + }, + { + "epoch": 0.7954054887449892, + "grad_norm": 3.696683883666992, + "learning_rate": 1.0582558605815636e-06, + "loss": 0.8775, + "step": 10318 + }, + { + "epoch": 0.7954825778600062, + "grad_norm": 3.392326593399048, + "learning_rate": 1.0574879031171776e-06, + "loss": 0.7721, + "step": 10319 + }, + { + "epoch": 0.7955596669750231, + "grad_norm": 4.037016868591309, + "learning_rate": 1.0567201914480002e-06, + "loss": 0.8883, + "step": 10320 + }, + { + "epoch": 0.79563675609004, + "grad_norm": 3.5130562782287598, + "learning_rate": 1.0559527256218959e-06, + "loss": 0.861, + "step": 10321 + }, + { + "epoch": 0.7957138452050571, + "grad_norm": 3.640817165374756, + "learning_rate": 1.0551855056867132e-06, + "loss": 0.9281, + "step": 10322 + }, + { + "epoch": 0.795790934320074, + "grad_norm": 4.14249324798584, + "learning_rate": 1.0544185316902834e-06, + "loss": 0.9328, + "step": 10323 + }, + { + "epoch": 0.795868023435091, + "grad_norm": 3.433025598526001, + "learning_rate": 1.0536518036804228e-06, + "loss": 0.8419, + "step": 10324 + }, + { + "epoch": 0.7959451125501079, + "grad_norm": 3.647221565246582, + "learning_rate": 1.0528853217049357e-06, + "loss": 0.9425, + "step": 10325 + }, + { + "epoch": 0.7960222016651248, + "grad_norm": 3.8280515670776367, + "learning_rate": 1.0521190858116042e-06, + "loss": 0.9141, + "step": 10326 + }, + { + "epoch": 0.7960992907801419, + "grad_norm": 3.922826051712036, + "learning_rate": 1.0513530960482016e-06, + "loss": 1.0064, + "step": 10327 + }, + { + "epoch": 0.7961763798951588, + "grad_norm": 3.8651580810546875, + "learning_rate": 1.0505873524624821e-06, + "loss": 1.0625, + "step": 10328 + }, + { + "epoch": 0.7962534690101758, + "grad_norm": 3.5953264236450195, + "learning_rate": 1.0498218551021876e-06, + "loss": 0.8368, + "step": 10329 + }, + { + "epoch": 0.7963305581251927, + "grad_norm": 3.8444368839263916, + "learning_rate": 1.0490566040150428e-06, + "loss": 0.9272, + "step": 10330 + }, + { + "epoch": 0.7964076472402097, + "grad_norm": 3.7892086505889893, + "learning_rate": 1.0482915992487546e-06, + "loss": 0.8233, + "step": 10331 + }, + { + "epoch": 0.7964847363552267, + "grad_norm": 3.583220958709717, + "learning_rate": 1.0475268408510192e-06, + "loss": 1.0282, + "step": 10332 + }, + { + "epoch": 0.7965618254702436, + "grad_norm": 3.683847427368164, + "learning_rate": 1.046762328869515e-06, + "loss": 0.8625, + "step": 10333 + }, + { + "epoch": 0.7966389145852606, + "grad_norm": 3.9110031127929688, + "learning_rate": 1.0459980633519052e-06, + "loss": 0.9429, + "step": 10334 + }, + { + "epoch": 0.7967160037002775, + "grad_norm": 3.5115268230438232, + "learning_rate": 1.0452340443458376e-06, + "loss": 0.8812, + "step": 10335 + }, + { + "epoch": 0.7967930928152945, + "grad_norm": 3.59769868850708, + "learning_rate": 1.0444702718989452e-06, + "loss": 0.8564, + "step": 10336 + }, + { + "epoch": 0.7968701819303114, + "grad_norm": 3.9897191524505615, + "learning_rate": 1.043706746058845e-06, + "loss": 0.8961, + "step": 10337 + }, + { + "epoch": 0.7969472710453284, + "grad_norm": 3.6812877655029297, + "learning_rate": 1.0429434668731393e-06, + "loss": 1.0195, + "step": 10338 + }, + { + "epoch": 0.7970243601603454, + "grad_norm": 4.031727313995361, + "learning_rate": 1.0421804343894142e-06, + "loss": 0.9039, + "step": 10339 + }, + { + "epoch": 0.7971014492753623, + "grad_norm": 4.048935413360596, + "learning_rate": 1.0414176486552424e-06, + "loss": 0.8887, + "step": 10340 + }, + { + "epoch": 0.7971785383903793, + "grad_norm": 3.7354636192321777, + "learning_rate": 1.0406551097181765e-06, + "loss": 0.9092, + "step": 10341 + }, + { + "epoch": 0.7972556275053962, + "grad_norm": 3.63582706451416, + "learning_rate": 1.0398928176257588e-06, + "loss": 0.9253, + "step": 10342 + }, + { + "epoch": 0.7973327166204132, + "grad_norm": 3.9569201469421387, + "learning_rate": 1.039130772425514e-06, + "loss": 1.0315, + "step": 10343 + }, + { + "epoch": 0.7974098057354302, + "grad_norm": 3.3830461502075195, + "learning_rate": 1.0383689741649516e-06, + "loss": 0.8325, + "step": 10344 + }, + { + "epoch": 0.7974868948504471, + "grad_norm": 4.122878074645996, + "learning_rate": 1.0376074228915662e-06, + "loss": 1.0077, + "step": 10345 + }, + { + "epoch": 0.7975639839654641, + "grad_norm": 3.5504045486450195, + "learning_rate": 1.0368461186528366e-06, + "loss": 0.9757, + "step": 10346 + }, + { + "epoch": 0.797641073080481, + "grad_norm": 3.6346943378448486, + "learning_rate": 1.0360850614962254e-06, + "loss": 0.7473, + "step": 10347 + }, + { + "epoch": 0.797718162195498, + "grad_norm": 3.489760637283325, + "learning_rate": 1.0353242514691807e-06, + "loss": 0.8558, + "step": 10348 + }, + { + "epoch": 0.797795251310515, + "grad_norm": 4.047338485717773, + "learning_rate": 1.0345636886191351e-06, + "loss": 0.9747, + "step": 10349 + }, + { + "epoch": 0.7978723404255319, + "grad_norm": 3.327815055847168, + "learning_rate": 1.0338033729935064e-06, + "loss": 0.9214, + "step": 10350 + }, + { + "epoch": 0.7979494295405489, + "grad_norm": 3.7090442180633545, + "learning_rate": 1.0330433046396971e-06, + "loss": 0.9179, + "step": 10351 + }, + { + "epoch": 0.7980265186555658, + "grad_norm": 3.697265863418579, + "learning_rate": 1.032283483605091e-06, + "loss": 0.8827, + "step": 10352 + }, + { + "epoch": 0.7981036077705828, + "grad_norm": 3.3949902057647705, + "learning_rate": 1.03152390993706e-06, + "loss": 0.9044, + "step": 10353 + }, + { + "epoch": 0.7981806968855998, + "grad_norm": 3.660855531692505, + "learning_rate": 1.0307645836829604e-06, + "loss": 0.9089, + "step": 10354 + }, + { + "epoch": 0.7982577860006167, + "grad_norm": 3.7583930492401123, + "learning_rate": 1.030005504890132e-06, + "loss": 0.9184, + "step": 10355 + }, + { + "epoch": 0.7983348751156337, + "grad_norm": 4.092621803283691, + "learning_rate": 1.0292466736058988e-06, + "loss": 0.919, + "step": 10356 + }, + { + "epoch": 0.7984119642306506, + "grad_norm": 3.7729806900024414, + "learning_rate": 1.0284880898775717e-06, + "loss": 0.8264, + "step": 10357 + }, + { + "epoch": 0.7984890533456676, + "grad_norm": 3.7292611598968506, + "learning_rate": 1.0277297537524422e-06, + "loss": 0.7999, + "step": 10358 + }, + { + "epoch": 0.7985661424606846, + "grad_norm": 3.5362532138824463, + "learning_rate": 1.0269716652777894e-06, + "loss": 0.9376, + "step": 10359 + }, + { + "epoch": 0.7986432315757015, + "grad_norm": 3.8130404949188232, + "learning_rate": 1.0262138245008768e-06, + "loss": 0.9412, + "step": 10360 + }, + { + "epoch": 0.7987203206907185, + "grad_norm": 3.703113555908203, + "learning_rate": 1.0254562314689508e-06, + "loss": 0.9883, + "step": 10361 + }, + { + "epoch": 0.7987974098057354, + "grad_norm": 3.831934928894043, + "learning_rate": 1.0246988862292462e-06, + "loss": 0.8621, + "step": 10362 + }, + { + "epoch": 0.7988744989207524, + "grad_norm": 3.592346668243408, + "learning_rate": 1.023941788828976e-06, + "loss": 0.9581, + "step": 10363 + }, + { + "epoch": 0.7989515880357694, + "grad_norm": 3.945913553237915, + "learning_rate": 1.023184939315342e-06, + "loss": 0.9714, + "step": 10364 + }, + { + "epoch": 0.7990286771507863, + "grad_norm": 3.960003614425659, + "learning_rate": 1.0224283377355316e-06, + "loss": 0.9178, + "step": 10365 + }, + { + "epoch": 0.7991057662658033, + "grad_norm": 3.8358631134033203, + "learning_rate": 1.021671984136713e-06, + "loss": 0.9071, + "step": 10366 + }, + { + "epoch": 0.7991828553808202, + "grad_norm": 3.6046762466430664, + "learning_rate": 1.020915878566044e-06, + "loss": 0.8723, + "step": 10367 + }, + { + "epoch": 0.7992599444958371, + "grad_norm": 3.6357581615448, + "learning_rate": 1.0201600210706596e-06, + "loss": 0.8464, + "step": 10368 + }, + { + "epoch": 0.7993370336108542, + "grad_norm": 3.683241605758667, + "learning_rate": 1.0194044116976864e-06, + "loss": 0.9561, + "step": 10369 + }, + { + "epoch": 0.7994141227258711, + "grad_norm": 4.788916110992432, + "learning_rate": 1.018649050494232e-06, + "loss": 1.021, + "step": 10370 + }, + { + "epoch": 0.7994912118408881, + "grad_norm": 4.036243438720703, + "learning_rate": 1.0178939375073892e-06, + "loss": 1.0098, + "step": 10371 + }, + { + "epoch": 0.799568300955905, + "grad_norm": 3.611304998397827, + "learning_rate": 1.0171390727842357e-06, + "loss": 0.8585, + "step": 10372 + }, + { + "epoch": 0.799645390070922, + "grad_norm": 3.9461710453033447, + "learning_rate": 1.0163844563718344e-06, + "loss": 0.9371, + "step": 10373 + }, + { + "epoch": 0.799722479185939, + "grad_norm": 3.3774936199188232, + "learning_rate": 1.0156300883172292e-06, + "loss": 0.8, + "step": 10374 + }, + { + "epoch": 0.7997995683009559, + "grad_norm": 3.748626947402954, + "learning_rate": 1.0148759686674532e-06, + "loss": 0.8262, + "step": 10375 + }, + { + "epoch": 0.7998766574159729, + "grad_norm": 3.4377031326293945, + "learning_rate": 1.0141220974695199e-06, + "loss": 0.794, + "step": 10376 + }, + { + "epoch": 0.7999537465309898, + "grad_norm": 3.6376664638519287, + "learning_rate": 1.0133684747704314e-06, + "loss": 0.8805, + "step": 10377 + }, + { + "epoch": 0.8000308356460067, + "grad_norm": 3.6229360103607178, + "learning_rate": 1.012615100617172e-06, + "loss": 0.8017, + "step": 10378 + }, + { + "epoch": 0.8001079247610238, + "grad_norm": 3.5345287322998047, + "learning_rate": 1.0118619750567082e-06, + "loss": 0.8068, + "step": 10379 + }, + { + "epoch": 0.8001850138760407, + "grad_norm": 3.6617918014526367, + "learning_rate": 1.0111090981359961e-06, + "loss": 1.0157, + "step": 10380 + }, + { + "epoch": 0.8002621029910577, + "grad_norm": 3.9429585933685303, + "learning_rate": 1.0103564699019707e-06, + "loss": 0.9599, + "step": 10381 + }, + { + "epoch": 0.8003391921060746, + "grad_norm": 3.5763978958129883, + "learning_rate": 1.009604090401558e-06, + "loss": 0.9006, + "step": 10382 + }, + { + "epoch": 0.8004162812210915, + "grad_norm": 3.58232045173645, + "learning_rate": 1.008851959681665e-06, + "loss": 0.8655, + "step": 10383 + }, + { + "epoch": 0.8004933703361086, + "grad_norm": 3.727440118789673, + "learning_rate": 1.0081000777891803e-06, + "loss": 0.9123, + "step": 10384 + }, + { + "epoch": 0.8005704594511255, + "grad_norm": 3.6381850242614746, + "learning_rate": 1.0073484447709809e-06, + "loss": 0.8735, + "step": 10385 + }, + { + "epoch": 0.8006475485661425, + "grad_norm": 3.738128662109375, + "learning_rate": 1.0065970606739273e-06, + "loss": 0.8284, + "step": 10386 + }, + { + "epoch": 0.8007246376811594, + "grad_norm": 3.4860498905181885, + "learning_rate": 1.005845925544865e-06, + "loss": 0.8732, + "step": 10387 + }, + { + "epoch": 0.8008017267961763, + "grad_norm": 3.461859703063965, + "learning_rate": 1.0050950394306242e-06, + "loss": 0.9177, + "step": 10388 + }, + { + "epoch": 0.8008788159111934, + "grad_norm": 3.968186855316162, + "learning_rate": 1.0043444023780164e-06, + "loss": 0.9015, + "step": 10389 + }, + { + "epoch": 0.8009559050262103, + "grad_norm": 3.562736749649048, + "learning_rate": 1.0035940144338406e-06, + "loss": 0.7696, + "step": 10390 + }, + { + "epoch": 0.8010329941412273, + "grad_norm": 4.058101654052734, + "learning_rate": 1.0028438756448805e-06, + "loss": 0.9197, + "step": 10391 + }, + { + "epoch": 0.8011100832562442, + "grad_norm": 4.087806224822998, + "learning_rate": 1.0020939860579033e-06, + "loss": 0.9222, + "step": 10392 + }, + { + "epoch": 0.8011871723712611, + "grad_norm": 3.800830364227295, + "learning_rate": 1.0013443457196598e-06, + "loss": 1.0234, + "step": 10393 + }, + { + "epoch": 0.8012642614862782, + "grad_norm": 3.52839994430542, + "learning_rate": 1.0005949546768879e-06, + "loss": 0.8291, + "step": 10394 + }, + { + "epoch": 0.8013413506012951, + "grad_norm": 3.8631598949432373, + "learning_rate": 9.998458129763062e-07, + "loss": 0.8932, + "step": 10395 + }, + { + "epoch": 0.8014184397163121, + "grad_norm": 3.8592798709869385, + "learning_rate": 9.990969206646205e-07, + "loss": 1.0186, + "step": 10396 + }, + { + "epoch": 0.801495528831329, + "grad_norm": 3.7347230911254883, + "learning_rate": 9.983482777885211e-07, + "loss": 0.9141, + "step": 10397 + }, + { + "epoch": 0.8015726179463459, + "grad_norm": 3.569770574569702, + "learning_rate": 9.975998843946811e-07, + "loss": 0.8909, + "step": 10398 + }, + { + "epoch": 0.801649707061363, + "grad_norm": 3.699913740158081, + "learning_rate": 9.968517405297607e-07, + "loss": 0.916, + "step": 10399 + }, + { + "epoch": 0.8017267961763799, + "grad_norm": 3.733384609222412, + "learning_rate": 9.961038462403999e-07, + "loss": 0.8737, + "step": 10400 + }, + { + "epoch": 0.8018038852913969, + "grad_norm": 3.7092156410217285, + "learning_rate": 9.953562015732281e-07, + "loss": 0.9746, + "step": 10401 + }, + { + "epoch": 0.8018809744064138, + "grad_norm": 3.567920446395874, + "learning_rate": 9.94608806574856e-07, + "loss": 0.8904, + "step": 10402 + }, + { + "epoch": 0.8019580635214307, + "grad_norm": 3.6356654167175293, + "learning_rate": 9.93861661291881e-07, + "loss": 0.9383, + "step": 10403 + }, + { + "epoch": 0.8020351526364478, + "grad_norm": 3.344292640686035, + "learning_rate": 9.931147657708823e-07, + "loss": 0.7388, + "step": 10404 + }, + { + "epoch": 0.8021122417514647, + "grad_norm": 3.598586082458496, + "learning_rate": 9.92368120058428e-07, + "loss": 0.7381, + "step": 10405 + }, + { + "epoch": 0.8021893308664817, + "grad_norm": 3.6138226985931396, + "learning_rate": 9.916217242010634e-07, + "loss": 0.8098, + "step": 10406 + }, + { + "epoch": 0.8022664199814986, + "grad_norm": 3.5014567375183105, + "learning_rate": 9.908755782453245e-07, + "loss": 0.8243, + "step": 10407 + }, + { + "epoch": 0.8023435090965155, + "grad_norm": 3.7613766193389893, + "learning_rate": 9.901296822377293e-07, + "loss": 0.8441, + "step": 10408 + }, + { + "epoch": 0.8024205982115326, + "grad_norm": 3.4418587684631348, + "learning_rate": 9.893840362247809e-07, + "loss": 0.8912, + "step": 10409 + }, + { + "epoch": 0.8024976873265495, + "grad_norm": 3.7355403900146484, + "learning_rate": 9.88638640252968e-07, + "loss": 0.9684, + "step": 10410 + }, + { + "epoch": 0.8025747764415665, + "grad_norm": 3.9282965660095215, + "learning_rate": 9.87893494368759e-07, + "loss": 0.9324, + "step": 10411 + }, + { + "epoch": 0.8026518655565834, + "grad_norm": 3.668252944946289, + "learning_rate": 9.871485986186114e-07, + "loss": 0.9229, + "step": 10412 + }, + { + "epoch": 0.8027289546716003, + "grad_norm": 3.8640308380126953, + "learning_rate": 9.86403953048965e-07, + "loss": 0.9495, + "step": 10413 + }, + { + "epoch": 0.8028060437866174, + "grad_norm": 3.8201818466186523, + "learning_rate": 9.856595577062456e-07, + "loss": 0.8985, + "step": 10414 + }, + { + "epoch": 0.8028831329016343, + "grad_norm": 3.5459721088409424, + "learning_rate": 9.849154126368638e-07, + "loss": 0.9009, + "step": 10415 + }, + { + "epoch": 0.8029602220166513, + "grad_norm": 3.3526997566223145, + "learning_rate": 9.841715178872092e-07, + "loss": 0.8461, + "step": 10416 + }, + { + "epoch": 0.8030373111316682, + "grad_norm": 3.9801721572875977, + "learning_rate": 9.834278735036623e-07, + "loss": 0.9436, + "step": 10417 + }, + { + "epoch": 0.8031144002466851, + "grad_norm": 3.7711634635925293, + "learning_rate": 9.826844795325852e-07, + "loss": 1.0106, + "step": 10418 + }, + { + "epoch": 0.8031914893617021, + "grad_norm": 3.627129316329956, + "learning_rate": 9.819413360203244e-07, + "loss": 1.0238, + "step": 10419 + }, + { + "epoch": 0.8032685784767191, + "grad_norm": 3.4935200214385986, + "learning_rate": 9.811984430132116e-07, + "loss": 0.7833, + "step": 10420 + }, + { + "epoch": 0.8033456675917361, + "grad_norm": 3.4968650341033936, + "learning_rate": 9.804558005575632e-07, + "loss": 0.9387, + "step": 10421 + }, + { + "epoch": 0.803422756706753, + "grad_norm": 3.8687143325805664, + "learning_rate": 9.797134086996762e-07, + "loss": 0.8498, + "step": 10422 + }, + { + "epoch": 0.8034998458217699, + "grad_norm": 3.631547451019287, + "learning_rate": 9.78971267485837e-07, + "loss": 0.8742, + "step": 10423 + }, + { + "epoch": 0.803576934936787, + "grad_norm": 3.5340750217437744, + "learning_rate": 9.782293769623136e-07, + "loss": 0.9018, + "step": 10424 + }, + { + "epoch": 0.8036540240518039, + "grad_norm": 3.6497690677642822, + "learning_rate": 9.774877371753594e-07, + "loss": 0.8692, + "step": 10425 + }, + { + "epoch": 0.8037311131668209, + "grad_norm": 4.077952861785889, + "learning_rate": 9.767463481712113e-07, + "loss": 0.867, + "step": 10426 + }, + { + "epoch": 0.8038082022818378, + "grad_norm": 3.7168338298797607, + "learning_rate": 9.760052099960921e-07, + "loss": 0.8681, + "step": 10427 + }, + { + "epoch": 0.8038852913968547, + "grad_norm": 3.6925764083862305, + "learning_rate": 9.752643226962066e-07, + "loss": 0.9753, + "step": 10428 + }, + { + "epoch": 0.8039623805118717, + "grad_norm": 3.3511204719543457, + "learning_rate": 9.745236863177465e-07, + "loss": 0.8573, + "step": 10429 + }, + { + "epoch": 0.8040394696268887, + "grad_norm": 3.528261661529541, + "learning_rate": 9.737833009068859e-07, + "loss": 0.958, + "step": 10430 + }, + { + "epoch": 0.8041165587419057, + "grad_norm": 3.954437255859375, + "learning_rate": 9.73043166509785e-07, + "loss": 1.0196, + "step": 10431 + }, + { + "epoch": 0.8041936478569226, + "grad_norm": 3.7125864028930664, + "learning_rate": 9.723032831725859e-07, + "loss": 0.9288, + "step": 10432 + }, + { + "epoch": 0.8042707369719395, + "grad_norm": 3.7825093269348145, + "learning_rate": 9.715636509414168e-07, + "loss": 0.967, + "step": 10433 + }, + { + "epoch": 0.8043478260869565, + "grad_norm": 3.930483818054199, + "learning_rate": 9.708242698623898e-07, + "loss": 0.8991, + "step": 10434 + }, + { + "epoch": 0.8044249152019735, + "grad_norm": 3.8229269981384277, + "learning_rate": 9.700851399816026e-07, + "loss": 0.8454, + "step": 10435 + }, + { + "epoch": 0.8045020043169905, + "grad_norm": 3.621720790863037, + "learning_rate": 9.693462613451365e-07, + "loss": 0.9305, + "step": 10436 + }, + { + "epoch": 0.8045790934320074, + "grad_norm": 3.6314635276794434, + "learning_rate": 9.686076339990546e-07, + "loss": 0.8921, + "step": 10437 + }, + { + "epoch": 0.8046561825470243, + "grad_norm": 3.537757635116577, + "learning_rate": 9.678692579894072e-07, + "loss": 0.9057, + "step": 10438 + }, + { + "epoch": 0.8047332716620413, + "grad_norm": 3.6594655513763428, + "learning_rate": 9.671311333622292e-07, + "loss": 0.9033, + "step": 10439 + }, + { + "epoch": 0.8048103607770583, + "grad_norm": 4.193519592285156, + "learning_rate": 9.663932601635378e-07, + "loss": 0.9157, + "step": 10440 + }, + { + "epoch": 0.8048874498920753, + "grad_norm": 3.8593950271606445, + "learning_rate": 9.656556384393362e-07, + "loss": 0.9297, + "step": 10441 + }, + { + "epoch": 0.8049645390070922, + "grad_norm": 3.904437780380249, + "learning_rate": 9.649182682356122e-07, + "loss": 0.8452, + "step": 10442 + }, + { + "epoch": 0.8050416281221091, + "grad_norm": 4.147661209106445, + "learning_rate": 9.64181149598335e-07, + "loss": 0.9162, + "step": 10443 + }, + { + "epoch": 0.8051187172371261, + "grad_norm": 4.427166938781738, + "learning_rate": 9.634442825734609e-07, + "loss": 0.919, + "step": 10444 + }, + { + "epoch": 0.805195806352143, + "grad_norm": 3.7740793228149414, + "learning_rate": 9.627076672069302e-07, + "loss": 0.8899, + "step": 10445 + }, + { + "epoch": 0.8052728954671601, + "grad_norm": 3.5739588737487793, + "learning_rate": 9.619713035446666e-07, + "loss": 0.9004, + "step": 10446 + }, + { + "epoch": 0.805349984582177, + "grad_norm": 3.8233814239501953, + "learning_rate": 9.612351916325795e-07, + "loss": 0.9768, + "step": 10447 + }, + { + "epoch": 0.8054270736971939, + "grad_norm": 4.079464912414551, + "learning_rate": 9.604993315165607e-07, + "loss": 1.0417, + "step": 10448 + }, + { + "epoch": 0.8055041628122109, + "grad_norm": 3.4598426818847656, + "learning_rate": 9.597637232424866e-07, + "loss": 0.8658, + "step": 10449 + }, + { + "epoch": 0.8055812519272278, + "grad_norm": 3.5878536701202393, + "learning_rate": 9.590283668562195e-07, + "loss": 0.8726, + "step": 10450 + }, + { + "epoch": 0.8056583410422449, + "grad_norm": 3.5726075172424316, + "learning_rate": 9.582932624036052e-07, + "loss": 0.8884, + "step": 10451 + }, + { + "epoch": 0.8057354301572618, + "grad_norm": 3.6233930587768555, + "learning_rate": 9.575584099304735e-07, + "loss": 0.8807, + "step": 10452 + }, + { + "epoch": 0.8058125192722787, + "grad_norm": 3.603677272796631, + "learning_rate": 9.5682380948264e-07, + "loss": 0.9608, + "step": 10453 + }, + { + "epoch": 0.8058896083872957, + "grad_norm": 3.6978039741516113, + "learning_rate": 9.560894611059001e-07, + "loss": 0.906, + "step": 10454 + }, + { + "epoch": 0.8059666975023126, + "grad_norm": 3.5961194038391113, + "learning_rate": 9.55355364846039e-07, + "loss": 0.9432, + "step": 10455 + }, + { + "epoch": 0.8060437866173297, + "grad_norm": 3.6993865966796875, + "learning_rate": 9.546215207488225e-07, + "loss": 0.9392, + "step": 10456 + }, + { + "epoch": 0.8061208757323466, + "grad_norm": 4.084522724151611, + "learning_rate": 9.538879288600028e-07, + "loss": 0.8663, + "step": 10457 + }, + { + "epoch": 0.8061979648473635, + "grad_norm": 3.5728228092193604, + "learning_rate": 9.531545892253169e-07, + "loss": 0.9397, + "step": 10458 + }, + { + "epoch": 0.8062750539623805, + "grad_norm": 3.443225383758545, + "learning_rate": 9.52421501890482e-07, + "loss": 0.8319, + "step": 10459 + }, + { + "epoch": 0.8063521430773974, + "grad_norm": 3.599980115890503, + "learning_rate": 9.516886669012032e-07, + "loss": 0.9463, + "step": 10460 + }, + { + "epoch": 0.8064292321924145, + "grad_norm": 3.455798387527466, + "learning_rate": 9.509560843031695e-07, + "loss": 0.8954, + "step": 10461 + }, + { + "epoch": 0.8065063213074314, + "grad_norm": 3.5912976264953613, + "learning_rate": 9.502237541420534e-07, + "loss": 0.9573, + "step": 10462 + }, + { + "epoch": 0.8065834104224483, + "grad_norm": 3.59824275970459, + "learning_rate": 9.494916764635126e-07, + "loss": 0.8395, + "step": 10463 + }, + { + "epoch": 0.8066604995374653, + "grad_norm": 3.764808416366577, + "learning_rate": 9.487598513131868e-07, + "loss": 0.8776, + "step": 10464 + }, + { + "epoch": 0.8067375886524822, + "grad_norm": 3.6447689533233643, + "learning_rate": 9.480282787367029e-07, + "loss": 0.8773, + "step": 10465 + }, + { + "epoch": 0.8068146777674993, + "grad_norm": 3.842656373977661, + "learning_rate": 9.472969587796694e-07, + "loss": 0.9713, + "step": 10466 + }, + { + "epoch": 0.8068917668825162, + "grad_norm": 3.9619832038879395, + "learning_rate": 9.465658914876813e-07, + "loss": 0.871, + "step": 10467 + }, + { + "epoch": 0.8069688559975331, + "grad_norm": 3.368314743041992, + "learning_rate": 9.458350769063162e-07, + "loss": 0.9105, + "step": 10468 + }, + { + "epoch": 0.8070459451125501, + "grad_norm": 3.7825496196746826, + "learning_rate": 9.451045150811377e-07, + "loss": 0.9072, + "step": 10469 + }, + { + "epoch": 0.807123034227567, + "grad_norm": 4.548655033111572, + "learning_rate": 9.443742060576916e-07, + "loss": 0.8509, + "step": 10470 + }, + { + "epoch": 0.8072001233425841, + "grad_norm": 3.4712092876434326, + "learning_rate": 9.436441498815086e-07, + "loss": 0.8334, + "step": 10471 + }, + { + "epoch": 0.807277212457601, + "grad_norm": 3.7380151748657227, + "learning_rate": 9.42914346598105e-07, + "loss": 0.8441, + "step": 10472 + }, + { + "epoch": 0.8073543015726179, + "grad_norm": 4.4421467781066895, + "learning_rate": 9.421847962529806e-07, + "loss": 0.8698, + "step": 10473 + }, + { + "epoch": 0.8074313906876349, + "grad_norm": 3.492043972015381, + "learning_rate": 9.414554988916175e-07, + "loss": 0.86, + "step": 10474 + }, + { + "epoch": 0.8075084798026518, + "grad_norm": 3.6429760456085205, + "learning_rate": 9.407264545594841e-07, + "loss": 0.9613, + "step": 10475 + }, + { + "epoch": 0.8075855689176689, + "grad_norm": 4.198061466217041, + "learning_rate": 9.399976633020325e-07, + "loss": 1.0213, + "step": 10476 + }, + { + "epoch": 0.8076626580326858, + "grad_norm": 4.003988265991211, + "learning_rate": 9.392691251646991e-07, + "loss": 0.9604, + "step": 10477 + }, + { + "epoch": 0.8077397471477027, + "grad_norm": 3.8301596641540527, + "learning_rate": 9.385408401929052e-07, + "loss": 0.9308, + "step": 10478 + }, + { + "epoch": 0.8078168362627197, + "grad_norm": 3.498955726623535, + "learning_rate": 9.378128084320559e-07, + "loss": 0.9101, + "step": 10479 + }, + { + "epoch": 0.8078939253777366, + "grad_norm": 3.979250907897949, + "learning_rate": 9.370850299275386e-07, + "loss": 0.976, + "step": 10480 + }, + { + "epoch": 0.8079710144927537, + "grad_norm": 3.989082098007202, + "learning_rate": 9.36357504724727e-07, + "loss": 0.9461, + "step": 10481 + }, + { + "epoch": 0.8080481036077706, + "grad_norm": 3.72397518157959, + "learning_rate": 9.356302328689787e-07, + "loss": 0.9482, + "step": 10482 + }, + { + "epoch": 0.8081251927227875, + "grad_norm": 3.821753740310669, + "learning_rate": 9.349032144056358e-07, + "loss": 0.9482, + "step": 10483 + }, + { + "epoch": 0.8082022818378045, + "grad_norm": 3.7636606693267822, + "learning_rate": 9.341764493800248e-07, + "loss": 0.7877, + "step": 10484 + }, + { + "epoch": 0.8082793709528214, + "grad_norm": 3.7938175201416016, + "learning_rate": 9.334499378374534e-07, + "loss": 0.8539, + "step": 10485 + }, + { + "epoch": 0.8083564600678385, + "grad_norm": 3.736424446105957, + "learning_rate": 9.327236798232176e-07, + "loss": 0.9258, + "step": 10486 + }, + { + "epoch": 0.8084335491828554, + "grad_norm": 3.6965887546539307, + "learning_rate": 9.319976753825949e-07, + "loss": 0.871, + "step": 10487 + }, + { + "epoch": 0.8085106382978723, + "grad_norm": 3.8846166133880615, + "learning_rate": 9.312719245608487e-07, + "loss": 0.9277, + "step": 10488 + }, + { + "epoch": 0.8085877274128893, + "grad_norm": 4.221765518188477, + "learning_rate": 9.305464274032256e-07, + "loss": 0.9001, + "step": 10489 + }, + { + "epoch": 0.8086648165279062, + "grad_norm": 3.9422693252563477, + "learning_rate": 9.298211839549576e-07, + "loss": 0.9941, + "step": 10490 + }, + { + "epoch": 0.8087419056429233, + "grad_norm": 3.6819467544555664, + "learning_rate": 9.290961942612576e-07, + "loss": 0.855, + "step": 10491 + }, + { + "epoch": 0.8088189947579402, + "grad_norm": 3.909057140350342, + "learning_rate": 9.283714583673264e-07, + "loss": 0.8636, + "step": 10492 + }, + { + "epoch": 0.8088960838729571, + "grad_norm": 3.8165695667266846, + "learning_rate": 9.276469763183471e-07, + "loss": 0.9215, + "step": 10493 + }, + { + "epoch": 0.8089731729879741, + "grad_norm": 3.7066540718078613, + "learning_rate": 9.269227481594872e-07, + "loss": 0.8403, + "step": 10494 + }, + { + "epoch": 0.809050262102991, + "grad_norm": 3.6968142986297607, + "learning_rate": 9.261987739359007e-07, + "loss": 0.9137, + "step": 10495 + }, + { + "epoch": 0.809127351218008, + "grad_norm": 3.936868906021118, + "learning_rate": 9.254750536927204e-07, + "loss": 1.0582, + "step": 10496 + }, + { + "epoch": 0.809204440333025, + "grad_norm": 3.842881441116333, + "learning_rate": 9.247515874750679e-07, + "loss": 0.9131, + "step": 10497 + }, + { + "epoch": 0.809281529448042, + "grad_norm": 3.542695999145508, + "learning_rate": 9.240283753280477e-07, + "loss": 0.7972, + "step": 10498 + }, + { + "epoch": 0.8093586185630589, + "grad_norm": 3.8389735221862793, + "learning_rate": 9.233054172967481e-07, + "loss": 0.9359, + "step": 10499 + }, + { + "epoch": 0.8094357076780758, + "grad_norm": 3.6747260093688965, + "learning_rate": 9.225827134262422e-07, + "loss": 0.901, + "step": 10500 + }, + { + "epoch": 0.8095127967930928, + "grad_norm": 4.066951751708984, + "learning_rate": 9.218602637615882e-07, + "loss": 0.8817, + "step": 10501 + }, + { + "epoch": 0.8095898859081098, + "grad_norm": 3.7741777896881104, + "learning_rate": 9.211380683478238e-07, + "loss": 1.0278, + "step": 10502 + }, + { + "epoch": 0.8096669750231268, + "grad_norm": 4.318873882293701, + "learning_rate": 9.204161272299761e-07, + "loss": 1.0011, + "step": 10503 + }, + { + "epoch": 0.8097440641381437, + "grad_norm": 3.677009105682373, + "learning_rate": 9.196944404530544e-07, + "loss": 0.8841, + "step": 10504 + }, + { + "epoch": 0.8098211532531606, + "grad_norm": 3.3423690795898438, + "learning_rate": 9.189730080620512e-07, + "loss": 0.7182, + "step": 10505 + }, + { + "epoch": 0.8098982423681776, + "grad_norm": 3.752957344055176, + "learning_rate": 9.182518301019466e-07, + "loss": 0.9169, + "step": 10506 + }, + { + "epoch": 0.8099753314831946, + "grad_norm": 4.208918571472168, + "learning_rate": 9.175309066176991e-07, + "loss": 0.9648, + "step": 10507 + }, + { + "epoch": 0.8100524205982116, + "grad_norm": 3.9163053035736084, + "learning_rate": 9.168102376542559e-07, + "loss": 0.8975, + "step": 10508 + }, + { + "epoch": 0.8101295097132285, + "grad_norm": 3.690941333770752, + "learning_rate": 9.160898232565468e-07, + "loss": 0.8554, + "step": 10509 + }, + { + "epoch": 0.8102065988282454, + "grad_norm": 3.9642856121063232, + "learning_rate": 9.153696634694865e-07, + "loss": 1.0219, + "step": 10510 + }, + { + "epoch": 0.8102836879432624, + "grad_norm": 3.2848901748657227, + "learning_rate": 9.146497583379737e-07, + "loss": 0.8234, + "step": 10511 + }, + { + "epoch": 0.8103607770582794, + "grad_norm": 3.8563730716705322, + "learning_rate": 9.139301079068891e-07, + "loss": 0.9495, + "step": 10512 + }, + { + "epoch": 0.8104378661732964, + "grad_norm": 3.7783915996551514, + "learning_rate": 9.132107122210998e-07, + "loss": 0.9717, + "step": 10513 + }, + { + "epoch": 0.8105149552883133, + "grad_norm": 3.5279428958892822, + "learning_rate": 9.124915713254551e-07, + "loss": 0.9075, + "step": 10514 + }, + { + "epoch": 0.8105920444033302, + "grad_norm": 3.8966267108917236, + "learning_rate": 9.117726852647924e-07, + "loss": 0.9536, + "step": 10515 + }, + { + "epoch": 0.8106691335183472, + "grad_norm": 3.5992579460144043, + "learning_rate": 9.110540540839307e-07, + "loss": 0.866, + "step": 10516 + }, + { + "epoch": 0.8107462226333642, + "grad_norm": 3.7475366592407227, + "learning_rate": 9.103356778276701e-07, + "loss": 0.9517, + "step": 10517 + }, + { + "epoch": 0.8108233117483812, + "grad_norm": 3.9139578342437744, + "learning_rate": 9.096175565407994e-07, + "loss": 0.9181, + "step": 10518 + }, + { + "epoch": 0.8109004008633981, + "grad_norm": 3.3899145126342773, + "learning_rate": 9.088996902680891e-07, + "loss": 0.8865, + "step": 10519 + }, + { + "epoch": 0.810977489978415, + "grad_norm": 3.5167202949523926, + "learning_rate": 9.08182079054295e-07, + "loss": 0.9024, + "step": 10520 + }, + { + "epoch": 0.811054579093432, + "grad_norm": 4.085640907287598, + "learning_rate": 9.074647229441575e-07, + "loss": 0.9606, + "step": 10521 + }, + { + "epoch": 0.811131668208449, + "grad_norm": 3.6280179023742676, + "learning_rate": 9.067476219823973e-07, + "loss": 0.963, + "step": 10522 + }, + { + "epoch": 0.811208757323466, + "grad_norm": 4.181948184967041, + "learning_rate": 9.060307762137233e-07, + "loss": 0.9805, + "step": 10523 + }, + { + "epoch": 0.8112858464384829, + "grad_norm": 3.7412025928497314, + "learning_rate": 9.053141856828274e-07, + "loss": 0.888, + "step": 10524 + }, + { + "epoch": 0.8113629355534998, + "grad_norm": 3.9258925914764404, + "learning_rate": 9.04597850434385e-07, + "loss": 1.0001, + "step": 10525 + }, + { + "epoch": 0.8114400246685168, + "grad_norm": 3.6146883964538574, + "learning_rate": 9.038817705130559e-07, + "loss": 0.9586, + "step": 10526 + }, + { + "epoch": 0.8115171137835338, + "grad_norm": 3.679476499557495, + "learning_rate": 9.031659459634856e-07, + "loss": 0.9259, + "step": 10527 + }, + { + "epoch": 0.8115942028985508, + "grad_norm": 4.017477512359619, + "learning_rate": 9.024503768302989e-07, + "loss": 0.9798, + "step": 10528 + }, + { + "epoch": 0.8116712920135677, + "grad_norm": 3.543410301208496, + "learning_rate": 9.017350631581095e-07, + "loss": 0.9361, + "step": 10529 + }, + { + "epoch": 0.8117483811285846, + "grad_norm": 3.8175089359283447, + "learning_rate": 9.010200049915141e-07, + "loss": 0.851, + "step": 10530 + }, + { + "epoch": 0.8118254702436016, + "grad_norm": 4.348031997680664, + "learning_rate": 9.003052023750913e-07, + "loss": 0.9564, + "step": 10531 + }, + { + "epoch": 0.8119025593586185, + "grad_norm": 3.7295446395874023, + "learning_rate": 8.995906553534084e-07, + "loss": 0.8679, + "step": 10532 + }, + { + "epoch": 0.8119796484736356, + "grad_norm": 3.961146116256714, + "learning_rate": 8.988763639710097e-07, + "loss": 0.9047, + "step": 10533 + }, + { + "epoch": 0.8120567375886525, + "grad_norm": 3.5502161979675293, + "learning_rate": 8.981623282724295e-07, + "loss": 0.9201, + "step": 10534 + }, + { + "epoch": 0.8121338267036694, + "grad_norm": 4.238675594329834, + "learning_rate": 8.974485483021844e-07, + "loss": 0.8612, + "step": 10535 + }, + { + "epoch": 0.8122109158186864, + "grad_norm": 3.8428282737731934, + "learning_rate": 8.967350241047745e-07, + "loss": 0.887, + "step": 10536 + }, + { + "epoch": 0.8122880049337033, + "grad_norm": 3.5076427459716797, + "learning_rate": 8.960217557246842e-07, + "loss": 0.9022, + "step": 10537 + }, + { + "epoch": 0.8123650940487204, + "grad_norm": 3.7073769569396973, + "learning_rate": 8.953087432063839e-07, + "loss": 0.9396, + "step": 10538 + }, + { + "epoch": 0.8124421831637373, + "grad_norm": 4.131918430328369, + "learning_rate": 8.945959865943238e-07, + "loss": 0.9772, + "step": 10539 + }, + { + "epoch": 0.8125192722787542, + "grad_norm": 3.5746655464172363, + "learning_rate": 8.938834859329415e-07, + "loss": 0.8473, + "step": 10540 + }, + { + "epoch": 0.8125963613937712, + "grad_norm": 3.607383966445923, + "learning_rate": 8.931712412666571e-07, + "loss": 0.9258, + "step": 10541 + }, + { + "epoch": 0.8126734505087881, + "grad_norm": 3.5877065658569336, + "learning_rate": 8.924592526398762e-07, + "loss": 0.8664, + "step": 10542 + }, + { + "epoch": 0.8127505396238052, + "grad_norm": 3.302738904953003, + "learning_rate": 8.917475200969889e-07, + "loss": 0.7732, + "step": 10543 + }, + { + "epoch": 0.8128276287388221, + "grad_norm": 3.6982762813568115, + "learning_rate": 8.910360436823651e-07, + "loss": 0.9946, + "step": 10544 + }, + { + "epoch": 0.812904717853839, + "grad_norm": 3.758657932281494, + "learning_rate": 8.903248234403633e-07, + "loss": 0.8554, + "step": 10545 + }, + { + "epoch": 0.812981806968856, + "grad_norm": 3.797997236251831, + "learning_rate": 8.896138594153242e-07, + "loss": 0.9052, + "step": 10546 + }, + { + "epoch": 0.8130588960838729, + "grad_norm": 4.33210563659668, + "learning_rate": 8.889031516515729e-07, + "loss": 1.022, + "step": 10547 + }, + { + "epoch": 0.81313598519889, + "grad_norm": 3.558067560195923, + "learning_rate": 8.881927001934177e-07, + "loss": 0.829, + "step": 10548 + }, + { + "epoch": 0.8132130743139069, + "grad_norm": 3.977452516555786, + "learning_rate": 8.874825050851532e-07, + "loss": 0.9614, + "step": 10549 + }, + { + "epoch": 0.8132901634289238, + "grad_norm": 4.086385250091553, + "learning_rate": 8.867725663710547e-07, + "loss": 1.0238, + "step": 10550 + }, + { + "epoch": 0.8133672525439408, + "grad_norm": 4.283471584320068, + "learning_rate": 8.860628840953833e-07, + "loss": 0.8701, + "step": 10551 + }, + { + "epoch": 0.8134443416589577, + "grad_norm": 4.118513584136963, + "learning_rate": 8.853534583023843e-07, + "loss": 0.9455, + "step": 10552 + }, + { + "epoch": 0.8135214307739748, + "grad_norm": 3.710076332092285, + "learning_rate": 8.846442890362872e-07, + "loss": 0.8362, + "step": 10553 + }, + { + "epoch": 0.8135985198889917, + "grad_norm": 3.5066654682159424, + "learning_rate": 8.839353763413056e-07, + "loss": 0.8795, + "step": 10554 + }, + { + "epoch": 0.8136756090040086, + "grad_norm": 3.774261236190796, + "learning_rate": 8.832267202616346e-07, + "loss": 0.9621, + "step": 10555 + }, + { + "epoch": 0.8137526981190256, + "grad_norm": 3.7309882640838623, + "learning_rate": 8.825183208414562e-07, + "loss": 0.9751, + "step": 10556 + }, + { + "epoch": 0.8138297872340425, + "grad_norm": 3.934528112411499, + "learning_rate": 8.818101781249355e-07, + "loss": 0.9044, + "step": 10557 + }, + { + "epoch": 0.8139068763490596, + "grad_norm": 3.5573508739471436, + "learning_rate": 8.811022921562218e-07, + "loss": 0.8235, + "step": 10558 + }, + { + "epoch": 0.8139839654640765, + "grad_norm": 3.88106632232666, + "learning_rate": 8.803946629794475e-07, + "loss": 0.9063, + "step": 10559 + }, + { + "epoch": 0.8140610545790934, + "grad_norm": 3.713273048400879, + "learning_rate": 8.796872906387299e-07, + "loss": 0.8801, + "step": 10560 + }, + { + "epoch": 0.8141381436941104, + "grad_norm": 3.7265689373016357, + "learning_rate": 8.789801751781707e-07, + "loss": 0.9106, + "step": 10561 + }, + { + "epoch": 0.8142152328091273, + "grad_norm": 3.56809401512146, + "learning_rate": 8.782733166418539e-07, + "loss": 0.9087, + "step": 10562 + }, + { + "epoch": 0.8142923219241444, + "grad_norm": 4.446261405944824, + "learning_rate": 8.775667150738487e-07, + "loss": 0.9448, + "step": 10563 + }, + { + "epoch": 0.8143694110391613, + "grad_norm": 3.8217625617980957, + "learning_rate": 8.768603705182094e-07, + "loss": 0.8059, + "step": 10564 + }, + { + "epoch": 0.8144465001541782, + "grad_norm": 3.5717344284057617, + "learning_rate": 8.761542830189706e-07, + "loss": 0.859, + "step": 10565 + }, + { + "epoch": 0.8145235892691952, + "grad_norm": 3.8214683532714844, + "learning_rate": 8.754484526201546e-07, + "loss": 0.8514, + "step": 10566 + }, + { + "epoch": 0.8146006783842121, + "grad_norm": 4.17495059967041, + "learning_rate": 8.747428793657658e-07, + "loss": 0.9882, + "step": 10567 + }, + { + "epoch": 0.8146777674992292, + "grad_norm": 4.129594326019287, + "learning_rate": 8.740375632997927e-07, + "loss": 0.9762, + "step": 10568 + }, + { + "epoch": 0.8147548566142461, + "grad_norm": 3.8099358081817627, + "learning_rate": 8.733325044662106e-07, + "loss": 0.9268, + "step": 10569 + }, + { + "epoch": 0.814831945729263, + "grad_norm": 4.046210765838623, + "learning_rate": 8.726277029089725e-07, + "loss": 0.8863, + "step": 10570 + }, + { + "epoch": 0.81490903484428, + "grad_norm": 4.233992099761963, + "learning_rate": 8.719231586720211e-07, + "loss": 0.9588, + "step": 10571 + }, + { + "epoch": 0.8149861239592969, + "grad_norm": 3.290083646774292, + "learning_rate": 8.712188717992814e-07, + "loss": 0.8304, + "step": 10572 + }, + { + "epoch": 0.815063213074314, + "grad_norm": 3.646782875061035, + "learning_rate": 8.70514842334661e-07, + "loss": 0.9994, + "step": 10573 + }, + { + "epoch": 0.8151403021893309, + "grad_norm": 3.9229447841644287, + "learning_rate": 8.69811070322053e-07, + "loss": 0.8786, + "step": 10574 + }, + { + "epoch": 0.8152173913043478, + "grad_norm": 3.7923243045806885, + "learning_rate": 8.691075558053353e-07, + "loss": 0.936, + "step": 10575 + }, + { + "epoch": 0.8152944804193648, + "grad_norm": 3.350618600845337, + "learning_rate": 8.684042988283659e-07, + "loss": 0.858, + "step": 10576 + }, + { + "epoch": 0.8153715695343817, + "grad_norm": 4.026613235473633, + "learning_rate": 8.677012994349904e-07, + "loss": 0.9224, + "step": 10577 + }, + { + "epoch": 0.8154486586493987, + "grad_norm": 3.9146225452423096, + "learning_rate": 8.669985576690371e-07, + "loss": 0.8832, + "step": 10578 + }, + { + "epoch": 0.8155257477644157, + "grad_norm": 3.648346185684204, + "learning_rate": 8.662960735743181e-07, + "loss": 0.7847, + "step": 10579 + }, + { + "epoch": 0.8156028368794326, + "grad_norm": 3.5210020542144775, + "learning_rate": 8.655938471946313e-07, + "loss": 0.8185, + "step": 10580 + }, + { + "epoch": 0.8156799259944496, + "grad_norm": 4.143526554107666, + "learning_rate": 8.648918785737542e-07, + "loss": 0.8891, + "step": 10581 + }, + { + "epoch": 0.8157570151094665, + "grad_norm": 3.710387706756592, + "learning_rate": 8.641901677554526e-07, + "loss": 0.8862, + "step": 10582 + }, + { + "epoch": 0.8158341042244835, + "grad_norm": 3.5736660957336426, + "learning_rate": 8.634887147834736e-07, + "loss": 0.8868, + "step": 10583 + }, + { + "epoch": 0.8159111933395005, + "grad_norm": 3.8351974487304688, + "learning_rate": 8.6278751970155e-07, + "loss": 0.9472, + "step": 10584 + }, + { + "epoch": 0.8159882824545174, + "grad_norm": 3.8771169185638428, + "learning_rate": 8.620865825533975e-07, + "loss": 0.9296, + "step": 10585 + }, + { + "epoch": 0.8160653715695344, + "grad_norm": 3.649178981781006, + "learning_rate": 8.613859033827165e-07, + "loss": 0.9505, + "step": 10586 + }, + { + "epoch": 0.8161424606845513, + "grad_norm": 3.742682456970215, + "learning_rate": 8.606854822331895e-07, + "loss": 0.8974, + "step": 10587 + }, + { + "epoch": 0.8162195497995683, + "grad_norm": 4.2910237312316895, + "learning_rate": 8.599853191484842e-07, + "loss": 0.9299, + "step": 10588 + }, + { + "epoch": 0.8162966389145853, + "grad_norm": 3.860732078552246, + "learning_rate": 8.592854141722523e-07, + "loss": 1.0105, + "step": 10589 + }, + { + "epoch": 0.8163737280296022, + "grad_norm": 3.7548787593841553, + "learning_rate": 8.585857673481302e-07, + "loss": 0.9132, + "step": 10590 + }, + { + "epoch": 0.8164508171446192, + "grad_norm": 3.929503917694092, + "learning_rate": 8.578863787197372e-07, + "loss": 0.9521, + "step": 10591 + }, + { + "epoch": 0.8165279062596361, + "grad_norm": 4.102705955505371, + "learning_rate": 8.571872483306748e-07, + "loss": 0.9715, + "step": 10592 + }, + { + "epoch": 0.8166049953746531, + "grad_norm": 3.499938488006592, + "learning_rate": 8.564883762245313e-07, + "loss": 0.9841, + "step": 10593 + }, + { + "epoch": 0.8166820844896701, + "grad_norm": 4.210318565368652, + "learning_rate": 8.55789762444878e-07, + "loss": 0.9848, + "step": 10594 + }, + { + "epoch": 0.816759173604687, + "grad_norm": 3.527986526489258, + "learning_rate": 8.550914070352695e-07, + "loss": 0.9264, + "step": 10595 + }, + { + "epoch": 0.816836262719704, + "grad_norm": 3.987180471420288, + "learning_rate": 8.54393310039246e-07, + "loss": 0.9992, + "step": 10596 + }, + { + "epoch": 0.8169133518347209, + "grad_norm": 3.61505389213562, + "learning_rate": 8.536954715003276e-07, + "loss": 0.8168, + "step": 10597 + }, + { + "epoch": 0.8169904409497379, + "grad_norm": 3.735048294067383, + "learning_rate": 8.529978914620219e-07, + "loss": 0.8828, + "step": 10598 + }, + { + "epoch": 0.8170675300647549, + "grad_norm": 4.112254619598389, + "learning_rate": 8.523005699678205e-07, + "loss": 0.9597, + "step": 10599 + }, + { + "epoch": 0.8171446191797718, + "grad_norm": 4.099547386169434, + "learning_rate": 8.516035070611967e-07, + "loss": 0.9305, + "step": 10600 + }, + { + "epoch": 0.8172217082947888, + "grad_norm": 3.74941086769104, + "learning_rate": 8.509067027856093e-07, + "loss": 0.8205, + "step": 10601 + }, + { + "epoch": 0.8172987974098057, + "grad_norm": 3.4929075241088867, + "learning_rate": 8.502101571845001e-07, + "loss": 0.9547, + "step": 10602 + }, + { + "epoch": 0.8173758865248227, + "grad_norm": 4.058771133422852, + "learning_rate": 8.495138703012957e-07, + "loss": 0.983, + "step": 10603 + }, + { + "epoch": 0.8174529756398397, + "grad_norm": 3.9835104942321777, + "learning_rate": 8.488178421794047e-07, + "loss": 0.9632, + "step": 10604 + }, + { + "epoch": 0.8175300647548566, + "grad_norm": 4.225916862487793, + "learning_rate": 8.481220728622225e-07, + "loss": 0.9406, + "step": 10605 + }, + { + "epoch": 0.8176071538698736, + "grad_norm": 3.349649429321289, + "learning_rate": 8.474265623931272e-07, + "loss": 0.7666, + "step": 10606 + }, + { + "epoch": 0.8176842429848905, + "grad_norm": 3.722669839859009, + "learning_rate": 8.467313108154773e-07, + "loss": 0.939, + "step": 10607 + }, + { + "epoch": 0.8177613320999075, + "grad_norm": 3.4461019039154053, + "learning_rate": 8.4603631817262e-07, + "loss": 0.8237, + "step": 10608 + }, + { + "epoch": 0.8178384212149244, + "grad_norm": 3.774502992630005, + "learning_rate": 8.453415845078844e-07, + "loss": 0.8583, + "step": 10609 + }, + { + "epoch": 0.8179155103299414, + "grad_norm": 3.614598512649536, + "learning_rate": 8.446471098645831e-07, + "loss": 0.9551, + "step": 10610 + }, + { + "epoch": 0.8179925994449584, + "grad_norm": 3.7169055938720703, + "learning_rate": 8.439528942860137e-07, + "loss": 0.8629, + "step": 10611 + }, + { + "epoch": 0.8180696885599753, + "grad_norm": 3.7345728874206543, + "learning_rate": 8.432589378154582e-07, + "loss": 0.9106, + "step": 10612 + }, + { + "epoch": 0.8181467776749923, + "grad_norm": 4.463329315185547, + "learning_rate": 8.425652404961781e-07, + "loss": 0.9058, + "step": 10613 + }, + { + "epoch": 0.8182238667900092, + "grad_norm": 3.3270933628082275, + "learning_rate": 8.418718023714235e-07, + "loss": 0.8002, + "step": 10614 + }, + { + "epoch": 0.8183009559050262, + "grad_norm": 3.8927876949310303, + "learning_rate": 8.41178623484426e-07, + "loss": 0.8976, + "step": 10615 + }, + { + "epoch": 0.8183780450200432, + "grad_norm": 3.9593920707702637, + "learning_rate": 8.404857038784026e-07, + "loss": 0.9744, + "step": 10616 + }, + { + "epoch": 0.8184551341350601, + "grad_norm": 3.8429269790649414, + "learning_rate": 8.39793043596554e-07, + "loss": 0.8321, + "step": 10617 + }, + { + "epoch": 0.8185322232500771, + "grad_norm": 3.442072629928589, + "learning_rate": 8.391006426820619e-07, + "loss": 0.8208, + "step": 10618 + }, + { + "epoch": 0.818609312365094, + "grad_norm": 3.8619630336761475, + "learning_rate": 8.384085011780946e-07, + "loss": 0.8541, + "step": 10619 + }, + { + "epoch": 0.818686401480111, + "grad_norm": 3.815845489501953, + "learning_rate": 8.377166191278036e-07, + "loss": 0.9521, + "step": 10620 + }, + { + "epoch": 0.818763490595128, + "grad_norm": 3.9525768756866455, + "learning_rate": 8.370249965743249e-07, + "loss": 0.9698, + "step": 10621 + }, + { + "epoch": 0.8188405797101449, + "grad_norm": 3.71791934967041, + "learning_rate": 8.363336335607769e-07, + "loss": 1.037, + "step": 10622 + }, + { + "epoch": 0.8189176688251619, + "grad_norm": 3.4764459133148193, + "learning_rate": 8.356425301302639e-07, + "loss": 0.8571, + "step": 10623 + }, + { + "epoch": 0.8189947579401788, + "grad_norm": 4.280252933502197, + "learning_rate": 8.3495168632587e-07, + "loss": 1.067, + "step": 10624 + }, + { + "epoch": 0.8190718470551958, + "grad_norm": 3.945054769515991, + "learning_rate": 8.342611021906672e-07, + "loss": 0.9154, + "step": 10625 + }, + { + "epoch": 0.8191489361702128, + "grad_norm": 3.6436848640441895, + "learning_rate": 8.335707777677099e-07, + "loss": 0.8429, + "step": 10626 + }, + { + "epoch": 0.8192260252852297, + "grad_norm": 3.7490007877349854, + "learning_rate": 8.32880713100036e-07, + "loss": 0.9217, + "step": 10627 + }, + { + "epoch": 0.8193031144002467, + "grad_norm": 3.5114471912384033, + "learning_rate": 8.321909082306684e-07, + "loss": 0.9029, + "step": 10628 + }, + { + "epoch": 0.8193802035152636, + "grad_norm": 3.8217241764068604, + "learning_rate": 8.315013632026114e-07, + "loss": 0.861, + "step": 10629 + }, + { + "epoch": 0.8194572926302806, + "grad_norm": 3.3438620567321777, + "learning_rate": 8.308120780588553e-07, + "loss": 0.7675, + "step": 10630 + }, + { + "epoch": 0.8195343817452976, + "grad_norm": 4.367412090301514, + "learning_rate": 8.301230528423726e-07, + "loss": 0.934, + "step": 10631 + }, + { + "epoch": 0.8196114708603145, + "grad_norm": 4.256845951080322, + "learning_rate": 8.29434287596122e-07, + "loss": 0.9082, + "step": 10632 + }, + { + "epoch": 0.8196885599753315, + "grad_norm": 3.7340166568756104, + "learning_rate": 8.287457823630429e-07, + "loss": 0.9485, + "step": 10633 + }, + { + "epoch": 0.8197656490903484, + "grad_norm": 3.8312957286834717, + "learning_rate": 8.280575371860627e-07, + "loss": 0.8524, + "step": 10634 + }, + { + "epoch": 0.8198427382053654, + "grad_norm": 3.471848726272583, + "learning_rate": 8.273695521080866e-07, + "loss": 0.8596, + "step": 10635 + }, + { + "epoch": 0.8199198273203824, + "grad_norm": 3.743417978286743, + "learning_rate": 8.266818271720078e-07, + "loss": 0.984, + "step": 10636 + }, + { + "epoch": 0.8199969164353993, + "grad_norm": 3.6192376613616943, + "learning_rate": 8.259943624207034e-07, + "loss": 0.9746, + "step": 10637 + }, + { + "epoch": 0.8200740055504163, + "grad_norm": 3.581130027770996, + "learning_rate": 8.253071578970329e-07, + "loss": 0.8544, + "step": 10638 + }, + { + "epoch": 0.8201510946654332, + "grad_norm": 3.57409405708313, + "learning_rate": 8.246202136438408e-07, + "loss": 0.7728, + "step": 10639 + }, + { + "epoch": 0.8202281837804501, + "grad_norm": 3.536517858505249, + "learning_rate": 8.239335297039525e-07, + "loss": 0.8483, + "step": 10640 + }, + { + "epoch": 0.8203052728954672, + "grad_norm": 3.8591723442077637, + "learning_rate": 8.232471061201797e-07, + "loss": 0.9938, + "step": 10641 + }, + { + "epoch": 0.8203823620104841, + "grad_norm": 3.6102797985076904, + "learning_rate": 8.225609429353187e-07, + "loss": 0.9334, + "step": 10642 + }, + { + "epoch": 0.8204594511255011, + "grad_norm": 3.6389827728271484, + "learning_rate": 8.218750401921466e-07, + "loss": 0.8946, + "step": 10643 + }, + { + "epoch": 0.820536540240518, + "grad_norm": 3.4284937381744385, + "learning_rate": 8.211893979334284e-07, + "loss": 0.7989, + "step": 10644 + }, + { + "epoch": 0.820613629355535, + "grad_norm": 3.6757824420928955, + "learning_rate": 8.205040162019073e-07, + "loss": 0.8724, + "step": 10645 + }, + { + "epoch": 0.820690718470552, + "grad_norm": 4.049697399139404, + "learning_rate": 8.198188950403146e-07, + "loss": 0.9266, + "step": 10646 + }, + { + "epoch": 0.8207678075855689, + "grad_norm": 3.7433371543884277, + "learning_rate": 8.191340344913629e-07, + "loss": 0.9927, + "step": 10647 + }, + { + "epoch": 0.8208448967005859, + "grad_norm": 3.705571174621582, + "learning_rate": 8.184494345977517e-07, + "loss": 0.8741, + "step": 10648 + }, + { + "epoch": 0.8209219858156028, + "grad_norm": 3.779428482055664, + "learning_rate": 8.177650954021632e-07, + "loss": 0.9699, + "step": 10649 + }, + { + "epoch": 0.8209990749306197, + "grad_norm": 3.6336560249328613, + "learning_rate": 8.170810169472593e-07, + "loss": 0.8423, + "step": 10650 + }, + { + "epoch": 0.8210761640456368, + "grad_norm": 3.8812954425811768, + "learning_rate": 8.163971992756897e-07, + "loss": 0.9858, + "step": 10651 + }, + { + "epoch": 0.8211532531606537, + "grad_norm": 3.7961621284484863, + "learning_rate": 8.157136424300877e-07, + "loss": 0.9314, + "step": 10652 + }, + { + "epoch": 0.8212303422756707, + "grad_norm": 3.7296488285064697, + "learning_rate": 8.15030346453069e-07, + "loss": 1.0059, + "step": 10653 + }, + { + "epoch": 0.8213074313906876, + "grad_norm": 3.9131686687469482, + "learning_rate": 8.143473113872353e-07, + "loss": 0.8374, + "step": 10654 + }, + { + "epoch": 0.8213845205057045, + "grad_norm": 4.030622482299805, + "learning_rate": 8.136645372751667e-07, + "loss": 0.8713, + "step": 10655 + }, + { + "epoch": 0.8214616096207216, + "grad_norm": 3.774033308029175, + "learning_rate": 8.129820241594333e-07, + "loss": 0.9483, + "step": 10656 + }, + { + "epoch": 0.8215386987357385, + "grad_norm": 3.8504068851470947, + "learning_rate": 8.122997720825853e-07, + "loss": 1.045, + "step": 10657 + }, + { + "epoch": 0.8216157878507555, + "grad_norm": 3.6778900623321533, + "learning_rate": 8.116177810871578e-07, + "loss": 0.8602, + "step": 10658 + }, + { + "epoch": 0.8216928769657724, + "grad_norm": 3.6927058696746826, + "learning_rate": 8.109360512156695e-07, + "loss": 0.9418, + "step": 10659 + }, + { + "epoch": 0.8217699660807893, + "grad_norm": 3.3587496280670166, + "learning_rate": 8.10254582510624e-07, + "loss": 0.8898, + "step": 10660 + }, + { + "epoch": 0.8218470551958064, + "grad_norm": 3.8647491931915283, + "learning_rate": 8.095733750145046e-07, + "loss": 0.9488, + "step": 10661 + }, + { + "epoch": 0.8219241443108233, + "grad_norm": 3.846730947494507, + "learning_rate": 8.088924287697824e-07, + "loss": 0.9386, + "step": 10662 + }, + { + "epoch": 0.8220012334258403, + "grad_norm": 3.8304007053375244, + "learning_rate": 8.082117438189113e-07, + "loss": 0.9066, + "step": 10663 + }, + { + "epoch": 0.8220783225408572, + "grad_norm": 3.793449878692627, + "learning_rate": 8.075313202043277e-07, + "loss": 0.8526, + "step": 10664 + }, + { + "epoch": 0.8221554116558741, + "grad_norm": 4.21845006942749, + "learning_rate": 8.068511579684546e-07, + "loss": 1.0152, + "step": 10665 + }, + { + "epoch": 0.8222325007708912, + "grad_norm": 3.6791608333587646, + "learning_rate": 8.06171257153694e-07, + "loss": 0.9214, + "step": 10666 + }, + { + "epoch": 0.8223095898859081, + "grad_norm": 3.7897164821624756, + "learning_rate": 8.054916178024347e-07, + "loss": 0.9361, + "step": 10667 + }, + { + "epoch": 0.8223866790009251, + "grad_norm": 3.942296266555786, + "learning_rate": 8.04812239957049e-07, + "loss": 1.037, + "step": 10668 + }, + { + "epoch": 0.822463768115942, + "grad_norm": 3.920706272125244, + "learning_rate": 8.041331236598926e-07, + "loss": 0.8321, + "step": 10669 + }, + { + "epoch": 0.822540857230959, + "grad_norm": 3.6586873531341553, + "learning_rate": 8.034542689533053e-07, + "loss": 0.9324, + "step": 10670 + }, + { + "epoch": 0.822617946345976, + "grad_norm": 3.7215545177459717, + "learning_rate": 8.027756758796107e-07, + "loss": 0.884, + "step": 10671 + }, + { + "epoch": 0.8226950354609929, + "grad_norm": 3.4907851219177246, + "learning_rate": 8.020973444811142e-07, + "loss": 0.801, + "step": 10672 + }, + { + "epoch": 0.8227721245760099, + "grad_norm": 3.6691055297851562, + "learning_rate": 8.014192748001059e-07, + "loss": 0.9102, + "step": 10673 + }, + { + "epoch": 0.8228492136910268, + "grad_norm": 3.8015921115875244, + "learning_rate": 8.007414668788616e-07, + "loss": 0.924, + "step": 10674 + }, + { + "epoch": 0.8229263028060438, + "grad_norm": 3.784499168395996, + "learning_rate": 8.000639207596377e-07, + "loss": 0.928, + "step": 10675 + }, + { + "epoch": 0.8230033919210608, + "grad_norm": 3.817927122116089, + "learning_rate": 7.993866364846781e-07, + "loss": 0.892, + "step": 10676 + }, + { + "epoch": 0.8230804810360777, + "grad_norm": 3.8663864135742188, + "learning_rate": 7.987096140962047e-07, + "loss": 0.8445, + "step": 10677 + }, + { + "epoch": 0.8231575701510947, + "grad_norm": 4.127407073974609, + "learning_rate": 7.980328536364279e-07, + "loss": 0.914, + "step": 10678 + }, + { + "epoch": 0.8232346592661116, + "grad_norm": 3.5715925693511963, + "learning_rate": 7.9735635514754e-07, + "loss": 0.924, + "step": 10679 + }, + { + "epoch": 0.8233117483811286, + "grad_norm": 3.3841607570648193, + "learning_rate": 7.966801186717172e-07, + "loss": 0.805, + "step": 10680 + }, + { + "epoch": 0.8233888374961456, + "grad_norm": 3.75689435005188, + "learning_rate": 7.960041442511196e-07, + "loss": 0.8059, + "step": 10681 + }, + { + "epoch": 0.8234659266111625, + "grad_norm": 3.8247830867767334, + "learning_rate": 7.95328431927892e-07, + "loss": 0.9118, + "step": 10682 + }, + { + "epoch": 0.8235430157261795, + "grad_norm": 4.0702385902404785, + "learning_rate": 7.946529817441584e-07, + "loss": 0.9658, + "step": 10683 + }, + { + "epoch": 0.8236201048411964, + "grad_norm": 3.605431318283081, + "learning_rate": 7.939777937420317e-07, + "loss": 0.9196, + "step": 10684 + }, + { + "epoch": 0.8236971939562134, + "grad_norm": 3.7044453620910645, + "learning_rate": 7.933028679636057e-07, + "loss": 0.908, + "step": 10685 + }, + { + "epoch": 0.8237742830712304, + "grad_norm": 3.886737585067749, + "learning_rate": 7.926282044509593e-07, + "loss": 0.9069, + "step": 10686 + }, + { + "epoch": 0.8238513721862473, + "grad_norm": 3.6313858032226562, + "learning_rate": 7.919538032461544e-07, + "loss": 0.9743, + "step": 10687 + }, + { + "epoch": 0.8239284613012643, + "grad_norm": 3.607703924179077, + "learning_rate": 7.912796643912352e-07, + "loss": 0.8587, + "step": 10688 + }, + { + "epoch": 0.8240055504162812, + "grad_norm": 3.6826751232147217, + "learning_rate": 7.906057879282314e-07, + "loss": 0.9056, + "step": 10689 + }, + { + "epoch": 0.8240826395312982, + "grad_norm": 4.050000190734863, + "learning_rate": 7.89932173899155e-07, + "loss": 0.8867, + "step": 10690 + }, + { + "epoch": 0.8241597286463151, + "grad_norm": 3.8793132305145264, + "learning_rate": 7.89258822346004e-07, + "loss": 0.8438, + "step": 10691 + }, + { + "epoch": 0.8242368177613321, + "grad_norm": 3.8853375911712646, + "learning_rate": 7.885857333107566e-07, + "loss": 0.8774, + "step": 10692 + }, + { + "epoch": 0.8243139068763491, + "grad_norm": 3.8986237049102783, + "learning_rate": 7.87912906835378e-07, + "loss": 0.9329, + "step": 10693 + }, + { + "epoch": 0.824390995991366, + "grad_norm": 3.8146276473999023, + "learning_rate": 7.872403429618141e-07, + "loss": 0.9111, + "step": 10694 + }, + { + "epoch": 0.824468085106383, + "grad_norm": 3.530385732650757, + "learning_rate": 7.865680417319965e-07, + "loss": 0.9361, + "step": 10695 + }, + { + "epoch": 0.8245451742214, + "grad_norm": 3.7008519172668457, + "learning_rate": 7.858960031878399e-07, + "loss": 0.905, + "step": 10696 + }, + { + "epoch": 0.8246222633364169, + "grad_norm": 3.490631103515625, + "learning_rate": 7.852242273712429e-07, + "loss": 0.8935, + "step": 10697 + }, + { + "epoch": 0.8246993524514339, + "grad_norm": 4.064510345458984, + "learning_rate": 7.845527143240856e-07, + "loss": 0.93, + "step": 10698 + }, + { + "epoch": 0.8247764415664508, + "grad_norm": 3.9456470012664795, + "learning_rate": 7.838814640882342e-07, + "loss": 0.9109, + "step": 10699 + }, + { + "epoch": 0.8248535306814678, + "grad_norm": 4.111172676086426, + "learning_rate": 7.832104767055376e-07, + "loss": 0.9342, + "step": 10700 + }, + { + "epoch": 0.8249306197964847, + "grad_norm": 3.4403626918792725, + "learning_rate": 7.82539752217828e-07, + "loss": 0.972, + "step": 10701 + }, + { + "epoch": 0.8250077089115017, + "grad_norm": 3.8715031147003174, + "learning_rate": 7.81869290666924e-07, + "loss": 0.7921, + "step": 10702 + }, + { + "epoch": 0.8250847980265187, + "grad_norm": 3.712979793548584, + "learning_rate": 7.811990920946216e-07, + "loss": 1.0237, + "step": 10703 + }, + { + "epoch": 0.8251618871415356, + "grad_norm": 3.841836452484131, + "learning_rate": 7.805291565427065e-07, + "loss": 1.0109, + "step": 10704 + }, + { + "epoch": 0.8252389762565526, + "grad_norm": 3.964392900466919, + "learning_rate": 7.79859484052945e-07, + "loss": 0.8749, + "step": 10705 + }, + { + "epoch": 0.8253160653715695, + "grad_norm": 3.964123010635376, + "learning_rate": 7.791900746670883e-07, + "loss": 0.9281, + "step": 10706 + }, + { + "epoch": 0.8253931544865865, + "grad_norm": 3.7393078804016113, + "learning_rate": 7.785209284268702e-07, + "loss": 0.8826, + "step": 10707 + }, + { + "epoch": 0.8254702436016035, + "grad_norm": 3.9628257751464844, + "learning_rate": 7.778520453740096e-07, + "loss": 0.9923, + "step": 10708 + }, + { + "epoch": 0.8255473327166204, + "grad_norm": 3.574497699737549, + "learning_rate": 7.771834255502059e-07, + "loss": 0.8425, + "step": 10709 + }, + { + "epoch": 0.8256244218316374, + "grad_norm": 3.801563262939453, + "learning_rate": 7.765150689971451e-07, + "loss": 0.8815, + "step": 10710 + }, + { + "epoch": 0.8257015109466543, + "grad_norm": 3.759230613708496, + "learning_rate": 7.758469757564957e-07, + "loss": 0.8957, + "step": 10711 + }, + { + "epoch": 0.8257786000616713, + "grad_norm": 3.4054291248321533, + "learning_rate": 7.751791458699099e-07, + "loss": 0.8231, + "step": 10712 + }, + { + "epoch": 0.8258556891766883, + "grad_norm": 3.554560422897339, + "learning_rate": 7.745115793790247e-07, + "loss": 0.9277, + "step": 10713 + }, + { + "epoch": 0.8259327782917052, + "grad_norm": 3.6641149520874023, + "learning_rate": 7.738442763254573e-07, + "loss": 0.9879, + "step": 10714 + }, + { + "epoch": 0.8260098674067222, + "grad_norm": 3.7249083518981934, + "learning_rate": 7.731772367508111e-07, + "loss": 0.8303, + "step": 10715 + }, + { + "epoch": 0.8260869565217391, + "grad_norm": 3.779303789138794, + "learning_rate": 7.725104606966727e-07, + "loss": 0.9721, + "step": 10716 + }, + { + "epoch": 0.826164045636756, + "grad_norm": 3.8608760833740234, + "learning_rate": 7.718439482046125e-07, + "loss": 0.871, + "step": 10717 + }, + { + "epoch": 0.8262411347517731, + "grad_norm": 3.9529666900634766, + "learning_rate": 7.711776993161841e-07, + "loss": 1.0389, + "step": 10718 + }, + { + "epoch": 0.82631822386679, + "grad_norm": 3.6020145416259766, + "learning_rate": 7.705117140729257e-07, + "loss": 0.7875, + "step": 10719 + }, + { + "epoch": 0.826395312981807, + "grad_norm": 4.178860664367676, + "learning_rate": 7.698459925163554e-07, + "loss": 1.0956, + "step": 10720 + }, + { + "epoch": 0.8264724020968239, + "grad_norm": 3.685406446456909, + "learning_rate": 7.691805346879794e-07, + "loss": 0.9837, + "step": 10721 + }, + { + "epoch": 0.8265494912118408, + "grad_norm": 3.7086429595947266, + "learning_rate": 7.685153406292845e-07, + "loss": 1.0095, + "step": 10722 + }, + { + "epoch": 0.8266265803268579, + "grad_norm": 3.816329002380371, + "learning_rate": 7.67850410381743e-07, + "loss": 0.9181, + "step": 10723 + }, + { + "epoch": 0.8267036694418748, + "grad_norm": 3.774265766143799, + "learning_rate": 7.671857439868107e-07, + "loss": 0.8757, + "step": 10724 + }, + { + "epoch": 0.8267807585568918, + "grad_norm": 4.03645133972168, + "learning_rate": 7.665213414859235e-07, + "loss": 0.9412, + "step": 10725 + }, + { + "epoch": 0.8268578476719087, + "grad_norm": 3.7689404487609863, + "learning_rate": 7.658572029205052e-07, + "loss": 0.8735, + "step": 10726 + }, + { + "epoch": 0.8269349367869256, + "grad_norm": 3.5467817783355713, + "learning_rate": 7.651933283319613e-07, + "loss": 0.8445, + "step": 10727 + }, + { + "epoch": 0.8270120259019427, + "grad_norm": 4.0023088455200195, + "learning_rate": 7.645297177616806e-07, + "loss": 0.9186, + "step": 10728 + }, + { + "epoch": 0.8270891150169596, + "grad_norm": 3.534449577331543, + "learning_rate": 7.638663712510358e-07, + "loss": 0.8886, + "step": 10729 + }, + { + "epoch": 0.8271662041319766, + "grad_norm": 3.6895134449005127, + "learning_rate": 7.632032888413848e-07, + "loss": 0.798, + "step": 10730 + }, + { + "epoch": 0.8272432932469935, + "grad_norm": 3.8588826656341553, + "learning_rate": 7.625404705740641e-07, + "loss": 0.8937, + "step": 10731 + }, + { + "epoch": 0.8273203823620104, + "grad_norm": 3.6557419300079346, + "learning_rate": 7.618779164903988e-07, + "loss": 0.8956, + "step": 10732 + }, + { + "epoch": 0.8273974714770275, + "grad_norm": 3.7859385013580322, + "learning_rate": 7.612156266316962e-07, + "loss": 0.927, + "step": 10733 + }, + { + "epoch": 0.8274745605920444, + "grad_norm": 3.476879358291626, + "learning_rate": 7.60553601039245e-07, + "loss": 0.869, + "step": 10734 + }, + { + "epoch": 0.8275516497070614, + "grad_norm": 3.6959054470062256, + "learning_rate": 7.598918397543209e-07, + "loss": 0.8605, + "step": 10735 + }, + { + "epoch": 0.8276287388220783, + "grad_norm": 3.761538505554199, + "learning_rate": 7.592303428181802e-07, + "loss": 0.9357, + "step": 10736 + }, + { + "epoch": 0.8277058279370952, + "grad_norm": 3.5777299404144287, + "learning_rate": 7.585691102720643e-07, + "loss": 0.8868, + "step": 10737 + }, + { + "epoch": 0.8277829170521123, + "grad_norm": 3.6994411945343018, + "learning_rate": 7.579081421571976e-07, + "loss": 0.8553, + "step": 10738 + }, + { + "epoch": 0.8278600061671292, + "grad_norm": 3.478304862976074, + "learning_rate": 7.572474385147888e-07, + "loss": 0.8501, + "step": 10739 + }, + { + "epoch": 0.8279370952821462, + "grad_norm": 3.7842917442321777, + "learning_rate": 7.56586999386027e-07, + "loss": 0.7966, + "step": 10740 + }, + { + "epoch": 0.8280141843971631, + "grad_norm": 3.9170117378234863, + "learning_rate": 7.559268248120887e-07, + "loss": 1.0096, + "step": 10741 + }, + { + "epoch": 0.82809127351218, + "grad_norm": 3.4382944107055664, + "learning_rate": 7.55266914834133e-07, + "loss": 0.8735, + "step": 10742 + }, + { + "epoch": 0.8281683626271971, + "grad_norm": 3.6370790004730225, + "learning_rate": 7.546072694933004e-07, + "loss": 0.9112, + "step": 10743 + }, + { + "epoch": 0.828245451742214, + "grad_norm": 4.061718463897705, + "learning_rate": 7.539478888307173e-07, + "loss": 1.021, + "step": 10744 + }, + { + "epoch": 0.828322540857231, + "grad_norm": 3.542180061340332, + "learning_rate": 7.532887728874933e-07, + "loss": 0.8531, + "step": 10745 + }, + { + "epoch": 0.8283996299722479, + "grad_norm": 3.725696325302124, + "learning_rate": 7.526299217047195e-07, + "loss": 1.0043, + "step": 10746 + }, + { + "epoch": 0.8284767190872648, + "grad_norm": 3.6887924671173096, + "learning_rate": 7.51971335323472e-07, + "loss": 0.9136, + "step": 10747 + }, + { + "epoch": 0.8285538082022819, + "grad_norm": 3.769517660140991, + "learning_rate": 7.513130137848101e-07, + "loss": 0.8516, + "step": 10748 + }, + { + "epoch": 0.8286308973172988, + "grad_norm": 3.488903522491455, + "learning_rate": 7.506549571297783e-07, + "loss": 0.7989, + "step": 10749 + }, + { + "epoch": 0.8287079864323158, + "grad_norm": 3.627612352371216, + "learning_rate": 7.499971653994026e-07, + "loss": 0.8622, + "step": 10750 + }, + { + "epoch": 0.8287850755473327, + "grad_norm": 3.5904018878936768, + "learning_rate": 7.493396386346913e-07, + "loss": 0.8568, + "step": 10751 + }, + { + "epoch": 0.8288621646623496, + "grad_norm": 3.8833725452423096, + "learning_rate": 7.486823768766388e-07, + "loss": 0.8596, + "step": 10752 + }, + { + "epoch": 0.8289392537773667, + "grad_norm": 3.913623809814453, + "learning_rate": 7.480253801662219e-07, + "loss": 1.0065, + "step": 10753 + }, + { + "epoch": 0.8290163428923836, + "grad_norm": 3.8650975227355957, + "learning_rate": 7.473686485444009e-07, + "loss": 0.8731, + "step": 10754 + }, + { + "epoch": 0.8290934320074006, + "grad_norm": 3.8849239349365234, + "learning_rate": 7.467121820521194e-07, + "loss": 0.9002, + "step": 10755 + }, + { + "epoch": 0.8291705211224175, + "grad_norm": 3.877471685409546, + "learning_rate": 7.46055980730307e-07, + "loss": 0.8938, + "step": 10756 + }, + { + "epoch": 0.8292476102374344, + "grad_norm": 3.7837424278259277, + "learning_rate": 7.454000446198706e-07, + "loss": 0.9011, + "step": 10757 + }, + { + "epoch": 0.8293246993524515, + "grad_norm": 3.6548290252685547, + "learning_rate": 7.447443737617066e-07, + "loss": 0.9058, + "step": 10758 + }, + { + "epoch": 0.8294017884674684, + "grad_norm": 3.6717207431793213, + "learning_rate": 7.440889681966923e-07, + "loss": 0.9047, + "step": 10759 + }, + { + "epoch": 0.8294788775824854, + "grad_norm": 3.3943278789520264, + "learning_rate": 7.434338279656889e-07, + "loss": 0.8617, + "step": 10760 + }, + { + "epoch": 0.8295559666975023, + "grad_norm": 3.89233660697937, + "learning_rate": 7.427789531095425e-07, + "loss": 0.9176, + "step": 10761 + }, + { + "epoch": 0.8296330558125192, + "grad_norm": 3.8601460456848145, + "learning_rate": 7.421243436690778e-07, + "loss": 0.8272, + "step": 10762 + }, + { + "epoch": 0.8297101449275363, + "grad_norm": 3.9159624576568604, + "learning_rate": 7.414699996851088e-07, + "loss": 0.977, + "step": 10763 + }, + { + "epoch": 0.8297872340425532, + "grad_norm": 3.4939470291137695, + "learning_rate": 7.408159211984294e-07, + "loss": 0.8306, + "step": 10764 + }, + { + "epoch": 0.8298643231575702, + "grad_norm": 3.7393009662628174, + "learning_rate": 7.401621082498189e-07, + "loss": 0.9836, + "step": 10765 + }, + { + "epoch": 0.8299414122725871, + "grad_norm": 3.700958013534546, + "learning_rate": 7.395085608800384e-07, + "loss": 0.854, + "step": 10766 + }, + { + "epoch": 0.830018501387604, + "grad_norm": 4.00097131729126, + "learning_rate": 7.388552791298343e-07, + "loss": 0.9573, + "step": 10767 + }, + { + "epoch": 0.830095590502621, + "grad_norm": 3.7124149799346924, + "learning_rate": 7.382022630399338e-07, + "loss": 0.925, + "step": 10768 + }, + { + "epoch": 0.830172679617638, + "grad_norm": 3.643216371536255, + "learning_rate": 7.375495126510496e-07, + "loss": 0.8348, + "step": 10769 + }, + { + "epoch": 0.830249768732655, + "grad_norm": 11.0379638671875, + "learning_rate": 7.368970280038778e-07, + "loss": 0.8255, + "step": 10770 + }, + { + "epoch": 0.8303268578476719, + "grad_norm": 3.5740654468536377, + "learning_rate": 7.362448091390966e-07, + "loss": 0.8926, + "step": 10771 + }, + { + "epoch": 0.8304039469626888, + "grad_norm": 3.759594440460205, + "learning_rate": 7.355928560973707e-07, + "loss": 0.8353, + "step": 10772 + }, + { + "epoch": 0.8304810360777058, + "grad_norm": 3.6525447368621826, + "learning_rate": 7.349411689193426e-07, + "loss": 0.8027, + "step": 10773 + }, + { + "epoch": 0.8305581251927228, + "grad_norm": 3.7419726848602295, + "learning_rate": 7.34289747645644e-07, + "loss": 0.8801, + "step": 10774 + }, + { + "epoch": 0.8306352143077398, + "grad_norm": 3.5162227153778076, + "learning_rate": 7.336385923168865e-07, + "loss": 0.8258, + "step": 10775 + }, + { + "epoch": 0.8307123034227567, + "grad_norm": 3.819016933441162, + "learning_rate": 7.329877029736665e-07, + "loss": 0.9143, + "step": 10776 + }, + { + "epoch": 0.8307893925377736, + "grad_norm": 3.756284475326538, + "learning_rate": 7.323370796565637e-07, + "loss": 0.9504, + "step": 10777 + }, + { + "epoch": 0.8308664816527906, + "grad_norm": 3.6436846256256104, + "learning_rate": 7.31686722406143e-07, + "loss": 0.9968, + "step": 10778 + }, + { + "epoch": 0.8309435707678076, + "grad_norm": 3.641303300857544, + "learning_rate": 7.310366312629475e-07, + "loss": 0.949, + "step": 10779 + }, + { + "epoch": 0.8310206598828246, + "grad_norm": 3.9875218868255615, + "learning_rate": 7.303868062675073e-07, + "loss": 0.9923, + "step": 10780 + }, + { + "epoch": 0.8310977489978415, + "grad_norm": 3.8598506450653076, + "learning_rate": 7.297372474603381e-07, + "loss": 1.0225, + "step": 10781 + }, + { + "epoch": 0.8311748381128584, + "grad_norm": 3.6692776679992676, + "learning_rate": 7.290879548819363e-07, + "loss": 0.8962, + "step": 10782 + }, + { + "epoch": 0.8312519272278754, + "grad_norm": 3.7915146350860596, + "learning_rate": 7.284389285727799e-07, + "loss": 0.953, + "step": 10783 + }, + { + "epoch": 0.8313290163428924, + "grad_norm": 3.9026849269866943, + "learning_rate": 7.277901685733335e-07, + "loss": 0.9768, + "step": 10784 + }, + { + "epoch": 0.8314061054579094, + "grad_norm": 3.8057961463928223, + "learning_rate": 7.271416749240435e-07, + "loss": 0.9183, + "step": 10785 + }, + { + "epoch": 0.8314831945729263, + "grad_norm": 3.617309808731079, + "learning_rate": 7.264934476653401e-07, + "loss": 0.9085, + "step": 10786 + }, + { + "epoch": 0.8315602836879432, + "grad_norm": 3.680267572402954, + "learning_rate": 7.258454868376385e-07, + "loss": 0.8003, + "step": 10787 + }, + { + "epoch": 0.8316373728029602, + "grad_norm": 3.6309967041015625, + "learning_rate": 7.251977924813336e-07, + "loss": 0.9443, + "step": 10788 + }, + { + "epoch": 0.8317144619179772, + "grad_norm": 3.5176992416381836, + "learning_rate": 7.245503646368063e-07, + "loss": 0.8685, + "step": 10789 + }, + { + "epoch": 0.8317915510329942, + "grad_norm": 3.7374002933502197, + "learning_rate": 7.239032033444205e-07, + "loss": 0.8479, + "step": 10790 + }, + { + "epoch": 0.8318686401480111, + "grad_norm": 4.19711446762085, + "learning_rate": 7.232563086445238e-07, + "loss": 0.9312, + "step": 10791 + }, + { + "epoch": 0.831945729263028, + "grad_norm": 3.7935373783111572, + "learning_rate": 7.226096805774463e-07, + "loss": 1.0013, + "step": 10792 + }, + { + "epoch": 0.832022818378045, + "grad_norm": 3.825516700744629, + "learning_rate": 7.21963319183503e-07, + "loss": 0.8755, + "step": 10793 + }, + { + "epoch": 0.832099907493062, + "grad_norm": 3.986665725708008, + "learning_rate": 7.213172245029892e-07, + "loss": 0.889, + "step": 10794 + }, + { + "epoch": 0.832176996608079, + "grad_norm": 3.7849297523498535, + "learning_rate": 7.206713965761864e-07, + "loss": 0.9522, + "step": 10795 + }, + { + "epoch": 0.8322540857230959, + "grad_norm": 3.8678035736083984, + "learning_rate": 7.200258354433593e-07, + "loss": 0.9428, + "step": 10796 + }, + { + "epoch": 0.8323311748381128, + "grad_norm": 3.5105371475219727, + "learning_rate": 7.193805411447546e-07, + "loss": 0.7586, + "step": 10797 + }, + { + "epoch": 0.8324082639531298, + "grad_norm": 4.333858966827393, + "learning_rate": 7.187355137206042e-07, + "loss": 0.8917, + "step": 10798 + }, + { + "epoch": 0.8324853530681467, + "grad_norm": 3.7516279220581055, + "learning_rate": 7.180907532111203e-07, + "loss": 0.9158, + "step": 10799 + }, + { + "epoch": 0.8325624421831638, + "grad_norm": 3.8405489921569824, + "learning_rate": 7.174462596565012e-07, + "loss": 0.9124, + "step": 10800 + }, + { + "epoch": 0.8326395312981807, + "grad_norm": 4.030445098876953, + "learning_rate": 7.168020330969283e-07, + "loss": 0.9262, + "step": 10801 + }, + { + "epoch": 0.8327166204131976, + "grad_norm": 4.261043548583984, + "learning_rate": 7.161580735725648e-07, + "loss": 1.0228, + "step": 10802 + }, + { + "epoch": 0.8327937095282146, + "grad_norm": 4.2174973487854, + "learning_rate": 7.155143811235593e-07, + "loss": 1.0124, + "step": 10803 + }, + { + "epoch": 0.8328707986432315, + "grad_norm": 4.092016696929932, + "learning_rate": 7.14870955790043e-07, + "loss": 0.8879, + "step": 10804 + }, + { + "epoch": 0.8329478877582486, + "grad_norm": 3.6013119220733643, + "learning_rate": 7.142277976121287e-07, + "loss": 0.8734, + "step": 10805 + }, + { + "epoch": 0.8330249768732655, + "grad_norm": 4.047834396362305, + "learning_rate": 7.135849066299144e-07, + "loss": 0.9181, + "step": 10806 + }, + { + "epoch": 0.8331020659882824, + "grad_norm": 3.696688413619995, + "learning_rate": 7.129422828834809e-07, + "loss": 0.8078, + "step": 10807 + }, + { + "epoch": 0.8331791551032994, + "grad_norm": 3.918166399002075, + "learning_rate": 7.122999264128933e-07, + "loss": 0.8543, + "step": 10808 + }, + { + "epoch": 0.8332562442183163, + "grad_norm": 3.7116942405700684, + "learning_rate": 7.116578372581995e-07, + "loss": 0.8861, + "step": 10809 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 3.7015819549560547, + "learning_rate": 7.110160154594287e-07, + "loss": 0.8536, + "step": 10810 + }, + { + "epoch": 0.8334104224483503, + "grad_norm": 3.7466604709625244, + "learning_rate": 7.10374461056596e-07, + "loss": 0.9696, + "step": 10811 + }, + { + "epoch": 0.8334875115633672, + "grad_norm": 3.6178178787231445, + "learning_rate": 7.097331740896995e-07, + "loss": 0.9173, + "step": 10812 + }, + { + "epoch": 0.8335646006783842, + "grad_norm": 3.8105711936950684, + "learning_rate": 7.090921545987195e-07, + "loss": 0.9118, + "step": 10813 + }, + { + "epoch": 0.8336416897934011, + "grad_norm": 3.8142049312591553, + "learning_rate": 7.0845140262362e-07, + "loss": 0.7892, + "step": 10814 + }, + { + "epoch": 0.8337187789084182, + "grad_norm": 3.651261329650879, + "learning_rate": 7.078109182043508e-07, + "loss": 0.9293, + "step": 10815 + }, + { + "epoch": 0.8337958680234351, + "grad_norm": 3.8389205932617188, + "learning_rate": 7.071707013808399e-07, + "loss": 0.8746, + "step": 10816 + }, + { + "epoch": 0.833872957138452, + "grad_norm": 4.682302951812744, + "learning_rate": 7.065307521930026e-07, + "loss": 0.8675, + "step": 10817 + }, + { + "epoch": 0.833950046253469, + "grad_norm": 3.9559519290924072, + "learning_rate": 7.058910706807359e-07, + "loss": 0.9481, + "step": 10818 + }, + { + "epoch": 0.8340271353684859, + "grad_norm": 3.810323715209961, + "learning_rate": 7.052516568839218e-07, + "loss": 0.9161, + "step": 10819 + }, + { + "epoch": 0.834104224483503, + "grad_norm": 4.13281774520874, + "learning_rate": 7.046125108424245e-07, + "loss": 0.9867, + "step": 10820 + }, + { + "epoch": 0.8341813135985199, + "grad_norm": 3.446863889694214, + "learning_rate": 7.039736325960899e-07, + "loss": 0.8842, + "step": 10821 + }, + { + "epoch": 0.8342584027135368, + "grad_norm": 3.840688943862915, + "learning_rate": 7.033350221847496e-07, + "loss": 0.9211, + "step": 10822 + }, + { + "epoch": 0.8343354918285538, + "grad_norm": 3.8792099952697754, + "learning_rate": 7.026966796482177e-07, + "loss": 0.8376, + "step": 10823 + }, + { + "epoch": 0.8344125809435707, + "grad_norm": 3.9304957389831543, + "learning_rate": 7.020586050262912e-07, + "loss": 0.919, + "step": 10824 + }, + { + "epoch": 0.8344896700585878, + "grad_norm": 3.9042816162109375, + "learning_rate": 7.014207983587517e-07, + "loss": 0.8255, + "step": 10825 + }, + { + "epoch": 0.8345667591736047, + "grad_norm": 3.54953670501709, + "learning_rate": 7.00783259685362e-07, + "loss": 0.8494, + "step": 10826 + }, + { + "epoch": 0.8346438482886216, + "grad_norm": 3.699927568435669, + "learning_rate": 7.0014598904587e-07, + "loss": 0.7763, + "step": 10827 + }, + { + "epoch": 0.8347209374036386, + "grad_norm": 3.908418655395508, + "learning_rate": 6.995089864800059e-07, + "loss": 0.9824, + "step": 10828 + }, + { + "epoch": 0.8347980265186555, + "grad_norm": 3.541356325149536, + "learning_rate": 6.988722520274838e-07, + "loss": 0.7651, + "step": 10829 + }, + { + "epoch": 0.8348751156336726, + "grad_norm": 3.6384029388427734, + "learning_rate": 6.98235785728002e-07, + "loss": 0.9051, + "step": 10830 + }, + { + "epoch": 0.8349522047486895, + "grad_norm": 3.667262315750122, + "learning_rate": 6.975995876212383e-07, + "loss": 0.9101, + "step": 10831 + }, + { + "epoch": 0.8350292938637064, + "grad_norm": 3.559422492980957, + "learning_rate": 6.969636577468575e-07, + "loss": 0.8595, + "step": 10832 + }, + { + "epoch": 0.8351063829787234, + "grad_norm": 4.325561046600342, + "learning_rate": 6.963279961445068e-07, + "loss": 0.9312, + "step": 10833 + }, + { + "epoch": 0.8351834720937403, + "grad_norm": 4.272604465484619, + "learning_rate": 6.956926028538163e-07, + "loss": 0.8992, + "step": 10834 + }, + { + "epoch": 0.8352605612087574, + "grad_norm": 3.4892845153808594, + "learning_rate": 6.950574779144004e-07, + "loss": 0.8594, + "step": 10835 + }, + { + "epoch": 0.8353376503237743, + "grad_norm": 3.5458579063415527, + "learning_rate": 6.944226213658534e-07, + "loss": 0.9747, + "step": 10836 + }, + { + "epoch": 0.8354147394387912, + "grad_norm": 3.427591562271118, + "learning_rate": 6.937880332477576e-07, + "loss": 0.9044, + "step": 10837 + }, + { + "epoch": 0.8354918285538082, + "grad_norm": 4.049854755401611, + "learning_rate": 6.931537135996747e-07, + "loss": 0.9564, + "step": 10838 + }, + { + "epoch": 0.8355689176688251, + "grad_norm": 3.7413408756256104, + "learning_rate": 6.925196624611525e-07, + "loss": 0.9341, + "step": 10839 + }, + { + "epoch": 0.8356460067838422, + "grad_norm": 3.709747076034546, + "learning_rate": 6.918858798717204e-07, + "loss": 0.9402, + "step": 10840 + }, + { + "epoch": 0.8357230958988591, + "grad_norm": 4.0322465896606445, + "learning_rate": 6.912523658708919e-07, + "loss": 0.91, + "step": 10841 + }, + { + "epoch": 0.8358001850138761, + "grad_norm": 3.899702310562134, + "learning_rate": 6.906191204981621e-07, + "loss": 0.9262, + "step": 10842 + }, + { + "epoch": 0.835877274128893, + "grad_norm": 3.4912753105163574, + "learning_rate": 6.899861437930116e-07, + "loss": 0.8982, + "step": 10843 + }, + { + "epoch": 0.8359543632439099, + "grad_norm": 3.437044620513916, + "learning_rate": 6.893534357949022e-07, + "loss": 0.8238, + "step": 10844 + }, + { + "epoch": 0.836031452358927, + "grad_norm": 3.966545820236206, + "learning_rate": 6.887209965432812e-07, + "loss": 0.977, + "step": 10845 + }, + { + "epoch": 0.8361085414739439, + "grad_norm": 3.3362553119659424, + "learning_rate": 6.880888260775786e-07, + "loss": 0.877, + "step": 10846 + }, + { + "epoch": 0.8361856305889609, + "grad_norm": 3.762795925140381, + "learning_rate": 6.874569244372042e-07, + "loss": 0.9176, + "step": 10847 + }, + { + "epoch": 0.8362627197039778, + "grad_norm": 3.6273088455200195, + "learning_rate": 6.868252916615553e-07, + "loss": 0.8417, + "step": 10848 + }, + { + "epoch": 0.8363398088189947, + "grad_norm": 3.5637712478637695, + "learning_rate": 6.861939277900115e-07, + "loss": 0.8944, + "step": 10849 + }, + { + "epoch": 0.8364168979340117, + "grad_norm": 3.720207929611206, + "learning_rate": 6.855628328619341e-07, + "loss": 1.0064, + "step": 10850 + }, + { + "epoch": 0.8364939870490287, + "grad_norm": 3.5952982902526855, + "learning_rate": 6.849320069166693e-07, + "loss": 0.9475, + "step": 10851 + }, + { + "epoch": 0.8365710761640457, + "grad_norm": 3.7892141342163086, + "learning_rate": 6.843014499935463e-07, + "loss": 0.9439, + "step": 10852 + }, + { + "epoch": 0.8366481652790626, + "grad_norm": 3.6238391399383545, + "learning_rate": 6.836711621318753e-07, + "loss": 0.8605, + "step": 10853 + }, + { + "epoch": 0.8367252543940795, + "grad_norm": 3.9006173610687256, + "learning_rate": 6.830411433709527e-07, + "loss": 0.8448, + "step": 10854 + }, + { + "epoch": 0.8368023435090965, + "grad_norm": 3.381910800933838, + "learning_rate": 6.824113937500565e-07, + "loss": 0.9104, + "step": 10855 + }, + { + "epoch": 0.8368794326241135, + "grad_norm": 3.998650312423706, + "learning_rate": 6.817819133084485e-07, + "loss": 1.0149, + "step": 10856 + }, + { + "epoch": 0.8369565217391305, + "grad_norm": 3.4496870040893555, + "learning_rate": 6.811527020853748e-07, + "loss": 0.8017, + "step": 10857 + }, + { + "epoch": 0.8370336108541474, + "grad_norm": 3.7553465366363525, + "learning_rate": 6.805237601200615e-07, + "loss": 0.7632, + "step": 10858 + }, + { + "epoch": 0.8371106999691643, + "grad_norm": 4.353603839874268, + "learning_rate": 6.798950874517201e-07, + "loss": 0.9318, + "step": 10859 + }, + { + "epoch": 0.8371877890841813, + "grad_norm": 3.8863165378570557, + "learning_rate": 6.792666841195455e-07, + "loss": 0.904, + "step": 10860 + }, + { + "epoch": 0.8372648781991983, + "grad_norm": 3.8300702571868896, + "learning_rate": 6.786385501627157e-07, + "loss": 0.8406, + "step": 10861 + }, + { + "epoch": 0.8373419673142153, + "grad_norm": 3.770839214324951, + "learning_rate": 6.780106856203916e-07, + "loss": 0.9262, + "step": 10862 + }, + { + "epoch": 0.8374190564292322, + "grad_norm": 3.5766117572784424, + "learning_rate": 6.77383090531718e-07, + "loss": 0.868, + "step": 10863 + }, + { + "epoch": 0.8374961455442491, + "grad_norm": 3.751206159591675, + "learning_rate": 6.767557649358203e-07, + "loss": 0.8759, + "step": 10864 + }, + { + "epoch": 0.8375732346592661, + "grad_norm": 3.9734339714050293, + "learning_rate": 6.7612870887181e-07, + "loss": 0.8378, + "step": 10865 + }, + { + "epoch": 0.8376503237742831, + "grad_norm": 3.7045814990997314, + "learning_rate": 6.755019223787807e-07, + "loss": 0.9453, + "step": 10866 + }, + { + "epoch": 0.8377274128893001, + "grad_norm": 3.88385272026062, + "learning_rate": 6.748754054958095e-07, + "loss": 0.9384, + "step": 10867 + }, + { + "epoch": 0.837804502004317, + "grad_norm": 4.15552282333374, + "learning_rate": 6.742491582619559e-07, + "loss": 0.9795, + "step": 10868 + }, + { + "epoch": 0.8378815911193339, + "grad_norm": 3.2853026390075684, + "learning_rate": 6.736231807162641e-07, + "loss": 0.843, + "step": 10869 + }, + { + "epoch": 0.8379586802343509, + "grad_norm": 3.701760768890381, + "learning_rate": 6.7299747289776e-07, + "loss": 0.9865, + "step": 10870 + }, + { + "epoch": 0.8380357693493679, + "grad_norm": 3.7172200679779053, + "learning_rate": 6.723720348454538e-07, + "loss": 0.8868, + "step": 10871 + }, + { + "epoch": 0.8381128584643849, + "grad_norm": 3.8276724815368652, + "learning_rate": 6.717468665983384e-07, + "loss": 0.9274, + "step": 10872 + }, + { + "epoch": 0.8381899475794018, + "grad_norm": 3.7223312854766846, + "learning_rate": 6.711219681953885e-07, + "loss": 0.8895, + "step": 10873 + }, + { + "epoch": 0.8382670366944187, + "grad_norm": 3.972644329071045, + "learning_rate": 6.704973396755638e-07, + "loss": 0.9633, + "step": 10874 + }, + { + "epoch": 0.8383441258094357, + "grad_norm": 3.7583634853363037, + "learning_rate": 6.698729810778065e-07, + "loss": 0.8806, + "step": 10875 + }, + { + "epoch": 0.8384212149244527, + "grad_norm": 4.294461727142334, + "learning_rate": 6.692488924410434e-07, + "loss": 0.957, + "step": 10876 + }, + { + "epoch": 0.8384983040394697, + "grad_norm": 3.7632694244384766, + "learning_rate": 6.686250738041816e-07, + "loss": 0.8539, + "step": 10877 + }, + { + "epoch": 0.8385753931544866, + "grad_norm": 3.696632146835327, + "learning_rate": 6.68001525206115e-07, + "loss": 0.9736, + "step": 10878 + }, + { + "epoch": 0.8386524822695035, + "grad_norm": 3.7102134227752686, + "learning_rate": 6.673782466857165e-07, + "loss": 0.9112, + "step": 10879 + }, + { + "epoch": 0.8387295713845205, + "grad_norm": 4.029872417449951, + "learning_rate": 6.667552382818449e-07, + "loss": 0.9195, + "step": 10880 + }, + { + "epoch": 0.8388066604995374, + "grad_norm": 4.231304168701172, + "learning_rate": 6.661325000333419e-07, + "loss": 0.9635, + "step": 10881 + }, + { + "epoch": 0.8388837496145545, + "grad_norm": 3.9188613891601562, + "learning_rate": 6.655100319790314e-07, + "loss": 0.7859, + "step": 10882 + }, + { + "epoch": 0.8389608387295714, + "grad_norm": 3.596547842025757, + "learning_rate": 6.64887834157723e-07, + "loss": 0.8746, + "step": 10883 + }, + { + "epoch": 0.8390379278445883, + "grad_norm": 3.9885101318359375, + "learning_rate": 6.642659066082046e-07, + "loss": 0.922, + "step": 10884 + }, + { + "epoch": 0.8391150169596053, + "grad_norm": 3.787560224533081, + "learning_rate": 6.636442493692518e-07, + "loss": 0.894, + "step": 10885 + }, + { + "epoch": 0.8391921060746222, + "grad_norm": 3.884455680847168, + "learning_rate": 6.630228624796215e-07, + "loss": 0.9345, + "step": 10886 + }, + { + "epoch": 0.8392691951896393, + "grad_norm": 3.6436421871185303, + "learning_rate": 6.624017459780541e-07, + "loss": 0.837, + "step": 10887 + }, + { + "epoch": 0.8393462843046562, + "grad_norm": 3.4954371452331543, + "learning_rate": 6.617808999032727e-07, + "loss": 0.8966, + "step": 10888 + }, + { + "epoch": 0.8394233734196731, + "grad_norm": 4.002804279327393, + "learning_rate": 6.611603242939846e-07, + "loss": 1.0232, + "step": 10889 + }, + { + "epoch": 0.8395004625346901, + "grad_norm": 3.798150062561035, + "learning_rate": 6.605400191888784e-07, + "loss": 0.9284, + "step": 10890 + }, + { + "epoch": 0.839577551649707, + "grad_norm": 3.5775742530822754, + "learning_rate": 6.599199846266274e-07, + "loss": 0.8892, + "step": 10891 + }, + { + "epoch": 0.8396546407647241, + "grad_norm": 3.491213321685791, + "learning_rate": 6.59300220645887e-07, + "loss": 0.9179, + "step": 10892 + }, + { + "epoch": 0.839731729879741, + "grad_norm": 3.7937722206115723, + "learning_rate": 6.586807272852969e-07, + "loss": 0.9093, + "step": 10893 + }, + { + "epoch": 0.8398088189947579, + "grad_norm": 4.150206089019775, + "learning_rate": 6.580615045834803e-07, + "loss": 0.9983, + "step": 10894 + }, + { + "epoch": 0.8398859081097749, + "grad_norm": 3.7913031578063965, + "learning_rate": 6.574425525790407e-07, + "loss": 0.8851, + "step": 10895 + }, + { + "epoch": 0.8399629972247918, + "grad_norm": 3.3941149711608887, + "learning_rate": 6.568238713105668e-07, + "loss": 0.8295, + "step": 10896 + }, + { + "epoch": 0.8400400863398089, + "grad_norm": 3.6495449542999268, + "learning_rate": 6.562054608166307e-07, + "loss": 0.7818, + "step": 10897 + }, + { + "epoch": 0.8401171754548258, + "grad_norm": 3.510993719100952, + "learning_rate": 6.555873211357872e-07, + "loss": 0.8457, + "step": 10898 + }, + { + "epoch": 0.8401942645698427, + "grad_norm": 3.769568920135498, + "learning_rate": 6.549694523065742e-07, + "loss": 0.9033, + "step": 10899 + }, + { + "epoch": 0.8402713536848597, + "grad_norm": 3.4650299549102783, + "learning_rate": 6.543518543675132e-07, + "loss": 0.7952, + "step": 10900 + }, + { + "epoch": 0.8403484427998766, + "grad_norm": 3.828594923019409, + "learning_rate": 6.537345273571061e-07, + "loss": 1.0226, + "step": 10901 + }, + { + "epoch": 0.8404255319148937, + "grad_norm": 4.020386695861816, + "learning_rate": 6.531174713138416e-07, + "loss": 0.9334, + "step": 10902 + }, + { + "epoch": 0.8405026210299106, + "grad_norm": 4.063125133514404, + "learning_rate": 6.525006862761895e-07, + "loss": 0.8969, + "step": 10903 + }, + { + "epoch": 0.8405797101449275, + "grad_norm": 4.093360900878906, + "learning_rate": 6.51884172282603e-07, + "loss": 1.0452, + "step": 10904 + }, + { + "epoch": 0.8406567992599445, + "grad_norm": 3.666809320449829, + "learning_rate": 6.512679293715208e-07, + "loss": 0.9207, + "step": 10905 + }, + { + "epoch": 0.8407338883749614, + "grad_norm": 3.7799835205078125, + "learning_rate": 6.506519575813591e-07, + "loss": 0.8436, + "step": 10906 + }, + { + "epoch": 0.8408109774899785, + "grad_norm": 3.6833157539367676, + "learning_rate": 6.500362569505215e-07, + "loss": 0.9263, + "step": 10907 + }, + { + "epoch": 0.8408880666049954, + "grad_norm": 3.6458330154418945, + "learning_rate": 6.494208275173947e-07, + "loss": 0.9459, + "step": 10908 + }, + { + "epoch": 0.8409651557200123, + "grad_norm": 3.5508716106414795, + "learning_rate": 6.488056693203471e-07, + "loss": 0.7733, + "step": 10909 + }, + { + "epoch": 0.8410422448350293, + "grad_norm": 3.958599805831909, + "learning_rate": 6.481907823977307e-07, + "loss": 0.9815, + "step": 10910 + }, + { + "epoch": 0.8411193339500462, + "grad_norm": 4.111875057220459, + "learning_rate": 6.47576166787881e-07, + "loss": 0.9399, + "step": 10911 + }, + { + "epoch": 0.8411964230650633, + "grad_norm": 3.8634867668151855, + "learning_rate": 6.469618225291141e-07, + "loss": 0.8141, + "step": 10912 + }, + { + "epoch": 0.8412735121800802, + "grad_norm": 4.0127716064453125, + "learning_rate": 6.463477496597332e-07, + "loss": 0.9476, + "step": 10913 + }, + { + "epoch": 0.8413506012950971, + "grad_norm": 3.996286153793335, + "learning_rate": 6.45733948218022e-07, + "loss": 0.9159, + "step": 10914 + }, + { + "epoch": 0.8414276904101141, + "grad_norm": 3.8326454162597656, + "learning_rate": 6.451204182422488e-07, + "loss": 0.8948, + "step": 10915 + }, + { + "epoch": 0.841504779525131, + "grad_norm": 3.672430992126465, + "learning_rate": 6.44507159770662e-07, + "loss": 0.9338, + "step": 10916 + }, + { + "epoch": 0.8415818686401481, + "grad_norm": 3.8770806789398193, + "learning_rate": 6.43894172841496e-07, + "loss": 0.9311, + "step": 10917 + }, + { + "epoch": 0.841658957755165, + "grad_norm": 3.768202304840088, + "learning_rate": 6.43281457492968e-07, + "loss": 0.9567, + "step": 10918 + }, + { + "epoch": 0.8417360468701819, + "grad_norm": 4.294236660003662, + "learning_rate": 6.426690137632763e-07, + "loss": 0.9294, + "step": 10919 + }, + { + "epoch": 0.8418131359851989, + "grad_norm": 3.6829004287719727, + "learning_rate": 6.420568416906059e-07, + "loss": 0.9141, + "step": 10920 + }, + { + "epoch": 0.8418902251002158, + "grad_norm": 4.125067710876465, + "learning_rate": 6.414449413131202e-07, + "loss": 0.9829, + "step": 10921 + }, + { + "epoch": 0.8419673142152329, + "grad_norm": 3.662050247192383, + "learning_rate": 6.408333126689686e-07, + "loss": 0.9487, + "step": 10922 + }, + { + "epoch": 0.8420444033302498, + "grad_norm": 4.144391059875488, + "learning_rate": 6.402219557962835e-07, + "loss": 0.9397, + "step": 10923 + }, + { + "epoch": 0.8421214924452667, + "grad_norm": 4.148819446563721, + "learning_rate": 6.396108707331794e-07, + "loss": 1.0001, + "step": 10924 + }, + { + "epoch": 0.8421985815602837, + "grad_norm": 3.679710865020752, + "learning_rate": 6.390000575177546e-07, + "loss": 0.9325, + "step": 10925 + }, + { + "epoch": 0.8422756706753006, + "grad_norm": 3.7735555171966553, + "learning_rate": 6.38389516188091e-07, + "loss": 0.9016, + "step": 10926 + }, + { + "epoch": 0.8423527597903177, + "grad_norm": 4.009374618530273, + "learning_rate": 6.377792467822502e-07, + "loss": 0.9601, + "step": 10927 + }, + { + "epoch": 0.8424298489053346, + "grad_norm": 3.806119441986084, + "learning_rate": 6.371692493382814e-07, + "loss": 0.9, + "step": 10928 + }, + { + "epoch": 0.8425069380203515, + "grad_norm": 3.881964921951294, + "learning_rate": 6.36559523894214e-07, + "loss": 0.9717, + "step": 10929 + }, + { + "epoch": 0.8425840271353685, + "grad_norm": 3.9612395763397217, + "learning_rate": 6.359500704880617e-07, + "loss": 0.955, + "step": 10930 + }, + { + "epoch": 0.8426611162503854, + "grad_norm": 3.793069839477539, + "learning_rate": 6.353408891578212e-07, + "loss": 0.8759, + "step": 10931 + }, + { + "epoch": 0.8427382053654024, + "grad_norm": 3.9942023754119873, + "learning_rate": 6.347319799414702e-07, + "loss": 0.9591, + "step": 10932 + }, + { + "epoch": 0.8428152944804194, + "grad_norm": 3.749751567840576, + "learning_rate": 6.341233428769722e-07, + "loss": 0.8971, + "step": 10933 + }, + { + "epoch": 0.8428923835954363, + "grad_norm": 3.6659231185913086, + "learning_rate": 6.33514978002272e-07, + "loss": 0.7227, + "step": 10934 + }, + { + "epoch": 0.8429694727104533, + "grad_norm": 3.700681686401367, + "learning_rate": 6.329068853552983e-07, + "loss": 0.8878, + "step": 10935 + }, + { + "epoch": 0.8430465618254702, + "grad_norm": 3.738607883453369, + "learning_rate": 6.322990649739624e-07, + "loss": 0.924, + "step": 10936 + }, + { + "epoch": 0.8431236509404872, + "grad_norm": 3.6381099224090576, + "learning_rate": 6.316915168961602e-07, + "loss": 0.9039, + "step": 10937 + }, + { + "epoch": 0.8432007400555042, + "grad_norm": 3.76474928855896, + "learning_rate": 6.310842411597667e-07, + "loss": 0.9657, + "step": 10938 + }, + { + "epoch": 0.8432778291705211, + "grad_norm": 3.532024383544922, + "learning_rate": 6.304772378026441e-07, + "loss": 0.8998, + "step": 10939 + }, + { + "epoch": 0.8433549182855381, + "grad_norm": 3.453786611557007, + "learning_rate": 6.298705068626348e-07, + "loss": 0.7852, + "step": 10940 + }, + { + "epoch": 0.843432007400555, + "grad_norm": 3.8487155437469482, + "learning_rate": 6.292640483775664e-07, + "loss": 0.8156, + "step": 10941 + }, + { + "epoch": 0.843509096515572, + "grad_norm": 3.691965341567993, + "learning_rate": 6.286578623852485e-07, + "loss": 0.9317, + "step": 10942 + }, + { + "epoch": 0.843586185630589, + "grad_norm": 3.6872313022613525, + "learning_rate": 6.280519489234721e-07, + "loss": 0.8926, + "step": 10943 + }, + { + "epoch": 0.8436632747456059, + "grad_norm": 3.968344211578369, + "learning_rate": 6.274463080300142e-07, + "loss": 0.9674, + "step": 10944 + }, + { + "epoch": 0.8437403638606229, + "grad_norm": 3.671273708343506, + "learning_rate": 6.268409397426323e-07, + "loss": 0.8654, + "step": 10945 + }, + { + "epoch": 0.8438174529756398, + "grad_norm": 3.5088984966278076, + "learning_rate": 6.26235844099069e-07, + "loss": 0.8773, + "step": 10946 + }, + { + "epoch": 0.8438945420906568, + "grad_norm": 3.9786536693573, + "learning_rate": 6.256310211370486e-07, + "loss": 0.8877, + "step": 10947 + }, + { + "epoch": 0.8439716312056738, + "grad_norm": 3.580746650695801, + "learning_rate": 6.2502647089428e-07, + "loss": 0.888, + "step": 10948 + }, + { + "epoch": 0.8440487203206907, + "grad_norm": 3.8491809368133545, + "learning_rate": 6.244221934084504e-07, + "loss": 0.9276, + "step": 10949 + }, + { + "epoch": 0.8441258094357077, + "grad_norm": 3.9452064037323, + "learning_rate": 6.238181887172362e-07, + "loss": 0.9588, + "step": 10950 + }, + { + "epoch": 0.8442028985507246, + "grad_norm": 3.862786054611206, + "learning_rate": 6.232144568582926e-07, + "loss": 0.8399, + "step": 10951 + }, + { + "epoch": 0.8442799876657416, + "grad_norm": 3.535684823989868, + "learning_rate": 6.226109978692596e-07, + "loss": 0.8093, + "step": 10952 + }, + { + "epoch": 0.8443570767807586, + "grad_norm": 3.716644048690796, + "learning_rate": 6.220078117877615e-07, + "loss": 0.9213, + "step": 10953 + }, + { + "epoch": 0.8444341658957755, + "grad_norm": 3.8293979167938232, + "learning_rate": 6.214048986514004e-07, + "loss": 0.9022, + "step": 10954 + }, + { + "epoch": 0.8445112550107925, + "grad_norm": 3.8130550384521484, + "learning_rate": 6.208022584977668e-07, + "loss": 0.9534, + "step": 10955 + }, + { + "epoch": 0.8445883441258094, + "grad_norm": 3.5550150871276855, + "learning_rate": 6.201998913644319e-07, + "loss": 0.8563, + "step": 10956 + }, + { + "epoch": 0.8446654332408264, + "grad_norm": 3.5891706943511963, + "learning_rate": 6.195977972889505e-07, + "loss": 0.8518, + "step": 10957 + }, + { + "epoch": 0.8447425223558434, + "grad_norm": 3.988762855529785, + "learning_rate": 6.189959763088593e-07, + "loss": 0.9264, + "step": 10958 + }, + { + "epoch": 0.8448196114708603, + "grad_norm": 3.7621021270751953, + "learning_rate": 6.183944284616794e-07, + "loss": 0.8383, + "step": 10959 + }, + { + "epoch": 0.8448967005858773, + "grad_norm": 3.256624460220337, + "learning_rate": 6.177931537849141e-07, + "loss": 0.7408, + "step": 10960 + }, + { + "epoch": 0.8449737897008942, + "grad_norm": 3.6678712368011475, + "learning_rate": 6.17192152316049e-07, + "loss": 0.8796, + "step": 10961 + }, + { + "epoch": 0.8450508788159112, + "grad_norm": 3.6658334732055664, + "learning_rate": 6.165914240925547e-07, + "loss": 0.9107, + "step": 10962 + }, + { + "epoch": 0.8451279679309281, + "grad_norm": 3.5411484241485596, + "learning_rate": 6.15990969151884e-07, + "loss": 0.8073, + "step": 10963 + }, + { + "epoch": 0.8452050570459451, + "grad_norm": 3.56990385055542, + "learning_rate": 6.153907875314697e-07, + "loss": 0.8502, + "step": 10964 + }, + { + "epoch": 0.8452821461609621, + "grad_norm": 4.044010639190674, + "learning_rate": 6.147908792687307e-07, + "loss": 0.9423, + "step": 10965 + }, + { + "epoch": 0.845359235275979, + "grad_norm": 3.776015043258667, + "learning_rate": 6.141912444010694e-07, + "loss": 0.9912, + "step": 10966 + }, + { + "epoch": 0.845436324390996, + "grad_norm": 3.4817419052124023, + "learning_rate": 6.135918829658694e-07, + "loss": 0.8841, + "step": 10967 + }, + { + "epoch": 0.845513413506013, + "grad_norm": 3.5754287242889404, + "learning_rate": 6.129927950004988e-07, + "loss": 0.8635, + "step": 10968 + }, + { + "epoch": 0.8455905026210299, + "grad_norm": 3.851569414138794, + "learning_rate": 6.123939805423051e-07, + "loss": 1.0293, + "step": 10969 + }, + { + "epoch": 0.8456675917360469, + "grad_norm": 3.6617162227630615, + "learning_rate": 6.117954396286236e-07, + "loss": 0.864, + "step": 10970 + }, + { + "epoch": 0.8457446808510638, + "grad_norm": 3.597431182861328, + "learning_rate": 6.111971722967686e-07, + "loss": 0.9426, + "step": 10971 + }, + { + "epoch": 0.8458217699660808, + "grad_norm": 3.592806816101074, + "learning_rate": 6.105991785840398e-07, + "loss": 0.9014, + "step": 10972 + }, + { + "epoch": 0.8458988590810977, + "grad_norm": 3.4167540073394775, + "learning_rate": 6.100014585277187e-07, + "loss": 0.8378, + "step": 10973 + }, + { + "epoch": 0.8459759481961147, + "grad_norm": 4.124277114868164, + "learning_rate": 6.094040121650719e-07, + "loss": 0.9724, + "step": 10974 + }, + { + "epoch": 0.8460530373111317, + "grad_norm": 4.155192852020264, + "learning_rate": 6.088068395333441e-07, + "loss": 0.9636, + "step": 10975 + }, + { + "epoch": 0.8461301264261486, + "grad_norm": 3.687875270843506, + "learning_rate": 6.082099406697673e-07, + "loss": 0.935, + "step": 10976 + }, + { + "epoch": 0.8462072155411656, + "grad_norm": 3.8519952297210693, + "learning_rate": 6.076133156115549e-07, + "loss": 0.8827, + "step": 10977 + }, + { + "epoch": 0.8462843046561825, + "grad_norm": 3.6095309257507324, + "learning_rate": 6.070169643959034e-07, + "loss": 0.9423, + "step": 10978 + }, + { + "epoch": 0.8463613937711995, + "grad_norm": 3.734135866165161, + "learning_rate": 6.064208870599935e-07, + "loss": 0.9421, + "step": 10979 + }, + { + "epoch": 0.8464384828862165, + "grad_norm": 3.8327412605285645, + "learning_rate": 6.058250836409856e-07, + "loss": 0.9157, + "step": 10980 + }, + { + "epoch": 0.8465155720012334, + "grad_norm": 4.3431782722473145, + "learning_rate": 6.052295541760256e-07, + "loss": 1.0126, + "step": 10981 + }, + { + "epoch": 0.8465926611162504, + "grad_norm": 3.996701717376709, + "learning_rate": 6.046342987022419e-07, + "loss": 0.9892, + "step": 10982 + }, + { + "epoch": 0.8466697502312673, + "grad_norm": 4.109698295593262, + "learning_rate": 6.04039317256745e-07, + "loss": 0.9538, + "step": 10983 + }, + { + "epoch": 0.8467468393462843, + "grad_norm": 3.8181276321411133, + "learning_rate": 6.0344460987663e-07, + "loss": 0.8675, + "step": 10984 + }, + { + "epoch": 0.8468239284613013, + "grad_norm": 3.624297618865967, + "learning_rate": 6.028501765989736e-07, + "loss": 0.9302, + "step": 10985 + }, + { + "epoch": 0.8469010175763182, + "grad_norm": 3.416637659072876, + "learning_rate": 6.02256017460835e-07, + "loss": 0.8451, + "step": 10986 + }, + { + "epoch": 0.8469781066913352, + "grad_norm": 3.6329896450042725, + "learning_rate": 6.016621324992566e-07, + "loss": 0.9613, + "step": 10987 + }, + { + "epoch": 0.8470551958063521, + "grad_norm": 4.0961408615112305, + "learning_rate": 6.010685217512647e-07, + "loss": 0.9912, + "step": 10988 + }, + { + "epoch": 0.847132284921369, + "grad_norm": 3.716765880584717, + "learning_rate": 6.004751852538682e-07, + "loss": 0.9723, + "step": 10989 + }, + { + "epoch": 0.8472093740363861, + "grad_norm": 3.594797372817993, + "learning_rate": 5.998821230440588e-07, + "loss": 0.8827, + "step": 10990 + }, + { + "epoch": 0.847286463151403, + "grad_norm": 4.180978298187256, + "learning_rate": 5.992893351588097e-07, + "loss": 1.004, + "step": 10991 + }, + { + "epoch": 0.84736355226642, + "grad_norm": 3.7273788452148438, + "learning_rate": 5.986968216350786e-07, + "loss": 0.859, + "step": 10992 + }, + { + "epoch": 0.8474406413814369, + "grad_norm": 3.659938097000122, + "learning_rate": 5.981045825098053e-07, + "loss": 0.9083, + "step": 10993 + }, + { + "epoch": 0.8475177304964538, + "grad_norm": 4.153972625732422, + "learning_rate": 5.975126178199136e-07, + "loss": 0.8852, + "step": 10994 + }, + { + "epoch": 0.8475948196114709, + "grad_norm": 3.5449161529541016, + "learning_rate": 5.969209276023091e-07, + "loss": 0.7538, + "step": 10995 + }, + { + "epoch": 0.8476719087264878, + "grad_norm": 3.8162269592285156, + "learning_rate": 5.963295118938816e-07, + "loss": 0.8447, + "step": 10996 + }, + { + "epoch": 0.8477489978415048, + "grad_norm": 3.6209535598754883, + "learning_rate": 5.957383707315006e-07, + "loss": 0.9266, + "step": 10997 + }, + { + "epoch": 0.8478260869565217, + "grad_norm": 3.3889384269714355, + "learning_rate": 5.951475041520222e-07, + "loss": 0.7844, + "step": 10998 + }, + { + "epoch": 0.8479031760715386, + "grad_norm": 3.7911598682403564, + "learning_rate": 5.945569121922834e-07, + "loss": 0.876, + "step": 10999 + }, + { + "epoch": 0.8479802651865557, + "grad_norm": 3.6760973930358887, + "learning_rate": 5.939665948891049e-07, + "loss": 0.9148, + "step": 11000 + }, + { + "epoch": 0.8480573543015726, + "grad_norm": 3.681337833404541, + "learning_rate": 5.9337655227929e-07, + "loss": 0.9059, + "step": 11001 + }, + { + "epoch": 0.8481344434165896, + "grad_norm": 3.6630451679229736, + "learning_rate": 5.927867843996243e-07, + "loss": 0.8611, + "step": 11002 + }, + { + "epoch": 0.8482115325316065, + "grad_norm": 3.8612163066864014, + "learning_rate": 5.921972912868768e-07, + "loss": 0.9276, + "step": 11003 + }, + { + "epoch": 0.8482886216466234, + "grad_norm": 3.941199541091919, + "learning_rate": 5.916080729778e-07, + "loss": 0.9488, + "step": 11004 + }, + { + "epoch": 0.8483657107616405, + "grad_norm": 3.7736704349517822, + "learning_rate": 5.910191295091289e-07, + "loss": 0.8789, + "step": 11005 + }, + { + "epoch": 0.8484427998766574, + "grad_norm": 3.812868595123291, + "learning_rate": 5.904304609175798e-07, + "loss": 0.9216, + "step": 11006 + }, + { + "epoch": 0.8485198889916744, + "grad_norm": 3.7537074089050293, + "learning_rate": 5.89842067239853e-07, + "loss": 0.8657, + "step": 11007 + }, + { + "epoch": 0.8485969781066913, + "grad_norm": 3.538891553878784, + "learning_rate": 5.892539485126331e-07, + "loss": 0.9183, + "step": 11008 + }, + { + "epoch": 0.8486740672217082, + "grad_norm": 3.7746341228485107, + "learning_rate": 5.886661047725856e-07, + "loss": 0.9663, + "step": 11009 + }, + { + "epoch": 0.8487511563367253, + "grad_norm": 4.052391529083252, + "learning_rate": 5.880785360563596e-07, + "loss": 0.9895, + "step": 11010 + }, + { + "epoch": 0.8488282454517422, + "grad_norm": 3.68890643119812, + "learning_rate": 5.87491242400588e-07, + "loss": 0.836, + "step": 11011 + }, + { + "epoch": 0.8489053345667592, + "grad_norm": 3.5953805446624756, + "learning_rate": 5.869042238418832e-07, + "loss": 0.9303, + "step": 11012 + }, + { + "epoch": 0.8489824236817761, + "grad_norm": 3.813616991043091, + "learning_rate": 5.863174804168442e-07, + "loss": 0.9442, + "step": 11013 + }, + { + "epoch": 0.8490595127967931, + "grad_norm": 3.9077954292297363, + "learning_rate": 5.857310121620513e-07, + "loss": 0.9025, + "step": 11014 + }, + { + "epoch": 0.8491366019118101, + "grad_norm": 4.491962909698486, + "learning_rate": 5.851448191140674e-07, + "loss": 0.9682, + "step": 11015 + }, + { + "epoch": 0.849213691026827, + "grad_norm": 3.854057550430298, + "learning_rate": 5.845589013094405e-07, + "loss": 0.9459, + "step": 11016 + }, + { + "epoch": 0.849290780141844, + "grad_norm": 3.7456696033477783, + "learning_rate": 5.839732587846963e-07, + "loss": 0.9548, + "step": 11017 + }, + { + "epoch": 0.8493678692568609, + "grad_norm": 3.6205060482025146, + "learning_rate": 5.833878915763485e-07, + "loss": 0.9676, + "step": 11018 + }, + { + "epoch": 0.849444958371878, + "grad_norm": 3.759831428527832, + "learning_rate": 5.828027997208918e-07, + "loss": 0.8192, + "step": 11019 + }, + { + "epoch": 0.8495220474868949, + "grad_norm": 3.9140279293060303, + "learning_rate": 5.822179832548025e-07, + "loss": 0.9411, + "step": 11020 + }, + { + "epoch": 0.8495991366019118, + "grad_norm": 3.6115550994873047, + "learning_rate": 5.81633442214542e-07, + "loss": 0.8644, + "step": 11021 + }, + { + "epoch": 0.8496762257169288, + "grad_norm": 3.4248645305633545, + "learning_rate": 5.810491766365545e-07, + "loss": 0.897, + "step": 11022 + }, + { + "epoch": 0.8497533148319457, + "grad_norm": 4.218714237213135, + "learning_rate": 5.80465186557263e-07, + "loss": 0.9441, + "step": 11023 + }, + { + "epoch": 0.8498304039469627, + "grad_norm": 3.3824057579040527, + "learning_rate": 5.798814720130779e-07, + "loss": 0.8109, + "step": 11024 + }, + { + "epoch": 0.8499074930619797, + "grad_norm": 3.555572032928467, + "learning_rate": 5.792980330403908e-07, + "loss": 0.9638, + "step": 11025 + }, + { + "epoch": 0.8499845821769966, + "grad_norm": 3.7353179454803467, + "learning_rate": 5.787148696755757e-07, + "loss": 0.997, + "step": 11026 + }, + { + "epoch": 0.8500616712920136, + "grad_norm": 3.5979185104370117, + "learning_rate": 5.781319819549913e-07, + "loss": 0.8548, + "step": 11027 + }, + { + "epoch": 0.8501387604070305, + "grad_norm": 3.9905002117156982, + "learning_rate": 5.775493699149754e-07, + "loss": 0.9858, + "step": 11028 + }, + { + "epoch": 0.8502158495220475, + "grad_norm": 3.6577179431915283, + "learning_rate": 5.769670335918515e-07, + "loss": 0.9216, + "step": 11029 + }, + { + "epoch": 0.8502929386370645, + "grad_norm": 4.757742404937744, + "learning_rate": 5.763849730219257e-07, + "loss": 1.0233, + "step": 11030 + }, + { + "epoch": 0.8503700277520814, + "grad_norm": 3.6999893188476562, + "learning_rate": 5.758031882414861e-07, + "loss": 0.9356, + "step": 11031 + }, + { + "epoch": 0.8504471168670984, + "grad_norm": 4.301717281341553, + "learning_rate": 5.752216792868048e-07, + "loss": 1.0229, + "step": 11032 + }, + { + "epoch": 0.8505242059821153, + "grad_norm": 3.8095266819000244, + "learning_rate": 5.746404461941358e-07, + "loss": 0.965, + "step": 11033 + }, + { + "epoch": 0.8506012950971323, + "grad_norm": 3.8716020584106445, + "learning_rate": 5.740594889997147e-07, + "loss": 0.8976, + "step": 11034 + }, + { + "epoch": 0.8506783842121493, + "grad_norm": 3.362471103668213, + "learning_rate": 5.734788077397618e-07, + "loss": 0.8177, + "step": 11035 + }, + { + "epoch": 0.8507554733271662, + "grad_norm": 3.7064013481140137, + "learning_rate": 5.728984024504796e-07, + "loss": 0.9328, + "step": 11036 + }, + { + "epoch": 0.8508325624421832, + "grad_norm": 3.902492046356201, + "learning_rate": 5.723182731680538e-07, + "loss": 1.0963, + "step": 11037 + }, + { + "epoch": 0.8509096515572001, + "grad_norm": 3.7971115112304688, + "learning_rate": 5.717384199286529e-07, + "loss": 0.9039, + "step": 11038 + }, + { + "epoch": 0.8509867406722171, + "grad_norm": 3.7627763748168945, + "learning_rate": 5.711588427684262e-07, + "loss": 0.9493, + "step": 11039 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 3.705435037612915, + "learning_rate": 5.705795417235077e-07, + "loss": 0.9674, + "step": 11040 + }, + { + "epoch": 0.851140918902251, + "grad_norm": 4.405561447143555, + "learning_rate": 5.700005168300144e-07, + "loss": 0.9634, + "step": 11041 + }, + { + "epoch": 0.851218008017268, + "grad_norm": 3.6750857830047607, + "learning_rate": 5.694217681240455e-07, + "loss": 0.9182, + "step": 11042 + }, + { + "epoch": 0.8512950971322849, + "grad_norm": 3.666693925857544, + "learning_rate": 5.688432956416823e-07, + "loss": 0.8737, + "step": 11043 + }, + { + "epoch": 0.8513721862473019, + "grad_norm": 3.796339988708496, + "learning_rate": 5.682650994189915e-07, + "loss": 0.9079, + "step": 11044 + }, + { + "epoch": 0.8514492753623188, + "grad_norm": 3.8772926330566406, + "learning_rate": 5.676871794920169e-07, + "loss": 0.8889, + "step": 11045 + }, + { + "epoch": 0.8515263644773358, + "grad_norm": 3.8630027770996094, + "learning_rate": 5.671095358967926e-07, + "loss": 0.9146, + "step": 11046 + }, + { + "epoch": 0.8516034535923528, + "grad_norm": 3.968269109725952, + "learning_rate": 5.665321686693298e-07, + "loss": 0.9438, + "step": 11047 + }, + { + "epoch": 0.8516805427073697, + "grad_norm": 3.7893552780151367, + "learning_rate": 5.659550778456258e-07, + "loss": 0.9303, + "step": 11048 + }, + { + "epoch": 0.8517576318223867, + "grad_norm": 3.5288009643554688, + "learning_rate": 5.653782634616573e-07, + "loss": 0.8899, + "step": 11049 + }, + { + "epoch": 0.8518347209374036, + "grad_norm": 4.320144176483154, + "learning_rate": 5.648017255533866e-07, + "loss": 0.9022, + "step": 11050 + }, + { + "epoch": 0.8519118100524206, + "grad_norm": 3.9753165245056152, + "learning_rate": 5.642254641567579e-07, + "loss": 0.8956, + "step": 11051 + }, + { + "epoch": 0.8519888991674376, + "grad_norm": 3.734354257583618, + "learning_rate": 5.636494793076974e-07, + "loss": 0.9239, + "step": 11052 + }, + { + "epoch": 0.8520659882824545, + "grad_norm": 3.6947646141052246, + "learning_rate": 5.630737710421174e-07, + "loss": 0.9127, + "step": 11053 + }, + { + "epoch": 0.8521430773974715, + "grad_norm": 3.678510904312134, + "learning_rate": 5.624983393959066e-07, + "loss": 0.8578, + "step": 11054 + }, + { + "epoch": 0.8522201665124884, + "grad_norm": 3.4292240142822266, + "learning_rate": 5.619231844049422e-07, + "loss": 0.9145, + "step": 11055 + }, + { + "epoch": 0.8522972556275054, + "grad_norm": 4.060187816619873, + "learning_rate": 5.613483061050818e-07, + "loss": 0.9417, + "step": 11056 + }, + { + "epoch": 0.8523743447425224, + "grad_norm": 3.9965662956237793, + "learning_rate": 5.607737045321666e-07, + "loss": 0.7447, + "step": 11057 + }, + { + "epoch": 0.8524514338575393, + "grad_norm": 3.6497106552124023, + "learning_rate": 5.60199379722019e-07, + "loss": 0.8786, + "step": 11058 + }, + { + "epoch": 0.8525285229725563, + "grad_norm": 3.614408493041992, + "learning_rate": 5.596253317104472e-07, + "loss": 0.8301, + "step": 11059 + }, + { + "epoch": 0.8526056120875732, + "grad_norm": 4.088712692260742, + "learning_rate": 5.590515605332369e-07, + "loss": 1.0084, + "step": 11060 + }, + { + "epoch": 0.8526827012025902, + "grad_norm": 3.6503963470458984, + "learning_rate": 5.584780662261624e-07, + "loss": 0.8587, + "step": 11061 + }, + { + "epoch": 0.8527597903176072, + "grad_norm": 3.714747667312622, + "learning_rate": 5.579048488249766e-07, + "loss": 0.9238, + "step": 11062 + }, + { + "epoch": 0.8528368794326241, + "grad_norm": 4.048036098480225, + "learning_rate": 5.573319083654177e-07, + "loss": 0.953, + "step": 11063 + }, + { + "epoch": 0.8529139685476411, + "grad_norm": 3.8897385597229004, + "learning_rate": 5.56759244883206e-07, + "loss": 0.8999, + "step": 11064 + }, + { + "epoch": 0.852991057662658, + "grad_norm": 4.280935287475586, + "learning_rate": 5.561868584140423e-07, + "loss": 0.8316, + "step": 11065 + }, + { + "epoch": 0.853068146777675, + "grad_norm": 3.4700443744659424, + "learning_rate": 5.556147489936131e-07, + "loss": 0.8984, + "step": 11066 + }, + { + "epoch": 0.853145235892692, + "grad_norm": 3.7145278453826904, + "learning_rate": 5.55042916657586e-07, + "loss": 0.911, + "step": 11067 + }, + { + "epoch": 0.8532223250077089, + "grad_norm": 3.821202278137207, + "learning_rate": 5.54471361441612e-07, + "loss": 1.0008, + "step": 11068 + }, + { + "epoch": 0.8532994141227259, + "grad_norm": 3.5977516174316406, + "learning_rate": 5.539000833813246e-07, + "loss": 0.9296, + "step": 11069 + }, + { + "epoch": 0.8533765032377428, + "grad_norm": 3.5324435234069824, + "learning_rate": 5.533290825123411e-07, + "loss": 0.7846, + "step": 11070 + }, + { + "epoch": 0.8534535923527597, + "grad_norm": 3.96653151512146, + "learning_rate": 5.527583588702584e-07, + "loss": 0.9285, + "step": 11071 + }, + { + "epoch": 0.8535306814677768, + "grad_norm": 4.007519721984863, + "learning_rate": 5.521879124906587e-07, + "loss": 0.961, + "step": 11072 + }, + { + "epoch": 0.8536077705827937, + "grad_norm": 3.8947818279266357, + "learning_rate": 5.516177434091074e-07, + "loss": 0.8486, + "step": 11073 + }, + { + "epoch": 0.8536848596978107, + "grad_norm": 3.4934535026550293, + "learning_rate": 5.510478516611512e-07, + "loss": 0.8873, + "step": 11074 + }, + { + "epoch": 0.8537619488128276, + "grad_norm": 3.4623193740844727, + "learning_rate": 5.504782372823203e-07, + "loss": 0.9679, + "step": 11075 + }, + { + "epoch": 0.8538390379278445, + "grad_norm": 3.704373836517334, + "learning_rate": 5.499089003081259e-07, + "loss": 0.8837, + "step": 11076 + }, + { + "epoch": 0.8539161270428616, + "grad_norm": 3.9960663318634033, + "learning_rate": 5.493398407740641e-07, + "loss": 0.9834, + "step": 11077 + }, + { + "epoch": 0.8539932161578785, + "grad_norm": 4.131714820861816, + "learning_rate": 5.48771058715612e-07, + "loss": 1.0009, + "step": 11078 + }, + { + "epoch": 0.8540703052728955, + "grad_norm": 3.8496928215026855, + "learning_rate": 5.482025541682312e-07, + "loss": 0.902, + "step": 11079 + }, + { + "epoch": 0.8541473943879124, + "grad_norm": 3.6283254623413086, + "learning_rate": 5.476343271673651e-07, + "loss": 0.9259, + "step": 11080 + }, + { + "epoch": 0.8542244835029293, + "grad_norm": 3.9260733127593994, + "learning_rate": 5.470663777484402e-07, + "loss": 0.9205, + "step": 11081 + }, + { + "epoch": 0.8543015726179464, + "grad_norm": 3.7307844161987305, + "learning_rate": 5.464987059468629e-07, + "loss": 0.9092, + "step": 11082 + }, + { + "epoch": 0.8543786617329633, + "grad_norm": 3.9613077640533447, + "learning_rate": 5.459313117980264e-07, + "loss": 0.9234, + "step": 11083 + }, + { + "epoch": 0.8544557508479803, + "grad_norm": 3.670541524887085, + "learning_rate": 5.45364195337304e-07, + "loss": 0.9093, + "step": 11084 + }, + { + "epoch": 0.8545328399629972, + "grad_norm": 3.992414951324463, + "learning_rate": 5.447973566000531e-07, + "loss": 0.9303, + "step": 11085 + }, + { + "epoch": 0.8546099290780141, + "grad_norm": 3.7107622623443604, + "learning_rate": 5.442307956216142e-07, + "loss": 0.8985, + "step": 11086 + }, + { + "epoch": 0.8546870181930312, + "grad_norm": 4.1595354080200195, + "learning_rate": 5.436645124373069e-07, + "loss": 0.906, + "step": 11087 + }, + { + "epoch": 0.8547641073080481, + "grad_norm": 3.343167543411255, + "learning_rate": 5.430985070824374e-07, + "loss": 0.7814, + "step": 11088 + }, + { + "epoch": 0.8548411964230651, + "grad_norm": 3.637519598007202, + "learning_rate": 5.42532779592293e-07, + "loss": 0.8609, + "step": 11089 + }, + { + "epoch": 0.854918285538082, + "grad_norm": 3.568153142929077, + "learning_rate": 5.419673300021427e-07, + "loss": 0.9346, + "step": 11090 + }, + { + "epoch": 0.8549953746530989, + "grad_norm": 3.780703067779541, + "learning_rate": 5.414021583472429e-07, + "loss": 0.9486, + "step": 11091 + }, + { + "epoch": 0.855072463768116, + "grad_norm": 3.555079460144043, + "learning_rate": 5.408372646628257e-07, + "loss": 0.8887, + "step": 11092 + }, + { + "epoch": 0.8551495528831329, + "grad_norm": 3.8127081394195557, + "learning_rate": 5.40272648984111e-07, + "loss": 0.8914, + "step": 11093 + }, + { + "epoch": 0.8552266419981499, + "grad_norm": 3.8799734115600586, + "learning_rate": 5.397083113462986e-07, + "loss": 0.8731, + "step": 11094 + }, + { + "epoch": 0.8553037311131668, + "grad_norm": 3.8874175548553467, + "learning_rate": 5.391442517845724e-07, + "loss": 0.8681, + "step": 11095 + }, + { + "epoch": 0.8553808202281837, + "grad_norm": 3.862079381942749, + "learning_rate": 5.385804703341002e-07, + "loss": 0.895, + "step": 11096 + }, + { + "epoch": 0.8554579093432008, + "grad_norm": 3.89817476272583, + "learning_rate": 5.380169670300284e-07, + "loss": 0.8392, + "step": 11097 + }, + { + "epoch": 0.8555349984582177, + "grad_norm": 3.441835403442383, + "learning_rate": 5.374537419074893e-07, + "loss": 0.8703, + "step": 11098 + }, + { + "epoch": 0.8556120875732347, + "grad_norm": 3.845632314682007, + "learning_rate": 5.368907950015972e-07, + "loss": 0.8489, + "step": 11099 + }, + { + "epoch": 0.8556891766882516, + "grad_norm": 3.8767735958099365, + "learning_rate": 5.36328126347449e-07, + "loss": 0.8637, + "step": 11100 + }, + { + "epoch": 0.8557662658032685, + "grad_norm": 3.9100561141967773, + "learning_rate": 5.357657359801249e-07, + "loss": 0.951, + "step": 11101 + }, + { + "epoch": 0.8558433549182856, + "grad_norm": 3.784698486328125, + "learning_rate": 5.352036239346858e-07, + "loss": 0.9673, + "step": 11102 + }, + { + "epoch": 0.8559204440333025, + "grad_norm": 3.8566789627075195, + "learning_rate": 5.34641790246177e-07, + "loss": 0.8962, + "step": 11103 + }, + { + "epoch": 0.8559975331483195, + "grad_norm": 3.3040380477905273, + "learning_rate": 5.340802349496254e-07, + "loss": 0.8545, + "step": 11104 + }, + { + "epoch": 0.8560746222633364, + "grad_norm": 3.585635185241699, + "learning_rate": 5.335189580800421e-07, + "loss": 0.8885, + "step": 11105 + }, + { + "epoch": 0.8561517113783533, + "grad_norm": 3.8033549785614014, + "learning_rate": 5.329579596724188e-07, + "loss": 0.8385, + "step": 11106 + }, + { + "epoch": 0.8562288004933704, + "grad_norm": 3.958906888961792, + "learning_rate": 5.323972397617327e-07, + "loss": 0.9254, + "step": 11107 + }, + { + "epoch": 0.8563058896083873, + "grad_norm": 3.711397647857666, + "learning_rate": 5.318367983829393e-07, + "loss": 0.9813, + "step": 11108 + }, + { + "epoch": 0.8563829787234043, + "grad_norm": 3.6522676944732666, + "learning_rate": 5.312766355709803e-07, + "loss": 0.8936, + "step": 11109 + }, + { + "epoch": 0.8564600678384212, + "grad_norm": 4.029238700866699, + "learning_rate": 5.307167513607786e-07, + "loss": 1.009, + "step": 11110 + }, + { + "epoch": 0.8565371569534381, + "grad_norm": 3.7067220211029053, + "learning_rate": 5.301571457872407e-07, + "loss": 0.9072, + "step": 11111 + }, + { + "epoch": 0.8566142460684552, + "grad_norm": 4.054270267486572, + "learning_rate": 5.295978188852557e-07, + "loss": 1.0588, + "step": 11112 + }, + { + "epoch": 0.8566913351834721, + "grad_norm": 3.697340250015259, + "learning_rate": 5.290387706896933e-07, + "loss": 0.9871, + "step": 11113 + }, + { + "epoch": 0.8567684242984891, + "grad_norm": 4.2902092933654785, + "learning_rate": 5.284800012354075e-07, + "loss": 0.8552, + "step": 11114 + }, + { + "epoch": 0.856845513413506, + "grad_norm": 3.741992950439453, + "learning_rate": 5.279215105572355e-07, + "loss": 0.8789, + "step": 11115 + }, + { + "epoch": 0.8569226025285229, + "grad_norm": 3.8144991397857666, + "learning_rate": 5.273632986899951e-07, + "loss": 0.8931, + "step": 11116 + }, + { + "epoch": 0.85699969164354, + "grad_norm": 3.5984315872192383, + "learning_rate": 5.268053656684891e-07, + "loss": 0.9031, + "step": 11117 + }, + { + "epoch": 0.8570767807585569, + "grad_norm": 4.40741491317749, + "learning_rate": 5.262477115275022e-07, + "loss": 0.9021, + "step": 11118 + }, + { + "epoch": 0.8571538698735739, + "grad_norm": 3.9442696571350098, + "learning_rate": 5.256903363017995e-07, + "loss": 1.0166, + "step": 11119 + }, + { + "epoch": 0.8572309589885908, + "grad_norm": 3.6909403800964355, + "learning_rate": 5.251332400261311e-07, + "loss": 0.8354, + "step": 11120 + }, + { + "epoch": 0.8573080481036077, + "grad_norm": 3.769570827484131, + "learning_rate": 5.24576422735229e-07, + "loss": 0.9404, + "step": 11121 + }, + { + "epoch": 0.8573851372186247, + "grad_norm": 3.7346904277801514, + "learning_rate": 5.240198844638084e-07, + "loss": 0.9651, + "step": 11122 + }, + { + "epoch": 0.8574622263336417, + "grad_norm": 3.3625786304473877, + "learning_rate": 5.234636252465675e-07, + "loss": 0.9208, + "step": 11123 + }, + { + "epoch": 0.8575393154486587, + "grad_norm": 3.8647756576538086, + "learning_rate": 5.229076451181836e-07, + "loss": 0.9616, + "step": 11124 + }, + { + "epoch": 0.8576164045636756, + "grad_norm": 3.784053325653076, + "learning_rate": 5.223519441133206e-07, + "loss": 0.9683, + "step": 11125 + }, + { + "epoch": 0.8576934936786925, + "grad_norm": 3.712662696838379, + "learning_rate": 5.217965222666239e-07, + "loss": 0.8376, + "step": 11126 + }, + { + "epoch": 0.8577705827937095, + "grad_norm": 4.219189167022705, + "learning_rate": 5.212413796127208e-07, + "loss": 1.0494, + "step": 11127 + }, + { + "epoch": 0.8578476719087265, + "grad_norm": 3.7683613300323486, + "learning_rate": 5.206865161862212e-07, + "loss": 0.9379, + "step": 11128 + }, + { + "epoch": 0.8579247610237435, + "grad_norm": 3.439199447631836, + "learning_rate": 5.201319320217196e-07, + "loss": 0.8908, + "step": 11129 + }, + { + "epoch": 0.8580018501387604, + "grad_norm": 3.727071762084961, + "learning_rate": 5.195776271537894e-07, + "loss": 0.8725, + "step": 11130 + }, + { + "epoch": 0.8580789392537773, + "grad_norm": 3.9292681217193604, + "learning_rate": 5.190236016169892e-07, + "loss": 0.8883, + "step": 11131 + }, + { + "epoch": 0.8581560283687943, + "grad_norm": 4.457842826843262, + "learning_rate": 5.184698554458595e-07, + "loss": 0.8266, + "step": 11132 + }, + { + "epoch": 0.8582331174838113, + "grad_norm": 4.050070762634277, + "learning_rate": 5.179163886749244e-07, + "loss": 1.0174, + "step": 11133 + }, + { + "epoch": 0.8583102065988283, + "grad_norm": 3.961498498916626, + "learning_rate": 5.173632013386892e-07, + "loss": 0.9666, + "step": 11134 + }, + { + "epoch": 0.8583872957138452, + "grad_norm": 3.9840786457061768, + "learning_rate": 5.168102934716419e-07, + "loss": 0.857, + "step": 11135 + }, + { + "epoch": 0.8584643848288621, + "grad_norm": 3.528966188430786, + "learning_rate": 5.162576651082541e-07, + "loss": 0.9403, + "step": 11136 + }, + { + "epoch": 0.8585414739438791, + "grad_norm": 3.7930431365966797, + "learning_rate": 5.157053162829789e-07, + "loss": 0.919, + "step": 11137 + }, + { + "epoch": 0.8586185630588961, + "grad_norm": 3.9015920162200928, + "learning_rate": 5.151532470302523e-07, + "loss": 0.9518, + "step": 11138 + }, + { + "epoch": 0.8586956521739131, + "grad_norm": 3.5874063968658447, + "learning_rate": 5.146014573844943e-07, + "loss": 0.8774, + "step": 11139 + }, + { + "epoch": 0.85877274128893, + "grad_norm": 3.664182662963867, + "learning_rate": 5.140499473801036e-07, + "loss": 0.9138, + "step": 11140 + }, + { + "epoch": 0.8588498304039469, + "grad_norm": 3.809180736541748, + "learning_rate": 5.134987170514654e-07, + "loss": 0.8837, + "step": 11141 + }, + { + "epoch": 0.8589269195189639, + "grad_norm": 4.012628078460693, + "learning_rate": 5.129477664329463e-07, + "loss": 0.9214, + "step": 11142 + }, + { + "epoch": 0.8590040086339809, + "grad_norm": 3.4249749183654785, + "learning_rate": 5.123970955588947e-07, + "loss": 0.6997, + "step": 11143 + }, + { + "epoch": 0.8590810977489979, + "grad_norm": 3.523764133453369, + "learning_rate": 5.118467044636438e-07, + "loss": 0.8808, + "step": 11144 + }, + { + "epoch": 0.8591581868640148, + "grad_norm": 3.6512789726257324, + "learning_rate": 5.112965931815045e-07, + "loss": 0.9734, + "step": 11145 + }, + { + "epoch": 0.8592352759790317, + "grad_norm": 3.7311131954193115, + "learning_rate": 5.107467617467754e-07, + "loss": 0.961, + "step": 11146 + }, + { + "epoch": 0.8593123650940487, + "grad_norm": 3.953731060028076, + "learning_rate": 5.101972101937352e-07, + "loss": 0.9679, + "step": 11147 + }, + { + "epoch": 0.8593894542090657, + "grad_norm": 3.7409465312957764, + "learning_rate": 5.096479385566455e-07, + "loss": 0.9236, + "step": 11148 + }, + { + "epoch": 0.8594665433240827, + "grad_norm": 3.9198498725891113, + "learning_rate": 5.090989468697515e-07, + "loss": 0.9617, + "step": 11149 + }, + { + "epoch": 0.8595436324390996, + "grad_norm": 3.991605520248413, + "learning_rate": 5.085502351672788e-07, + "loss": 0.8857, + "step": 11150 + }, + { + "epoch": 0.8596207215541165, + "grad_norm": 3.804913282394409, + "learning_rate": 5.080018034834367e-07, + "loss": 0.9456, + "step": 11151 + }, + { + "epoch": 0.8596978106691335, + "grad_norm": 3.8616576194763184, + "learning_rate": 5.074536518524176e-07, + "loss": 0.9924, + "step": 11152 + }, + { + "epoch": 0.8597748997841504, + "grad_norm": 3.636204957962036, + "learning_rate": 5.06905780308396e-07, + "loss": 1.0058, + "step": 11153 + }, + { + "epoch": 0.8598519888991675, + "grad_norm": 4.016709327697754, + "learning_rate": 5.063581888855285e-07, + "loss": 0.9465, + "step": 11154 + }, + { + "epoch": 0.8599290780141844, + "grad_norm": 4.032883644104004, + "learning_rate": 5.058108776179555e-07, + "loss": 0.9796, + "step": 11155 + }, + { + "epoch": 0.8600061671292013, + "grad_norm": 3.659719705581665, + "learning_rate": 5.05263846539798e-07, + "loss": 0.9076, + "step": 11156 + }, + { + "epoch": 0.8600832562442183, + "grad_norm": 4.1367950439453125, + "learning_rate": 5.0471709568516e-07, + "loss": 0.9214, + "step": 11157 + }, + { + "epoch": 0.8601603453592352, + "grad_norm": 4.058650493621826, + "learning_rate": 5.041706250881301e-07, + "loss": 1.0593, + "step": 11158 + }, + { + "epoch": 0.8602374344742523, + "grad_norm": 3.848299980163574, + "learning_rate": 5.036244347827768e-07, + "loss": 0.8446, + "step": 11159 + }, + { + "epoch": 0.8603145235892692, + "grad_norm": 3.458164930343628, + "learning_rate": 5.030785248031534e-07, + "loss": 0.877, + "step": 11160 + }, + { + "epoch": 0.8603916127042861, + "grad_norm": 3.6949539184570312, + "learning_rate": 5.025328951832936e-07, + "loss": 0.8685, + "step": 11161 + }, + { + "epoch": 0.8604687018193031, + "grad_norm": 4.433669090270996, + "learning_rate": 5.019875459572143e-07, + "loss": 0.9717, + "step": 11162 + }, + { + "epoch": 0.86054579093432, + "grad_norm": 3.662029266357422, + "learning_rate": 5.014424771589155e-07, + "loss": 0.8557, + "step": 11163 + }, + { + "epoch": 0.8606228800493371, + "grad_norm": 3.8234245777130127, + "learning_rate": 5.008976888223799e-07, + "loss": 0.9186, + "step": 11164 + }, + { + "epoch": 0.860699969164354, + "grad_norm": 3.7250325679779053, + "learning_rate": 5.003531809815721e-07, + "loss": 0.9464, + "step": 11165 + }, + { + "epoch": 0.8607770582793709, + "grad_norm": 3.8604021072387695, + "learning_rate": 4.998089536704399e-07, + "loss": 1.0107, + "step": 11166 + }, + { + "epoch": 0.8608541473943879, + "grad_norm": 3.525672674179077, + "learning_rate": 4.992650069229116e-07, + "loss": 0.9176, + "step": 11167 + }, + { + "epoch": 0.8609312365094048, + "grad_norm": 3.9230077266693115, + "learning_rate": 4.987213407729002e-07, + "loss": 0.8406, + "step": 11168 + }, + { + "epoch": 0.8610083256244219, + "grad_norm": 3.6223180294036865, + "learning_rate": 4.981779552543004e-07, + "loss": 0.8505, + "step": 11169 + }, + { + "epoch": 0.8610854147394388, + "grad_norm": 4.130772590637207, + "learning_rate": 4.976348504009899e-07, + "loss": 0.9977, + "step": 11170 + }, + { + "epoch": 0.8611625038544557, + "grad_norm": 3.5042903423309326, + "learning_rate": 4.970920262468282e-07, + "loss": 0.8921, + "step": 11171 + }, + { + "epoch": 0.8612395929694727, + "grad_norm": 4.355722427368164, + "learning_rate": 4.965494828256573e-07, + "loss": 0.8054, + "step": 11172 + }, + { + "epoch": 0.8613166820844896, + "grad_norm": 3.7091667652130127, + "learning_rate": 4.960072201713018e-07, + "loss": 0.8937, + "step": 11173 + }, + { + "epoch": 0.8613937711995067, + "grad_norm": 4.190767288208008, + "learning_rate": 4.954652383175696e-07, + "loss": 0.9878, + "step": 11174 + }, + { + "epoch": 0.8614708603145236, + "grad_norm": 3.6260433197021484, + "learning_rate": 4.949235372982503e-07, + "loss": 0.9711, + "step": 11175 + }, + { + "epoch": 0.8615479494295405, + "grad_norm": 3.458486318588257, + "learning_rate": 4.943821171471158e-07, + "loss": 0.8357, + "step": 11176 + }, + { + "epoch": 0.8616250385445575, + "grad_norm": 3.68377423286438, + "learning_rate": 4.93840977897922e-07, + "loss": 0.8426, + "step": 11177 + }, + { + "epoch": 0.8617021276595744, + "grad_norm": 3.642500877380371, + "learning_rate": 4.933001195844034e-07, + "loss": 0.8853, + "step": 11178 + }, + { + "epoch": 0.8617792167745915, + "grad_norm": 3.628203868865967, + "learning_rate": 4.927595422402826e-07, + "loss": 0.9523, + "step": 11179 + }, + { + "epoch": 0.8618563058896084, + "grad_norm": 3.9804065227508545, + "learning_rate": 4.922192458992609e-07, + "loss": 1.0091, + "step": 11180 + }, + { + "epoch": 0.8619333950046254, + "grad_norm": 4.004647731781006, + "learning_rate": 4.916792305950235e-07, + "loss": 1.0064, + "step": 11181 + }, + { + "epoch": 0.8620104841196423, + "grad_norm": 3.7954509258270264, + "learning_rate": 4.911394963612359e-07, + "loss": 0.919, + "step": 11182 + }, + { + "epoch": 0.8620875732346592, + "grad_norm": 3.5825188159942627, + "learning_rate": 4.906000432315489e-07, + "loss": 0.9094, + "step": 11183 + }, + { + "epoch": 0.8621646623496763, + "grad_norm": 3.406367778778076, + "learning_rate": 4.900608712395943e-07, + "loss": 0.917, + "step": 11184 + }, + { + "epoch": 0.8622417514646932, + "grad_norm": 3.9264237880706787, + "learning_rate": 4.895219804189871e-07, + "loss": 0.8757, + "step": 11185 + }, + { + "epoch": 0.8623188405797102, + "grad_norm": 3.8309881687164307, + "learning_rate": 4.889833708033248e-07, + "loss": 0.9881, + "step": 11186 + }, + { + "epoch": 0.8623959296947271, + "grad_norm": 3.63254714012146, + "learning_rate": 4.884450424261849e-07, + "loss": 0.8489, + "step": 11187 + }, + { + "epoch": 0.862473018809744, + "grad_norm": 3.7826976776123047, + "learning_rate": 4.879069953211312e-07, + "loss": 0.9321, + "step": 11188 + }, + { + "epoch": 0.8625501079247611, + "grad_norm": 3.793102979660034, + "learning_rate": 4.873692295217076e-07, + "loss": 0.9313, + "step": 11189 + }, + { + "epoch": 0.862627197039778, + "grad_norm": 3.558684825897217, + "learning_rate": 4.868317450614407e-07, + "loss": 0.9335, + "step": 11190 + }, + { + "epoch": 0.862704286154795, + "grad_norm": 4.0856194496154785, + "learning_rate": 4.862945419738401e-07, + "loss": 0.9869, + "step": 11191 + }, + { + "epoch": 0.8627813752698119, + "grad_norm": 3.5604751110076904, + "learning_rate": 4.857576202923986e-07, + "loss": 0.8317, + "step": 11192 + }, + { + "epoch": 0.8628584643848288, + "grad_norm": 3.5392532348632812, + "learning_rate": 4.852209800505892e-07, + "loss": 0.8233, + "step": 11193 + }, + { + "epoch": 0.8629355534998459, + "grad_norm": 3.6630494594573975, + "learning_rate": 4.846846212818684e-07, + "loss": 0.8782, + "step": 11194 + }, + { + "epoch": 0.8630126426148628, + "grad_norm": 3.6344716548919678, + "learning_rate": 4.841485440196763e-07, + "loss": 0.8732, + "step": 11195 + }, + { + "epoch": 0.8630897317298798, + "grad_norm": 3.5769948959350586, + "learning_rate": 4.836127482974346e-07, + "loss": 0.8491, + "step": 11196 + }, + { + "epoch": 0.8631668208448967, + "grad_norm": 3.778287649154663, + "learning_rate": 4.830772341485479e-07, + "loss": 0.9085, + "step": 11197 + }, + { + "epoch": 0.8632439099599136, + "grad_norm": 3.6463840007781982, + "learning_rate": 4.825420016064009e-07, + "loss": 0.9255, + "step": 11198 + }, + { + "epoch": 0.8633209990749307, + "grad_norm": 3.8667242527008057, + "learning_rate": 4.820070507043633e-07, + "loss": 0.931, + "step": 11199 + }, + { + "epoch": 0.8633980881899476, + "grad_norm": 3.825806140899658, + "learning_rate": 4.814723814757871e-07, + "loss": 0.9954, + "step": 11200 + }, + { + "epoch": 0.8634751773049646, + "grad_norm": 3.5652754306793213, + "learning_rate": 4.80937993954006e-07, + "loss": 0.8503, + "step": 11201 + }, + { + "epoch": 0.8635522664199815, + "grad_norm": 3.5560030937194824, + "learning_rate": 4.804038881723361e-07, + "loss": 0.8271, + "step": 11202 + }, + { + "epoch": 0.8636293555349984, + "grad_norm": 3.7864773273468018, + "learning_rate": 4.798700641640768e-07, + "loss": 0.8704, + "step": 11203 + }, + { + "epoch": 0.8637064446500154, + "grad_norm": 3.6049001216888428, + "learning_rate": 4.793365219625079e-07, + "loss": 0.9203, + "step": 11204 + }, + { + "epoch": 0.8637835337650324, + "grad_norm": 3.6420271396636963, + "learning_rate": 4.788032616008936e-07, + "loss": 0.966, + "step": 11205 + }, + { + "epoch": 0.8638606228800494, + "grad_norm": 3.835549831390381, + "learning_rate": 4.782702831124803e-07, + "loss": 0.9633, + "step": 11206 + }, + { + "epoch": 0.8639377119950663, + "grad_norm": 3.744598388671875, + "learning_rate": 4.777375865304962e-07, + "loss": 0.8078, + "step": 11207 + }, + { + "epoch": 0.8640148011100832, + "grad_norm": 3.756577491760254, + "learning_rate": 4.772051718881532e-07, + "loss": 0.9566, + "step": 11208 + }, + { + "epoch": 0.8640918902251002, + "grad_norm": 3.558971643447876, + "learning_rate": 4.766730392186425e-07, + "loss": 0.9583, + "step": 11209 + }, + { + "epoch": 0.8641689793401172, + "grad_norm": 3.711453676223755, + "learning_rate": 4.761411885551409e-07, + "loss": 0.9485, + "step": 11210 + }, + { + "epoch": 0.8642460684551342, + "grad_norm": 3.7305915355682373, + "learning_rate": 4.7560961993080636e-07, + "loss": 0.9207, + "step": 11211 + }, + { + "epoch": 0.8643231575701511, + "grad_norm": 3.7739152908325195, + "learning_rate": 4.750783333787795e-07, + "loss": 0.8622, + "step": 11212 + }, + { + "epoch": 0.864400246685168, + "grad_norm": 3.8151278495788574, + "learning_rate": 4.745473289321839e-07, + "loss": 0.7792, + "step": 11213 + }, + { + "epoch": 0.864477335800185, + "grad_norm": 3.5902225971221924, + "learning_rate": 4.7401660662412477e-07, + "loss": 0.9519, + "step": 11214 + }, + { + "epoch": 0.864554424915202, + "grad_norm": 3.8135595321655273, + "learning_rate": 4.7348616648768886e-07, + "loss": 0.8904, + "step": 11215 + }, + { + "epoch": 0.864631514030219, + "grad_norm": 3.9646761417388916, + "learning_rate": 4.729560085559476e-07, + "loss": 0.8801, + "step": 11216 + }, + { + "epoch": 0.8647086031452359, + "grad_norm": 4.04872465133667, + "learning_rate": 4.7242613286195227e-07, + "loss": 1.0091, + "step": 11217 + }, + { + "epoch": 0.8647856922602528, + "grad_norm": 3.746007204055786, + "learning_rate": 4.718965394387387e-07, + "loss": 0.9377, + "step": 11218 + }, + { + "epoch": 0.8648627813752698, + "grad_norm": 3.77582049369812, + "learning_rate": 4.7136722831932546e-07, + "loss": 0.9175, + "step": 11219 + }, + { + "epoch": 0.8649398704902868, + "grad_norm": 3.808149576187134, + "learning_rate": 4.7083819953671007e-07, + "loss": 0.8849, + "step": 11220 + }, + { + "epoch": 0.8650169596053038, + "grad_norm": 3.639953851699829, + "learning_rate": 4.703094531238761e-07, + "loss": 0.8691, + "step": 11221 + }, + { + "epoch": 0.8650940487203207, + "grad_norm": 4.364095211029053, + "learning_rate": 4.6978098911378776e-07, + "loss": 0.9547, + "step": 11222 + }, + { + "epoch": 0.8651711378353376, + "grad_norm": 3.8730828762054443, + "learning_rate": 4.6925280753939097e-07, + "loss": 0.9314, + "step": 11223 + }, + { + "epoch": 0.8652482269503546, + "grad_norm": 3.9587454795837402, + "learning_rate": 4.687249084336182e-07, + "loss": 0.9972, + "step": 11224 + }, + { + "epoch": 0.8653253160653716, + "grad_norm": 3.558377504348755, + "learning_rate": 4.6819729182937865e-07, + "loss": 0.8813, + "step": 11225 + }, + { + "epoch": 0.8654024051803886, + "grad_norm": 3.674800395965576, + "learning_rate": 4.676699577595667e-07, + "loss": 0.8476, + "step": 11226 + }, + { + "epoch": 0.8654794942954055, + "grad_norm": 3.645923376083374, + "learning_rate": 4.6714290625705983e-07, + "loss": 0.9952, + "step": 11227 + }, + { + "epoch": 0.8655565834104224, + "grad_norm": 4.369297981262207, + "learning_rate": 4.666161373547162e-07, + "loss": 0.9502, + "step": 11228 + }, + { + "epoch": 0.8656336725254394, + "grad_norm": 3.8362910747528076, + "learning_rate": 4.660896510853785e-07, + "loss": 0.8966, + "step": 11229 + }, + { + "epoch": 0.8657107616404563, + "grad_norm": 4.300778388977051, + "learning_rate": 4.655634474818682e-07, + "loss": 1.024, + "step": 11230 + }, + { + "epoch": 0.8657878507554734, + "grad_norm": 3.7804079055786133, + "learning_rate": 4.650375265769924e-07, + "loss": 0.8685, + "step": 11231 + }, + { + "epoch": 0.8658649398704903, + "grad_norm": 3.908895492553711, + "learning_rate": 4.645118884035399e-07, + "loss": 0.9242, + "step": 11232 + }, + { + "epoch": 0.8659420289855072, + "grad_norm": 3.7380692958831787, + "learning_rate": 4.639865329942811e-07, + "loss": 0.8432, + "step": 11233 + }, + { + "epoch": 0.8660191181005242, + "grad_norm": 3.6968612670898438, + "learning_rate": 4.6346146038197035e-07, + "loss": 0.9012, + "step": 11234 + }, + { + "epoch": 0.8660962072155411, + "grad_norm": 3.667461395263672, + "learning_rate": 4.6293667059934154e-07, + "loss": 0.8959, + "step": 11235 + }, + { + "epoch": 0.8661732963305582, + "grad_norm": 3.7316794395446777, + "learning_rate": 4.624121636791129e-07, + "loss": 0.9219, + "step": 11236 + }, + { + "epoch": 0.8662503854455751, + "grad_norm": 3.7822744846343994, + "learning_rate": 4.6188793965398493e-07, + "loss": 0.9128, + "step": 11237 + }, + { + "epoch": 0.866327474560592, + "grad_norm": 3.7413852214813232, + "learning_rate": 4.6136399855664093e-07, + "loss": 0.9516, + "step": 11238 + }, + { + "epoch": 0.866404563675609, + "grad_norm": 3.792145013809204, + "learning_rate": 4.6084034041974533e-07, + "loss": 0.9049, + "step": 11239 + }, + { + "epoch": 0.866481652790626, + "grad_norm": 4.004141330718994, + "learning_rate": 4.603169652759465e-07, + "loss": 0.8256, + "step": 11240 + }, + { + "epoch": 0.866558741905643, + "grad_norm": 4.334702968597412, + "learning_rate": 4.5979387315787215e-07, + "loss": 0.9637, + "step": 11241 + }, + { + "epoch": 0.8666358310206599, + "grad_norm": 3.4571897983551025, + "learning_rate": 4.592710640981352e-07, + "loss": 0.8333, + "step": 11242 + }, + { + "epoch": 0.8667129201356768, + "grad_norm": 3.4184558391571045, + "learning_rate": 4.5874853812933107e-07, + "loss": 0.8278, + "step": 11243 + }, + { + "epoch": 0.8667900092506938, + "grad_norm": 3.8723700046539307, + "learning_rate": 4.582262952840355e-07, + "loss": 0.9484, + "step": 11244 + }, + { + "epoch": 0.8668670983657107, + "grad_norm": 3.8549139499664307, + "learning_rate": 4.5770433559480854e-07, + "loss": 1.0091, + "step": 11245 + }, + { + "epoch": 0.8669441874807278, + "grad_norm": 3.5513107776641846, + "learning_rate": 4.571826590941908e-07, + "loss": 0.8842, + "step": 11246 + }, + { + "epoch": 0.8670212765957447, + "grad_norm": 3.3549818992614746, + "learning_rate": 4.5666126581470625e-07, + "loss": 0.8307, + "step": 11247 + }, + { + "epoch": 0.8670983657107616, + "grad_norm": 3.714850664138794, + "learning_rate": 4.561401557888606e-07, + "loss": 0.8353, + "step": 11248 + }, + { + "epoch": 0.8671754548257786, + "grad_norm": 3.587151527404785, + "learning_rate": 4.5561932904914395e-07, + "loss": 0.9133, + "step": 11249 + }, + { + "epoch": 0.8672525439407955, + "grad_norm": 3.682015895843506, + "learning_rate": 4.5509878562802536e-07, + "loss": 0.8385, + "step": 11250 + }, + { + "epoch": 0.8673296330558126, + "grad_norm": 3.6053788661956787, + "learning_rate": 4.5457852555796044e-07, + "loss": 0.8828, + "step": 11251 + }, + { + "epoch": 0.8674067221708295, + "grad_norm": 3.85351824760437, + "learning_rate": 4.5405854887138165e-07, + "loss": 0.8888, + "step": 11252 + }, + { + "epoch": 0.8674838112858464, + "grad_norm": 4.928849220275879, + "learning_rate": 4.5353885560070867e-07, + "loss": 0.8688, + "step": 11253 + }, + { + "epoch": 0.8675609004008634, + "grad_norm": 3.783874034881592, + "learning_rate": 4.53019445778341e-07, + "loss": 0.8879, + "step": 11254 + }, + { + "epoch": 0.8676379895158803, + "grad_norm": 3.7811548709869385, + "learning_rate": 4.5250031943666174e-07, + "loss": 0.9084, + "step": 11255 + }, + { + "epoch": 0.8677150786308974, + "grad_norm": 3.439472198486328, + "learning_rate": 4.5198147660803605e-07, + "loss": 0.9078, + "step": 11256 + }, + { + "epoch": 0.8677921677459143, + "grad_norm": 3.847801923751831, + "learning_rate": 4.5146291732480975e-07, + "loss": 0.9516, + "step": 11257 + }, + { + "epoch": 0.8678692568609312, + "grad_norm": 3.690554141998291, + "learning_rate": 4.5094464161931305e-07, + "loss": 0.9484, + "step": 11258 + }, + { + "epoch": 0.8679463459759482, + "grad_norm": 3.9348044395446777, + "learning_rate": 4.5042664952385797e-07, + "loss": 0.9123, + "step": 11259 + }, + { + "epoch": 0.8680234350909651, + "grad_norm": 4.005280494689941, + "learning_rate": 4.499089410707386e-07, + "loss": 0.9793, + "step": 11260 + }, + { + "epoch": 0.8681005242059822, + "grad_norm": 4.255978584289551, + "learning_rate": 4.493915162922308e-07, + "loss": 0.8457, + "step": 11261 + }, + { + "epoch": 0.8681776133209991, + "grad_norm": 4.279727935791016, + "learning_rate": 4.4887437522059487e-07, + "loss": 0.9742, + "step": 11262 + }, + { + "epoch": 0.868254702436016, + "grad_norm": 3.771867036819458, + "learning_rate": 4.4835751788807e-07, + "loss": 0.9787, + "step": 11263 + }, + { + "epoch": 0.868331791551033, + "grad_norm": 3.949169635772705, + "learning_rate": 4.4784094432687997e-07, + "loss": 0.9254, + "step": 11264 + }, + { + "epoch": 0.8684088806660499, + "grad_norm": 4.027673244476318, + "learning_rate": 4.4732465456923113e-07, + "loss": 0.9364, + "step": 11265 + }, + { + "epoch": 0.868485969781067, + "grad_norm": 3.5182044506073, + "learning_rate": 4.468086486473111e-07, + "loss": 1.0193, + "step": 11266 + }, + { + "epoch": 0.8685630588960839, + "grad_norm": 3.757246494293213, + "learning_rate": 4.462929265932897e-07, + "loss": 0.9157, + "step": 11267 + }, + { + "epoch": 0.8686401480111008, + "grad_norm": 3.785848379135132, + "learning_rate": 4.457774884393207e-07, + "loss": 0.8675, + "step": 11268 + }, + { + "epoch": 0.8687172371261178, + "grad_norm": 4.203325271606445, + "learning_rate": 4.452623342175383e-07, + "loss": 0.9538, + "step": 11269 + }, + { + "epoch": 0.8687943262411347, + "grad_norm": 3.8824243545532227, + "learning_rate": 4.447474639600596e-07, + "loss": 0.9154, + "step": 11270 + }, + { + "epoch": 0.8688714153561518, + "grad_norm": 3.8376996517181396, + "learning_rate": 4.442328776989846e-07, + "loss": 0.9482, + "step": 11271 + }, + { + "epoch": 0.8689485044711687, + "grad_norm": 3.8418562412261963, + "learning_rate": 4.437185754663953e-07, + "loss": 0.9741, + "step": 11272 + }, + { + "epoch": 0.8690255935861856, + "grad_norm": 3.5321462154388428, + "learning_rate": 4.432045572943544e-07, + "loss": 0.885, + "step": 11273 + }, + { + "epoch": 0.8691026827012026, + "grad_norm": 4.0153374671936035, + "learning_rate": 4.4269082321490906e-07, + "loss": 0.9149, + "step": 11274 + }, + { + "epoch": 0.8691797718162195, + "grad_norm": 4.0503129959106445, + "learning_rate": 4.4217737326008814e-07, + "loss": 0.9256, + "step": 11275 + }, + { + "epoch": 0.8692568609312366, + "grad_norm": 3.9747254848480225, + "learning_rate": 4.4166420746190206e-07, + "loss": 0.9465, + "step": 11276 + }, + { + "epoch": 0.8693339500462535, + "grad_norm": 3.6550421714782715, + "learning_rate": 4.411513258523459e-07, + "loss": 0.8072, + "step": 11277 + }, + { + "epoch": 0.8694110391612704, + "grad_norm": 3.5995473861694336, + "learning_rate": 4.406387284633923e-07, + "loss": 0.7394, + "step": 11278 + }, + { + "epoch": 0.8694881282762874, + "grad_norm": 3.6607563495635986, + "learning_rate": 4.4012641532700075e-07, + "loss": 0.9314, + "step": 11279 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 3.560035228729248, + "learning_rate": 4.3961438647511066e-07, + "loss": 0.8134, + "step": 11280 + }, + { + "epoch": 0.8696423065063213, + "grad_norm": 3.919574022293091, + "learning_rate": 4.391026419396449e-07, + "loss": 0.9771, + "step": 11281 + }, + { + "epoch": 0.8697193956213383, + "grad_norm": 3.7704415321350098, + "learning_rate": 4.3859118175250905e-07, + "loss": 0.8802, + "step": 11282 + }, + { + "epoch": 0.8697964847363552, + "grad_norm": 3.845430850982666, + "learning_rate": 4.3808000594558754e-07, + "loss": 0.8919, + "step": 11283 + }, + { + "epoch": 0.8698735738513722, + "grad_norm": 3.642606735229492, + "learning_rate": 4.3756911455075155e-07, + "loss": 0.8085, + "step": 11284 + }, + { + "epoch": 0.8699506629663891, + "grad_norm": 3.7264983654022217, + "learning_rate": 4.3705850759985127e-07, + "loss": 0.9121, + "step": 11285 + }, + { + "epoch": 0.8700277520814061, + "grad_norm": 3.6813814640045166, + "learning_rate": 4.3654818512472106e-07, + "loss": 0.8442, + "step": 11286 + }, + { + "epoch": 0.8701048411964231, + "grad_norm": 4.019516468048096, + "learning_rate": 4.3603814715717674e-07, + "loss": 0.8202, + "step": 11287 + }, + { + "epoch": 0.87018193031144, + "grad_norm": 3.6544294357299805, + "learning_rate": 4.355283937290178e-07, + "loss": 0.8273, + "step": 11288 + }, + { + "epoch": 0.870259019426457, + "grad_norm": 3.649919271469116, + "learning_rate": 4.350189248720221e-07, + "loss": 0.876, + "step": 11289 + }, + { + "epoch": 0.8703361085414739, + "grad_norm": 3.7568275928497314, + "learning_rate": 4.3450974061795437e-07, + "loss": 0.9599, + "step": 11290 + }, + { + "epoch": 0.870413197656491, + "grad_norm": 3.7730679512023926, + "learning_rate": 4.3400084099855854e-07, + "loss": 0.9461, + "step": 11291 + }, + { + "epoch": 0.8704902867715079, + "grad_norm": 4.046023845672607, + "learning_rate": 4.334922260455626e-07, + "loss": 0.9321, + "step": 11292 + }, + { + "epoch": 0.8705673758865248, + "grad_norm": 3.993216037750244, + "learning_rate": 4.3298389579067677e-07, + "loss": 0.8991, + "step": 11293 + }, + { + "epoch": 0.8706444650015418, + "grad_norm": 3.6614949703216553, + "learning_rate": 4.324758502655907e-07, + "loss": 0.9043, + "step": 11294 + }, + { + "epoch": 0.8707215541165587, + "grad_norm": 3.688453197479248, + "learning_rate": 4.3196808950197954e-07, + "loss": 0.9041, + "step": 11295 + }, + { + "epoch": 0.8707986432315757, + "grad_norm": 3.937119245529175, + "learning_rate": 4.314606135314997e-07, + "loss": 0.9584, + "step": 11296 + }, + { + "epoch": 0.8708757323465927, + "grad_norm": 3.8757388591766357, + "learning_rate": 4.3095342238578974e-07, + "loss": 0.8873, + "step": 11297 + }, + { + "epoch": 0.8709528214616096, + "grad_norm": 3.7967705726623535, + "learning_rate": 4.304465160964699e-07, + "loss": 0.8722, + "step": 11298 + }, + { + "epoch": 0.8710299105766266, + "grad_norm": 3.7907609939575195, + "learning_rate": 4.299398946951444e-07, + "loss": 0.9177, + "step": 11299 + }, + { + "epoch": 0.8711069996916435, + "grad_norm": 3.48050594329834, + "learning_rate": 4.294335582133968e-07, + "loss": 0.9796, + "step": 11300 + }, + { + "epoch": 0.8711840888066605, + "grad_norm": 3.578596830368042, + "learning_rate": 4.289275066827947e-07, + "loss": 0.9109, + "step": 11301 + }, + { + "epoch": 0.8712611779216775, + "grad_norm": 3.6778552532196045, + "learning_rate": 4.284217401348889e-07, + "loss": 0.9468, + "step": 11302 + }, + { + "epoch": 0.8713382670366944, + "grad_norm": 4.028721809387207, + "learning_rate": 4.2791625860121087e-07, + "loss": 0.8429, + "step": 11303 + }, + { + "epoch": 0.8714153561517114, + "grad_norm": 3.7571585178375244, + "learning_rate": 4.2741106211327544e-07, + "loss": 0.9363, + "step": 11304 + }, + { + "epoch": 0.8714924452667283, + "grad_norm": 3.5501813888549805, + "learning_rate": 4.2690615070257737e-07, + "loss": 0.8177, + "step": 11305 + }, + { + "epoch": 0.8715695343817453, + "grad_norm": 3.985402822494507, + "learning_rate": 4.264015244005959e-07, + "loss": 0.8521, + "step": 11306 + }, + { + "epoch": 0.8716466234967623, + "grad_norm": 3.679849624633789, + "learning_rate": 4.258971832387926e-07, + "loss": 0.9388, + "step": 11307 + }, + { + "epoch": 0.8717237126117792, + "grad_norm": 3.909078359603882, + "learning_rate": 4.253931272486095e-07, + "loss": 0.9455, + "step": 11308 + }, + { + "epoch": 0.8718008017267962, + "grad_norm": 3.8097403049468994, + "learning_rate": 4.2488935646147253e-07, + "loss": 0.9776, + "step": 11309 + }, + { + "epoch": 0.8718778908418131, + "grad_norm": 3.7239317893981934, + "learning_rate": 4.2438587090879e-07, + "loss": 0.8996, + "step": 11310 + }, + { + "epoch": 0.8719549799568301, + "grad_norm": 3.8013432025909424, + "learning_rate": 4.23882670621949e-07, + "loss": 0.9272, + "step": 11311 + }, + { + "epoch": 0.872032069071847, + "grad_norm": 4.090017795562744, + "learning_rate": 4.2337975563232437e-07, + "loss": 0.8814, + "step": 11312 + }, + { + "epoch": 0.872109158186864, + "grad_norm": 3.7773220539093018, + "learning_rate": 4.2287712597126884e-07, + "loss": 0.924, + "step": 11313 + }, + { + "epoch": 0.872186247301881, + "grad_norm": 3.6810662746429443, + "learning_rate": 4.223747816701196e-07, + "loss": 0.8808, + "step": 11314 + }, + { + "epoch": 0.8722633364168979, + "grad_norm": 3.542295455932617, + "learning_rate": 4.2187272276019373e-07, + "loss": 0.7952, + "step": 11315 + }, + { + "epoch": 0.8723404255319149, + "grad_norm": 3.5852906703948975, + "learning_rate": 4.2137094927279296e-07, + "loss": 0.8106, + "step": 11316 + }, + { + "epoch": 0.8724175146469318, + "grad_norm": 3.841078281402588, + "learning_rate": 4.208694612392006e-07, + "loss": 0.9806, + "step": 11317 + }, + { + "epoch": 0.8724946037619488, + "grad_norm": 3.797222852706909, + "learning_rate": 4.2036825869068097e-07, + "loss": 0.9715, + "step": 11318 + }, + { + "epoch": 0.8725716928769658, + "grad_norm": 3.6867878437042236, + "learning_rate": 4.19867341658482e-07, + "loss": 0.7848, + "step": 11319 + }, + { + "epoch": 0.8726487819919827, + "grad_norm": 3.439988613128662, + "learning_rate": 4.1936671017383356e-07, + "loss": 0.8382, + "step": 11320 + }, + { + "epoch": 0.8727258711069997, + "grad_norm": 3.4686200618743896, + "learning_rate": 4.188663642679469e-07, + "loss": 0.7944, + "step": 11321 + }, + { + "epoch": 0.8728029602220166, + "grad_norm": 3.7739365100860596, + "learning_rate": 4.1836630397201593e-07, + "loss": 0.8806, + "step": 11322 + }, + { + "epoch": 0.8728800493370336, + "grad_norm": 4.114104747772217, + "learning_rate": 4.1786652931721683e-07, + "loss": 0.8682, + "step": 11323 + }, + { + "epoch": 0.8729571384520506, + "grad_norm": 3.7291269302368164, + "learning_rate": 4.17367040334708e-07, + "loss": 0.8022, + "step": 11324 + }, + { + "epoch": 0.8730342275670675, + "grad_norm": 4.308863162994385, + "learning_rate": 4.1686783705563115e-07, + "loss": 1.0564, + "step": 11325 + }, + { + "epoch": 0.8731113166820845, + "grad_norm": 3.66945481300354, + "learning_rate": 4.163689195111076e-07, + "loss": 0.8329, + "step": 11326 + }, + { + "epoch": 0.8731884057971014, + "grad_norm": 3.6699118614196777, + "learning_rate": 4.158702877322418e-07, + "loss": 0.8636, + "step": 11327 + }, + { + "epoch": 0.8732654949121184, + "grad_norm": 3.587113618850708, + "learning_rate": 4.153719417501223e-07, + "loss": 0.8866, + "step": 11328 + }, + { + "epoch": 0.8733425840271354, + "grad_norm": 3.8809823989868164, + "learning_rate": 4.148738815958181e-07, + "loss": 0.8977, + "step": 11329 + }, + { + "epoch": 0.8734196731421523, + "grad_norm": 3.6669769287109375, + "learning_rate": 4.14376107300381e-07, + "loss": 0.8677, + "step": 11330 + }, + { + "epoch": 0.8734967622571693, + "grad_norm": 3.631700038909912, + "learning_rate": 4.1387861889484294e-07, + "loss": 0.9199, + "step": 11331 + }, + { + "epoch": 0.8735738513721862, + "grad_norm": 4.055285930633545, + "learning_rate": 4.1338141641022125e-07, + "loss": 0.9046, + "step": 11332 + }, + { + "epoch": 0.8736509404872032, + "grad_norm": 3.6422579288482666, + "learning_rate": 4.128844998775133e-07, + "loss": 1.0013, + "step": 11333 + }, + { + "epoch": 0.8737280296022202, + "grad_norm": 3.686168909072876, + "learning_rate": 4.1238786932769947e-07, + "loss": 0.9467, + "step": 11334 + }, + { + "epoch": 0.8738051187172371, + "grad_norm": 3.7894532680511475, + "learning_rate": 4.118915247917421e-07, + "loss": 0.884, + "step": 11335 + }, + { + "epoch": 0.8738822078322541, + "grad_norm": 3.6736958026885986, + "learning_rate": 4.113954663005865e-07, + "loss": 0.9269, + "step": 11336 + }, + { + "epoch": 0.873959296947271, + "grad_norm": 3.9054605960845947, + "learning_rate": 4.108996938851578e-07, + "loss": 0.9955, + "step": 11337 + }, + { + "epoch": 0.874036386062288, + "grad_norm": 3.832731008529663, + "learning_rate": 4.104042075763659e-07, + "loss": 0.96, + "step": 11338 + }, + { + "epoch": 0.874113475177305, + "grad_norm": 3.614189386367798, + "learning_rate": 4.0990900740510155e-07, + "loss": 0.8904, + "step": 11339 + }, + { + "epoch": 0.8741905642923219, + "grad_norm": 3.8343393802642822, + "learning_rate": 4.094140934022378e-07, + "loss": 0.8866, + "step": 11340 + }, + { + "epoch": 0.8742676534073389, + "grad_norm": 4.005188465118408, + "learning_rate": 4.089194655986306e-07, + "loss": 1.0047, + "step": 11341 + }, + { + "epoch": 0.8743447425223558, + "grad_norm": 3.9065210819244385, + "learning_rate": 4.084251240251164e-07, + "loss": 0.992, + "step": 11342 + }, + { + "epoch": 0.8744218316373727, + "grad_norm": 3.6721572875976562, + "learning_rate": 4.079310687125154e-07, + "loss": 0.8878, + "step": 11343 + }, + { + "epoch": 0.8744989207523898, + "grad_norm": 3.8441555500030518, + "learning_rate": 4.0743729969162924e-07, + "loss": 0.9359, + "step": 11344 + }, + { + "epoch": 0.8745760098674067, + "grad_norm": 3.992184638977051, + "learning_rate": 4.0694381699324157e-07, + "loss": 0.9161, + "step": 11345 + }, + { + "epoch": 0.8746530989824237, + "grad_norm": 3.973717451095581, + "learning_rate": 4.0645062064811945e-07, + "loss": 0.9411, + "step": 11346 + }, + { + "epoch": 0.8747301880974406, + "grad_norm": 3.883294105529785, + "learning_rate": 4.059577106870111e-07, + "loss": 0.9405, + "step": 11347 + }, + { + "epoch": 0.8748072772124575, + "grad_norm": 3.9654433727264404, + "learning_rate": 4.054650871406451e-07, + "loss": 0.9231, + "step": 11348 + }, + { + "epoch": 0.8748843663274746, + "grad_norm": 3.9687893390655518, + "learning_rate": 4.0497275003973604e-07, + "loss": 0.9068, + "step": 11349 + }, + { + "epoch": 0.8749614554424915, + "grad_norm": 3.720082998275757, + "learning_rate": 4.044806994149769e-07, + "loss": 0.9742, + "step": 11350 + }, + { + "epoch": 0.8750385445575085, + "grad_norm": 3.890352964401245, + "learning_rate": 4.039889352970461e-07, + "loss": 0.9705, + "step": 11351 + }, + { + "epoch": 0.8751156336725254, + "grad_norm": 4.000323295593262, + "learning_rate": 4.0349745771660233e-07, + "loss": 0.9054, + "step": 11352 + }, + { + "epoch": 0.8751927227875425, + "grad_norm": 4.110193729400635, + "learning_rate": 4.03006266704285e-07, + "loss": 0.8724, + "step": 11353 + }, + { + "epoch": 0.8752698119025594, + "grad_norm": 3.643488645553589, + "learning_rate": 4.0251536229071906e-07, + "loss": 0.8721, + "step": 11354 + }, + { + "epoch": 0.8753469010175763, + "grad_norm": 3.35498309135437, + "learning_rate": 4.0202474450650786e-07, + "loss": 0.8364, + "step": 11355 + }, + { + "epoch": 0.8754239901325933, + "grad_norm": 3.5333361625671387, + "learning_rate": 4.0153441338224187e-07, + "loss": 0.8993, + "step": 11356 + }, + { + "epoch": 0.8755010792476102, + "grad_norm": 3.7463607788085938, + "learning_rate": 4.010443689484894e-07, + "loss": 0.8257, + "step": 11357 + }, + { + "epoch": 0.8755781683626273, + "grad_norm": 3.9253594875335693, + "learning_rate": 4.0055461123580166e-07, + "loss": 0.921, + "step": 11358 + }, + { + "epoch": 0.8756552574776442, + "grad_norm": 4.455300331115723, + "learning_rate": 4.0006514027471243e-07, + "loss": 0.9877, + "step": 11359 + }, + { + "epoch": 0.8757323465926611, + "grad_norm": 3.6334636211395264, + "learning_rate": 3.9957595609573794e-07, + "loss": 0.824, + "step": 11360 + }, + { + "epoch": 0.8758094357076781, + "grad_norm": 4.01237154006958, + "learning_rate": 3.99087058729376e-07, + "loss": 0.8933, + "step": 11361 + }, + { + "epoch": 0.875886524822695, + "grad_norm": 3.8893368244171143, + "learning_rate": 3.985984482061089e-07, + "loss": 0.9028, + "step": 11362 + }, + { + "epoch": 0.875963613937712, + "grad_norm": 3.884305238723755, + "learning_rate": 3.981101245563962e-07, + "loss": 0.9565, + "step": 11363 + }, + { + "epoch": 0.876040703052729, + "grad_norm": 4.136764049530029, + "learning_rate": 3.976220878106829e-07, + "loss": 0.958, + "step": 11364 + }, + { + "epoch": 0.8761177921677459, + "grad_norm": 4.275717258453369, + "learning_rate": 3.971343379993964e-07, + "loss": 0.9941, + "step": 11365 + }, + { + "epoch": 0.8761948812827629, + "grad_norm": 3.4475440979003906, + "learning_rate": 3.9664687515294566e-07, + "loss": 0.8929, + "step": 11366 + }, + { + "epoch": 0.8762719703977798, + "grad_norm": 3.3091351985931396, + "learning_rate": 3.9615969930172027e-07, + "loss": 0.7643, + "step": 11367 + }, + { + "epoch": 0.8763490595127968, + "grad_norm": 3.8550055027008057, + "learning_rate": 3.9567281047609427e-07, + "loss": 0.8662, + "step": 11368 + }, + { + "epoch": 0.8764261486278138, + "grad_norm": 3.535909414291382, + "learning_rate": 3.9518620870642176e-07, + "loss": 0.9508, + "step": 11369 + }, + { + "epoch": 0.8765032377428307, + "grad_norm": 4.034812927246094, + "learning_rate": 3.946998940230401e-07, + "loss": 0.8804, + "step": 11370 + }, + { + "epoch": 0.8765803268578477, + "grad_norm": 3.9649436473846436, + "learning_rate": 3.9421386645626894e-07, + "loss": 1.0498, + "step": 11371 + }, + { + "epoch": 0.8766574159728646, + "grad_norm": 3.772454261779785, + "learning_rate": 3.9372812603640844e-07, + "loss": 0.9142, + "step": 11372 + }, + { + "epoch": 0.8767345050878816, + "grad_norm": 3.668459415435791, + "learning_rate": 3.932426727937444e-07, + "loss": 0.8981, + "step": 11373 + }, + { + "epoch": 0.8768115942028986, + "grad_norm": 4.301527500152588, + "learning_rate": 3.927575067585393e-07, + "loss": 0.7799, + "step": 11374 + }, + { + "epoch": 0.8768886833179155, + "grad_norm": 3.583970546722412, + "learning_rate": 3.9227262796104226e-07, + "loss": 0.9783, + "step": 11375 + }, + { + "epoch": 0.8769657724329325, + "grad_norm": 4.123873710632324, + "learning_rate": 3.91788036431483e-07, + "loss": 0.9984, + "step": 11376 + }, + { + "epoch": 0.8770428615479494, + "grad_norm": 3.809326648712158, + "learning_rate": 3.9130373220007347e-07, + "loss": 0.9792, + "step": 11377 + }, + { + "epoch": 0.8771199506629664, + "grad_norm": 3.716824769973755, + "learning_rate": 3.9081971529700726e-07, + "loss": 0.8683, + "step": 11378 + }, + { + "epoch": 0.8771970397779834, + "grad_norm": 3.9373066425323486, + "learning_rate": 3.903359857524597e-07, + "loss": 0.8693, + "step": 11379 + }, + { + "epoch": 0.8772741288930003, + "grad_norm": 3.775418996810913, + "learning_rate": 3.898525435965894e-07, + "loss": 0.8815, + "step": 11380 + }, + { + "epoch": 0.8773512180080173, + "grad_norm": 3.630040168762207, + "learning_rate": 3.893693888595368e-07, + "loss": 0.9739, + "step": 11381 + }, + { + "epoch": 0.8774283071230342, + "grad_norm": 3.7354750633239746, + "learning_rate": 3.8888652157142327e-07, + "loss": 0.8161, + "step": 11382 + }, + { + "epoch": 0.8775053962380512, + "grad_norm": 3.6053590774536133, + "learning_rate": 3.8840394176235365e-07, + "loss": 0.9304, + "step": 11383 + }, + { + "epoch": 0.8775824853530682, + "grad_norm": 3.883542537689209, + "learning_rate": 3.87921649462415e-07, + "loss": 0.8643, + "step": 11384 + }, + { + "epoch": 0.8776595744680851, + "grad_norm": 3.824704647064209, + "learning_rate": 3.8743964470167427e-07, + "loss": 0.8747, + "step": 11385 + }, + { + "epoch": 0.8777366635831021, + "grad_norm": 3.9502909183502197, + "learning_rate": 3.8695792751018257e-07, + "loss": 0.9046, + "step": 11386 + }, + { + "epoch": 0.877813752698119, + "grad_norm": 3.645355224609375, + "learning_rate": 3.864764979179725e-07, + "loss": 0.89, + "step": 11387 + }, + { + "epoch": 0.877890841813136, + "grad_norm": 3.920626163482666, + "learning_rate": 3.859953559550589e-07, + "loss": 1.0779, + "step": 11388 + }, + { + "epoch": 0.877967930928153, + "grad_norm": 3.494645595550537, + "learning_rate": 3.855145016514389e-07, + "loss": 0.9319, + "step": 11389 + }, + { + "epoch": 0.8780450200431699, + "grad_norm": 3.413755178451538, + "learning_rate": 3.8503393503708965e-07, + "loss": 0.9129, + "step": 11390 + }, + { + "epoch": 0.8781221091581869, + "grad_norm": 3.9198455810546875, + "learning_rate": 3.8455365614197327e-07, + "loss": 1.0013, + "step": 11391 + }, + { + "epoch": 0.8781991982732038, + "grad_norm": 4.102309703826904, + "learning_rate": 3.840736649960325e-07, + "loss": 0.9704, + "step": 11392 + }, + { + "epoch": 0.8782762873882208, + "grad_norm": 3.7239162921905518, + "learning_rate": 3.8359396162919225e-07, + "loss": 0.8796, + "step": 11393 + }, + { + "epoch": 0.8783533765032377, + "grad_norm": 3.941939115524292, + "learning_rate": 3.831145460713592e-07, + "loss": 0.9241, + "step": 11394 + }, + { + "epoch": 0.8784304656182547, + "grad_norm": 3.810060977935791, + "learning_rate": 3.826354183524239e-07, + "loss": 0.8778, + "step": 11395 + }, + { + "epoch": 0.8785075547332717, + "grad_norm": 3.513016700744629, + "learning_rate": 3.821565785022552e-07, + "loss": 0.9103, + "step": 11396 + }, + { + "epoch": 0.8785846438482886, + "grad_norm": 3.6620430946350098, + "learning_rate": 3.816780265507075e-07, + "loss": 0.9333, + "step": 11397 + }, + { + "epoch": 0.8786617329633056, + "grad_norm": 3.8001415729522705, + "learning_rate": 3.811997625276165e-07, + "loss": 0.8179, + "step": 11398 + }, + { + "epoch": 0.8787388220783225, + "grad_norm": 3.8821539878845215, + "learning_rate": 3.807217864627982e-07, + "loss": 0.9516, + "step": 11399 + }, + { + "epoch": 0.8788159111933395, + "grad_norm": 3.6851859092712402, + "learning_rate": 3.802440983860528e-07, + "loss": 0.9707, + "step": 11400 + }, + { + "epoch": 0.8788930003083565, + "grad_norm": 4.240994453430176, + "learning_rate": 3.7976669832716193e-07, + "loss": 0.9472, + "step": 11401 + }, + { + "epoch": 0.8789700894233734, + "grad_norm": 3.83020281791687, + "learning_rate": 3.7928958631588795e-07, + "loss": 0.9476, + "step": 11402 + }, + { + "epoch": 0.8790471785383904, + "grad_norm": 3.228642702102661, + "learning_rate": 3.788127623819776e-07, + "loss": 0.7694, + "step": 11403 + }, + { + "epoch": 0.8791242676534073, + "grad_norm": 3.589622735977173, + "learning_rate": 3.783362265551577e-07, + "loss": 0.9188, + "step": 11404 + }, + { + "epoch": 0.8792013567684243, + "grad_norm": 3.4474847316741943, + "learning_rate": 3.7785997886513827e-07, + "loss": 0.8424, + "step": 11405 + }, + { + "epoch": 0.8792784458834413, + "grad_norm": 3.7509865760803223, + "learning_rate": 3.7738401934161006e-07, + "loss": 0.883, + "step": 11406 + }, + { + "epoch": 0.8793555349984582, + "grad_norm": 3.862302541732788, + "learning_rate": 3.7690834801424714e-07, + "loss": 0.8483, + "step": 11407 + }, + { + "epoch": 0.8794326241134752, + "grad_norm": 3.5118508338928223, + "learning_rate": 3.764329649127046e-07, + "loss": 0.8729, + "step": 11408 + }, + { + "epoch": 0.8795097132284921, + "grad_norm": 3.7637479305267334, + "learning_rate": 3.759578700666211e-07, + "loss": 0.9572, + "step": 11409 + }, + { + "epoch": 0.879586802343509, + "grad_norm": 3.7532765865325928, + "learning_rate": 3.754830635056167e-07, + "loss": 0.9194, + "step": 11410 + }, + { + "epoch": 0.8796638914585261, + "grad_norm": 3.501901865005493, + "learning_rate": 3.750085452592911e-07, + "loss": 0.7931, + "step": 11411 + }, + { + "epoch": 0.879740980573543, + "grad_norm": 3.859379529953003, + "learning_rate": 3.745343153572295e-07, + "loss": 0.8634, + "step": 11412 + }, + { + "epoch": 0.87981806968856, + "grad_norm": 3.430656671524048, + "learning_rate": 3.7406037382899716e-07, + "loss": 0.9078, + "step": 11413 + }, + { + "epoch": 0.8798951588035769, + "grad_norm": 3.642101526260376, + "learning_rate": 3.735867207041427e-07, + "loss": 0.9528, + "step": 11414 + }, + { + "epoch": 0.8799722479185939, + "grad_norm": 3.803781270980835, + "learning_rate": 3.7311335601219524e-07, + "loss": 0.8334, + "step": 11415 + }, + { + "epoch": 0.8800493370336109, + "grad_norm": 3.886075496673584, + "learning_rate": 3.7264027978266727e-07, + "loss": 0.945, + "step": 11416 + }, + { + "epoch": 0.8801264261486278, + "grad_norm": 3.6445107460021973, + "learning_rate": 3.7216749204505187e-07, + "loss": 0.8265, + "step": 11417 + }, + { + "epoch": 0.8802035152636448, + "grad_norm": 4.004867076873779, + "learning_rate": 3.7169499282882435e-07, + "loss": 0.8832, + "step": 11418 + }, + { + "epoch": 0.8802806043786617, + "grad_norm": 3.6422877311706543, + "learning_rate": 3.71222782163444e-07, + "loss": 0.982, + "step": 11419 + }, + { + "epoch": 0.8803576934936787, + "grad_norm": 3.556857109069824, + "learning_rate": 3.707508600783499e-07, + "loss": 0.8123, + "step": 11420 + }, + { + "epoch": 0.8804347826086957, + "grad_norm": 3.529580593109131, + "learning_rate": 3.7027922660296533e-07, + "loss": 0.8967, + "step": 11421 + }, + { + "epoch": 0.8805118717237126, + "grad_norm": 3.779604911804199, + "learning_rate": 3.6980788176669223e-07, + "loss": 0.8938, + "step": 11422 + }, + { + "epoch": 0.8805889608387296, + "grad_norm": 3.4977400302886963, + "learning_rate": 3.6933682559891717e-07, + "loss": 0.9388, + "step": 11423 + }, + { + "epoch": 0.8806660499537465, + "grad_norm": 3.975605010986328, + "learning_rate": 3.6886605812900766e-07, + "loss": 0.9501, + "step": 11424 + }, + { + "epoch": 0.8807431390687634, + "grad_norm": 4.2428202629089355, + "learning_rate": 3.683955793863148e-07, + "loss": 0.9384, + "step": 11425 + }, + { + "epoch": 0.8808202281837805, + "grad_norm": 3.509303092956543, + "learning_rate": 3.6792538940017054e-07, + "loss": 0.9074, + "step": 11426 + }, + { + "epoch": 0.8808973172987974, + "grad_norm": 3.5925185680389404, + "learning_rate": 3.6745548819988763e-07, + "loss": 0.7849, + "step": 11427 + }, + { + "epoch": 0.8809744064138144, + "grad_norm": 3.5892629623413086, + "learning_rate": 3.669858758147621e-07, + "loss": 0.8397, + "step": 11428 + }, + { + "epoch": 0.8810514955288313, + "grad_norm": 3.527111291885376, + "learning_rate": 3.6651655227407214e-07, + "loss": 0.7728, + "step": 11429 + }, + { + "epoch": 0.8811285846438482, + "grad_norm": 3.720457077026367, + "learning_rate": 3.660475176070777e-07, + "loss": 0.8121, + "step": 11430 + }, + { + "epoch": 0.8812056737588653, + "grad_norm": 3.873420000076294, + "learning_rate": 3.6557877184302093e-07, + "loss": 0.9066, + "step": 11431 + }, + { + "epoch": 0.8812827628738822, + "grad_norm": 3.740910530090332, + "learning_rate": 3.6511031501112625e-07, + "loss": 0.8271, + "step": 11432 + }, + { + "epoch": 0.8813598519888992, + "grad_norm": 3.7194406986236572, + "learning_rate": 3.6464214714059754e-07, + "loss": 0.9262, + "step": 11433 + }, + { + "epoch": 0.8814369411039161, + "grad_norm": 3.7724802494049072, + "learning_rate": 3.641742682606242e-07, + "loss": 0.7748, + "step": 11434 + }, + { + "epoch": 0.881514030218933, + "grad_norm": 4.035959243774414, + "learning_rate": 3.637066784003757e-07, + "loss": 0.9279, + "step": 11435 + }, + { + "epoch": 0.8815911193339501, + "grad_norm": 3.918487310409546, + "learning_rate": 3.632393775890036e-07, + "loss": 0.9514, + "step": 11436 + }, + { + "epoch": 0.881668208448967, + "grad_norm": 3.338373899459839, + "learning_rate": 3.627723658556426e-07, + "loss": 0.7877, + "step": 11437 + }, + { + "epoch": 0.881745297563984, + "grad_norm": 3.8025522232055664, + "learning_rate": 3.6230564322940754e-07, + "loss": 0.9449, + "step": 11438 + }, + { + "epoch": 0.8818223866790009, + "grad_norm": 3.8101162910461426, + "learning_rate": 3.618392097393958e-07, + "loss": 0.9916, + "step": 11439 + }, + { + "epoch": 0.8818994757940178, + "grad_norm": 3.876401662826538, + "learning_rate": 3.6137306541468797e-07, + "loss": 0.8952, + "step": 11440 + }, + { + "epoch": 0.8819765649090349, + "grad_norm": 3.733764171600342, + "learning_rate": 3.6090721028434527e-07, + "loss": 0.8225, + "step": 11441 + }, + { + "epoch": 0.8820536540240518, + "grad_norm": 3.6853973865509033, + "learning_rate": 3.604416443774117e-07, + "loss": 0.8797, + "step": 11442 + }, + { + "epoch": 0.8821307431390688, + "grad_norm": 3.8799946308135986, + "learning_rate": 3.599763677229134e-07, + "loss": 0.9028, + "step": 11443 + }, + { + "epoch": 0.8822078322540857, + "grad_norm": 3.8343660831451416, + "learning_rate": 3.595113803498557e-07, + "loss": 0.9354, + "step": 11444 + }, + { + "epoch": 0.8822849213691026, + "grad_norm": 3.665975332260132, + "learning_rate": 3.5904668228723074e-07, + "loss": 0.9204, + "step": 11445 + }, + { + "epoch": 0.8823620104841197, + "grad_norm": 3.6539506912231445, + "learning_rate": 3.5858227356400876e-07, + "loss": 0.8875, + "step": 11446 + }, + { + "epoch": 0.8824390995991366, + "grad_norm": 3.846554756164551, + "learning_rate": 3.581181542091444e-07, + "loss": 0.875, + "step": 11447 + }, + { + "epoch": 0.8825161887141536, + "grad_norm": 3.7280449867248535, + "learning_rate": 3.576543242515712e-07, + "loss": 0.9288, + "step": 11448 + }, + { + "epoch": 0.8825932778291705, + "grad_norm": 3.6032848358154297, + "learning_rate": 3.571907837202077e-07, + "loss": 0.8565, + "step": 11449 + }, + { + "epoch": 0.8826703669441874, + "grad_norm": 3.6691172122955322, + "learning_rate": 3.5672753264395345e-07, + "loss": 0.8965, + "step": 11450 + }, + { + "epoch": 0.8827474560592045, + "grad_norm": 3.7892770767211914, + "learning_rate": 3.5626457105168876e-07, + "loss": 0.8547, + "step": 11451 + }, + { + "epoch": 0.8828245451742214, + "grad_norm": 4.1033477783203125, + "learning_rate": 3.558018989722778e-07, + "loss": 0.9716, + "step": 11452 + }, + { + "epoch": 0.8829016342892384, + "grad_norm": 3.6659538745880127, + "learning_rate": 3.5533951643456686e-07, + "loss": 0.8052, + "step": 11453 + }, + { + "epoch": 0.8829787234042553, + "grad_norm": 4.2055864334106445, + "learning_rate": 3.548774234673802e-07, + "loss": 0.8538, + "step": 11454 + }, + { + "epoch": 0.8830558125192722, + "grad_norm": 3.6786320209503174, + "learning_rate": 3.5441562009952856e-07, + "loss": 0.9322, + "step": 11455 + }, + { + "epoch": 0.8831329016342893, + "grad_norm": 3.6481385231018066, + "learning_rate": 3.5395410635980343e-07, + "loss": 0.8989, + "step": 11456 + }, + { + "epoch": 0.8832099907493062, + "grad_norm": 3.7394192218780518, + "learning_rate": 3.5349288227697675e-07, + "loss": 0.9274, + "step": 11457 + }, + { + "epoch": 0.8832870798643232, + "grad_norm": 3.672476053237915, + "learning_rate": 3.5303194787980497e-07, + "loss": 0.9608, + "step": 11458 + }, + { + "epoch": 0.8833641689793401, + "grad_norm": 3.525527238845825, + "learning_rate": 3.5257130319702347e-07, + "loss": 0.8911, + "step": 11459 + }, + { + "epoch": 0.883441258094357, + "grad_norm": 3.9847187995910645, + "learning_rate": 3.5211094825735147e-07, + "loss": 0.895, + "step": 11460 + }, + { + "epoch": 0.883518347209374, + "grad_norm": 4.699793338775635, + "learning_rate": 3.516508830894894e-07, + "loss": 0.956, + "step": 11461 + }, + { + "epoch": 0.883595436324391, + "grad_norm": 3.901806354522705, + "learning_rate": 3.5119110772212084e-07, + "loss": 0.9902, + "step": 11462 + }, + { + "epoch": 0.883672525439408, + "grad_norm": 4.073935031890869, + "learning_rate": 3.5073162218390967e-07, + "loss": 0.8234, + "step": 11463 + }, + { + "epoch": 0.8837496145544249, + "grad_norm": 3.7054238319396973, + "learning_rate": 3.502724265035035e-07, + "loss": 0.8809, + "step": 11464 + }, + { + "epoch": 0.8838267036694418, + "grad_norm": 3.653473377227783, + "learning_rate": 3.498135207095288e-07, + "loss": 0.8325, + "step": 11465 + }, + { + "epoch": 0.8839037927844589, + "grad_norm": 3.7188761234283447, + "learning_rate": 3.4935490483059775e-07, + "loss": 0.8455, + "step": 11466 + }, + { + "epoch": 0.8839808818994758, + "grad_norm": 4.025871276855469, + "learning_rate": 3.4889657889530195e-07, + "loss": 0.9839, + "step": 11467 + }, + { + "epoch": 0.8840579710144928, + "grad_norm": 3.9531667232513428, + "learning_rate": 3.4843854293221515e-07, + "loss": 0.9269, + "step": 11468 + }, + { + "epoch": 0.8841350601295097, + "grad_norm": 3.4958362579345703, + "learning_rate": 3.479807969698956e-07, + "loss": 0.8985, + "step": 11469 + }, + { + "epoch": 0.8842121492445266, + "grad_norm": 3.8494677543640137, + "learning_rate": 3.475233410368789e-07, + "loss": 0.9038, + "step": 11470 + }, + { + "epoch": 0.8842892383595437, + "grad_norm": 3.6966187953948975, + "learning_rate": 3.470661751616866e-07, + "loss": 0.816, + "step": 11471 + }, + { + "epoch": 0.8843663274745606, + "grad_norm": 3.576230049133301, + "learning_rate": 3.466092993728193e-07, + "loss": 0.8531, + "step": 11472 + }, + { + "epoch": 0.8844434165895776, + "grad_norm": 3.6708030700683594, + "learning_rate": 3.461527136987619e-07, + "loss": 0.8799, + "step": 11473 + }, + { + "epoch": 0.8845205057045945, + "grad_norm": 4.025277614593506, + "learning_rate": 3.4569641816798115e-07, + "loss": 0.9407, + "step": 11474 + }, + { + "epoch": 0.8845975948196114, + "grad_norm": 3.651562452316284, + "learning_rate": 3.452404128089221e-07, + "loss": 0.8917, + "step": 11475 + }, + { + "epoch": 0.8846746839346284, + "grad_norm": 3.617121934890747, + "learning_rate": 3.447846976500163e-07, + "loss": 0.9336, + "step": 11476 + }, + { + "epoch": 0.8847517730496454, + "grad_norm": 3.4860620498657227, + "learning_rate": 3.44329272719674e-07, + "loss": 0.8401, + "step": 11477 + }, + { + "epoch": 0.8848288621646624, + "grad_norm": 4.102723121643066, + "learning_rate": 3.4387413804628955e-07, + "loss": 1.0335, + "step": 11478 + }, + { + "epoch": 0.8849059512796793, + "grad_norm": 3.53051495552063, + "learning_rate": 3.434192936582381e-07, + "loss": 0.8077, + "step": 11479 + }, + { + "epoch": 0.8849830403946962, + "grad_norm": 3.604583978652954, + "learning_rate": 3.42964739583877e-07, + "loss": 0.977, + "step": 11480 + }, + { + "epoch": 0.8850601295097132, + "grad_norm": 3.8658695220947266, + "learning_rate": 3.425104758515441e-07, + "loss": 0.887, + "step": 11481 + }, + { + "epoch": 0.8851372186247302, + "grad_norm": 3.5702638626098633, + "learning_rate": 3.420565024895617e-07, + "loss": 0.9748, + "step": 11482 + }, + { + "epoch": 0.8852143077397472, + "grad_norm": 3.790424108505249, + "learning_rate": 3.416028195262322e-07, + "loss": 0.9366, + "step": 11483 + }, + { + "epoch": 0.8852913968547641, + "grad_norm": 4.021109580993652, + "learning_rate": 3.411494269898402e-07, + "loss": 1.0176, + "step": 11484 + }, + { + "epoch": 0.885368485969781, + "grad_norm": 4.051353454589844, + "learning_rate": 3.406963249086537e-07, + "loss": 0.8541, + "step": 11485 + }, + { + "epoch": 0.885445575084798, + "grad_norm": 3.7868709564208984, + "learning_rate": 3.4024351331091945e-07, + "loss": 0.8638, + "step": 11486 + }, + { + "epoch": 0.885522664199815, + "grad_norm": 3.931445598602295, + "learning_rate": 3.3979099222486824e-07, + "loss": 0.9284, + "step": 11487 + }, + { + "epoch": 0.885599753314832, + "grad_norm": 3.800612449645996, + "learning_rate": 3.3933876167871196e-07, + "loss": 0.8934, + "step": 11488 + }, + { + "epoch": 0.8856768424298489, + "grad_norm": 3.67228627204895, + "learning_rate": 3.388868217006469e-07, + "loss": 0.8676, + "step": 11489 + }, + { + "epoch": 0.8857539315448658, + "grad_norm": 4.013760089874268, + "learning_rate": 3.3843517231884894e-07, + "loss": 0.8973, + "step": 11490 + }, + { + "epoch": 0.8858310206598828, + "grad_norm": 3.6695284843444824, + "learning_rate": 3.379838135614738e-07, + "loss": 0.781, + "step": 11491 + }, + { + "epoch": 0.8859081097748998, + "grad_norm": 3.716569185256958, + "learning_rate": 3.375327454566629e-07, + "loss": 0.9104, + "step": 11492 + }, + { + "epoch": 0.8859851988899168, + "grad_norm": 3.993039608001709, + "learning_rate": 3.370819680325377e-07, + "loss": 0.8933, + "step": 11493 + }, + { + "epoch": 0.8860622880049337, + "grad_norm": 3.30863356590271, + "learning_rate": 3.3663148131720223e-07, + "loss": 0.799, + "step": 11494 + }, + { + "epoch": 0.8861393771199506, + "grad_norm": 3.9631307125091553, + "learning_rate": 3.3618128533874196e-07, + "loss": 0.8422, + "step": 11495 + }, + { + "epoch": 0.8862164662349676, + "grad_norm": 3.697521209716797, + "learning_rate": 3.357313801252238e-07, + "loss": 0.8291, + "step": 11496 + }, + { + "epoch": 0.8862935553499846, + "grad_norm": 4.11827278137207, + "learning_rate": 3.3528176570469697e-07, + "loss": 0.9668, + "step": 11497 + }, + { + "epoch": 0.8863706444650016, + "grad_norm": 4.193023204803467, + "learning_rate": 3.348324421051929e-07, + "loss": 0.896, + "step": 11498 + }, + { + "epoch": 0.8864477335800185, + "grad_norm": 3.8270843029022217, + "learning_rate": 3.3438340935472436e-07, + "loss": 0.9042, + "step": 11499 + }, + { + "epoch": 0.8865248226950354, + "grad_norm": 3.8805384635925293, + "learning_rate": 3.3393466748128657e-07, + "loss": 0.9534, + "step": 11500 + }, + { + "epoch": 0.8866019118100524, + "grad_norm": 3.6519830226898193, + "learning_rate": 3.3348621651285663e-07, + "loss": 0.8594, + "step": 11501 + }, + { + "epoch": 0.8866790009250693, + "grad_norm": 3.8050684928894043, + "learning_rate": 3.330380564773922e-07, + "loss": 0.8864, + "step": 11502 + }, + { + "epoch": 0.8867560900400864, + "grad_norm": 4.044971942901611, + "learning_rate": 3.3259018740283366e-07, + "loss": 0.9039, + "step": 11503 + }, + { + "epoch": 0.8868331791551033, + "grad_norm": 3.61731219291687, + "learning_rate": 3.321426093171043e-07, + "loss": 0.8472, + "step": 11504 + }, + { + "epoch": 0.8869102682701202, + "grad_norm": 3.757559061050415, + "learning_rate": 3.316953222481073e-07, + "loss": 0.9608, + "step": 11505 + }, + { + "epoch": 0.8869873573851372, + "grad_norm": 4.073364734649658, + "learning_rate": 3.312483262237304e-07, + "loss": 0.97, + "step": 11506 + }, + { + "epoch": 0.8870644465001541, + "grad_norm": 3.8168318271636963, + "learning_rate": 3.30801621271839e-07, + "loss": 1.0477, + "step": 11507 + }, + { + "epoch": 0.8871415356151712, + "grad_norm": 3.800417184829712, + "learning_rate": 3.303552074202848e-07, + "loss": 0.9133, + "step": 11508 + }, + { + "epoch": 0.8872186247301881, + "grad_norm": 3.7072558403015137, + "learning_rate": 3.299090846968983e-07, + "loss": 0.8998, + "step": 11509 + }, + { + "epoch": 0.887295713845205, + "grad_norm": 3.8859128952026367, + "learning_rate": 3.294632531294933e-07, + "loss": 0.8885, + "step": 11510 + }, + { + "epoch": 0.887372802960222, + "grad_norm": 3.608797311782837, + "learning_rate": 3.290177127458655e-07, + "loss": 0.9306, + "step": 11511 + }, + { + "epoch": 0.8874498920752389, + "grad_norm": 3.846325635910034, + "learning_rate": 3.285724635737919e-07, + "loss": 0.8752, + "step": 11512 + }, + { + "epoch": 0.887526981190256, + "grad_norm": 3.467984199523926, + "learning_rate": 3.281275056410305e-07, + "loss": 0.7637, + "step": 11513 + }, + { + "epoch": 0.8876040703052729, + "grad_norm": 3.4977262020111084, + "learning_rate": 3.276828389753234e-07, + "loss": 0.8379, + "step": 11514 + }, + { + "epoch": 0.8876811594202898, + "grad_norm": 3.824713945388794, + "learning_rate": 3.2723846360439236e-07, + "loss": 0.8603, + "step": 11515 + }, + { + "epoch": 0.8877582485353068, + "grad_norm": 3.6566162109375, + "learning_rate": 3.267943795559425e-07, + "loss": 0.9688, + "step": 11516 + }, + { + "epoch": 0.8878353376503237, + "grad_norm": 3.705967664718628, + "learning_rate": 3.263505868576605e-07, + "loss": 0.9366, + "step": 11517 + }, + { + "epoch": 0.8879124267653408, + "grad_norm": 3.7640185356140137, + "learning_rate": 3.259070855372132e-07, + "loss": 0.8421, + "step": 11518 + }, + { + "epoch": 0.8879895158803577, + "grad_norm": 3.6393392086029053, + "learning_rate": 3.2546387562225166e-07, + "loss": 0.9986, + "step": 11519 + }, + { + "epoch": 0.8880666049953746, + "grad_norm": 3.8339908123016357, + "learning_rate": 3.2502095714040673e-07, + "loss": 0.8996, + "step": 11520 + }, + { + "epoch": 0.8881436941103916, + "grad_norm": 3.783637285232544, + "learning_rate": 3.245783301192934e-07, + "loss": 0.9308, + "step": 11521 + }, + { + "epoch": 0.8882207832254085, + "grad_norm": 3.7414445877075195, + "learning_rate": 3.2413599458650744e-07, + "loss": 0.8207, + "step": 11522 + }, + { + "epoch": 0.8882978723404256, + "grad_norm": 3.6002695560455322, + "learning_rate": 3.2369395056962404e-07, + "loss": 0.8007, + "step": 11523 + }, + { + "epoch": 0.8883749614554425, + "grad_norm": 3.9991655349731445, + "learning_rate": 3.232521980962039e-07, + "loss": 0.8721, + "step": 11524 + }, + { + "epoch": 0.8884520505704595, + "grad_norm": 3.785975933074951, + "learning_rate": 3.2281073719378773e-07, + "loss": 0.9526, + "step": 11525 + }, + { + "epoch": 0.8885291396854764, + "grad_norm": 3.625488758087158, + "learning_rate": 3.22369567889898e-07, + "loss": 0.8719, + "step": 11526 + }, + { + "epoch": 0.8886062288004933, + "grad_norm": 3.615448236465454, + "learning_rate": 3.2192869021203997e-07, + "loss": 0.8264, + "step": 11527 + }, + { + "epoch": 0.8886833179155104, + "grad_norm": 3.5204501152038574, + "learning_rate": 3.214881041877005e-07, + "loss": 0.8942, + "step": 11528 + }, + { + "epoch": 0.8887604070305273, + "grad_norm": 3.519294261932373, + "learning_rate": 3.210478098443459e-07, + "loss": 0.7734, + "step": 11529 + }, + { + "epoch": 0.8888374961455443, + "grad_norm": 3.4945321083068848, + "learning_rate": 3.206078072094276e-07, + "loss": 0.8511, + "step": 11530 + }, + { + "epoch": 0.8889145852605612, + "grad_norm": 3.7900755405426025, + "learning_rate": 3.201680963103776e-07, + "loss": 0.8575, + "step": 11531 + }, + { + "epoch": 0.8889916743755781, + "grad_norm": 3.660367012023926, + "learning_rate": 3.197286771746094e-07, + "loss": 0.91, + "step": 11532 + }, + { + "epoch": 0.8890687634905952, + "grad_norm": 3.8760204315185547, + "learning_rate": 3.192895498295179e-07, + "loss": 0.8206, + "step": 11533 + }, + { + "epoch": 0.8891458526056121, + "grad_norm": 3.871697425842285, + "learning_rate": 3.188507143024816e-07, + "loss": 0.9093, + "step": 11534 + }, + { + "epoch": 0.8892229417206291, + "grad_norm": 3.5889370441436768, + "learning_rate": 3.184121706208587e-07, + "loss": 0.8424, + "step": 11535 + }, + { + "epoch": 0.889300030835646, + "grad_norm": 3.674396514892578, + "learning_rate": 3.1797391881199014e-07, + "loss": 0.9136, + "step": 11536 + }, + { + "epoch": 0.8893771199506629, + "grad_norm": 3.540079116821289, + "learning_rate": 3.17535958903199e-07, + "loss": 0.8915, + "step": 11537 + }, + { + "epoch": 0.88945420906568, + "grad_norm": 3.643944501876831, + "learning_rate": 3.170982909217907e-07, + "loss": 0.9742, + "step": 11538 + }, + { + "epoch": 0.8895312981806969, + "grad_norm": 3.8346221446990967, + "learning_rate": 3.1666091489505004e-07, + "loss": 0.9745, + "step": 11539 + }, + { + "epoch": 0.8896083872957139, + "grad_norm": 3.8320624828338623, + "learning_rate": 3.162238308502452e-07, + "loss": 0.8823, + "step": 11540 + }, + { + "epoch": 0.8896854764107308, + "grad_norm": 3.508303642272949, + "learning_rate": 3.157870388146267e-07, + "loss": 0.8418, + "step": 11541 + }, + { + "epoch": 0.8897625655257477, + "grad_norm": 3.801394462585449, + "learning_rate": 3.1535053881542657e-07, + "loss": 0.9079, + "step": 11542 + }, + { + "epoch": 0.8898396546407648, + "grad_norm": 3.286583662033081, + "learning_rate": 3.14914330879858e-07, + "loss": 0.7761, + "step": 11543 + }, + { + "epoch": 0.8899167437557817, + "grad_norm": 4.076812267303467, + "learning_rate": 3.14478415035116e-07, + "loss": 1.0037, + "step": 11544 + }, + { + "epoch": 0.8899938328707987, + "grad_norm": 3.5942256450653076, + "learning_rate": 3.140427913083777e-07, + "loss": 0.8007, + "step": 11545 + }, + { + "epoch": 0.8900709219858156, + "grad_norm": 3.80216121673584, + "learning_rate": 3.136074597268024e-07, + "loss": 0.9157, + "step": 11546 + }, + { + "epoch": 0.8901480111008325, + "grad_norm": 4.44173002243042, + "learning_rate": 3.1317242031753013e-07, + "loss": 1.0875, + "step": 11547 + }, + { + "epoch": 0.8902251002158496, + "grad_norm": 3.757503032684326, + "learning_rate": 3.127376731076837e-07, + "loss": 0.9564, + "step": 11548 + }, + { + "epoch": 0.8903021893308665, + "grad_norm": 3.6914825439453125, + "learning_rate": 3.1230321812436846e-07, + "loss": 0.8826, + "step": 11549 + }, + { + "epoch": 0.8903792784458835, + "grad_norm": 3.583136796951294, + "learning_rate": 3.118690553946685e-07, + "loss": 0.8588, + "step": 11550 + }, + { + "epoch": 0.8904563675609004, + "grad_norm": 3.7965340614318848, + "learning_rate": 3.114351849456526e-07, + "loss": 0.9515, + "step": 11551 + }, + { + "epoch": 0.8905334566759173, + "grad_norm": 3.5539650917053223, + "learning_rate": 3.110016068043703e-07, + "loss": 0.9266, + "step": 11552 + }, + { + "epoch": 0.8906105457909343, + "grad_norm": 3.8235104084014893, + "learning_rate": 3.105683209978527e-07, + "loss": 0.9676, + "step": 11553 + }, + { + "epoch": 0.8906876349059513, + "grad_norm": 3.7492964267730713, + "learning_rate": 3.101353275531138e-07, + "loss": 0.8673, + "step": 11554 + }, + { + "epoch": 0.8907647240209683, + "grad_norm": 3.6325714588165283, + "learning_rate": 3.09702626497147e-07, + "loss": 0.8821, + "step": 11555 + }, + { + "epoch": 0.8908418131359852, + "grad_norm": 3.75046443939209, + "learning_rate": 3.092702178569301e-07, + "loss": 0.9256, + "step": 11556 + }, + { + "epoch": 0.8909189022510021, + "grad_norm": 3.687927007675171, + "learning_rate": 3.088381016594211e-07, + "loss": 0.8995, + "step": 11557 + }, + { + "epoch": 0.8909959913660191, + "grad_norm": 3.565046787261963, + "learning_rate": 3.0840627793156e-07, + "loss": 0.8913, + "step": 11558 + }, + { + "epoch": 0.8910730804810361, + "grad_norm": 3.7868244647979736, + "learning_rate": 3.0797474670026917e-07, + "loss": 0.9228, + "step": 11559 + }, + { + "epoch": 0.8911501695960531, + "grad_norm": 3.6656594276428223, + "learning_rate": 3.0754350799245323e-07, + "loss": 0.9629, + "step": 11560 + }, + { + "epoch": 0.89122725871107, + "grad_norm": 3.7980310916900635, + "learning_rate": 3.0711256183499574e-07, + "loss": 0.9249, + "step": 11561 + }, + { + "epoch": 0.8913043478260869, + "grad_norm": 3.5572173595428467, + "learning_rate": 3.066819082547651e-07, + "loss": 0.7583, + "step": 11562 + }, + { + "epoch": 0.8913814369411039, + "grad_norm": 3.639159679412842, + "learning_rate": 3.062515472786098e-07, + "loss": 1.0125, + "step": 11563 + }, + { + "epoch": 0.8914585260561209, + "grad_norm": 3.9139461517333984, + "learning_rate": 3.0582147893336136e-07, + "loss": 0.994, + "step": 11564 + }, + { + "epoch": 0.8915356151711379, + "grad_norm": 3.53010630607605, + "learning_rate": 3.0539170324583257e-07, + "loss": 0.8378, + "step": 11565 + }, + { + "epoch": 0.8916127042861548, + "grad_norm": 3.5645971298217773, + "learning_rate": 3.049622202428165e-07, + "loss": 0.8547, + "step": 11566 + }, + { + "epoch": 0.8916897934011717, + "grad_norm": 3.8837499618530273, + "learning_rate": 3.045330299510896e-07, + "loss": 0.9795, + "step": 11567 + }, + { + "epoch": 0.8917668825161887, + "grad_norm": 4.236751079559326, + "learning_rate": 3.041041323974098e-07, + "loss": 0.9627, + "step": 11568 + }, + { + "epoch": 0.8918439716312057, + "grad_norm": 4.3294477462768555, + "learning_rate": 3.0367552760851684e-07, + "loss": 0.8706, + "step": 11569 + }, + { + "epoch": 0.8919210607462227, + "grad_norm": 3.358781337738037, + "learning_rate": 3.0324721561113213e-07, + "loss": 0.8451, + "step": 11570 + }, + { + "epoch": 0.8919981498612396, + "grad_norm": 3.78355073928833, + "learning_rate": 3.028191964319582e-07, + "loss": 0.771, + "step": 11571 + }, + { + "epoch": 0.8920752389762565, + "grad_norm": 3.7563531398773193, + "learning_rate": 3.023914700976799e-07, + "loss": 0.8986, + "step": 11572 + }, + { + "epoch": 0.8921523280912735, + "grad_norm": 3.557497024536133, + "learning_rate": 3.019640366349641e-07, + "loss": 0.9661, + "step": 11573 + }, + { + "epoch": 0.8922294172062905, + "grad_norm": 3.693223476409912, + "learning_rate": 3.015368960704584e-07, + "loss": 0.8886, + "step": 11574 + }, + { + "epoch": 0.8923065063213075, + "grad_norm": 3.728806972503662, + "learning_rate": 3.0111004843079327e-07, + "loss": 0.8569, + "step": 11575 + }, + { + "epoch": 0.8923835954363244, + "grad_norm": 3.8139472007751465, + "learning_rate": 3.0068349374258175e-07, + "loss": 0.8939, + "step": 11576 + }, + { + "epoch": 0.8924606845513413, + "grad_norm": 3.815910577774048, + "learning_rate": 3.002572320324143e-07, + "loss": 0.8666, + "step": 11577 + }, + { + "epoch": 0.8925377736663583, + "grad_norm": 3.8948774337768555, + "learning_rate": 2.99831263326868e-07, + "loss": 0.9402, + "step": 11578 + }, + { + "epoch": 0.8926148627813753, + "grad_norm": 4.020652770996094, + "learning_rate": 2.994055876525004e-07, + "loss": 1.011, + "step": 11579 + }, + { + "epoch": 0.8926919518963923, + "grad_norm": 3.6577086448669434, + "learning_rate": 2.9898020503584977e-07, + "loss": 0.9003, + "step": 11580 + }, + { + "epoch": 0.8927690410114092, + "grad_norm": 3.762552499771118, + "learning_rate": 2.985551155034355e-07, + "loss": 0.9369, + "step": 11581 + }, + { + "epoch": 0.8928461301264261, + "grad_norm": 3.9927804470062256, + "learning_rate": 2.9813031908176025e-07, + "loss": 0.8735, + "step": 11582 + }, + { + "epoch": 0.8929232192414431, + "grad_norm": 3.5767393112182617, + "learning_rate": 2.977058157973084e-07, + "loss": 0.9675, + "step": 11583 + }, + { + "epoch": 0.89300030835646, + "grad_norm": 3.9472174644470215, + "learning_rate": 2.9728160567654483e-07, + "loss": 0.9394, + "step": 11584 + }, + { + "epoch": 0.8930773974714771, + "grad_norm": 3.4129271507263184, + "learning_rate": 2.968576887459168e-07, + "loss": 0.7976, + "step": 11585 + }, + { + "epoch": 0.893154486586494, + "grad_norm": 3.768298864364624, + "learning_rate": 2.964340650318548e-07, + "loss": 0.9286, + "step": 11586 + }, + { + "epoch": 0.8932315757015109, + "grad_norm": 3.6881837844848633, + "learning_rate": 2.9601073456076766e-07, + "loss": 0.8934, + "step": 11587 + }, + { + "epoch": 0.8933086648165279, + "grad_norm": 3.6824235916137695, + "learning_rate": 2.955876973590488e-07, + "loss": 0.9171, + "step": 11588 + }, + { + "epoch": 0.8933857539315448, + "grad_norm": 3.3715460300445557, + "learning_rate": 2.9516495345307207e-07, + "loss": 0.8354, + "step": 11589 + }, + { + "epoch": 0.8934628430465619, + "grad_norm": 3.638155698776245, + "learning_rate": 2.947425028691936e-07, + "loss": 0.8242, + "step": 11590 + }, + { + "epoch": 0.8935399321615788, + "grad_norm": 3.8177831172943115, + "learning_rate": 2.943203456337512e-07, + "loss": 1.0496, + "step": 11591 + }, + { + "epoch": 0.8936170212765957, + "grad_norm": 3.5661537647247314, + "learning_rate": 2.938984817730639e-07, + "loss": 0.8795, + "step": 11592 + }, + { + "epoch": 0.8936941103916127, + "grad_norm": 3.6265878677368164, + "learning_rate": 2.9347691131343223e-07, + "loss": 0.95, + "step": 11593 + }, + { + "epoch": 0.8937711995066296, + "grad_norm": 3.493656635284424, + "learning_rate": 2.930556342811397e-07, + "loss": 0.8027, + "step": 11594 + }, + { + "epoch": 0.8938482886216467, + "grad_norm": 3.3560476303100586, + "learning_rate": 2.926346507024502e-07, + "loss": 0.8137, + "step": 11595 + }, + { + "epoch": 0.8939253777366636, + "grad_norm": 3.855767011642456, + "learning_rate": 2.922139606036106e-07, + "loss": 0.9452, + "step": 11596 + }, + { + "epoch": 0.8940024668516805, + "grad_norm": 4.110727787017822, + "learning_rate": 2.917935640108488e-07, + "loss": 1.0172, + "step": 11597 + }, + { + "epoch": 0.8940795559666975, + "grad_norm": 3.6166462898254395, + "learning_rate": 2.9137346095037324e-07, + "loss": 0.9122, + "step": 11598 + }, + { + "epoch": 0.8941566450817144, + "grad_norm": 3.6462063789367676, + "learning_rate": 2.909536514483752e-07, + "loss": 0.831, + "step": 11599 + }, + { + "epoch": 0.8942337341967315, + "grad_norm": 3.6715078353881836, + "learning_rate": 2.9053413553102874e-07, + "loss": 0.8714, + "step": 11600 + }, + { + "epoch": 0.8943108233117484, + "grad_norm": 3.616772174835205, + "learning_rate": 2.901149132244879e-07, + "loss": 1.0388, + "step": 11601 + }, + { + "epoch": 0.8943879124267653, + "grad_norm": 4.153509616851807, + "learning_rate": 2.896959845548902e-07, + "loss": 0.8361, + "step": 11602 + }, + { + "epoch": 0.8944650015417823, + "grad_norm": 3.7695071697235107, + "learning_rate": 2.892773495483514e-07, + "loss": 0.9392, + "step": 11603 + }, + { + "epoch": 0.8945420906567992, + "grad_norm": 3.27496600151062, + "learning_rate": 2.8885900823097223e-07, + "loss": 0.8339, + "step": 11604 + }, + { + "epoch": 0.8946191797718163, + "grad_norm": 4.047286510467529, + "learning_rate": 2.8844096062883466e-07, + "loss": 0.9914, + "step": 11605 + }, + { + "epoch": 0.8946962688868332, + "grad_norm": 3.8037588596343994, + "learning_rate": 2.880232067680011e-07, + "loss": 0.8849, + "step": 11606 + }, + { + "epoch": 0.8947733580018501, + "grad_norm": 3.4862048625946045, + "learning_rate": 2.876057466745169e-07, + "loss": 0.8648, + "step": 11607 + }, + { + "epoch": 0.8948504471168671, + "grad_norm": 3.446492910385132, + "learning_rate": 2.87188580374409e-07, + "loss": 0.8775, + "step": 11608 + }, + { + "epoch": 0.894927536231884, + "grad_norm": 3.7354066371917725, + "learning_rate": 2.867717078936838e-07, + "loss": 0.9698, + "step": 11609 + }, + { + "epoch": 0.8950046253469011, + "grad_norm": 4.134057998657227, + "learning_rate": 2.863551292583322e-07, + "loss": 0.9169, + "step": 11610 + }, + { + "epoch": 0.895081714461918, + "grad_norm": 3.7595643997192383, + "learning_rate": 2.8593884449432617e-07, + "loss": 0.9883, + "step": 11611 + }, + { + "epoch": 0.8951588035769349, + "grad_norm": 3.875776529312134, + "learning_rate": 2.8552285362761833e-07, + "loss": 0.985, + "step": 11612 + }, + { + "epoch": 0.8952358926919519, + "grad_norm": 3.6377480030059814, + "learning_rate": 2.85107156684144e-07, + "loss": 0.8584, + "step": 11613 + }, + { + "epoch": 0.8953129818069688, + "grad_norm": 3.419172763824463, + "learning_rate": 2.846917536898192e-07, + "loss": 0.8829, + "step": 11614 + }, + { + "epoch": 0.8953900709219859, + "grad_norm": 3.9442739486694336, + "learning_rate": 2.8427664467054194e-07, + "loss": 0.9979, + "step": 11615 + }, + { + "epoch": 0.8954671600370028, + "grad_norm": 3.6538686752319336, + "learning_rate": 2.838618296521922e-07, + "loss": 0.9496, + "step": 11616 + }, + { + "epoch": 0.8955442491520197, + "grad_norm": 3.4656448364257812, + "learning_rate": 2.8344730866063264e-07, + "loss": 0.7275, + "step": 11617 + }, + { + "epoch": 0.8956213382670367, + "grad_norm": 3.758117437362671, + "learning_rate": 2.8303308172170587e-07, + "loss": 0.8674, + "step": 11618 + }, + { + "epoch": 0.8956984273820536, + "grad_norm": 3.560523271560669, + "learning_rate": 2.826191488612362e-07, + "loss": 0.8544, + "step": 11619 + }, + { + "epoch": 0.8957755164970707, + "grad_norm": 3.549065351486206, + "learning_rate": 2.822055101050308e-07, + "loss": 0.8741, + "step": 11620 + }, + { + "epoch": 0.8958526056120876, + "grad_norm": 3.7114226818084717, + "learning_rate": 2.817921654788769e-07, + "loss": 0.9257, + "step": 11621 + }, + { + "epoch": 0.8959296947271045, + "grad_norm": 4.023944854736328, + "learning_rate": 2.813791150085454e-07, + "loss": 1.1294, + "step": 11622 + }, + { + "epoch": 0.8960067838421215, + "grad_norm": 3.9397988319396973, + "learning_rate": 2.8096635871978915e-07, + "loss": 0.906, + "step": 11623 + }, + { + "epoch": 0.8960838729571384, + "grad_norm": 3.5151617527008057, + "learning_rate": 2.8055389663833923e-07, + "loss": 0.9842, + "step": 11624 + }, + { + "epoch": 0.8961609620721555, + "grad_norm": 3.4286890029907227, + "learning_rate": 2.801417287899111e-07, + "loss": 0.9019, + "step": 11625 + }, + { + "epoch": 0.8962380511871724, + "grad_norm": 3.755769968032837, + "learning_rate": 2.7972985520020093e-07, + "loss": 0.9232, + "step": 11626 + }, + { + "epoch": 0.8963151403021893, + "grad_norm": 3.760594606399536, + "learning_rate": 2.793182758948881e-07, + "loss": 0.8456, + "step": 11627 + }, + { + "epoch": 0.8963922294172063, + "grad_norm": 3.722588300704956, + "learning_rate": 2.7890699089963225e-07, + "loss": 0.923, + "step": 11628 + }, + { + "epoch": 0.8964693185322232, + "grad_norm": 3.614448070526123, + "learning_rate": 2.784960002400733e-07, + "loss": 1.0738, + "step": 11629 + }, + { + "epoch": 0.8965464076472403, + "grad_norm": 3.7189695835113525, + "learning_rate": 2.7808530394183577e-07, + "loss": 0.9993, + "step": 11630 + }, + { + "epoch": 0.8966234967622572, + "grad_norm": 3.6947999000549316, + "learning_rate": 2.776749020305236e-07, + "loss": 0.8291, + "step": 11631 + }, + { + "epoch": 0.8967005858772741, + "grad_norm": 3.6183085441589355, + "learning_rate": 2.7726479453172415e-07, + "loss": 0.7955, + "step": 11632 + }, + { + "epoch": 0.8967776749922911, + "grad_norm": 3.457646608352661, + "learning_rate": 2.7685498147100533e-07, + "loss": 0.8498, + "step": 11633 + }, + { + "epoch": 0.896854764107308, + "grad_norm": 3.7649595737457275, + "learning_rate": 2.7644546287391716e-07, + "loss": 0.8975, + "step": 11634 + }, + { + "epoch": 0.896931853222325, + "grad_norm": 3.469334840774536, + "learning_rate": 2.760362387659893e-07, + "loss": 0.8225, + "step": 11635 + }, + { + "epoch": 0.897008942337342, + "grad_norm": 3.689354181289673, + "learning_rate": 2.756273091727363e-07, + "loss": 0.9019, + "step": 11636 + }, + { + "epoch": 0.8970860314523589, + "grad_norm": 3.99678635597229, + "learning_rate": 2.7521867411965273e-07, + "loss": 0.9491, + "step": 11637 + }, + { + "epoch": 0.8971631205673759, + "grad_norm": 3.96669602394104, + "learning_rate": 2.7481033363221385e-07, + "loss": 0.8561, + "step": 11638 + }, + { + "epoch": 0.8972402096823928, + "grad_norm": 3.8000166416168213, + "learning_rate": 2.744022877358793e-07, + "loss": 0.9234, + "step": 11639 + }, + { + "epoch": 0.8973172987974098, + "grad_norm": 3.587712287902832, + "learning_rate": 2.73994536456087e-07, + "loss": 0.905, + "step": 11640 + }, + { + "epoch": 0.8973943879124268, + "grad_norm": 3.3833603858947754, + "learning_rate": 2.735870798182588e-07, + "loss": 0.8504, + "step": 11641 + }, + { + "epoch": 0.8974714770274437, + "grad_norm": 3.685027837753296, + "learning_rate": 2.7317991784779727e-07, + "loss": 0.9113, + "step": 11642 + }, + { + "epoch": 0.8975485661424607, + "grad_norm": 3.7003281116485596, + "learning_rate": 2.727730505700871e-07, + "loss": 0.8788, + "step": 11643 + }, + { + "epoch": 0.8976256552574776, + "grad_norm": 3.5367331504821777, + "learning_rate": 2.7236647801049456e-07, + "loss": 0.879, + "step": 11644 + }, + { + "epoch": 0.8977027443724946, + "grad_norm": 4.164783000946045, + "learning_rate": 2.719602001943672e-07, + "loss": 1.0591, + "step": 11645 + }, + { + "epoch": 0.8977798334875116, + "grad_norm": 3.9324138164520264, + "learning_rate": 2.7155421714703424e-07, + "loss": 0.9406, + "step": 11646 + }, + { + "epoch": 0.8978569226025285, + "grad_norm": 4.007849216461182, + "learning_rate": 2.7114852889380594e-07, + "loss": 0.9425, + "step": 11647 + }, + { + "epoch": 0.8979340117175455, + "grad_norm": 3.8484272956848145, + "learning_rate": 2.7074313545997546e-07, + "loss": 0.9126, + "step": 11648 + }, + { + "epoch": 0.8980111008325624, + "grad_norm": 4.0231804847717285, + "learning_rate": 2.703380368708175e-07, + "loss": 0.8856, + "step": 11649 + }, + { + "epoch": 0.8980881899475794, + "grad_norm": 3.677838087081909, + "learning_rate": 2.6993323315158804e-07, + "loss": 0.9547, + "step": 11650 + }, + { + "epoch": 0.8981652790625964, + "grad_norm": 3.767169237136841, + "learning_rate": 2.6952872432752295e-07, + "loss": 0.9004, + "step": 11651 + }, + { + "epoch": 0.8982423681776133, + "grad_norm": 3.888305187225342, + "learning_rate": 2.691245104238421e-07, + "loss": 0.9836, + "step": 11652 + }, + { + "epoch": 0.8983194572926303, + "grad_norm": 4.260769367218018, + "learning_rate": 2.687205914657465e-07, + "loss": 0.9681, + "step": 11653 + }, + { + "epoch": 0.8983965464076472, + "grad_norm": 3.9282066822052, + "learning_rate": 2.6831696747841804e-07, + "loss": 0.8736, + "step": 11654 + }, + { + "epoch": 0.8984736355226642, + "grad_norm": 4.230199337005615, + "learning_rate": 2.679136384870201e-07, + "loss": 1.0049, + "step": 11655 + }, + { + "epoch": 0.8985507246376812, + "grad_norm": 3.359044075012207, + "learning_rate": 2.6751060451670033e-07, + "loss": 0.7274, + "step": 11656 + }, + { + "epoch": 0.8986278137526981, + "grad_norm": 3.813596487045288, + "learning_rate": 2.671078655925829e-07, + "loss": 0.8514, + "step": 11657 + }, + { + "epoch": 0.8987049028677151, + "grad_norm": 3.9940125942230225, + "learning_rate": 2.6670542173977745e-07, + "loss": 0.9324, + "step": 11658 + }, + { + "epoch": 0.898781991982732, + "grad_norm": 3.4712913036346436, + "learning_rate": 2.6630327298337535e-07, + "loss": 0.8621, + "step": 11659 + }, + { + "epoch": 0.898859081097749, + "grad_norm": 3.4401824474334717, + "learning_rate": 2.6590141934844715e-07, + "loss": 0.7482, + "step": 11660 + }, + { + "epoch": 0.898936170212766, + "grad_norm": 4.060207366943359, + "learning_rate": 2.654998608600479e-07, + "loss": 1.0239, + "step": 11661 + }, + { + "epoch": 0.8990132593277829, + "grad_norm": 3.8618593215942383, + "learning_rate": 2.6509859754321076e-07, + "loss": 0.9923, + "step": 11662 + }, + { + "epoch": 0.8990903484427999, + "grad_norm": 3.749800682067871, + "learning_rate": 2.6469762942295363e-07, + "loss": 0.9303, + "step": 11663 + }, + { + "epoch": 0.8991674375578168, + "grad_norm": 3.6844890117645264, + "learning_rate": 2.642969565242748e-07, + "loss": 0.9534, + "step": 11664 + }, + { + "epoch": 0.8992445266728338, + "grad_norm": 3.837385892868042, + "learning_rate": 2.6389657887215314e-07, + "loss": 0.9022, + "step": 11665 + }, + { + "epoch": 0.8993216157878507, + "grad_norm": 3.7096149921417236, + "learning_rate": 2.6349649649155153e-07, + "loss": 0.9856, + "step": 11666 + }, + { + "epoch": 0.8993987049028677, + "grad_norm": 4.130624771118164, + "learning_rate": 2.6309670940741215e-07, + "loss": 0.9756, + "step": 11667 + }, + { + "epoch": 0.8994757940178847, + "grad_norm": 4.128146171569824, + "learning_rate": 2.6269721764466016e-07, + "loss": 0.9506, + "step": 11668 + }, + { + "epoch": 0.8995528831329016, + "grad_norm": 3.8928797245025635, + "learning_rate": 2.6229802122820114e-07, + "loss": 0.9731, + "step": 11669 + }, + { + "epoch": 0.8996299722479186, + "grad_norm": 4.215887069702148, + "learning_rate": 2.618991201829235e-07, + "loss": 0.937, + "step": 11670 + }, + { + "epoch": 0.8997070613629355, + "grad_norm": 3.51582407951355, + "learning_rate": 2.6150051453369684e-07, + "loss": 0.8767, + "step": 11671 + }, + { + "epoch": 0.8997841504779525, + "grad_norm": 3.87080979347229, + "learning_rate": 2.6110220430537124e-07, + "loss": 0.866, + "step": 11672 + }, + { + "epoch": 0.8998612395929695, + "grad_norm": 3.93489670753479, + "learning_rate": 2.607041895227802e-07, + "loss": 0.9668, + "step": 11673 + }, + { + "epoch": 0.8999383287079864, + "grad_norm": 3.6503822803497314, + "learning_rate": 2.603064702107372e-07, + "loss": 0.8732, + "step": 11674 + }, + { + "epoch": 0.9000154178230034, + "grad_norm": 3.783892869949341, + "learning_rate": 2.599090463940385e-07, + "loss": 0.9098, + "step": 11675 + }, + { + "epoch": 0.9000925069380203, + "grad_norm": 3.5961296558380127, + "learning_rate": 2.5951191809746146e-07, + "loss": 0.8894, + "step": 11676 + }, + { + "epoch": 0.9001695960530373, + "grad_norm": 3.5369625091552734, + "learning_rate": 2.591150853457641e-07, + "loss": 0.8137, + "step": 11677 + }, + { + "epoch": 0.9002466851680543, + "grad_norm": 3.896723508834839, + "learning_rate": 2.587185481636878e-07, + "loss": 0.8884, + "step": 11678 + }, + { + "epoch": 0.9003237742830712, + "grad_norm": 3.630908250808716, + "learning_rate": 2.583223065759538e-07, + "loss": 0.9381, + "step": 11679 + }, + { + "epoch": 0.9004008633980882, + "grad_norm": 3.8402626514434814, + "learning_rate": 2.5792636060726684e-07, + "loss": 0.7879, + "step": 11680 + }, + { + "epoch": 0.9004779525131051, + "grad_norm": 3.4314825534820557, + "learning_rate": 2.5753071028231104e-07, + "loss": 0.7755, + "step": 11681 + }, + { + "epoch": 0.900555041628122, + "grad_norm": 3.852926015853882, + "learning_rate": 2.571353556257544e-07, + "loss": 0.8611, + "step": 11682 + }, + { + "epoch": 0.9006321307431391, + "grad_norm": 3.5122294425964355, + "learning_rate": 2.56740296662244e-07, + "loss": 0.8702, + "step": 11683 + }, + { + "epoch": 0.900709219858156, + "grad_norm": 3.657466411590576, + "learning_rate": 2.5634553341640943e-07, + "loss": 0.8854, + "step": 11684 + }, + { + "epoch": 0.900786308973173, + "grad_norm": 3.6080384254455566, + "learning_rate": 2.5595106591286335e-07, + "loss": 0.8445, + "step": 11685 + }, + { + "epoch": 0.9008633980881899, + "grad_norm": 3.5531303882598877, + "learning_rate": 2.555568941761982e-07, + "loss": 0.8449, + "step": 11686 + }, + { + "epoch": 0.9009404872032069, + "grad_norm": 3.68255352973938, + "learning_rate": 2.5516301823098944e-07, + "loss": 0.9405, + "step": 11687 + }, + { + "epoch": 0.9010175763182239, + "grad_norm": 3.9351344108581543, + "learning_rate": 2.547694381017912e-07, + "loss": 0.8447, + "step": 11688 + }, + { + "epoch": 0.9010946654332408, + "grad_norm": 4.277076244354248, + "learning_rate": 2.5437615381314284e-07, + "loss": 0.8522, + "step": 11689 + }, + { + "epoch": 0.9011717545482578, + "grad_norm": 3.9808146953582764, + "learning_rate": 2.5398316538956246e-07, + "loss": 0.8612, + "step": 11690 + }, + { + "epoch": 0.9012488436632747, + "grad_norm": 3.763561725616455, + "learning_rate": 2.5359047285555214e-07, + "loss": 0.9626, + "step": 11691 + }, + { + "epoch": 0.9013259327782918, + "grad_norm": 3.6474595069885254, + "learning_rate": 2.5319807623559287e-07, + "loss": 0.9554, + "step": 11692 + }, + { + "epoch": 0.9014030218933087, + "grad_norm": 4.234410762786865, + "learning_rate": 2.5280597555415067e-07, + "loss": 0.8349, + "step": 11693 + }, + { + "epoch": 0.9014801110083256, + "grad_norm": 3.9878194332122803, + "learning_rate": 2.524141708356681e-07, + "loss": 0.7692, + "step": 11694 + }, + { + "epoch": 0.9015572001233426, + "grad_norm": 3.9393675327301025, + "learning_rate": 2.52022662104574e-07, + "loss": 0.9336, + "step": 11695 + }, + { + "epoch": 0.9016342892383595, + "grad_norm": 3.7222704887390137, + "learning_rate": 2.5163144938527675e-07, + "loss": 0.8467, + "step": 11696 + }, + { + "epoch": 0.9017113783533766, + "grad_norm": 3.774393081665039, + "learning_rate": 2.5124053270216553e-07, + "loss": 0.9367, + "step": 11697 + }, + { + "epoch": 0.9017884674683935, + "grad_norm": 4.058575630187988, + "learning_rate": 2.5084991207961374e-07, + "loss": 0.896, + "step": 11698 + }, + { + "epoch": 0.9018655565834104, + "grad_norm": 3.4426233768463135, + "learning_rate": 2.50459587541973e-07, + "loss": 0.8279, + "step": 11699 + }, + { + "epoch": 0.9019426456984274, + "grad_norm": 3.468212842941284, + "learning_rate": 2.5006955911357823e-07, + "loss": 0.8706, + "step": 11700 + }, + { + "epoch": 0.9020197348134443, + "grad_norm": 3.775519609451294, + "learning_rate": 2.496798268187456e-07, + "loss": 0.8586, + "step": 11701 + }, + { + "epoch": 0.9020968239284614, + "grad_norm": 3.4390523433685303, + "learning_rate": 2.492903906817734e-07, + "loss": 0.781, + "step": 11702 + }, + { + "epoch": 0.9021739130434783, + "grad_norm": 3.695444107055664, + "learning_rate": 2.4890125072694114e-07, + "loss": 0.9408, + "step": 11703 + }, + { + "epoch": 0.9022510021584952, + "grad_norm": 3.9369289875030518, + "learning_rate": 2.4851240697850997e-07, + "loss": 0.8868, + "step": 11704 + }, + { + "epoch": 0.9023280912735122, + "grad_norm": 3.8416519165039062, + "learning_rate": 2.481238594607205e-07, + "loss": 0.8003, + "step": 11705 + }, + { + "epoch": 0.9024051803885291, + "grad_norm": 4.129755020141602, + "learning_rate": 2.477356081977983e-07, + "loss": 0.9659, + "step": 11706 + }, + { + "epoch": 0.9024822695035462, + "grad_norm": 4.082637786865234, + "learning_rate": 2.4734765321394793e-07, + "loss": 0.9412, + "step": 11707 + }, + { + "epoch": 0.9025593586185631, + "grad_norm": 3.7622687816619873, + "learning_rate": 2.469599945333567e-07, + "loss": 0.9492, + "step": 11708 + }, + { + "epoch": 0.90263644773358, + "grad_norm": 4.142516136169434, + "learning_rate": 2.465726321801942e-07, + "loss": 1.0039, + "step": 11709 + }, + { + "epoch": 0.902713536848597, + "grad_norm": 3.67720103263855, + "learning_rate": 2.4618556617860777e-07, + "loss": 0.9048, + "step": 11710 + }, + { + "epoch": 0.9027906259636139, + "grad_norm": 3.787903308868408, + "learning_rate": 2.45798796552732e-07, + "loss": 0.8038, + "step": 11711 + }, + { + "epoch": 0.902867715078631, + "grad_norm": 4.044528007507324, + "learning_rate": 2.454123233266781e-07, + "loss": 0.9192, + "step": 11712 + }, + { + "epoch": 0.9029448041936479, + "grad_norm": 3.6001529693603516, + "learning_rate": 2.4502614652454184e-07, + "loss": 0.8692, + "step": 11713 + }, + { + "epoch": 0.9030218933086648, + "grad_norm": 3.8225014209747314, + "learning_rate": 2.4464026617039784e-07, + "loss": 0.871, + "step": 11714 + }, + { + "epoch": 0.9030989824236818, + "grad_norm": 3.8792974948883057, + "learning_rate": 2.4425468228830516e-07, + "loss": 0.808, + "step": 11715 + }, + { + "epoch": 0.9031760715386987, + "grad_norm": 3.640425682067871, + "learning_rate": 2.438693949023019e-07, + "loss": 0.7728, + "step": 11716 + }, + { + "epoch": 0.9032531606537157, + "grad_norm": 4.195462703704834, + "learning_rate": 2.434844040364098e-07, + "loss": 0.9055, + "step": 11717 + }, + { + "epoch": 0.9033302497687327, + "grad_norm": 3.6664235591888428, + "learning_rate": 2.4309970971462984e-07, + "loss": 0.8798, + "step": 11718 + }, + { + "epoch": 0.9034073388837496, + "grad_norm": 3.623297691345215, + "learning_rate": 2.427153119609477e-07, + "loss": 0.8227, + "step": 11719 + }, + { + "epoch": 0.9034844279987666, + "grad_norm": 3.899339199066162, + "learning_rate": 2.4233121079932585e-07, + "loss": 0.884, + "step": 11720 + }, + { + "epoch": 0.9035615171137835, + "grad_norm": 3.89997935295105, + "learning_rate": 2.4194740625371303e-07, + "loss": 0.8831, + "step": 11721 + }, + { + "epoch": 0.9036386062288005, + "grad_norm": 3.7729170322418213, + "learning_rate": 2.4156389834803616e-07, + "loss": 0.8071, + "step": 11722 + }, + { + "epoch": 0.9037156953438175, + "grad_norm": 3.915086507797241, + "learning_rate": 2.411806871062061e-07, + "loss": 1.0403, + "step": 11723 + }, + { + "epoch": 0.9037927844588344, + "grad_norm": 3.809762716293335, + "learning_rate": 2.4079777255211434e-07, + "loss": 0.864, + "step": 11724 + }, + { + "epoch": 0.9038698735738514, + "grad_norm": 4.3212714195251465, + "learning_rate": 2.4041515470963227e-07, + "loss": 1.0022, + "step": 11725 + }, + { + "epoch": 0.9039469626888683, + "grad_norm": 3.7529261112213135, + "learning_rate": 2.400328336026148e-07, + "loss": 0.9564, + "step": 11726 + }, + { + "epoch": 0.9040240518038853, + "grad_norm": 3.535996675491333, + "learning_rate": 2.396508092548977e-07, + "loss": 0.8769, + "step": 11727 + }, + { + "epoch": 0.9041011409189023, + "grad_norm": 3.646723747253418, + "learning_rate": 2.392690816902976e-07, + "loss": 0.8884, + "step": 11728 + }, + { + "epoch": 0.9041782300339192, + "grad_norm": 4.039508819580078, + "learning_rate": 2.3888765093261434e-07, + "loss": 0.8506, + "step": 11729 + }, + { + "epoch": 0.9042553191489362, + "grad_norm": 4.302793502807617, + "learning_rate": 2.385065170056283e-07, + "loss": 0.9042, + "step": 11730 + }, + { + "epoch": 0.9043324082639531, + "grad_norm": 3.591334104537964, + "learning_rate": 2.3812567993309943e-07, + "loss": 0.7687, + "step": 11731 + }, + { + "epoch": 0.9044094973789701, + "grad_norm": 3.6448166370391846, + "learning_rate": 2.3774513973877254e-07, + "loss": 0.8568, + "step": 11732 + }, + { + "epoch": 0.904486586493987, + "grad_norm": 3.65342116355896, + "learning_rate": 2.3736489644637152e-07, + "loss": 0.8879, + "step": 11733 + }, + { + "epoch": 0.904563675609004, + "grad_norm": 3.8169586658477783, + "learning_rate": 2.3698495007960286e-07, + "loss": 0.9232, + "step": 11734 + }, + { + "epoch": 0.904640764724021, + "grad_norm": 3.728663921356201, + "learning_rate": 2.3660530066215493e-07, + "loss": 0.8023, + "step": 11735 + }, + { + "epoch": 0.9047178538390379, + "grad_norm": 3.7461793422698975, + "learning_rate": 2.3622594821769595e-07, + "loss": 0.8047, + "step": 11736 + }, + { + "epoch": 0.9047949429540549, + "grad_norm": 3.480088472366333, + "learning_rate": 2.358468927698765e-07, + "loss": 0.9529, + "step": 11737 + }, + { + "epoch": 0.9048720320690719, + "grad_norm": 3.421665668487549, + "learning_rate": 2.354681343423293e-07, + "loss": 0.8599, + "step": 11738 + }, + { + "epoch": 0.9049491211840888, + "grad_norm": 3.9536914825439453, + "learning_rate": 2.350896729586677e-07, + "loss": 0.7775, + "step": 11739 + }, + { + "epoch": 0.9050262102991058, + "grad_norm": 3.6503090858459473, + "learning_rate": 2.347115086424867e-07, + "loss": 0.9699, + "step": 11740 + }, + { + "epoch": 0.9051032994141227, + "grad_norm": 3.8761227130889893, + "learning_rate": 2.3433364141736414e-07, + "loss": 0.9433, + "step": 11741 + }, + { + "epoch": 0.9051803885291397, + "grad_norm": 3.4743881225585938, + "learning_rate": 2.3395607130685616e-07, + "loss": 0.9262, + "step": 11742 + }, + { + "epoch": 0.9052574776441566, + "grad_norm": 3.9404661655426025, + "learning_rate": 2.3357879833450335e-07, + "loss": 0.8681, + "step": 11743 + }, + { + "epoch": 0.9053345667591736, + "grad_norm": 3.5697805881500244, + "learning_rate": 2.332018225238264e-07, + "loss": 0.7547, + "step": 11744 + }, + { + "epoch": 0.9054116558741906, + "grad_norm": 3.835613965988159, + "learning_rate": 2.3282514389832755e-07, + "loss": 0.9139, + "step": 11745 + }, + { + "epoch": 0.9054887449892075, + "grad_norm": 3.496504306793213, + "learning_rate": 2.3244876248149196e-07, + "loss": 0.9144, + "step": 11746 + }, + { + "epoch": 0.9055658341042245, + "grad_norm": 3.6928272247314453, + "learning_rate": 2.3207267829678416e-07, + "loss": 0.8003, + "step": 11747 + }, + { + "epoch": 0.9056429232192414, + "grad_norm": 3.711456537246704, + "learning_rate": 2.3169689136765038e-07, + "loss": 0.9611, + "step": 11748 + }, + { + "epoch": 0.9057200123342584, + "grad_norm": 3.7150790691375732, + "learning_rate": 2.3132140171752027e-07, + "loss": 0.9735, + "step": 11749 + }, + { + "epoch": 0.9057971014492754, + "grad_norm": 3.9875683784484863, + "learning_rate": 2.3094620936980283e-07, + "loss": 0.9415, + "step": 11750 + }, + { + "epoch": 0.9058741905642923, + "grad_norm": 3.7568795680999756, + "learning_rate": 2.3057131434788994e-07, + "loss": 0.9143, + "step": 11751 + }, + { + "epoch": 0.9059512796793093, + "grad_norm": 3.826282262802124, + "learning_rate": 2.3019671667515454e-07, + "loss": 0.9529, + "step": 11752 + }, + { + "epoch": 0.9060283687943262, + "grad_norm": 3.865046262741089, + "learning_rate": 2.2982241637494962e-07, + "loss": 1.0432, + "step": 11753 + }, + { + "epoch": 0.9061054579093432, + "grad_norm": 3.8025035858154297, + "learning_rate": 2.2944841347061153e-07, + "loss": 0.8933, + "step": 11754 + }, + { + "epoch": 0.9061825470243602, + "grad_norm": 3.6848461627960205, + "learning_rate": 2.2907470798545772e-07, + "loss": 0.9039, + "step": 11755 + }, + { + "epoch": 0.9062596361393771, + "grad_norm": 3.549743413925171, + "learning_rate": 2.2870129994278733e-07, + "loss": 0.9509, + "step": 11756 + }, + { + "epoch": 0.9063367252543941, + "grad_norm": 3.480574607849121, + "learning_rate": 2.2832818936587954e-07, + "loss": 0.7861, + "step": 11757 + }, + { + "epoch": 0.906413814369411, + "grad_norm": 3.917457342147827, + "learning_rate": 2.2795537627799512e-07, + "loss": 0.773, + "step": 11758 + }, + { + "epoch": 0.906490903484428, + "grad_norm": 3.638928174972534, + "learning_rate": 2.2758286070237889e-07, + "loss": 0.9302, + "step": 11759 + }, + { + "epoch": 0.906567992599445, + "grad_norm": 3.471503257751465, + "learning_rate": 2.2721064266225335e-07, + "loss": 0.8307, + "step": 11760 + }, + { + "epoch": 0.9066450817144619, + "grad_norm": 3.8338592052459717, + "learning_rate": 2.2683872218082659e-07, + "loss": 0.8046, + "step": 11761 + }, + { + "epoch": 0.9067221708294789, + "grad_norm": 3.566448450088501, + "learning_rate": 2.2646709928128397e-07, + "loss": 0.8662, + "step": 11762 + }, + { + "epoch": 0.9067992599444958, + "grad_norm": 3.907142162322998, + "learning_rate": 2.2609577398679472e-07, + "loss": 0.9882, + "step": 11763 + }, + { + "epoch": 0.9068763490595128, + "grad_norm": 3.876858949661255, + "learning_rate": 2.2572474632050977e-07, + "loss": 0.8642, + "step": 11764 + }, + { + "epoch": 0.9069534381745298, + "grad_norm": 3.5765106678009033, + "learning_rate": 2.253540163055601e-07, + "loss": 0.829, + "step": 11765 + }, + { + "epoch": 0.9070305272895467, + "grad_norm": 4.412302494049072, + "learning_rate": 2.249835839650588e-07, + "loss": 1.0133, + "step": 11766 + }, + { + "epoch": 0.9071076164045637, + "grad_norm": 3.776357412338257, + "learning_rate": 2.2461344932210084e-07, + "loss": 0.8659, + "step": 11767 + }, + { + "epoch": 0.9071847055195806, + "grad_norm": 4.0795392990112305, + "learning_rate": 2.2424361239976212e-07, + "loss": 0.9052, + "step": 11768 + }, + { + "epoch": 0.9072617946345976, + "grad_norm": 3.6457784175872803, + "learning_rate": 2.2387407322109922e-07, + "loss": 0.8973, + "step": 11769 + }, + { + "epoch": 0.9073388837496146, + "grad_norm": 3.8819169998168945, + "learning_rate": 2.2350483180915206e-07, + "loss": 0.9255, + "step": 11770 + }, + { + "epoch": 0.9074159728646315, + "grad_norm": 4.195042610168457, + "learning_rate": 2.2313588818694055e-07, + "loss": 0.9305, + "step": 11771 + }, + { + "epoch": 0.9074930619796485, + "grad_norm": 3.5375936031341553, + "learning_rate": 2.2276724237746684e-07, + "loss": 0.9001, + "step": 11772 + }, + { + "epoch": 0.9075701510946654, + "grad_norm": 4.148855209350586, + "learning_rate": 2.223988944037131e-07, + "loss": 0.9946, + "step": 11773 + }, + { + "epoch": 0.9076472402096823, + "grad_norm": 3.680772304534912, + "learning_rate": 2.220308442886443e-07, + "loss": 0.8558, + "step": 11774 + }, + { + "epoch": 0.9077243293246994, + "grad_norm": 3.458876848220825, + "learning_rate": 2.2166309205520707e-07, + "loss": 0.9057, + "step": 11775 + }, + { + "epoch": 0.9078014184397163, + "grad_norm": 3.533604145050049, + "learning_rate": 2.2129563772632755e-07, + "loss": 0.8633, + "step": 11776 + }, + { + "epoch": 0.9078785075547333, + "grad_norm": 4.154286861419678, + "learning_rate": 2.2092848132491628e-07, + "loss": 1.0149, + "step": 11777 + }, + { + "epoch": 0.9079555966697502, + "grad_norm": 3.6534740924835205, + "learning_rate": 2.205616228738633e-07, + "loss": 0.7558, + "step": 11778 + }, + { + "epoch": 0.9080326857847671, + "grad_norm": 3.837028741836548, + "learning_rate": 2.201950623960386e-07, + "loss": 0.9613, + "step": 11779 + }, + { + "epoch": 0.9081097748997842, + "grad_norm": 3.6078686714172363, + "learning_rate": 2.1982879991429728e-07, + "loss": 0.8294, + "step": 11780 + }, + { + "epoch": 0.9081868640148011, + "grad_norm": 3.634082794189453, + "learning_rate": 2.1946283545147274e-07, + "loss": 0.9769, + "step": 11781 + }, + { + "epoch": 0.9082639531298181, + "grad_norm": 3.799694776535034, + "learning_rate": 2.1909716903038114e-07, + "loss": 0.945, + "step": 11782 + }, + { + "epoch": 0.908341042244835, + "grad_norm": 3.5379395484924316, + "learning_rate": 2.1873180067382095e-07, + "loss": 0.8928, + "step": 11783 + }, + { + "epoch": 0.9084181313598519, + "grad_norm": 3.9974184036254883, + "learning_rate": 2.1836673040456947e-07, + "loss": 0.946, + "step": 11784 + }, + { + "epoch": 0.908495220474869, + "grad_norm": 3.915295124053955, + "learning_rate": 2.1800195824538794e-07, + "loss": 1.0305, + "step": 11785 + }, + { + "epoch": 0.9085723095898859, + "grad_norm": 3.7968573570251465, + "learning_rate": 2.1763748421901764e-07, + "loss": 0.8681, + "step": 11786 + }, + { + "epoch": 0.9086493987049029, + "grad_norm": 3.79858136177063, + "learning_rate": 2.172733083481815e-07, + "loss": 0.8694, + "step": 11787 + }, + { + "epoch": 0.9087264878199198, + "grad_norm": 3.833800792694092, + "learning_rate": 2.1690943065558412e-07, + "loss": 0.9167, + "step": 11788 + }, + { + "epoch": 0.9088035769349367, + "grad_norm": 3.841184616088867, + "learning_rate": 2.1654585116391236e-07, + "loss": 0.8676, + "step": 11789 + }, + { + "epoch": 0.9088806660499538, + "grad_norm": 3.632209062576294, + "learning_rate": 2.1618256989583197e-07, + "loss": 0.9878, + "step": 11790 + }, + { + "epoch": 0.9089577551649707, + "grad_norm": 4.1442742347717285, + "learning_rate": 2.1581958687399206e-07, + "loss": 0.9334, + "step": 11791 + }, + { + "epoch": 0.9090348442799877, + "grad_norm": 3.917574167251587, + "learning_rate": 2.1545690212102344e-07, + "loss": 0.822, + "step": 11792 + }, + { + "epoch": 0.9091119333950046, + "grad_norm": 4.023653507232666, + "learning_rate": 2.1509451565953688e-07, + "loss": 0.9154, + "step": 11793 + }, + { + "epoch": 0.9091890225100215, + "grad_norm": 3.650171995162964, + "learning_rate": 2.1473242751212653e-07, + "loss": 0.7958, + "step": 11794 + }, + { + "epoch": 0.9092661116250386, + "grad_norm": 3.726482391357422, + "learning_rate": 2.143706377013649e-07, + "loss": 0.9593, + "step": 11795 + }, + { + "epoch": 0.9093432007400555, + "grad_norm": 3.3501317501068115, + "learning_rate": 2.140091462498084e-07, + "loss": 0.9107, + "step": 11796 + }, + { + "epoch": 0.9094202898550725, + "grad_norm": 3.5928847789764404, + "learning_rate": 2.1364795317999455e-07, + "loss": 0.845, + "step": 11797 + }, + { + "epoch": 0.9094973789700894, + "grad_norm": 3.925431489944458, + "learning_rate": 2.13287058514442e-07, + "loss": 0.9261, + "step": 11798 + }, + { + "epoch": 0.9095744680851063, + "grad_norm": 4.184495449066162, + "learning_rate": 2.1292646227564995e-07, + "loss": 1.0248, + "step": 11799 + }, + { + "epoch": 0.9096515572001234, + "grad_norm": 3.948833703994751, + "learning_rate": 2.125661644860999e-07, + "loss": 0.9499, + "step": 11800 + }, + { + "epoch": 0.9097286463151403, + "grad_norm": 3.7556886672973633, + "learning_rate": 2.1220616516825497e-07, + "loss": 0.909, + "step": 11801 + }, + { + "epoch": 0.9098057354301573, + "grad_norm": 3.4662973880767822, + "learning_rate": 2.1184646434455947e-07, + "loss": 0.9219, + "step": 11802 + }, + { + "epoch": 0.9098828245451742, + "grad_norm": 3.4703729152679443, + "learning_rate": 2.1148706203743762e-07, + "loss": 0.8062, + "step": 11803 + }, + { + "epoch": 0.9099599136601911, + "grad_norm": 3.3203537464141846, + "learning_rate": 2.111279582692982e-07, + "loss": 0.8091, + "step": 11804 + }, + { + "epoch": 0.9100370027752082, + "grad_norm": 3.9638702869415283, + "learning_rate": 2.1076915306252776e-07, + "loss": 0.9664, + "step": 11805 + }, + { + "epoch": 0.9101140918902251, + "grad_norm": 3.615192174911499, + "learning_rate": 2.104106464394967e-07, + "loss": 0.8336, + "step": 11806 + }, + { + "epoch": 0.9101911810052421, + "grad_norm": 4.019468784332275, + "learning_rate": 2.1005243842255552e-07, + "loss": 0.9677, + "step": 11807 + }, + { + "epoch": 0.910268270120259, + "grad_norm": 3.62318754196167, + "learning_rate": 2.0969452903403742e-07, + "loss": 0.902, + "step": 11808 + }, + { + "epoch": 0.9103453592352759, + "grad_norm": 3.653689384460449, + "learning_rate": 2.0933691829625624e-07, + "loss": 0.903, + "step": 11809 + }, + { + "epoch": 0.910422448350293, + "grad_norm": 3.717650890350342, + "learning_rate": 2.0897960623150581e-07, + "loss": 0.9309, + "step": 11810 + }, + { + "epoch": 0.9104995374653099, + "grad_norm": 3.7195537090301514, + "learning_rate": 2.0862259286206387e-07, + "loss": 0.9048, + "step": 11811 + }, + { + "epoch": 0.9105766265803269, + "grad_norm": 3.824012279510498, + "learning_rate": 2.0826587821018818e-07, + "loss": 0.8867, + "step": 11812 + }, + { + "epoch": 0.9106537156953438, + "grad_norm": 3.8283004760742188, + "learning_rate": 2.0790946229811758e-07, + "loss": 0.789, + "step": 11813 + }, + { + "epoch": 0.9107308048103607, + "grad_norm": 3.8848516941070557, + "learning_rate": 2.0755334514807379e-07, + "loss": 0.8425, + "step": 11814 + }, + { + "epoch": 0.9108078939253778, + "grad_norm": 3.4980616569519043, + "learning_rate": 2.0719752678225846e-07, + "loss": 0.849, + "step": 11815 + }, + { + "epoch": 0.9108849830403947, + "grad_norm": 3.96785044670105, + "learning_rate": 2.0684200722285386e-07, + "loss": 0.8735, + "step": 11816 + }, + { + "epoch": 0.9109620721554117, + "grad_norm": 3.910421133041382, + "learning_rate": 2.0648678649202615e-07, + "loss": 1.0155, + "step": 11817 + }, + { + "epoch": 0.9110391612704286, + "grad_norm": 3.8571994304656982, + "learning_rate": 2.0613186461192092e-07, + "loss": 0.7947, + "step": 11818 + }, + { + "epoch": 0.9111162503854455, + "grad_norm": 3.5077316761016846, + "learning_rate": 2.057772416046655e-07, + "loss": 0.8644, + "step": 11819 + }, + { + "epoch": 0.9111933395004626, + "grad_norm": 4.075991153717041, + "learning_rate": 2.0542291749237053e-07, + "loss": 0.8711, + "step": 11820 + }, + { + "epoch": 0.9112704286154795, + "grad_norm": 3.9802606105804443, + "learning_rate": 2.050688922971239e-07, + "loss": 0.7961, + "step": 11821 + }, + { + "epoch": 0.9113475177304965, + "grad_norm": 4.114253520965576, + "learning_rate": 2.0471516604099794e-07, + "loss": 0.8708, + "step": 11822 + }, + { + "epoch": 0.9114246068455134, + "grad_norm": 3.611233711242676, + "learning_rate": 2.0436173874604615e-07, + "loss": 0.7748, + "step": 11823 + }, + { + "epoch": 0.9115016959605303, + "grad_norm": 3.610714912414551, + "learning_rate": 2.0400861043430254e-07, + "loss": 0.6573, + "step": 11824 + }, + { + "epoch": 0.9115787850755473, + "grad_norm": 3.904794216156006, + "learning_rate": 2.036557811277834e-07, + "loss": 0.9248, + "step": 11825 + }, + { + "epoch": 0.9116558741905643, + "grad_norm": 3.6194839477539062, + "learning_rate": 2.0330325084848612e-07, + "loss": 0.837, + "step": 11826 + }, + { + "epoch": 0.9117329633055813, + "grad_norm": 3.733680248260498, + "learning_rate": 2.029510196183876e-07, + "loss": 0.8843, + "step": 11827 + }, + { + "epoch": 0.9118100524205982, + "grad_norm": 3.8289010524749756, + "learning_rate": 2.02599087459448e-07, + "loss": 0.9078, + "step": 11828 + }, + { + "epoch": 0.9118871415356151, + "grad_norm": 3.486313581466675, + "learning_rate": 2.0224745439360928e-07, + "loss": 0.8279, + "step": 11829 + }, + { + "epoch": 0.9119642306506321, + "grad_norm": 3.5716099739074707, + "learning_rate": 2.0189612044279384e-07, + "loss": 0.9217, + "step": 11830 + }, + { + "epoch": 0.9120413197656491, + "grad_norm": 3.4846935272216797, + "learning_rate": 2.015450856289053e-07, + "loss": 0.7435, + "step": 11831 + }, + { + "epoch": 0.9121184088806661, + "grad_norm": 3.8818373680114746, + "learning_rate": 2.0119434997382893e-07, + "loss": 0.9561, + "step": 11832 + }, + { + "epoch": 0.912195497995683, + "grad_norm": 3.465576171875, + "learning_rate": 2.0084391349943055e-07, + "loss": 0.8902, + "step": 11833 + }, + { + "epoch": 0.9122725871106999, + "grad_norm": 4.149135589599609, + "learning_rate": 2.0049377622755885e-07, + "loss": 0.9379, + "step": 11834 + }, + { + "epoch": 0.9123496762257169, + "grad_norm": 4.032505512237549, + "learning_rate": 2.0014393818004295e-07, + "loss": 0.8065, + "step": 11835 + }, + { + "epoch": 0.9124267653407339, + "grad_norm": 4.063005447387695, + "learning_rate": 1.9979439937869383e-07, + "loss": 1.0096, + "step": 11836 + }, + { + "epoch": 0.9125038544557509, + "grad_norm": 3.7113261222839355, + "learning_rate": 1.9944515984530343e-07, + "loss": 0.9335, + "step": 11837 + }, + { + "epoch": 0.9125809435707678, + "grad_norm": 4.403497219085693, + "learning_rate": 1.9909621960164382e-07, + "loss": 0.9352, + "step": 11838 + }, + { + "epoch": 0.9126580326857847, + "grad_norm": 3.810023069381714, + "learning_rate": 1.9874757866947036e-07, + "loss": 0.9586, + "step": 11839 + }, + { + "epoch": 0.9127351218008017, + "grad_norm": 3.5744166374206543, + "learning_rate": 1.98399237070519e-07, + "loss": 0.8838, + "step": 11840 + }, + { + "epoch": 0.9128122109158187, + "grad_norm": 3.613818407058716, + "learning_rate": 1.9805119482650737e-07, + "loss": 0.8913, + "step": 11841 + }, + { + "epoch": 0.9128893000308357, + "grad_norm": 3.6662750244140625, + "learning_rate": 1.977034519591342e-07, + "loss": 0.9187, + "step": 11842 + }, + { + "epoch": 0.9129663891458526, + "grad_norm": 4.007206439971924, + "learning_rate": 1.9735600849007774e-07, + "loss": 0.8897, + "step": 11843 + }, + { + "epoch": 0.9130434782608695, + "grad_norm": 4.291807651519775, + "learning_rate": 1.970088644410012e-07, + "loss": 0.8999, + "step": 11844 + }, + { + "epoch": 0.9131205673758865, + "grad_norm": 3.8497557640075684, + "learning_rate": 1.966620198335467e-07, + "loss": 0.8368, + "step": 11845 + }, + { + "epoch": 0.9131976564909035, + "grad_norm": 3.969374179840088, + "learning_rate": 1.963154746893392e-07, + "loss": 0.8243, + "step": 11846 + }, + { + "epoch": 0.9132747456059205, + "grad_norm": 3.7255005836486816, + "learning_rate": 1.9596922902998193e-07, + "loss": 0.8739, + "step": 11847 + }, + { + "epoch": 0.9133518347209374, + "grad_norm": 3.940896511077881, + "learning_rate": 1.956232828770621e-07, + "loss": 0.8913, + "step": 11848 + }, + { + "epoch": 0.9134289238359543, + "grad_norm": 3.7923643589019775, + "learning_rate": 1.9527763625214857e-07, + "loss": 0.906, + "step": 11849 + }, + { + "epoch": 0.9135060129509713, + "grad_norm": 3.9090356826782227, + "learning_rate": 1.9493228917679018e-07, + "loss": 0.9555, + "step": 11850 + }, + { + "epoch": 0.9135831020659883, + "grad_norm": 3.677847146987915, + "learning_rate": 1.9458724167251753e-07, + "loss": 0.8394, + "step": 11851 + }, + { + "epoch": 0.9136601911810053, + "grad_norm": 3.4867942333221436, + "learning_rate": 1.942424937608428e-07, + "loss": 0.8895, + "step": 11852 + }, + { + "epoch": 0.9137372802960222, + "grad_norm": 3.678907632827759, + "learning_rate": 1.9389804546325885e-07, + "loss": 0.832, + "step": 11853 + }, + { + "epoch": 0.9138143694110391, + "grad_norm": 3.7241756916046143, + "learning_rate": 1.935538968012396e-07, + "loss": 0.8667, + "step": 11854 + }, + { + "epoch": 0.9138914585260561, + "grad_norm": 3.8677916526794434, + "learning_rate": 1.9321004779624232e-07, + "loss": 0.8854, + "step": 11855 + }, + { + "epoch": 0.913968547641073, + "grad_norm": 3.4867103099823, + "learning_rate": 1.9286649846970318e-07, + "loss": 0.8591, + "step": 11856 + }, + { + "epoch": 0.9140456367560901, + "grad_norm": 3.9573867321014404, + "learning_rate": 1.9252324884304174e-07, + "loss": 0.8497, + "step": 11857 + }, + { + "epoch": 0.914122725871107, + "grad_norm": 3.7366578578948975, + "learning_rate": 1.9218029893765643e-07, + "loss": 0.9457, + "step": 11858 + }, + { + "epoch": 0.9141998149861239, + "grad_norm": 3.685131549835205, + "learning_rate": 1.9183764877492905e-07, + "loss": 0.8712, + "step": 11859 + }, + { + "epoch": 0.9142769041011409, + "grad_norm": 3.735658645629883, + "learning_rate": 1.914952983762225e-07, + "loss": 0.8748, + "step": 11860 + }, + { + "epoch": 0.9143539932161578, + "grad_norm": 4.044561386108398, + "learning_rate": 1.9115324776288024e-07, + "loss": 0.938, + "step": 11861 + }, + { + "epoch": 0.9144310823311749, + "grad_norm": 3.9689042568206787, + "learning_rate": 1.9081149695622693e-07, + "loss": 0.8393, + "step": 11862 + }, + { + "epoch": 0.9145081714461918, + "grad_norm": 3.726811170578003, + "learning_rate": 1.9047004597757045e-07, + "loss": 0.7973, + "step": 11863 + }, + { + "epoch": 0.9145852605612088, + "grad_norm": 3.691999673843384, + "learning_rate": 1.9012889484819665e-07, + "loss": 0.8179, + "step": 11864 + }, + { + "epoch": 0.9146623496762257, + "grad_norm": 3.436737298965454, + "learning_rate": 1.8978804358937508e-07, + "loss": 0.8289, + "step": 11865 + }, + { + "epoch": 0.9147394387912426, + "grad_norm": 4.035014629364014, + "learning_rate": 1.8944749222235658e-07, + "loss": 0.9811, + "step": 11866 + }, + { + "epoch": 0.9148165279062597, + "grad_norm": 3.6229660511016846, + "learning_rate": 1.8910724076837196e-07, + "loss": 0.9873, + "step": 11867 + }, + { + "epoch": 0.9148936170212766, + "grad_norm": 3.7031772136688232, + "learning_rate": 1.887672892486353e-07, + "loss": 1.0282, + "step": 11868 + }, + { + "epoch": 0.9149707061362936, + "grad_norm": 3.981867551803589, + "learning_rate": 1.8842763768434024e-07, + "loss": 0.9978, + "step": 11869 + }, + { + "epoch": 0.9150477952513105, + "grad_norm": 3.8303933143615723, + "learning_rate": 1.880882860966615e-07, + "loss": 0.9158, + "step": 11870 + }, + { + "epoch": 0.9151248843663274, + "grad_norm": 3.6401233673095703, + "learning_rate": 1.8774923450675665e-07, + "loss": 0.8627, + "step": 11871 + }, + { + "epoch": 0.9152019734813445, + "grad_norm": 3.968207359313965, + "learning_rate": 1.8741048293576424e-07, + "loss": 0.9463, + "step": 11872 + }, + { + "epoch": 0.9152790625963614, + "grad_norm": 3.688127040863037, + "learning_rate": 1.8707203140480245e-07, + "loss": 0.9945, + "step": 11873 + }, + { + "epoch": 0.9153561517113784, + "grad_norm": 3.7214889526367188, + "learning_rate": 1.8673387993497383e-07, + "loss": 0.8459, + "step": 11874 + }, + { + "epoch": 0.9154332408263953, + "grad_norm": 3.942366600036621, + "learning_rate": 1.8639602854735872e-07, + "loss": 0.7865, + "step": 11875 + }, + { + "epoch": 0.9155103299414122, + "grad_norm": 3.5286405086517334, + "learning_rate": 1.8605847726302085e-07, + "loss": 0.9035, + "step": 11876 + }, + { + "epoch": 0.9155874190564293, + "grad_norm": 3.608853816986084, + "learning_rate": 1.8572122610300447e-07, + "loss": 0.8938, + "step": 11877 + }, + { + "epoch": 0.9156645081714462, + "grad_norm": 3.8086750507354736, + "learning_rate": 1.8538427508833612e-07, + "loss": 0.8832, + "step": 11878 + }, + { + "epoch": 0.9157415972864632, + "grad_norm": 3.3695833683013916, + "learning_rate": 1.8504762424002342e-07, + "loss": 0.9145, + "step": 11879 + }, + { + "epoch": 0.9158186864014801, + "grad_norm": 3.7012627124786377, + "learning_rate": 1.8471127357905348e-07, + "loss": 0.9761, + "step": 11880 + }, + { + "epoch": 0.915895775516497, + "grad_norm": 3.448864459991455, + "learning_rate": 1.843752231263962e-07, + "loss": 0.7754, + "step": 11881 + }, + { + "epoch": 0.9159728646315141, + "grad_norm": 3.5888924598693848, + "learning_rate": 1.8403947290300318e-07, + "loss": 0.936, + "step": 11882 + }, + { + "epoch": 0.916049953746531, + "grad_norm": 3.8859617710113525, + "learning_rate": 1.8370402292980703e-07, + "loss": 0.9449, + "step": 11883 + }, + { + "epoch": 0.916127042861548, + "grad_norm": 3.799428939819336, + "learning_rate": 1.8336887322772e-07, + "loss": 0.8585, + "step": 11884 + }, + { + "epoch": 0.9162041319765649, + "grad_norm": 3.7906007766723633, + "learning_rate": 1.8303402381763924e-07, + "loss": 0.9822, + "step": 11885 + }, + { + "epoch": 0.9162812210915818, + "grad_norm": 3.4046058654785156, + "learning_rate": 1.8269947472043804e-07, + "loss": 0.8123, + "step": 11886 + }, + { + "epoch": 0.9163583102065989, + "grad_norm": 3.974215030670166, + "learning_rate": 1.823652259569747e-07, + "loss": 0.9087, + "step": 11887 + }, + { + "epoch": 0.9164353993216158, + "grad_norm": 3.9902284145355225, + "learning_rate": 1.8203127754808923e-07, + "loss": 0.937, + "step": 11888 + }, + { + "epoch": 0.9165124884366328, + "grad_norm": 3.7469322681427, + "learning_rate": 1.8169762951460112e-07, + "loss": 0.8929, + "step": 11889 + }, + { + "epoch": 0.9165895775516497, + "grad_norm": 3.5326128005981445, + "learning_rate": 1.8136428187731037e-07, + "loss": 0.8825, + "step": 11890 + }, + { + "epoch": 0.9166666666666666, + "grad_norm": 3.375762462615967, + "learning_rate": 1.810312346570009e-07, + "loss": 0.8043, + "step": 11891 + }, + { + "epoch": 0.9167437557816837, + "grad_norm": 3.9036519527435303, + "learning_rate": 1.8069848787443556e-07, + "loss": 1.036, + "step": 11892 + }, + { + "epoch": 0.9168208448967006, + "grad_norm": 3.671597719192505, + "learning_rate": 1.8036604155035942e-07, + "loss": 0.9171, + "step": 11893 + }, + { + "epoch": 0.9168979340117176, + "grad_norm": 3.8690693378448486, + "learning_rate": 1.8003389570549978e-07, + "loss": 0.8524, + "step": 11894 + }, + { + "epoch": 0.9169750231267345, + "grad_norm": 3.334073543548584, + "learning_rate": 1.7970205036056287e-07, + "loss": 0.9197, + "step": 11895 + }, + { + "epoch": 0.9170521122417514, + "grad_norm": 3.826357126235962, + "learning_rate": 1.793705055362388e-07, + "loss": 0.8555, + "step": 11896 + }, + { + "epoch": 0.9171292013567685, + "grad_norm": 3.7970800399780273, + "learning_rate": 1.7903926125319603e-07, + "loss": 0.845, + "step": 11897 + }, + { + "epoch": 0.9172062904717854, + "grad_norm": 3.9418017864227295, + "learning_rate": 1.7870831753208752e-07, + "loss": 0.9457, + "step": 11898 + }, + { + "epoch": 0.9172833795868024, + "grad_norm": 4.235414981842041, + "learning_rate": 1.7837767439354503e-07, + "loss": 0.9059, + "step": 11899 + }, + { + "epoch": 0.9173604687018193, + "grad_norm": 3.8422882556915283, + "learning_rate": 1.7804733185818378e-07, + "loss": 0.852, + "step": 11900 + }, + { + "epoch": 0.9174375578168362, + "grad_norm": 4.103250026702881, + "learning_rate": 1.7771728994659677e-07, + "loss": 0.8487, + "step": 11901 + }, + { + "epoch": 0.9175146469318533, + "grad_norm": 3.8441503047943115, + "learning_rate": 1.7738754867936193e-07, + "loss": 0.8913, + "step": 11902 + }, + { + "epoch": 0.9175917360468702, + "grad_norm": 3.94001841545105, + "learning_rate": 1.7705810807703616e-07, + "loss": 0.9332, + "step": 11903 + }, + { + "epoch": 0.9176688251618872, + "grad_norm": 3.453479766845703, + "learning_rate": 1.7672896816015861e-07, + "loss": 0.917, + "step": 11904 + }, + { + "epoch": 0.9177459142769041, + "grad_norm": 3.9532957077026367, + "learning_rate": 1.7640012894925008e-07, + "loss": 0.8864, + "step": 11905 + }, + { + "epoch": 0.917823003391921, + "grad_norm": 3.767510414123535, + "learning_rate": 1.7607159046481138e-07, + "loss": 0.9912, + "step": 11906 + }, + { + "epoch": 0.917900092506938, + "grad_norm": 3.960230588912964, + "learning_rate": 1.7574335272732445e-07, + "loss": 0.8642, + "step": 11907 + }, + { + "epoch": 0.917977181621955, + "grad_norm": 4.039930820465088, + "learning_rate": 1.7541541575725464e-07, + "loss": 1.0177, + "step": 11908 + }, + { + "epoch": 0.918054270736972, + "grad_norm": 4.087274074554443, + "learning_rate": 1.7508777957504662e-07, + "loss": 0.9013, + "step": 11909 + }, + { + "epoch": 0.9181313598519889, + "grad_norm": 3.746781826019287, + "learning_rate": 1.7476044420112637e-07, + "loss": 0.9299, + "step": 11910 + }, + { + "epoch": 0.9182084489670058, + "grad_norm": 3.875746250152588, + "learning_rate": 1.744334096559025e-07, + "loss": 0.8717, + "step": 11911 + }, + { + "epoch": 0.9182855380820228, + "grad_norm": 4.113644123077393, + "learning_rate": 1.741066759597626e-07, + "loss": 1.0246, + "step": 11912 + }, + { + "epoch": 0.9183626271970398, + "grad_norm": 3.610800266265869, + "learning_rate": 1.7378024313307763e-07, + "loss": 0.9421, + "step": 11913 + }, + { + "epoch": 0.9184397163120568, + "grad_norm": 4.1041107177734375, + "learning_rate": 1.7345411119619847e-07, + "loss": 0.8065, + "step": 11914 + }, + { + "epoch": 0.9185168054270737, + "grad_norm": 3.635951519012451, + "learning_rate": 1.7312828016945836e-07, + "loss": 0.9826, + "step": 11915 + }, + { + "epoch": 0.9185938945420906, + "grad_norm": 3.9342377185821533, + "learning_rate": 1.728027500731716e-07, + "loss": 0.981, + "step": 11916 + }, + { + "epoch": 0.9186709836571076, + "grad_norm": 3.4716575145721436, + "learning_rate": 1.7247752092763247e-07, + "loss": 0.9107, + "step": 11917 + }, + { + "epoch": 0.9187480727721246, + "grad_norm": 3.786710262298584, + "learning_rate": 1.7215259275311703e-07, + "loss": 0.9527, + "step": 11918 + }, + { + "epoch": 0.9188251618871416, + "grad_norm": 3.892406940460205, + "learning_rate": 1.71827965569884e-07, + "loss": 0.7812, + "step": 11919 + }, + { + "epoch": 0.9189022510021585, + "grad_norm": 3.8469479084014893, + "learning_rate": 1.7150363939817117e-07, + "loss": 0.919, + "step": 11920 + }, + { + "epoch": 0.9189793401171754, + "grad_norm": 3.4932711124420166, + "learning_rate": 1.7117961425819897e-07, + "loss": 0.9501, + "step": 11921 + }, + { + "epoch": 0.9190564292321924, + "grad_norm": 3.6135263442993164, + "learning_rate": 1.70855890170169e-07, + "loss": 0.8402, + "step": 11922 + }, + { + "epoch": 0.9191335183472094, + "grad_norm": 4.184671878814697, + "learning_rate": 1.7053246715426297e-07, + "loss": 0.8431, + "step": 11923 + }, + { + "epoch": 0.9192106074622264, + "grad_norm": 3.7733452320098877, + "learning_rate": 1.702093452306458e-07, + "loss": 0.9064, + "step": 11924 + }, + { + "epoch": 0.9192876965772433, + "grad_norm": 3.719705820083618, + "learning_rate": 1.698865244194614e-07, + "loss": 0.8943, + "step": 11925 + }, + { + "epoch": 0.9193647856922602, + "grad_norm": 3.8758575916290283, + "learning_rate": 1.6956400474083644e-07, + "loss": 0.9384, + "step": 11926 + }, + { + "epoch": 0.9194418748072772, + "grad_norm": 3.6898250579833984, + "learning_rate": 1.6924178621487875e-07, + "loss": 0.8039, + "step": 11927 + }, + { + "epoch": 0.9195189639222942, + "grad_norm": 3.731034755706787, + "learning_rate": 1.689198688616761e-07, + "loss": 0.9268, + "step": 11928 + }, + { + "epoch": 0.9195960530373112, + "grad_norm": 3.5098345279693604, + "learning_rate": 1.685982527012986e-07, + "loss": 0.8259, + "step": 11929 + }, + { + "epoch": 0.9196731421523281, + "grad_norm": 3.9864370822906494, + "learning_rate": 1.6827693775379794e-07, + "loss": 0.8976, + "step": 11930 + }, + { + "epoch": 0.919750231267345, + "grad_norm": 3.444135904312134, + "learning_rate": 1.6795592403920591e-07, + "loss": 0.8563, + "step": 11931 + }, + { + "epoch": 0.919827320382362, + "grad_norm": 3.5715203285217285, + "learning_rate": 1.6763521157753647e-07, + "loss": 0.8746, + "step": 11932 + }, + { + "epoch": 0.919904409497379, + "grad_norm": 4.088859558105469, + "learning_rate": 1.6731480038878368e-07, + "loss": 0.9353, + "step": 11933 + }, + { + "epoch": 0.919981498612396, + "grad_norm": 3.915494918823242, + "learning_rate": 1.6699469049292427e-07, + "loss": 0.946, + "step": 11934 + }, + { + "epoch": 0.9200585877274129, + "grad_norm": 3.923362970352173, + "learning_rate": 1.666748819099151e-07, + "loss": 0.9018, + "step": 11935 + }, + { + "epoch": 0.9201356768424298, + "grad_norm": 3.7624075412750244, + "learning_rate": 1.6635537465969463e-07, + "loss": 0.8156, + "step": 11936 + }, + { + "epoch": 0.9202127659574468, + "grad_norm": 3.9974863529205322, + "learning_rate": 1.6603616876218308e-07, + "loss": 0.8917, + "step": 11937 + }, + { + "epoch": 0.9202898550724637, + "grad_norm": 4.101132392883301, + "learning_rate": 1.6571726423727953e-07, + "loss": 0.9253, + "step": 11938 + }, + { + "epoch": 0.9203669441874808, + "grad_norm": 4.09736442565918, + "learning_rate": 1.6539866110486802e-07, + "loss": 0.9177, + "step": 11939 + }, + { + "epoch": 0.9204440333024977, + "grad_norm": 3.8575503826141357, + "learning_rate": 1.6508035938481048e-07, + "loss": 0.9692, + "step": 11940 + }, + { + "epoch": 0.9205211224175146, + "grad_norm": 3.686838150024414, + "learning_rate": 1.6476235909695158e-07, + "loss": 0.9461, + "step": 11941 + }, + { + "epoch": 0.9205982115325316, + "grad_norm": 3.914886236190796, + "learning_rate": 1.6444466026111826e-07, + "loss": 0.9885, + "step": 11942 + }, + { + "epoch": 0.9206753006475485, + "grad_norm": 4.105428695678711, + "learning_rate": 1.6412726289711578e-07, + "loss": 0.9431, + "step": 11943 + }, + { + "epoch": 0.9207523897625656, + "grad_norm": 6.005154132843018, + "learning_rate": 1.6381016702473273e-07, + "loss": 0.7875, + "step": 11944 + }, + { + "epoch": 0.9208294788775825, + "grad_norm": 4.297434329986572, + "learning_rate": 1.6349337266373832e-07, + "loss": 1.0278, + "step": 11945 + }, + { + "epoch": 0.9209065679925994, + "grad_norm": 3.489044189453125, + "learning_rate": 1.631768798338834e-07, + "loss": 0.828, + "step": 11946 + }, + { + "epoch": 0.9209836571076164, + "grad_norm": 3.5091493129730225, + "learning_rate": 1.6286068855489946e-07, + "loss": 0.804, + "step": 11947 + }, + { + "epoch": 0.9210607462226333, + "grad_norm": 3.7728817462921143, + "learning_rate": 1.625447988465001e-07, + "loss": 0.9729, + "step": 11948 + }, + { + "epoch": 0.9211378353376504, + "grad_norm": 3.835939884185791, + "learning_rate": 1.6222921072837794e-07, + "loss": 0.9318, + "step": 11949 + }, + { + "epoch": 0.9212149244526673, + "grad_norm": 3.666374921798706, + "learning_rate": 1.6191392422020892e-07, + "loss": 0.922, + "step": 11950 + }, + { + "epoch": 0.9212920135676842, + "grad_norm": 3.756239652633667, + "learning_rate": 1.6159893934165006e-07, + "loss": 0.9781, + "step": 11951 + }, + { + "epoch": 0.9213691026827012, + "grad_norm": 3.4061012268066406, + "learning_rate": 1.6128425611233844e-07, + "loss": 0.7702, + "step": 11952 + }, + { + "epoch": 0.9214461917977181, + "grad_norm": 3.5357954502105713, + "learning_rate": 1.6096987455189338e-07, + "loss": 0.8715, + "step": 11953 + }, + { + "epoch": 0.9215232809127352, + "grad_norm": 4.1103596687316895, + "learning_rate": 1.6065579467991422e-07, + "loss": 0.8386, + "step": 11954 + }, + { + "epoch": 0.9216003700277521, + "grad_norm": 3.631218433380127, + "learning_rate": 1.6034201651598304e-07, + "loss": 0.7605, + "step": 11955 + }, + { + "epoch": 0.921677459142769, + "grad_norm": 3.968055248260498, + "learning_rate": 1.600285400796614e-07, + "loss": 1.0534, + "step": 11956 + }, + { + "epoch": 0.921754548257786, + "grad_norm": 4.089054107666016, + "learning_rate": 1.597153653904937e-07, + "loss": 0.9012, + "step": 11957 + }, + { + "epoch": 0.9218316373728029, + "grad_norm": 3.858142375946045, + "learning_rate": 1.594024924680043e-07, + "loss": 0.8826, + "step": 11958 + }, + { + "epoch": 0.92190872648782, + "grad_norm": 3.8919715881347656, + "learning_rate": 1.5908992133170041e-07, + "loss": 0.7672, + "step": 11959 + }, + { + "epoch": 0.9219858156028369, + "grad_norm": 4.094879150390625, + "learning_rate": 1.5877765200106697e-07, + "loss": 1.0555, + "step": 11960 + }, + { + "epoch": 0.9220629047178538, + "grad_norm": 3.7169480323791504, + "learning_rate": 1.5846568449557397e-07, + "loss": 0.7915, + "step": 11961 + }, + { + "epoch": 0.9221399938328708, + "grad_norm": 3.7812538146972656, + "learning_rate": 1.5815401883467086e-07, + "loss": 0.8998, + "step": 11962 + }, + { + "epoch": 0.9222170829478877, + "grad_norm": 3.6201839447021484, + "learning_rate": 1.578426550377876e-07, + "loss": 0.8315, + "step": 11963 + }, + { + "epoch": 0.9222941720629048, + "grad_norm": 3.6739892959594727, + "learning_rate": 1.5753159312433762e-07, + "loss": 0.9893, + "step": 11964 + }, + { + "epoch": 0.9223712611779217, + "grad_norm": 3.594050884246826, + "learning_rate": 1.5722083311371206e-07, + "loss": 0.8761, + "step": 11965 + }, + { + "epoch": 0.9224483502929386, + "grad_norm": 3.51835298538208, + "learning_rate": 1.569103750252865e-07, + "loss": 0.9165, + "step": 11966 + }, + { + "epoch": 0.9225254394079556, + "grad_norm": 3.550865650177002, + "learning_rate": 1.5660021887841603e-07, + "loss": 0.8594, + "step": 11967 + }, + { + "epoch": 0.9226025285229725, + "grad_norm": 3.8896725177764893, + "learning_rate": 1.5629036469243686e-07, + "loss": 0.8592, + "step": 11968 + }, + { + "epoch": 0.9226796176379896, + "grad_norm": 3.8969407081604004, + "learning_rate": 1.5598081248666742e-07, + "loss": 0.8356, + "step": 11969 + }, + { + "epoch": 0.9227567067530065, + "grad_norm": 3.598137378692627, + "learning_rate": 1.5567156228040726e-07, + "loss": 0.8937, + "step": 11970 + }, + { + "epoch": 0.9228337958680234, + "grad_norm": 3.7268011569976807, + "learning_rate": 1.553626140929354e-07, + "loss": 0.9288, + "step": 11971 + }, + { + "epoch": 0.9229108849830404, + "grad_norm": 3.700021266937256, + "learning_rate": 1.5505396794351312e-07, + "loss": 0.8649, + "step": 11972 + }, + { + "epoch": 0.9229879740980573, + "grad_norm": 3.5971262454986572, + "learning_rate": 1.5474562385138337e-07, + "loss": 0.8113, + "step": 11973 + }, + { + "epoch": 0.9230650632130744, + "grad_norm": 5.08428955078125, + "learning_rate": 1.5443758183576962e-07, + "loss": 0.9799, + "step": 11974 + }, + { + "epoch": 0.9231421523280913, + "grad_norm": 4.136900424957275, + "learning_rate": 1.5412984191587766e-07, + "loss": 0.9596, + "step": 11975 + }, + { + "epoch": 0.9232192414431082, + "grad_norm": 3.8938400745391846, + "learning_rate": 1.5382240411089156e-07, + "loss": 0.9409, + "step": 11976 + }, + { + "epoch": 0.9232963305581252, + "grad_norm": 4.196589469909668, + "learning_rate": 1.5351526843997988e-07, + "loss": 0.96, + "step": 11977 + }, + { + "epoch": 0.9233734196731421, + "grad_norm": 4.031776428222656, + "learning_rate": 1.532084349222912e-07, + "loss": 0.9206, + "step": 11978 + }, + { + "epoch": 0.9234505087881592, + "grad_norm": 3.581867218017578, + "learning_rate": 1.5290190357695466e-07, + "loss": 0.8353, + "step": 11979 + }, + { + "epoch": 0.9235275979031761, + "grad_norm": 3.6931517124176025, + "learning_rate": 1.5259567442308e-07, + "loss": 0.7995, + "step": 11980 + }, + { + "epoch": 0.923604687018193, + "grad_norm": 3.691195011138916, + "learning_rate": 1.5228974747975965e-07, + "loss": 0.8052, + "step": 11981 + }, + { + "epoch": 0.92368177613321, + "grad_norm": 3.95929217338562, + "learning_rate": 1.5198412276606622e-07, + "loss": 0.9099, + "step": 11982 + }, + { + "epoch": 0.9237588652482269, + "grad_norm": 3.7074384689331055, + "learning_rate": 1.5167880030105497e-07, + "loss": 0.8664, + "step": 11983 + }, + { + "epoch": 0.923835954363244, + "grad_norm": 4.384084224700928, + "learning_rate": 1.5137378010376015e-07, + "loss": 1.0385, + "step": 11984 + }, + { + "epoch": 0.9239130434782609, + "grad_norm": 3.809328556060791, + "learning_rate": 1.5106906219319871e-07, + "loss": 0.8989, + "step": 11985 + }, + { + "epoch": 0.9239901325932778, + "grad_norm": 3.73268461227417, + "learning_rate": 1.5076464658836775e-07, + "loss": 0.9453, + "step": 11986 + }, + { + "epoch": 0.9240672217082948, + "grad_norm": 4.032083988189697, + "learning_rate": 1.504605333082465e-07, + "loss": 0.9235, + "step": 11987 + }, + { + "epoch": 0.9241443108233117, + "grad_norm": 3.3622939586639404, + "learning_rate": 1.5015672237179424e-07, + "loss": 0.8173, + "step": 11988 + }, + { + "epoch": 0.9242213999383287, + "grad_norm": 3.6310486793518066, + "learning_rate": 1.498532137979525e-07, + "loss": 0.9421, + "step": 11989 + }, + { + "epoch": 0.9242984890533457, + "grad_norm": 3.632511615753174, + "learning_rate": 1.495500076056433e-07, + "loss": 0.8727, + "step": 11990 + }, + { + "epoch": 0.9243755781683626, + "grad_norm": 4.108543872833252, + "learning_rate": 1.4924710381376995e-07, + "loss": 0.984, + "step": 11991 + }, + { + "epoch": 0.9244526672833796, + "grad_norm": 3.7871105670928955, + "learning_rate": 1.4894450244121727e-07, + "loss": 0.8612, + "step": 11992 + }, + { + "epoch": 0.9245297563983965, + "grad_norm": 3.8213963508605957, + "learning_rate": 1.486422035068502e-07, + "loss": 1.1303, + "step": 11993 + }, + { + "epoch": 0.9246068455134135, + "grad_norm": 3.63419508934021, + "learning_rate": 1.483402070295159e-07, + "loss": 0.804, + "step": 11994 + }, + { + "epoch": 0.9246839346284305, + "grad_norm": 3.4561123847961426, + "learning_rate": 1.4803851302804261e-07, + "loss": 0.9273, + "step": 11995 + }, + { + "epoch": 0.9247610237434474, + "grad_norm": 3.9428532123565674, + "learning_rate": 1.477371215212392e-07, + "loss": 0.9112, + "step": 11996 + }, + { + "epoch": 0.9248381128584644, + "grad_norm": 3.147937774658203, + "learning_rate": 1.474360325278956e-07, + "loss": 0.8637, + "step": 11997 + }, + { + "epoch": 0.9249152019734813, + "grad_norm": 3.6595137119293213, + "learning_rate": 1.4713524606678298e-07, + "loss": 0.9017, + "step": 11998 + }, + { + "epoch": 0.9249922910884983, + "grad_norm": 4.111579895019531, + "learning_rate": 1.4683476215665405e-07, + "loss": 1.0554, + "step": 11999 + }, + { + "epoch": 0.9250693802035153, + "grad_norm": 3.70733642578125, + "learning_rate": 1.465345808162427e-07, + "loss": 0.9537, + "step": 12000 + }, + { + "epoch": 0.9251464693185322, + "grad_norm": 3.986865758895874, + "learning_rate": 1.4623470206426404e-07, + "loss": 1.0235, + "step": 12001 + }, + { + "epoch": 0.9252235584335492, + "grad_norm": 3.7944588661193848, + "learning_rate": 1.4593512591941304e-07, + "loss": 0.8312, + "step": 12002 + }, + { + "epoch": 0.9253006475485661, + "grad_norm": 3.5245468616485596, + "learning_rate": 1.4563585240036705e-07, + "loss": 0.8643, + "step": 12003 + }, + { + "epoch": 0.9253777366635831, + "grad_norm": 3.854820966720581, + "learning_rate": 1.4533688152578384e-07, + "loss": 0.8276, + "step": 12004 + }, + { + "epoch": 0.9254548257786, + "grad_norm": 4.025414943695068, + "learning_rate": 1.4503821331430358e-07, + "loss": 0.859, + "step": 12005 + }, + { + "epoch": 0.925531914893617, + "grad_norm": 3.724771499633789, + "learning_rate": 1.447398477845463e-07, + "loss": 0.8583, + "step": 12006 + }, + { + "epoch": 0.925609004008634, + "grad_norm": 3.5540995597839355, + "learning_rate": 1.444417849551133e-07, + "loss": 0.871, + "step": 12007 + }, + { + "epoch": 0.9256860931236509, + "grad_norm": 3.8475286960601807, + "learning_rate": 1.4414402484458746e-07, + "loss": 0.9248, + "step": 12008 + }, + { + "epoch": 0.9257631822386679, + "grad_norm": 3.745168924331665, + "learning_rate": 1.438465674715328e-07, + "loss": 0.9334, + "step": 12009 + }, + { + "epoch": 0.9258402713536849, + "grad_norm": 4.050620079040527, + "learning_rate": 1.4354941285449342e-07, + "loss": 0.9622, + "step": 12010 + }, + { + "epoch": 0.9259173604687018, + "grad_norm": 3.5585291385650635, + "learning_rate": 1.4325256101199615e-07, + "loss": 0.9217, + "step": 12011 + }, + { + "epoch": 0.9259944495837188, + "grad_norm": 3.585474729537964, + "learning_rate": 1.4295601196254838e-07, + "loss": 0.8582, + "step": 12012 + }, + { + "epoch": 0.9260715386987357, + "grad_norm": 3.88104510307312, + "learning_rate": 1.4265976572463815e-07, + "loss": 0.9336, + "step": 12013 + }, + { + "epoch": 0.9261486278137527, + "grad_norm": 3.3760218620300293, + "learning_rate": 1.4236382231673395e-07, + "loss": 0.8343, + "step": 12014 + }, + { + "epoch": 0.9262257169287696, + "grad_norm": 3.9959182739257812, + "learning_rate": 1.420681817572872e-07, + "loss": 0.9329, + "step": 12015 + }, + { + "epoch": 0.9263028060437866, + "grad_norm": 3.396411895751953, + "learning_rate": 1.417728440647298e-07, + "loss": 0.7385, + "step": 12016 + }, + { + "epoch": 0.9263798951588036, + "grad_norm": 3.7224302291870117, + "learning_rate": 1.4147780925747368e-07, + "loss": 0.8898, + "step": 12017 + }, + { + "epoch": 0.9264569842738205, + "grad_norm": 3.9120421409606934, + "learning_rate": 1.4118307735391412e-07, + "loss": 0.8889, + "step": 12018 + }, + { + "epoch": 0.9265340733888375, + "grad_norm": 3.755709171295166, + "learning_rate": 1.4088864837242422e-07, + "loss": 0.9396, + "step": 12019 + }, + { + "epoch": 0.9266111625038544, + "grad_norm": 3.8816263675689697, + "learning_rate": 1.4059452233136094e-07, + "loss": 0.8704, + "step": 12020 + }, + { + "epoch": 0.9266882516188714, + "grad_norm": 3.5586330890655518, + "learning_rate": 1.4030069924906241e-07, + "loss": 0.8279, + "step": 12021 + }, + { + "epoch": 0.9267653407338884, + "grad_norm": 3.6975462436676025, + "learning_rate": 1.4000717914384677e-07, + "loss": 0.9152, + "step": 12022 + }, + { + "epoch": 0.9268424298489053, + "grad_norm": 3.672626256942749, + "learning_rate": 1.397139620340121e-07, + "loss": 0.8989, + "step": 12023 + }, + { + "epoch": 0.9269195189639223, + "grad_norm": 3.6923696994781494, + "learning_rate": 1.3942104793783996e-07, + "loss": 0.8973, + "step": 12024 + }, + { + "epoch": 0.9269966080789392, + "grad_norm": 3.7403361797332764, + "learning_rate": 1.391284368735918e-07, + "loss": 0.8737, + "step": 12025 + }, + { + "epoch": 0.9270736971939562, + "grad_norm": 4.060670852661133, + "learning_rate": 1.388361288595108e-07, + "loss": 0.9774, + "step": 12026 + }, + { + "epoch": 0.9271507863089732, + "grad_norm": 3.923124313354492, + "learning_rate": 1.3854412391382078e-07, + "loss": 0.9751, + "step": 12027 + }, + { + "epoch": 0.9272278754239901, + "grad_norm": 3.894160032272339, + "learning_rate": 1.3825242205472599e-07, + "loss": 0.919, + "step": 12028 + }, + { + "epoch": 0.9273049645390071, + "grad_norm": 3.898775815963745, + "learning_rate": 1.3796102330041305e-07, + "loss": 0.9102, + "step": 12029 + }, + { + "epoch": 0.927382053654024, + "grad_norm": 3.7741925716400146, + "learning_rate": 1.3766992766904908e-07, + "loss": 0.7993, + "step": 12030 + }, + { + "epoch": 0.927459142769041, + "grad_norm": 3.740205764770508, + "learning_rate": 1.3737913517878286e-07, + "loss": 0.9111, + "step": 12031 + }, + { + "epoch": 0.927536231884058, + "grad_norm": 3.5820446014404297, + "learning_rate": 1.370886458477433e-07, + "loss": 0.8983, + "step": 12032 + }, + { + "epoch": 0.9276133209990749, + "grad_norm": 3.634716510772705, + "learning_rate": 1.3679845969404138e-07, + "loss": 0.7987, + "step": 12033 + }, + { + "epoch": 0.9276904101140919, + "grad_norm": 3.8393595218658447, + "learning_rate": 1.3650857673576767e-07, + "loss": 0.8793, + "step": 12034 + }, + { + "epoch": 0.9277674992291088, + "grad_norm": 3.9487619400024414, + "learning_rate": 1.362189969909955e-07, + "loss": 0.8779, + "step": 12035 + }, + { + "epoch": 0.9278445883441259, + "grad_norm": 3.9395010471343994, + "learning_rate": 1.3592972047777874e-07, + "loss": 0.9886, + "step": 12036 + }, + { + "epoch": 0.9279216774591428, + "grad_norm": 3.642089366912842, + "learning_rate": 1.3564074721415243e-07, + "loss": 0.8256, + "step": 12037 + }, + { + "epoch": 0.9279987665741597, + "grad_norm": 3.643852472305298, + "learning_rate": 1.3535207721813327e-07, + "loss": 0.8572, + "step": 12038 + }, + { + "epoch": 0.9280758556891767, + "grad_norm": 3.6606554985046387, + "learning_rate": 1.3506371050771626e-07, + "loss": 0.9307, + "step": 12039 + }, + { + "epoch": 0.9281529448041936, + "grad_norm": 3.6610138416290283, + "learning_rate": 1.3477564710088097e-07, + "loss": 0.8941, + "step": 12040 + }, + { + "epoch": 0.9282300339192107, + "grad_norm": 3.769508123397827, + "learning_rate": 1.3448788701558635e-07, + "loss": 0.8769, + "step": 12041 + }, + { + "epoch": 0.9283071230342276, + "grad_norm": 3.8001794815063477, + "learning_rate": 1.3420043026977303e-07, + "loss": 0.9133, + "step": 12042 + }, + { + "epoch": 0.9283842121492445, + "grad_norm": 3.7807233333587646, + "learning_rate": 1.3391327688136225e-07, + "loss": 0.9274, + "step": 12043 + }, + { + "epoch": 0.9284613012642615, + "grad_norm": 3.714263677597046, + "learning_rate": 1.3362642686825688e-07, + "loss": 0.8069, + "step": 12044 + }, + { + "epoch": 0.9285383903792784, + "grad_norm": 3.7317070960998535, + "learning_rate": 1.3333988024833989e-07, + "loss": 0.9593, + "step": 12045 + }, + { + "epoch": 0.9286154794942955, + "grad_norm": 3.5440969467163086, + "learning_rate": 1.3305363703947581e-07, + "loss": 0.8853, + "step": 12046 + }, + { + "epoch": 0.9286925686093124, + "grad_norm": 3.542323112487793, + "learning_rate": 1.3276769725951155e-07, + "loss": 0.8715, + "step": 12047 + }, + { + "epoch": 0.9287696577243293, + "grad_norm": 3.7308967113494873, + "learning_rate": 1.3248206092627281e-07, + "loss": 0.8932, + "step": 12048 + }, + { + "epoch": 0.9288467468393463, + "grad_norm": 3.898761034011841, + "learning_rate": 1.3219672805756922e-07, + "loss": 0.9775, + "step": 12049 + }, + { + "epoch": 0.9289238359543632, + "grad_norm": 3.488557815551758, + "learning_rate": 1.319116986711877e-07, + "loss": 0.9247, + "step": 12050 + }, + { + "epoch": 0.9290009250693803, + "grad_norm": 4.142662048339844, + "learning_rate": 1.31626972784899e-07, + "loss": 0.9129, + "step": 12051 + }, + { + "epoch": 0.9290780141843972, + "grad_norm": 3.6546530723571777, + "learning_rate": 1.3134255041645505e-07, + "loss": 0.8448, + "step": 12052 + }, + { + "epoch": 0.9291551032994141, + "grad_norm": 3.8783202171325684, + "learning_rate": 1.310584315835872e-07, + "loss": 0.938, + "step": 12053 + }, + { + "epoch": 0.9292321924144311, + "grad_norm": 3.5673296451568604, + "learning_rate": 1.3077461630400967e-07, + "loss": 0.9903, + "step": 12054 + }, + { + "epoch": 0.929309281529448, + "grad_norm": 3.872450590133667, + "learning_rate": 1.3049110459541658e-07, + "loss": 0.8605, + "step": 12055 + }, + { + "epoch": 0.929386370644465, + "grad_norm": 3.5778377056121826, + "learning_rate": 1.3020789647548326e-07, + "loss": 0.8794, + "step": 12056 + }, + { + "epoch": 0.929463459759482, + "grad_norm": 3.7152254581451416, + "learning_rate": 1.2992499196186614e-07, + "loss": 0.8878, + "step": 12057 + }, + { + "epoch": 0.9295405488744989, + "grad_norm": 3.6128242015838623, + "learning_rate": 1.296423910722028e-07, + "loss": 0.9107, + "step": 12058 + }, + { + "epoch": 0.9296176379895159, + "grad_norm": 3.4194366931915283, + "learning_rate": 1.293600938241124e-07, + "loss": 0.8509, + "step": 12059 + }, + { + "epoch": 0.9296947271045328, + "grad_norm": 3.806325912475586, + "learning_rate": 1.2907810023519485e-07, + "loss": 0.8803, + "step": 12060 + }, + { + "epoch": 0.9297718162195499, + "grad_norm": 3.6678504943847656, + "learning_rate": 1.2879641032302993e-07, + "loss": 0.9138, + "step": 12061 + }, + { + "epoch": 0.9298489053345668, + "grad_norm": 3.877365827560425, + "learning_rate": 1.2851502410518025e-07, + "loss": 0.9103, + "step": 12062 + }, + { + "epoch": 0.9299259944495837, + "grad_norm": 3.8869688510894775, + "learning_rate": 1.28233941599189e-07, + "loss": 0.9129, + "step": 12063 + }, + { + "epoch": 0.9300030835646007, + "grad_norm": 4.313940525054932, + "learning_rate": 1.279531628225794e-07, + "loss": 0.8989, + "step": 12064 + }, + { + "epoch": 0.9300801726796176, + "grad_norm": 3.7056541442871094, + "learning_rate": 1.2767268779285803e-07, + "loss": 0.8635, + "step": 12065 + }, + { + "epoch": 0.9301572617946346, + "grad_norm": 3.957458019256592, + "learning_rate": 1.2739251652750917e-07, + "loss": 0.8531, + "step": 12066 + }, + { + "epoch": 0.9302343509096516, + "grad_norm": 4.063552379608154, + "learning_rate": 1.2711264904400167e-07, + "loss": 1.0151, + "step": 12067 + }, + { + "epoch": 0.9303114400246685, + "grad_norm": 4.064858436584473, + "learning_rate": 1.2683308535978323e-07, + "loss": 0.9269, + "step": 12068 + }, + { + "epoch": 0.9303885291396855, + "grad_norm": 3.7394871711730957, + "learning_rate": 1.2655382549228267e-07, + "loss": 0.8913, + "step": 12069 + }, + { + "epoch": 0.9304656182547024, + "grad_norm": 3.8202192783355713, + "learning_rate": 1.2627486945891166e-07, + "loss": 0.9727, + "step": 12070 + }, + { + "epoch": 0.9305427073697194, + "grad_norm": 3.5857367515563965, + "learning_rate": 1.2599621727706014e-07, + "loss": 0.9116, + "step": 12071 + }, + { + "epoch": 0.9306197964847364, + "grad_norm": 3.656193494796753, + "learning_rate": 1.2571786896410144e-07, + "loss": 0.8853, + "step": 12072 + }, + { + "epoch": 0.9306968855997533, + "grad_norm": 3.4861977100372314, + "learning_rate": 1.2543982453738945e-07, + "loss": 0.8542, + "step": 12073 + }, + { + "epoch": 0.9307739747147703, + "grad_norm": 3.6881868839263916, + "learning_rate": 1.251620840142581e-07, + "loss": 0.9306, + "step": 12074 + }, + { + "epoch": 0.9308510638297872, + "grad_norm": 4.058561325073242, + "learning_rate": 1.2488464741202355e-07, + "loss": 0.9352, + "step": 12075 + }, + { + "epoch": 0.9309281529448042, + "grad_norm": 4.049665927886963, + "learning_rate": 1.2460751474798249e-07, + "loss": 1.0101, + "step": 12076 + }, + { + "epoch": 0.9310052420598212, + "grad_norm": 3.6814520359039307, + "learning_rate": 1.2433068603941223e-07, + "loss": 0.881, + "step": 12077 + }, + { + "epoch": 0.9310823311748381, + "grad_norm": 4.561155796051025, + "learning_rate": 1.2405416130357172e-07, + "loss": 0.9776, + "step": 12078 + }, + { + "epoch": 0.9311594202898551, + "grad_norm": 4.181887149810791, + "learning_rate": 1.2377794055770164e-07, + "loss": 0.8847, + "step": 12079 + }, + { + "epoch": 0.931236509404872, + "grad_norm": 3.861226797103882, + "learning_rate": 1.235020238190221e-07, + "loss": 1.0193, + "step": 12080 + }, + { + "epoch": 0.931313598519889, + "grad_norm": 3.324911594390869, + "learning_rate": 1.23226411104736e-07, + "loss": 0.8281, + "step": 12081 + }, + { + "epoch": 0.931390687634906, + "grad_norm": 3.70337176322937, + "learning_rate": 1.2295110243202458e-07, + "loss": 0.9571, + "step": 12082 + }, + { + "epoch": 0.9314677767499229, + "grad_norm": 3.8327345848083496, + "learning_rate": 1.2267609781805356e-07, + "loss": 0.8885, + "step": 12083 + }, + { + "epoch": 0.9315448658649399, + "grad_norm": 3.6880276203155518, + "learning_rate": 1.2240139727996757e-07, + "loss": 0.8384, + "step": 12084 + }, + { + "epoch": 0.9316219549799568, + "grad_norm": 3.634875774383545, + "learning_rate": 1.2212700083489236e-07, + "loss": 0.9202, + "step": 12085 + }, + { + "epoch": 0.9316990440949738, + "grad_norm": 3.5088517665863037, + "learning_rate": 1.2185290849993648e-07, + "loss": 0.8589, + "step": 12086 + }, + { + "epoch": 0.9317761332099908, + "grad_norm": 3.7651207447052, + "learning_rate": 1.2157912029218676e-07, + "loss": 0.899, + "step": 12087 + }, + { + "epoch": 0.9318532223250077, + "grad_norm": 3.844053030014038, + "learning_rate": 1.2130563622871239e-07, + "loss": 0.913, + "step": 12088 + }, + { + "epoch": 0.9319303114400247, + "grad_norm": 3.8823323249816895, + "learning_rate": 1.2103245632656414e-07, + "loss": 0.9003, + "step": 12089 + }, + { + "epoch": 0.9320074005550416, + "grad_norm": 3.5348687171936035, + "learning_rate": 1.2075958060277394e-07, + "loss": 0.8039, + "step": 12090 + }, + { + "epoch": 0.9320844896700586, + "grad_norm": 3.1809444427490234, + "learning_rate": 1.2048700907435318e-07, + "loss": 0.7699, + "step": 12091 + }, + { + "epoch": 0.9321615787850756, + "grad_norm": 3.506540298461914, + "learning_rate": 1.2021474175829662e-07, + "loss": 0.9134, + "step": 12092 + }, + { + "epoch": 0.9322386679000925, + "grad_norm": 3.255476236343384, + "learning_rate": 1.1994277867157734e-07, + "loss": 0.8231, + "step": 12093 + }, + { + "epoch": 0.9323157570151095, + "grad_norm": 3.484185218811035, + "learning_rate": 1.196711198311512e-07, + "loss": 0.8325, + "step": 12094 + }, + { + "epoch": 0.9323928461301264, + "grad_norm": 4.07226037979126, + "learning_rate": 1.1939976525395468e-07, + "loss": 0.9041, + "step": 12095 + }, + { + "epoch": 0.9324699352451434, + "grad_norm": 3.9831414222717285, + "learning_rate": 1.1912871495690592e-07, + "loss": 0.9138, + "step": 12096 + }, + { + "epoch": 0.9325470243601603, + "grad_norm": 3.949620246887207, + "learning_rate": 1.1885796895690304e-07, + "loss": 1.0008, + "step": 12097 + }, + { + "epoch": 0.9326241134751773, + "grad_norm": 3.8057408332824707, + "learning_rate": 1.185875272708259e-07, + "loss": 0.9066, + "step": 12098 + }, + { + "epoch": 0.9327012025901943, + "grad_norm": 4.090064525604248, + "learning_rate": 1.1831738991553432e-07, + "loss": 0.9784, + "step": 12099 + }, + { + "epoch": 0.9327782917052112, + "grad_norm": 4.04171895980835, + "learning_rate": 1.1804755690787095e-07, + "loss": 0.98, + "step": 12100 + }, + { + "epoch": 0.9328553808202282, + "grad_norm": 3.672153949737549, + "learning_rate": 1.177780282646579e-07, + "loss": 0.8458, + "step": 12101 + }, + { + "epoch": 0.9329324699352451, + "grad_norm": 3.4648172855377197, + "learning_rate": 1.1750880400269948e-07, + "loss": 0.8031, + "step": 12102 + }, + { + "epoch": 0.9330095590502621, + "grad_norm": 3.73508882522583, + "learning_rate": 1.1723988413878063e-07, + "loss": 0.9876, + "step": 12103 + }, + { + "epoch": 0.9330866481652791, + "grad_norm": 3.7413759231567383, + "learning_rate": 1.1697126868966569e-07, + "loss": 0.8011, + "step": 12104 + }, + { + "epoch": 0.933163737280296, + "grad_norm": 3.7317357063293457, + "learning_rate": 1.1670295767210238e-07, + "loss": 0.8842, + "step": 12105 + }, + { + "epoch": 0.933240826395313, + "grad_norm": 3.6849234104156494, + "learning_rate": 1.1643495110281844e-07, + "loss": 0.9667, + "step": 12106 + }, + { + "epoch": 0.9333179155103299, + "grad_norm": 3.7106947898864746, + "learning_rate": 1.1616724899852217e-07, + "loss": 0.8664, + "step": 12107 + }, + { + "epoch": 0.9333950046253469, + "grad_norm": 3.798051595687866, + "learning_rate": 1.158998513759052e-07, + "loss": 0.8214, + "step": 12108 + }, + { + "epoch": 0.9334720937403639, + "grad_norm": 4.954608917236328, + "learning_rate": 1.156327582516359e-07, + "loss": 0.9682, + "step": 12109 + }, + { + "epoch": 0.9335491828553808, + "grad_norm": 3.375283718109131, + "learning_rate": 1.1536596964236757e-07, + "loss": 0.7497, + "step": 12110 + }, + { + "epoch": 0.9336262719703978, + "grad_norm": 3.573298931121826, + "learning_rate": 1.1509948556473306e-07, + "loss": 0.8748, + "step": 12111 + }, + { + "epoch": 0.9337033610854147, + "grad_norm": 3.768425464630127, + "learning_rate": 1.1483330603534625e-07, + "loss": 0.8943, + "step": 12112 + }, + { + "epoch": 0.9337804502004317, + "grad_norm": 3.941859245300293, + "learning_rate": 1.1456743107080171e-07, + "loss": 0.9499, + "step": 12113 + }, + { + "epoch": 0.9338575393154487, + "grad_norm": 3.9781956672668457, + "learning_rate": 1.1430186068767557e-07, + "loss": 0.8871, + "step": 12114 + }, + { + "epoch": 0.9339346284304656, + "grad_norm": 3.6682260036468506, + "learning_rate": 1.1403659490252462e-07, + "loss": 0.8444, + "step": 12115 + }, + { + "epoch": 0.9340117175454826, + "grad_norm": 3.898146629333496, + "learning_rate": 1.137716337318867e-07, + "loss": 0.9247, + "step": 12116 + }, + { + "epoch": 0.9340888066604995, + "grad_norm": 4.3249335289001465, + "learning_rate": 1.1350697719228144e-07, + "loss": 0.9518, + "step": 12117 + }, + { + "epoch": 0.9341658957755165, + "grad_norm": 3.901083469390869, + "learning_rate": 1.1324262530020835e-07, + "loss": 0.9795, + "step": 12118 + }, + { + "epoch": 0.9342429848905335, + "grad_norm": 3.6279914379119873, + "learning_rate": 1.1297857807214818e-07, + "loss": 0.8613, + "step": 12119 + }, + { + "epoch": 0.9343200740055504, + "grad_norm": 3.867549180984497, + "learning_rate": 1.127148355245633e-07, + "loss": 0.8051, + "step": 12120 + }, + { + "epoch": 0.9343971631205674, + "grad_norm": 3.700650453567505, + "learning_rate": 1.1245139767389612e-07, + "loss": 0.8942, + "step": 12121 + }, + { + "epoch": 0.9344742522355843, + "grad_norm": 3.632929563522339, + "learning_rate": 1.1218826453657127e-07, + "loss": 0.8625, + "step": 12122 + }, + { + "epoch": 0.9345513413506013, + "grad_norm": 3.861298084259033, + "learning_rate": 1.1192543612899398e-07, + "loss": 0.9062, + "step": 12123 + }, + { + "epoch": 0.9346284304656183, + "grad_norm": 3.5877022743225098, + "learning_rate": 1.1166291246754945e-07, + "loss": 0.8753, + "step": 12124 + }, + { + "epoch": 0.9347055195806352, + "grad_norm": 3.9185619354248047, + "learning_rate": 1.114006935686046e-07, + "loss": 0.8794, + "step": 12125 + }, + { + "epoch": 0.9347826086956522, + "grad_norm": 3.9409501552581787, + "learning_rate": 1.1113877944850804e-07, + "loss": 1.0012, + "step": 12126 + }, + { + "epoch": 0.9348596978106691, + "grad_norm": 3.9709725379943848, + "learning_rate": 1.1087717012358834e-07, + "loss": 1.0408, + "step": 12127 + }, + { + "epoch": 0.934936786925686, + "grad_norm": 3.867388963699341, + "learning_rate": 1.1061586561015636e-07, + "loss": 0.9123, + "step": 12128 + }, + { + "epoch": 0.9350138760407031, + "grad_norm": 3.8618171215057373, + "learning_rate": 1.1035486592450184e-07, + "loss": 0.8511, + "step": 12129 + }, + { + "epoch": 0.93509096515572, + "grad_norm": 3.811495542526245, + "learning_rate": 1.1009417108289733e-07, + "loss": 0.8261, + "step": 12130 + }, + { + "epoch": 0.935168054270737, + "grad_norm": 4.648553848266602, + "learning_rate": 1.0983378110159593e-07, + "loss": 0.9152, + "step": 12131 + }, + { + "epoch": 0.9352451433857539, + "grad_norm": 3.401317834854126, + "learning_rate": 1.0957369599683132e-07, + "loss": 0.9336, + "step": 12132 + }, + { + "epoch": 0.9353222325007708, + "grad_norm": 4.083096504211426, + "learning_rate": 1.0931391578481832e-07, + "loss": 0.9892, + "step": 12133 + }, + { + "epoch": 0.9353993216157879, + "grad_norm": 3.8430960178375244, + "learning_rate": 1.0905444048175396e-07, + "loss": 0.8581, + "step": 12134 + }, + { + "epoch": 0.9354764107308048, + "grad_norm": 3.859454393386841, + "learning_rate": 1.0879527010381419e-07, + "loss": 0.9705, + "step": 12135 + }, + { + "epoch": 0.9355534998458218, + "grad_norm": 4.2190375328063965, + "learning_rate": 1.0853640466715664e-07, + "loss": 0.8858, + "step": 12136 + }, + { + "epoch": 0.9356305889608387, + "grad_norm": 3.7572779655456543, + "learning_rate": 1.0827784418792064e-07, + "loss": 0.9528, + "step": 12137 + }, + { + "epoch": 0.9357076780758556, + "grad_norm": 3.6124427318573, + "learning_rate": 1.0801958868222662e-07, + "loss": 0.8229, + "step": 12138 + }, + { + "epoch": 0.9357847671908727, + "grad_norm": 4.138119220733643, + "learning_rate": 1.0776163816617446e-07, + "loss": 0.9593, + "step": 12139 + }, + { + "epoch": 0.9358618563058896, + "grad_norm": 3.941408634185791, + "learning_rate": 1.0750399265584744e-07, + "loss": 0.8894, + "step": 12140 + }, + { + "epoch": 0.9359389454209066, + "grad_norm": 3.9744157791137695, + "learning_rate": 1.0724665216730711e-07, + "loss": 0.9538, + "step": 12141 + }, + { + "epoch": 0.9360160345359235, + "grad_norm": 3.580399513244629, + "learning_rate": 1.0698961671659791e-07, + "loss": 0.8409, + "step": 12142 + }, + { + "epoch": 0.9360931236509404, + "grad_norm": 3.7937207221984863, + "learning_rate": 1.0673288631974421e-07, + "loss": 1.0333, + "step": 12143 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 3.6252706050872803, + "learning_rate": 1.0647646099275267e-07, + "loss": 0.8437, + "step": 12144 + }, + { + "epoch": 0.9362473018809744, + "grad_norm": 4.086170673370361, + "learning_rate": 1.0622034075160936e-07, + "loss": 0.8064, + "step": 12145 + }, + { + "epoch": 0.9363243909959914, + "grad_norm": 3.6738433837890625, + "learning_rate": 1.059645256122821e-07, + "loss": 0.9238, + "step": 12146 + }, + { + "epoch": 0.9364014801110083, + "grad_norm": 4.102461338043213, + "learning_rate": 1.0570901559072033e-07, + "loss": 0.9676, + "step": 12147 + }, + { + "epoch": 0.9364785692260252, + "grad_norm": 4.378781318664551, + "learning_rate": 1.0545381070285243e-07, + "loss": 0.8863, + "step": 12148 + }, + { + "epoch": 0.9365556583410423, + "grad_norm": 3.481828212738037, + "learning_rate": 1.0519891096459067e-07, + "loss": 0.827, + "step": 12149 + }, + { + "epoch": 0.9366327474560592, + "grad_norm": 3.6842973232269287, + "learning_rate": 1.0494431639182567e-07, + "loss": 0.8607, + "step": 12150 + }, + { + "epoch": 0.9367098365710762, + "grad_norm": 3.917614221572876, + "learning_rate": 1.0469002700043029e-07, + "loss": 0.8737, + "step": 12151 + }, + { + "epoch": 0.9367869256860931, + "grad_norm": 3.5234785079956055, + "learning_rate": 1.0443604280625852e-07, + "loss": 0.8082, + "step": 12152 + }, + { + "epoch": 0.93686401480111, + "grad_norm": 3.9610044956207275, + "learning_rate": 1.0418236382514435e-07, + "loss": 0.9952, + "step": 12153 + }, + { + "epoch": 0.9369411039161271, + "grad_norm": 3.7560231685638428, + "learning_rate": 1.0392899007290347e-07, + "loss": 0.8605, + "step": 12154 + }, + { + "epoch": 0.937018193031144, + "grad_norm": 3.415227174758911, + "learning_rate": 1.036759215653338e-07, + "loss": 0.8353, + "step": 12155 + }, + { + "epoch": 0.937095282146161, + "grad_norm": 3.7259321212768555, + "learning_rate": 1.0342315831821104e-07, + "loss": 0.9268, + "step": 12156 + }, + { + "epoch": 0.9371723712611779, + "grad_norm": 3.7746005058288574, + "learning_rate": 1.0317070034729426e-07, + "loss": 0.8681, + "step": 12157 + }, + { + "epoch": 0.9372494603761948, + "grad_norm": 3.6794521808624268, + "learning_rate": 1.0291854766832254e-07, + "loss": 0.9124, + "step": 12158 + }, + { + "epoch": 0.9373265494912119, + "grad_norm": 3.675534725189209, + "learning_rate": 1.026667002970172e-07, + "loss": 0.9868, + "step": 12159 + }, + { + "epoch": 0.9374036386062288, + "grad_norm": 3.834210157394409, + "learning_rate": 1.0241515824907955e-07, + "loss": 0.8566, + "step": 12160 + }, + { + "epoch": 0.9374807277212458, + "grad_norm": 3.795119285583496, + "learning_rate": 1.021639215401915e-07, + "loss": 0.9076, + "step": 12161 + }, + { + "epoch": 0.9375578168362627, + "grad_norm": 3.7029600143432617, + "learning_rate": 1.0191299018601608e-07, + "loss": 0.8642, + "step": 12162 + }, + { + "epoch": 0.9376349059512796, + "grad_norm": 3.834038734436035, + "learning_rate": 1.0166236420219744e-07, + "loss": 0.9159, + "step": 12163 + }, + { + "epoch": 0.9377119950662967, + "grad_norm": 3.733107328414917, + "learning_rate": 1.0141204360436197e-07, + "loss": 0.8474, + "step": 12164 + }, + { + "epoch": 0.9377890841813136, + "grad_norm": 3.8652164936065674, + "learning_rate": 1.011620284081144e-07, + "loss": 0.9073, + "step": 12165 + }, + { + "epoch": 0.9378661732963306, + "grad_norm": 4.108710289001465, + "learning_rate": 1.0091231862904394e-07, + "loss": 0.8865, + "step": 12166 + }, + { + "epoch": 0.9379432624113475, + "grad_norm": 3.595679998397827, + "learning_rate": 1.0066291428271646e-07, + "loss": 0.9258, + "step": 12167 + }, + { + "epoch": 0.9380203515263644, + "grad_norm": 3.6943235397338867, + "learning_rate": 1.0041381538468175e-07, + "loss": 0.863, + "step": 12168 + }, + { + "epoch": 0.9380974406413815, + "grad_norm": 3.979586124420166, + "learning_rate": 1.0016502195047017e-07, + "loss": 0.8683, + "step": 12169 + }, + { + "epoch": 0.9381745297563984, + "grad_norm": 3.542391777038574, + "learning_rate": 9.991653399559265e-08, + "loss": 0.8578, + "step": 12170 + }, + { + "epoch": 0.9382516188714154, + "grad_norm": 3.59480357170105, + "learning_rate": 9.966835153554177e-08, + "loss": 0.9473, + "step": 12171 + }, + { + "epoch": 0.9383287079864323, + "grad_norm": 3.7652363777160645, + "learning_rate": 9.942047458578852e-08, + "loss": 0.9378, + "step": 12172 + }, + { + "epoch": 0.9384057971014492, + "grad_norm": 3.717331886291504, + "learning_rate": 9.917290316178884e-08, + "loss": 0.8719, + "step": 12173 + }, + { + "epoch": 0.9384828862164662, + "grad_norm": 4.402118682861328, + "learning_rate": 9.892563727897597e-08, + "loss": 0.9718, + "step": 12174 + }, + { + "epoch": 0.9385599753314832, + "grad_norm": 3.7598085403442383, + "learning_rate": 9.867867695276645e-08, + "loss": 0.8839, + "step": 12175 + }, + { + "epoch": 0.9386370644465002, + "grad_norm": 3.4538068771362305, + "learning_rate": 9.843202219855685e-08, + "loss": 0.8984, + "step": 12176 + }, + { + "epoch": 0.9387141535615171, + "grad_norm": 3.9770476818084717, + "learning_rate": 9.818567303172544e-08, + "loss": 0.8237, + "step": 12177 + }, + { + "epoch": 0.938791242676534, + "grad_norm": 3.9193336963653564, + "learning_rate": 9.793962946762936e-08, + "loss": 0.9098, + "step": 12178 + }, + { + "epoch": 0.938868331791551, + "grad_norm": 3.7343082427978516, + "learning_rate": 9.769389152160913e-08, + "loss": 0.9654, + "step": 12179 + }, + { + "epoch": 0.938945420906568, + "grad_norm": 3.766138792037964, + "learning_rate": 9.744845920898527e-08, + "loss": 0.9089, + "step": 12180 + }, + { + "epoch": 0.939022510021585, + "grad_norm": 4.211320877075195, + "learning_rate": 9.720333254505887e-08, + "loss": 0.9806, + "step": 12181 + }, + { + "epoch": 0.9390995991366019, + "grad_norm": 4.284248352050781, + "learning_rate": 9.695851154511271e-08, + "loss": 1.0172, + "step": 12182 + }, + { + "epoch": 0.9391766882516188, + "grad_norm": 3.9053781032562256, + "learning_rate": 9.671399622440957e-08, + "loss": 0.8795, + "step": 12183 + }, + { + "epoch": 0.9392537773666358, + "grad_norm": 3.633023262023926, + "learning_rate": 9.646978659819394e-08, + "loss": 0.8426, + "step": 12184 + }, + { + "epoch": 0.9393308664816528, + "grad_norm": 4.009373188018799, + "learning_rate": 9.622588268169141e-08, + "loss": 0.8787, + "step": 12185 + }, + { + "epoch": 0.9394079555966698, + "grad_norm": 3.9296786785125732, + "learning_rate": 9.598228449010704e-08, + "loss": 1.0206, + "step": 12186 + }, + { + "epoch": 0.9394850447116867, + "grad_norm": 3.5266220569610596, + "learning_rate": 9.573899203862925e-08, + "loss": 0.925, + "step": 12187 + }, + { + "epoch": 0.9395621338267036, + "grad_norm": 3.396301746368408, + "learning_rate": 9.549600534242587e-08, + "loss": 0.831, + "step": 12188 + }, + { + "epoch": 0.9396392229417206, + "grad_norm": 3.696995735168457, + "learning_rate": 9.525332441664481e-08, + "loss": 0.8887, + "step": 12189 + }, + { + "epoch": 0.9397163120567376, + "grad_norm": 3.7658541202545166, + "learning_rate": 9.50109492764173e-08, + "loss": 0.9345, + "step": 12190 + }, + { + "epoch": 0.9397934011717546, + "grad_norm": 3.884662389755249, + "learning_rate": 9.476887993685291e-08, + "loss": 0.9221, + "step": 12191 + }, + { + "epoch": 0.9398704902867715, + "grad_norm": 3.74465274810791, + "learning_rate": 9.452711641304402e-08, + "loss": 0.889, + "step": 12192 + }, + { + "epoch": 0.9399475794017884, + "grad_norm": 3.8537025451660156, + "learning_rate": 9.428565872006356e-08, + "loss": 1.0205, + "step": 12193 + }, + { + "epoch": 0.9400246685168054, + "grad_norm": 3.9346094131469727, + "learning_rate": 9.404450687296507e-08, + "loss": 0.835, + "step": 12194 + }, + { + "epoch": 0.9401017576318224, + "grad_norm": 3.567856550216675, + "learning_rate": 9.380366088678261e-08, + "loss": 0.8092, + "step": 12195 + }, + { + "epoch": 0.9401788467468394, + "grad_norm": 3.484372854232788, + "learning_rate": 9.356312077653196e-08, + "loss": 0.9011, + "step": 12196 + }, + { + "epoch": 0.9402559358618563, + "grad_norm": 4.030934810638428, + "learning_rate": 9.332288655720945e-08, + "loss": 0.9122, + "step": 12197 + }, + { + "epoch": 0.9403330249768732, + "grad_norm": 5.114222526550293, + "learning_rate": 9.308295824379365e-08, + "loss": 0.9911, + "step": 12198 + }, + { + "epoch": 0.9404101140918902, + "grad_norm": 3.6418612003326416, + "learning_rate": 9.284333585124094e-08, + "loss": 0.9481, + "step": 12199 + }, + { + "epoch": 0.9404872032069072, + "grad_norm": 3.4810874462127686, + "learning_rate": 9.260401939449215e-08, + "loss": 0.9044, + "step": 12200 + }, + { + "epoch": 0.9405642923219242, + "grad_norm": 3.6137795448303223, + "learning_rate": 9.236500888846589e-08, + "loss": 0.7977, + "step": 12201 + }, + { + "epoch": 0.9406413814369411, + "grad_norm": 3.7133419513702393, + "learning_rate": 9.212630434806413e-08, + "loss": 0.9542, + "step": 12202 + }, + { + "epoch": 0.940718470551958, + "grad_norm": 3.7179129123687744, + "learning_rate": 9.188790578816942e-08, + "loss": 0.8459, + "step": 12203 + }, + { + "epoch": 0.940795559666975, + "grad_norm": 3.615569591522217, + "learning_rate": 9.164981322364375e-08, + "loss": 0.8417, + "step": 12204 + }, + { + "epoch": 0.940872648781992, + "grad_norm": 3.824866533279419, + "learning_rate": 9.141202666933135e-08, + "loss": 1.0083, + "step": 12205 + }, + { + "epoch": 0.940949737897009, + "grad_norm": 3.547849655151367, + "learning_rate": 9.11745461400565e-08, + "loss": 0.8361, + "step": 12206 + }, + { + "epoch": 0.9410268270120259, + "grad_norm": 3.908607244491577, + "learning_rate": 9.09373716506251e-08, + "loss": 0.8185, + "step": 12207 + }, + { + "epoch": 0.9411039161270429, + "grad_norm": 3.6814157962799072, + "learning_rate": 9.07005032158248e-08, + "loss": 0.8893, + "step": 12208 + }, + { + "epoch": 0.9411810052420598, + "grad_norm": 3.7791712284088135, + "learning_rate": 9.046394085042154e-08, + "loss": 0.9866, + "step": 12209 + }, + { + "epoch": 0.9412580943570767, + "grad_norm": 3.7115676403045654, + "learning_rate": 9.022768456916409e-08, + "loss": 1.0297, + "step": 12210 + }, + { + "epoch": 0.9413351834720938, + "grad_norm": 3.7487754821777344, + "learning_rate": 8.999173438678233e-08, + "loss": 0.9212, + "step": 12211 + }, + { + "epoch": 0.9414122725871107, + "grad_norm": 3.4741318225860596, + "learning_rate": 8.975609031798671e-08, + "loss": 0.9159, + "step": 12212 + }, + { + "epoch": 0.9414893617021277, + "grad_norm": 3.7162187099456787, + "learning_rate": 8.952075237746771e-08, + "loss": 0.939, + "step": 12213 + }, + { + "epoch": 0.9415664508171446, + "grad_norm": 3.8283746242523193, + "learning_rate": 8.928572057989804e-08, + "loss": 0.8525, + "step": 12214 + }, + { + "epoch": 0.9416435399321615, + "grad_norm": 3.759662628173828, + "learning_rate": 8.905099493993041e-08, + "loss": 0.879, + "step": 12215 + }, + { + "epoch": 0.9417206290471786, + "grad_norm": 3.5151100158691406, + "learning_rate": 8.881657547219869e-08, + "loss": 0.8647, + "step": 12216 + }, + { + "epoch": 0.9417977181621955, + "grad_norm": 4.150439739227295, + "learning_rate": 8.858246219131784e-08, + "loss": 0.9453, + "step": 12217 + }, + { + "epoch": 0.9418748072772125, + "grad_norm": 3.5774085521698, + "learning_rate": 8.83486551118834e-08, + "loss": 0.8436, + "step": 12218 + }, + { + "epoch": 0.9419518963922294, + "grad_norm": 3.561293601989746, + "learning_rate": 8.811515424847261e-08, + "loss": 0.7775, + "step": 12219 + }, + { + "epoch": 0.9420289855072463, + "grad_norm": 4.2305827140808105, + "learning_rate": 8.788195961564273e-08, + "loss": 0.9484, + "step": 12220 + }, + { + "epoch": 0.9421060746222634, + "grad_norm": 3.9733986854553223, + "learning_rate": 8.764907122793154e-08, + "loss": 0.8226, + "step": 12221 + }, + { + "epoch": 0.9421831637372803, + "grad_norm": 3.7328057289123535, + "learning_rate": 8.741648909985967e-08, + "loss": 1.0214, + "step": 12222 + }, + { + "epoch": 0.9422602528522973, + "grad_norm": 3.4936575889587402, + "learning_rate": 8.718421324592608e-08, + "loss": 0.9083, + "step": 12223 + }, + { + "epoch": 0.9423373419673142, + "grad_norm": 3.8838651180267334, + "learning_rate": 8.695224368061305e-08, + "loss": 0.922, + "step": 12224 + }, + { + "epoch": 0.9424144310823311, + "grad_norm": 3.990166664123535, + "learning_rate": 8.672058041838294e-08, + "loss": 0.8594, + "step": 12225 + }, + { + "epoch": 0.9424915201973482, + "grad_norm": 3.8123323917388916, + "learning_rate": 8.64892234736775e-08, + "loss": 0.8615, + "step": 12226 + }, + { + "epoch": 0.9425686093123651, + "grad_norm": 3.807555675506592, + "learning_rate": 8.625817286092075e-08, + "loss": 0.8749, + "step": 12227 + }, + { + "epoch": 0.9426456984273821, + "grad_norm": 3.557279586791992, + "learning_rate": 8.602742859451841e-08, + "loss": 0.8594, + "step": 12228 + }, + { + "epoch": 0.942722787542399, + "grad_norm": 4.088743209838867, + "learning_rate": 8.579699068885616e-08, + "loss": 0.9473, + "step": 12229 + }, + { + "epoch": 0.9427998766574159, + "grad_norm": 3.422257661819458, + "learning_rate": 8.556685915830033e-08, + "loss": 0.8581, + "step": 12230 + }, + { + "epoch": 0.942876965772433, + "grad_norm": 3.8169891834259033, + "learning_rate": 8.533703401719773e-08, + "loss": 0.9015, + "step": 12231 + }, + { + "epoch": 0.9429540548874499, + "grad_norm": 3.840945243835449, + "learning_rate": 8.510751527987748e-08, + "loss": 0.9821, + "step": 12232 + }, + { + "epoch": 0.9430311440024669, + "grad_norm": 3.5995681285858154, + "learning_rate": 8.487830296064869e-08, + "loss": 0.9348, + "step": 12233 + }, + { + "epoch": 0.9431082331174838, + "grad_norm": 3.9970502853393555, + "learning_rate": 8.46493970738016e-08, + "loss": 0.9943, + "step": 12234 + }, + { + "epoch": 0.9431853222325007, + "grad_norm": 3.852111339569092, + "learning_rate": 8.442079763360755e-08, + "loss": 0.9228, + "step": 12235 + }, + { + "epoch": 0.9432624113475178, + "grad_norm": 3.759093761444092, + "learning_rate": 8.419250465431905e-08, + "loss": 0.8915, + "step": 12236 + }, + { + "epoch": 0.9433395004625347, + "grad_norm": 3.7389092445373535, + "learning_rate": 8.396451815016749e-08, + "loss": 0.8838, + "step": 12237 + }, + { + "epoch": 0.9434165895775517, + "grad_norm": 3.6164088249206543, + "learning_rate": 8.373683813536703e-08, + "loss": 0.8513, + "step": 12238 + }, + { + "epoch": 0.9434936786925686, + "grad_norm": 3.7332277297973633, + "learning_rate": 8.350946462411303e-08, + "loss": 0.9333, + "step": 12239 + }, + { + "epoch": 0.9435707678075855, + "grad_norm": 3.6654908657073975, + "learning_rate": 8.328239763058077e-08, + "loss": 0.8103, + "step": 12240 + }, + { + "epoch": 0.9436478569226026, + "grad_norm": 3.48644757270813, + "learning_rate": 8.305563716892728e-08, + "loss": 0.9154, + "step": 12241 + }, + { + "epoch": 0.9437249460376195, + "grad_norm": 3.515536308288574, + "learning_rate": 8.282918325328848e-08, + "loss": 0.9073, + "step": 12242 + }, + { + "epoch": 0.9438020351526365, + "grad_norm": 3.6415224075317383, + "learning_rate": 8.260303589778362e-08, + "loss": 0.849, + "step": 12243 + }, + { + "epoch": 0.9438791242676534, + "grad_norm": 3.827643394470215, + "learning_rate": 8.237719511651199e-08, + "loss": 0.8901, + "step": 12244 + }, + { + "epoch": 0.9439562133826703, + "grad_norm": 4.021296501159668, + "learning_rate": 8.215166092355286e-08, + "loss": 1.0162, + "step": 12245 + }, + { + "epoch": 0.9440333024976874, + "grad_norm": 3.674678325653076, + "learning_rate": 8.192643333296779e-08, + "loss": 0.8896, + "step": 12246 + }, + { + "epoch": 0.9441103916127043, + "grad_norm": 3.961991310119629, + "learning_rate": 8.17015123587983e-08, + "loss": 0.9414, + "step": 12247 + }, + { + "epoch": 0.9441874807277213, + "grad_norm": 3.9013025760650635, + "learning_rate": 8.147689801506653e-08, + "loss": 0.9172, + "step": 12248 + }, + { + "epoch": 0.9442645698427382, + "grad_norm": 3.474181890487671, + "learning_rate": 8.125259031577681e-08, + "loss": 0.8584, + "step": 12249 + }, + { + "epoch": 0.9443416589577551, + "grad_norm": 4.0712103843688965, + "learning_rate": 8.102858927491297e-08, + "loss": 0.9646, + "step": 12250 + }, + { + "epoch": 0.9444187480727722, + "grad_norm": 3.6317129135131836, + "learning_rate": 8.080489490644106e-08, + "loss": 0.825, + "step": 12251 + }, + { + "epoch": 0.9444958371877891, + "grad_norm": 3.92905855178833, + "learning_rate": 8.058150722430658e-08, + "loss": 0.9223, + "step": 12252 + }, + { + "epoch": 0.9445729263028061, + "grad_norm": 4.44641637802124, + "learning_rate": 8.035842624243673e-08, + "loss": 0.9286, + "step": 12253 + }, + { + "epoch": 0.944650015417823, + "grad_norm": 3.717348337173462, + "learning_rate": 8.013565197473927e-08, + "loss": 0.8958, + "step": 12254 + }, + { + "epoch": 0.9447271045328399, + "grad_norm": 3.6619021892547607, + "learning_rate": 7.991318443510365e-08, + "loss": 0.873, + "step": 12255 + }, + { + "epoch": 0.944804193647857, + "grad_norm": 3.7299957275390625, + "learning_rate": 7.969102363739933e-08, + "loss": 0.9392, + "step": 12256 + }, + { + "epoch": 0.9448812827628739, + "grad_norm": 3.6785383224487305, + "learning_rate": 7.946916959547635e-08, + "loss": 0.8582, + "step": 12257 + }, + { + "epoch": 0.9449583718778909, + "grad_norm": 3.580245018005371, + "learning_rate": 7.92476223231664e-08, + "loss": 0.8853, + "step": 12258 + }, + { + "epoch": 0.9450354609929078, + "grad_norm": 3.7513821125030518, + "learning_rate": 7.902638183428235e-08, + "loss": 0.9091, + "step": 12259 + }, + { + "epoch": 0.9451125501079247, + "grad_norm": 3.6423444747924805, + "learning_rate": 7.880544814261704e-08, + "loss": 0.9026, + "step": 12260 + }, + { + "epoch": 0.9451896392229417, + "grad_norm": 3.6739706993103027, + "learning_rate": 7.858482126194445e-08, + "loss": 0.8349, + "step": 12261 + }, + { + "epoch": 0.9452667283379587, + "grad_norm": 4.1562628746032715, + "learning_rate": 7.836450120601968e-08, + "loss": 0.8982, + "step": 12262 + }, + { + "epoch": 0.9453438174529757, + "grad_norm": 3.613546133041382, + "learning_rate": 7.814448798857843e-08, + "loss": 0.8372, + "step": 12263 + }, + { + "epoch": 0.9454209065679926, + "grad_norm": 3.739797353744507, + "learning_rate": 7.792478162333694e-08, + "loss": 0.9476, + "step": 12264 + }, + { + "epoch": 0.9454979956830095, + "grad_norm": 4.127760410308838, + "learning_rate": 7.77053821239937e-08, + "loss": 0.9036, + "step": 12265 + }, + { + "epoch": 0.9455750847980265, + "grad_norm": 3.717101812362671, + "learning_rate": 7.748628950422666e-08, + "loss": 0.8591, + "step": 12266 + }, + { + "epoch": 0.9456521739130435, + "grad_norm": 3.6435585021972656, + "learning_rate": 7.726750377769488e-08, + "loss": 0.9294, + "step": 12267 + }, + { + "epoch": 0.9457292630280605, + "grad_norm": 3.5347166061401367, + "learning_rate": 7.704902495803911e-08, + "loss": 0.8237, + "step": 12268 + }, + { + "epoch": 0.9458063521430774, + "grad_norm": 3.792682647705078, + "learning_rate": 7.683085305887961e-08, + "loss": 0.9064, + "step": 12269 + }, + { + "epoch": 0.9458834412580943, + "grad_norm": 3.9385581016540527, + "learning_rate": 7.661298809381878e-08, + "loss": 0.9554, + "step": 12270 + }, + { + "epoch": 0.9459605303731113, + "grad_norm": 3.6666650772094727, + "learning_rate": 7.639543007643913e-08, + "loss": 0.898, + "step": 12271 + }, + { + "epoch": 0.9460376194881283, + "grad_norm": 3.755439043045044, + "learning_rate": 7.617817902030478e-08, + "loss": 0.892, + "step": 12272 + }, + { + "epoch": 0.9461147086031453, + "grad_norm": 4.0008225440979, + "learning_rate": 7.59612349389599e-08, + "loss": 0.8153, + "step": 12273 + }, + { + "epoch": 0.9461917977181622, + "grad_norm": 3.6859991550445557, + "learning_rate": 7.574459784592981e-08, + "loss": 0.8639, + "step": 12274 + }, + { + "epoch": 0.9462688868331791, + "grad_norm": 3.624303102493286, + "learning_rate": 7.552826775472033e-08, + "loss": 0.8132, + "step": 12275 + }, + { + "epoch": 0.9463459759481961, + "grad_norm": 4.032271385192871, + "learning_rate": 7.531224467881848e-08, + "loss": 1.0711, + "step": 12276 + }, + { + "epoch": 0.946423065063213, + "grad_norm": 3.7532832622528076, + "learning_rate": 7.509652863169348e-08, + "loss": 0.8494, + "step": 12277 + }, + { + "epoch": 0.9465001541782301, + "grad_norm": 4.128763675689697, + "learning_rate": 7.48811196267929e-08, + "loss": 0.8642, + "step": 12278 + }, + { + "epoch": 0.946577243293247, + "grad_norm": 3.744234561920166, + "learning_rate": 7.466601767754655e-08, + "loss": 0.9608, + "step": 12279 + }, + { + "epoch": 0.9466543324082639, + "grad_norm": 3.6240429878234863, + "learning_rate": 7.445122279736484e-08, + "loss": 0.8315, + "step": 12280 + }, + { + "epoch": 0.9467314215232809, + "grad_norm": 3.724736213684082, + "learning_rate": 7.423673499963924e-08, + "loss": 0.8643, + "step": 12281 + }, + { + "epoch": 0.9468085106382979, + "grad_norm": 3.973217248916626, + "learning_rate": 7.402255429774241e-08, + "loss": 0.9679, + "step": 12282 + }, + { + "epoch": 0.9468855997533149, + "grad_norm": 3.8771355152130127, + "learning_rate": 7.380868070502644e-08, + "loss": 0.8696, + "step": 12283 + }, + { + "epoch": 0.9469626888683318, + "grad_norm": 3.8972392082214355, + "learning_rate": 7.359511423482679e-08, + "loss": 1.005, + "step": 12284 + }, + { + "epoch": 0.9470397779833487, + "grad_norm": 3.7604198455810547, + "learning_rate": 7.338185490045668e-08, + "loss": 0.867, + "step": 12285 + }, + { + "epoch": 0.9471168670983657, + "grad_norm": 3.7321414947509766, + "learning_rate": 7.316890271521215e-08, + "loss": 0.8899, + "step": 12286 + }, + { + "epoch": 0.9471939562133826, + "grad_norm": 4.230839252471924, + "learning_rate": 7.29562576923698e-08, + "loss": 1.034, + "step": 12287 + }, + { + "epoch": 0.9472710453283997, + "grad_norm": 3.6010680198669434, + "learning_rate": 7.274391984518736e-08, + "loss": 0.8038, + "step": 12288 + }, + { + "epoch": 0.9473481344434166, + "grad_norm": 3.6797475814819336, + "learning_rate": 7.253188918690257e-08, + "loss": 0.9017, + "step": 12289 + }, + { + "epoch": 0.9474252235584335, + "grad_norm": 3.575794219970703, + "learning_rate": 7.232016573073431e-08, + "loss": 0.8756, + "step": 12290 + }, + { + "epoch": 0.9475023126734505, + "grad_norm": 3.789335012435913, + "learning_rate": 7.210874948988255e-08, + "loss": 0.9401, + "step": 12291 + }, + { + "epoch": 0.9475794017884674, + "grad_norm": 3.835794448852539, + "learning_rate": 7.189764047752789e-08, + "loss": 0.9796, + "step": 12292 + }, + { + "epoch": 0.9476564909034845, + "grad_norm": 3.511427879333496, + "learning_rate": 7.168683870683258e-08, + "loss": 0.9066, + "step": 12293 + }, + { + "epoch": 0.9477335800185014, + "grad_norm": 3.5862207412719727, + "learning_rate": 7.14763441909383e-08, + "loss": 0.9302, + "step": 12294 + }, + { + "epoch": 0.9478106691335183, + "grad_norm": 3.8762261867523193, + "learning_rate": 7.126615694296846e-08, + "loss": 0.9133, + "step": 12295 + }, + { + "epoch": 0.9478877582485353, + "grad_norm": 3.6678431034088135, + "learning_rate": 7.105627697602702e-08, + "loss": 0.8997, + "step": 12296 + }, + { + "epoch": 0.9479648473635522, + "grad_norm": 3.454246997833252, + "learning_rate": 7.084670430319907e-08, + "loss": 0.7763, + "step": 12297 + }, + { + "epoch": 0.9480419364785693, + "grad_norm": 3.5856475830078125, + "learning_rate": 7.063743893755026e-08, + "loss": 0.8533, + "step": 12298 + }, + { + "epoch": 0.9481190255935862, + "grad_norm": 3.7448766231536865, + "learning_rate": 7.042848089212794e-08, + "loss": 0.9361, + "step": 12299 + }, + { + "epoch": 0.9481961147086031, + "grad_norm": 3.805290460586548, + "learning_rate": 7.021983017995836e-08, + "loss": 0.9013, + "step": 12300 + }, + { + "epoch": 0.9482732038236201, + "grad_norm": 3.3226075172424316, + "learning_rate": 7.001148681405056e-08, + "loss": 0.7962, + "step": 12301 + }, + { + "epoch": 0.948350292938637, + "grad_norm": 3.614866256713867, + "learning_rate": 6.980345080739303e-08, + "loss": 0.7797, + "step": 12302 + }, + { + "epoch": 0.9484273820536541, + "grad_norm": 3.74783992767334, + "learning_rate": 6.959572217295651e-08, + "loss": 0.9274, + "step": 12303 + }, + { + "epoch": 0.948504471168671, + "grad_norm": 4.042082786560059, + "learning_rate": 6.938830092369176e-08, + "loss": 1.0048, + "step": 12304 + }, + { + "epoch": 0.9485815602836879, + "grad_norm": 3.7380220890045166, + "learning_rate": 6.918118707253007e-08, + "loss": 0.9577, + "step": 12305 + }, + { + "epoch": 0.9486586493987049, + "grad_norm": 4.00551700592041, + "learning_rate": 6.897438063238393e-08, + "loss": 0.8565, + "step": 12306 + }, + { + "epoch": 0.9487357385137218, + "grad_norm": 4.03279972076416, + "learning_rate": 6.87678816161469e-08, + "loss": 1.0123, + "step": 12307 + }, + { + "epoch": 0.9488128276287389, + "grad_norm": 3.53090500831604, + "learning_rate": 6.856169003669256e-08, + "loss": 0.7968, + "step": 12308 + }, + { + "epoch": 0.9488899167437558, + "grad_norm": 3.888162612915039, + "learning_rate": 6.835580590687618e-08, + "loss": 0.9289, + "step": 12309 + }, + { + "epoch": 0.9489670058587727, + "grad_norm": 3.883291721343994, + "learning_rate": 6.815022923953418e-08, + "loss": 0.8446, + "step": 12310 + }, + { + "epoch": 0.9490440949737897, + "grad_norm": 4.122531414031982, + "learning_rate": 6.794496004748241e-08, + "loss": 0.8171, + "step": 12311 + }, + { + "epoch": 0.9491211840888066, + "grad_norm": 3.75138521194458, + "learning_rate": 6.773999834351841e-08, + "loss": 0.8468, + "step": 12312 + }, + { + "epoch": 0.9491982732038237, + "grad_norm": 3.7109479904174805, + "learning_rate": 6.753534414042085e-08, + "loss": 0.9143, + "step": 12313 + }, + { + "epoch": 0.9492753623188406, + "grad_norm": 3.7108957767486572, + "learning_rate": 6.733099745094896e-08, + "loss": 0.9051, + "step": 12314 + }, + { + "epoch": 0.9493524514338575, + "grad_norm": 3.8768532276153564, + "learning_rate": 6.712695828784254e-08, + "loss": 0.8712, + "step": 12315 + }, + { + "epoch": 0.9494295405488745, + "grad_norm": 3.5676753520965576, + "learning_rate": 6.6923226663822e-08, + "loss": 0.8435, + "step": 12316 + }, + { + "epoch": 0.9495066296638914, + "grad_norm": 3.522658348083496, + "learning_rate": 6.671980259158883e-08, + "loss": 0.7721, + "step": 12317 + }, + { + "epoch": 0.9495837187789085, + "grad_norm": 3.815915584564209, + "learning_rate": 6.651668608382622e-08, + "loss": 0.9029, + "step": 12318 + }, + { + "epoch": 0.9496608078939254, + "grad_norm": 3.938446044921875, + "learning_rate": 6.631387715319681e-08, + "loss": 0.9521, + "step": 12319 + }, + { + "epoch": 0.9497378970089423, + "grad_norm": 3.745887517929077, + "learning_rate": 6.611137581234495e-08, + "loss": 0.9028, + "step": 12320 + }, + { + "epoch": 0.9498149861239593, + "grad_norm": 3.692595958709717, + "learning_rate": 6.590918207389608e-08, + "loss": 0.8865, + "step": 12321 + }, + { + "epoch": 0.9498920752389762, + "grad_norm": 4.061500549316406, + "learning_rate": 6.57072959504551e-08, + "loss": 0.9498, + "step": 12322 + }, + { + "epoch": 0.9499691643539933, + "grad_norm": 3.645247459411621, + "learning_rate": 6.550571745460865e-08, + "loss": 0.8695, + "step": 12323 + }, + { + "epoch": 0.9500462534690102, + "grad_norm": 3.4942803382873535, + "learning_rate": 6.530444659892443e-08, + "loss": 0.8362, + "step": 12324 + }, + { + "epoch": 0.9501233425840271, + "grad_norm": 3.572190284729004, + "learning_rate": 6.510348339595074e-08, + "loss": 0.9694, + "step": 12325 + }, + { + "epoch": 0.9502004316990441, + "grad_norm": 3.5825698375701904, + "learning_rate": 6.490282785821645e-08, + "loss": 0.8123, + "step": 12326 + }, + { + "epoch": 0.950277520814061, + "grad_norm": 3.692664861679077, + "learning_rate": 6.470247999823099e-08, + "loss": 0.897, + "step": 12327 + }, + { + "epoch": 0.950354609929078, + "grad_norm": 3.427412271499634, + "learning_rate": 6.450243982848548e-08, + "loss": 0.8236, + "step": 12328 + }, + { + "epoch": 0.950431699044095, + "grad_norm": 3.9332494735717773, + "learning_rate": 6.430270736145106e-08, + "loss": 0.9017, + "step": 12329 + }, + { + "epoch": 0.9505087881591119, + "grad_norm": 3.665562629699707, + "learning_rate": 6.410328260957998e-08, + "loss": 0.8807, + "step": 12330 + }, + { + "epoch": 0.9505858772741289, + "grad_norm": 3.466925859451294, + "learning_rate": 6.390416558530622e-08, + "loss": 0.8508, + "step": 12331 + }, + { + "epoch": 0.9506629663891458, + "grad_norm": 3.790743112564087, + "learning_rate": 6.370535630104257e-08, + "loss": 0.8721, + "step": 12332 + }, + { + "epoch": 0.9507400555041629, + "grad_norm": 3.7184832096099854, + "learning_rate": 6.350685476918416e-08, + "loss": 0.964, + "step": 12333 + }, + { + "epoch": 0.9508171446191798, + "grad_norm": 3.781548500061035, + "learning_rate": 6.33086610021072e-08, + "loss": 0.9842, + "step": 12334 + }, + { + "epoch": 0.9508942337341967, + "grad_norm": 4.001351356506348, + "learning_rate": 6.311077501216677e-08, + "loss": 0.9441, + "step": 12335 + }, + { + "epoch": 0.9509713228492137, + "grad_norm": 3.9916863441467285, + "learning_rate": 6.291319681170139e-08, + "loss": 0.9009, + "step": 12336 + }, + { + "epoch": 0.9510484119642306, + "grad_norm": 4.026021480560303, + "learning_rate": 6.271592641302781e-08, + "loss": 0.9658, + "step": 12337 + }, + { + "epoch": 0.9511255010792476, + "grad_norm": 3.522988796234131, + "learning_rate": 6.251896382844569e-08, + "loss": 0.796, + "step": 12338 + }, + { + "epoch": 0.9512025901942646, + "grad_norm": 3.4949891567230225, + "learning_rate": 6.232230907023407e-08, + "loss": 0.8099, + "step": 12339 + }, + { + "epoch": 0.9512796793092815, + "grad_norm": 3.4826712608337402, + "learning_rate": 6.21259621506537e-08, + "loss": 0.9186, + "step": 12340 + }, + { + "epoch": 0.9513567684242985, + "grad_norm": 3.609074115753174, + "learning_rate": 6.192992308194534e-08, + "loss": 0.8708, + "step": 12341 + }, + { + "epoch": 0.9514338575393154, + "grad_norm": 3.599756956100464, + "learning_rate": 6.173419187633201e-08, + "loss": 0.7601, + "step": 12342 + }, + { + "epoch": 0.9515109466543324, + "grad_norm": 3.8272032737731934, + "learning_rate": 6.15387685460156e-08, + "loss": 0.9467, + "step": 12343 + }, + { + "epoch": 0.9515880357693494, + "grad_norm": 3.584209680557251, + "learning_rate": 6.134365310317969e-08, + "loss": 0.898, + "step": 12344 + }, + { + "epoch": 0.9516651248843663, + "grad_norm": 3.9957098960876465, + "learning_rate": 6.114884555998957e-08, + "loss": 0.9181, + "step": 12345 + }, + { + "epoch": 0.9517422139993833, + "grad_norm": 3.9217257499694824, + "learning_rate": 6.09543459285894e-08, + "loss": 0.8856, + "step": 12346 + }, + { + "epoch": 0.9518193031144002, + "grad_norm": 3.2795722484588623, + "learning_rate": 6.07601542211067e-08, + "loss": 0.8311, + "step": 12347 + }, + { + "epoch": 0.9518963922294172, + "grad_norm": 3.9861719608306885, + "learning_rate": 6.05662704496468e-08, + "loss": 0.9533, + "step": 12348 + }, + { + "epoch": 0.9519734813444342, + "grad_norm": 3.6338775157928467, + "learning_rate": 6.03726946262978e-08, + "loss": 0.8222, + "step": 12349 + }, + { + "epoch": 0.9520505704594511, + "grad_norm": 4.2912726402282715, + "learning_rate": 6.017942676312837e-08, + "loss": 0.9267, + "step": 12350 + }, + { + "epoch": 0.9521276595744681, + "grad_norm": 4.117364883422852, + "learning_rate": 5.998646687218779e-08, + "loss": 0.9223, + "step": 12351 + }, + { + "epoch": 0.952204748689485, + "grad_norm": 4.384626865386963, + "learning_rate": 5.97938149655064e-08, + "loss": 1.0203, + "step": 12352 + }, + { + "epoch": 0.952281837804502, + "grad_norm": 3.674335241317749, + "learning_rate": 5.960147105509406e-08, + "loss": 0.8207, + "step": 12353 + }, + { + "epoch": 0.952358926919519, + "grad_norm": 3.7455525398254395, + "learning_rate": 5.9409435152943395e-08, + "loss": 0.8943, + "step": 12354 + }, + { + "epoch": 0.9524360160345359, + "grad_norm": 3.3691940307617188, + "learning_rate": 5.9217707271025937e-08, + "loss": 0.899, + "step": 12355 + }, + { + "epoch": 0.9525131051495529, + "grad_norm": 3.5628163814544678, + "learning_rate": 5.9026287421296014e-08, + "loss": 0.8902, + "step": 12356 + }, + { + "epoch": 0.9525901942645698, + "grad_norm": 3.971393585205078, + "learning_rate": 5.883517561568686e-08, + "loss": 0.8993, + "step": 12357 + }, + { + "epoch": 0.9526672833795868, + "grad_norm": 3.777592420578003, + "learning_rate": 5.8644371866113934e-08, + "loss": 0.8346, + "step": 12358 + }, + { + "epoch": 0.9527443724946038, + "grad_norm": 3.8741729259490967, + "learning_rate": 5.845387618447218e-08, + "loss": 0.8584, + "step": 12359 + }, + { + "epoch": 0.9528214616096207, + "grad_norm": 3.5481209754943848, + "learning_rate": 5.82636885826382e-08, + "loss": 0.802, + "step": 12360 + }, + { + "epoch": 0.9528985507246377, + "grad_norm": 3.961042642593384, + "learning_rate": 5.807380907246918e-08, + "loss": 0.9015, + "step": 12361 + }, + { + "epoch": 0.9529756398396546, + "grad_norm": 3.8133602142333984, + "learning_rate": 5.788423766580342e-08, + "loss": 0.7618, + "step": 12362 + }, + { + "epoch": 0.9530527289546716, + "grad_norm": 3.639708995819092, + "learning_rate": 5.769497437446037e-08, + "loss": 0.794, + "step": 12363 + }, + { + "epoch": 0.9531298180696886, + "grad_norm": 3.4450173377990723, + "learning_rate": 5.7506019210237794e-08, + "loss": 0.8323, + "step": 12364 + }, + { + "epoch": 0.9532069071847055, + "grad_norm": 3.6423511505126953, + "learning_rate": 5.73173721849174e-08, + "loss": 0.8228, + "step": 12365 + }, + { + "epoch": 0.9532839962997225, + "grad_norm": 3.7283174991607666, + "learning_rate": 5.7129033310260316e-08, + "loss": 0.9807, + "step": 12366 + }, + { + "epoch": 0.9533610854147394, + "grad_norm": 3.4573850631713867, + "learning_rate": 5.694100259800772e-08, + "loss": 0.8502, + "step": 12367 + }, + { + "epoch": 0.9534381745297564, + "grad_norm": 3.561988592147827, + "learning_rate": 5.675328005988301e-08, + "loss": 0.9494, + "step": 12368 + }, + { + "epoch": 0.9535152636447733, + "grad_norm": 3.7618987560272217, + "learning_rate": 5.6565865707590153e-08, + "loss": 0.9674, + "step": 12369 + }, + { + "epoch": 0.9535923527597903, + "grad_norm": 3.7699737548828125, + "learning_rate": 5.637875955281202e-08, + "loss": 0.7715, + "step": 12370 + }, + { + "epoch": 0.9536694418748073, + "grad_norm": 3.5663082599639893, + "learning_rate": 5.6191961607214853e-08, + "loss": 0.8386, + "step": 12371 + }, + { + "epoch": 0.9537465309898242, + "grad_norm": 3.4624545574188232, + "learning_rate": 5.6005471882444316e-08, + "loss": 0.8488, + "step": 12372 + }, + { + "epoch": 0.9538236201048412, + "grad_norm": 4.092374801635742, + "learning_rate": 5.581929039012668e-08, + "loss": 0.9231, + "step": 12373 + }, + { + "epoch": 0.9539007092198581, + "grad_norm": 4.778739929199219, + "learning_rate": 5.563341714186987e-08, + "loss": 1.0022, + "step": 12374 + }, + { + "epoch": 0.9539777983348752, + "grad_norm": 3.724924325942993, + "learning_rate": 5.5447852149262406e-08, + "loss": 0.9458, + "step": 12375 + }, + { + "epoch": 0.9540548874498921, + "grad_norm": 3.464205026626587, + "learning_rate": 5.5262595423872244e-08, + "loss": 0.8716, + "step": 12376 + }, + { + "epoch": 0.954131976564909, + "grad_norm": 3.98112416267395, + "learning_rate": 5.507764697725015e-08, + "loss": 1.0144, + "step": 12377 + }, + { + "epoch": 0.954209065679926, + "grad_norm": 3.9500741958618164, + "learning_rate": 5.4893006820926355e-08, + "loss": 0.9449, + "step": 12378 + }, + { + "epoch": 0.9542861547949429, + "grad_norm": 3.776728391647339, + "learning_rate": 5.4708674966412744e-08, + "loss": 0.8444, + "step": 12379 + }, + { + "epoch": 0.95436324390996, + "grad_norm": 4.004519939422607, + "learning_rate": 5.4524651425200135e-08, + "loss": 0.8892, + "step": 12380 + }, + { + "epoch": 0.9544403330249769, + "grad_norm": 3.8165571689605713, + "learning_rate": 5.434093620876213e-08, + "loss": 0.9224, + "step": 12381 + }, + { + "epoch": 0.9545174221399938, + "grad_norm": 3.499563217163086, + "learning_rate": 5.4157529328552896e-08, + "loss": 0.7655, + "step": 12382 + }, + { + "epoch": 0.9545945112550108, + "grad_norm": 3.9584450721740723, + "learning_rate": 5.397443079600662e-08, + "loss": 0.8664, + "step": 12383 + }, + { + "epoch": 0.9546716003700277, + "grad_norm": 4.017448425292969, + "learning_rate": 5.379164062253861e-08, + "loss": 0.9921, + "step": 12384 + }, + { + "epoch": 0.9547486894850448, + "grad_norm": 3.8151862621307373, + "learning_rate": 5.3609158819544205e-08, + "loss": 0.8458, + "step": 12385 + }, + { + "epoch": 0.9548257786000617, + "grad_norm": 3.502845287322998, + "learning_rate": 5.3426985398400965e-08, + "loss": 0.861, + "step": 12386 + }, + { + "epoch": 0.9549028677150786, + "grad_norm": 3.5837574005126953, + "learning_rate": 5.324512037046647e-08, + "loss": 0.9356, + "step": 12387 + }, + { + "epoch": 0.9549799568300956, + "grad_norm": 3.933488368988037, + "learning_rate": 5.306356374707833e-08, + "loss": 0.9829, + "step": 12388 + }, + { + "epoch": 0.9550570459451125, + "grad_norm": 3.528136968612671, + "learning_rate": 5.288231553955636e-08, + "loss": 0.9494, + "step": 12389 + }, + { + "epoch": 0.9551341350601296, + "grad_norm": 3.7610669136047363, + "learning_rate": 5.270137575920098e-08, + "loss": 0.9087, + "step": 12390 + }, + { + "epoch": 0.9552112241751465, + "grad_norm": 3.8250885009765625, + "learning_rate": 5.2520744417290936e-08, + "loss": 0.9419, + "step": 12391 + }, + { + "epoch": 0.9552883132901634, + "grad_norm": 3.721879720687866, + "learning_rate": 5.2340421525089445e-08, + "loss": 0.9422, + "step": 12392 + }, + { + "epoch": 0.9553654024051804, + "grad_norm": 3.6027677059173584, + "learning_rate": 5.2160407093838074e-08, + "loss": 0.9309, + "step": 12393 + }, + { + "epoch": 0.9554424915201973, + "grad_norm": 3.6538522243499756, + "learning_rate": 5.19807011347595e-08, + "loss": 0.9611, + "step": 12394 + }, + { + "epoch": 0.9555195806352144, + "grad_norm": 4.078005790710449, + "learning_rate": 5.180130365905811e-08, + "loss": 0.9666, + "step": 12395 + }, + { + "epoch": 0.9555966697502313, + "grad_norm": 3.643521785736084, + "learning_rate": 5.162221467791772e-08, + "loss": 0.8401, + "step": 12396 + }, + { + "epoch": 0.9556737588652482, + "grad_norm": 3.5786333084106445, + "learning_rate": 5.1443434202504414e-08, + "loss": 0.8743, + "step": 12397 + }, + { + "epoch": 0.9557508479802652, + "grad_norm": 3.833845853805542, + "learning_rate": 5.1264962243963155e-08, + "loss": 0.9121, + "step": 12398 + }, + { + "epoch": 0.9558279370952821, + "grad_norm": 3.96075439453125, + "learning_rate": 5.1086798813421715e-08, + "loss": 0.9589, + "step": 12399 + }, + { + "epoch": 0.9559050262102992, + "grad_norm": 3.644322156906128, + "learning_rate": 5.090894392198731e-08, + "loss": 0.9057, + "step": 12400 + }, + { + "epoch": 0.9559821153253161, + "grad_norm": 3.6770665645599365, + "learning_rate": 5.07313975807483e-08, + "loss": 0.8914, + "step": 12401 + }, + { + "epoch": 0.956059204440333, + "grad_norm": 3.862185001373291, + "learning_rate": 5.0554159800773604e-08, + "loss": 0.7902, + "step": 12402 + }, + { + "epoch": 0.95613629355535, + "grad_norm": 3.8168914318084717, + "learning_rate": 5.037723059311273e-08, + "loss": 0.8907, + "step": 12403 + }, + { + "epoch": 0.9562133826703669, + "grad_norm": 3.6184260845184326, + "learning_rate": 5.0200609968797386e-08, + "loss": 0.886, + "step": 12404 + }, + { + "epoch": 0.956290471785384, + "grad_norm": 3.8705575466156006, + "learning_rate": 5.0024297938838227e-08, + "loss": 0.9254, + "step": 12405 + }, + { + "epoch": 0.9563675609004009, + "grad_norm": 3.5495188236236572, + "learning_rate": 4.9848294514227566e-08, + "loss": 0.88, + "step": 12406 + }, + { + "epoch": 0.9564446500154178, + "grad_norm": 3.8909826278686523, + "learning_rate": 4.9672599705938294e-08, + "loss": 1.0299, + "step": 12407 + }, + { + "epoch": 0.9565217391304348, + "grad_norm": 3.9004476070404053, + "learning_rate": 4.949721352492387e-08, + "loss": 0.9281, + "step": 12408 + }, + { + "epoch": 0.9565988282454517, + "grad_norm": 3.852919816970825, + "learning_rate": 4.9322135982118877e-08, + "loss": 0.9431, + "step": 12409 + }, + { + "epoch": 0.9566759173604688, + "grad_norm": 3.943549633026123, + "learning_rate": 4.914736708843848e-08, + "loss": 0.905, + "step": 12410 + }, + { + "epoch": 0.9567530064754857, + "grad_norm": 3.6017589569091797, + "learning_rate": 4.89729068547784e-08, + "loss": 0.9505, + "step": 12411 + }, + { + "epoch": 0.9568300955905026, + "grad_norm": 3.4761962890625, + "learning_rate": 4.8798755292016055e-08, + "loss": 0.8673, + "step": 12412 + }, + { + "epoch": 0.9569071847055196, + "grad_norm": 3.7245309352874756, + "learning_rate": 4.8624912411007754e-08, + "loss": 0.8425, + "step": 12413 + }, + { + "epoch": 0.9569842738205365, + "grad_norm": 3.6705353260040283, + "learning_rate": 4.8451378222592605e-08, + "loss": 0.8727, + "step": 12414 + }, + { + "epoch": 0.9570613629355536, + "grad_norm": 3.829969644546509, + "learning_rate": 4.827815273758973e-08, + "loss": 1.01, + "step": 12415 + }, + { + "epoch": 0.9571384520505705, + "grad_norm": 3.7990779876708984, + "learning_rate": 4.8105235966797727e-08, + "loss": 0.8892, + "step": 12416 + }, + { + "epoch": 0.9572155411655874, + "grad_norm": 3.6120383739471436, + "learning_rate": 4.793262792099851e-08, + "loss": 0.8526, + "step": 12417 + }, + { + "epoch": 0.9572926302806044, + "grad_norm": 3.7359561920166016, + "learning_rate": 4.776032861095181e-08, + "loss": 0.9674, + "step": 12418 + }, + { + "epoch": 0.9573697193956213, + "grad_norm": 3.7942633628845215, + "learning_rate": 4.758833804740015e-08, + "loss": 0.8739, + "step": 12419 + }, + { + "epoch": 0.9574468085106383, + "grad_norm": 4.12003755569458, + "learning_rate": 4.7416656241067175e-08, + "loss": 1.0081, + "step": 12420 + }, + { + "epoch": 0.9575238976256553, + "grad_norm": 3.9636361598968506, + "learning_rate": 4.724528320265542e-08, + "loss": 0.9851, + "step": 12421 + }, + { + "epoch": 0.9576009867406722, + "grad_norm": 3.665433406829834, + "learning_rate": 4.7074218942849134e-08, + "loss": 0.9087, + "step": 12422 + }, + { + "epoch": 0.9576780758556892, + "grad_norm": 3.686065673828125, + "learning_rate": 4.6903463472313114e-08, + "loss": 0.8049, + "step": 12423 + }, + { + "epoch": 0.9577551649707061, + "grad_norm": 3.6736299991607666, + "learning_rate": 4.6733016801693845e-08, + "loss": 0.9044, + "step": 12424 + }, + { + "epoch": 0.9578322540857231, + "grad_norm": 3.865446090698242, + "learning_rate": 4.6562878941617264e-08, + "loss": 1.0022, + "step": 12425 + }, + { + "epoch": 0.9579093432007401, + "grad_norm": 3.7858636379241943, + "learning_rate": 4.6393049902690446e-08, + "loss": 0.8994, + "step": 12426 + }, + { + "epoch": 0.957986432315757, + "grad_norm": 3.769582986831665, + "learning_rate": 4.6223529695502144e-08, + "loss": 0.8945, + "step": 12427 + }, + { + "epoch": 0.958063521430774, + "grad_norm": 3.338371753692627, + "learning_rate": 4.605431833062002e-08, + "loss": 0.8453, + "step": 12428 + }, + { + "epoch": 0.9581406105457909, + "grad_norm": 4.088877201080322, + "learning_rate": 4.588541581859396e-08, + "loss": 0.9191, + "step": 12429 + }, + { + "epoch": 0.9582176996608079, + "grad_norm": 3.5991716384887695, + "learning_rate": 4.5716822169954436e-08, + "loss": 0.9485, + "step": 12430 + }, + { + "epoch": 0.9582947887758249, + "grad_norm": 3.5620510578155518, + "learning_rate": 4.554853739521192e-08, + "loss": 0.9463, + "step": 12431 + }, + { + "epoch": 0.9583718778908418, + "grad_norm": 3.7672712802886963, + "learning_rate": 4.5380561504858586e-08, + "loss": 0.87, + "step": 12432 + }, + { + "epoch": 0.9584489670058588, + "grad_norm": 4.017693996429443, + "learning_rate": 4.521289450936661e-08, + "loss": 0.9304, + "step": 12433 + }, + { + "epoch": 0.9585260561208757, + "grad_norm": 3.617096185684204, + "learning_rate": 4.504553641918874e-08, + "loss": 0.8937, + "step": 12434 + }, + { + "epoch": 0.9586031452358927, + "grad_norm": 3.6982741355895996, + "learning_rate": 4.4878487244759964e-08, + "loss": 0.8577, + "step": 12435 + }, + { + "epoch": 0.9586802343509097, + "grad_norm": 3.6797823905944824, + "learning_rate": 4.471174699649361e-08, + "loss": 0.8549, + "step": 12436 + }, + { + "epoch": 0.9587573234659266, + "grad_norm": 4.134285926818848, + "learning_rate": 4.4545315684785815e-08, + "loss": 0.9191, + "step": 12437 + }, + { + "epoch": 0.9588344125809436, + "grad_norm": 3.8390088081359863, + "learning_rate": 4.437919332001328e-08, + "loss": 0.9041, + "step": 12438 + }, + { + "epoch": 0.9589115016959605, + "grad_norm": 3.7085273265838623, + "learning_rate": 4.421337991253161e-08, + "loss": 0.9955, + "step": 12439 + }, + { + "epoch": 0.9589885908109775, + "grad_norm": 4.025529384613037, + "learning_rate": 4.4047875472679194e-08, + "loss": 1.0332, + "step": 12440 + }, + { + "epoch": 0.9590656799259945, + "grad_norm": 3.5782909393310547, + "learning_rate": 4.3882680010774445e-08, + "loss": 0.988, + "step": 12441 + }, + { + "epoch": 0.9591427690410114, + "grad_norm": 3.8593509197235107, + "learning_rate": 4.3717793537115806e-08, + "loss": 0.9466, + "step": 12442 + }, + { + "epoch": 0.9592198581560284, + "grad_norm": 3.907071590423584, + "learning_rate": 4.355321606198393e-08, + "loss": 0.8672, + "step": 12443 + }, + { + "epoch": 0.9592969472710453, + "grad_norm": 4.352113246917725, + "learning_rate": 4.338894759563894e-08, + "loss": 0.9618, + "step": 12444 + }, + { + "epoch": 0.9593740363860623, + "grad_norm": 3.6431379318237305, + "learning_rate": 4.3224988148321545e-08, + "loss": 0.8185, + "step": 12445 + }, + { + "epoch": 0.9594511255010792, + "grad_norm": 3.652925729751587, + "learning_rate": 4.306133773025467e-08, + "loss": 0.8715, + "step": 12446 + }, + { + "epoch": 0.9595282146160962, + "grad_norm": 3.6305699348449707, + "learning_rate": 4.2897996351640715e-08, + "loss": 0.7884, + "step": 12447 + }, + { + "epoch": 0.9596053037311132, + "grad_norm": 3.712374448776245, + "learning_rate": 4.2734964022663194e-08, + "loss": 0.8094, + "step": 12448 + }, + { + "epoch": 0.9596823928461301, + "grad_norm": 3.571690797805786, + "learning_rate": 4.257224075348676e-08, + "loss": 0.7927, + "step": 12449 + }, + { + "epoch": 0.9597594819611471, + "grad_norm": 3.573230743408203, + "learning_rate": 4.240982655425552e-08, + "loss": 0.8144, + "step": 12450 + }, + { + "epoch": 0.959836571076164, + "grad_norm": 4.072946548461914, + "learning_rate": 4.224772143509526e-08, + "loss": 0.9237, + "step": 12451 + }, + { + "epoch": 0.959913660191181, + "grad_norm": 3.8101742267608643, + "learning_rate": 4.2085925406112894e-08, + "loss": 0.9525, + "step": 12452 + }, + { + "epoch": 0.959990749306198, + "grad_norm": 3.8451669216156006, + "learning_rate": 4.192443847739536e-08, + "loss": 0.8578, + "step": 12453 + }, + { + "epoch": 0.9600678384212149, + "grad_norm": 3.733152389526367, + "learning_rate": 4.1763260659011265e-08, + "loss": 0.9414, + "step": 12454 + }, + { + "epoch": 0.9601449275362319, + "grad_norm": 3.55338716506958, + "learning_rate": 4.160239196100757e-08, + "loss": 0.8958, + "step": 12455 + }, + { + "epoch": 0.9602220166512488, + "grad_norm": 3.659872531890869, + "learning_rate": 4.144183239341515e-08, + "loss": 0.9185, + "step": 12456 + }, + { + "epoch": 0.9602991057662658, + "grad_norm": 3.818180561065674, + "learning_rate": 4.1281581966243214e-08, + "loss": 0.9281, + "step": 12457 + }, + { + "epoch": 0.9603761948812828, + "grad_norm": 4.13081169128418, + "learning_rate": 4.1121640689482655e-08, + "loss": 0.992, + "step": 12458 + }, + { + "epoch": 0.9604532839962997, + "grad_norm": 3.2538528442382812, + "learning_rate": 4.0962008573105505e-08, + "loss": 0.7597, + "step": 12459 + }, + { + "epoch": 0.9605303731113167, + "grad_norm": 3.9792425632476807, + "learning_rate": 4.080268562706324e-08, + "loss": 0.8961, + "step": 12460 + }, + { + "epoch": 0.9606074622263336, + "grad_norm": 4.055764198303223, + "learning_rate": 4.0643671861289035e-08, + "loss": 0.8922, + "step": 12461 + }, + { + "epoch": 0.9606845513413506, + "grad_norm": 3.6515095233917236, + "learning_rate": 4.048496728569717e-08, + "loss": 0.8648, + "step": 12462 + }, + { + "epoch": 0.9607616404563676, + "grad_norm": 3.2547214031219482, + "learning_rate": 4.0326571910180855e-08, + "loss": 0.7415, + "step": 12463 + }, + { + "epoch": 0.9608387295713845, + "grad_norm": 3.92317271232605, + "learning_rate": 4.016848574461718e-08, + "loss": 0.9378, + "step": 12464 + }, + { + "epoch": 0.9609158186864015, + "grad_norm": 3.4518179893493652, + "learning_rate": 4.001070879885993e-08, + "loss": 0.7162, + "step": 12465 + }, + { + "epoch": 0.9609929078014184, + "grad_norm": 3.502856969833374, + "learning_rate": 3.9853241082746795e-08, + "loss": 0.8978, + "step": 12466 + }, + { + "epoch": 0.9610699969164354, + "grad_norm": 3.2858731746673584, + "learning_rate": 3.9696082606094366e-08, + "loss": 0.8471, + "step": 12467 + }, + { + "epoch": 0.9611470860314524, + "grad_norm": 3.7062535285949707, + "learning_rate": 3.953923337870147e-08, + "loss": 0.893, + "step": 12468 + }, + { + "epoch": 0.9612241751464693, + "grad_norm": 4.334238529205322, + "learning_rate": 3.938269341034695e-08, + "loss": 0.9144, + "step": 12469 + }, + { + "epoch": 0.9613012642614863, + "grad_norm": 3.6347129344940186, + "learning_rate": 3.922646271078911e-08, + "loss": 0.8314, + "step": 12470 + }, + { + "epoch": 0.9613783533765032, + "grad_norm": 4.064388751983643, + "learning_rate": 3.907054128976906e-08, + "loss": 0.9495, + "step": 12471 + }, + { + "epoch": 0.9614554424915202, + "grad_norm": 3.732004165649414, + "learning_rate": 3.8914929157007895e-08, + "loss": 0.9922, + "step": 12472 + }, + { + "epoch": 0.9615325316065372, + "grad_norm": 3.932603120803833, + "learning_rate": 3.87596263222062e-08, + "loss": 0.8979, + "step": 12473 + }, + { + "epoch": 0.9616096207215541, + "grad_norm": 3.8675358295440674, + "learning_rate": 3.860463279504678e-08, + "loss": 0.9025, + "step": 12474 + }, + { + "epoch": 0.9616867098365711, + "grad_norm": 3.780653715133667, + "learning_rate": 3.8449948585193577e-08, + "loss": 0.9724, + "step": 12475 + }, + { + "epoch": 0.961763798951588, + "grad_norm": 3.6268699169158936, + "learning_rate": 3.829557370228887e-08, + "loss": 0.8795, + "step": 12476 + }, + { + "epoch": 0.961840888066605, + "grad_norm": 3.926316976547241, + "learning_rate": 3.814150815595774e-08, + "loss": 0.8567, + "step": 12477 + }, + { + "epoch": 0.961917977181622, + "grad_norm": 4.029868125915527, + "learning_rate": 3.798775195580584e-08, + "loss": 0.837, + "step": 12478 + }, + { + "epoch": 0.9619950662966389, + "grad_norm": 3.769047260284424, + "learning_rate": 3.783430511141828e-08, + "loss": 0.7122, + "step": 12479 + }, + { + "epoch": 0.9620721554116559, + "grad_norm": 3.4524729251861572, + "learning_rate": 3.76811676323624e-08, + "loss": 0.7635, + "step": 12480 + }, + { + "epoch": 0.9621492445266728, + "grad_norm": 4.048440933227539, + "learning_rate": 3.752833952818502e-08, + "loss": 0.8472, + "step": 12481 + }, + { + "epoch": 0.9622263336416897, + "grad_norm": 3.5961873531341553, + "learning_rate": 3.737582080841462e-08, + "loss": 0.9834, + "step": 12482 + }, + { + "epoch": 0.9623034227567068, + "grad_norm": 4.128652095794678, + "learning_rate": 3.722361148255971e-08, + "loss": 0.8462, + "step": 12483 + }, + { + "epoch": 0.9623805118717237, + "grad_norm": 3.8181169033050537, + "learning_rate": 3.707171156010936e-08, + "loss": 0.9219, + "step": 12484 + }, + { + "epoch": 0.9624576009867407, + "grad_norm": 3.621793031692505, + "learning_rate": 3.692012105053433e-08, + "loss": 0.8931, + "step": 12485 + }, + { + "epoch": 0.9625346901017576, + "grad_norm": 3.9379565715789795, + "learning_rate": 3.6768839963285395e-08, + "loss": 0.8306, + "step": 12486 + }, + { + "epoch": 0.9626117792167745, + "grad_norm": 3.8023934364318848, + "learning_rate": 3.66178683077939e-08, + "loss": 0.97, + "step": 12487 + }, + { + "epoch": 0.9626888683317916, + "grad_norm": 3.626626968383789, + "learning_rate": 3.646720609347232e-08, + "loss": 0.9197, + "step": 12488 + }, + { + "epoch": 0.9627659574468085, + "grad_norm": 3.600571393966675, + "learning_rate": 3.6316853329713686e-08, + "loss": 0.9092, + "step": 12489 + }, + { + "epoch": 0.9628430465618255, + "grad_norm": 3.4085299968719482, + "learning_rate": 3.616681002589162e-08, + "loss": 0.7884, + "step": 12490 + }, + { + "epoch": 0.9629201356768424, + "grad_norm": 3.688203811645508, + "learning_rate": 3.601707619136086e-08, + "loss": 0.7688, + "step": 12491 + }, + { + "epoch": 0.9629972247918593, + "grad_norm": 3.6305623054504395, + "learning_rate": 3.586765183545615e-08, + "loss": 0.9648, + "step": 12492 + }, + { + "epoch": 0.9630743139068764, + "grad_norm": 3.8015410900115967, + "learning_rate": 3.571853696749339e-08, + "loss": 0.9645, + "step": 12493 + }, + { + "epoch": 0.9631514030218933, + "grad_norm": 3.945363998413086, + "learning_rate": 3.556973159676902e-08, + "loss": 0.92, + "step": 12494 + }, + { + "epoch": 0.9632284921369103, + "grad_norm": 3.6869564056396484, + "learning_rate": 3.542123573256062e-08, + "loss": 0.8454, + "step": 12495 + }, + { + "epoch": 0.9633055812519272, + "grad_norm": 4.102055549621582, + "learning_rate": 3.5273049384126345e-08, + "loss": 0.9861, + "step": 12496 + }, + { + "epoch": 0.9633826703669441, + "grad_norm": 4.952552318572998, + "learning_rate": 3.51251725607038e-08, + "loss": 0.8793, + "step": 12497 + }, + { + "epoch": 0.9634597594819612, + "grad_norm": 3.5862975120544434, + "learning_rate": 3.497760527151284e-08, + "loss": 0.8908, + "step": 12498 + }, + { + "epoch": 0.9635368485969781, + "grad_norm": 3.9423680305480957, + "learning_rate": 3.4830347525754425e-08, + "loss": 0.8734, + "step": 12499 + }, + { + "epoch": 0.9636139377119951, + "grad_norm": 4.143372058868408, + "learning_rate": 3.4683399332607893e-08, + "loss": 0.9683, + "step": 12500 + }, + { + "epoch": 0.963691026827012, + "grad_norm": 3.9046006202697754, + "learning_rate": 3.45367607012359e-08, + "loss": 0.9009, + "step": 12501 + }, + { + "epoch": 0.9637681159420289, + "grad_norm": 3.8234989643096924, + "learning_rate": 3.439043164078004e-08, + "loss": 1.0309, + "step": 12502 + }, + { + "epoch": 0.963845205057046, + "grad_norm": 3.920905828475952, + "learning_rate": 3.424441216036301e-08, + "loss": 0.9936, + "step": 12503 + }, + { + "epoch": 0.9639222941720629, + "grad_norm": 3.868464231491089, + "learning_rate": 3.409870226908863e-08, + "loss": 0.8631, + "step": 12504 + }, + { + "epoch": 0.9639993832870799, + "grad_norm": 3.8888497352600098, + "learning_rate": 3.3953301976040743e-08, + "loss": 0.9221, + "step": 12505 + }, + { + "epoch": 0.9640764724020968, + "grad_norm": 3.7030375003814697, + "learning_rate": 3.3808211290284886e-08, + "loss": 0.8446, + "step": 12506 + }, + { + "epoch": 0.9641535615171137, + "grad_norm": 4.315587520599365, + "learning_rate": 3.36634302208666e-08, + "loss": 0.9185, + "step": 12507 + }, + { + "epoch": 0.9642306506321308, + "grad_norm": 3.6543986797332764, + "learning_rate": 3.351895877681255e-08, + "loss": 0.9264, + "step": 12508 + }, + { + "epoch": 0.9643077397471477, + "grad_norm": 3.710772752761841, + "learning_rate": 3.3374796967128866e-08, + "loss": 0.9397, + "step": 12509 + }, + { + "epoch": 0.9643848288621647, + "grad_norm": 3.6390933990478516, + "learning_rate": 3.3230944800803355e-08, + "loss": 0.8503, + "step": 12510 + }, + { + "epoch": 0.9644619179771816, + "grad_norm": 4.070563793182373, + "learning_rate": 3.3087402286805514e-08, + "loss": 0.8884, + "step": 12511 + }, + { + "epoch": 0.9645390070921985, + "grad_norm": 3.739609718322754, + "learning_rate": 3.294416943408374e-08, + "loss": 0.9062, + "step": 12512 + }, + { + "epoch": 0.9646160962072156, + "grad_norm": 3.311293840408325, + "learning_rate": 3.2801246251568106e-08, + "loss": 0.8365, + "step": 12513 + }, + { + "epoch": 0.9646931853222325, + "grad_norm": 3.451474666595459, + "learning_rate": 3.2658632748168714e-08, + "loss": 0.829, + "step": 12514 + }, + { + "epoch": 0.9647702744372495, + "grad_norm": 3.717940092086792, + "learning_rate": 3.2516328932776786e-08, + "loss": 0.8624, + "step": 12515 + }, + { + "epoch": 0.9648473635522664, + "grad_norm": 3.5586538314819336, + "learning_rate": 3.2374334814265216e-08, + "loss": 0.8706, + "step": 12516 + }, + { + "epoch": 0.9649244526672833, + "grad_norm": 3.5937881469726562, + "learning_rate": 3.223265040148527e-08, + "loss": 0.9791, + "step": 12517 + }, + { + "epoch": 0.9650015417823004, + "grad_norm": 3.6897778511047363, + "learning_rate": 3.209127570327153e-08, + "loss": 0.8629, + "step": 12518 + }, + { + "epoch": 0.9650786308973173, + "grad_norm": 3.6691691875457764, + "learning_rate": 3.19502107284364e-08, + "loss": 0.9409, + "step": 12519 + }, + { + "epoch": 0.9651557200123343, + "grad_norm": 3.7458150386810303, + "learning_rate": 3.180945548577619e-08, + "loss": 0.9058, + "step": 12520 + }, + { + "epoch": 0.9652328091273512, + "grad_norm": 3.7683491706848145, + "learning_rate": 3.166900998406497e-08, + "loss": 0.9012, + "step": 12521 + }, + { + "epoch": 0.9653098982423681, + "grad_norm": 3.705265522003174, + "learning_rate": 3.1528874232059635e-08, + "loss": 0.813, + "step": 12522 + }, + { + "epoch": 0.9653869873573852, + "grad_norm": 3.670219898223877, + "learning_rate": 3.138904823849653e-08, + "loss": 0.8558, + "step": 12523 + }, + { + "epoch": 0.9654640764724021, + "grad_norm": 3.741985321044922, + "learning_rate": 3.124953201209313e-08, + "loss": 0.9858, + "step": 12524 + }, + { + "epoch": 0.9655411655874191, + "grad_norm": 3.5063395500183105, + "learning_rate": 3.1110325561547473e-08, + "loss": 0.7742, + "step": 12525 + }, + { + "epoch": 0.965618254702436, + "grad_norm": 3.4746077060699463, + "learning_rate": 3.097142889553872e-08, + "loss": 0.8353, + "step": 12526 + }, + { + "epoch": 0.9656953438174529, + "grad_norm": 3.630375385284424, + "learning_rate": 3.083284202272607e-08, + "loss": 0.84, + "step": 12527 + }, + { + "epoch": 0.96577243293247, + "grad_norm": 4.338189125061035, + "learning_rate": 3.069456495174983e-08, + "loss": 0.8878, + "step": 12528 + }, + { + "epoch": 0.9658495220474869, + "grad_norm": 3.834465503692627, + "learning_rate": 3.0556597691230317e-08, + "loss": 0.8431, + "step": 12529 + }, + { + "epoch": 0.9659266111625039, + "grad_norm": 3.5252437591552734, + "learning_rate": 3.04189402497701e-08, + "loss": 0.7253, + "step": 12530 + }, + { + "epoch": 0.9660037002775208, + "grad_norm": 3.7228147983551025, + "learning_rate": 3.028159263595065e-08, + "loss": 0.9094, + "step": 12531 + }, + { + "epoch": 0.9660807893925377, + "grad_norm": 4.035754680633545, + "learning_rate": 3.0144554858334563e-08, + "loss": 0.994, + "step": 12532 + }, + { + "epoch": 0.9661578785075547, + "grad_norm": 4.05665922164917, + "learning_rate": 3.000782692546667e-08, + "loss": 0.869, + "step": 12533 + }, + { + "epoch": 0.9662349676225717, + "grad_norm": 3.910731554031372, + "learning_rate": 2.9871408845870144e-08, + "loss": 0.9663, + "step": 12534 + }, + { + "epoch": 0.9663120567375887, + "grad_norm": 4.1619696617126465, + "learning_rate": 2.973530062805041e-08, + "loss": 0.8635, + "step": 12535 + }, + { + "epoch": 0.9663891458526056, + "grad_norm": 3.829831838607788, + "learning_rate": 2.959950228049291e-08, + "loss": 0.9295, + "step": 12536 + }, + { + "epoch": 0.9664662349676225, + "grad_norm": 3.9368739128112793, + "learning_rate": 2.9464013811664192e-08, + "loss": 0.8815, + "step": 12537 + }, + { + "epoch": 0.9665433240826395, + "grad_norm": 3.4988670349121094, + "learning_rate": 2.9328835230011398e-08, + "loss": 0.8505, + "step": 12538 + }, + { + "epoch": 0.9666204131976565, + "grad_norm": 3.812965154647827, + "learning_rate": 2.9193966543961673e-08, + "loss": 0.8958, + "step": 12539 + }, + { + "epoch": 0.9666975023126735, + "grad_norm": 3.911647081375122, + "learning_rate": 2.905940776192384e-08, + "loss": 0.9759, + "step": 12540 + }, + { + "epoch": 0.9667745914276904, + "grad_norm": 3.842432975769043, + "learning_rate": 2.89251588922862e-08, + "loss": 0.9148, + "step": 12541 + }, + { + "epoch": 0.9668516805427073, + "grad_norm": 3.5029232501983643, + "learning_rate": 2.8791219943419824e-08, + "loss": 0.7411, + "step": 12542 + }, + { + "epoch": 0.9669287696577243, + "grad_norm": 3.9447033405303955, + "learning_rate": 2.8657590923673596e-08, + "loss": 0.9414, + "step": 12543 + }, + { + "epoch": 0.9670058587727413, + "grad_norm": 3.7002828121185303, + "learning_rate": 2.852427184138029e-08, + "loss": 0.9014, + "step": 12544 + }, + { + "epoch": 0.9670829478877583, + "grad_norm": 3.4802746772766113, + "learning_rate": 2.839126270484993e-08, + "loss": 0.8871, + "step": 12545 + }, + { + "epoch": 0.9671600370027752, + "grad_norm": 3.994755744934082, + "learning_rate": 2.825856352237588e-08, + "loss": 0.8622, + "step": 12546 + }, + { + "epoch": 0.9672371261177922, + "grad_norm": 3.6245877742767334, + "learning_rate": 2.812617430223097e-08, + "loss": 0.802, + "step": 12547 + }, + { + "epoch": 0.9673142152328091, + "grad_norm": 4.391573429107666, + "learning_rate": 2.7994095052669147e-08, + "loss": 0.9172, + "step": 12548 + }, + { + "epoch": 0.967391304347826, + "grad_norm": 3.789491653442383, + "learning_rate": 2.786232578192494e-08, + "loss": 0.9283, + "step": 12549 + }, + { + "epoch": 0.9674683934628431, + "grad_norm": 3.5741639137268066, + "learning_rate": 2.773086649821344e-08, + "loss": 0.9144, + "step": 12550 + }, + { + "epoch": 0.96754548257786, + "grad_norm": 3.636545181274414, + "learning_rate": 2.7599717209730316e-08, + "loss": 0.823, + "step": 12551 + }, + { + "epoch": 0.967622571692877, + "grad_norm": 3.7562508583068848, + "learning_rate": 2.7468877924651804e-08, + "loss": 0.9054, + "step": 12552 + }, + { + "epoch": 0.9676996608078939, + "grad_norm": 3.873007297515869, + "learning_rate": 2.733834865113527e-08, + "loss": 0.9583, + "step": 12553 + }, + { + "epoch": 0.9677767499229109, + "grad_norm": 3.7551496028900146, + "learning_rate": 2.7208129397319206e-08, + "loss": 0.8355, + "step": 12554 + }, + { + "epoch": 0.9678538390379279, + "grad_norm": 3.8303260803222656, + "learning_rate": 2.7078220171321556e-08, + "loss": 0.8735, + "step": 12555 + }, + { + "epoch": 0.9679309281529448, + "grad_norm": 3.5815868377685547, + "learning_rate": 2.6948620981240848e-08, + "loss": 0.7389, + "step": 12556 + }, + { + "epoch": 0.9680080172679618, + "grad_norm": 3.6111199855804443, + "learning_rate": 2.6819331835157836e-08, + "loss": 0.7886, + "step": 12557 + }, + { + "epoch": 0.9680851063829787, + "grad_norm": 3.4187819957733154, + "learning_rate": 2.669035274113274e-08, + "loss": 0.8699, + "step": 12558 + }, + { + "epoch": 0.9681621954979956, + "grad_norm": 3.8764898777008057, + "learning_rate": 2.6561683707206354e-08, + "loss": 0.921, + "step": 12559 + }, + { + "epoch": 0.9682392846130127, + "grad_norm": 3.894237756729126, + "learning_rate": 2.6433324741401702e-08, + "loss": 0.9434, + "step": 12560 + }, + { + "epoch": 0.9683163737280296, + "grad_norm": 3.7127158641815186, + "learning_rate": 2.6305275851720158e-08, + "loss": 0.8949, + "step": 12561 + }, + { + "epoch": 0.9683934628430466, + "grad_norm": 3.729252576828003, + "learning_rate": 2.6177537046144786e-08, + "loss": 0.8965, + "step": 12562 + }, + { + "epoch": 0.9684705519580635, + "grad_norm": 4.151895523071289, + "learning_rate": 2.6050108332640323e-08, + "loss": 1.0355, + "step": 12563 + }, + { + "epoch": 0.9685476410730804, + "grad_norm": 3.9273674488067627, + "learning_rate": 2.5922989719150414e-08, + "loss": 0.8994, + "step": 12564 + }, + { + "epoch": 0.9686247301880975, + "grad_norm": 3.550187349319458, + "learning_rate": 2.5796181213601503e-08, + "loss": 0.9353, + "step": 12565 + }, + { + "epoch": 0.9687018193031144, + "grad_norm": 3.781679153442383, + "learning_rate": 2.5669682823897813e-08, + "loss": 0.9055, + "step": 12566 + }, + { + "epoch": 0.9687789084181314, + "grad_norm": 3.9291346073150635, + "learning_rate": 2.554349455792693e-08, + "loss": 0.8633, + "step": 12567 + }, + { + "epoch": 0.9688559975331483, + "grad_norm": 4.16334867477417, + "learning_rate": 2.541761642355589e-08, + "loss": 0.9427, + "step": 12568 + }, + { + "epoch": 0.9689330866481652, + "grad_norm": 3.531303882598877, + "learning_rate": 2.529204842863231e-08, + "loss": 0.773, + "step": 12569 + }, + { + "epoch": 0.9690101757631823, + "grad_norm": 3.517418146133423, + "learning_rate": 2.5166790580984923e-08, + "loss": 0.8888, + "step": 12570 + }, + { + "epoch": 0.9690872648781992, + "grad_norm": 3.5748257637023926, + "learning_rate": 2.504184288842304e-08, + "loss": 0.9294, + "step": 12571 + }, + { + "epoch": 0.9691643539932162, + "grad_norm": 3.5963714122772217, + "learning_rate": 2.4917205358735986e-08, + "loss": 0.9087, + "step": 12572 + }, + { + "epoch": 0.9692414431082331, + "grad_norm": 3.668978691101074, + "learning_rate": 2.4792877999694764e-08, + "loss": 0.8818, + "step": 12573 + }, + { + "epoch": 0.96931853222325, + "grad_norm": 3.8249216079711914, + "learning_rate": 2.46688608190504e-08, + "loss": 0.9889, + "step": 12574 + }, + { + "epoch": 0.9693956213382671, + "grad_norm": 3.7434964179992676, + "learning_rate": 2.4545153824534483e-08, + "loss": 0.906, + "step": 12575 + }, + { + "epoch": 0.969472710453284, + "grad_norm": 3.613534450531006, + "learning_rate": 2.4421757023859737e-08, + "loss": 0.8281, + "step": 12576 + }, + { + "epoch": 0.969549799568301, + "grad_norm": 3.6317696571350098, + "learning_rate": 2.4298670424718895e-08, + "loss": 0.8908, + "step": 12577 + }, + { + "epoch": 0.9696268886833179, + "grad_norm": 3.982708215713501, + "learning_rate": 2.417589403478693e-08, + "loss": 0.857, + "step": 12578 + }, + { + "epoch": 0.9697039777983348, + "grad_norm": 3.8874056339263916, + "learning_rate": 2.4053427861716605e-08, + "loss": 0.9243, + "step": 12579 + }, + { + "epoch": 0.9697810669133519, + "grad_norm": 3.79288649559021, + "learning_rate": 2.3931271913144595e-08, + "loss": 0.9047, + "step": 12580 + }, + { + "epoch": 0.9698581560283688, + "grad_norm": 3.9113500118255615, + "learning_rate": 2.3809426196685913e-08, + "loss": 1.0422, + "step": 12581 + }, + { + "epoch": 0.9699352451433858, + "grad_norm": 3.4592325687408447, + "learning_rate": 2.3687890719937266e-08, + "loss": 0.7883, + "step": 12582 + }, + { + "epoch": 0.9700123342584027, + "grad_norm": 4.037449836730957, + "learning_rate": 2.3566665490475372e-08, + "loss": 0.9285, + "step": 12583 + }, + { + "epoch": 0.9700894233734196, + "grad_norm": 3.595924139022827, + "learning_rate": 2.3445750515858067e-08, + "loss": 0.8422, + "step": 12584 + }, + { + "epoch": 0.9701665124884367, + "grad_norm": 3.691532611846924, + "learning_rate": 2.3325145803624327e-08, + "loss": 0.9454, + "step": 12585 + }, + { + "epoch": 0.9702436016034536, + "grad_norm": 3.8150765895843506, + "learning_rate": 2.3204851361293136e-08, + "loss": 0.944, + "step": 12586 + }, + { + "epoch": 0.9703206907184706, + "grad_norm": 4.036136627197266, + "learning_rate": 2.308486719636349e-08, + "loss": 0.9124, + "step": 12587 + }, + { + "epoch": 0.9703977798334875, + "grad_norm": 3.491523504257202, + "learning_rate": 2.2965193316316635e-08, + "loss": 0.9327, + "step": 12588 + }, + { + "epoch": 0.9704748689485044, + "grad_norm": 4.169717788696289, + "learning_rate": 2.2845829728613268e-08, + "loss": 0.8412, + "step": 12589 + }, + { + "epoch": 0.9705519580635215, + "grad_norm": 3.619222640991211, + "learning_rate": 2.2726776440694653e-08, + "loss": 0.8712, + "step": 12590 + }, + { + "epoch": 0.9706290471785384, + "grad_norm": 3.93977427482605, + "learning_rate": 2.2608033459983747e-08, + "loss": 0.9405, + "step": 12591 + }, + { + "epoch": 0.9707061362935554, + "grad_norm": 3.866957187652588, + "learning_rate": 2.2489600793883516e-08, + "loss": 0.9402, + "step": 12592 + }, + { + "epoch": 0.9707832254085723, + "grad_norm": 3.519181966781616, + "learning_rate": 2.2371478449777495e-08, + "loss": 0.8973, + "step": 12593 + }, + { + "epoch": 0.9708603145235892, + "grad_norm": 3.5281713008880615, + "learning_rate": 2.2253666435029797e-08, + "loss": 0.839, + "step": 12594 + }, + { + "epoch": 0.9709374036386063, + "grad_norm": 3.604396343231201, + "learning_rate": 2.2136164756985655e-08, + "loss": 0.8367, + "step": 12595 + }, + { + "epoch": 0.9710144927536232, + "grad_norm": 3.734344720840454, + "learning_rate": 2.201897342297088e-08, + "loss": 0.9964, + "step": 12596 + }, + { + "epoch": 0.9710915818686402, + "grad_norm": 4.021172523498535, + "learning_rate": 2.190209244029129e-08, + "loss": 0.7933, + "step": 12597 + }, + { + "epoch": 0.9711686709836571, + "grad_norm": 3.9066355228424072, + "learning_rate": 2.1785521816233834e-08, + "loss": 1.0668, + "step": 12598 + }, + { + "epoch": 0.971245760098674, + "grad_norm": 4.022904396057129, + "learning_rate": 2.166926155806659e-08, + "loss": 0.9402, + "step": 12599 + }, + { + "epoch": 0.971322849213691, + "grad_norm": 3.6551458835601807, + "learning_rate": 2.155331167303709e-08, + "loss": 0.8959, + "step": 12600 + }, + { + "epoch": 0.971399938328708, + "grad_norm": 3.6164748668670654, + "learning_rate": 2.1437672168374557e-08, + "loss": 0.8252, + "step": 12601 + }, + { + "epoch": 0.971477027443725, + "grad_norm": 3.7705652713775635, + "learning_rate": 2.1322343051289328e-08, + "loss": 0.976, + "step": 12602 + }, + { + "epoch": 0.9715541165587419, + "grad_norm": 3.6110386848449707, + "learning_rate": 2.1207324328970103e-08, + "loss": 0.925, + "step": 12603 + }, + { + "epoch": 0.9716312056737588, + "grad_norm": 3.571214199066162, + "learning_rate": 2.1092616008588364e-08, + "loss": 0.9206, + "step": 12604 + }, + { + "epoch": 0.9717082947887759, + "grad_norm": 3.6576857566833496, + "learning_rate": 2.0978218097295612e-08, + "loss": 0.8903, + "step": 12605 + }, + { + "epoch": 0.9717853839037928, + "grad_norm": 3.6607632637023926, + "learning_rate": 2.086413060222392e-08, + "loss": 0.8791, + "step": 12606 + }, + { + "epoch": 0.9718624730188098, + "grad_norm": 3.758730888366699, + "learning_rate": 2.0750353530485932e-08, + "loss": 0.8838, + "step": 12607 + }, + { + "epoch": 0.9719395621338267, + "grad_norm": 4.265690326690674, + "learning_rate": 2.0636886889175978e-08, + "loss": 0.9786, + "step": 12608 + }, + { + "epoch": 0.9720166512488436, + "grad_norm": 3.9939539432525635, + "learning_rate": 2.0523730685366726e-08, + "loss": 0.9212, + "step": 12609 + }, + { + "epoch": 0.9720937403638606, + "grad_norm": 3.4965243339538574, + "learning_rate": 2.0410884926113094e-08, + "loss": 0.9082, + "step": 12610 + }, + { + "epoch": 0.9721708294788776, + "grad_norm": 4.189631938934326, + "learning_rate": 2.0298349618451673e-08, + "loss": 0.9899, + "step": 12611 + }, + { + "epoch": 0.9722479185938946, + "grad_norm": 3.7480947971343994, + "learning_rate": 2.0186124769396855e-08, + "loss": 0.9383, + "step": 12612 + }, + { + "epoch": 0.9723250077089115, + "grad_norm": 3.6697635650634766, + "learning_rate": 2.0074210385946925e-08, + "loss": 0.9815, + "step": 12613 + }, + { + "epoch": 0.9724020968239284, + "grad_norm": 3.7893688678741455, + "learning_rate": 1.996260647507797e-08, + "loss": 1.0289, + "step": 12614 + }, + { + "epoch": 0.9724791859389454, + "grad_norm": 3.7555932998657227, + "learning_rate": 1.9851313043747767e-08, + "loss": 0.8935, + "step": 12615 + }, + { + "epoch": 0.9725562750539624, + "grad_norm": 3.678717851638794, + "learning_rate": 1.9740330098895755e-08, + "loss": 0.8674, + "step": 12616 + }, + { + "epoch": 0.9726333641689794, + "grad_norm": 3.5781774520874023, + "learning_rate": 1.962965764744029e-08, + "loss": 0.9078, + "step": 12617 + }, + { + "epoch": 0.9727104532839963, + "grad_norm": 3.7126963138580322, + "learning_rate": 1.951929569628197e-08, + "loss": 0.8724, + "step": 12618 + }, + { + "epoch": 0.9727875423990132, + "grad_norm": 3.61206316947937, + "learning_rate": 1.9409244252301396e-08, + "loss": 0.788, + "step": 12619 + }, + { + "epoch": 0.9728646315140302, + "grad_norm": 3.783534049987793, + "learning_rate": 1.929950332235864e-08, + "loss": 0.9197, + "step": 12620 + }, + { + "epoch": 0.9729417206290472, + "grad_norm": 3.796304225921631, + "learning_rate": 1.9190072913296555e-08, + "loss": 0.8634, + "step": 12621 + }, + { + "epoch": 0.9730188097440642, + "grad_norm": 4.072221755981445, + "learning_rate": 1.908095303193691e-08, + "loss": 0.9239, + "step": 12622 + }, + { + "epoch": 0.9730958988590811, + "grad_norm": 4.112748146057129, + "learning_rate": 1.8972143685083155e-08, + "loss": 0.9495, + "step": 12623 + }, + { + "epoch": 0.973172987974098, + "grad_norm": 3.8566768169403076, + "learning_rate": 1.886364487951875e-08, + "loss": 0.9331, + "step": 12624 + }, + { + "epoch": 0.973250077089115, + "grad_norm": 3.7345011234283447, + "learning_rate": 1.8755456622008283e-08, + "loss": 0.8506, + "step": 12625 + }, + { + "epoch": 0.973327166204132, + "grad_norm": 4.711991310119629, + "learning_rate": 1.864757891929636e-08, + "loss": 0.934, + "step": 12626 + }, + { + "epoch": 0.973404255319149, + "grad_norm": 4.006261825561523, + "learning_rate": 1.8540011778108714e-08, + "loss": 1.0348, + "step": 12627 + }, + { + "epoch": 0.9734813444341659, + "grad_norm": 3.980159282684326, + "learning_rate": 1.84327552051522e-08, + "loss": 0.8444, + "step": 12628 + }, + { + "epoch": 0.9735584335491828, + "grad_norm": 3.6981513500213623, + "learning_rate": 1.8325809207112576e-08, + "loss": 0.9964, + "step": 12629 + }, + { + "epoch": 0.9736355226641998, + "grad_norm": 3.5920050144195557, + "learning_rate": 1.8219173790658406e-08, + "loss": 0.8259, + "step": 12630 + }, + { + "epoch": 0.9737126117792168, + "grad_norm": 3.436185836791992, + "learning_rate": 1.8112848962437146e-08, + "loss": 0.8754, + "step": 12631 + }, + { + "epoch": 0.9737897008942338, + "grad_norm": 3.517681360244751, + "learning_rate": 1.800683472907794e-08, + "loss": 0.8284, + "step": 12632 + }, + { + "epoch": 0.9738667900092507, + "grad_norm": 3.979491710662842, + "learning_rate": 1.7901131097190494e-08, + "loss": 0.9857, + "step": 12633 + }, + { + "epoch": 0.9739438791242676, + "grad_norm": 3.7884466648101807, + "learning_rate": 1.7795738073364543e-08, + "loss": 0.8687, + "step": 12634 + }, + { + "epoch": 0.9740209682392846, + "grad_norm": 3.576528787612915, + "learning_rate": 1.7690655664170388e-08, + "loss": 0.7963, + "step": 12635 + }, + { + "epoch": 0.9740980573543015, + "grad_norm": 4.090680122375488, + "learning_rate": 1.7585883876160003e-08, + "loss": 0.8982, + "step": 12636 + }, + { + "epoch": 0.9741751464693186, + "grad_norm": 3.728907346725464, + "learning_rate": 1.7481422715865394e-08, + "loss": 0.8825, + "step": 12637 + }, + { + "epoch": 0.9742522355843355, + "grad_norm": 3.741525411605835, + "learning_rate": 1.737727218979912e-08, + "loss": 0.9252, + "step": 12638 + }, + { + "epoch": 0.9743293246993524, + "grad_norm": 3.8002474308013916, + "learning_rate": 1.7273432304453774e-08, + "loss": 0.7882, + "step": 12639 + }, + { + "epoch": 0.9744064138143694, + "grad_norm": 3.745436191558838, + "learning_rate": 1.7169903066303618e-08, + "loss": 0.9055, + "step": 12640 + }, + { + "epoch": 0.9744835029293863, + "grad_norm": 3.9796302318573, + "learning_rate": 1.7066684481803486e-08, + "loss": 0.9246, + "step": 12641 + }, + { + "epoch": 0.9745605920444034, + "grad_norm": 3.8389484882354736, + "learning_rate": 1.6963776557388235e-08, + "loss": 0.82, + "step": 12642 + }, + { + "epoch": 0.9746376811594203, + "grad_norm": 3.872706651687622, + "learning_rate": 1.686117929947384e-08, + "loss": 0.9159, + "step": 12643 + }, + { + "epoch": 0.9747147702744372, + "grad_norm": 4.000880241394043, + "learning_rate": 1.6758892714456853e-08, + "loss": 0.8, + "step": 12644 + }, + { + "epoch": 0.9747918593894542, + "grad_norm": 3.820415735244751, + "learning_rate": 1.665691680871384e-08, + "loss": 0.9869, + "step": 12645 + }, + { + "epoch": 0.9748689485044711, + "grad_norm": 3.617678165435791, + "learning_rate": 1.6555251588602496e-08, + "loss": 0.9262, + "step": 12646 + }, + { + "epoch": 0.9749460376194882, + "grad_norm": 3.6912779808044434, + "learning_rate": 1.6453897060461076e-08, + "loss": 0.9492, + "step": 12647 + }, + { + "epoch": 0.9750231267345051, + "grad_norm": 3.9584383964538574, + "learning_rate": 1.6352853230609534e-08, + "loss": 0.972, + "step": 12648 + }, + { + "epoch": 0.975100215849522, + "grad_norm": 3.8923022747039795, + "learning_rate": 1.6252120105345604e-08, + "loss": 0.993, + "step": 12649 + }, + { + "epoch": 0.975177304964539, + "grad_norm": 3.5396828651428223, + "learning_rate": 1.6151697690951484e-08, + "loss": 0.83, + "step": 12650 + }, + { + "epoch": 0.9752543940795559, + "grad_norm": 3.774773359298706, + "learning_rate": 1.6051585993686614e-08, + "loss": 0.8335, + "step": 12651 + }, + { + "epoch": 0.975331483194573, + "grad_norm": 3.4751946926116943, + "learning_rate": 1.5951785019792666e-08, + "loss": 0.8478, + "step": 12652 + }, + { + "epoch": 0.9754085723095899, + "grad_norm": 3.7243905067443848, + "learning_rate": 1.5852294775491882e-08, + "loss": 0.8084, + "step": 12653 + }, + { + "epoch": 0.9754856614246068, + "grad_norm": 3.5833497047424316, + "learning_rate": 1.575311526698653e-08, + "loss": 0.7971, + "step": 12654 + }, + { + "epoch": 0.9755627505396238, + "grad_norm": 3.848736524581909, + "learning_rate": 1.56542465004611e-08, + "loss": 0.9441, + "step": 12655 + }, + { + "epoch": 0.9756398396546407, + "grad_norm": 3.8537116050720215, + "learning_rate": 1.5555688482078446e-08, + "loss": 0.8782, + "step": 12656 + }, + { + "epoch": 0.9757169287696578, + "grad_norm": 3.7755062580108643, + "learning_rate": 1.545744121798365e-08, + "loss": 0.823, + "step": 12657 + }, + { + "epoch": 0.9757940178846747, + "grad_norm": 3.62941837310791, + "learning_rate": 1.535950471430181e-08, + "loss": 0.8467, + "step": 12658 + }, + { + "epoch": 0.9758711069996916, + "grad_norm": 3.677624464035034, + "learning_rate": 1.52618789771386e-08, + "loss": 0.9067, + "step": 12659 + }, + { + "epoch": 0.9759481961147086, + "grad_norm": 3.693296432495117, + "learning_rate": 1.516456401258082e-08, + "loss": 0.8758, + "step": 12660 + }, + { + "epoch": 0.9760252852297255, + "grad_norm": 4.068735122680664, + "learning_rate": 1.5067559826695277e-08, + "loss": 1.0129, + "step": 12661 + }, + { + "epoch": 0.9761023743447426, + "grad_norm": 3.9047787189483643, + "learning_rate": 1.4970866425529916e-08, + "loss": 0.8848, + "step": 12662 + }, + { + "epoch": 0.9761794634597595, + "grad_norm": 3.9053268432617188, + "learning_rate": 1.4874483815112694e-08, + "loss": 0.9414, + "step": 12663 + }, + { + "epoch": 0.9762565525747764, + "grad_norm": 3.717155933380127, + "learning_rate": 1.477841200145269e-08, + "loss": 0.8827, + "step": 12664 + }, + { + "epoch": 0.9763336416897934, + "grad_norm": 3.714543581008911, + "learning_rate": 1.468265099053956e-08, + "loss": 0.9911, + "step": 12665 + }, + { + "epoch": 0.9764107308048103, + "grad_norm": 3.9254050254821777, + "learning_rate": 1.4587200788343524e-08, + "loss": 0.9414, + "step": 12666 + }, + { + "epoch": 0.9764878199198274, + "grad_norm": 3.731003999710083, + "learning_rate": 1.4492061400815383e-08, + "loss": 0.8444, + "step": 12667 + }, + { + "epoch": 0.9765649090348443, + "grad_norm": 3.458054542541504, + "learning_rate": 1.4397232833887053e-08, + "loss": 0.717, + "step": 12668 + }, + { + "epoch": 0.9766419981498612, + "grad_norm": 4.42410945892334, + "learning_rate": 1.4302715093469365e-08, + "loss": 0.8526, + "step": 12669 + }, + { + "epoch": 0.9767190872648782, + "grad_norm": 3.618612289428711, + "learning_rate": 1.4208508185456493e-08, + "loss": 0.9025, + "step": 12670 + }, + { + "epoch": 0.9767961763798951, + "grad_norm": 3.267320156097412, + "learning_rate": 1.4114612115720961e-08, + "loss": 0.8451, + "step": 12671 + }, + { + "epoch": 0.9768732654949122, + "grad_norm": 3.8861756324768066, + "learning_rate": 1.4021026890116418e-08, + "loss": 0.9779, + "step": 12672 + }, + { + "epoch": 0.9769503546099291, + "grad_norm": 3.7133381366729736, + "learning_rate": 1.3927752514478198e-08, + "loss": 0.9188, + "step": 12673 + }, + { + "epoch": 0.977027443724946, + "grad_norm": 4.010810375213623, + "learning_rate": 1.383478899462054e-08, + "loss": 0.9097, + "step": 12674 + }, + { + "epoch": 0.977104532839963, + "grad_norm": 4.118958473205566, + "learning_rate": 1.3742136336340473e-08, + "loss": 0.9476, + "step": 12675 + }, + { + "epoch": 0.9771816219549799, + "grad_norm": 4.091235160827637, + "learning_rate": 1.3649794545413376e-08, + "loss": 0.9998, + "step": 12676 + }, + { + "epoch": 0.977258711069997, + "grad_norm": 3.626465082168579, + "learning_rate": 1.3557763627596309e-08, + "loss": 0.9915, + "step": 12677 + }, + { + "epoch": 0.9773358001850139, + "grad_norm": 3.511397361755371, + "learning_rate": 1.3466043588628019e-08, + "loss": 0.9487, + "step": 12678 + }, + { + "epoch": 0.9774128893000308, + "grad_norm": 3.758230447769165, + "learning_rate": 1.3374634434225043e-08, + "loss": 0.9622, + "step": 12679 + }, + { + "epoch": 0.9774899784150478, + "grad_norm": 4.067485332489014, + "learning_rate": 1.3283536170087818e-08, + "loss": 1.0281, + "step": 12680 + }, + { + "epoch": 0.9775670675300647, + "grad_norm": 3.6686248779296875, + "learning_rate": 1.3192748801895139e-08, + "loss": 0.86, + "step": 12681 + }, + { + "epoch": 0.9776441566450818, + "grad_norm": 3.7029106616973877, + "learning_rate": 1.3102272335307476e-08, + "loss": 0.7519, + "step": 12682 + }, + { + "epoch": 0.9777212457600987, + "grad_norm": 3.997795343399048, + "learning_rate": 1.301210677596476e-08, + "loss": 0.8996, + "step": 12683 + }, + { + "epoch": 0.9777983348751156, + "grad_norm": 3.61710262298584, + "learning_rate": 1.2922252129489165e-08, + "loss": 0.9548, + "step": 12684 + }, + { + "epoch": 0.9778754239901326, + "grad_norm": 3.6802361011505127, + "learning_rate": 1.2832708401482319e-08, + "loss": 0.8405, + "step": 12685 + }, + { + "epoch": 0.9779525131051495, + "grad_norm": 3.580125331878662, + "learning_rate": 1.2743475597526978e-08, + "loss": 0.8522, + "step": 12686 + }, + { + "epoch": 0.9780296022201665, + "grad_norm": 3.6615421772003174, + "learning_rate": 1.265455372318647e-08, + "loss": 0.9192, + "step": 12687 + }, + { + "epoch": 0.9781066913351835, + "grad_norm": 3.9896399974823, + "learning_rate": 1.2565942784004692e-08, + "loss": 0.9484, + "step": 12688 + }, + { + "epoch": 0.9781837804502004, + "grad_norm": 4.462338447570801, + "learning_rate": 1.2477642785505006e-08, + "loss": 0.9435, + "step": 12689 + }, + { + "epoch": 0.9782608695652174, + "grad_norm": 3.643880605697632, + "learning_rate": 1.2389653733193563e-08, + "loss": 0.8626, + "step": 12690 + }, + { + "epoch": 0.9783379586802343, + "grad_norm": 3.689103841781616, + "learning_rate": 1.2301975632555973e-08, + "loss": 0.826, + "step": 12691 + }, + { + "epoch": 0.9784150477952513, + "grad_norm": 3.740290880203247, + "learning_rate": 1.2214608489057866e-08, + "loss": 0.8738, + "step": 12692 + }, + { + "epoch": 0.9784921369102683, + "grad_norm": 3.896019697189331, + "learning_rate": 1.2127552308147106e-08, + "loss": 0.9251, + "step": 12693 + }, + { + "epoch": 0.9785692260252852, + "grad_norm": 3.6664927005767822, + "learning_rate": 1.2040807095249907e-08, + "loss": 0.9496, + "step": 12694 + }, + { + "epoch": 0.9786463151403022, + "grad_norm": 3.8327767848968506, + "learning_rate": 1.1954372855775275e-08, + "loss": 0.991, + "step": 12695 + }, + { + "epoch": 0.9787234042553191, + "grad_norm": 3.853940486907959, + "learning_rate": 1.1868249595111681e-08, + "loss": 0.8489, + "step": 12696 + }, + { + "epoch": 0.9788004933703361, + "grad_norm": 3.322674512863159, + "learning_rate": 1.1782437318628714e-08, + "loss": 0.8567, + "step": 12697 + }, + { + "epoch": 0.9788775824853531, + "grad_norm": 3.6797235012054443, + "learning_rate": 1.1696936031676542e-08, + "loss": 0.8655, + "step": 12698 + }, + { + "epoch": 0.97895467160037, + "grad_norm": 4.247450351715088, + "learning_rate": 1.161174573958479e-08, + "loss": 1.0226, + "step": 12699 + }, + { + "epoch": 0.979031760715387, + "grad_norm": 3.664731979370117, + "learning_rate": 1.152686644766532e-08, + "loss": 0.8835, + "step": 12700 + }, + { + "epoch": 0.9791088498304039, + "grad_norm": 3.965914249420166, + "learning_rate": 1.1442298161209453e-08, + "loss": 0.8815, + "step": 12701 + }, + { + "epoch": 0.9791859389454209, + "grad_norm": 4.060098648071289, + "learning_rate": 1.1358040885490196e-08, + "loss": 0.8988, + "step": 12702 + }, + { + "epoch": 0.9792630280604379, + "grad_norm": 3.575415849685669, + "learning_rate": 1.1274094625760013e-08, + "loss": 0.8422, + "step": 12703 + }, + { + "epoch": 0.9793401171754548, + "grad_norm": 3.5373342037200928, + "learning_rate": 1.119045938725305e-08, + "loss": 1.0148, + "step": 12704 + }, + { + "epoch": 0.9794172062904718, + "grad_norm": 3.689316511154175, + "learning_rate": 1.110713517518347e-08, + "loss": 0.9632, + "step": 12705 + }, + { + "epoch": 0.9794942954054887, + "grad_norm": 3.855285882949829, + "learning_rate": 1.1024121994745452e-08, + "loss": 0.929, + "step": 12706 + }, + { + "epoch": 0.9795713845205057, + "grad_norm": 3.655165910720825, + "learning_rate": 1.0941419851114853e-08, + "loss": 0.9428, + "step": 12707 + }, + { + "epoch": 0.9796484736355227, + "grad_norm": 3.5580670833587646, + "learning_rate": 1.0859028749447552e-08, + "loss": 0.8268, + "step": 12708 + }, + { + "epoch": 0.9797255627505396, + "grad_norm": 3.758411407470703, + "learning_rate": 1.0776948694881107e-08, + "loss": 0.9141, + "step": 12709 + }, + { + "epoch": 0.9798026518655566, + "grad_norm": 3.422999858856201, + "learning_rate": 1.0695179692531421e-08, + "loss": 0.8323, + "step": 12710 + }, + { + "epoch": 0.9798797409805735, + "grad_norm": 3.754002809524536, + "learning_rate": 1.0613721747497196e-08, + "loss": 0.942, + "step": 12711 + }, + { + "epoch": 0.9799568300955905, + "grad_norm": 3.7836527824401855, + "learning_rate": 1.0532574864856592e-08, + "loss": 0.9146, + "step": 12712 + }, + { + "epoch": 0.9800339192106075, + "grad_norm": 3.721501350402832, + "learning_rate": 1.0451739049668896e-08, + "loss": 0.9326, + "step": 12713 + }, + { + "epoch": 0.9801110083256244, + "grad_norm": 3.7368814945220947, + "learning_rate": 1.0371214306973964e-08, + "loss": 0.9373, + "step": 12714 + }, + { + "epoch": 0.9801880974406414, + "grad_norm": 3.6165754795074463, + "learning_rate": 1.029100064179167e-08, + "loss": 0.9012, + "step": 12715 + }, + { + "epoch": 0.9802651865556583, + "grad_norm": 3.814692497253418, + "learning_rate": 1.0211098059123015e-08, + "loss": 0.8587, + "step": 12716 + }, + { + "epoch": 0.9803422756706753, + "grad_norm": 3.516226053237915, + "learning_rate": 1.0131506563950121e-08, + "loss": 0.9172, + "step": 12717 + }, + { + "epoch": 0.9804193647856922, + "grad_norm": 3.7366223335266113, + "learning_rate": 1.0052226161234024e-08, + "loss": 0.8711, + "step": 12718 + }, + { + "epoch": 0.9804964539007093, + "grad_norm": 3.5088720321655273, + "learning_rate": 9.97325685591799e-09, + "loss": 0.8817, + "step": 12719 + }, + { + "epoch": 0.9805735430157262, + "grad_norm": 3.6304702758789062, + "learning_rate": 9.894598652925857e-09, + "loss": 0.9045, + "step": 12720 + }, + { + "epoch": 0.9806506321307431, + "grad_norm": 3.697366952896118, + "learning_rate": 9.816251557160927e-09, + "loss": 0.9645, + "step": 12721 + }, + { + "epoch": 0.9807277212457601, + "grad_norm": 3.8746864795684814, + "learning_rate": 9.738215573507625e-09, + "loss": 0.9756, + "step": 12722 + }, + { + "epoch": 0.980804810360777, + "grad_norm": 3.758707046508789, + "learning_rate": 9.660490706831505e-09, + "loss": 0.8504, + "step": 12723 + }, + { + "epoch": 0.9808818994757941, + "grad_norm": 3.641058921813965, + "learning_rate": 9.583076961978132e-09, + "loss": 0.7918, + "step": 12724 + }, + { + "epoch": 0.980958988590811, + "grad_norm": 3.7192978858947754, + "learning_rate": 9.505974343774205e-09, + "loss": 0.9248, + "step": 12725 + }, + { + "epoch": 0.9810360777058279, + "grad_norm": 3.7372353076934814, + "learning_rate": 9.429182857025876e-09, + "loss": 0.8648, + "step": 12726 + }, + { + "epoch": 0.9811131668208449, + "grad_norm": 3.754185199737549, + "learning_rate": 9.352702506521539e-09, + "loss": 0.8707, + "step": 12727 + }, + { + "epoch": 0.9811902559358618, + "grad_norm": 3.9802560806274414, + "learning_rate": 9.276533297028489e-09, + "loss": 0.9176, + "step": 12728 + }, + { + "epoch": 0.9812673450508789, + "grad_norm": 3.81831431388855, + "learning_rate": 9.200675233296263e-09, + "loss": 0.8515, + "step": 12729 + }, + { + "epoch": 0.9813444341658958, + "grad_norm": 3.8928399085998535, + "learning_rate": 9.125128320053856e-09, + "loss": 0.9243, + "step": 12730 + }, + { + "epoch": 0.9814215232809127, + "grad_norm": 3.8177058696746826, + "learning_rate": 9.049892562011387e-09, + "loss": 0.908, + "step": 12731 + }, + { + "epoch": 0.9814986123959297, + "grad_norm": 3.94755220413208, + "learning_rate": 8.974967963858994e-09, + "loss": 0.906, + "step": 12732 + }, + { + "epoch": 0.9815757015109466, + "grad_norm": 3.647578477859497, + "learning_rate": 8.900354530268497e-09, + "loss": 0.8664, + "step": 12733 + }, + { + "epoch": 0.9816527906259637, + "grad_norm": 3.8050804138183594, + "learning_rate": 8.826052265891172e-09, + "loss": 0.9537, + "step": 12734 + }, + { + "epoch": 0.9817298797409806, + "grad_norm": 3.629077672958374, + "learning_rate": 8.752061175359983e-09, + "loss": 0.8645, + "step": 12735 + }, + { + "epoch": 0.9818069688559975, + "grad_norm": 3.9921088218688965, + "learning_rate": 8.67838126328735e-09, + "loss": 0.98, + "step": 12736 + }, + { + "epoch": 0.9818840579710145, + "grad_norm": 3.7594244480133057, + "learning_rate": 8.605012534266265e-09, + "loss": 0.8998, + "step": 12737 + }, + { + "epoch": 0.9819611470860314, + "grad_norm": 3.63808536529541, + "learning_rate": 8.53195499287196e-09, + "loss": 0.9154, + "step": 12738 + }, + { + "epoch": 0.9820382362010485, + "grad_norm": 3.701803207397461, + "learning_rate": 8.459208643659122e-09, + "loss": 0.8308, + "step": 12739 + }, + { + "epoch": 0.9821153253160654, + "grad_norm": 3.5713162422180176, + "learning_rate": 8.386773491162458e-09, + "loss": 0.9392, + "step": 12740 + }, + { + "epoch": 0.9821924144310823, + "grad_norm": 3.8430564403533936, + "learning_rate": 8.314649539898357e-09, + "loss": 0.8847, + "step": 12741 + }, + { + "epoch": 0.9822695035460993, + "grad_norm": 3.8548264503479004, + "learning_rate": 8.242836794362662e-09, + "loss": 0.9908, + "step": 12742 + }, + { + "epoch": 0.9823465926611162, + "grad_norm": 4.117403984069824, + "learning_rate": 8.17133525903402e-09, + "loss": 0.8986, + "step": 12743 + }, + { + "epoch": 0.9824236817761333, + "grad_norm": 3.5124740600585938, + "learning_rate": 8.100144938368304e-09, + "loss": 0.9653, + "step": 12744 + }, + { + "epoch": 0.9825007708911502, + "grad_norm": 3.7108638286590576, + "learning_rate": 8.029265836805855e-09, + "loss": 0.8482, + "step": 12745 + }, + { + "epoch": 0.9825778600061671, + "grad_norm": 3.5516583919525146, + "learning_rate": 7.958697958763695e-09, + "loss": 0.9324, + "step": 12746 + }, + { + "epoch": 0.9826549491211841, + "grad_norm": 4.330380439758301, + "learning_rate": 7.888441308642746e-09, + "loss": 0.9597, + "step": 12747 + }, + { + "epoch": 0.982732038236201, + "grad_norm": 3.6358656883239746, + "learning_rate": 7.818495890822842e-09, + "loss": 0.8769, + "step": 12748 + }, + { + "epoch": 0.9828091273512181, + "grad_norm": 3.5097153186798096, + "learning_rate": 7.748861709664379e-09, + "loss": 0.8299, + "step": 12749 + }, + { + "epoch": 0.982886216466235, + "grad_norm": 4.351414203643799, + "learning_rate": 7.679538769508887e-09, + "loss": 0.9594, + "step": 12750 + }, + { + "epoch": 0.9829633055812519, + "grad_norm": 4.271960735321045, + "learning_rate": 7.61052707467791e-09, + "loss": 0.8921, + "step": 12751 + }, + { + "epoch": 0.9830403946962689, + "grad_norm": 3.7134621143341064, + "learning_rate": 7.541826629474669e-09, + "loss": 0.8811, + "step": 12752 + }, + { + "epoch": 0.9831174838112858, + "grad_norm": 3.69516921043396, + "learning_rate": 7.473437438181852e-09, + "loss": 0.9587, + "step": 12753 + }, + { + "epoch": 0.9831945729263029, + "grad_norm": 3.7787835597991943, + "learning_rate": 7.40535950506327e-09, + "loss": 0.9491, + "step": 12754 + }, + { + "epoch": 0.9832716620413198, + "grad_norm": 4.08289909362793, + "learning_rate": 7.3375928343633054e-09, + "loss": 0.9131, + "step": 12755 + }, + { + "epoch": 0.9833487511563367, + "grad_norm": 3.624502658843994, + "learning_rate": 7.2701374303063565e-09, + "loss": 0.9144, + "step": 12756 + }, + { + "epoch": 0.9834258402713537, + "grad_norm": 3.4605467319488525, + "learning_rate": 7.202993297099059e-09, + "loss": 0.8354, + "step": 12757 + }, + { + "epoch": 0.9835029293863706, + "grad_norm": 3.8539695739746094, + "learning_rate": 7.136160438925843e-09, + "loss": 0.8254, + "step": 12758 + }, + { + "epoch": 0.9835800185013877, + "grad_norm": 3.464372396469116, + "learning_rate": 7.069638859955041e-09, + "loss": 0.8553, + "step": 12759 + }, + { + "epoch": 0.9836571076164046, + "grad_norm": 3.6082799434661865, + "learning_rate": 7.003428564332782e-09, + "loss": 0.9381, + "step": 12760 + }, + { + "epoch": 0.9837341967314215, + "grad_norm": 3.6667985916137695, + "learning_rate": 6.937529556187428e-09, + "loss": 0.962, + "step": 12761 + }, + { + "epoch": 0.9838112858464385, + "grad_norm": 3.6042168140411377, + "learning_rate": 6.871941839627916e-09, + "loss": 0.8476, + "step": 12762 + }, + { + "epoch": 0.9838883749614554, + "grad_norm": 3.740255832672119, + "learning_rate": 6.8066654187420865e-09, + "loss": 0.9196, + "step": 12763 + }, + { + "epoch": 0.9839654640764725, + "grad_norm": 3.6301140785217285, + "learning_rate": 6.741700297600573e-09, + "loss": 0.7799, + "step": 12764 + }, + { + "epoch": 0.9840425531914894, + "grad_norm": 3.6463210582733154, + "learning_rate": 6.677046480252913e-09, + "loss": 0.8718, + "step": 12765 + }, + { + "epoch": 0.9841196423065063, + "grad_norm": 4.018143653869629, + "learning_rate": 6.6127039707308826e-09, + "loss": 1.0302, + "step": 12766 + }, + { + "epoch": 0.9841967314215233, + "grad_norm": 3.6346871852874756, + "learning_rate": 6.548672773045161e-09, + "loss": 0.9883, + "step": 12767 + }, + { + "epoch": 0.9842738205365402, + "grad_norm": 3.8458900451660156, + "learning_rate": 6.4849528911881125e-09, + "loss": 0.8449, + "step": 12768 + }, + { + "epoch": 0.9843509096515572, + "grad_norm": 3.548745632171631, + "learning_rate": 6.421544329131557e-09, + "loss": 0.738, + "step": 12769 + }, + { + "epoch": 0.9844279987665742, + "grad_norm": 3.7064075469970703, + "learning_rate": 6.358447090829556e-09, + "loss": 0.8474, + "step": 12770 + }, + { + "epoch": 0.9845050878815911, + "grad_norm": 3.6911072731018066, + "learning_rate": 6.295661180216184e-09, + "loss": 0.911, + "step": 12771 + }, + { + "epoch": 0.9845821769966081, + "grad_norm": 3.657993793487549, + "learning_rate": 6.2331866012044215e-09, + "loss": 0.9808, + "step": 12772 + }, + { + "epoch": 0.984659266111625, + "grad_norm": 3.437884569168091, + "learning_rate": 6.171023357690598e-09, + "loss": 0.8563, + "step": 12773 + }, + { + "epoch": 0.984736355226642, + "grad_norm": 3.694701910018921, + "learning_rate": 6.109171453549944e-09, + "loss": 0.798, + "step": 12774 + }, + { + "epoch": 0.984813444341659, + "grad_norm": 3.68932843208313, + "learning_rate": 6.0476308926377125e-09, + "loss": 0.8745, + "step": 12775 + }, + { + "epoch": 0.9848905334566759, + "grad_norm": 3.8350744247436523, + "learning_rate": 5.986401678791942e-09, + "loss": 0.9399, + "step": 12776 + }, + { + "epoch": 0.9849676225716929, + "grad_norm": 4.023548126220703, + "learning_rate": 5.9254838158295805e-09, + "loss": 0.835, + "step": 12777 + }, + { + "epoch": 0.9850447116867098, + "grad_norm": 3.844338893890381, + "learning_rate": 5.864877307547589e-09, + "loss": 1.0251, + "step": 12778 + }, + { + "epoch": 0.9851218008017268, + "grad_norm": 3.5062670707702637, + "learning_rate": 5.804582157725724e-09, + "loss": 0.8741, + "step": 12779 + }, + { + "epoch": 0.9851988899167438, + "grad_norm": 3.6520488262176514, + "learning_rate": 5.744598370122645e-09, + "loss": 0.9983, + "step": 12780 + }, + { + "epoch": 0.9852759790317607, + "grad_norm": 3.605093479156494, + "learning_rate": 5.684925948477582e-09, + "loss": 0.8435, + "step": 12781 + }, + { + "epoch": 0.9853530681467777, + "grad_norm": 3.9544193744659424, + "learning_rate": 5.625564896511448e-09, + "loss": 0.899, + "step": 12782 + }, + { + "epoch": 0.9854301572617946, + "grad_norm": 3.8571465015411377, + "learning_rate": 5.566515217924617e-09, + "loss": 0.8574, + "step": 12783 + }, + { + "epoch": 0.9855072463768116, + "grad_norm": 3.823538303375244, + "learning_rate": 5.507776916398588e-09, + "loss": 0.9406, + "step": 12784 + }, + { + "epoch": 0.9855843354918286, + "grad_norm": 4.045183181762695, + "learning_rate": 5.4493499955959874e-09, + "loss": 0.804, + "step": 12785 + }, + { + "epoch": 0.9856614246068455, + "grad_norm": 3.724013090133667, + "learning_rate": 5.391234459158901e-09, + "loss": 0.9158, + "step": 12786 + }, + { + "epoch": 0.9857385137218625, + "grad_norm": 3.712146759033203, + "learning_rate": 5.333430310709986e-09, + "loss": 1.0271, + "step": 12787 + }, + { + "epoch": 0.9858156028368794, + "grad_norm": 4.106431484222412, + "learning_rate": 5.2759375538541376e-09, + "loss": 0.9228, + "step": 12788 + }, + { + "epoch": 0.9858926919518964, + "grad_norm": 3.821424961090088, + "learning_rate": 5.218756192175156e-09, + "loss": 0.974, + "step": 12789 + }, + { + "epoch": 0.9859697810669134, + "grad_norm": 3.7011265754699707, + "learning_rate": 5.161886229238522e-09, + "loss": 0.81, + "step": 12790 + }, + { + "epoch": 0.9860468701819303, + "grad_norm": 3.595111608505249, + "learning_rate": 5.105327668588622e-09, + "loss": 0.9212, + "step": 12791 + }, + { + "epoch": 0.9861239592969473, + "grad_norm": 3.862673759460449, + "learning_rate": 5.049080513752636e-09, + "loss": 0.9619, + "step": 12792 + }, + { + "epoch": 0.9862010484119642, + "grad_norm": 3.584799289703369, + "learning_rate": 4.993144768237201e-09, + "loss": 0.9546, + "step": 12793 + }, + { + "epoch": 0.9862781375269812, + "grad_norm": 3.9195289611816406, + "learning_rate": 4.937520435528975e-09, + "loss": 0.9763, + "step": 12794 + }, + { + "epoch": 0.9863552266419982, + "grad_norm": 3.7915120124816895, + "learning_rate": 4.8822075190962936e-09, + "loss": 0.7143, + "step": 12795 + }, + { + "epoch": 0.9864323157570151, + "grad_norm": 3.788917303085327, + "learning_rate": 4.827206022388065e-09, + "loss": 0.9824, + "step": 12796 + }, + { + "epoch": 0.9865094048720321, + "grad_norm": 3.605607509613037, + "learning_rate": 4.772515948832657e-09, + "loss": 0.8145, + "step": 12797 + }, + { + "epoch": 0.986586493987049, + "grad_norm": 3.643545150756836, + "learning_rate": 4.718137301839565e-09, + "loss": 0.8363, + "step": 12798 + }, + { + "epoch": 0.986663583102066, + "grad_norm": 4.066047668457031, + "learning_rate": 4.66407008479941e-09, + "loss": 0.9509, + "step": 12799 + }, + { + "epoch": 0.986740672217083, + "grad_norm": 3.583989381790161, + "learning_rate": 4.6103143010833854e-09, + "loss": 0.8493, + "step": 12800 + }, + { + "epoch": 0.9868177613320999, + "grad_norm": 3.6887717247009277, + "learning_rate": 4.556869954042143e-09, + "loss": 0.8542, + "step": 12801 + }, + { + "epoch": 0.9868948504471169, + "grad_norm": 3.8332138061523438, + "learning_rate": 4.5037370470085715e-09, + "loss": 0.9282, + "step": 12802 + }, + { + "epoch": 0.9869719395621338, + "grad_norm": 3.6815474033355713, + "learning_rate": 4.450915583293913e-09, + "loss": 0.8847, + "step": 12803 + }, + { + "epoch": 0.9870490286771508, + "grad_norm": 3.505082607269287, + "learning_rate": 4.398405566192754e-09, + "loss": 0.7977, + "step": 12804 + }, + { + "epoch": 0.9871261177921677, + "grad_norm": 3.825004816055298, + "learning_rate": 4.346206998977476e-09, + "loss": 1.0071, + "step": 12805 + }, + { + "epoch": 0.9872032069071847, + "grad_norm": 4.440659999847412, + "learning_rate": 4.294319884903253e-09, + "loss": 1.0099, + "step": 12806 + }, + { + "epoch": 0.9872802960222017, + "grad_norm": 3.277602195739746, + "learning_rate": 4.24274422720472e-09, + "loss": 0.8623, + "step": 12807 + }, + { + "epoch": 0.9873573851372186, + "grad_norm": 3.6194324493408203, + "learning_rate": 4.191480029097639e-09, + "loss": 0.9843, + "step": 12808 + }, + { + "epoch": 0.9874344742522356, + "grad_norm": 3.5237951278686523, + "learning_rate": 4.140527293777785e-09, + "loss": 0.7906, + "step": 12809 + }, + { + "epoch": 0.9875115633672525, + "grad_norm": 4.282720565795898, + "learning_rate": 4.089886024421508e-09, + "loss": 0.9852, + "step": 12810 + }, + { + "epoch": 0.9875886524822695, + "grad_norm": 3.8325626850128174, + "learning_rate": 4.039556224186836e-09, + "loss": 0.9432, + "step": 12811 + }, + { + "epoch": 0.9876657415972865, + "grad_norm": 3.7518670558929443, + "learning_rate": 3.989537896210704e-09, + "loss": 0.8936, + "step": 12812 + }, + { + "epoch": 0.9877428307123034, + "grad_norm": 3.3281097412109375, + "learning_rate": 3.93983104361173e-09, + "loss": 0.8703, + "step": 12813 + }, + { + "epoch": 0.9878199198273204, + "grad_norm": 3.881019353866577, + "learning_rate": 3.890435669489656e-09, + "loss": 0.8354, + "step": 12814 + }, + { + "epoch": 0.9878970089423373, + "grad_norm": 3.5037472248077393, + "learning_rate": 3.84135177692313e-09, + "loss": 0.9291, + "step": 12815 + }, + { + "epoch": 0.9879740980573543, + "grad_norm": 3.938447952270508, + "learning_rate": 3.792579368972482e-09, + "loss": 0.9214, + "step": 12816 + }, + { + "epoch": 0.9880511871723713, + "grad_norm": 4.022164344787598, + "learning_rate": 3.744118448678058e-09, + "loss": 0.8379, + "step": 12817 + }, + { + "epoch": 0.9881282762873882, + "grad_norm": 3.696821451187134, + "learning_rate": 3.6959690190618847e-09, + "loss": 0.8365, + "step": 12818 + }, + { + "epoch": 0.9882053654024052, + "grad_norm": 3.4798009395599365, + "learning_rate": 3.6481310831260054e-09, + "loss": 0.8654, + "step": 12819 + }, + { + "epoch": 0.9882824545174221, + "grad_norm": 3.6241512298583984, + "learning_rate": 3.600604643851924e-09, + "loss": 0.8805, + "step": 12820 + }, + { + "epoch": 0.988359543632439, + "grad_norm": 3.9195075035095215, + "learning_rate": 3.5533897042033805e-09, + "loss": 0.9811, + "step": 12821 + }, + { + "epoch": 0.9884366327474561, + "grad_norm": 4.072207927703857, + "learning_rate": 3.5064862671230217e-09, + "loss": 0.9203, + "step": 12822 + }, + { + "epoch": 0.988513721862473, + "grad_norm": 4.044565200805664, + "learning_rate": 3.4598943355362845e-09, + "loss": 0.9196, + "step": 12823 + }, + { + "epoch": 0.98859081097749, + "grad_norm": 3.762471914291382, + "learning_rate": 3.4136139123475134e-09, + "loss": 0.8305, + "step": 12824 + }, + { + "epoch": 0.9886679000925069, + "grad_norm": 3.648581027984619, + "learning_rate": 3.3676450004416215e-09, + "loss": 0.8875, + "step": 12825 + }, + { + "epoch": 0.9887449892075239, + "grad_norm": 3.582662343978882, + "learning_rate": 3.321987602685206e-09, + "loss": 0.8568, + "step": 12826 + }, + { + "epoch": 0.9888220783225409, + "grad_norm": 3.838958978652954, + "learning_rate": 3.276641721924323e-09, + "loss": 0.9401, + "step": 12827 + }, + { + "epoch": 0.9888991674375578, + "grad_norm": 3.704432964324951, + "learning_rate": 3.2316073609856e-09, + "loss": 0.8952, + "step": 12828 + }, + { + "epoch": 0.9889762565525748, + "grad_norm": 3.5584044456481934, + "learning_rate": 3.186884522677902e-09, + "loss": 1.0251, + "step": 12829 + }, + { + "epoch": 0.9890533456675917, + "grad_norm": 3.896476984024048, + "learning_rate": 3.1424732097884434e-09, + "loss": 0.888, + "step": 12830 + }, + { + "epoch": 0.9891304347826086, + "grad_norm": 3.5131354331970215, + "learning_rate": 3.098373425086676e-09, + "loss": 0.8817, + "step": 12831 + }, + { + "epoch": 0.9892075238976257, + "grad_norm": 3.6488070487976074, + "learning_rate": 3.0545851713215113e-09, + "loss": 0.8898, + "step": 12832 + }, + { + "epoch": 0.9892846130126426, + "grad_norm": 3.468665361404419, + "learning_rate": 3.011108451222988e-09, + "loss": 0.8596, + "step": 12833 + }, + { + "epoch": 0.9893617021276596, + "grad_norm": 3.568416118621826, + "learning_rate": 2.9679432675017163e-09, + "loss": 0.9725, + "step": 12834 + }, + { + "epoch": 0.9894387912426765, + "grad_norm": 4.020820617675781, + "learning_rate": 2.9250896228494307e-09, + "loss": 0.8734, + "step": 12835 + }, + { + "epoch": 0.9895158803576934, + "grad_norm": 3.5080959796905518, + "learning_rate": 2.8825475199367735e-09, + "loss": 0.8361, + "step": 12836 + }, + { + "epoch": 0.9895929694727105, + "grad_norm": 3.717933416366577, + "learning_rate": 2.8403169614166226e-09, + "loss": 0.7573, + "step": 12837 + }, + { + "epoch": 0.9896700585877274, + "grad_norm": 3.621506690979004, + "learning_rate": 2.7983979499218715e-09, + "loss": 0.903, + "step": 12838 + }, + { + "epoch": 0.9897471477027444, + "grad_norm": 3.9174814224243164, + "learning_rate": 2.756790488065986e-09, + "loss": 0.8788, + "step": 12839 + }, + { + "epoch": 0.9898242368177613, + "grad_norm": 4.379630088806152, + "learning_rate": 2.7154945784424456e-09, + "loss": 0.912, + "step": 12840 + }, + { + "epoch": 0.9899013259327782, + "grad_norm": 3.8400206565856934, + "learning_rate": 2.6745102236264142e-09, + "loss": 0.9096, + "step": 12841 + }, + { + "epoch": 0.9899784150477953, + "grad_norm": 3.97227144241333, + "learning_rate": 2.633837426173069e-09, + "loss": 0.8001, + "step": 12842 + }, + { + "epoch": 0.9900555041628122, + "grad_norm": 3.7930757999420166, + "learning_rate": 2.5934761886176052e-09, + "loss": 0.886, + "step": 12843 + }, + { + "epoch": 0.9901325932778292, + "grad_norm": 3.9105029106140137, + "learning_rate": 2.5534265134768977e-09, + "loss": 0.9109, + "step": 12844 + }, + { + "epoch": 0.9902096823928461, + "grad_norm": 3.479025363922119, + "learning_rate": 2.513688403247283e-09, + "loss": 0.8513, + "step": 12845 + }, + { + "epoch": 0.990286771507863, + "grad_norm": 4.080109119415283, + "learning_rate": 2.474261860406779e-09, + "loss": 0.8726, + "step": 12846 + }, + { + "epoch": 0.9903638606228801, + "grad_norm": 3.6820366382598877, + "learning_rate": 2.4351468874134197e-09, + "loss": 0.8961, + "step": 12847 + }, + { + "epoch": 0.990440949737897, + "grad_norm": 3.6713099479675293, + "learning_rate": 2.396343486705255e-09, + "loss": 0.9043, + "step": 12848 + }, + { + "epoch": 0.990518038852914, + "grad_norm": 3.635523796081543, + "learning_rate": 2.3578516607020154e-09, + "loss": 0.8773, + "step": 12849 + }, + { + "epoch": 0.9905951279679309, + "grad_norm": 3.570716142654419, + "learning_rate": 2.319671411802893e-09, + "loss": 0.9127, + "step": 12850 + }, + { + "epoch": 0.9906722170829478, + "grad_norm": 3.6702277660369873, + "learning_rate": 2.281802742388761e-09, + "loss": 0.852, + "step": 12851 + }, + { + "epoch": 0.9907493061979649, + "grad_norm": 3.9197945594787598, + "learning_rate": 2.2442456548205094e-09, + "loss": 0.9476, + "step": 12852 + }, + { + "epoch": 0.9908263953129818, + "grad_norm": 3.8219149112701416, + "learning_rate": 2.207000151439598e-09, + "loss": 0.9928, + "step": 12853 + }, + { + "epoch": 0.9909034844279988, + "grad_norm": 3.7937533855438232, + "learning_rate": 2.170066234568058e-09, + "loss": 1.061, + "step": 12854 + }, + { + "epoch": 0.9909805735430157, + "grad_norm": 3.5674586296081543, + "learning_rate": 2.133443906508492e-09, + "loss": 0.8648, + "step": 12855 + }, + { + "epoch": 0.9910576626580326, + "grad_norm": 3.8357954025268555, + "learning_rate": 2.097133169543519e-09, + "loss": 0.9061, + "step": 12856 + }, + { + "epoch": 0.9911347517730497, + "grad_norm": 3.6642403602600098, + "learning_rate": 2.0611340259379942e-09, + "loss": 0.9031, + "step": 12857 + }, + { + "epoch": 0.9912118408880666, + "grad_norm": 3.718217134475708, + "learning_rate": 2.0254464779356776e-09, + "loss": 0.8826, + "step": 12858 + }, + { + "epoch": 0.9912889300030836, + "grad_norm": 3.5286970138549805, + "learning_rate": 1.990070527761456e-09, + "loss": 0.8263, + "step": 12859 + }, + { + "epoch": 0.9913660191181005, + "grad_norm": 3.934288263320923, + "learning_rate": 1.9550061776213435e-09, + "loss": 0.8804, + "step": 12860 + }, + { + "epoch": 0.9914431082331174, + "grad_norm": 3.582671880722046, + "learning_rate": 1.9202534297008134e-09, + "loss": 0.8939, + "step": 12861 + }, + { + "epoch": 0.9915201973481345, + "grad_norm": 3.763944387435913, + "learning_rate": 1.8858122861664664e-09, + "loss": 0.9044, + "step": 12862 + }, + { + "epoch": 0.9915972864631514, + "grad_norm": 4.1658196449279785, + "learning_rate": 1.8516827491660282e-09, + "loss": 0.8877, + "step": 12863 + }, + { + "epoch": 0.9916743755781684, + "grad_norm": 3.695138692855835, + "learning_rate": 1.817864820827242e-09, + "loss": 0.8791, + "step": 12864 + }, + { + "epoch": 0.9917514646931853, + "grad_norm": 4.022881984710693, + "learning_rate": 1.784358503258421e-09, + "loss": 1.0657, + "step": 12865 + }, + { + "epoch": 0.9918285538082022, + "grad_norm": 3.7440459728240967, + "learning_rate": 1.7511637985478947e-09, + "loss": 0.8766, + "step": 12866 + }, + { + "epoch": 0.9919056429232193, + "grad_norm": 3.9772725105285645, + "learning_rate": 1.718280708766229e-09, + "loss": 0.9796, + "step": 12867 + }, + { + "epoch": 0.9919827320382362, + "grad_norm": 3.865461826324463, + "learning_rate": 1.6857092359628957e-09, + "loss": 0.8336, + "step": 12868 + }, + { + "epoch": 0.9920598211532532, + "grad_norm": 4.357512474060059, + "learning_rate": 1.6534493821684927e-09, + "loss": 1.0112, + "step": 12869 + }, + { + "epoch": 0.9921369102682701, + "grad_norm": 3.5063059329986572, + "learning_rate": 1.621501149394744e-09, + "loss": 0.8071, + "step": 12870 + }, + { + "epoch": 0.992213999383287, + "grad_norm": 3.5108940601348877, + "learning_rate": 1.5898645396328349e-09, + "loss": 0.9605, + "step": 12871 + }, + { + "epoch": 0.992291088498304, + "grad_norm": 3.727933883666992, + "learning_rate": 1.5585395548556314e-09, + "loss": 0.9568, + "step": 12872 + }, + { + "epoch": 0.992368177613321, + "grad_norm": 3.5517189502716064, + "learning_rate": 1.5275261970154608e-09, + "loss": 0.9227, + "step": 12873 + }, + { + "epoch": 0.992445266728338, + "grad_norm": 3.8274848461151123, + "learning_rate": 1.4968244680468868e-09, + "loss": 1.0268, + "step": 12874 + }, + { + "epoch": 0.9925223558433549, + "grad_norm": 3.797487258911133, + "learning_rate": 1.4664343698628236e-09, + "loss": 0.7404, + "step": 12875 + }, + { + "epoch": 0.9925994449583718, + "grad_norm": 3.6216273307800293, + "learning_rate": 1.436355904358977e-09, + "loss": 0.919, + "step": 12876 + }, + { + "epoch": 0.9926765340733888, + "grad_norm": 3.420397996902466, + "learning_rate": 1.4065890734099586e-09, + "loss": 0.8691, + "step": 12877 + }, + { + "epoch": 0.9927536231884058, + "grad_norm": 3.8331243991851807, + "learning_rate": 1.3771338788715061e-09, + "loss": 0.945, + "step": 12878 + }, + { + "epoch": 0.9928307123034228, + "grad_norm": 3.9298641681671143, + "learning_rate": 1.3479903225804836e-09, + "loss": 0.937, + "step": 12879 + }, + { + "epoch": 0.9929078014184397, + "grad_norm": 3.827240467071533, + "learning_rate": 1.319158406353771e-09, + "loss": 0.9731, + "step": 12880 + }, + { + "epoch": 0.9929848905334566, + "grad_norm": 3.64302921295166, + "learning_rate": 1.2906381319882644e-09, + "loss": 0.8572, + "step": 12881 + }, + { + "epoch": 0.9930619796484736, + "grad_norm": 3.7599618434906006, + "learning_rate": 1.2624295012625409e-09, + "loss": 0.954, + "step": 12882 + }, + { + "epoch": 0.9931390687634906, + "grad_norm": 3.944673776626587, + "learning_rate": 1.2345325159357491e-09, + "loss": 0.9175, + "step": 12883 + }, + { + "epoch": 0.9932161578785076, + "grad_norm": 3.6056995391845703, + "learning_rate": 1.2069471777459429e-09, + "loss": 0.8907, + "step": 12884 + }, + { + "epoch": 0.9932932469935245, + "grad_norm": 3.5463924407958984, + "learning_rate": 1.1796734884139682e-09, + "loss": 0.8499, + "step": 12885 + }, + { + "epoch": 0.9933703361085414, + "grad_norm": 3.643226385116577, + "learning_rate": 1.1527114496395764e-09, + "loss": 0.9093, + "step": 12886 + }, + { + "epoch": 0.9934474252235584, + "grad_norm": 3.62485671043396, + "learning_rate": 1.126061063103645e-09, + "loss": 0.9344, + "step": 12887 + }, + { + "epoch": 0.9935245143385754, + "grad_norm": 3.879370927810669, + "learning_rate": 1.0997223304687333e-09, + "loss": 0.9439, + "step": 12888 + }, + { + "epoch": 0.9936016034535924, + "grad_norm": 3.560511350631714, + "learning_rate": 1.0736952533757506e-09, + "loss": 0.8394, + "step": 12889 + }, + { + "epoch": 0.9936786925686093, + "grad_norm": 3.8150246143341064, + "learning_rate": 1.047979833447843e-09, + "loss": 0.7913, + "step": 12890 + }, + { + "epoch": 0.9937557816836263, + "grad_norm": 4.272899150848389, + "learning_rate": 1.0225760722876177e-09, + "loss": 0.9468, + "step": 12891 + }, + { + "epoch": 0.9938328707986432, + "grad_norm": 3.9249536991119385, + "learning_rate": 9.974839714799178e-10, + "loss": 0.8953, + "step": 12892 + }, + { + "epoch": 0.9939099599136602, + "grad_norm": 3.497441291809082, + "learning_rate": 9.727035325884925e-10, + "loss": 0.822, + "step": 12893 + }, + { + "epoch": 0.9939870490286772, + "grad_norm": 4.1843647956848145, + "learning_rate": 9.482347571587724e-10, + "loss": 0.9528, + "step": 12894 + }, + { + "epoch": 0.9940641381436941, + "grad_norm": 3.7656443119049072, + "learning_rate": 9.240776467150936e-10, + "loss": 0.9781, + "step": 12895 + }, + { + "epoch": 0.9941412272587111, + "grad_norm": 3.6662378311157227, + "learning_rate": 9.002322027651389e-10, + "loss": 0.889, + "step": 12896 + }, + { + "epoch": 0.994218316373728, + "grad_norm": 4.327576637268066, + "learning_rate": 8.766984267938316e-10, + "loss": 0.8852, + "step": 12897 + }, + { + "epoch": 0.994295405488745, + "grad_norm": 3.467092275619507, + "learning_rate": 8.534763202699969e-10, + "loss": 0.8667, + "step": 12898 + }, + { + "epoch": 0.994372494603762, + "grad_norm": 3.563298463821411, + "learning_rate": 8.305658846402554e-10, + "loss": 0.9111, + "step": 12899 + }, + { + "epoch": 0.9944495837187789, + "grad_norm": 3.706197738647461, + "learning_rate": 8.079671213334639e-10, + "loss": 0.854, + "step": 12900 + }, + { + "epoch": 0.9945266728337959, + "grad_norm": 4.062145233154297, + "learning_rate": 7.856800317584956e-10, + "loss": 0.8797, + "step": 12901 + }, + { + "epoch": 0.9946037619488128, + "grad_norm": 3.602550506591797, + "learning_rate": 7.637046173047946e-10, + "loss": 0.8933, + "step": 12902 + }, + { + "epoch": 0.9946808510638298, + "grad_norm": 4.166884422302246, + "learning_rate": 7.420408793423762e-10, + "loss": 0.9677, + "step": 12903 + }, + { + "epoch": 0.9947579401788468, + "grad_norm": 4.013248443603516, + "learning_rate": 7.206888192218265e-10, + "loss": 0.9302, + "step": 12904 + }, + { + "epoch": 0.9948350292938637, + "grad_norm": 3.9088120460510254, + "learning_rate": 6.99648438274303e-10, + "loss": 0.8768, + "step": 12905 + }, + { + "epoch": 0.9949121184088807, + "grad_norm": 3.6118009090423584, + "learning_rate": 6.789197378115342e-10, + "loss": 0.9307, + "step": 12906 + }, + { + "epoch": 0.9949892075238976, + "grad_norm": 3.630589485168457, + "learning_rate": 6.585027191263748e-10, + "loss": 0.9369, + "step": 12907 + }, + { + "epoch": 0.9950662966389145, + "grad_norm": 3.9359915256500244, + "learning_rate": 6.383973834911406e-10, + "loss": 0.8476, + "step": 12908 + }, + { + "epoch": 0.9951433857539316, + "grad_norm": 3.3206164836883545, + "learning_rate": 6.186037321592731e-10, + "loss": 0.7947, + "step": 12909 + }, + { + "epoch": 0.9952204748689485, + "grad_norm": 3.8254833221435547, + "learning_rate": 5.991217663653404e-10, + "loss": 0.859, + "step": 12910 + }, + { + "epoch": 0.9952975639839655, + "grad_norm": 3.701080322265625, + "learning_rate": 5.799514873233714e-10, + "loss": 0.8847, + "step": 12911 + }, + { + "epoch": 0.9953746530989824, + "grad_norm": 3.581287145614624, + "learning_rate": 5.610928962290763e-10, + "loss": 0.9241, + "step": 12912 + }, + { + "epoch": 0.9954517422139993, + "grad_norm": 3.830244302749634, + "learning_rate": 5.425459942576261e-10, + "loss": 0.9187, + "step": 12913 + }, + { + "epoch": 0.9955288313290164, + "grad_norm": 3.582486867904663, + "learning_rate": 5.243107825653182e-10, + "loss": 0.8323, + "step": 12914 + }, + { + "epoch": 0.9956059204440333, + "grad_norm": 3.7536768913269043, + "learning_rate": 5.06387262289576e-10, + "loss": 0.9389, + "step": 12915 + }, + { + "epoch": 0.9956830095590503, + "grad_norm": 3.8179402351379395, + "learning_rate": 4.887754345478391e-10, + "loss": 0.8959, + "step": 12916 + }, + { + "epoch": 0.9957600986740672, + "grad_norm": 3.915501356124878, + "learning_rate": 4.714753004375627e-10, + "loss": 0.9829, + "step": 12917 + }, + { + "epoch": 0.9958371877890841, + "grad_norm": 3.4252476692199707, + "learning_rate": 4.5448686103732876e-10, + "loss": 0.816, + "step": 12918 + }, + { + "epoch": 0.9959142769041012, + "grad_norm": 4.076374053955078, + "learning_rate": 4.37810117406845e-10, + "loss": 0.9458, + "step": 12919 + }, + { + "epoch": 0.9959913660191181, + "grad_norm": 3.622620105743408, + "learning_rate": 4.2144507058528016e-10, + "loss": 0.8332, + "step": 12920 + }, + { + "epoch": 0.9960684551341351, + "grad_norm": 3.3440213203430176, + "learning_rate": 4.053917215934844e-10, + "loss": 0.8837, + "step": 12921 + }, + { + "epoch": 0.996145544249152, + "grad_norm": 3.92189884185791, + "learning_rate": 3.8965007143176857e-10, + "loss": 0.9619, + "step": 12922 + }, + { + "epoch": 0.9962226333641689, + "grad_norm": 4.633378505706787, + "learning_rate": 3.742201210815699e-10, + "loss": 0.9774, + "step": 12923 + }, + { + "epoch": 0.996299722479186, + "grad_norm": 3.8096113204956055, + "learning_rate": 3.5910187150545174e-10, + "loss": 0.9167, + "step": 12924 + }, + { + "epoch": 0.9963768115942029, + "grad_norm": 3.8679616451263428, + "learning_rate": 3.4429532364488315e-10, + "loss": 0.844, + "step": 12925 + }, + { + "epoch": 0.9964539007092199, + "grad_norm": 4.342710494995117, + "learning_rate": 3.298004784241249e-10, + "loss": 0.9341, + "step": 12926 + }, + { + "epoch": 0.9965309898242368, + "grad_norm": 4.704819679260254, + "learning_rate": 3.156173367457882e-10, + "loss": 0.9207, + "step": 12927 + }, + { + "epoch": 0.9966080789392537, + "grad_norm": 3.6841025352478027, + "learning_rate": 3.01745899495276e-10, + "loss": 0.8167, + "step": 12928 + }, + { + "epoch": 0.9966851680542708, + "grad_norm": 3.9212276935577393, + "learning_rate": 2.8818616753634177e-10, + "loss": 0.9434, + "step": 12929 + }, + { + "epoch": 0.9967622571692877, + "grad_norm": 3.5783891677856445, + "learning_rate": 2.749381417155306e-10, + "loss": 0.8192, + "step": 12930 + }, + { + "epoch": 0.9968393462843047, + "grad_norm": 3.933195114135742, + "learning_rate": 2.6200182285718303e-10, + "loss": 0.8573, + "step": 12931 + }, + { + "epoch": 0.9969164353993216, + "grad_norm": 4.147609710693359, + "learning_rate": 2.493772117695414e-10, + "loss": 0.9402, + "step": 12932 + }, + { + "epoch": 0.9969935245143385, + "grad_norm": 3.782233476638794, + "learning_rate": 2.3706430923808867e-10, + "loss": 0.969, + "step": 12933 + }, + { + "epoch": 0.9970706136293556, + "grad_norm": 3.95326566696167, + "learning_rate": 2.250631160316541e-10, + "loss": 1.0036, + "step": 12934 + }, + { + "epoch": 0.9971477027443725, + "grad_norm": 3.8084826469421387, + "learning_rate": 2.1337363289797296e-10, + "loss": 0.9255, + "step": 12935 + }, + { + "epoch": 0.9972247918593895, + "grad_norm": 4.2080535888671875, + "learning_rate": 2.0199586056590669e-10, + "loss": 0.9461, + "step": 12936 + }, + { + "epoch": 0.9973018809744064, + "grad_norm": 4.278135299682617, + "learning_rate": 1.909297997448878e-10, + "loss": 0.9677, + "step": 12937 + }, + { + "epoch": 0.9973789700894233, + "grad_norm": 3.921271800994873, + "learning_rate": 1.8017545112491984e-10, + "loss": 0.9193, + "step": 12938 + }, + { + "epoch": 0.9974560592044404, + "grad_norm": 3.6912286281585693, + "learning_rate": 1.697328153760225e-10, + "loss": 0.9062, + "step": 12939 + }, + { + "epoch": 0.9975331483194573, + "grad_norm": 3.613358736038208, + "learning_rate": 1.5960189314934149e-10, + "loss": 0.8264, + "step": 12940 + }, + { + "epoch": 0.9976102374344743, + "grad_norm": 4.033813953399658, + "learning_rate": 1.4978268507659376e-10, + "loss": 0.9703, + "step": 12941 + }, + { + "epoch": 0.9976873265494912, + "grad_norm": 3.622581958770752, + "learning_rate": 1.402751917700673e-10, + "loss": 0.8894, + "step": 12942 + }, + { + "epoch": 0.9977644156645081, + "grad_norm": 3.6066484451293945, + "learning_rate": 1.3107941382262124e-10, + "loss": 0.8503, + "step": 12943 + }, + { + "epoch": 0.9978415047795252, + "grad_norm": 3.685361385345459, + "learning_rate": 1.221953518071306e-10, + "loss": 0.8426, + "step": 12944 + }, + { + "epoch": 0.9979185938945421, + "grad_norm": 4.148501873016357, + "learning_rate": 1.1362300627815182e-10, + "loss": 0.9235, + "step": 12945 + }, + { + "epoch": 0.9979956830095591, + "grad_norm": 3.7229340076446533, + "learning_rate": 1.0536237776970215e-10, + "loss": 0.8819, + "step": 12946 + }, + { + "epoch": 0.998072772124576, + "grad_norm": 3.745091199874878, + "learning_rate": 9.741346679636998e-11, + "loss": 0.8432, + "step": 12947 + }, + { + "epoch": 0.9981498612395929, + "grad_norm": 3.642552137374878, + "learning_rate": 8.9776273854425e-11, + "loss": 0.7828, + "step": 12948 + }, + { + "epoch": 0.99822695035461, + "grad_norm": 4.068527698516846, + "learning_rate": 8.245079941959777e-11, + "loss": 0.8816, + "step": 12949 + }, + { + "epoch": 0.9983040394696269, + "grad_norm": 3.732131242752075, + "learning_rate": 7.543704394874507e-11, + "loss": 0.8963, + "step": 12950 + }, + { + "epoch": 0.9983811285846439, + "grad_norm": 4.001811504364014, + "learning_rate": 6.873500787873965e-11, + "loss": 0.9997, + "step": 12951 + }, + { + "epoch": 0.9984582176996608, + "grad_norm": 3.770373821258545, + "learning_rate": 6.234469162813561e-11, + "loss": 0.8653, + "step": 12952 + }, + { + "epoch": 0.9985353068146777, + "grad_norm": 3.548311710357666, + "learning_rate": 5.6266095595503e-11, + "loss": 0.8459, + "step": 12953 + }, + { + "epoch": 0.9986123959296948, + "grad_norm": 3.578949451446533, + "learning_rate": 5.049922015887276e-11, + "loss": 0.8083, + "step": 12954 + }, + { + "epoch": 0.9986894850447117, + "grad_norm": 3.936638593673706, + "learning_rate": 4.504406567795716e-11, + "loss": 0.9448, + "step": 12955 + }, + { + "epoch": 0.9987665741597287, + "grad_norm": 3.4746735095977783, + "learning_rate": 3.990063249359466e-11, + "loss": 0.8742, + "step": 12956 + }, + { + "epoch": 0.9988436632747456, + "grad_norm": 3.4810571670532227, + "learning_rate": 3.506892092552949e-11, + "loss": 0.8603, + "step": 12957 + }, + { + "epoch": 0.9989207523897625, + "grad_norm": 3.837407112121582, + "learning_rate": 3.054893127574232e-11, + "loss": 1.0381, + "step": 12958 + }, + { + "epoch": 0.9989978415047795, + "grad_norm": 3.7372279167175293, + "learning_rate": 2.6340663825674683e-11, + "loss": 0.8691, + "step": 12959 + }, + { + "epoch": 0.9990749306197965, + "grad_norm": 3.8073391914367676, + "learning_rate": 2.2444118837339212e-11, + "loss": 0.945, + "step": 12960 + }, + { + "epoch": 0.9991520197348135, + "grad_norm": 3.483799457550049, + "learning_rate": 1.8859296554429862e-11, + "loss": 0.8425, + "step": 12961 + }, + { + "epoch": 0.9992291088498304, + "grad_norm": 4.023273468017578, + "learning_rate": 1.5586197200101462e-11, + "loss": 0.905, + "step": 12962 + }, + { + "epoch": 0.9993061979648473, + "grad_norm": 3.542368173599243, + "learning_rate": 1.2624820978079932e-11, + "loss": 0.922, + "step": 12963 + }, + { + "epoch": 0.9993832870798643, + "grad_norm": 3.746952772140503, + "learning_rate": 9.975168073772523e-12, + "loss": 0.7879, + "step": 12964 + }, + { + "epoch": 0.9994603761948813, + "grad_norm": 3.6323859691619873, + "learning_rate": 7.63723865149224e-12, + "loss": 0.9078, + "step": 12965 + }, + { + "epoch": 0.9995374653098983, + "grad_norm": 3.747823476791382, + "learning_rate": 5.611032857788523e-12, + "loss": 0.8839, + "step": 12966 + }, + { + "epoch": 0.9996145544249152, + "grad_norm": 3.6762750148773193, + "learning_rate": 3.896550818116574e-12, + "loss": 0.86, + "step": 12967 + }, + { + "epoch": 0.9996916435399321, + "grad_norm": 3.4373939037323, + "learning_rate": 2.4937926401680245e-12, + "loss": 0.904, + "step": 12968 + }, + { + "epoch": 0.9997687326549491, + "grad_norm": 3.7813000679016113, + "learning_rate": 1.402758411095384e-12, + "loss": 0.9159, + "step": 12969 + }, + { + "epoch": 0.9998458217699661, + "grad_norm": 3.9511356353759766, + "learning_rate": 6.234481991773678e-13, + "loss": 0.864, + "step": 12970 + }, + { + "epoch": 0.9999229108849831, + "grad_norm": 4.2751851081848145, + "learning_rate": 1.5586205215356586e-13, + "loss": 0.8958, + "step": 12971 + }, + { + "epoch": 1.0, + "grad_norm": 4.090433120727539, + "learning_rate": 0.0, + "loss": 0.8966, + "step": 12972 + }, + { + "epoch": 1.0, + "step": 12972, + "total_flos": 2.0878280805119427e+18, + "train_loss": 0.3483946987583135, + "train_runtime": 106063.3145, + "train_samples_per_second": 3.914, + "train_steps_per_second": 0.122 + } + ], + "logging_steps": 1.0, + "max_steps": 12972, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.0878280805119427e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}