diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24969 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9991577765300392, + "eval_steps": 500, + "global_step": 3561, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008422234699606962, + "grad_norm": 2.7139885425567627, + "learning_rate": 2.8011204481792718e-08, + "loss": 0.7099, + "step": 1 + }, + { + "epoch": 0.0016844469399213925, + "grad_norm": 2.5785422325134277, + "learning_rate": 5.6022408963585437e-08, + "loss": 0.7135, + "step": 2 + }, + { + "epoch": 0.0025266704098820887, + "grad_norm": 2.7374205589294434, + "learning_rate": 8.403361344537815e-08, + "loss": 0.7394, + "step": 3 + }, + { + "epoch": 0.003368893879842785, + "grad_norm": 2.7776577472686768, + "learning_rate": 1.1204481792717087e-07, + "loss": 0.7385, + "step": 4 + }, + { + "epoch": 0.004211117349803481, + "grad_norm": 2.700944662094116, + "learning_rate": 1.400560224089636e-07, + "loss": 0.753, + "step": 5 + }, + { + "epoch": 0.0050533408197641775, + "grad_norm": 2.9223380088806152, + "learning_rate": 1.680672268907563e-07, + "loss": 0.7637, + "step": 6 + }, + { + "epoch": 0.005895564289724873, + "grad_norm": 2.558424234390259, + "learning_rate": 1.9607843137254904e-07, + "loss": 0.7837, + "step": 7 + }, + { + "epoch": 0.00673778775968557, + "grad_norm": 2.620910882949829, + "learning_rate": 2.2408963585434175e-07, + "loss": 0.7524, + "step": 8 + }, + { + "epoch": 0.007580011229646266, + "grad_norm": 2.769366979598999, + "learning_rate": 2.5210084033613445e-07, + "loss": 0.7467, + "step": 9 + }, + { + "epoch": 0.008422234699606962, + "grad_norm": 2.6973936557769775, + "learning_rate": 2.801120448179272e-07, + "loss": 0.7416, + "step": 10 + }, + { + "epoch": 0.009264458169567658, + "grad_norm": 2.6612558364868164, + "learning_rate": 3.081232492997199e-07, + "loss": 0.7446, + "step": 11 + }, + { + "epoch": 0.010106681639528355, + "grad_norm": 2.729600667953491, + "learning_rate": 3.361344537815126e-07, + "loss": 0.7453, + "step": 12 + }, + { + "epoch": 0.010948905109489052, + "grad_norm": 2.687103271484375, + "learning_rate": 3.641456582633054e-07, + "loss": 0.7578, + "step": 13 + }, + { + "epoch": 0.011791128579449747, + "grad_norm": 2.611377239227295, + "learning_rate": 3.921568627450981e-07, + "loss": 0.7663, + "step": 14 + }, + { + "epoch": 0.012633352049410443, + "grad_norm": 2.788295030593872, + "learning_rate": 4.201680672268908e-07, + "loss": 0.7648, + "step": 15 + }, + { + "epoch": 0.01347557551937114, + "grad_norm": 2.528625726699829, + "learning_rate": 4.481792717086835e-07, + "loss": 0.6989, + "step": 16 + }, + { + "epoch": 0.014317798989331837, + "grad_norm": 2.6373348236083984, + "learning_rate": 4.7619047619047623e-07, + "loss": 0.7643, + "step": 17 + }, + { + "epoch": 0.015160022459292532, + "grad_norm": 2.454690456390381, + "learning_rate": 5.042016806722689e-07, + "loss": 0.7315, + "step": 18 + }, + { + "epoch": 0.016002245929253228, + "grad_norm": 2.1451756954193115, + "learning_rate": 5.322128851540616e-07, + "loss": 0.7307, + "step": 19 + }, + { + "epoch": 0.016844469399213923, + "grad_norm": 2.230375289916992, + "learning_rate": 5.602240896358544e-07, + "loss": 0.7181, + "step": 20 + }, + { + "epoch": 0.01768669286917462, + "grad_norm": 2.100219488143921, + "learning_rate": 5.882352941176471e-07, + "loss": 0.7203, + "step": 21 + }, + { + "epoch": 0.018528916339135316, + "grad_norm": 1.9923067092895508, + "learning_rate": 6.162464985994398e-07, + "loss": 0.7079, + "step": 22 + }, + { + "epoch": 0.019371139809096015, + "grad_norm": 1.9194575548171997, + "learning_rate": 6.442577030812325e-07, + "loss": 0.6887, + "step": 23 + }, + { + "epoch": 0.02021336327905671, + "grad_norm": 1.9598820209503174, + "learning_rate": 6.722689075630252e-07, + "loss": 0.7282, + "step": 24 + }, + { + "epoch": 0.021055586749017405, + "grad_norm": 1.6673228740692139, + "learning_rate": 7.002801120448179e-07, + "loss": 0.7221, + "step": 25 + }, + { + "epoch": 0.021897810218978103, + "grad_norm": 1.5177627801895142, + "learning_rate": 7.282913165266108e-07, + "loss": 0.7328, + "step": 26 + }, + { + "epoch": 0.022740033688938798, + "grad_norm": 1.3995906114578247, + "learning_rate": 7.563025210084034e-07, + "loss": 0.6929, + "step": 27 + }, + { + "epoch": 0.023582257158899493, + "grad_norm": 1.4189306497573853, + "learning_rate": 7.843137254901962e-07, + "loss": 0.6901, + "step": 28 + }, + { + "epoch": 0.02442448062886019, + "grad_norm": 1.362115502357483, + "learning_rate": 8.123249299719889e-07, + "loss": 0.6601, + "step": 29 + }, + { + "epoch": 0.025266704098820886, + "grad_norm": 1.3577219247817993, + "learning_rate": 8.403361344537816e-07, + "loss": 0.7119, + "step": 30 + }, + { + "epoch": 0.026108927568781585, + "grad_norm": 1.3313069343566895, + "learning_rate": 8.683473389355742e-07, + "loss": 0.6776, + "step": 31 + }, + { + "epoch": 0.02695115103874228, + "grad_norm": 1.2647556066513062, + "learning_rate": 8.96358543417367e-07, + "loss": 0.6799, + "step": 32 + }, + { + "epoch": 0.027793374508702975, + "grad_norm": 1.2313024997711182, + "learning_rate": 9.243697478991598e-07, + "loss": 0.6693, + "step": 33 + }, + { + "epoch": 0.028635597978663673, + "grad_norm": 1.1124472618103027, + "learning_rate": 9.523809523809525e-07, + "loss": 0.6515, + "step": 34 + }, + { + "epoch": 0.029477821448624368, + "grad_norm": 1.0712497234344482, + "learning_rate": 9.80392156862745e-07, + "loss": 0.6746, + "step": 35 + }, + { + "epoch": 0.030320044918585063, + "grad_norm": 1.0045796632766724, + "learning_rate": 1.0084033613445378e-06, + "loss": 0.6326, + "step": 36 + }, + { + "epoch": 0.03116226838854576, + "grad_norm": 0.9366388320922852, + "learning_rate": 1.0364145658263308e-06, + "loss": 0.6184, + "step": 37 + }, + { + "epoch": 0.032004491858506456, + "grad_norm": 0.9612988233566284, + "learning_rate": 1.0644257703081233e-06, + "loss": 0.6551, + "step": 38 + }, + { + "epoch": 0.032846715328467155, + "grad_norm": 0.9342404007911682, + "learning_rate": 1.092436974789916e-06, + "loss": 0.6553, + "step": 39 + }, + { + "epoch": 0.033688938798427846, + "grad_norm": 0.8526981472969055, + "learning_rate": 1.1204481792717088e-06, + "loss": 0.6212, + "step": 40 + }, + { + "epoch": 0.034531162268388545, + "grad_norm": 0.8903021216392517, + "learning_rate": 1.1484593837535015e-06, + "loss": 0.6381, + "step": 41 + }, + { + "epoch": 0.03537338573834924, + "grad_norm": 0.8123112320899963, + "learning_rate": 1.1764705882352942e-06, + "loss": 0.6237, + "step": 42 + }, + { + "epoch": 0.03621560920830994, + "grad_norm": 0.8697153925895691, + "learning_rate": 1.204481792717087e-06, + "loss": 0.637, + "step": 43 + }, + { + "epoch": 0.03705783267827063, + "grad_norm": 0.7646113038063049, + "learning_rate": 1.2324929971988797e-06, + "loss": 0.6099, + "step": 44 + }, + { + "epoch": 0.03790005614823133, + "grad_norm": 0.7301273941993713, + "learning_rate": 1.2605042016806724e-06, + "loss": 0.6307, + "step": 45 + }, + { + "epoch": 0.03874227961819203, + "grad_norm": 0.6388236880302429, + "learning_rate": 1.288515406162465e-06, + "loss": 0.6004, + "step": 46 + }, + { + "epoch": 0.03958450308815272, + "grad_norm": 0.6535987854003906, + "learning_rate": 1.316526610644258e-06, + "loss": 0.6248, + "step": 47 + }, + { + "epoch": 0.04042672655811342, + "grad_norm": 0.5928495526313782, + "learning_rate": 1.3445378151260504e-06, + "loss": 0.5803, + "step": 48 + }, + { + "epoch": 0.04126895002807412, + "grad_norm": 0.6195793151855469, + "learning_rate": 1.3725490196078434e-06, + "loss": 0.5997, + "step": 49 + }, + { + "epoch": 0.04211117349803481, + "grad_norm": 0.5354277491569519, + "learning_rate": 1.4005602240896359e-06, + "loss": 0.5537, + "step": 50 + }, + { + "epoch": 0.04295339696799551, + "grad_norm": 0.5489490032196045, + "learning_rate": 1.4285714285714286e-06, + "loss": 0.5818, + "step": 51 + }, + { + "epoch": 0.043795620437956206, + "grad_norm": 0.5248365998268127, + "learning_rate": 1.4565826330532216e-06, + "loss": 0.5803, + "step": 52 + }, + { + "epoch": 0.0446378439079169, + "grad_norm": 0.5383778214454651, + "learning_rate": 1.484593837535014e-06, + "loss": 0.6108, + "step": 53 + }, + { + "epoch": 0.045480067377877596, + "grad_norm": 0.5217686295509338, + "learning_rate": 1.5126050420168068e-06, + "loss": 0.5962, + "step": 54 + }, + { + "epoch": 0.046322290847838295, + "grad_norm": 0.5311843752861023, + "learning_rate": 1.5406162464985996e-06, + "loss": 0.5983, + "step": 55 + }, + { + "epoch": 0.047164514317798986, + "grad_norm": 0.5228126645088196, + "learning_rate": 1.5686274509803923e-06, + "loss": 0.5789, + "step": 56 + }, + { + "epoch": 0.048006737787759685, + "grad_norm": 0.4893929958343506, + "learning_rate": 1.5966386554621848e-06, + "loss": 0.5695, + "step": 57 + }, + { + "epoch": 0.04884896125772038, + "grad_norm": 0.46164101362228394, + "learning_rate": 1.6246498599439778e-06, + "loss": 0.5611, + "step": 58 + }, + { + "epoch": 0.04969118472768108, + "grad_norm": 0.42507484555244446, + "learning_rate": 1.6526610644257705e-06, + "loss": 0.5262, + "step": 59 + }, + { + "epoch": 0.05053340819764177, + "grad_norm": 0.41358301043510437, + "learning_rate": 1.6806722689075632e-06, + "loss": 0.5948, + "step": 60 + }, + { + "epoch": 0.05137563166760247, + "grad_norm": 0.3956189751625061, + "learning_rate": 1.708683473389356e-06, + "loss": 0.5569, + "step": 61 + }, + { + "epoch": 0.05221785513756317, + "grad_norm": 0.41548147797584534, + "learning_rate": 1.7366946778711485e-06, + "loss": 0.5444, + "step": 62 + }, + { + "epoch": 0.05306007860752386, + "grad_norm": 0.4024750888347626, + "learning_rate": 1.7647058823529414e-06, + "loss": 0.5549, + "step": 63 + }, + { + "epoch": 0.05390230207748456, + "grad_norm": 0.36441898345947266, + "learning_rate": 1.792717086834734e-06, + "loss": 0.5219, + "step": 64 + }, + { + "epoch": 0.05474452554744526, + "grad_norm": 0.37034735083580017, + "learning_rate": 1.8207282913165267e-06, + "loss": 0.5264, + "step": 65 + }, + { + "epoch": 0.05558674901740595, + "grad_norm": 0.34774619340896606, + "learning_rate": 1.8487394957983196e-06, + "loss": 0.5511, + "step": 66 + }, + { + "epoch": 0.05642897248736665, + "grad_norm": 0.3381839096546173, + "learning_rate": 1.8767507002801122e-06, + "loss": 0.5242, + "step": 67 + }, + { + "epoch": 0.057271195957327346, + "grad_norm": 0.34560632705688477, + "learning_rate": 1.904761904761905e-06, + "loss": 0.5143, + "step": 68 + }, + { + "epoch": 0.05811341942728804, + "grad_norm": 0.35959887504577637, + "learning_rate": 1.932773109243698e-06, + "loss": 0.5333, + "step": 69 + }, + { + "epoch": 0.058955642897248736, + "grad_norm": 0.3450910449028015, + "learning_rate": 1.96078431372549e-06, + "loss": 0.5181, + "step": 70 + }, + { + "epoch": 0.059797866367209435, + "grad_norm": 0.3333073854446411, + "learning_rate": 1.988795518207283e-06, + "loss": 0.5358, + "step": 71 + }, + { + "epoch": 0.060640089837170126, + "grad_norm": 0.34593814611434937, + "learning_rate": 2.0168067226890756e-06, + "loss": 0.5351, + "step": 72 + }, + { + "epoch": 0.061482313307130824, + "grad_norm": 0.30022236704826355, + "learning_rate": 2.0448179271708684e-06, + "loss": 0.5296, + "step": 73 + }, + { + "epoch": 0.06232453677709152, + "grad_norm": 0.31735390424728394, + "learning_rate": 2.0728291316526615e-06, + "loss": 0.5198, + "step": 74 + }, + { + "epoch": 0.06316676024705221, + "grad_norm": 0.40318942070007324, + "learning_rate": 2.100840336134454e-06, + "loss": 0.5037, + "step": 75 + }, + { + "epoch": 0.06400898371701291, + "grad_norm": 0.2909386456012726, + "learning_rate": 2.1288515406162466e-06, + "loss": 0.4936, + "step": 76 + }, + { + "epoch": 0.06485120718697361, + "grad_norm": 0.30838459730148315, + "learning_rate": 2.1568627450980393e-06, + "loss": 0.5073, + "step": 77 + }, + { + "epoch": 0.06569343065693431, + "grad_norm": 0.2972123324871063, + "learning_rate": 2.184873949579832e-06, + "loss": 0.5445, + "step": 78 + }, + { + "epoch": 0.06653565412689501, + "grad_norm": 0.29268571734428406, + "learning_rate": 2.2128851540616248e-06, + "loss": 0.5006, + "step": 79 + }, + { + "epoch": 0.06737787759685569, + "grad_norm": 0.2976105213165283, + "learning_rate": 2.2408963585434175e-06, + "loss": 0.5266, + "step": 80 + }, + { + "epoch": 0.06822010106681639, + "grad_norm": 0.2998995780944824, + "learning_rate": 2.2689075630252102e-06, + "loss": 0.5093, + "step": 81 + }, + { + "epoch": 0.06906232453677709, + "grad_norm": 0.3065139055252075, + "learning_rate": 2.296918767507003e-06, + "loss": 0.5078, + "step": 82 + }, + { + "epoch": 0.06990454800673779, + "grad_norm": 0.2579464614391327, + "learning_rate": 2.3249299719887957e-06, + "loss": 0.484, + "step": 83 + }, + { + "epoch": 0.07074677147669849, + "grad_norm": 0.28767305612564087, + "learning_rate": 2.3529411764705885e-06, + "loss": 0.5066, + "step": 84 + }, + { + "epoch": 0.07158899494665918, + "grad_norm": 0.27491238713264465, + "learning_rate": 2.380952380952381e-06, + "loss": 0.4865, + "step": 85 + }, + { + "epoch": 0.07243121841661988, + "grad_norm": 0.2776690721511841, + "learning_rate": 2.408963585434174e-06, + "loss": 0.5279, + "step": 86 + }, + { + "epoch": 0.07327344188658057, + "grad_norm": 0.2866682708263397, + "learning_rate": 2.4369747899159667e-06, + "loss": 0.506, + "step": 87 + }, + { + "epoch": 0.07411566535654127, + "grad_norm": 0.27375006675720215, + "learning_rate": 2.4649859943977594e-06, + "loss": 0.5203, + "step": 88 + }, + { + "epoch": 0.07495788882650196, + "grad_norm": 0.2791830897331238, + "learning_rate": 2.492997198879552e-06, + "loss": 0.5098, + "step": 89 + }, + { + "epoch": 0.07580011229646266, + "grad_norm": 0.2743302881717682, + "learning_rate": 2.521008403361345e-06, + "loss": 0.5236, + "step": 90 + }, + { + "epoch": 0.07664233576642336, + "grad_norm": 0.2687312066555023, + "learning_rate": 2.549019607843137e-06, + "loss": 0.4928, + "step": 91 + }, + { + "epoch": 0.07748455923638406, + "grad_norm": 0.25979599356651306, + "learning_rate": 2.57703081232493e-06, + "loss": 0.4872, + "step": 92 + }, + { + "epoch": 0.07832678270634474, + "grad_norm": 0.2747167646884918, + "learning_rate": 2.605042016806723e-06, + "loss": 0.4934, + "step": 93 + }, + { + "epoch": 0.07916900617630544, + "grad_norm": 0.25741496682167053, + "learning_rate": 2.633053221288516e-06, + "loss": 0.5163, + "step": 94 + }, + { + "epoch": 0.08001122964626614, + "grad_norm": 0.2904193103313446, + "learning_rate": 2.6610644257703085e-06, + "loss": 0.494, + "step": 95 + }, + { + "epoch": 0.08085345311622684, + "grad_norm": 0.25851333141326904, + "learning_rate": 2.689075630252101e-06, + "loss": 0.4831, + "step": 96 + }, + { + "epoch": 0.08169567658618754, + "grad_norm": 0.27450624108314514, + "learning_rate": 2.7170868347338936e-06, + "loss": 0.5113, + "step": 97 + }, + { + "epoch": 0.08253790005614824, + "grad_norm": 0.2518622875213623, + "learning_rate": 2.7450980392156867e-06, + "loss": 0.4762, + "step": 98 + }, + { + "epoch": 0.08338012352610892, + "grad_norm": 0.25959309935569763, + "learning_rate": 2.7731092436974795e-06, + "loss": 0.4694, + "step": 99 + }, + { + "epoch": 0.08422234699606962, + "grad_norm": 0.28160223364830017, + "learning_rate": 2.8011204481792718e-06, + "loss": 0.5058, + "step": 100 + }, + { + "epoch": 0.08506457046603032, + "grad_norm": 0.27110520005226135, + "learning_rate": 2.8291316526610645e-06, + "loss": 0.4883, + "step": 101 + }, + { + "epoch": 0.08590679393599102, + "grad_norm": 0.2723713219165802, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.5112, + "step": 102 + }, + { + "epoch": 0.08674901740595171, + "grad_norm": 0.2606522738933563, + "learning_rate": 2.88515406162465e-06, + "loss": 0.4779, + "step": 103 + }, + { + "epoch": 0.08759124087591241, + "grad_norm": 0.2593904435634613, + "learning_rate": 2.913165266106443e-06, + "loss": 0.5065, + "step": 104 + }, + { + "epoch": 0.08843346434587311, + "grad_norm": 0.2900582551956177, + "learning_rate": 2.9411764705882355e-06, + "loss": 0.49, + "step": 105 + }, + { + "epoch": 0.0892756878158338, + "grad_norm": 0.2618088126182556, + "learning_rate": 2.969187675070028e-06, + "loss": 0.4668, + "step": 106 + }, + { + "epoch": 0.0901179112857945, + "grad_norm": 0.28109776973724365, + "learning_rate": 2.997198879551821e-06, + "loss": 0.4702, + "step": 107 + }, + { + "epoch": 0.09096013475575519, + "grad_norm": 0.23819291591644287, + "learning_rate": 3.0252100840336137e-06, + "loss": 0.4843, + "step": 108 + }, + { + "epoch": 0.09180235822571589, + "grad_norm": 0.2603215277194977, + "learning_rate": 3.053221288515407e-06, + "loss": 0.4834, + "step": 109 + }, + { + "epoch": 0.09264458169567659, + "grad_norm": 0.2843914031982422, + "learning_rate": 3.081232492997199e-06, + "loss": 0.4787, + "step": 110 + }, + { + "epoch": 0.09348680516563729, + "grad_norm": 0.27807438373565674, + "learning_rate": 3.109243697478992e-06, + "loss": 0.4966, + "step": 111 + }, + { + "epoch": 0.09432902863559797, + "grad_norm": 0.25857606530189514, + "learning_rate": 3.1372549019607846e-06, + "loss": 0.4654, + "step": 112 + }, + { + "epoch": 0.09517125210555867, + "grad_norm": 0.3008019030094147, + "learning_rate": 3.1652661064425773e-06, + "loss": 0.4811, + "step": 113 + }, + { + "epoch": 0.09601347557551937, + "grad_norm": 0.2548486590385437, + "learning_rate": 3.1932773109243696e-06, + "loss": 0.486, + "step": 114 + }, + { + "epoch": 0.09685569904548007, + "grad_norm": 0.2549389600753784, + "learning_rate": 3.221288515406163e-06, + "loss": 0.4674, + "step": 115 + }, + { + "epoch": 0.09769792251544077, + "grad_norm": 0.2595871090888977, + "learning_rate": 3.2492997198879555e-06, + "loss": 0.4681, + "step": 116 + }, + { + "epoch": 0.09854014598540146, + "grad_norm": 0.24076548218727112, + "learning_rate": 3.2773109243697483e-06, + "loss": 0.4794, + "step": 117 + }, + { + "epoch": 0.09938236945536216, + "grad_norm": 0.2617177367210388, + "learning_rate": 3.305322128851541e-06, + "loss": 0.4741, + "step": 118 + }, + { + "epoch": 0.10022459292532285, + "grad_norm": 0.2643377184867859, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.4778, + "step": 119 + }, + { + "epoch": 0.10106681639528355, + "grad_norm": 0.25778520107269287, + "learning_rate": 3.3613445378151265e-06, + "loss": 0.4667, + "step": 120 + }, + { + "epoch": 0.10190903986524424, + "grad_norm": 0.2485887110233307, + "learning_rate": 3.3893557422969192e-06, + "loss": 0.4942, + "step": 121 + }, + { + "epoch": 0.10275126333520494, + "grad_norm": 0.2509766221046448, + "learning_rate": 3.417366946778712e-06, + "loss": 0.4623, + "step": 122 + }, + { + "epoch": 0.10359348680516564, + "grad_norm": 0.2518707513809204, + "learning_rate": 3.4453781512605043e-06, + "loss": 0.4549, + "step": 123 + }, + { + "epoch": 0.10443571027512634, + "grad_norm": 0.2753753960132599, + "learning_rate": 3.473389355742297e-06, + "loss": 0.4455, + "step": 124 + }, + { + "epoch": 0.10527793374508702, + "grad_norm": 0.25094544887542725, + "learning_rate": 3.5014005602240897e-06, + "loss": 0.4582, + "step": 125 + }, + { + "epoch": 0.10612015721504772, + "grad_norm": 0.2611958980560303, + "learning_rate": 3.529411764705883e-06, + "loss": 0.4981, + "step": 126 + }, + { + "epoch": 0.10696238068500842, + "grad_norm": 0.2824934422969818, + "learning_rate": 3.5574229691876756e-06, + "loss": 0.4671, + "step": 127 + }, + { + "epoch": 0.10780460415496912, + "grad_norm": 0.235825777053833, + "learning_rate": 3.585434173669468e-06, + "loss": 0.4477, + "step": 128 + }, + { + "epoch": 0.10864682762492982, + "grad_norm": 0.25024521350860596, + "learning_rate": 3.6134453781512607e-06, + "loss": 0.4579, + "step": 129 + }, + { + "epoch": 0.10948905109489052, + "grad_norm": 0.26830458641052246, + "learning_rate": 3.6414565826330534e-06, + "loss": 0.4848, + "step": 130 + }, + { + "epoch": 0.1103312745648512, + "grad_norm": 0.27017107605934143, + "learning_rate": 3.669467787114846e-06, + "loss": 0.4639, + "step": 131 + }, + { + "epoch": 0.1111734980348119, + "grad_norm": 0.2695944607257843, + "learning_rate": 3.6974789915966393e-06, + "loss": 0.4513, + "step": 132 + }, + { + "epoch": 0.1120157215047726, + "grad_norm": 0.29847121238708496, + "learning_rate": 3.7254901960784316e-06, + "loss": 0.4732, + "step": 133 + }, + { + "epoch": 0.1128579449747333, + "grad_norm": 0.26092973351478577, + "learning_rate": 3.7535014005602243e-06, + "loss": 0.4714, + "step": 134 + }, + { + "epoch": 0.113700168444694, + "grad_norm": 0.3229716718196869, + "learning_rate": 3.781512605042017e-06, + "loss": 0.4414, + "step": 135 + }, + { + "epoch": 0.11454239191465469, + "grad_norm": 0.2888365685939789, + "learning_rate": 3.80952380952381e-06, + "loss": 0.4798, + "step": 136 + }, + { + "epoch": 0.11538461538461539, + "grad_norm": 0.2670290470123291, + "learning_rate": 3.8375350140056026e-06, + "loss": 0.4413, + "step": 137 + }, + { + "epoch": 0.11622683885457608, + "grad_norm": 0.2552930414676666, + "learning_rate": 3.865546218487396e-06, + "loss": 0.4621, + "step": 138 + }, + { + "epoch": 0.11706906232453677, + "grad_norm": 0.255060613155365, + "learning_rate": 3.893557422969188e-06, + "loss": 0.4274, + "step": 139 + }, + { + "epoch": 0.11791128579449747, + "grad_norm": 0.2504236698150635, + "learning_rate": 3.92156862745098e-06, + "loss": 0.4631, + "step": 140 + }, + { + "epoch": 0.11875350926445817, + "grad_norm": 0.25774672627449036, + "learning_rate": 3.9495798319327735e-06, + "loss": 0.463, + "step": 141 + }, + { + "epoch": 0.11959573273441887, + "grad_norm": 0.2837006151676178, + "learning_rate": 3.977591036414566e-06, + "loss": 0.45, + "step": 142 + }, + { + "epoch": 0.12043795620437957, + "grad_norm": 0.24713779985904694, + "learning_rate": 4.005602240896359e-06, + "loss": 0.4438, + "step": 143 + }, + { + "epoch": 0.12128017967434025, + "grad_norm": 0.2868185043334961, + "learning_rate": 4.033613445378151e-06, + "loss": 0.4702, + "step": 144 + }, + { + "epoch": 0.12212240314430095, + "grad_norm": 0.29100534319877625, + "learning_rate": 4.0616246498599444e-06, + "loss": 0.4584, + "step": 145 + }, + { + "epoch": 0.12296462661426165, + "grad_norm": 0.24189263582229614, + "learning_rate": 4.089635854341737e-06, + "loss": 0.4599, + "step": 146 + }, + { + "epoch": 0.12380685008422235, + "grad_norm": 0.26137450337409973, + "learning_rate": 4.11764705882353e-06, + "loss": 0.4549, + "step": 147 + }, + { + "epoch": 0.12464907355418305, + "grad_norm": 0.25882503390312195, + "learning_rate": 4.145658263305323e-06, + "loss": 0.4521, + "step": 148 + }, + { + "epoch": 0.12549129702414374, + "grad_norm": 0.26184239983558655, + "learning_rate": 4.173669467787115e-06, + "loss": 0.4576, + "step": 149 + }, + { + "epoch": 0.12633352049410443, + "grad_norm": 0.2604025602340698, + "learning_rate": 4.201680672268908e-06, + "loss": 0.4806, + "step": 150 + }, + { + "epoch": 0.12717574396406514, + "grad_norm": 0.2570963203907013, + "learning_rate": 4.229691876750701e-06, + "loss": 0.4283, + "step": 151 + }, + { + "epoch": 0.12801796743402583, + "grad_norm": 0.2603912353515625, + "learning_rate": 4.257703081232493e-06, + "loss": 0.4516, + "step": 152 + }, + { + "epoch": 0.12886019090398654, + "grad_norm": 0.2573627829551697, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.4642, + "step": 153 + }, + { + "epoch": 0.12970241437394722, + "grad_norm": 0.2628520429134369, + "learning_rate": 4.313725490196079e-06, + "loss": 0.4614, + "step": 154 + }, + { + "epoch": 0.1305446378439079, + "grad_norm": 0.26437580585479736, + "learning_rate": 4.341736694677872e-06, + "loss": 0.4758, + "step": 155 + }, + { + "epoch": 0.13138686131386862, + "grad_norm": 0.24924257397651672, + "learning_rate": 4.369747899159664e-06, + "loss": 0.4466, + "step": 156 + }, + { + "epoch": 0.1322290847838293, + "grad_norm": 0.265608549118042, + "learning_rate": 4.397759103641457e-06, + "loss": 0.4732, + "step": 157 + }, + { + "epoch": 0.13307130825379002, + "grad_norm": 0.24660134315490723, + "learning_rate": 4.4257703081232496e-06, + "loss": 0.4453, + "step": 158 + }, + { + "epoch": 0.1339135317237507, + "grad_norm": 0.2537635266780853, + "learning_rate": 4.453781512605043e-06, + "loss": 0.4711, + "step": 159 + }, + { + "epoch": 0.13475575519371139, + "grad_norm": 0.2631363570690155, + "learning_rate": 4.481792717086835e-06, + "loss": 0.4556, + "step": 160 + }, + { + "epoch": 0.1355979786636721, + "grad_norm": 0.2548125088214874, + "learning_rate": 4.509803921568628e-06, + "loss": 0.4691, + "step": 161 + }, + { + "epoch": 0.13644020213363278, + "grad_norm": 0.2381230890750885, + "learning_rate": 4.5378151260504205e-06, + "loss": 0.4396, + "step": 162 + }, + { + "epoch": 0.1372824256035935, + "grad_norm": 0.2523537576198578, + "learning_rate": 4.565826330532213e-06, + "loss": 0.4418, + "step": 163 + }, + { + "epoch": 0.13812464907355418, + "grad_norm": 0.26021164655685425, + "learning_rate": 4.593837535014006e-06, + "loss": 0.4818, + "step": 164 + }, + { + "epoch": 0.1389668725435149, + "grad_norm": 0.2858008146286011, + "learning_rate": 4.621848739495799e-06, + "loss": 0.4398, + "step": 165 + }, + { + "epoch": 0.13980909601347558, + "grad_norm": 0.2752104699611664, + "learning_rate": 4.6498599439775914e-06, + "loss": 0.4766, + "step": 166 + }, + { + "epoch": 0.14065131948343626, + "grad_norm": 0.2509008049964905, + "learning_rate": 4.677871148459384e-06, + "loss": 0.4499, + "step": 167 + }, + { + "epoch": 0.14149354295339697, + "grad_norm": 0.2699715197086334, + "learning_rate": 4.705882352941177e-06, + "loss": 0.4696, + "step": 168 + }, + { + "epoch": 0.14233576642335766, + "grad_norm": 0.29950863122940063, + "learning_rate": 4.733893557422969e-06, + "loss": 0.4532, + "step": 169 + }, + { + "epoch": 0.14317798989331837, + "grad_norm": 0.2432369738817215, + "learning_rate": 4.761904761904762e-06, + "loss": 0.448, + "step": 170 + }, + { + "epoch": 0.14402021336327905, + "grad_norm": 0.25826480984687805, + "learning_rate": 4.7899159663865555e-06, + "loss": 0.4349, + "step": 171 + }, + { + "epoch": 0.14486243683323977, + "grad_norm": 0.2995217740535736, + "learning_rate": 4.817927170868348e-06, + "loss": 0.4619, + "step": 172 + }, + { + "epoch": 0.14570466030320045, + "grad_norm": 0.2739027440547943, + "learning_rate": 4.84593837535014e-06, + "loss": 0.4486, + "step": 173 + }, + { + "epoch": 0.14654688377316114, + "grad_norm": 0.22998765110969543, + "learning_rate": 4.873949579831933e-06, + "loss": 0.4239, + "step": 174 + }, + { + "epoch": 0.14738910724312185, + "grad_norm": 0.3160231113433838, + "learning_rate": 4.901960784313726e-06, + "loss": 0.4522, + "step": 175 + }, + { + "epoch": 0.14823133071308253, + "grad_norm": 0.2635315954685211, + "learning_rate": 4.929971988795519e-06, + "loss": 0.4372, + "step": 176 + }, + { + "epoch": 0.14907355418304324, + "grad_norm": 0.24901020526885986, + "learning_rate": 4.957983193277311e-06, + "loss": 0.4459, + "step": 177 + }, + { + "epoch": 0.14991577765300393, + "grad_norm": 0.27069857716560364, + "learning_rate": 4.985994397759104e-06, + "loss": 0.4833, + "step": 178 + }, + { + "epoch": 0.1507580011229646, + "grad_norm": 0.2763567268848419, + "learning_rate": 5.0140056022408966e-06, + "loss": 0.4369, + "step": 179 + }, + { + "epoch": 0.15160022459292533, + "grad_norm": 0.26479318737983704, + "learning_rate": 5.04201680672269e-06, + "loss": 0.4509, + "step": 180 + }, + { + "epoch": 0.152442448062886, + "grad_norm": 0.22581776976585388, + "learning_rate": 5.070028011204482e-06, + "loss": 0.4173, + "step": 181 + }, + { + "epoch": 0.15328467153284672, + "grad_norm": 0.2581729292869568, + "learning_rate": 5.098039215686274e-06, + "loss": 0.4512, + "step": 182 + }, + { + "epoch": 0.1541268950028074, + "grad_norm": 0.2569541931152344, + "learning_rate": 5.1260504201680675e-06, + "loss": 0.4441, + "step": 183 + }, + { + "epoch": 0.15496911847276812, + "grad_norm": 0.27027472853660583, + "learning_rate": 5.15406162464986e-06, + "loss": 0.4612, + "step": 184 + }, + { + "epoch": 0.1558113419427288, + "grad_norm": 0.2799546420574188, + "learning_rate": 5.182072829131654e-06, + "loss": 0.4713, + "step": 185 + }, + { + "epoch": 0.1566535654126895, + "grad_norm": 0.27860912680625916, + "learning_rate": 5.210084033613446e-06, + "loss": 0.4368, + "step": 186 + }, + { + "epoch": 0.1574957888826502, + "grad_norm": 0.2584918439388275, + "learning_rate": 5.2380952380952384e-06, + "loss": 0.4169, + "step": 187 + }, + { + "epoch": 0.15833801235261089, + "grad_norm": 0.2801576554775238, + "learning_rate": 5.266106442577032e-06, + "loss": 0.438, + "step": 188 + }, + { + "epoch": 0.1591802358225716, + "grad_norm": 0.3123912811279297, + "learning_rate": 5.294117647058824e-06, + "loss": 0.4763, + "step": 189 + }, + { + "epoch": 0.16002245929253228, + "grad_norm": 0.2466737926006317, + "learning_rate": 5.322128851540617e-06, + "loss": 0.4384, + "step": 190 + }, + { + "epoch": 0.160864682762493, + "grad_norm": 0.24402092397212982, + "learning_rate": 5.350140056022409e-06, + "loss": 0.4226, + "step": 191 + }, + { + "epoch": 0.16170690623245368, + "grad_norm": 0.3021142780780792, + "learning_rate": 5.378151260504202e-06, + "loss": 0.4303, + "step": 192 + }, + { + "epoch": 0.16254912970241436, + "grad_norm": 0.2734925150871277, + "learning_rate": 5.406162464985995e-06, + "loss": 0.4308, + "step": 193 + }, + { + "epoch": 0.16339135317237508, + "grad_norm": 0.24589981138706207, + "learning_rate": 5.434173669467787e-06, + "loss": 0.4317, + "step": 194 + }, + { + "epoch": 0.16423357664233576, + "grad_norm": 0.2625247538089752, + "learning_rate": 5.4621848739495795e-06, + "loss": 0.4357, + "step": 195 + }, + { + "epoch": 0.16507580011229647, + "grad_norm": 0.25053343176841736, + "learning_rate": 5.4901960784313735e-06, + "loss": 0.4175, + "step": 196 + }, + { + "epoch": 0.16591802358225716, + "grad_norm": 0.23492728173732758, + "learning_rate": 5.518207282913166e-06, + "loss": 0.4506, + "step": 197 + }, + { + "epoch": 0.16676024705221784, + "grad_norm": 0.2485855370759964, + "learning_rate": 5.546218487394959e-06, + "loss": 0.4517, + "step": 198 + }, + { + "epoch": 0.16760247052217855, + "grad_norm": 0.22515664994716644, + "learning_rate": 5.574229691876751e-06, + "loss": 0.437, + "step": 199 + }, + { + "epoch": 0.16844469399213924, + "grad_norm": 0.2672145962715149, + "learning_rate": 5.6022408963585436e-06, + "loss": 0.4512, + "step": 200 + }, + { + "epoch": 0.16928691746209995, + "grad_norm": 0.28508293628692627, + "learning_rate": 5.630252100840337e-06, + "loss": 0.4401, + "step": 201 + }, + { + "epoch": 0.17012914093206064, + "grad_norm": 0.2558955252170563, + "learning_rate": 5.658263305322129e-06, + "loss": 0.4215, + "step": 202 + }, + { + "epoch": 0.17097136440202135, + "grad_norm": 0.24676844477653503, + "learning_rate": 5.686274509803922e-06, + "loss": 0.4283, + "step": 203 + }, + { + "epoch": 0.17181358787198203, + "grad_norm": 0.2635219991207123, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.4641, + "step": 204 + }, + { + "epoch": 0.17265581134194272, + "grad_norm": 0.23264090716838837, + "learning_rate": 5.742296918767507e-06, + "loss": 0.4257, + "step": 205 + }, + { + "epoch": 0.17349803481190343, + "grad_norm": 0.2503959536552429, + "learning_rate": 5.7703081232493e-06, + "loss": 0.4256, + "step": 206 + }, + { + "epoch": 0.1743402582818641, + "grad_norm": 0.25033625960350037, + "learning_rate": 5.798319327731093e-06, + "loss": 0.4291, + "step": 207 + }, + { + "epoch": 0.17518248175182483, + "grad_norm": 0.27226942777633667, + "learning_rate": 5.826330532212886e-06, + "loss": 0.4503, + "step": 208 + }, + { + "epoch": 0.1760247052217855, + "grad_norm": 0.25907737016677856, + "learning_rate": 5.854341736694679e-06, + "loss": 0.4339, + "step": 209 + }, + { + "epoch": 0.17686692869174622, + "grad_norm": 0.25802192091941833, + "learning_rate": 5.882352941176471e-06, + "loss": 0.4413, + "step": 210 + }, + { + "epoch": 0.1777091521617069, + "grad_norm": 0.26198717951774597, + "learning_rate": 5.910364145658264e-06, + "loss": 0.4412, + "step": 211 + }, + { + "epoch": 0.1785513756316676, + "grad_norm": 0.2522122859954834, + "learning_rate": 5.938375350140056e-06, + "loss": 0.4265, + "step": 212 + }, + { + "epoch": 0.1793935991016283, + "grad_norm": 0.2897355258464813, + "learning_rate": 5.9663865546218495e-06, + "loss": 0.4426, + "step": 213 + }, + { + "epoch": 0.180235822571589, + "grad_norm": 0.2690928876399994, + "learning_rate": 5.994397759103642e-06, + "loss": 0.4573, + "step": 214 + }, + { + "epoch": 0.1810780460415497, + "grad_norm": 0.2453453242778778, + "learning_rate": 6.022408963585434e-06, + "loss": 0.4462, + "step": 215 + }, + { + "epoch": 0.18192026951151039, + "grad_norm": 0.23455454409122467, + "learning_rate": 6.050420168067227e-06, + "loss": 0.4422, + "step": 216 + }, + { + "epoch": 0.1827624929814711, + "grad_norm": 0.25252288579940796, + "learning_rate": 6.07843137254902e-06, + "loss": 0.4347, + "step": 217 + }, + { + "epoch": 0.18360471645143178, + "grad_norm": 0.2373313456773758, + "learning_rate": 6.106442577030814e-06, + "loss": 0.4172, + "step": 218 + }, + { + "epoch": 0.18444693992139247, + "grad_norm": 0.22695258259773254, + "learning_rate": 6.134453781512606e-06, + "loss": 0.4394, + "step": 219 + }, + { + "epoch": 0.18528916339135318, + "grad_norm": 0.24199916422367096, + "learning_rate": 6.162464985994398e-06, + "loss": 0.4357, + "step": 220 + }, + { + "epoch": 0.18613138686131386, + "grad_norm": 0.27336186170578003, + "learning_rate": 6.1904761904761914e-06, + "loss": 0.4304, + "step": 221 + }, + { + "epoch": 0.18697361033127458, + "grad_norm": 0.23748163878917694, + "learning_rate": 6.218487394957984e-06, + "loss": 0.4407, + "step": 222 + }, + { + "epoch": 0.18781583380123526, + "grad_norm": 0.2980271279811859, + "learning_rate": 6.246498599439776e-06, + "loss": 0.4307, + "step": 223 + }, + { + "epoch": 0.18865805727119594, + "grad_norm": 0.25661277770996094, + "learning_rate": 6.274509803921569e-06, + "loss": 0.4273, + "step": 224 + }, + { + "epoch": 0.18950028074115666, + "grad_norm": 0.27232545614242554, + "learning_rate": 6.3025210084033615e-06, + "loss": 0.4142, + "step": 225 + }, + { + "epoch": 0.19034250421111734, + "grad_norm": 0.2807347774505615, + "learning_rate": 6.330532212885155e-06, + "loss": 0.4444, + "step": 226 + }, + { + "epoch": 0.19118472768107805, + "grad_norm": 0.25483405590057373, + "learning_rate": 6.358543417366947e-06, + "loss": 0.4433, + "step": 227 + }, + { + "epoch": 0.19202695115103874, + "grad_norm": 0.263897180557251, + "learning_rate": 6.386554621848739e-06, + "loss": 0.4229, + "step": 228 + }, + { + "epoch": 0.19286917462099945, + "grad_norm": 0.27256882190704346, + "learning_rate": 6.414565826330533e-06, + "loss": 0.4273, + "step": 229 + }, + { + "epoch": 0.19371139809096014, + "grad_norm": 0.30847159028053284, + "learning_rate": 6.442577030812326e-06, + "loss": 0.4364, + "step": 230 + }, + { + "epoch": 0.19455362156092082, + "grad_norm": 0.24873819947242737, + "learning_rate": 6.470588235294119e-06, + "loss": 0.4267, + "step": 231 + }, + { + "epoch": 0.19539584503088153, + "grad_norm": 0.30360686779022217, + "learning_rate": 6.498599439775911e-06, + "loss": 0.4096, + "step": 232 + }, + { + "epoch": 0.19623806850084222, + "grad_norm": 0.29067298769950867, + "learning_rate": 6.526610644257703e-06, + "loss": 0.4656, + "step": 233 + }, + { + "epoch": 0.19708029197080293, + "grad_norm": 0.280111700296402, + "learning_rate": 6.5546218487394966e-06, + "loss": 0.4368, + "step": 234 + }, + { + "epoch": 0.1979225154407636, + "grad_norm": 0.26713109016418457, + "learning_rate": 6.582633053221289e-06, + "loss": 0.4215, + "step": 235 + }, + { + "epoch": 0.19876473891072433, + "grad_norm": 0.2671687602996826, + "learning_rate": 6.610644257703082e-06, + "loss": 0.4087, + "step": 236 + }, + { + "epoch": 0.199606962380685, + "grad_norm": 0.2896401584148407, + "learning_rate": 6.638655462184874e-06, + "loss": 0.4387, + "step": 237 + }, + { + "epoch": 0.2004491858506457, + "grad_norm": 0.24639546871185303, + "learning_rate": 6.666666666666667e-06, + "loss": 0.4213, + "step": 238 + }, + { + "epoch": 0.2012914093206064, + "grad_norm": 0.29163262248039246, + "learning_rate": 6.69467787114846e-06, + "loss": 0.4198, + "step": 239 + }, + { + "epoch": 0.2021336327905671, + "grad_norm": 0.26881664991378784, + "learning_rate": 6.722689075630253e-06, + "loss": 0.4308, + "step": 240 + }, + { + "epoch": 0.2029758562605278, + "grad_norm": 0.26591241359710693, + "learning_rate": 6.750700280112046e-06, + "loss": 0.4266, + "step": 241 + }, + { + "epoch": 0.2038180797304885, + "grad_norm": 0.29882800579071045, + "learning_rate": 6.7787114845938384e-06, + "loss": 0.4246, + "step": 242 + }, + { + "epoch": 0.20466030320044917, + "grad_norm": 0.28500762581825256, + "learning_rate": 6.806722689075631e-06, + "loss": 0.4616, + "step": 243 + }, + { + "epoch": 0.20550252667040989, + "grad_norm": 0.3008909523487091, + "learning_rate": 6.834733893557424e-06, + "loss": 0.4245, + "step": 244 + }, + { + "epoch": 0.20634475014037057, + "grad_norm": 0.2836599349975586, + "learning_rate": 6.862745098039216e-06, + "loss": 0.418, + "step": 245 + }, + { + "epoch": 0.20718697361033128, + "grad_norm": 0.27612629532814026, + "learning_rate": 6.8907563025210085e-06, + "loss": 0.4347, + "step": 246 + }, + { + "epoch": 0.20802919708029197, + "grad_norm": 0.283955842256546, + "learning_rate": 6.918767507002802e-06, + "loss": 0.4455, + "step": 247 + }, + { + "epoch": 0.20887142055025268, + "grad_norm": 0.2872370183467865, + "learning_rate": 6.946778711484594e-06, + "loss": 0.4213, + "step": 248 + }, + { + "epoch": 0.20971364402021336, + "grad_norm": 0.2736302316188812, + "learning_rate": 6.974789915966387e-06, + "loss": 0.4213, + "step": 249 + }, + { + "epoch": 0.21055586749017405, + "grad_norm": 0.2974297106266022, + "learning_rate": 7.0028011204481795e-06, + "loss": 0.4053, + "step": 250 + }, + { + "epoch": 0.21139809096013476, + "grad_norm": 0.33730605244636536, + "learning_rate": 7.030812324929972e-06, + "loss": 0.4246, + "step": 251 + }, + { + "epoch": 0.21224031443009544, + "grad_norm": 0.2622481882572174, + "learning_rate": 7.058823529411766e-06, + "loss": 0.4202, + "step": 252 + }, + { + "epoch": 0.21308253790005616, + "grad_norm": 0.28689515590667725, + "learning_rate": 7.086834733893558e-06, + "loss": 0.4252, + "step": 253 + }, + { + "epoch": 0.21392476137001684, + "grad_norm": 0.2937809228897095, + "learning_rate": 7.114845938375351e-06, + "loss": 0.4495, + "step": 254 + }, + { + "epoch": 0.21476698483997755, + "grad_norm": 0.2611549198627472, + "learning_rate": 7.1428571428571436e-06, + "loss": 0.4294, + "step": 255 + }, + { + "epoch": 0.21560920830993824, + "grad_norm": 0.24436011910438538, + "learning_rate": 7.170868347338936e-06, + "loss": 0.4318, + "step": 256 + }, + { + "epoch": 0.21645143177989892, + "grad_norm": 0.2528916895389557, + "learning_rate": 7.198879551820729e-06, + "loss": 0.4309, + "step": 257 + }, + { + "epoch": 0.21729365524985964, + "grad_norm": 0.25828465819358826, + "learning_rate": 7.226890756302521e-06, + "loss": 0.4122, + "step": 258 + }, + { + "epoch": 0.21813587871982032, + "grad_norm": 0.24670042097568512, + "learning_rate": 7.2549019607843145e-06, + "loss": 0.4197, + "step": 259 + }, + { + "epoch": 0.21897810218978103, + "grad_norm": 0.29221484065055847, + "learning_rate": 7.282913165266107e-06, + "loss": 0.4158, + "step": 260 + }, + { + "epoch": 0.21982032565974172, + "grad_norm": 0.2650546133518219, + "learning_rate": 7.310924369747899e-06, + "loss": 0.4098, + "step": 261 + }, + { + "epoch": 0.2206625491297024, + "grad_norm": 0.27381178736686707, + "learning_rate": 7.338935574229692e-06, + "loss": 0.4165, + "step": 262 + }, + { + "epoch": 0.2215047725996631, + "grad_norm": 0.2621234357357025, + "learning_rate": 7.3669467787114854e-06, + "loss": 0.4566, + "step": 263 + }, + { + "epoch": 0.2223469960696238, + "grad_norm": 0.27825459837913513, + "learning_rate": 7.394957983193279e-06, + "loss": 0.4297, + "step": 264 + }, + { + "epoch": 0.2231892195395845, + "grad_norm": 0.27097970247268677, + "learning_rate": 7.422969187675071e-06, + "loss": 0.4318, + "step": 265 + }, + { + "epoch": 0.2240314430095452, + "grad_norm": 0.29523909091949463, + "learning_rate": 7.450980392156863e-06, + "loss": 0.4327, + "step": 266 + }, + { + "epoch": 0.2248736664795059, + "grad_norm": 0.3030112683773041, + "learning_rate": 7.478991596638656e-06, + "loss": 0.4317, + "step": 267 + }, + { + "epoch": 0.2257158899494666, + "grad_norm": 0.25813236832618713, + "learning_rate": 7.507002801120449e-06, + "loss": 0.4137, + "step": 268 + }, + { + "epoch": 0.22655811341942728, + "grad_norm": 0.2950754761695862, + "learning_rate": 7.535014005602241e-06, + "loss": 0.4026, + "step": 269 + }, + { + "epoch": 0.227400336889388, + "grad_norm": 0.2944609522819519, + "learning_rate": 7.563025210084034e-06, + "loss": 0.4171, + "step": 270 + }, + { + "epoch": 0.22824256035934867, + "grad_norm": 0.25540435314178467, + "learning_rate": 7.5910364145658265e-06, + "loss": 0.4385, + "step": 271 + }, + { + "epoch": 0.22908478382930939, + "grad_norm": 0.28693222999572754, + "learning_rate": 7.61904761904762e-06, + "loss": 0.4317, + "step": 272 + }, + { + "epoch": 0.22992700729927007, + "grad_norm": 0.32201504707336426, + "learning_rate": 7.647058823529411e-06, + "loss": 0.4383, + "step": 273 + }, + { + "epoch": 0.23076923076923078, + "grad_norm": 0.2618683874607086, + "learning_rate": 7.675070028011205e-06, + "loss": 0.3924, + "step": 274 + }, + { + "epoch": 0.23161145423919147, + "grad_norm": 0.2778761684894562, + "learning_rate": 7.703081232492997e-06, + "loss": 0.4213, + "step": 275 + }, + { + "epoch": 0.23245367770915215, + "grad_norm": 0.28987208008766174, + "learning_rate": 7.731092436974791e-06, + "loss": 0.4325, + "step": 276 + }, + { + "epoch": 0.23329590117911286, + "grad_norm": 0.28098398447036743, + "learning_rate": 7.759103641456584e-06, + "loss": 0.4246, + "step": 277 + }, + { + "epoch": 0.23413812464907355, + "grad_norm": 0.29261812567710876, + "learning_rate": 7.787114845938376e-06, + "loss": 0.4511, + "step": 278 + }, + { + "epoch": 0.23498034811903426, + "grad_norm": 0.2923807203769684, + "learning_rate": 7.815126050420168e-06, + "loss": 0.4305, + "step": 279 + }, + { + "epoch": 0.23582257158899494, + "grad_norm": 0.2776165008544922, + "learning_rate": 7.84313725490196e-06, + "loss": 0.4071, + "step": 280 + }, + { + "epoch": 0.23666479505895563, + "grad_norm": 0.3058721423149109, + "learning_rate": 7.871148459383755e-06, + "loss": 0.4349, + "step": 281 + }, + { + "epoch": 0.23750701852891634, + "grad_norm": 0.24573291838169098, + "learning_rate": 7.899159663865547e-06, + "loss": 0.4177, + "step": 282 + }, + { + "epoch": 0.23834924199887703, + "grad_norm": 0.29806849360466003, + "learning_rate": 7.92717086834734e-06, + "loss": 0.433, + "step": 283 + }, + { + "epoch": 0.23919146546883774, + "grad_norm": 0.2590548098087311, + "learning_rate": 7.955182072829132e-06, + "loss": 0.4082, + "step": 284 + }, + { + "epoch": 0.24003368893879842, + "grad_norm": 0.31977927684783936, + "learning_rate": 7.983193277310926e-06, + "loss": 0.4477, + "step": 285 + }, + { + "epoch": 0.24087591240875914, + "grad_norm": 0.2705984115600586, + "learning_rate": 8.011204481792718e-06, + "loss": 0.4006, + "step": 286 + }, + { + "epoch": 0.24171813587871982, + "grad_norm": 0.2714398205280304, + "learning_rate": 8.03921568627451e-06, + "loss": 0.4317, + "step": 287 + }, + { + "epoch": 0.2425603593486805, + "grad_norm": 0.27493196725845337, + "learning_rate": 8.067226890756303e-06, + "loss": 0.429, + "step": 288 + }, + { + "epoch": 0.24340258281864122, + "grad_norm": 0.28182482719421387, + "learning_rate": 8.095238095238097e-06, + "loss": 0.3974, + "step": 289 + }, + { + "epoch": 0.2442448062886019, + "grad_norm": 0.26035940647125244, + "learning_rate": 8.123249299719889e-06, + "loss": 0.4025, + "step": 290 + }, + { + "epoch": 0.2450870297585626, + "grad_norm": 0.2671283781528473, + "learning_rate": 8.151260504201681e-06, + "loss": 0.4375, + "step": 291 + }, + { + "epoch": 0.2459292532285233, + "grad_norm": 0.24599888920783997, + "learning_rate": 8.179271708683473e-06, + "loss": 0.3932, + "step": 292 + }, + { + "epoch": 0.246771476698484, + "grad_norm": 0.26801490783691406, + "learning_rate": 8.207282913165266e-06, + "loss": 0.4219, + "step": 293 + }, + { + "epoch": 0.2476137001684447, + "grad_norm": 0.2903979420661926, + "learning_rate": 8.23529411764706e-06, + "loss": 0.4306, + "step": 294 + }, + { + "epoch": 0.24845592363840538, + "grad_norm": 0.26690545678138733, + "learning_rate": 8.263305322128852e-06, + "loss": 0.4009, + "step": 295 + }, + { + "epoch": 0.2492981471083661, + "grad_norm": 0.2960893213748932, + "learning_rate": 8.291316526610646e-06, + "loss": 0.4198, + "step": 296 + }, + { + "epoch": 0.2501403705783268, + "grad_norm": 0.24158838391304016, + "learning_rate": 8.319327731092438e-06, + "loss": 0.4145, + "step": 297 + }, + { + "epoch": 0.2509825940482875, + "grad_norm": 0.26518386602401733, + "learning_rate": 8.34733893557423e-06, + "loss": 0.4194, + "step": 298 + }, + { + "epoch": 0.2518248175182482, + "grad_norm": 0.2885102927684784, + "learning_rate": 8.375350140056023e-06, + "loss": 0.428, + "step": 299 + }, + { + "epoch": 0.25266704098820886, + "grad_norm": 0.24481067061424255, + "learning_rate": 8.403361344537815e-06, + "loss": 0.4162, + "step": 300 + }, + { + "epoch": 0.25350926445816957, + "grad_norm": 0.2774904668331146, + "learning_rate": 8.43137254901961e-06, + "loss": 0.4359, + "step": 301 + }, + { + "epoch": 0.2543514879281303, + "grad_norm": 0.25240957736968994, + "learning_rate": 8.459383753501402e-06, + "loss": 0.3918, + "step": 302 + }, + { + "epoch": 0.25519371139809094, + "grad_norm": 0.259308397769928, + "learning_rate": 8.487394957983194e-06, + "loss": 0.4297, + "step": 303 + }, + { + "epoch": 0.25603593486805165, + "grad_norm": 0.25079861283302307, + "learning_rate": 8.515406162464986e-06, + "loss": 0.4217, + "step": 304 + }, + { + "epoch": 0.25687815833801236, + "grad_norm": 0.2630213499069214, + "learning_rate": 8.543417366946779e-06, + "loss": 0.4308, + "step": 305 + }, + { + "epoch": 0.2577203818079731, + "grad_norm": 0.24344508349895477, + "learning_rate": 8.571428571428571e-06, + "loss": 0.3946, + "step": 306 + }, + { + "epoch": 0.25856260527793373, + "grad_norm": 0.28903988003730774, + "learning_rate": 8.599439775910365e-06, + "loss": 0.4279, + "step": 307 + }, + { + "epoch": 0.25940482874789444, + "grad_norm": 0.2642119526863098, + "learning_rate": 8.627450980392157e-06, + "loss": 0.4097, + "step": 308 + }, + { + "epoch": 0.26024705221785516, + "grad_norm": 0.2755482494831085, + "learning_rate": 8.655462184873951e-06, + "loss": 0.4129, + "step": 309 + }, + { + "epoch": 0.2610892756878158, + "grad_norm": 0.29126107692718506, + "learning_rate": 8.683473389355744e-06, + "loss": 0.4117, + "step": 310 + }, + { + "epoch": 0.2619314991577765, + "grad_norm": 0.25518178939819336, + "learning_rate": 8.711484593837536e-06, + "loss": 0.4167, + "step": 311 + }, + { + "epoch": 0.26277372262773724, + "grad_norm": 0.27598628401756287, + "learning_rate": 8.739495798319328e-06, + "loss": 0.4351, + "step": 312 + }, + { + "epoch": 0.2636159460976979, + "grad_norm": 0.2577887773513794, + "learning_rate": 8.76750700280112e-06, + "loss": 0.4299, + "step": 313 + }, + { + "epoch": 0.2644581695676586, + "grad_norm": 0.2999492883682251, + "learning_rate": 8.795518207282914e-06, + "loss": 0.4389, + "step": 314 + }, + { + "epoch": 0.2653003930376193, + "grad_norm": 0.27518129348754883, + "learning_rate": 8.823529411764707e-06, + "loss": 0.4036, + "step": 315 + }, + { + "epoch": 0.26614261650758003, + "grad_norm": 0.28360292315483093, + "learning_rate": 8.851540616246499e-06, + "loss": 0.4355, + "step": 316 + }, + { + "epoch": 0.2669848399775407, + "grad_norm": 0.27708709239959717, + "learning_rate": 8.879551820728291e-06, + "loss": 0.4011, + "step": 317 + }, + { + "epoch": 0.2678270634475014, + "grad_norm": 0.27756384015083313, + "learning_rate": 8.907563025210085e-06, + "loss": 0.4207, + "step": 318 + }, + { + "epoch": 0.2686692869174621, + "grad_norm": 0.28298452496528625, + "learning_rate": 8.935574229691878e-06, + "loss": 0.4381, + "step": 319 + }, + { + "epoch": 0.26951151038742277, + "grad_norm": 0.2583518326282501, + "learning_rate": 8.96358543417367e-06, + "loss": 0.4043, + "step": 320 + }, + { + "epoch": 0.2703537338573835, + "grad_norm": 0.27131637930870056, + "learning_rate": 8.991596638655462e-06, + "loss": 0.4223, + "step": 321 + }, + { + "epoch": 0.2711959573273442, + "grad_norm": 0.2701607644557953, + "learning_rate": 9.019607843137256e-06, + "loss": 0.4179, + "step": 322 + }, + { + "epoch": 0.2720381807973049, + "grad_norm": 0.2683499753475189, + "learning_rate": 9.047619047619049e-06, + "loss": 0.3957, + "step": 323 + }, + { + "epoch": 0.27288040426726556, + "grad_norm": 0.25922417640686035, + "learning_rate": 9.075630252100841e-06, + "loss": 0.4058, + "step": 324 + }, + { + "epoch": 0.2737226277372263, + "grad_norm": 0.2564760744571686, + "learning_rate": 9.103641456582633e-06, + "loss": 0.4108, + "step": 325 + }, + { + "epoch": 0.274564851207187, + "grad_norm": 0.26044026017189026, + "learning_rate": 9.131652661064426e-06, + "loss": 0.4112, + "step": 326 + }, + { + "epoch": 0.27540707467714765, + "grad_norm": 0.272298127412796, + "learning_rate": 9.15966386554622e-06, + "loss": 0.4191, + "step": 327 + }, + { + "epoch": 0.27624929814710836, + "grad_norm": 0.2655772268772125, + "learning_rate": 9.187675070028012e-06, + "loss": 0.4091, + "step": 328 + }, + { + "epoch": 0.27709152161706907, + "grad_norm": 0.26740503311157227, + "learning_rate": 9.215686274509804e-06, + "loss": 0.3991, + "step": 329 + }, + { + "epoch": 0.2779337450870298, + "grad_norm": 0.28810393810272217, + "learning_rate": 9.243697478991598e-06, + "loss": 0.4245, + "step": 330 + }, + { + "epoch": 0.27877596855699044, + "grad_norm": 0.29594841599464417, + "learning_rate": 9.27170868347339e-06, + "loss": 0.4275, + "step": 331 + }, + { + "epoch": 0.27961819202695115, + "grad_norm": 0.30016207695007324, + "learning_rate": 9.299719887955183e-06, + "loss": 0.3934, + "step": 332 + }, + { + "epoch": 0.28046041549691186, + "grad_norm": 0.25892147421836853, + "learning_rate": 9.327731092436975e-06, + "loss": 0.4425, + "step": 333 + }, + { + "epoch": 0.2813026389668725, + "grad_norm": 0.305174320936203, + "learning_rate": 9.355742296918767e-06, + "loss": 0.429, + "step": 334 + }, + { + "epoch": 0.28214486243683323, + "grad_norm": 0.29193490743637085, + "learning_rate": 9.383753501400561e-06, + "loss": 0.4226, + "step": 335 + }, + { + "epoch": 0.28298708590679394, + "grad_norm": 0.3096065819263458, + "learning_rate": 9.411764705882354e-06, + "loss": 0.4501, + "step": 336 + }, + { + "epoch": 0.28382930937675466, + "grad_norm": 0.28678637742996216, + "learning_rate": 9.439775910364146e-06, + "loss": 0.424, + "step": 337 + }, + { + "epoch": 0.2846715328467153, + "grad_norm": 0.2797674536705017, + "learning_rate": 9.467787114845938e-06, + "loss": 0.3909, + "step": 338 + }, + { + "epoch": 0.285513756316676, + "grad_norm": 0.30397042632102966, + "learning_rate": 9.49579831932773e-06, + "loss": 0.4042, + "step": 339 + }, + { + "epoch": 0.28635597978663674, + "grad_norm": 0.28562748432159424, + "learning_rate": 9.523809523809525e-06, + "loss": 0.4316, + "step": 340 + }, + { + "epoch": 0.2871982032565974, + "grad_norm": 0.27482637763023376, + "learning_rate": 9.551820728291317e-06, + "loss": 0.415, + "step": 341 + }, + { + "epoch": 0.2880404267265581, + "grad_norm": 0.31606414914131165, + "learning_rate": 9.579831932773111e-06, + "loss": 0.4069, + "step": 342 + }, + { + "epoch": 0.2888826501965188, + "grad_norm": 0.348613440990448, + "learning_rate": 9.607843137254903e-06, + "loss": 0.4326, + "step": 343 + }, + { + "epoch": 0.28972487366647953, + "grad_norm": 0.2946453094482422, + "learning_rate": 9.635854341736696e-06, + "loss": 0.4271, + "step": 344 + }, + { + "epoch": 0.2905670971364402, + "grad_norm": 0.2972312867641449, + "learning_rate": 9.663865546218488e-06, + "loss": 0.4226, + "step": 345 + }, + { + "epoch": 0.2914093206064009, + "grad_norm": 0.3148708641529083, + "learning_rate": 9.69187675070028e-06, + "loss": 0.3889, + "step": 346 + }, + { + "epoch": 0.2922515440763616, + "grad_norm": 0.3089003562927246, + "learning_rate": 9.719887955182074e-06, + "loss": 0.3934, + "step": 347 + }, + { + "epoch": 0.29309376754632227, + "grad_norm": 0.29248514771461487, + "learning_rate": 9.747899159663867e-06, + "loss": 0.4003, + "step": 348 + }, + { + "epoch": 0.293935991016283, + "grad_norm": 0.3220504820346832, + "learning_rate": 9.775910364145659e-06, + "loss": 0.4282, + "step": 349 + }, + { + "epoch": 0.2947782144862437, + "grad_norm": 0.3207250237464905, + "learning_rate": 9.803921568627451e-06, + "loss": 0.4099, + "step": 350 + }, + { + "epoch": 0.2956204379562044, + "grad_norm": 0.31499800086021423, + "learning_rate": 9.831932773109244e-06, + "loss": 0.3907, + "step": 351 + }, + { + "epoch": 0.29646266142616506, + "grad_norm": 0.2804590165615082, + "learning_rate": 9.859943977591038e-06, + "loss": 0.3929, + "step": 352 + }, + { + "epoch": 0.2973048848961258, + "grad_norm": 0.3583865165710449, + "learning_rate": 9.88795518207283e-06, + "loss": 0.4331, + "step": 353 + }, + { + "epoch": 0.2981471083660865, + "grad_norm": 0.2689453065395355, + "learning_rate": 9.915966386554622e-06, + "loss": 0.4261, + "step": 354 + }, + { + "epoch": 0.29898933183604715, + "grad_norm": 0.24902856349945068, + "learning_rate": 9.943977591036416e-06, + "loss": 0.3995, + "step": 355 + }, + { + "epoch": 0.29983155530600786, + "grad_norm": 0.28560537099838257, + "learning_rate": 9.971988795518209e-06, + "loss": 0.413, + "step": 356 + }, + { + "epoch": 0.30067377877596857, + "grad_norm": 0.2845311164855957, + "learning_rate": 1e-05, + "loss": 0.4229, + "step": 357 + }, + { + "epoch": 0.3015160022459292, + "grad_norm": 0.2546791732311249, + "learning_rate": 9.99999759644146e-06, + "loss": 0.4159, + "step": 358 + }, + { + "epoch": 0.30235822571588994, + "grad_norm": 0.2738746702671051, + "learning_rate": 9.999990385768144e-06, + "loss": 0.4301, + "step": 359 + }, + { + "epoch": 0.30320044918585065, + "grad_norm": 0.30176639556884766, + "learning_rate": 9.999978367986988e-06, + "loss": 0.3877, + "step": 360 + }, + { + "epoch": 0.30404267265581136, + "grad_norm": 0.26767653226852417, + "learning_rate": 9.999961543109546e-06, + "loss": 0.4062, + "step": 361 + }, + { + "epoch": 0.304884896125772, + "grad_norm": 0.30675870180130005, + "learning_rate": 9.999939911151992e-06, + "loss": 0.4241, + "step": 362 + }, + { + "epoch": 0.30572711959573273, + "grad_norm": 0.2657410204410553, + "learning_rate": 9.999913472135126e-06, + "loss": 0.4107, + "step": 363 + }, + { + "epoch": 0.30656934306569344, + "grad_norm": 0.28486159443855286, + "learning_rate": 9.999882226084366e-06, + "loss": 0.4193, + "step": 364 + }, + { + "epoch": 0.3074115665356541, + "grad_norm": 0.271361768245697, + "learning_rate": 9.999846173029752e-06, + "loss": 0.4184, + "step": 365 + }, + { + "epoch": 0.3082537900056148, + "grad_norm": 0.26552683115005493, + "learning_rate": 9.999805313005946e-06, + "loss": 0.4155, + "step": 366 + }, + { + "epoch": 0.3090960134755755, + "grad_norm": 0.2777312994003296, + "learning_rate": 9.999759646052234e-06, + "loss": 0.4078, + "step": 367 + }, + { + "epoch": 0.30993823694553624, + "grad_norm": 0.26210469007492065, + "learning_rate": 9.99970917221252e-06, + "loss": 0.4068, + "step": 368 + }, + { + "epoch": 0.3107804604154969, + "grad_norm": 0.2592101991176605, + "learning_rate": 9.99965389153533e-06, + "loss": 0.4084, + "step": 369 + }, + { + "epoch": 0.3116226838854576, + "grad_norm": 0.26666998863220215, + "learning_rate": 9.999593804073812e-06, + "loss": 0.4118, + "step": 370 + }, + { + "epoch": 0.3124649073554183, + "grad_norm": 0.28305909037590027, + "learning_rate": 9.999528909885738e-06, + "loss": 0.3965, + "step": 371 + }, + { + "epoch": 0.313307130825379, + "grad_norm": 0.25685542821884155, + "learning_rate": 9.999459209033495e-06, + "loss": 0.4255, + "step": 372 + }, + { + "epoch": 0.3141493542953397, + "grad_norm": 0.2907734811306, + "learning_rate": 9.999384701584098e-06, + "loss": 0.4074, + "step": 373 + }, + { + "epoch": 0.3149915777653004, + "grad_norm": 0.27968016266822815, + "learning_rate": 9.99930538760918e-06, + "loss": 0.4105, + "step": 374 + }, + { + "epoch": 0.3158338012352611, + "grad_norm": 0.30917319655418396, + "learning_rate": 9.999221267184993e-06, + "loss": 0.3744, + "step": 375 + }, + { + "epoch": 0.31667602470522177, + "grad_norm": 0.3107389807701111, + "learning_rate": 9.999132340392416e-06, + "loss": 0.4255, + "step": 376 + }, + { + "epoch": 0.3175182481751825, + "grad_norm": 0.3284875154495239, + "learning_rate": 9.999038607316942e-06, + "loss": 0.4103, + "step": 377 + }, + { + "epoch": 0.3183604716451432, + "grad_norm": 0.3265838921070099, + "learning_rate": 9.998940068048688e-06, + "loss": 0.4163, + "step": 378 + }, + { + "epoch": 0.31920269511510385, + "grad_norm": 0.34593111276626587, + "learning_rate": 9.998836722682397e-06, + "loss": 0.4144, + "step": 379 + }, + { + "epoch": 0.32004491858506456, + "grad_norm": 0.309358686208725, + "learning_rate": 9.998728571317422e-06, + "loss": 0.4178, + "step": 380 + }, + { + "epoch": 0.3208871420550253, + "grad_norm": 0.2778068482875824, + "learning_rate": 9.998615614057743e-06, + "loss": 0.4281, + "step": 381 + }, + { + "epoch": 0.321729365524986, + "grad_norm": 0.40132778882980347, + "learning_rate": 9.998497851011963e-06, + "loss": 0.4237, + "step": 382 + }, + { + "epoch": 0.32257158899494665, + "grad_norm": 0.2529246509075165, + "learning_rate": 9.998375282293298e-06, + "loss": 0.415, + "step": 383 + }, + { + "epoch": 0.32341381246490736, + "grad_norm": 0.3505932092666626, + "learning_rate": 9.998247908019594e-06, + "loss": 0.4394, + "step": 384 + }, + { + "epoch": 0.32425603593486807, + "grad_norm": 0.3261929154396057, + "learning_rate": 9.998115728313305e-06, + "loss": 0.3895, + "step": 385 + }, + { + "epoch": 0.3250982594048287, + "grad_norm": 0.37192705273628235, + "learning_rate": 9.997978743301516e-06, + "loss": 0.4179, + "step": 386 + }, + { + "epoch": 0.32594048287478944, + "grad_norm": 0.25225892663002014, + "learning_rate": 9.997836953115927e-06, + "loss": 0.417, + "step": 387 + }, + { + "epoch": 0.32678270634475015, + "grad_norm": 0.34606847167015076, + "learning_rate": 9.997690357892857e-06, + "loss": 0.4351, + "step": 388 + }, + { + "epoch": 0.32762492981471086, + "grad_norm": 0.308843731880188, + "learning_rate": 9.997538957773248e-06, + "loss": 0.4046, + "step": 389 + }, + { + "epoch": 0.3284671532846715, + "grad_norm": 0.2955940365791321, + "learning_rate": 9.997382752902658e-06, + "loss": 0.3789, + "step": 390 + }, + { + "epoch": 0.32930937675463223, + "grad_norm": 0.303615003824234, + "learning_rate": 9.997221743431267e-06, + "loss": 0.4044, + "step": 391 + }, + { + "epoch": 0.33015160022459294, + "grad_norm": 0.30333343148231506, + "learning_rate": 9.997055929513873e-06, + "loss": 0.4393, + "step": 392 + }, + { + "epoch": 0.3309938236945536, + "grad_norm": 0.29766735434532166, + "learning_rate": 9.996885311309892e-06, + "loss": 0.422, + "step": 393 + }, + { + "epoch": 0.3318360471645143, + "grad_norm": 0.28912481665611267, + "learning_rate": 9.996709888983362e-06, + "loss": 0.411, + "step": 394 + }, + { + "epoch": 0.332678270634475, + "grad_norm": 0.30706316232681274, + "learning_rate": 9.99652966270294e-06, + "loss": 0.4136, + "step": 395 + }, + { + "epoch": 0.3335204941044357, + "grad_norm": 0.31515517830848694, + "learning_rate": 9.996344632641895e-06, + "loss": 0.4105, + "step": 396 + }, + { + "epoch": 0.3343627175743964, + "grad_norm": 0.2824975252151489, + "learning_rate": 9.996154798978122e-06, + "loss": 0.409, + "step": 397 + }, + { + "epoch": 0.3352049410443571, + "grad_norm": 0.2654586434364319, + "learning_rate": 9.995960161894132e-06, + "loss": 0.4352, + "step": 398 + }, + { + "epoch": 0.3360471645143178, + "grad_norm": 0.3069636821746826, + "learning_rate": 9.995760721577053e-06, + "loss": 0.4292, + "step": 399 + }, + { + "epoch": 0.3368893879842785, + "grad_norm": 0.2730976641178131, + "learning_rate": 9.99555647821863e-06, + "loss": 0.4101, + "step": 400 + }, + { + "epoch": 0.3377316114542392, + "grad_norm": 0.26647499203681946, + "learning_rate": 9.99534743201523e-06, + "loss": 0.3737, + "step": 401 + }, + { + "epoch": 0.3385738349241999, + "grad_norm": 0.2886996269226074, + "learning_rate": 9.995133583167833e-06, + "loss": 0.4339, + "step": 402 + }, + { + "epoch": 0.33941605839416056, + "grad_norm": 0.2916286587715149, + "learning_rate": 9.99491493188204e-06, + "loss": 0.4101, + "step": 403 + }, + { + "epoch": 0.34025828186412127, + "grad_norm": 0.32273969054222107, + "learning_rate": 9.994691478368067e-06, + "loss": 0.3825, + "step": 404 + }, + { + "epoch": 0.341100505334082, + "grad_norm": 0.26445654034614563, + "learning_rate": 9.994463222840748e-06, + "loss": 0.4172, + "step": 405 + }, + { + "epoch": 0.3419427288040427, + "grad_norm": 0.31402212381362915, + "learning_rate": 9.994230165519529e-06, + "loss": 0.4256, + "step": 406 + }, + { + "epoch": 0.34278495227400335, + "grad_norm": 0.2932952344417572, + "learning_rate": 9.993992306628481e-06, + "loss": 0.4206, + "step": 407 + }, + { + "epoch": 0.34362717574396406, + "grad_norm": 0.2703147232532501, + "learning_rate": 9.993749646396286e-06, + "loss": 0.4054, + "step": 408 + }, + { + "epoch": 0.3444693992139248, + "grad_norm": 0.31178292632102966, + "learning_rate": 9.993502185056244e-06, + "loss": 0.4237, + "step": 409 + }, + { + "epoch": 0.34531162268388543, + "grad_norm": 0.2774251103401184, + "learning_rate": 9.993249922846269e-06, + "loss": 0.4118, + "step": 410 + }, + { + "epoch": 0.34615384615384615, + "grad_norm": 0.2867864668369293, + "learning_rate": 9.992992860008893e-06, + "loss": 0.4218, + "step": 411 + }, + { + "epoch": 0.34699606962380686, + "grad_norm": 0.2802028954029083, + "learning_rate": 9.99273099679126e-06, + "loss": 0.4167, + "step": 412 + }, + { + "epoch": 0.34783829309376757, + "grad_norm": 0.24620549380779266, + "learning_rate": 9.992464333445134e-06, + "loss": 0.4018, + "step": 413 + }, + { + "epoch": 0.3486805165637282, + "grad_norm": 0.2835536301136017, + "learning_rate": 9.99219287022689e-06, + "loss": 0.4086, + "step": 414 + }, + { + "epoch": 0.34952274003368894, + "grad_norm": 0.29051610827445984, + "learning_rate": 9.99191660739752e-06, + "loss": 0.3848, + "step": 415 + }, + { + "epoch": 0.35036496350364965, + "grad_norm": 0.2779700756072998, + "learning_rate": 9.991635545222628e-06, + "loss": 0.4122, + "step": 416 + }, + { + "epoch": 0.3512071869736103, + "grad_norm": 0.25075340270996094, + "learning_rate": 9.991349683972435e-06, + "loss": 0.3967, + "step": 417 + }, + { + "epoch": 0.352049410443571, + "grad_norm": 0.26911744475364685, + "learning_rate": 9.991059023921773e-06, + "loss": 0.4128, + "step": 418 + }, + { + "epoch": 0.35289163391353173, + "grad_norm": 0.2757958471775055, + "learning_rate": 9.990763565350092e-06, + "loss": 0.4236, + "step": 419 + }, + { + "epoch": 0.35373385738349244, + "grad_norm": 0.27440062165260315, + "learning_rate": 9.990463308541452e-06, + "loss": 0.4243, + "step": 420 + }, + { + "epoch": 0.3545760808534531, + "grad_norm": 0.28325390815734863, + "learning_rate": 9.990158253784525e-06, + "loss": 0.4127, + "step": 421 + }, + { + "epoch": 0.3554183043234138, + "grad_norm": 0.25825008749961853, + "learning_rate": 9.989848401372602e-06, + "loss": 0.3925, + "step": 422 + }, + { + "epoch": 0.3562605277933745, + "grad_norm": 0.2866584062576294, + "learning_rate": 9.989533751603578e-06, + "loss": 0.4258, + "step": 423 + }, + { + "epoch": 0.3571027512633352, + "grad_norm": 0.31460416316986084, + "learning_rate": 9.989214304779965e-06, + "loss": 0.4357, + "step": 424 + }, + { + "epoch": 0.3579449747332959, + "grad_norm": 0.26211369037628174, + "learning_rate": 9.988890061208889e-06, + "loss": 0.398, + "step": 425 + }, + { + "epoch": 0.3587871982032566, + "grad_norm": 0.2859407365322113, + "learning_rate": 9.988561021202083e-06, + "loss": 0.4132, + "step": 426 + }, + { + "epoch": 0.3596294216732173, + "grad_norm": 0.28992462158203125, + "learning_rate": 9.988227185075897e-06, + "loss": 0.4053, + "step": 427 + }, + { + "epoch": 0.360471645143178, + "grad_norm": 0.3033839166164398, + "learning_rate": 9.987888553151285e-06, + "loss": 0.4287, + "step": 428 + }, + { + "epoch": 0.3613138686131387, + "grad_norm": 0.28602495789527893, + "learning_rate": 9.987545125753818e-06, + "loss": 0.4176, + "step": 429 + }, + { + "epoch": 0.3621560920830994, + "grad_norm": 0.313308984041214, + "learning_rate": 9.987196903213677e-06, + "loss": 0.404, + "step": 430 + }, + { + "epoch": 0.36299831555306006, + "grad_norm": 0.26802730560302734, + "learning_rate": 9.986843885865649e-06, + "loss": 0.4092, + "step": 431 + }, + { + "epoch": 0.36384053902302077, + "grad_norm": 0.2948782742023468, + "learning_rate": 9.986486074049131e-06, + "loss": 0.3985, + "step": 432 + }, + { + "epoch": 0.3646827624929815, + "grad_norm": 0.3537862300872803, + "learning_rate": 9.986123468108134e-06, + "loss": 0.413, + "step": 433 + }, + { + "epoch": 0.3655249859629422, + "grad_norm": 0.2874160706996918, + "learning_rate": 9.985756068391276e-06, + "loss": 0.4193, + "step": 434 + }, + { + "epoch": 0.36636720943290285, + "grad_norm": 0.45029184222221375, + "learning_rate": 9.985383875251783e-06, + "loss": 0.4117, + "step": 435 + }, + { + "epoch": 0.36720943290286356, + "grad_norm": 0.2880908250808716, + "learning_rate": 9.985006889047492e-06, + "loss": 0.4198, + "step": 436 + }, + { + "epoch": 0.3680516563728243, + "grad_norm": 0.3384968638420105, + "learning_rate": 9.984625110140844e-06, + "loss": 0.4295, + "step": 437 + }, + { + "epoch": 0.36889387984278493, + "grad_norm": 0.27980169653892517, + "learning_rate": 9.98423853889889e-06, + "loss": 0.4093, + "step": 438 + }, + { + "epoch": 0.36973610331274565, + "grad_norm": 0.3056717813014984, + "learning_rate": 9.983847175693291e-06, + "loss": 0.3861, + "step": 439 + }, + { + "epoch": 0.37057832678270636, + "grad_norm": 0.29720255732536316, + "learning_rate": 9.983451020900312e-06, + "loss": 0.4159, + "step": 440 + }, + { + "epoch": 0.371420550252667, + "grad_norm": 0.3048990070819855, + "learning_rate": 9.983050074900824e-06, + "loss": 0.4171, + "step": 441 + }, + { + "epoch": 0.3722627737226277, + "grad_norm": 0.338155061006546, + "learning_rate": 9.982644338080308e-06, + "loss": 0.4201, + "step": 442 + }, + { + "epoch": 0.37310499719258844, + "grad_norm": 0.3175585865974426, + "learning_rate": 9.982233810828846e-06, + "loss": 0.4204, + "step": 443 + }, + { + "epoch": 0.37394722066254915, + "grad_norm": 0.31735533475875854, + "learning_rate": 9.98181849354113e-06, + "loss": 0.4142, + "step": 444 + }, + { + "epoch": 0.3747894441325098, + "grad_norm": 0.33125582337379456, + "learning_rate": 9.98139838661646e-06, + "loss": 0.4166, + "step": 445 + }, + { + "epoch": 0.3756316676024705, + "grad_norm": 0.2836975157260895, + "learning_rate": 9.980973490458728e-06, + "loss": 0.4033, + "step": 446 + }, + { + "epoch": 0.37647389107243123, + "grad_norm": 0.303907573223114, + "learning_rate": 9.980543805476447e-06, + "loss": 0.4017, + "step": 447 + }, + { + "epoch": 0.3773161145423919, + "grad_norm": 0.3196185231208801, + "learning_rate": 9.980109332082722e-06, + "loss": 0.41, + "step": 448 + }, + { + "epoch": 0.3781583380123526, + "grad_norm": 0.3260578513145447, + "learning_rate": 9.979670070695265e-06, + "loss": 0.4247, + "step": 449 + }, + { + "epoch": 0.3790005614823133, + "grad_norm": 0.32000672817230225, + "learning_rate": 9.979226021736396e-06, + "loss": 0.4291, + "step": 450 + }, + { + "epoch": 0.379842784952274, + "grad_norm": 0.3371357321739197, + "learning_rate": 9.978777185633032e-06, + "loss": 0.3954, + "step": 451 + }, + { + "epoch": 0.3806850084222347, + "grad_norm": 0.2443530112504959, + "learning_rate": 9.978323562816693e-06, + "loss": 0.3982, + "step": 452 + }, + { + "epoch": 0.3815272318921954, + "grad_norm": 0.31013694405555725, + "learning_rate": 9.977865153723508e-06, + "loss": 0.4137, + "step": 453 + }, + { + "epoch": 0.3823694553621561, + "grad_norm": 0.31999990344047546, + "learning_rate": 9.977401958794194e-06, + "loss": 0.4103, + "step": 454 + }, + { + "epoch": 0.38321167883211676, + "grad_norm": 0.2706303298473358, + "learning_rate": 9.976933978474085e-06, + "loss": 0.4014, + "step": 455 + }, + { + "epoch": 0.3840539023020775, + "grad_norm": 0.27447131276130676, + "learning_rate": 9.976461213213104e-06, + "loss": 0.3868, + "step": 456 + }, + { + "epoch": 0.3848961257720382, + "grad_norm": 0.31807786226272583, + "learning_rate": 9.97598366346578e-06, + "loss": 0.3846, + "step": 457 + }, + { + "epoch": 0.3857383492419989, + "grad_norm": 0.271933376789093, + "learning_rate": 9.975501329691241e-06, + "loss": 0.4182, + "step": 458 + }, + { + "epoch": 0.38658057271195956, + "grad_norm": 0.2885846197605133, + "learning_rate": 9.975014212353212e-06, + "loss": 0.4128, + "step": 459 + }, + { + "epoch": 0.38742279618192027, + "grad_norm": 0.28178519010543823, + "learning_rate": 9.974522311920021e-06, + "loss": 0.4237, + "step": 460 + }, + { + "epoch": 0.388265019651881, + "grad_norm": 0.29402226209640503, + "learning_rate": 9.974025628864592e-06, + "loss": 0.3933, + "step": 461 + }, + { + "epoch": 0.38910724312184164, + "grad_norm": 0.26406511664390564, + "learning_rate": 9.973524163664447e-06, + "loss": 0.3735, + "step": 462 + }, + { + "epoch": 0.38994946659180235, + "grad_norm": 0.3236297369003296, + "learning_rate": 9.973017916801708e-06, + "loss": 0.4083, + "step": 463 + }, + { + "epoch": 0.39079169006176306, + "grad_norm": 0.2857525646686554, + "learning_rate": 9.972506888763092e-06, + "loss": 0.4027, + "step": 464 + }, + { + "epoch": 0.3916339135317238, + "grad_norm": 0.25883105397224426, + "learning_rate": 9.971991080039912e-06, + "loss": 0.4105, + "step": 465 + }, + { + "epoch": 0.39247613700168443, + "grad_norm": 0.2922370731830597, + "learning_rate": 9.971470491128077e-06, + "loss": 0.3932, + "step": 466 + }, + { + "epoch": 0.39331836047164515, + "grad_norm": 0.28805452585220337, + "learning_rate": 9.9709451225281e-06, + "loss": 0.4136, + "step": 467 + }, + { + "epoch": 0.39416058394160586, + "grad_norm": 0.26007330417633057, + "learning_rate": 9.970414974745077e-06, + "loss": 0.4072, + "step": 468 + }, + { + "epoch": 0.3950028074115665, + "grad_norm": 0.24989785254001617, + "learning_rate": 9.969880048288704e-06, + "loss": 0.4081, + "step": 469 + }, + { + "epoch": 0.3958450308815272, + "grad_norm": 0.26404762268066406, + "learning_rate": 9.969340343673277e-06, + "loss": 0.3937, + "step": 470 + }, + { + "epoch": 0.39668725435148794, + "grad_norm": 0.24274301528930664, + "learning_rate": 9.968795861417676e-06, + "loss": 0.417, + "step": 471 + }, + { + "epoch": 0.39752947782144865, + "grad_norm": 0.25516611337661743, + "learning_rate": 9.96824660204538e-06, + "loss": 0.4068, + "step": 472 + }, + { + "epoch": 0.3983717012914093, + "grad_norm": 0.24564239382743835, + "learning_rate": 9.96769256608446e-06, + "loss": 0.4197, + "step": 473 + }, + { + "epoch": 0.39921392476137, + "grad_norm": 0.2595030665397644, + "learning_rate": 9.967133754067581e-06, + "loss": 0.3948, + "step": 474 + }, + { + "epoch": 0.40005614823133073, + "grad_norm": 0.25603294372558594, + "learning_rate": 9.966570166531997e-06, + "loss": 0.4141, + "step": 475 + }, + { + "epoch": 0.4008983717012914, + "grad_norm": 0.2584262192249298, + "learning_rate": 9.966001804019552e-06, + "loss": 0.4257, + "step": 476 + }, + { + "epoch": 0.4017405951712521, + "grad_norm": 0.2369280606508255, + "learning_rate": 9.965428667076687e-06, + "loss": 0.4094, + "step": 477 + }, + { + "epoch": 0.4025828186412128, + "grad_norm": 0.2712903320789337, + "learning_rate": 9.964850756254426e-06, + "loss": 0.3783, + "step": 478 + }, + { + "epoch": 0.40342504211117347, + "grad_norm": 0.24917183816432953, + "learning_rate": 9.964268072108385e-06, + "loss": 0.3973, + "step": 479 + }, + { + "epoch": 0.4042672655811342, + "grad_norm": 0.2436331957578659, + "learning_rate": 9.963680615198774e-06, + "loss": 0.4184, + "step": 480 + }, + { + "epoch": 0.4051094890510949, + "grad_norm": 0.275935560464859, + "learning_rate": 9.963088386090386e-06, + "loss": 0.3807, + "step": 481 + }, + { + "epoch": 0.4059517125210556, + "grad_norm": 0.2412864714860916, + "learning_rate": 9.962491385352601e-06, + "loss": 0.4139, + "step": 482 + }, + { + "epoch": 0.40679393599101626, + "grad_norm": 0.26072415709495544, + "learning_rate": 9.961889613559396e-06, + "loss": 0.4119, + "step": 483 + }, + { + "epoch": 0.407636159460977, + "grad_norm": 0.27970102429389954, + "learning_rate": 9.961283071289323e-06, + "loss": 0.4231, + "step": 484 + }, + { + "epoch": 0.4084783829309377, + "grad_norm": 0.2597492039203644, + "learning_rate": 9.960671759125529e-06, + "loss": 0.418, + "step": 485 + }, + { + "epoch": 0.40932060640089835, + "grad_norm": 0.2641339898109436, + "learning_rate": 9.960055677655743e-06, + "loss": 0.3961, + "step": 486 + }, + { + "epoch": 0.41016282987085906, + "grad_norm": 0.2758226692676544, + "learning_rate": 9.959434827472278e-06, + "loss": 0.4075, + "step": 487 + }, + { + "epoch": 0.41100505334081977, + "grad_norm": 0.256102591753006, + "learning_rate": 9.958809209172038e-06, + "loss": 0.4084, + "step": 488 + }, + { + "epoch": 0.4118472768107805, + "grad_norm": 0.27487775683403015, + "learning_rate": 9.958178823356503e-06, + "loss": 0.3878, + "step": 489 + }, + { + "epoch": 0.41268950028074114, + "grad_norm": 0.24837318062782288, + "learning_rate": 9.957543670631743e-06, + "loss": 0.3978, + "step": 490 + }, + { + "epoch": 0.41353172375070185, + "grad_norm": 0.2772219777107239, + "learning_rate": 9.956903751608409e-06, + "loss": 0.4179, + "step": 491 + }, + { + "epoch": 0.41437394722066256, + "grad_norm": 0.30891284346580505, + "learning_rate": 9.956259066901733e-06, + "loss": 0.3918, + "step": 492 + }, + { + "epoch": 0.4152161706906232, + "grad_norm": 0.2519027888774872, + "learning_rate": 9.95560961713153e-06, + "loss": 0.4007, + "step": 493 + }, + { + "epoch": 0.41605839416058393, + "grad_norm": 0.31477248668670654, + "learning_rate": 9.954955402922195e-06, + "loss": 0.3806, + "step": 494 + }, + { + "epoch": 0.41690061763054465, + "grad_norm": 0.30688488483428955, + "learning_rate": 9.954296424902709e-06, + "loss": 0.402, + "step": 495 + }, + { + "epoch": 0.41774284110050536, + "grad_norm": 0.2900298833847046, + "learning_rate": 9.953632683706624e-06, + "loss": 0.4308, + "step": 496 + }, + { + "epoch": 0.418585064570466, + "grad_norm": 0.33165663480758667, + "learning_rate": 9.95296417997208e-06, + "loss": 0.4131, + "step": 497 + }, + { + "epoch": 0.4194272880404267, + "grad_norm": 0.254812628030777, + "learning_rate": 9.95229091434179e-06, + "loss": 0.3977, + "step": 498 + }, + { + "epoch": 0.42026951151038744, + "grad_norm": 0.3099961578845978, + "learning_rate": 9.95161288746305e-06, + "loss": 0.4235, + "step": 499 + }, + { + "epoch": 0.4211117349803481, + "grad_norm": 0.25541451573371887, + "learning_rate": 9.950930099987728e-06, + "loss": 0.4061, + "step": 500 + }, + { + "epoch": 0.4219539584503088, + "grad_norm": 0.2893903851509094, + "learning_rate": 9.950242552572272e-06, + "loss": 0.4148, + "step": 501 + }, + { + "epoch": 0.4227961819202695, + "grad_norm": 0.26284852623939514, + "learning_rate": 9.949550245877708e-06, + "loss": 0.3981, + "step": 502 + }, + { + "epoch": 0.42363840539023023, + "grad_norm": 0.27225029468536377, + "learning_rate": 9.948853180569635e-06, + "loss": 0.4128, + "step": 503 + }, + { + "epoch": 0.4244806288601909, + "grad_norm": 0.28634268045425415, + "learning_rate": 9.948151357318228e-06, + "loss": 0.4004, + "step": 504 + }, + { + "epoch": 0.4253228523301516, + "grad_norm": 0.28133031725883484, + "learning_rate": 9.947444776798235e-06, + "loss": 0.4123, + "step": 505 + }, + { + "epoch": 0.4261650758001123, + "grad_norm": 0.27984684705734253, + "learning_rate": 9.946733439688982e-06, + "loss": 0.3934, + "step": 506 + }, + { + "epoch": 0.42700729927007297, + "grad_norm": 0.2432078868150711, + "learning_rate": 9.946017346674362e-06, + "loss": 0.423, + "step": 507 + }, + { + "epoch": 0.4278495227400337, + "grad_norm": 0.28992959856987, + "learning_rate": 9.945296498442845e-06, + "loss": 0.3977, + "step": 508 + }, + { + "epoch": 0.4286917462099944, + "grad_norm": 0.2634131610393524, + "learning_rate": 9.944570895687471e-06, + "loss": 0.395, + "step": 509 + }, + { + "epoch": 0.4295339696799551, + "grad_norm": 0.255519300699234, + "learning_rate": 9.943840539105853e-06, + "loss": 0.3913, + "step": 510 + }, + { + "epoch": 0.43037619314991576, + "grad_norm": 0.2576368451118469, + "learning_rate": 9.943105429400171e-06, + "loss": 0.3904, + "step": 511 + }, + { + "epoch": 0.4312184166198765, + "grad_norm": 0.2746698558330536, + "learning_rate": 9.942365567277178e-06, + "loss": 0.3899, + "step": 512 + }, + { + "epoch": 0.4320606400898372, + "grad_norm": 0.2807859480381012, + "learning_rate": 9.941620953448195e-06, + "loss": 0.4123, + "step": 513 + }, + { + "epoch": 0.43290286355979785, + "grad_norm": 0.2983086407184601, + "learning_rate": 9.940871588629108e-06, + "loss": 0.4049, + "step": 514 + }, + { + "epoch": 0.43374508702975856, + "grad_norm": 0.2526918053627014, + "learning_rate": 9.940117473540377e-06, + "loss": 0.4206, + "step": 515 + }, + { + "epoch": 0.43458731049971927, + "grad_norm": 0.25227421522140503, + "learning_rate": 9.939358608907026e-06, + "loss": 0.4008, + "step": 516 + }, + { + "epoch": 0.43542953396968, + "grad_norm": 0.255201131105423, + "learning_rate": 9.938594995458644e-06, + "loss": 0.407, + "step": 517 + }, + { + "epoch": 0.43627175743964064, + "grad_norm": 0.2799823582172394, + "learning_rate": 9.937826633929388e-06, + "loss": 0.4022, + "step": 518 + }, + { + "epoch": 0.43711398090960135, + "grad_norm": 0.24459460377693176, + "learning_rate": 9.937053525057977e-06, + "loss": 0.4058, + "step": 519 + }, + { + "epoch": 0.43795620437956206, + "grad_norm": 0.26577839255332947, + "learning_rate": 9.936275669587697e-06, + "loss": 0.3902, + "step": 520 + }, + { + "epoch": 0.4387984278495227, + "grad_norm": 0.28700780868530273, + "learning_rate": 9.935493068266396e-06, + "loss": 0.4186, + "step": 521 + }, + { + "epoch": 0.43964065131948343, + "grad_norm": 0.2666192948818207, + "learning_rate": 9.934705721846487e-06, + "loss": 0.416, + "step": 522 + }, + { + "epoch": 0.44048287478944415, + "grad_norm": 0.2625783681869507, + "learning_rate": 9.933913631084942e-06, + "loss": 0.394, + "step": 523 + }, + { + "epoch": 0.4413250982594048, + "grad_norm": 0.24555350840091705, + "learning_rate": 9.933116796743294e-06, + "loss": 0.393, + "step": 524 + }, + { + "epoch": 0.4421673217293655, + "grad_norm": 0.2609098553657532, + "learning_rate": 9.932315219587641e-06, + "loss": 0.3874, + "step": 525 + }, + { + "epoch": 0.4430095451993262, + "grad_norm": 0.2597399055957794, + "learning_rate": 9.931508900388635e-06, + "loss": 0.3821, + "step": 526 + }, + { + "epoch": 0.44385176866928694, + "grad_norm": 0.27358579635620117, + "learning_rate": 9.930697839921496e-06, + "loss": 0.3947, + "step": 527 + }, + { + "epoch": 0.4446939921392476, + "grad_norm": 0.25108766555786133, + "learning_rate": 9.92988203896599e-06, + "loss": 0.3992, + "step": 528 + }, + { + "epoch": 0.4455362156092083, + "grad_norm": 0.2552182078361511, + "learning_rate": 9.929061498306448e-06, + "loss": 0.4085, + "step": 529 + }, + { + "epoch": 0.446378439079169, + "grad_norm": 0.24164170026779175, + "learning_rate": 9.92823621873176e-06, + "loss": 0.3816, + "step": 530 + }, + { + "epoch": 0.4472206625491297, + "grad_norm": 0.2571217119693756, + "learning_rate": 9.927406201035368e-06, + "loss": 0.3978, + "step": 531 + }, + { + "epoch": 0.4480628860190904, + "grad_norm": 0.2748754322528839, + "learning_rate": 9.926571446015271e-06, + "loss": 0.3985, + "step": 532 + }, + { + "epoch": 0.4489051094890511, + "grad_norm": 0.232895165681839, + "learning_rate": 9.92573195447402e-06, + "loss": 0.3957, + "step": 533 + }, + { + "epoch": 0.4497473329590118, + "grad_norm": 0.2499205768108368, + "learning_rate": 9.924887727218724e-06, + "loss": 0.4001, + "step": 534 + }, + { + "epoch": 0.45058955642897247, + "grad_norm": 0.25635528564453125, + "learning_rate": 9.924038765061042e-06, + "loss": 0.3903, + "step": 535 + }, + { + "epoch": 0.4514317798989332, + "grad_norm": 0.2474844753742218, + "learning_rate": 9.923185068817184e-06, + "loss": 0.4026, + "step": 536 + }, + { + "epoch": 0.4522740033688939, + "grad_norm": 0.24200493097305298, + "learning_rate": 9.922326639307918e-06, + "loss": 0.4116, + "step": 537 + }, + { + "epoch": 0.45311622683885455, + "grad_norm": 0.2580752968788147, + "learning_rate": 9.921463477358555e-06, + "loss": 0.4286, + "step": 538 + }, + { + "epoch": 0.45395845030881526, + "grad_norm": 0.24984806776046753, + "learning_rate": 9.920595583798959e-06, + "loss": 0.4263, + "step": 539 + }, + { + "epoch": 0.454800673778776, + "grad_norm": 0.2657654881477356, + "learning_rate": 9.919722959463545e-06, + "loss": 0.4097, + "step": 540 + }, + { + "epoch": 0.4556428972487367, + "grad_norm": 0.29026904702186584, + "learning_rate": 9.918845605191274e-06, + "loss": 0.4042, + "step": 541 + }, + { + "epoch": 0.45648512071869735, + "grad_norm": 0.2547171711921692, + "learning_rate": 9.917963521825653e-06, + "loss": 0.3989, + "step": 542 + }, + { + "epoch": 0.45732734418865806, + "grad_norm": 0.2613818347454071, + "learning_rate": 9.917076710214739e-06, + "loss": 0.3997, + "step": 543 + }, + { + "epoch": 0.45816956765861877, + "grad_norm": 0.2573147118091583, + "learning_rate": 9.916185171211135e-06, + "loss": 0.4151, + "step": 544 + }, + { + "epoch": 0.4590117911285794, + "grad_norm": 0.24024684727191925, + "learning_rate": 9.915288905671986e-06, + "loss": 0.3656, + "step": 545 + }, + { + "epoch": 0.45985401459854014, + "grad_norm": 0.2507457435131073, + "learning_rate": 9.914387914458983e-06, + "loss": 0.402, + "step": 546 + }, + { + "epoch": 0.46069623806850085, + "grad_norm": 0.25308138132095337, + "learning_rate": 9.913482198438357e-06, + "loss": 0.4187, + "step": 547 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 0.2678643465042114, + "learning_rate": 9.912571758480892e-06, + "loss": 0.4229, + "step": 548 + }, + { + "epoch": 0.4623806850084222, + "grad_norm": 0.252312570810318, + "learning_rate": 9.911656595461899e-06, + "loss": 0.4014, + "step": 549 + }, + { + "epoch": 0.46322290847838293, + "grad_norm": 0.24333544075489044, + "learning_rate": 9.910736710261238e-06, + "loss": 0.3984, + "step": 550 + }, + { + "epoch": 0.46406513194834365, + "grad_norm": 0.26091766357421875, + "learning_rate": 9.909812103763312e-06, + "loss": 0.3842, + "step": 551 + }, + { + "epoch": 0.4649073554183043, + "grad_norm": 0.25327688455581665, + "learning_rate": 9.908882776857057e-06, + "loss": 0.4054, + "step": 552 + }, + { + "epoch": 0.465749578888265, + "grad_norm": 0.30325254797935486, + "learning_rate": 9.90794873043595e-06, + "loss": 0.3966, + "step": 553 + }, + { + "epoch": 0.4665918023582257, + "grad_norm": 0.29707014560699463, + "learning_rate": 9.907009965398005e-06, + "loss": 0.3941, + "step": 554 + }, + { + "epoch": 0.46743402582818644, + "grad_norm": 0.27899348735809326, + "learning_rate": 9.906066482645774e-06, + "loss": 0.4064, + "step": 555 + }, + { + "epoch": 0.4682762492981471, + "grad_norm": 0.2798828184604645, + "learning_rate": 9.905118283086341e-06, + "loss": 0.397, + "step": 556 + }, + { + "epoch": 0.4691184727681078, + "grad_norm": 0.2678147554397583, + "learning_rate": 9.904165367631329e-06, + "loss": 0.3808, + "step": 557 + }, + { + "epoch": 0.4699606962380685, + "grad_norm": 0.25181877613067627, + "learning_rate": 9.903207737196892e-06, + "loss": 0.3951, + "step": 558 + }, + { + "epoch": 0.4708029197080292, + "grad_norm": 0.3140571117401123, + "learning_rate": 9.902245392703719e-06, + "loss": 0.3898, + "step": 559 + }, + { + "epoch": 0.4716451431779899, + "grad_norm": 0.24063082039356232, + "learning_rate": 9.901278335077031e-06, + "loss": 0.3981, + "step": 560 + }, + { + "epoch": 0.4724873666479506, + "grad_norm": 0.23210649192333221, + "learning_rate": 9.900306565246579e-06, + "loss": 0.3929, + "step": 561 + }, + { + "epoch": 0.47332959011791126, + "grad_norm": 0.25003889203071594, + "learning_rate": 9.899330084146646e-06, + "loss": 0.4097, + "step": 562 + }, + { + "epoch": 0.47417181358787197, + "grad_norm": 0.2555879056453705, + "learning_rate": 9.898348892716042e-06, + "loss": 0.3941, + "step": 563 + }, + { + "epoch": 0.4750140370578327, + "grad_norm": 0.2842501103878021, + "learning_rate": 9.89736299189811e-06, + "loss": 0.4004, + "step": 564 + }, + { + "epoch": 0.4758562605277934, + "grad_norm": 0.24519623816013336, + "learning_rate": 9.896372382640718e-06, + "loss": 0.3935, + "step": 565 + }, + { + "epoch": 0.47669848399775405, + "grad_norm": 0.2366226464509964, + "learning_rate": 9.895377065896259e-06, + "loss": 0.4067, + "step": 566 + }, + { + "epoch": 0.47754070746771476, + "grad_norm": 0.2646145820617676, + "learning_rate": 9.894377042621654e-06, + "loss": 0.4011, + "step": 567 + }, + { + "epoch": 0.4783829309376755, + "grad_norm": 0.2705758512020111, + "learning_rate": 9.89337231377835e-06, + "loss": 0.4131, + "step": 568 + }, + { + "epoch": 0.47922515440763613, + "grad_norm": 0.243195042014122, + "learning_rate": 9.892362880332316e-06, + "loss": 0.382, + "step": 569 + }, + { + "epoch": 0.48006737787759685, + "grad_norm": 0.25831979513168335, + "learning_rate": 9.891348743254046e-06, + "loss": 0.3843, + "step": 570 + }, + { + "epoch": 0.48090960134755756, + "grad_norm": 0.2449856847524643, + "learning_rate": 9.890329903518554e-06, + "loss": 0.4043, + "step": 571 + }, + { + "epoch": 0.48175182481751827, + "grad_norm": 0.25311365723609924, + "learning_rate": 9.889306362105377e-06, + "loss": 0.4008, + "step": 572 + }, + { + "epoch": 0.4825940482874789, + "grad_norm": 0.22990000247955322, + "learning_rate": 9.888278119998573e-06, + "loss": 0.4042, + "step": 573 + }, + { + "epoch": 0.48343627175743964, + "grad_norm": 0.23013539612293243, + "learning_rate": 9.887245178186715e-06, + "loss": 0.4142, + "step": 574 + }, + { + "epoch": 0.48427849522740035, + "grad_norm": 0.2604409158229828, + "learning_rate": 9.886207537662899e-06, + "loss": 0.4039, + "step": 575 + }, + { + "epoch": 0.485120718697361, + "grad_norm": 0.2513580918312073, + "learning_rate": 9.885165199424738e-06, + "loss": 0.385, + "step": 576 + }, + { + "epoch": 0.4859629421673217, + "grad_norm": 0.2692544460296631, + "learning_rate": 9.884118164474359e-06, + "loss": 0.4241, + "step": 577 + }, + { + "epoch": 0.48680516563728243, + "grad_norm": 0.2501692473888397, + "learning_rate": 9.883066433818404e-06, + "loss": 0.4002, + "step": 578 + }, + { + "epoch": 0.48764738910724315, + "grad_norm": 0.27484849095344543, + "learning_rate": 9.882010008468038e-06, + "loss": 0.378, + "step": 579 + }, + { + "epoch": 0.4884896125772038, + "grad_norm": 0.25672003626823425, + "learning_rate": 9.880948889438923e-06, + "loss": 0.4199, + "step": 580 + }, + { + "epoch": 0.4893318360471645, + "grad_norm": 0.26332417130470276, + "learning_rate": 9.879883077751255e-06, + "loss": 0.4101, + "step": 581 + }, + { + "epoch": 0.4901740595171252, + "grad_norm": 0.23965123295783997, + "learning_rate": 9.878812574429722e-06, + "loss": 0.4109, + "step": 582 + }, + { + "epoch": 0.4910162829870859, + "grad_norm": 0.272664338350296, + "learning_rate": 9.877737380503534e-06, + "loss": 0.3989, + "step": 583 + }, + { + "epoch": 0.4918585064570466, + "grad_norm": 0.23640109598636627, + "learning_rate": 9.876657497006408e-06, + "loss": 0.3999, + "step": 584 + }, + { + "epoch": 0.4927007299270073, + "grad_norm": 0.26969826221466064, + "learning_rate": 9.875572924976568e-06, + "loss": 0.3823, + "step": 585 + }, + { + "epoch": 0.493542953396968, + "grad_norm": 0.2869883179664612, + "learning_rate": 9.874483665456746e-06, + "loss": 0.4051, + "step": 586 + }, + { + "epoch": 0.4943851768669287, + "grad_norm": 0.2534290850162506, + "learning_rate": 9.873389719494186e-06, + "loss": 0.4012, + "step": 587 + }, + { + "epoch": 0.4952274003368894, + "grad_norm": 0.28421682119369507, + "learning_rate": 9.87229108814063e-06, + "loss": 0.3961, + "step": 588 + }, + { + "epoch": 0.4960696238068501, + "grad_norm": 0.25548046827316284, + "learning_rate": 9.871187772452327e-06, + "loss": 0.4219, + "step": 589 + }, + { + "epoch": 0.49691184727681076, + "grad_norm": 0.23864027857780457, + "learning_rate": 9.870079773490033e-06, + "loss": 0.4051, + "step": 590 + }, + { + "epoch": 0.49775407074677147, + "grad_norm": 0.27701038122177124, + "learning_rate": 9.868967092319003e-06, + "loss": 0.3948, + "step": 591 + }, + { + "epoch": 0.4985962942167322, + "grad_norm": 0.25791436433792114, + "learning_rate": 9.867849730008994e-06, + "loss": 0.4001, + "step": 592 + }, + { + "epoch": 0.4994385176866929, + "grad_norm": 0.22449485957622528, + "learning_rate": 9.866727687634266e-06, + "loss": 0.4034, + "step": 593 + }, + { + "epoch": 0.5002807411566536, + "grad_norm": 0.2631644308567047, + "learning_rate": 9.865600966273576e-06, + "loss": 0.3675, + "step": 594 + }, + { + "epoch": 0.5011229646266142, + "grad_norm": 0.28211626410484314, + "learning_rate": 9.86446956701018e-06, + "loss": 0.4139, + "step": 595 + }, + { + "epoch": 0.501965188096575, + "grad_norm": 0.22687473893165588, + "learning_rate": 9.86333349093183e-06, + "loss": 0.4066, + "step": 596 + }, + { + "epoch": 0.5028074115665356, + "grad_norm": 0.27827274799346924, + "learning_rate": 9.86219273913078e-06, + "loss": 0.3923, + "step": 597 + }, + { + "epoch": 0.5036496350364964, + "grad_norm": 0.24931105971336365, + "learning_rate": 9.861047312703772e-06, + "loss": 0.3816, + "step": 598 + }, + { + "epoch": 0.5044918585064571, + "grad_norm": 0.29707902669906616, + "learning_rate": 9.859897212752049e-06, + "loss": 0.4201, + "step": 599 + }, + { + "epoch": 0.5053340819764177, + "grad_norm": 0.26794612407684326, + "learning_rate": 9.858742440381343e-06, + "loss": 0.4042, + "step": 600 + }, + { + "epoch": 0.5061763054463785, + "grad_norm": 0.24728211760520935, + "learning_rate": 9.857582996701878e-06, + "loss": 0.4209, + "step": 601 + }, + { + "epoch": 0.5070185289163391, + "grad_norm": 0.2674620747566223, + "learning_rate": 9.856418882828368e-06, + "loss": 0.4173, + "step": 602 + }, + { + "epoch": 0.5078607523862998, + "grad_norm": 0.2867263853549957, + "learning_rate": 9.855250099880026e-06, + "loss": 0.3963, + "step": 603 + }, + { + "epoch": 0.5087029758562606, + "grad_norm": 0.262423574924469, + "learning_rate": 9.854076648980543e-06, + "loss": 0.4141, + "step": 604 + }, + { + "epoch": 0.5095451993262212, + "grad_norm": 0.2728235125541687, + "learning_rate": 9.852898531258102e-06, + "loss": 0.4097, + "step": 605 + }, + { + "epoch": 0.5103874227961819, + "grad_norm": 0.2570253908634186, + "learning_rate": 9.851715747845372e-06, + "loss": 0.4089, + "step": 606 + }, + { + "epoch": 0.5112296462661426, + "grad_norm": 0.2520364224910736, + "learning_rate": 9.850528299879513e-06, + "loss": 0.3943, + "step": 607 + }, + { + "epoch": 0.5120718697361033, + "grad_norm": 0.24955777823925018, + "learning_rate": 9.84933618850216e-06, + "loss": 0.393, + "step": 608 + }, + { + "epoch": 0.512914093206064, + "grad_norm": 0.25666603446006775, + "learning_rate": 9.848139414859441e-06, + "loss": 0.4066, + "step": 609 + }, + { + "epoch": 0.5137563166760247, + "grad_norm": 0.2686915099620819, + "learning_rate": 9.84693798010196e-06, + "loss": 0.4023, + "step": 610 + }, + { + "epoch": 0.5145985401459854, + "grad_norm": 0.24033565819263458, + "learning_rate": 9.845731885384806e-06, + "loss": 0.3966, + "step": 611 + }, + { + "epoch": 0.5154407636159462, + "grad_norm": 0.24050825834274292, + "learning_rate": 9.844521131867546e-06, + "loss": 0.386, + "step": 612 + }, + { + "epoch": 0.5162829870859068, + "grad_norm": 0.2602809965610504, + "learning_rate": 9.843305720714227e-06, + "loss": 0.3914, + "step": 613 + }, + { + "epoch": 0.5171252105558675, + "grad_norm": 0.2609127461910248, + "learning_rate": 9.842085653093372e-06, + "loss": 0.4117, + "step": 614 + }, + { + "epoch": 0.5179674340258282, + "grad_norm": 0.2795320749282837, + "learning_rate": 9.840860930177984e-06, + "loss": 0.4067, + "step": 615 + }, + { + "epoch": 0.5188096574957889, + "grad_norm": 0.25793221592903137, + "learning_rate": 9.83963155314554e-06, + "loss": 0.3827, + "step": 616 + }, + { + "epoch": 0.5196518809657495, + "grad_norm": 0.2973584830760956, + "learning_rate": 9.838397523177993e-06, + "loss": 0.4203, + "step": 617 + }, + { + "epoch": 0.5204941044357103, + "grad_norm": 0.2485421746969223, + "learning_rate": 9.837158841461767e-06, + "loss": 0.4058, + "step": 618 + }, + { + "epoch": 0.521336327905671, + "grad_norm": 0.28625357151031494, + "learning_rate": 9.835915509187759e-06, + "loss": 0.4052, + "step": 619 + }, + { + "epoch": 0.5221785513756316, + "grad_norm": 0.28024089336395264, + "learning_rate": 9.834667527551341e-06, + "loss": 0.3896, + "step": 620 + }, + { + "epoch": 0.5230207748455924, + "grad_norm": 0.2571643590927124, + "learning_rate": 9.833414897752346e-06, + "loss": 0.3892, + "step": 621 + }, + { + "epoch": 0.523862998315553, + "grad_norm": 0.2675620913505554, + "learning_rate": 9.832157620995088e-06, + "loss": 0.4076, + "step": 622 + }, + { + "epoch": 0.5247052217855137, + "grad_norm": 0.26316148042678833, + "learning_rate": 9.830895698488341e-06, + "loss": 0.3937, + "step": 623 + }, + { + "epoch": 0.5255474452554745, + "grad_norm": 0.3073652684688568, + "learning_rate": 9.829629131445342e-06, + "loss": 0.4122, + "step": 624 + }, + { + "epoch": 0.5263896687254351, + "grad_norm": 0.24735543131828308, + "learning_rate": 9.828357921083803e-06, + "loss": 0.3816, + "step": 625 + }, + { + "epoch": 0.5272318921953958, + "grad_norm": 0.3236400783061981, + "learning_rate": 9.827082068625893e-06, + "loss": 0.3938, + "step": 626 + }, + { + "epoch": 0.5280741156653566, + "grad_norm": 0.339495986700058, + "learning_rate": 9.825801575298248e-06, + "loss": 0.4254, + "step": 627 + }, + { + "epoch": 0.5289163391353172, + "grad_norm": 0.2840985655784607, + "learning_rate": 9.824516442331963e-06, + "loss": 0.423, + "step": 628 + }, + { + "epoch": 0.529758562605278, + "grad_norm": 0.3441496789455414, + "learning_rate": 9.823226670962598e-06, + "loss": 0.3984, + "step": 629 + }, + { + "epoch": 0.5306007860752386, + "grad_norm": 0.26104456186294556, + "learning_rate": 9.821932262430164e-06, + "loss": 0.4058, + "step": 630 + }, + { + "epoch": 0.5314430095451993, + "grad_norm": 0.2665572762489319, + "learning_rate": 9.82063321797914e-06, + "loss": 0.3975, + "step": 631 + }, + { + "epoch": 0.5322852330151601, + "grad_norm": 0.25185590982437134, + "learning_rate": 9.819329538858458e-06, + "loss": 0.4018, + "step": 632 + }, + { + "epoch": 0.5331274564851207, + "grad_norm": 0.25625938177108765, + "learning_rate": 9.818021226321502e-06, + "loss": 0.3823, + "step": 633 + }, + { + "epoch": 0.5339696799550814, + "grad_norm": 0.32790085673332214, + "learning_rate": 9.816708281626116e-06, + "loss": 0.3869, + "step": 634 + }, + { + "epoch": 0.5348119034250421, + "grad_norm": 0.25382882356643677, + "learning_rate": 9.815390706034598e-06, + "loss": 0.415, + "step": 635 + }, + { + "epoch": 0.5356541268950028, + "grad_norm": 0.2376747578382492, + "learning_rate": 9.814068500813692e-06, + "loss": 0.4161, + "step": 636 + }, + { + "epoch": 0.5364963503649635, + "grad_norm": 0.25517722964286804, + "learning_rate": 9.812741667234599e-06, + "loss": 0.3881, + "step": 637 + }, + { + "epoch": 0.5373385738349242, + "grad_norm": 0.24864782392978668, + "learning_rate": 9.811410206572972e-06, + "loss": 0.3703, + "step": 638 + }, + { + "epoch": 0.5381807973048849, + "grad_norm": 0.2584892809391022, + "learning_rate": 9.8100741201089e-06, + "loss": 0.4057, + "step": 639 + }, + { + "epoch": 0.5390230207748455, + "grad_norm": 0.2750840187072754, + "learning_rate": 9.808733409126934e-06, + "loss": 0.4113, + "step": 640 + }, + { + "epoch": 0.5398652442448063, + "grad_norm": 0.24435681104660034, + "learning_rate": 9.807388074916064e-06, + "loss": 0.3987, + "step": 641 + }, + { + "epoch": 0.540707467714767, + "grad_norm": 0.2840283513069153, + "learning_rate": 9.806038118769724e-06, + "loss": 0.3964, + "step": 642 + }, + { + "epoch": 0.5415496911847277, + "grad_norm": 0.2739749848842621, + "learning_rate": 9.804683541985796e-06, + "loss": 0.4262, + "step": 643 + }, + { + "epoch": 0.5423919146546884, + "grad_norm": 0.25117605924606323, + "learning_rate": 9.803324345866599e-06, + "loss": 0.3956, + "step": 644 + }, + { + "epoch": 0.543234138124649, + "grad_norm": 0.26271775364875793, + "learning_rate": 9.801960531718898e-06, + "loss": 0.4025, + "step": 645 + }, + { + "epoch": 0.5440763615946098, + "grad_norm": 0.25807446241378784, + "learning_rate": 9.800592100853894e-06, + "loss": 0.4004, + "step": 646 + }, + { + "epoch": 0.5449185850645705, + "grad_norm": 0.25606074929237366, + "learning_rate": 9.79921905458723e-06, + "loss": 0.3868, + "step": 647 + }, + { + "epoch": 0.5457608085345311, + "grad_norm": 0.27761006355285645, + "learning_rate": 9.797841394238987e-06, + "loss": 0.4067, + "step": 648 + }, + { + "epoch": 0.5466030320044919, + "grad_norm": 0.2372005730867386, + "learning_rate": 9.796459121133675e-06, + "loss": 0.4075, + "step": 649 + }, + { + "epoch": 0.5474452554744526, + "grad_norm": 0.2324095219373703, + "learning_rate": 9.795072236600247e-06, + "loss": 0.3867, + "step": 650 + }, + { + "epoch": 0.5482874789444132, + "grad_norm": 0.2631465792655945, + "learning_rate": 9.793680741972084e-06, + "loss": 0.402, + "step": 651 + }, + { + "epoch": 0.549129702414374, + "grad_norm": 0.2458457201719284, + "learning_rate": 9.792284638587005e-06, + "loss": 0.3924, + "step": 652 + }, + { + "epoch": 0.5499719258843346, + "grad_norm": 0.2652607858181, + "learning_rate": 9.790883927787254e-06, + "loss": 0.3838, + "step": 653 + }, + { + "epoch": 0.5508141493542953, + "grad_norm": 0.2583690881729126, + "learning_rate": 9.789478610919508e-06, + "loss": 0.3694, + "step": 654 + }, + { + "epoch": 0.5516563728242561, + "grad_norm": 0.3279140591621399, + "learning_rate": 9.78806868933487e-06, + "loss": 0.398, + "step": 655 + }, + { + "epoch": 0.5524985962942167, + "grad_norm": 0.22404353320598602, + "learning_rate": 9.786654164388873e-06, + "loss": 0.3939, + "step": 656 + }, + { + "epoch": 0.5533408197641775, + "grad_norm": 0.2810705602169037, + "learning_rate": 9.785235037441473e-06, + "loss": 0.3991, + "step": 657 + }, + { + "epoch": 0.5541830432341381, + "grad_norm": 0.26564010977745056, + "learning_rate": 9.783811309857057e-06, + "loss": 0.4042, + "step": 658 + }, + { + "epoch": 0.5550252667040988, + "grad_norm": 0.2543601393699646, + "learning_rate": 9.782382983004424e-06, + "loss": 0.3967, + "step": 659 + }, + { + "epoch": 0.5558674901740596, + "grad_norm": 0.27026471495628357, + "learning_rate": 9.780950058256802e-06, + "loss": 0.4063, + "step": 660 + }, + { + "epoch": 0.5567097136440202, + "grad_norm": 0.2524589002132416, + "learning_rate": 9.779512536991839e-06, + "loss": 0.4245, + "step": 661 + }, + { + "epoch": 0.5575519371139809, + "grad_norm": 0.2756974697113037, + "learning_rate": 9.778070420591603e-06, + "loss": 0.4059, + "step": 662 + }, + { + "epoch": 0.5583941605839416, + "grad_norm": 0.23817111551761627, + "learning_rate": 9.77662371044258e-06, + "loss": 0.3775, + "step": 663 + }, + { + "epoch": 0.5592363840539023, + "grad_norm": 0.27896976470947266, + "learning_rate": 9.775172407935664e-06, + "loss": 0.4178, + "step": 664 + }, + { + "epoch": 0.560078607523863, + "grad_norm": 0.282566636800766, + "learning_rate": 9.773716514466179e-06, + "loss": 0.4463, + "step": 665 + }, + { + "epoch": 0.5609208309938237, + "grad_norm": 0.2578970491886139, + "learning_rate": 9.77225603143385e-06, + "loss": 0.3849, + "step": 666 + }, + { + "epoch": 0.5617630544637844, + "grad_norm": 0.2897636592388153, + "learning_rate": 9.770790960242821e-06, + "loss": 0.4002, + "step": 667 + }, + { + "epoch": 0.562605277933745, + "grad_norm": 0.23036324977874756, + "learning_rate": 9.769321302301648e-06, + "loss": 0.3838, + "step": 668 + }, + { + "epoch": 0.5634475014037058, + "grad_norm": 0.2721245586872101, + "learning_rate": 9.767847059023292e-06, + "loss": 0.3993, + "step": 669 + }, + { + "epoch": 0.5642897248736665, + "grad_norm": 0.266300767660141, + "learning_rate": 9.766368231825126e-06, + "loss": 0.4184, + "step": 670 + }, + { + "epoch": 0.5651319483436271, + "grad_norm": 0.3153010606765747, + "learning_rate": 9.764884822128928e-06, + "loss": 0.3748, + "step": 671 + }, + { + "epoch": 0.5659741718135879, + "grad_norm": 0.2557030916213989, + "learning_rate": 9.763396831360884e-06, + "loss": 0.3771, + "step": 672 + }, + { + "epoch": 0.5668163952835485, + "grad_norm": 0.2733110189437866, + "learning_rate": 9.761904260951583e-06, + "loss": 0.3881, + "step": 673 + }, + { + "epoch": 0.5676586187535093, + "grad_norm": 0.2519077658653259, + "learning_rate": 9.760407112336016e-06, + "loss": 0.3807, + "step": 674 + }, + { + "epoch": 0.56850084222347, + "grad_norm": 0.2971459627151489, + "learning_rate": 9.75890538695358e-06, + "loss": 0.4062, + "step": 675 + }, + { + "epoch": 0.5693430656934306, + "grad_norm": 0.27987900376319885, + "learning_rate": 9.757399086248062e-06, + "loss": 0.3901, + "step": 676 + }, + { + "epoch": 0.5701852891633914, + "grad_norm": 0.24134734272956848, + "learning_rate": 9.755888211667663e-06, + "loss": 0.3777, + "step": 677 + }, + { + "epoch": 0.571027512633352, + "grad_norm": 0.2651788592338562, + "learning_rate": 9.75437276466497e-06, + "loss": 0.4159, + "step": 678 + }, + { + "epoch": 0.5718697361033127, + "grad_norm": 0.22584442794322968, + "learning_rate": 9.752852746696968e-06, + "loss": 0.3922, + "step": 679 + }, + { + "epoch": 0.5727119595732735, + "grad_norm": 0.2702990770339966, + "learning_rate": 9.751328159225037e-06, + "loss": 0.3937, + "step": 680 + }, + { + "epoch": 0.5735541830432341, + "grad_norm": 0.27012670040130615, + "learning_rate": 9.749799003714954e-06, + "loss": 0.4118, + "step": 681 + }, + { + "epoch": 0.5743964065131948, + "grad_norm": 0.24326875805854797, + "learning_rate": 9.748265281636885e-06, + "loss": 0.4085, + "step": 682 + }, + { + "epoch": 0.5752386299831556, + "grad_norm": 0.257748544216156, + "learning_rate": 9.746726994465383e-06, + "loss": 0.3849, + "step": 683 + }, + { + "epoch": 0.5760808534531162, + "grad_norm": 0.2539883255958557, + "learning_rate": 9.745184143679398e-06, + "loss": 0.3977, + "step": 684 + }, + { + "epoch": 0.5769230769230769, + "grad_norm": 0.21800824999809265, + "learning_rate": 9.743636730762259e-06, + "loss": 0.4062, + "step": 685 + }, + { + "epoch": 0.5777653003930376, + "grad_norm": 0.2953537106513977, + "learning_rate": 9.742084757201684e-06, + "loss": 0.4001, + "step": 686 + }, + { + "epoch": 0.5786075238629983, + "grad_norm": 0.2531120777130127, + "learning_rate": 9.74052822448978e-06, + "loss": 0.3975, + "step": 687 + }, + { + "epoch": 0.5794497473329591, + "grad_norm": 0.25123146176338196, + "learning_rate": 9.738967134123035e-06, + "loss": 0.3841, + "step": 688 + }, + { + "epoch": 0.5802919708029197, + "grad_norm": 0.3006608188152313, + "learning_rate": 9.737401487602314e-06, + "loss": 0.3995, + "step": 689 + }, + { + "epoch": 0.5811341942728804, + "grad_norm": 0.2671349346637726, + "learning_rate": 9.735831286432869e-06, + "loss": 0.4051, + "step": 690 + }, + { + "epoch": 0.5819764177428411, + "grad_norm": 0.2805735170841217, + "learning_rate": 9.734256532124326e-06, + "loss": 0.3813, + "step": 691 + }, + { + "epoch": 0.5828186412128018, + "grad_norm": 0.25094088912010193, + "learning_rate": 9.732677226190692e-06, + "loss": 0.3693, + "step": 692 + }, + { + "epoch": 0.5836608646827625, + "grad_norm": 0.2721113860607147, + "learning_rate": 9.731093370150349e-06, + "loss": 0.4002, + "step": 693 + }, + { + "epoch": 0.5845030881527232, + "grad_norm": 0.24327607452869415, + "learning_rate": 9.729504965526053e-06, + "loss": 0.3991, + "step": 694 + }, + { + "epoch": 0.5853453116226839, + "grad_norm": 0.25750425457954407, + "learning_rate": 9.727912013844933e-06, + "loss": 0.4134, + "step": 695 + }, + { + "epoch": 0.5861875350926445, + "grad_norm": 0.25728413462638855, + "learning_rate": 9.72631451663849e-06, + "loss": 0.3895, + "step": 696 + }, + { + "epoch": 0.5870297585626053, + "grad_norm": 0.3020685613155365, + "learning_rate": 9.724712475442597e-06, + "loss": 0.3877, + "step": 697 + }, + { + "epoch": 0.587871982032566, + "grad_norm": 0.23866966366767883, + "learning_rate": 9.72310589179749e-06, + "loss": 0.4152, + "step": 698 + }, + { + "epoch": 0.5887142055025266, + "grad_norm": 0.28489524126052856, + "learning_rate": 9.721494767247779e-06, + "loss": 0.39, + "step": 699 + }, + { + "epoch": 0.5895564289724874, + "grad_norm": 0.27395954728126526, + "learning_rate": 9.719879103342438e-06, + "loss": 0.3756, + "step": 700 + }, + { + "epoch": 0.590398652442448, + "grad_norm": 0.28621479868888855, + "learning_rate": 9.718258901634802e-06, + "loss": 0.3948, + "step": 701 + }, + { + "epoch": 0.5912408759124088, + "grad_norm": 0.23523105680942535, + "learning_rate": 9.71663416368257e-06, + "loss": 0.385, + "step": 702 + }, + { + "epoch": 0.5920830993823695, + "grad_norm": 0.2857535481452942, + "learning_rate": 9.715004891047805e-06, + "loss": 0.3767, + "step": 703 + }, + { + "epoch": 0.5929253228523301, + "grad_norm": 0.26084885001182556, + "learning_rate": 9.71337108529693e-06, + "loss": 0.3769, + "step": 704 + }, + { + "epoch": 0.5937675463222909, + "grad_norm": 0.276625394821167, + "learning_rate": 9.71173274800072e-06, + "loss": 0.3924, + "step": 705 + }, + { + "epoch": 0.5946097697922516, + "grad_norm": 0.239909365773201, + "learning_rate": 9.71008988073431e-06, + "loss": 0.3989, + "step": 706 + }, + { + "epoch": 0.5954519932622122, + "grad_norm": 0.2926711142063141, + "learning_rate": 9.708442485077197e-06, + "loss": 0.3969, + "step": 707 + }, + { + "epoch": 0.596294216732173, + "grad_norm": 0.2879006266593933, + "learning_rate": 9.70679056261322e-06, + "loss": 0.3816, + "step": 708 + }, + { + "epoch": 0.5971364402021336, + "grad_norm": 0.24086812138557434, + "learning_rate": 9.70513411493058e-06, + "loss": 0.3813, + "step": 709 + }, + { + "epoch": 0.5979786636720943, + "grad_norm": 0.28989389538764954, + "learning_rate": 9.70347314362182e-06, + "loss": 0.3992, + "step": 710 + }, + { + "epoch": 0.5988208871420551, + "grad_norm": 0.25863921642303467, + "learning_rate": 9.70180765028384e-06, + "loss": 0.4085, + "step": 711 + }, + { + "epoch": 0.5996631106120157, + "grad_norm": 0.2894207239151001, + "learning_rate": 9.700137636517884e-06, + "loss": 0.4016, + "step": 712 + }, + { + "epoch": 0.6005053340819764, + "grad_norm": 0.25992056727409363, + "learning_rate": 9.698463103929542e-06, + "loss": 0.3675, + "step": 713 + }, + { + "epoch": 0.6013475575519371, + "grad_norm": 0.28006356954574585, + "learning_rate": 9.696784054128749e-06, + "loss": 0.393, + "step": 714 + }, + { + "epoch": 0.6021897810218978, + "grad_norm": 0.2698265612125397, + "learning_rate": 9.695100488729784e-06, + "loss": 0.4124, + "step": 715 + }, + { + "epoch": 0.6030320044918585, + "grad_norm": 0.24214014410972595, + "learning_rate": 9.693412409351264e-06, + "loss": 0.4018, + "step": 716 + }, + { + "epoch": 0.6038742279618192, + "grad_norm": 0.2883848547935486, + "learning_rate": 9.691719817616148e-06, + "loss": 0.4001, + "step": 717 + }, + { + "epoch": 0.6047164514317799, + "grad_norm": 0.27353766560554504, + "learning_rate": 9.690022715151734e-06, + "loss": 0.408, + "step": 718 + }, + { + "epoch": 0.6055586749017406, + "grad_norm": 0.253030389547348, + "learning_rate": 9.688321103589659e-06, + "loss": 0.3887, + "step": 719 + }, + { + "epoch": 0.6064008983717013, + "grad_norm": 0.255454957485199, + "learning_rate": 9.686614984565888e-06, + "loss": 0.3715, + "step": 720 + }, + { + "epoch": 0.607243121841662, + "grad_norm": 0.25838613510131836, + "learning_rate": 9.684904359720724e-06, + "loss": 0.4027, + "step": 721 + }, + { + "epoch": 0.6080853453116227, + "grad_norm": 0.23131829500198364, + "learning_rate": 9.683189230698804e-06, + "loss": 0.4158, + "step": 722 + }, + { + "epoch": 0.6089275687815834, + "grad_norm": 0.26560577750205994, + "learning_rate": 9.681469599149093e-06, + "loss": 0.3927, + "step": 723 + }, + { + "epoch": 0.609769792251544, + "grad_norm": 0.2801228165626526, + "learning_rate": 9.679745466724884e-06, + "loss": 0.3835, + "step": 724 + }, + { + "epoch": 0.6106120157215048, + "grad_norm": 0.2449226677417755, + "learning_rate": 9.678016835083798e-06, + "loss": 0.4011, + "step": 725 + }, + { + "epoch": 0.6114542391914655, + "grad_norm": 0.2523360252380371, + "learning_rate": 9.676283705887783e-06, + "loss": 0.3691, + "step": 726 + }, + { + "epoch": 0.6122964626614261, + "grad_norm": 0.2678155303001404, + "learning_rate": 9.674546080803109e-06, + "loss": 0.3874, + "step": 727 + }, + { + "epoch": 0.6131386861313869, + "grad_norm": 0.2483142614364624, + "learning_rate": 9.67280396150037e-06, + "loss": 0.4032, + "step": 728 + }, + { + "epoch": 0.6139809096013475, + "grad_norm": 0.29895925521850586, + "learning_rate": 9.671057349654481e-06, + "loss": 0.3938, + "step": 729 + }, + { + "epoch": 0.6148231330713082, + "grad_norm": 0.222129225730896, + "learning_rate": 9.669306246944674e-06, + "loss": 0.3928, + "step": 730 + }, + { + "epoch": 0.615665356541269, + "grad_norm": 0.29450759291648865, + "learning_rate": 9.6675506550545e-06, + "loss": 0.4152, + "step": 731 + }, + { + "epoch": 0.6165075800112296, + "grad_norm": 0.2805439829826355, + "learning_rate": 9.66579057567183e-06, + "loss": 0.4006, + "step": 732 + }, + { + "epoch": 0.6173498034811904, + "grad_norm": 0.234604150056839, + "learning_rate": 9.66402601048884e-06, + "loss": 0.3831, + "step": 733 + }, + { + "epoch": 0.618192026951151, + "grad_norm": 0.31237849593162537, + "learning_rate": 9.662256961202028e-06, + "loss": 0.3922, + "step": 734 + }, + { + "epoch": 0.6190342504211117, + "grad_norm": 0.2697100043296814, + "learning_rate": 9.660483429512198e-06, + "loss": 0.392, + "step": 735 + }, + { + "epoch": 0.6198764738910725, + "grad_norm": 0.31797346472740173, + "learning_rate": 9.658705417124466e-06, + "loss": 0.3838, + "step": 736 + }, + { + "epoch": 0.6207186973610331, + "grad_norm": 0.275126188993454, + "learning_rate": 9.656922925748254e-06, + "loss": 0.3969, + "step": 737 + }, + { + "epoch": 0.6215609208309938, + "grad_norm": 0.2596983015537262, + "learning_rate": 9.65513595709729e-06, + "loss": 0.3882, + "step": 738 + }, + { + "epoch": 0.6224031443009546, + "grad_norm": 0.2853798568248749, + "learning_rate": 9.653344512889608e-06, + "loss": 0.4083, + "step": 739 + }, + { + "epoch": 0.6232453677709152, + "grad_norm": 0.23947489261627197, + "learning_rate": 9.651548594847546e-06, + "loss": 0.3982, + "step": 740 + }, + { + "epoch": 0.6240875912408759, + "grad_norm": 0.26423704624176025, + "learning_rate": 9.649748204697741e-06, + "loss": 0.3913, + "step": 741 + }, + { + "epoch": 0.6249298147108366, + "grad_norm": 0.2359628826379776, + "learning_rate": 9.647943344171129e-06, + "loss": 0.3866, + "step": 742 + }, + { + "epoch": 0.6257720381807973, + "grad_norm": 0.2493540644645691, + "learning_rate": 9.646134015002946e-06, + "loss": 0.3946, + "step": 743 + }, + { + "epoch": 0.626614261650758, + "grad_norm": 0.2681938111782074, + "learning_rate": 9.644320218932723e-06, + "loss": 0.4084, + "step": 744 + }, + { + "epoch": 0.6274564851207187, + "grad_norm": 0.2515985667705536, + "learning_rate": 9.642501957704287e-06, + "loss": 0.3729, + "step": 745 + }, + { + "epoch": 0.6282987085906794, + "grad_norm": 0.25220364332199097, + "learning_rate": 9.640679233065755e-06, + "loss": 0.3944, + "step": 746 + }, + { + "epoch": 0.62914093206064, + "grad_norm": 0.2675838768482208, + "learning_rate": 9.63885204676954e-06, + "loss": 0.3733, + "step": 747 + }, + { + "epoch": 0.6299831555306008, + "grad_norm": 0.2608799636363983, + "learning_rate": 9.637020400572339e-06, + "loss": 0.4032, + "step": 748 + }, + { + "epoch": 0.6308253790005615, + "grad_norm": 0.23328998684883118, + "learning_rate": 9.63518429623514e-06, + "loss": 0.4149, + "step": 749 + }, + { + "epoch": 0.6316676024705222, + "grad_norm": 0.23251040279865265, + "learning_rate": 9.63334373552322e-06, + "loss": 0.3944, + "step": 750 + }, + { + "epoch": 0.6325098259404829, + "grad_norm": 0.2581608295440674, + "learning_rate": 9.631498720206132e-06, + "loss": 0.4023, + "step": 751 + }, + { + "epoch": 0.6333520494104435, + "grad_norm": 0.2426811009645462, + "learning_rate": 9.62964925205772e-06, + "loss": 0.3795, + "step": 752 + }, + { + "epoch": 0.6341942728804043, + "grad_norm": 0.25329363346099854, + "learning_rate": 9.627795332856107e-06, + "loss": 0.3948, + "step": 753 + }, + { + "epoch": 0.635036496350365, + "grad_norm": 0.25250154733657837, + "learning_rate": 9.625936964383691e-06, + "loss": 0.3767, + "step": 754 + }, + { + "epoch": 0.6358787198203256, + "grad_norm": 0.2705284357070923, + "learning_rate": 9.624074148427154e-06, + "loss": 0.3975, + "step": 755 + }, + { + "epoch": 0.6367209432902864, + "grad_norm": 0.24624527990818024, + "learning_rate": 9.622206886777448e-06, + "loss": 0.3792, + "step": 756 + }, + { + "epoch": 0.637563166760247, + "grad_norm": 0.2937239706516266, + "learning_rate": 9.620335181229805e-06, + "loss": 0.397, + "step": 757 + }, + { + "epoch": 0.6384053902302077, + "grad_norm": 0.24023059010505676, + "learning_rate": 9.618459033583725e-06, + "loss": 0.4001, + "step": 758 + }, + { + "epoch": 0.6392476137001685, + "grad_norm": 0.278189092874527, + "learning_rate": 9.616578445642982e-06, + "loss": 0.3803, + "step": 759 + }, + { + "epoch": 0.6400898371701291, + "grad_norm": 0.31741783022880554, + "learning_rate": 9.614693419215613e-06, + "loss": 0.3926, + "step": 760 + }, + { + "epoch": 0.6409320606400898, + "grad_norm": 0.26135605573654175, + "learning_rate": 9.612803956113932e-06, + "loss": 0.3572, + "step": 761 + }, + { + "epoch": 0.6417742841100506, + "grad_norm": 0.2977483868598938, + "learning_rate": 9.61091005815451e-06, + "loss": 0.3933, + "step": 762 + }, + { + "epoch": 0.6426165075800112, + "grad_norm": 0.29880109429359436, + "learning_rate": 9.609011727158184e-06, + "loss": 0.4027, + "step": 763 + }, + { + "epoch": 0.643458731049972, + "grad_norm": 0.23161263763904572, + "learning_rate": 9.607108964950056e-06, + "loss": 0.3636, + "step": 764 + }, + { + "epoch": 0.6443009545199326, + "grad_norm": 0.24532750248908997, + "learning_rate": 9.605201773359485e-06, + "loss": 0.3887, + "step": 765 + }, + { + "epoch": 0.6451431779898933, + "grad_norm": 0.2845829725265503, + "learning_rate": 9.603290154220091e-06, + "loss": 0.3963, + "step": 766 + }, + { + "epoch": 0.6459854014598541, + "grad_norm": 0.24565917253494263, + "learning_rate": 9.601374109369746e-06, + "loss": 0.3946, + "step": 767 + }, + { + "epoch": 0.6468276249298147, + "grad_norm": 0.24358156323432922, + "learning_rate": 9.599453640650585e-06, + "loss": 0.3983, + "step": 768 + }, + { + "epoch": 0.6476698483997754, + "grad_norm": 0.2880096137523651, + "learning_rate": 9.59752874990899e-06, + "loss": 0.3927, + "step": 769 + }, + { + "epoch": 0.6485120718697361, + "grad_norm": 0.25155845284461975, + "learning_rate": 9.595599438995593e-06, + "loss": 0.4066, + "step": 770 + }, + { + "epoch": 0.6493542953396968, + "grad_norm": 0.27330100536346436, + "learning_rate": 9.59366570976528e-06, + "loss": 0.4022, + "step": 771 + }, + { + "epoch": 0.6501965188096575, + "grad_norm": 0.2611733078956604, + "learning_rate": 9.591727564077189e-06, + "loss": 0.3915, + "step": 772 + }, + { + "epoch": 0.6510387422796182, + "grad_norm": 0.2380668818950653, + "learning_rate": 9.589785003794692e-06, + "loss": 0.4014, + "step": 773 + }, + { + "epoch": 0.6518809657495789, + "grad_norm": 0.2736964821815491, + "learning_rate": 9.587838030785413e-06, + "loss": 0.394, + "step": 774 + }, + { + "epoch": 0.6527231892195395, + "grad_norm": 0.23338758945465088, + "learning_rate": 9.585886646921221e-06, + "loss": 0.3826, + "step": 775 + }, + { + "epoch": 0.6535654126895003, + "grad_norm": 0.2619832754135132, + "learning_rate": 9.583930854078219e-06, + "loss": 0.4194, + "step": 776 + }, + { + "epoch": 0.654407636159461, + "grad_norm": 0.2791779339313507, + "learning_rate": 9.581970654136752e-06, + "loss": 0.3891, + "step": 777 + }, + { + "epoch": 0.6552498596294217, + "grad_norm": 0.2457047700881958, + "learning_rate": 9.580006048981403e-06, + "loss": 0.3794, + "step": 778 + }, + { + "epoch": 0.6560920830993824, + "grad_norm": 0.24340137839317322, + "learning_rate": 9.578037040500992e-06, + "loss": 0.396, + "step": 779 + }, + { + "epoch": 0.656934306569343, + "grad_norm": 0.3096691071987152, + "learning_rate": 9.576063630588563e-06, + "loss": 0.3789, + "step": 780 + }, + { + "epoch": 0.6577765300393038, + "grad_norm": 0.23291133344173431, + "learning_rate": 9.574085821141406e-06, + "loss": 0.393, + "step": 781 + }, + { + "epoch": 0.6586187535092645, + "grad_norm": 0.2542714476585388, + "learning_rate": 9.572103614061029e-06, + "loss": 0.385, + "step": 782 + }, + { + "epoch": 0.6594609769792251, + "grad_norm": 0.26308146119117737, + "learning_rate": 9.570117011253173e-06, + "loss": 0.4036, + "step": 783 + }, + { + "epoch": 0.6603032004491859, + "grad_norm": 0.2650177776813507, + "learning_rate": 9.568126014627805e-06, + "loss": 0.4113, + "step": 784 + }, + { + "epoch": 0.6611454239191465, + "grad_norm": 0.25345557928085327, + "learning_rate": 9.566130626099118e-06, + "loss": 0.3891, + "step": 785 + }, + { + "epoch": 0.6619876473891072, + "grad_norm": 0.2507096529006958, + "learning_rate": 9.56413084758552e-06, + "loss": 0.3891, + "step": 786 + }, + { + "epoch": 0.662829870859068, + "grad_norm": 0.24637813866138458, + "learning_rate": 9.562126681009649e-06, + "loss": 0.3737, + "step": 787 + }, + { + "epoch": 0.6636720943290286, + "grad_norm": 0.2659986615180969, + "learning_rate": 9.560118128298355e-06, + "loss": 0.4135, + "step": 788 + }, + { + "epoch": 0.6645143177989893, + "grad_norm": 0.2727932035923004, + "learning_rate": 9.55810519138271e-06, + "loss": 0.3974, + "step": 789 + }, + { + "epoch": 0.66535654126895, + "grad_norm": 0.26761728525161743, + "learning_rate": 9.556087872197997e-06, + "loss": 0.3718, + "step": 790 + }, + { + "epoch": 0.6661987647389107, + "grad_norm": 0.28363317251205444, + "learning_rate": 9.554066172683715e-06, + "loss": 0.4138, + "step": 791 + }, + { + "epoch": 0.6670409882088714, + "grad_norm": 0.2563317120075226, + "learning_rate": 9.552040094783575e-06, + "loss": 0.4085, + "step": 792 + }, + { + "epoch": 0.6678832116788321, + "grad_norm": 0.29577916860580444, + "learning_rate": 9.550009640445492e-06, + "loss": 0.3808, + "step": 793 + }, + { + "epoch": 0.6687254351487928, + "grad_norm": 0.30510273575782776, + "learning_rate": 9.547974811621594e-06, + "loss": 0.3998, + "step": 794 + }, + { + "epoch": 0.6695676586187536, + "grad_norm": 0.2771177887916565, + "learning_rate": 9.545935610268213e-06, + "loss": 0.3904, + "step": 795 + }, + { + "epoch": 0.6704098820887142, + "grad_norm": 0.23797883093357086, + "learning_rate": 9.543892038345885e-06, + "loss": 0.3888, + "step": 796 + }, + { + "epoch": 0.6712521055586749, + "grad_norm": 0.276486873626709, + "learning_rate": 9.541844097819347e-06, + "loss": 0.3909, + "step": 797 + }, + { + "epoch": 0.6720943290286356, + "grad_norm": 0.25211796164512634, + "learning_rate": 9.53979179065754e-06, + "loss": 0.4019, + "step": 798 + }, + { + "epoch": 0.6729365524985963, + "grad_norm": 0.2426571249961853, + "learning_rate": 9.537735118833595e-06, + "loss": 0.3907, + "step": 799 + }, + { + "epoch": 0.673778775968557, + "grad_norm": 0.23780083656311035, + "learning_rate": 9.53567408432485e-06, + "loss": 0.3757, + "step": 800 + }, + { + "epoch": 0.6746209994385177, + "grad_norm": 0.3084465265274048, + "learning_rate": 9.533608689112827e-06, + "loss": 0.3994, + "step": 801 + }, + { + "epoch": 0.6754632229084784, + "grad_norm": 0.2585592567920685, + "learning_rate": 9.531538935183252e-06, + "loss": 0.4141, + "step": 802 + }, + { + "epoch": 0.676305446378439, + "grad_norm": 0.23807695508003235, + "learning_rate": 9.529464824526027e-06, + "loss": 0.4032, + "step": 803 + }, + { + "epoch": 0.6771476698483998, + "grad_norm": 0.2669808864593506, + "learning_rate": 9.527386359135254e-06, + "loss": 0.4069, + "step": 804 + }, + { + "epoch": 0.6779898933183605, + "grad_norm": 0.24769054353237152, + "learning_rate": 9.525303541009218e-06, + "loss": 0.3805, + "step": 805 + }, + { + "epoch": 0.6788321167883211, + "grad_norm": 0.25517210364341736, + "learning_rate": 9.523216372150393e-06, + "loss": 0.4199, + "step": 806 + }, + { + "epoch": 0.6796743402582819, + "grad_norm": 0.22805273532867432, + "learning_rate": 9.521124854565425e-06, + "loss": 0.4066, + "step": 807 + }, + { + "epoch": 0.6805165637282425, + "grad_norm": 0.26646801829338074, + "learning_rate": 9.519028990265153e-06, + "loss": 0.4145, + "step": 808 + }, + { + "epoch": 0.6813587871982033, + "grad_norm": 0.22494232654571533, + "learning_rate": 9.516928781264588e-06, + "loss": 0.3941, + "step": 809 + }, + { + "epoch": 0.682201010668164, + "grad_norm": 0.2086746245622635, + "learning_rate": 9.514824229582922e-06, + "loss": 0.395, + "step": 810 + }, + { + "epoch": 0.6830432341381246, + "grad_norm": 0.22179856896400452, + "learning_rate": 9.512715337243517e-06, + "loss": 0.3974, + "step": 811 + }, + { + "epoch": 0.6838854576080854, + "grad_norm": 0.23663990199565887, + "learning_rate": 9.510602106273914e-06, + "loss": 0.3894, + "step": 812 + }, + { + "epoch": 0.684727681078046, + "grad_norm": 0.23997731506824493, + "learning_rate": 9.508484538705823e-06, + "loss": 0.391, + "step": 813 + }, + { + "epoch": 0.6855699045480067, + "grad_norm": 0.2230270504951477, + "learning_rate": 9.506362636575122e-06, + "loss": 0.4036, + "step": 814 + }, + { + "epoch": 0.6864121280179675, + "grad_norm": 0.23250775039196014, + "learning_rate": 9.504236401921856e-06, + "loss": 0.4077, + "step": 815 + }, + { + "epoch": 0.6872543514879281, + "grad_norm": 0.2370990365743637, + "learning_rate": 9.50210583679024e-06, + "loss": 0.4115, + "step": 816 + }, + { + "epoch": 0.6880965749578888, + "grad_norm": 0.25787898898124695, + "learning_rate": 9.499970943228646e-06, + "loss": 0.3852, + "step": 817 + }, + { + "epoch": 0.6889387984278496, + "grad_norm": 0.24739859998226166, + "learning_rate": 9.497831723289615e-06, + "loss": 0.3975, + "step": 818 + }, + { + "epoch": 0.6897810218978102, + "grad_norm": 0.22881582379341125, + "learning_rate": 9.495688179029838e-06, + "loss": 0.3766, + "step": 819 + }, + { + "epoch": 0.6906232453677709, + "grad_norm": 0.25048691034317017, + "learning_rate": 9.493540312510173e-06, + "loss": 0.4028, + "step": 820 + }, + { + "epoch": 0.6914654688377316, + "grad_norm": 0.22022280097007751, + "learning_rate": 9.491388125795623e-06, + "loss": 0.4066, + "step": 821 + }, + { + "epoch": 0.6923076923076923, + "grad_norm": 0.24745136499404907, + "learning_rate": 9.48923162095536e-06, + "loss": 0.4013, + "step": 822 + }, + { + "epoch": 0.6931499157776531, + "grad_norm": 0.24586893618106842, + "learning_rate": 9.487070800062689e-06, + "loss": 0.4051, + "step": 823 + }, + { + "epoch": 0.6939921392476137, + "grad_norm": 0.22258132696151733, + "learning_rate": 9.48490566519508e-06, + "loss": 0.4183, + "step": 824 + }, + { + "epoch": 0.6948343627175744, + "grad_norm": 0.22149792313575745, + "learning_rate": 9.482736218434144e-06, + "loss": 0.3884, + "step": 825 + }, + { + "epoch": 0.6956765861875351, + "grad_norm": 0.22729527950286865, + "learning_rate": 9.480562461865634e-06, + "loss": 0.3696, + "step": 826 + }, + { + "epoch": 0.6965188096574958, + "grad_norm": 0.22042086720466614, + "learning_rate": 9.478384397579452e-06, + "loss": 0.3929, + "step": 827 + }, + { + "epoch": 0.6973610331274565, + "grad_norm": 0.25043973326683044, + "learning_rate": 9.476202027669644e-06, + "loss": 0.3997, + "step": 828 + }, + { + "epoch": 0.6982032565974172, + "grad_norm": 0.2305058091878891, + "learning_rate": 9.474015354234385e-06, + "loss": 0.4011, + "step": 829 + }, + { + "epoch": 0.6990454800673779, + "grad_norm": 0.25610336661338806, + "learning_rate": 9.471824379375998e-06, + "loss": 0.4031, + "step": 830 + }, + { + "epoch": 0.6998877035373385, + "grad_norm": 0.21562042832374573, + "learning_rate": 9.469629105200937e-06, + "loss": 0.3763, + "step": 831 + }, + { + "epoch": 0.7007299270072993, + "grad_norm": 0.2788180410861969, + "learning_rate": 9.46742953381979e-06, + "loss": 0.3857, + "step": 832 + }, + { + "epoch": 0.70157215047726, + "grad_norm": 0.4841333031654358, + "learning_rate": 9.465225667347275e-06, + "loss": 0.3926, + "step": 833 + }, + { + "epoch": 0.7024143739472206, + "grad_norm": 0.26868411898612976, + "learning_rate": 9.463017507902245e-06, + "loss": 0.3917, + "step": 834 + }, + { + "epoch": 0.7032565974171814, + "grad_norm": 0.2521004378795624, + "learning_rate": 9.460805057607671e-06, + "loss": 0.4101, + "step": 835 + }, + { + "epoch": 0.704098820887142, + "grad_norm": 0.24040234088897705, + "learning_rate": 9.458588318590659e-06, + "loss": 0.3656, + "step": 836 + }, + { + "epoch": 0.7049410443571027, + "grad_norm": 0.2537100613117218, + "learning_rate": 9.45636729298243e-06, + "loss": 0.3813, + "step": 837 + }, + { + "epoch": 0.7057832678270635, + "grad_norm": 0.23415488004684448, + "learning_rate": 9.45414198291833e-06, + "loss": 0.3937, + "step": 838 + }, + { + "epoch": 0.7066254912970241, + "grad_norm": 0.25770246982574463, + "learning_rate": 9.451912390537828e-06, + "loss": 0.3935, + "step": 839 + }, + { + "epoch": 0.7074677147669849, + "grad_norm": 0.25172489881515503, + "learning_rate": 9.449678517984503e-06, + "loss": 0.3832, + "step": 840 + }, + { + "epoch": 0.7083099382369455, + "grad_norm": 0.23774784803390503, + "learning_rate": 9.447440367406053e-06, + "loss": 0.3819, + "step": 841 + }, + { + "epoch": 0.7091521617069062, + "grad_norm": 0.25593727827072144, + "learning_rate": 9.445197940954292e-06, + "loss": 0.3968, + "step": 842 + }, + { + "epoch": 0.709994385176867, + "grad_norm": 0.24409858882427216, + "learning_rate": 9.442951240785135e-06, + "loss": 0.4202, + "step": 843 + }, + { + "epoch": 0.7108366086468276, + "grad_norm": 0.2531510293483734, + "learning_rate": 9.440700269058617e-06, + "loss": 0.3811, + "step": 844 + }, + { + "epoch": 0.7116788321167883, + "grad_norm": 0.27462396025657654, + "learning_rate": 9.438445027938873e-06, + "loss": 0.4055, + "step": 845 + }, + { + "epoch": 0.712521055586749, + "grad_norm": 0.25302422046661377, + "learning_rate": 9.436185519594145e-06, + "loss": 0.3937, + "step": 846 + }, + { + "epoch": 0.7133632790567097, + "grad_norm": 0.24274687469005585, + "learning_rate": 9.433921746196777e-06, + "loss": 0.4038, + "step": 847 + }, + { + "epoch": 0.7142055025266704, + "grad_norm": 0.7182270884513855, + "learning_rate": 9.431653709923214e-06, + "loss": 0.4178, + "step": 848 + }, + { + "epoch": 0.7150477259966311, + "grad_norm": 0.26638856530189514, + "learning_rate": 9.429381412954e-06, + "loss": 0.3985, + "step": 849 + }, + { + "epoch": 0.7158899494665918, + "grad_norm": 0.2720615565776825, + "learning_rate": 9.427104857473773e-06, + "loss": 0.3844, + "step": 850 + }, + { + "epoch": 0.7167321729365524, + "grad_norm": 0.2869759798049927, + "learning_rate": 9.424824045671267e-06, + "loss": 0.4208, + "step": 851 + }, + { + "epoch": 0.7175743964065132, + "grad_norm": 0.242397740483284, + "learning_rate": 9.422538979739307e-06, + "loss": 0.4087, + "step": 852 + }, + { + "epoch": 0.7184166198764739, + "grad_norm": 0.25077950954437256, + "learning_rate": 9.420249661874812e-06, + "loss": 0.403, + "step": 853 + }, + { + "epoch": 0.7192588433464346, + "grad_norm": 0.28628140687942505, + "learning_rate": 9.417956094278784e-06, + "loss": 0.3731, + "step": 854 + }, + { + "epoch": 0.7201010668163953, + "grad_norm": 0.2310740351676941, + "learning_rate": 9.415658279156312e-06, + "loss": 0.3969, + "step": 855 + }, + { + "epoch": 0.720943290286356, + "grad_norm": 0.28179624676704407, + "learning_rate": 9.41335621871657e-06, + "loss": 0.4003, + "step": 856 + }, + { + "epoch": 0.7217855137563167, + "grad_norm": 0.26215457916259766, + "learning_rate": 9.41104991517281e-06, + "loss": 0.3803, + "step": 857 + }, + { + "epoch": 0.7226277372262774, + "grad_norm": 0.24661622941493988, + "learning_rate": 9.408739370742372e-06, + "loss": 0.3907, + "step": 858 + }, + { + "epoch": 0.723469960696238, + "grad_norm": 0.2931444048881531, + "learning_rate": 9.406424587646664e-06, + "loss": 0.4037, + "step": 859 + }, + { + "epoch": 0.7243121841661988, + "grad_norm": 0.21865636110305786, + "learning_rate": 9.404105568111173e-06, + "loss": 0.392, + "step": 860 + }, + { + "epoch": 0.7251544076361595, + "grad_norm": 0.2503955364227295, + "learning_rate": 9.401782314365458e-06, + "loss": 0.3787, + "step": 861 + }, + { + "epoch": 0.7259966311061201, + "grad_norm": 0.2230653613805771, + "learning_rate": 9.39945482864315e-06, + "loss": 0.4023, + "step": 862 + }, + { + "epoch": 0.7268388545760809, + "grad_norm": 0.2315695583820343, + "learning_rate": 9.39712311318195e-06, + "loss": 0.3925, + "step": 863 + }, + { + "epoch": 0.7276810780460415, + "grad_norm": 0.2381676435470581, + "learning_rate": 9.39478717022362e-06, + "loss": 0.3936, + "step": 864 + }, + { + "epoch": 0.7285233015160022, + "grad_norm": 0.24494384229183197, + "learning_rate": 9.392447002013996e-06, + "loss": 0.3849, + "step": 865 + }, + { + "epoch": 0.729365524985963, + "grad_norm": 0.2394935041666031, + "learning_rate": 9.390102610802965e-06, + "loss": 0.3882, + "step": 866 + }, + { + "epoch": 0.7302077484559236, + "grad_norm": 0.24465805292129517, + "learning_rate": 9.387753998844482e-06, + "loss": 0.3765, + "step": 867 + }, + { + "epoch": 0.7310499719258844, + "grad_norm": 0.23144975304603577, + "learning_rate": 9.385401168396558e-06, + "loss": 0.3837, + "step": 868 + }, + { + "epoch": 0.731892195395845, + "grad_norm": 0.2408526986837387, + "learning_rate": 9.383044121721257e-06, + "loss": 0.3716, + "step": 869 + }, + { + "epoch": 0.7327344188658057, + "grad_norm": 0.23346202075481415, + "learning_rate": 9.380682861084703e-06, + "loss": 0.3978, + "step": 870 + }, + { + "epoch": 0.7335766423357665, + "grad_norm": 0.2438187599182129, + "learning_rate": 9.378317388757062e-06, + "loss": 0.3938, + "step": 871 + }, + { + "epoch": 0.7344188658057271, + "grad_norm": 0.23521101474761963, + "learning_rate": 9.375947707012558e-06, + "loss": 0.3637, + "step": 872 + }, + { + "epoch": 0.7352610892756878, + "grad_norm": 0.2562772035598755, + "learning_rate": 9.37357381812946e-06, + "loss": 0.3888, + "step": 873 + }, + { + "epoch": 0.7361033127456486, + "grad_norm": 0.24033290147781372, + "learning_rate": 9.371195724390075e-06, + "loss": 0.3761, + "step": 874 + }, + { + "epoch": 0.7369455362156092, + "grad_norm": 0.25617146492004395, + "learning_rate": 9.368813428080763e-06, + "loss": 0.3926, + "step": 875 + }, + { + "epoch": 0.7377877596855699, + "grad_norm": 0.24614277482032776, + "learning_rate": 9.366426931491917e-06, + "loss": 0.3763, + "step": 876 + }, + { + "epoch": 0.7386299831555306, + "grad_norm": 0.24150444567203522, + "learning_rate": 9.364036236917972e-06, + "loss": 0.4064, + "step": 877 + }, + { + "epoch": 0.7394722066254913, + "grad_norm": 0.26152312755584717, + "learning_rate": 9.361641346657396e-06, + "loss": 0.3918, + "step": 878 + }, + { + "epoch": 0.740314430095452, + "grad_norm": 0.20616672933101654, + "learning_rate": 9.359242263012693e-06, + "loss": 0.4021, + "step": 879 + }, + { + "epoch": 0.7411566535654127, + "grad_norm": 0.23581033945083618, + "learning_rate": 9.356838988290401e-06, + "loss": 0.3902, + "step": 880 + }, + { + "epoch": 0.7419988770353734, + "grad_norm": 0.24657794833183289, + "learning_rate": 9.354431524801082e-06, + "loss": 0.4069, + "step": 881 + }, + { + "epoch": 0.742841100505334, + "grad_norm": 0.21929241716861725, + "learning_rate": 9.352019874859326e-06, + "loss": 0.3884, + "step": 882 + }, + { + "epoch": 0.7436833239752948, + "grad_norm": 0.24844641983509064, + "learning_rate": 9.349604040783754e-06, + "loss": 0.3814, + "step": 883 + }, + { + "epoch": 0.7445255474452555, + "grad_norm": 0.24313275516033173, + "learning_rate": 9.347184024897003e-06, + "loss": 0.4, + "step": 884 + }, + { + "epoch": 0.7453677709152162, + "grad_norm": 0.24789856374263763, + "learning_rate": 9.344759829525734e-06, + "loss": 0.4097, + "step": 885 + }, + { + "epoch": 0.7462099943851769, + "grad_norm": 0.24578550457954407, + "learning_rate": 9.342331457000621e-06, + "loss": 0.4079, + "step": 886 + }, + { + "epoch": 0.7470522178551375, + "grad_norm": 0.23694393038749695, + "learning_rate": 9.339898909656364e-06, + "loss": 0.4013, + "step": 887 + }, + { + "epoch": 0.7478944413250983, + "grad_norm": 0.2592124342918396, + "learning_rate": 9.33746218983167e-06, + "loss": 0.3991, + "step": 888 + }, + { + "epoch": 0.748736664795059, + "grad_norm": 0.23632554709911346, + "learning_rate": 9.335021299869256e-06, + "loss": 0.4267, + "step": 889 + }, + { + "epoch": 0.7495788882650196, + "grad_norm": 0.24402032792568207, + "learning_rate": 9.332576242115852e-06, + "loss": 0.3786, + "step": 890 + }, + { + "epoch": 0.7504211117349804, + "grad_norm": 0.22527766227722168, + "learning_rate": 9.330127018922195e-06, + "loss": 0.3789, + "step": 891 + }, + { + "epoch": 0.751263335204941, + "grad_norm": 0.2471362203359604, + "learning_rate": 9.327673632643021e-06, + "loss": 0.4083, + "step": 892 + }, + { + "epoch": 0.7521055586749017, + "grad_norm": 0.23946036398410797, + "learning_rate": 9.32521608563708e-06, + "loss": 0.3869, + "step": 893 + }, + { + "epoch": 0.7529477821448625, + "grad_norm": 0.275928795337677, + "learning_rate": 9.32275438026711e-06, + "loss": 0.3786, + "step": 894 + }, + { + "epoch": 0.7537900056148231, + "grad_norm": 0.21995076537132263, + "learning_rate": 9.320288518899853e-06, + "loss": 0.3825, + "step": 895 + }, + { + "epoch": 0.7546322290847838, + "grad_norm": 0.24087098240852356, + "learning_rate": 9.317818503906046e-06, + "loss": 0.3836, + "step": 896 + }, + { + "epoch": 0.7554744525547445, + "grad_norm": 0.23902052640914917, + "learning_rate": 9.315344337660422e-06, + "loss": 0.3879, + "step": 897 + }, + { + "epoch": 0.7563166760247052, + "grad_norm": 0.23242321610450745, + "learning_rate": 9.312866022541697e-06, + "loss": 0.3837, + "step": 898 + }, + { + "epoch": 0.757158899494666, + "grad_norm": 0.23153363168239594, + "learning_rate": 9.310383560932587e-06, + "loss": 0.3842, + "step": 899 + }, + { + "epoch": 0.7580011229646266, + "grad_norm": 0.24144352972507477, + "learning_rate": 9.307896955219787e-06, + "loss": 0.3744, + "step": 900 + }, + { + "epoch": 0.7588433464345873, + "grad_norm": 0.3434450626373291, + "learning_rate": 9.305406207793974e-06, + "loss": 0.4151, + "step": 901 + }, + { + "epoch": 0.759685569904548, + "grad_norm": 0.23582684993743896, + "learning_rate": 9.302911321049818e-06, + "loss": 0.3812, + "step": 902 + }, + { + "epoch": 0.7605277933745087, + "grad_norm": 0.24953168630599976, + "learning_rate": 9.300412297385954e-06, + "loss": 0.39, + "step": 903 + }, + { + "epoch": 0.7613700168444694, + "grad_norm": 0.30013614892959595, + "learning_rate": 9.297909139205005e-06, + "loss": 0.4118, + "step": 904 + }, + { + "epoch": 0.7622122403144301, + "grad_norm": 0.24813398718833923, + "learning_rate": 9.295401848913569e-06, + "loss": 0.3836, + "step": 905 + }, + { + "epoch": 0.7630544637843908, + "grad_norm": 0.226630300283432, + "learning_rate": 9.29289042892221e-06, + "loss": 0.3875, + "step": 906 + }, + { + "epoch": 0.7638966872543514, + "grad_norm": 0.23602192103862762, + "learning_rate": 9.290374881645465e-06, + "loss": 0.4, + "step": 907 + }, + { + "epoch": 0.7647389107243122, + "grad_norm": 0.24765224754810333, + "learning_rate": 9.287855209501844e-06, + "loss": 0.4083, + "step": 908 + }, + { + "epoch": 0.7655811341942729, + "grad_norm": 0.27086493372917175, + "learning_rate": 9.285331414913816e-06, + "loss": 0.4144, + "step": 909 + }, + { + "epoch": 0.7664233576642335, + "grad_norm": 0.22973863780498505, + "learning_rate": 9.282803500307818e-06, + "loss": 0.3661, + "step": 910 + }, + { + "epoch": 0.7672655811341943, + "grad_norm": 0.30480989813804626, + "learning_rate": 9.280271468114243e-06, + "loss": 0.3913, + "step": 911 + }, + { + "epoch": 0.768107804604155, + "grad_norm": 0.26645076274871826, + "learning_rate": 9.277735320767449e-06, + "loss": 0.4083, + "step": 912 + }, + { + "epoch": 0.7689500280741156, + "grad_norm": 0.27367421984672546, + "learning_rate": 9.275195060705749e-06, + "loss": 0.3795, + "step": 913 + }, + { + "epoch": 0.7697922515440764, + "grad_norm": 0.3100326359272003, + "learning_rate": 9.272650690371403e-06, + "loss": 0.3931, + "step": 914 + }, + { + "epoch": 0.770634475014037, + "grad_norm": 0.2507917582988739, + "learning_rate": 9.270102212210632e-06, + "loss": 0.3892, + "step": 915 + }, + { + "epoch": 0.7714766984839978, + "grad_norm": 0.245370551943779, + "learning_rate": 9.267549628673603e-06, + "loss": 0.3953, + "step": 916 + }, + { + "epoch": 0.7723189219539585, + "grad_norm": 0.2668488621711731, + "learning_rate": 9.264992942214427e-06, + "loss": 0.4083, + "step": 917 + }, + { + "epoch": 0.7731611454239191, + "grad_norm": 0.257581502199173, + "learning_rate": 9.262432155291167e-06, + "loss": 0.3755, + "step": 918 + }, + { + "epoch": 0.7740033688938799, + "grad_norm": 0.24320945143699646, + "learning_rate": 9.25986727036582e-06, + "loss": 0.3707, + "step": 919 + }, + { + "epoch": 0.7748455923638405, + "grad_norm": 0.2300320714712143, + "learning_rate": 9.257298289904324e-06, + "loss": 0.376, + "step": 920 + }, + { + "epoch": 0.7756878158338012, + "grad_norm": 0.2596640884876251, + "learning_rate": 9.254725216376562e-06, + "loss": 0.3885, + "step": 921 + }, + { + "epoch": 0.776530039303762, + "grad_norm": 0.2934633791446686, + "learning_rate": 9.252148052256343e-06, + "loss": 0.3939, + "step": 922 + }, + { + "epoch": 0.7773722627737226, + "grad_norm": 0.23935237526893616, + "learning_rate": 9.249566800021417e-06, + "loss": 0.4103, + "step": 923 + }, + { + "epoch": 0.7782144862436833, + "grad_norm": 0.25070446729660034, + "learning_rate": 9.246981462153456e-06, + "loss": 0.4276, + "step": 924 + }, + { + "epoch": 0.779056709713644, + "grad_norm": 0.23785822093486786, + "learning_rate": 9.244392041138068e-06, + "loss": 0.3645, + "step": 925 + }, + { + "epoch": 0.7798989331836047, + "grad_norm": 0.32643628120422363, + "learning_rate": 9.24179853946478e-06, + "loss": 0.4096, + "step": 926 + }, + { + "epoch": 0.7807411566535654, + "grad_norm": 0.23632368445396423, + "learning_rate": 9.239200959627048e-06, + "loss": 0.3805, + "step": 927 + }, + { + "epoch": 0.7815833801235261, + "grad_norm": 0.23155541718006134, + "learning_rate": 9.236599304122246e-06, + "loss": 0.3825, + "step": 928 + }, + { + "epoch": 0.7824256035934868, + "grad_norm": 0.25782865285873413, + "learning_rate": 9.233993575451663e-06, + "loss": 0.3925, + "step": 929 + }, + { + "epoch": 0.7832678270634476, + "grad_norm": 0.25167566537857056, + "learning_rate": 9.231383776120512e-06, + "loss": 0.3869, + "step": 930 + }, + { + "epoch": 0.7841100505334082, + "grad_norm": 0.3077695369720459, + "learning_rate": 9.228769908637912e-06, + "loss": 0.4098, + "step": 931 + }, + { + "epoch": 0.7849522740033689, + "grad_norm": 0.231857568025589, + "learning_rate": 9.226151975516897e-06, + "loss": 0.4046, + "step": 932 + }, + { + "epoch": 0.7857944974733296, + "grad_norm": 0.25358080863952637, + "learning_rate": 9.223529979274411e-06, + "loss": 0.3868, + "step": 933 + }, + { + "epoch": 0.7866367209432903, + "grad_norm": 0.23811326920986176, + "learning_rate": 9.220903922431302e-06, + "loss": 0.3926, + "step": 934 + }, + { + "epoch": 0.787478944413251, + "grad_norm": 0.2356197088956833, + "learning_rate": 9.218273807512318e-06, + "loss": 0.4042, + "step": 935 + }, + { + "epoch": 0.7883211678832117, + "grad_norm": 0.25609177350997925, + "learning_rate": 9.215639637046121e-06, + "loss": 0.3894, + "step": 936 + }, + { + "epoch": 0.7891633913531724, + "grad_norm": 0.2308328002691269, + "learning_rate": 9.213001413565259e-06, + "loss": 0.4005, + "step": 937 + }, + { + "epoch": 0.790005614823133, + "grad_norm": 0.2398657649755478, + "learning_rate": 9.210359139606183e-06, + "loss": 0.3909, + "step": 938 + }, + { + "epoch": 0.7908478382930938, + "grad_norm": 0.2556980550289154, + "learning_rate": 9.207712817709237e-06, + "loss": 0.3864, + "step": 939 + }, + { + "epoch": 0.7916900617630545, + "grad_norm": 0.2326350063085556, + "learning_rate": 9.205062450418655e-06, + "loss": 0.3678, + "step": 940 + }, + { + "epoch": 0.7925322852330151, + "grad_norm": 0.2267482876777649, + "learning_rate": 9.202408040282567e-06, + "loss": 0.3648, + "step": 941 + }, + { + "epoch": 0.7933745087029759, + "grad_norm": 0.24878454208374023, + "learning_rate": 9.19974958985298e-06, + "loss": 0.3939, + "step": 942 + }, + { + "epoch": 0.7942167321729365, + "grad_norm": 0.24260170757770538, + "learning_rate": 9.197087101685794e-06, + "loss": 0.3923, + "step": 943 + }, + { + "epoch": 0.7950589556428973, + "grad_norm": 0.24691571295261383, + "learning_rate": 9.194420578340785e-06, + "loss": 0.3934, + "step": 944 + }, + { + "epoch": 0.795901179112858, + "grad_norm": 0.24263276159763336, + "learning_rate": 9.191750022381613e-06, + "loss": 0.3892, + "step": 945 + }, + { + "epoch": 0.7967434025828186, + "grad_norm": 0.26982370018959045, + "learning_rate": 9.189075436375813e-06, + "loss": 0.4148, + "step": 946 + }, + { + "epoch": 0.7975856260527794, + "grad_norm": 0.2315046787261963, + "learning_rate": 9.186396822894792e-06, + "loss": 0.3889, + "step": 947 + }, + { + "epoch": 0.79842784952274, + "grad_norm": 0.25008097290992737, + "learning_rate": 9.183714184513832e-06, + "loss": 0.3772, + "step": 948 + }, + { + "epoch": 0.7992700729927007, + "grad_norm": 0.24061961472034454, + "learning_rate": 9.181027523812088e-06, + "loss": 0.3838, + "step": 949 + }, + { + "epoch": 0.8001122964626615, + "grad_norm": 0.25396090745925903, + "learning_rate": 9.178336843372576e-06, + "loss": 0.3902, + "step": 950 + }, + { + "epoch": 0.8009545199326221, + "grad_norm": 0.24839384853839874, + "learning_rate": 9.175642145782179e-06, + "loss": 0.3847, + "step": 951 + }, + { + "epoch": 0.8017967434025828, + "grad_norm": 0.273550420999527, + "learning_rate": 9.172943433631642e-06, + "loss": 0.3958, + "step": 952 + }, + { + "epoch": 0.8026389668725435, + "grad_norm": 0.2734324634075165, + "learning_rate": 9.170240709515573e-06, + "loss": 0.3939, + "step": 953 + }, + { + "epoch": 0.8034811903425042, + "grad_norm": 0.2695125341415405, + "learning_rate": 9.16753397603243e-06, + "loss": 0.387, + "step": 954 + }, + { + "epoch": 0.8043234138124649, + "grad_norm": 0.2518448233604431, + "learning_rate": 9.164823235784535e-06, + "loss": 0.391, + "step": 955 + }, + { + "epoch": 0.8051656372824256, + "grad_norm": 0.2905693054199219, + "learning_rate": 9.162108491378051e-06, + "loss": 0.4143, + "step": 956 + }, + { + "epoch": 0.8060078607523863, + "grad_norm": 0.2615605890750885, + "learning_rate": 9.159389745423003e-06, + "loss": 0.3993, + "step": 957 + }, + { + "epoch": 0.8068500842223469, + "grad_norm": 0.2930067777633667, + "learning_rate": 9.156667000533251e-06, + "loss": 0.3892, + "step": 958 + }, + { + "epoch": 0.8076923076923077, + "grad_norm": 0.2685520648956299, + "learning_rate": 9.153940259326511e-06, + "loss": 0.4065, + "step": 959 + }, + { + "epoch": 0.8085345311622684, + "grad_norm": 0.2457975298166275, + "learning_rate": 9.151209524424333e-06, + "loss": 0.3706, + "step": 960 + }, + { + "epoch": 0.8093767546322291, + "grad_norm": 0.27907201647758484, + "learning_rate": 9.14847479845211e-06, + "loss": 0.4101, + "step": 961 + }, + { + "epoch": 0.8102189781021898, + "grad_norm": 0.2515687942504883, + "learning_rate": 9.145736084039073e-06, + "loss": 0.3965, + "step": 962 + }, + { + "epoch": 0.8110612015721504, + "grad_norm": 0.24229024350643158, + "learning_rate": 9.142993383818284e-06, + "loss": 0.3875, + "step": 963 + }, + { + "epoch": 0.8119034250421112, + "grad_norm": 0.27047234773635864, + "learning_rate": 9.14024670042664e-06, + "loss": 0.4292, + "step": 964 + }, + { + "epoch": 0.8127456485120719, + "grad_norm": 0.26469686627388, + "learning_rate": 9.137496036504868e-06, + "loss": 0.372, + "step": 965 + }, + { + "epoch": 0.8135878719820325, + "grad_norm": 0.24354217946529388, + "learning_rate": 9.134741394697517e-06, + "loss": 0.3789, + "step": 966 + }, + { + "epoch": 0.8144300954519933, + "grad_norm": 0.23837910592556, + "learning_rate": 9.131982777652967e-06, + "loss": 0.3942, + "step": 967 + }, + { + "epoch": 0.815272318921954, + "grad_norm": 0.22872686386108398, + "learning_rate": 9.129220188023419e-06, + "loss": 0.3917, + "step": 968 + }, + { + "epoch": 0.8161145423919146, + "grad_norm": 0.22326874732971191, + "learning_rate": 9.126453628464889e-06, + "loss": 0.3837, + "step": 969 + }, + { + "epoch": 0.8169567658618754, + "grad_norm": 0.24706344306468964, + "learning_rate": 9.12368310163721e-06, + "loss": 0.4004, + "step": 970 + }, + { + "epoch": 0.817798989331836, + "grad_norm": 0.24931980669498444, + "learning_rate": 9.120908610204036e-06, + "loss": 0.4093, + "step": 971 + }, + { + "epoch": 0.8186412128017967, + "grad_norm": 0.23322880268096924, + "learning_rate": 9.118130156832823e-06, + "loss": 0.3866, + "step": 972 + }, + { + "epoch": 0.8194834362717575, + "grad_norm": 0.22499699890613556, + "learning_rate": 9.115347744194844e-06, + "loss": 0.3999, + "step": 973 + }, + { + "epoch": 0.8203256597417181, + "grad_norm": 0.22839175164699554, + "learning_rate": 9.112561374965177e-06, + "loss": 0.4033, + "step": 974 + }, + { + "epoch": 0.8211678832116789, + "grad_norm": 0.22353333234786987, + "learning_rate": 9.109771051822702e-06, + "loss": 0.3648, + "step": 975 + }, + { + "epoch": 0.8220101066816395, + "grad_norm": 0.3328852653503418, + "learning_rate": 9.106976777450099e-06, + "loss": 0.4012, + "step": 976 + }, + { + "epoch": 0.8228523301516002, + "grad_norm": 0.26589012145996094, + "learning_rate": 9.10417855453385e-06, + "loss": 0.3886, + "step": 977 + }, + { + "epoch": 0.823694553621561, + "grad_norm": 0.2546745538711548, + "learning_rate": 9.10137638576423e-06, + "loss": 0.3997, + "step": 978 + }, + { + "epoch": 0.8245367770915216, + "grad_norm": 0.23469386994838715, + "learning_rate": 9.098570273835314e-06, + "loss": 0.3859, + "step": 979 + }, + { + "epoch": 0.8253790005614823, + "grad_norm": 0.23561683297157288, + "learning_rate": 9.09576022144496e-06, + "loss": 0.3831, + "step": 980 + }, + { + "epoch": 0.826221224031443, + "grad_norm": 0.28168466687202454, + "learning_rate": 9.09294623129482e-06, + "loss": 0.3641, + "step": 981 + }, + { + "epoch": 0.8270634475014037, + "grad_norm": 0.24064616858959198, + "learning_rate": 9.090128306090329e-06, + "loss": 0.4049, + "step": 982 + }, + { + "epoch": 0.8279056709713644, + "grad_norm": 0.2215161919593811, + "learning_rate": 9.087306448540707e-06, + "loss": 0.3992, + "step": 983 + }, + { + "epoch": 0.8287478944413251, + "grad_norm": 0.2595074772834778, + "learning_rate": 9.084480661358954e-06, + "loss": 0.3823, + "step": 984 + }, + { + "epoch": 0.8295901179112858, + "grad_norm": 0.23889648914337158, + "learning_rate": 9.081650947261847e-06, + "loss": 0.3656, + "step": 985 + }, + { + "epoch": 0.8304323413812464, + "grad_norm": 0.24093107879161835, + "learning_rate": 9.07881730896994e-06, + "loss": 0.3919, + "step": 986 + }, + { + "epoch": 0.8312745648512072, + "grad_norm": 0.2698620557785034, + "learning_rate": 9.07597974920756e-06, + "loss": 0.3986, + "step": 987 + }, + { + "epoch": 0.8321167883211679, + "grad_norm": 0.2287716567516327, + "learning_rate": 9.073138270702804e-06, + "loss": 0.3815, + "step": 988 + }, + { + "epoch": 0.8329590117911286, + "grad_norm": 0.237266406416893, + "learning_rate": 9.070292876187532e-06, + "loss": 0.3726, + "step": 989 + }, + { + "epoch": 0.8338012352610893, + "grad_norm": 0.2581973075866699, + "learning_rate": 9.067443568397378e-06, + "loss": 0.4076, + "step": 990 + }, + { + "epoch": 0.83464345873105, + "grad_norm": 0.24701803922653198, + "learning_rate": 9.06459035007173e-06, + "loss": 0.4063, + "step": 991 + }, + { + "epoch": 0.8354856822010107, + "grad_norm": 0.24334250390529633, + "learning_rate": 9.061733223953738e-06, + "loss": 0.3882, + "step": 992 + }, + { + "epoch": 0.8363279056709714, + "grad_norm": 0.2147740125656128, + "learning_rate": 9.058872192790314e-06, + "loss": 0.3798, + "step": 993 + }, + { + "epoch": 0.837170129140932, + "grad_norm": 0.24799306690692902, + "learning_rate": 9.056007259332115e-06, + "loss": 0.415, + "step": 994 + }, + { + "epoch": 0.8380123526108928, + "grad_norm": 0.23995551466941833, + "learning_rate": 9.053138426333562e-06, + "loss": 0.4021, + "step": 995 + }, + { + "epoch": 0.8388545760808535, + "grad_norm": 0.24283404648303986, + "learning_rate": 9.05026569655281e-06, + "loss": 0.3929, + "step": 996 + }, + { + "epoch": 0.8396967995508141, + "grad_norm": 0.2712905704975128, + "learning_rate": 9.047389072751777e-06, + "loss": 0.3717, + "step": 997 + }, + { + "epoch": 0.8405390230207749, + "grad_norm": 0.27794942259788513, + "learning_rate": 9.044508557696111e-06, + "loss": 0.3774, + "step": 998 + }, + { + "epoch": 0.8413812464907355, + "grad_norm": 0.26650184392929077, + "learning_rate": 9.041624154155208e-06, + "loss": 0.3967, + "step": 999 + }, + { + "epoch": 0.8422234699606962, + "grad_norm": 0.30274587869644165, + "learning_rate": 9.038735864902201e-06, + "loss": 0.3873, + "step": 1000 + }, + { + "epoch": 0.843065693430657, + "grad_norm": 0.2814517617225647, + "learning_rate": 9.035843692713961e-06, + "loss": 0.391, + "step": 1001 + }, + { + "epoch": 0.8439079169006176, + "grad_norm": 0.23480753600597382, + "learning_rate": 9.032947640371086e-06, + "loss": 0.3931, + "step": 1002 + }, + { + "epoch": 0.8447501403705783, + "grad_norm": 0.2912794351577759, + "learning_rate": 9.030047710657912e-06, + "loss": 0.3905, + "step": 1003 + }, + { + "epoch": 0.845592363840539, + "grad_norm": 0.26708126068115234, + "learning_rate": 9.027143906362499e-06, + "loss": 0.3829, + "step": 1004 + }, + { + "epoch": 0.8464345873104997, + "grad_norm": 0.2397577464580536, + "learning_rate": 9.02423623027663e-06, + "loss": 0.3829, + "step": 1005 + }, + { + "epoch": 0.8472768107804605, + "grad_norm": 0.2438420057296753, + "learning_rate": 9.021324685195814e-06, + "loss": 0.3888, + "step": 1006 + }, + { + "epoch": 0.8481190342504211, + "grad_norm": 0.24794094264507294, + "learning_rate": 9.018409273919279e-06, + "loss": 0.378, + "step": 1007 + }, + { + "epoch": 0.8489612577203818, + "grad_norm": 0.2339514195919037, + "learning_rate": 9.01548999924997e-06, + "loss": 0.4014, + "step": 1008 + }, + { + "epoch": 0.8498034811903425, + "grad_norm": 0.2431575357913971, + "learning_rate": 9.012566863994548e-06, + "loss": 0.3975, + "step": 1009 + }, + { + "epoch": 0.8506457046603032, + "grad_norm": 0.23573164641857147, + "learning_rate": 9.00963987096338e-06, + "loss": 0.3963, + "step": 1010 + }, + { + "epoch": 0.8514879281302639, + "grad_norm": 0.245757058262825, + "learning_rate": 9.006709022970547e-06, + "loss": 0.4054, + "step": 1011 + }, + { + "epoch": 0.8523301516002246, + "grad_norm": 0.24755309522151947, + "learning_rate": 9.003774322833835e-06, + "loss": 0.3954, + "step": 1012 + }, + { + "epoch": 0.8531723750701853, + "grad_norm": 0.25644731521606445, + "learning_rate": 9.000835773374733e-06, + "loss": 0.3977, + "step": 1013 + }, + { + "epoch": 0.8540145985401459, + "grad_norm": 0.253381609916687, + "learning_rate": 8.997893377418432e-06, + "loss": 0.4008, + "step": 1014 + }, + { + "epoch": 0.8548568220101067, + "grad_norm": 0.2430805116891861, + "learning_rate": 8.99494713779382e-06, + "loss": 0.3724, + "step": 1015 + }, + { + "epoch": 0.8556990454800674, + "grad_norm": 0.24902412295341492, + "learning_rate": 8.991997057333481e-06, + "loss": 0.3809, + "step": 1016 + }, + { + "epoch": 0.856541268950028, + "grad_norm": 0.2504844665527344, + "learning_rate": 8.98904313887369e-06, + "loss": 0.397, + "step": 1017 + }, + { + "epoch": 0.8573834924199888, + "grad_norm": 0.28356367349624634, + "learning_rate": 8.986085385254417e-06, + "loss": 0.4272, + "step": 1018 + }, + { + "epoch": 0.8582257158899494, + "grad_norm": 0.2447170913219452, + "learning_rate": 8.983123799319312e-06, + "loss": 0.3974, + "step": 1019 + }, + { + "epoch": 0.8590679393599102, + "grad_norm": 0.21878619492053986, + "learning_rate": 8.980158383915714e-06, + "loss": 0.3867, + "step": 1020 + }, + { + "epoch": 0.8599101628298709, + "grad_norm": 0.254640132188797, + "learning_rate": 8.977189141894645e-06, + "loss": 0.4221, + "step": 1021 + }, + { + "epoch": 0.8607523862998315, + "grad_norm": 0.2748672664165497, + "learning_rate": 8.9742160761108e-06, + "loss": 0.3782, + "step": 1022 + }, + { + "epoch": 0.8615946097697923, + "grad_norm": 0.24861536920070648, + "learning_rate": 8.971239189422555e-06, + "loss": 0.374, + "step": 1023 + }, + { + "epoch": 0.862436833239753, + "grad_norm": 0.28852665424346924, + "learning_rate": 8.968258484691961e-06, + "loss": 0.4046, + "step": 1024 + }, + { + "epoch": 0.8632790567097136, + "grad_norm": 0.24399378895759583, + "learning_rate": 8.965273964784735e-06, + "loss": 0.3799, + "step": 1025 + }, + { + "epoch": 0.8641212801796744, + "grad_norm": 0.24471896886825562, + "learning_rate": 8.962285632570266e-06, + "loss": 0.3918, + "step": 1026 + }, + { + "epoch": 0.864963503649635, + "grad_norm": 0.2678011953830719, + "learning_rate": 8.959293490921606e-06, + "loss": 0.3867, + "step": 1027 + }, + { + "epoch": 0.8658057271195957, + "grad_norm": 0.2465362846851349, + "learning_rate": 8.956297542715469e-06, + "loss": 0.3904, + "step": 1028 + }, + { + "epoch": 0.8666479505895565, + "grad_norm": 0.268699586391449, + "learning_rate": 8.953297790832231e-06, + "loss": 0.3987, + "step": 1029 + }, + { + "epoch": 0.8674901740595171, + "grad_norm": 0.23215501010417938, + "learning_rate": 8.950294238155924e-06, + "loss": 0.3802, + "step": 1030 + }, + { + "epoch": 0.8683323975294778, + "grad_norm": 0.24207451939582825, + "learning_rate": 8.947286887574234e-06, + "loss": 0.402, + "step": 1031 + }, + { + "epoch": 0.8691746209994385, + "grad_norm": 0.2715492248535156, + "learning_rate": 8.944275741978495e-06, + "loss": 0.3945, + "step": 1032 + }, + { + "epoch": 0.8700168444693992, + "grad_norm": 0.2414940595626831, + "learning_rate": 8.941260804263697e-06, + "loss": 0.4273, + "step": 1033 + }, + { + "epoch": 0.87085906793936, + "grad_norm": 0.2749345600605011, + "learning_rate": 8.938242077328469e-06, + "loss": 0.4063, + "step": 1034 + }, + { + "epoch": 0.8717012914093206, + "grad_norm": 0.24765220284461975, + "learning_rate": 8.935219564075087e-06, + "loss": 0.3833, + "step": 1035 + }, + { + "epoch": 0.8725435148792813, + "grad_norm": 0.24345092475414276, + "learning_rate": 8.932193267409465e-06, + "loss": 0.3933, + "step": 1036 + }, + { + "epoch": 0.873385738349242, + "grad_norm": 0.2580495774745941, + "learning_rate": 8.929163190241157e-06, + "loss": 0.3869, + "step": 1037 + }, + { + "epoch": 0.8742279618192027, + "grad_norm": 0.2621128559112549, + "learning_rate": 8.92612933548335e-06, + "loss": 0.3825, + "step": 1038 + }, + { + "epoch": 0.8750701852891634, + "grad_norm": 0.24727575480937958, + "learning_rate": 8.923091706052863e-06, + "loss": 0.3659, + "step": 1039 + }, + { + "epoch": 0.8759124087591241, + "grad_norm": 0.25439199805259705, + "learning_rate": 8.920050304870142e-06, + "loss": 0.3817, + "step": 1040 + }, + { + "epoch": 0.8767546322290848, + "grad_norm": 0.2469315528869629, + "learning_rate": 8.917005134859263e-06, + "loss": 0.3881, + "step": 1041 + }, + { + "epoch": 0.8775968556990454, + "grad_norm": 0.21084031462669373, + "learning_rate": 8.913956198947923e-06, + "loss": 0.3784, + "step": 1042 + }, + { + "epoch": 0.8784390791690062, + "grad_norm": 0.24740955233573914, + "learning_rate": 8.910903500067443e-06, + "loss": 0.3902, + "step": 1043 + }, + { + "epoch": 0.8792813026389669, + "grad_norm": 0.25529807806015015, + "learning_rate": 8.907847041152757e-06, + "loss": 0.391, + "step": 1044 + }, + { + "epoch": 0.8801235261089275, + "grad_norm": 0.24274209141731262, + "learning_rate": 8.904786825142416e-06, + "loss": 0.3742, + "step": 1045 + }, + { + "epoch": 0.8809657495788883, + "grad_norm": 0.2258424311876297, + "learning_rate": 8.901722854978582e-06, + "loss": 0.3998, + "step": 1046 + }, + { + "epoch": 0.881807973048849, + "grad_norm": 0.24744433164596558, + "learning_rate": 8.89865513360703e-06, + "loss": 0.3859, + "step": 1047 + }, + { + "epoch": 0.8826501965188096, + "grad_norm": 0.2778688669204712, + "learning_rate": 8.89558366397714e-06, + "loss": 0.3787, + "step": 1048 + }, + { + "epoch": 0.8834924199887704, + "grad_norm": 0.22339139878749847, + "learning_rate": 8.892508449041893e-06, + "loss": 0.3919, + "step": 1049 + }, + { + "epoch": 0.884334643458731, + "grad_norm": 0.2479926347732544, + "learning_rate": 8.889429491757872e-06, + "loss": 0.3939, + "step": 1050 + }, + { + "epoch": 0.8851768669286918, + "grad_norm": 0.22446775436401367, + "learning_rate": 8.88634679508526e-06, + "loss": 0.3878, + "step": 1051 + }, + { + "epoch": 0.8860190903986525, + "grad_norm": 0.2394167184829712, + "learning_rate": 8.883260361987833e-06, + "loss": 0.3815, + "step": 1052 + }, + { + "epoch": 0.8868613138686131, + "grad_norm": 0.22700078785419464, + "learning_rate": 8.88017019543296e-06, + "loss": 0.3829, + "step": 1053 + }, + { + "epoch": 0.8877035373385739, + "grad_norm": 0.2256840020418167, + "learning_rate": 8.8770762983916e-06, + "loss": 0.4209, + "step": 1054 + }, + { + "epoch": 0.8885457608085345, + "grad_norm": 0.25572511553764343, + "learning_rate": 8.8739786738383e-06, + "loss": 0.3911, + "step": 1055 + }, + { + "epoch": 0.8893879842784952, + "grad_norm": 0.2469644397497177, + "learning_rate": 8.870877324751186e-06, + "loss": 0.4057, + "step": 1056 + }, + { + "epoch": 0.890230207748456, + "grad_norm": 0.23722214996814728, + "learning_rate": 8.867772254111966e-06, + "loss": 0.3695, + "step": 1057 + }, + { + "epoch": 0.8910724312184166, + "grad_norm": 0.22236628830432892, + "learning_rate": 8.864663464905933e-06, + "loss": 0.3813, + "step": 1058 + }, + { + "epoch": 0.8919146546883773, + "grad_norm": 0.21829207241535187, + "learning_rate": 8.861550960121946e-06, + "loss": 0.3859, + "step": 1059 + }, + { + "epoch": 0.892756878158338, + "grad_norm": 0.23551766574382782, + "learning_rate": 8.85843474275244e-06, + "loss": 0.4064, + "step": 1060 + }, + { + "epoch": 0.8935991016282987, + "grad_norm": 0.23779690265655518, + "learning_rate": 8.85531481579342e-06, + "loss": 0.3987, + "step": 1061 + }, + { + "epoch": 0.8944413250982594, + "grad_norm": 0.21609297394752502, + "learning_rate": 8.852191182244456e-06, + "loss": 0.3892, + "step": 1062 + }, + { + "epoch": 0.8952835485682201, + "grad_norm": 0.21627114713191986, + "learning_rate": 8.849063845108685e-06, + "loss": 0.3788, + "step": 1063 + }, + { + "epoch": 0.8961257720381808, + "grad_norm": 0.2338523417711258, + "learning_rate": 8.8459328073928e-06, + "loss": 0.3692, + "step": 1064 + }, + { + "epoch": 0.8969679955081415, + "grad_norm": 0.2511400282382965, + "learning_rate": 8.842798072107055e-06, + "loss": 0.3882, + "step": 1065 + }, + { + "epoch": 0.8978102189781022, + "grad_norm": 0.2453722059726715, + "learning_rate": 8.839659642265259e-06, + "loss": 0.3827, + "step": 1066 + }, + { + "epoch": 0.8986524424480629, + "grad_norm": 0.23561310768127441, + "learning_rate": 8.836517520884768e-06, + "loss": 0.3657, + "step": 1067 + }, + { + "epoch": 0.8994946659180236, + "grad_norm": 0.24510298669338226, + "learning_rate": 8.833371710986493e-06, + "loss": 0.3761, + "step": 1068 + }, + { + "epoch": 0.9003368893879843, + "grad_norm": 0.24968662858009338, + "learning_rate": 8.83022221559489e-06, + "loss": 0.3942, + "step": 1069 + }, + { + "epoch": 0.9011791128579449, + "grad_norm": 0.23057861626148224, + "learning_rate": 8.827069037737958e-06, + "loss": 0.3946, + "step": 1070 + }, + { + "epoch": 0.9020213363279057, + "grad_norm": 0.23505951464176178, + "learning_rate": 8.823912180447237e-06, + "loss": 0.3683, + "step": 1071 + }, + { + "epoch": 0.9028635597978664, + "grad_norm": 0.2223382592201233, + "learning_rate": 8.820751646757798e-06, + "loss": 0.3973, + "step": 1072 + }, + { + "epoch": 0.903705783267827, + "grad_norm": 0.21368998289108276, + "learning_rate": 8.81758743970826e-06, + "loss": 0.3947, + "step": 1073 + }, + { + "epoch": 0.9045480067377878, + "grad_norm": 0.2535130977630615, + "learning_rate": 8.81441956234076e-06, + "loss": 0.4189, + "step": 1074 + }, + { + "epoch": 0.9053902302077484, + "grad_norm": 0.24544082581996918, + "learning_rate": 8.81124801770097e-06, + "loss": 0.4065, + "step": 1075 + }, + { + "epoch": 0.9062324536777091, + "grad_norm": 0.22568069398403168, + "learning_rate": 8.80807280883809e-06, + "loss": 0.3902, + "step": 1076 + }, + { + "epoch": 0.9070746771476699, + "grad_norm": 0.24304959177970886, + "learning_rate": 8.804893938804839e-06, + "loss": 0.3855, + "step": 1077 + }, + { + "epoch": 0.9079169006176305, + "grad_norm": 0.24036061763763428, + "learning_rate": 8.801711410657456e-06, + "loss": 0.4042, + "step": 1078 + }, + { + "epoch": 0.9087591240875912, + "grad_norm": 0.252481609582901, + "learning_rate": 8.7985252274557e-06, + "loss": 0.3835, + "step": 1079 + }, + { + "epoch": 0.909601347557552, + "grad_norm": 0.2750056982040405, + "learning_rate": 8.795335392262841e-06, + "loss": 0.3889, + "step": 1080 + }, + { + "epoch": 0.9104435710275126, + "grad_norm": 0.3957831561565399, + "learning_rate": 8.79214190814566e-06, + "loss": 0.3763, + "step": 1081 + }, + { + "epoch": 0.9112857944974734, + "grad_norm": 0.24878466129302979, + "learning_rate": 8.78894477817445e-06, + "loss": 0.3649, + "step": 1082 + }, + { + "epoch": 0.912128017967434, + "grad_norm": 0.2372635006904602, + "learning_rate": 8.785744005423003e-06, + "loss": 0.3851, + "step": 1083 + }, + { + "epoch": 0.9129702414373947, + "grad_norm": 0.24955178797245026, + "learning_rate": 8.78253959296862e-06, + "loss": 0.3817, + "step": 1084 + }, + { + "epoch": 0.9138124649073555, + "grad_norm": 0.23643504083156586, + "learning_rate": 8.779331543892097e-06, + "loss": 0.3902, + "step": 1085 + }, + { + "epoch": 0.9146546883773161, + "grad_norm": 0.24997515976428986, + "learning_rate": 8.77611986127773e-06, + "loss": 0.3826, + "step": 1086 + }, + { + "epoch": 0.9154969118472768, + "grad_norm": 0.25293558835983276, + "learning_rate": 8.772904548213301e-06, + "loss": 0.3749, + "step": 1087 + }, + { + "epoch": 0.9163391353172375, + "grad_norm": 0.2371770143508911, + "learning_rate": 8.769685607790091e-06, + "loss": 0.3956, + "step": 1088 + }, + { + "epoch": 0.9171813587871982, + "grad_norm": 0.26978716254234314, + "learning_rate": 8.766463043102864e-06, + "loss": 0.3848, + "step": 1089 + }, + { + "epoch": 0.9180235822571589, + "grad_norm": 0.23945683240890503, + "learning_rate": 8.76323685724987e-06, + "loss": 0.3754, + "step": 1090 + }, + { + "epoch": 0.9188658057271196, + "grad_norm": 0.24600526690483093, + "learning_rate": 8.760007053332837e-06, + "loss": 0.3933, + "step": 1091 + }, + { + "epoch": 0.9197080291970803, + "grad_norm": 0.2496621310710907, + "learning_rate": 8.756773634456975e-06, + "loss": 0.4009, + "step": 1092 + }, + { + "epoch": 0.9205502526670409, + "grad_norm": 0.24451924860477448, + "learning_rate": 8.75353660373097e-06, + "loss": 0.4051, + "step": 1093 + }, + { + "epoch": 0.9213924761370017, + "grad_norm": 0.21420931816101074, + "learning_rate": 8.750295964266979e-06, + "loss": 0.3956, + "step": 1094 + }, + { + "epoch": 0.9222346996069624, + "grad_norm": 0.2390458881855011, + "learning_rate": 8.747051719180626e-06, + "loss": 0.4074, + "step": 1095 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 0.23600642383098602, + "learning_rate": 8.743803871591008e-06, + "loss": 0.3944, + "step": 1096 + }, + { + "epoch": 0.9239191465468838, + "grad_norm": 0.22579824924468994, + "learning_rate": 8.740552424620679e-06, + "loss": 0.3882, + "step": 1097 + }, + { + "epoch": 0.9247613700168444, + "grad_norm": 0.23936140537261963, + "learning_rate": 8.737297381395657e-06, + "loss": 0.3863, + "step": 1098 + }, + { + "epoch": 0.9256035934868052, + "grad_norm": 0.23146341741085052, + "learning_rate": 8.734038745045419e-06, + "loss": 0.3862, + "step": 1099 + }, + { + "epoch": 0.9264458169567659, + "grad_norm": 0.23204730451107025, + "learning_rate": 8.730776518702891e-06, + "loss": 0.3848, + "step": 1100 + }, + { + "epoch": 0.9272880404267265, + "grad_norm": 0.22360917925834656, + "learning_rate": 8.727510705504453e-06, + "loss": 0.3761, + "step": 1101 + }, + { + "epoch": 0.9281302638966873, + "grad_norm": 0.24749885499477386, + "learning_rate": 8.72424130858994e-06, + "loss": 0.4139, + "step": 1102 + }, + { + "epoch": 0.928972487366648, + "grad_norm": 0.21169915795326233, + "learning_rate": 8.72096833110262e-06, + "loss": 0.3653, + "step": 1103 + }, + { + "epoch": 0.9298147108366086, + "grad_norm": 0.24596849083900452, + "learning_rate": 8.717691776189214e-06, + "loss": 0.3915, + "step": 1104 + }, + { + "epoch": 0.9306569343065694, + "grad_norm": 0.2101602554321289, + "learning_rate": 8.714411646999878e-06, + "loss": 0.3699, + "step": 1105 + }, + { + "epoch": 0.93149915777653, + "grad_norm": 0.22556227445602417, + "learning_rate": 8.711127946688207e-06, + "loss": 0.4065, + "step": 1106 + }, + { + "epoch": 0.9323413812464907, + "grad_norm": 0.22517412900924683, + "learning_rate": 8.707840678411223e-06, + "loss": 0.3841, + "step": 1107 + }, + { + "epoch": 0.9331836047164515, + "grad_norm": 0.23653604090213776, + "learning_rate": 8.704549845329386e-06, + "loss": 0.3882, + "step": 1108 + }, + { + "epoch": 0.9340258281864121, + "grad_norm": 0.21519114077091217, + "learning_rate": 8.701255450606579e-06, + "loss": 0.3755, + "step": 1109 + }, + { + "epoch": 0.9348680516563729, + "grad_norm": 0.25170084834098816, + "learning_rate": 8.69795749741011e-06, + "loss": 0.408, + "step": 1110 + }, + { + "epoch": 0.9357102751263335, + "grad_norm": 0.23984849452972412, + "learning_rate": 8.694655988910707e-06, + "loss": 0.4137, + "step": 1111 + }, + { + "epoch": 0.9365524985962942, + "grad_norm": 0.22293125092983246, + "learning_rate": 8.69135092828252e-06, + "loss": 0.3885, + "step": 1112 + }, + { + "epoch": 0.937394722066255, + "grad_norm": 0.23699672520160675, + "learning_rate": 8.688042318703111e-06, + "loss": 0.3867, + "step": 1113 + }, + { + "epoch": 0.9382369455362156, + "grad_norm": 0.252868115901947, + "learning_rate": 8.684730163353457e-06, + "loss": 0.3769, + "step": 1114 + }, + { + "epoch": 0.9390791690061763, + "grad_norm": 0.24049875140190125, + "learning_rate": 8.681414465417936e-06, + "loss": 0.3782, + "step": 1115 + }, + { + "epoch": 0.939921392476137, + "grad_norm": 0.22941753268241882, + "learning_rate": 8.678095228084343e-06, + "loss": 0.4092, + "step": 1116 + }, + { + "epoch": 0.9407636159460977, + "grad_norm": 0.20538175106048584, + "learning_rate": 8.674772454543869e-06, + "loss": 0.4033, + "step": 1117 + }, + { + "epoch": 0.9416058394160584, + "grad_norm": 0.23638539016246796, + "learning_rate": 8.671446147991103e-06, + "loss": 0.3618, + "step": 1118 + }, + { + "epoch": 0.9424480628860191, + "grad_norm": 0.22866426408290863, + "learning_rate": 8.66811631162404e-06, + "loss": 0.3864, + "step": 1119 + }, + { + "epoch": 0.9432902863559798, + "grad_norm": 0.2309676706790924, + "learning_rate": 8.664782948644058e-06, + "loss": 0.4028, + "step": 1120 + }, + { + "epoch": 0.9441325098259404, + "grad_norm": 0.23818331956863403, + "learning_rate": 8.661446062255931e-06, + "loss": 0.3819, + "step": 1121 + }, + { + "epoch": 0.9449747332959012, + "grad_norm": 0.221882164478302, + "learning_rate": 8.65810565566782e-06, + "loss": 0.3762, + "step": 1122 + }, + { + "epoch": 0.9458169567658619, + "grad_norm": 0.25735414028167725, + "learning_rate": 8.654761732091271e-06, + "loss": 0.4059, + "step": 1123 + }, + { + "epoch": 0.9466591802358225, + "grad_norm": 0.23950034379959106, + "learning_rate": 8.65141429474121e-06, + "loss": 0.3819, + "step": 1124 + }, + { + "epoch": 0.9475014037057833, + "grad_norm": 0.22316765785217285, + "learning_rate": 8.648063346835943e-06, + "loss": 0.3974, + "step": 1125 + }, + { + "epoch": 0.9483436271757439, + "grad_norm": 0.23847147822380066, + "learning_rate": 8.644708891597147e-06, + "loss": 0.3841, + "step": 1126 + }, + { + "epoch": 0.9491858506457047, + "grad_norm": 0.21706604957580566, + "learning_rate": 8.641350932249876e-06, + "loss": 0.3754, + "step": 1127 + }, + { + "epoch": 0.9500280741156654, + "grad_norm": 0.22496441006660461, + "learning_rate": 8.637989472022548e-06, + "loss": 0.3879, + "step": 1128 + }, + { + "epoch": 0.950870297585626, + "grad_norm": 0.22241145372390747, + "learning_rate": 8.634624514146954e-06, + "loss": 0.3895, + "step": 1129 + }, + { + "epoch": 0.9517125210555868, + "grad_norm": 0.23969368636608124, + "learning_rate": 8.631256061858238e-06, + "loss": 0.3706, + "step": 1130 + }, + { + "epoch": 0.9525547445255474, + "grad_norm": 0.20851540565490723, + "learning_rate": 8.627884118394913e-06, + "loss": 0.4077, + "step": 1131 + }, + { + "epoch": 0.9533969679955081, + "grad_norm": 0.21636967360973358, + "learning_rate": 8.624508686998846e-06, + "loss": 0.3916, + "step": 1132 + }, + { + "epoch": 0.9542391914654689, + "grad_norm": 0.251487672328949, + "learning_rate": 8.621129770915248e-06, + "loss": 0.3586, + "step": 1133 + }, + { + "epoch": 0.9550814149354295, + "grad_norm": 0.33848926424980164, + "learning_rate": 8.617747373392697e-06, + "loss": 0.376, + "step": 1134 + }, + { + "epoch": 0.9559236384053902, + "grad_norm": 0.23164796829223633, + "learning_rate": 8.614361497683102e-06, + "loss": 0.3761, + "step": 1135 + }, + { + "epoch": 0.956765861875351, + "grad_norm": 0.2592570185661316, + "learning_rate": 8.61097214704173e-06, + "loss": 0.3959, + "step": 1136 + }, + { + "epoch": 0.9576080853453116, + "grad_norm": 0.22420254349708557, + "learning_rate": 8.607579324727175e-06, + "loss": 0.398, + "step": 1137 + }, + { + "epoch": 0.9584503088152723, + "grad_norm": 0.24111324548721313, + "learning_rate": 8.60418303400138e-06, + "loss": 0.3896, + "step": 1138 + }, + { + "epoch": 0.959292532285233, + "grad_norm": 0.2716215252876282, + "learning_rate": 8.600783278129617e-06, + "loss": 0.4066, + "step": 1139 + }, + { + "epoch": 0.9601347557551937, + "grad_norm": 0.25397905707359314, + "learning_rate": 8.597380060380493e-06, + "loss": 0.4086, + "step": 1140 + }, + { + "epoch": 0.9609769792251545, + "grad_norm": 0.23300889134407043, + "learning_rate": 8.59397338402594e-06, + "loss": 0.3715, + "step": 1141 + }, + { + "epoch": 0.9618192026951151, + "grad_norm": 0.2561149299144745, + "learning_rate": 8.590563252341216e-06, + "loss": 0.3704, + "step": 1142 + }, + { + "epoch": 0.9626614261650758, + "grad_norm": 0.24080070853233337, + "learning_rate": 8.5871496686049e-06, + "loss": 0.3583, + "step": 1143 + }, + { + "epoch": 0.9635036496350365, + "grad_norm": 0.28162842988967896, + "learning_rate": 8.583732636098895e-06, + "loss": 0.3998, + "step": 1144 + }, + { + "epoch": 0.9643458731049972, + "grad_norm": 0.24916066229343414, + "learning_rate": 8.580312158108413e-06, + "loss": 0.3781, + "step": 1145 + }, + { + "epoch": 0.9651880965749579, + "grad_norm": 0.24553634226322174, + "learning_rate": 8.576888237921983e-06, + "loss": 0.3674, + "step": 1146 + }, + { + "epoch": 0.9660303200449186, + "grad_norm": 0.24306859076023102, + "learning_rate": 8.57346087883144e-06, + "loss": 0.3797, + "step": 1147 + }, + { + "epoch": 0.9668725435148793, + "grad_norm": 0.24062088131904602, + "learning_rate": 8.570030084131933e-06, + "loss": 0.3868, + "step": 1148 + }, + { + "epoch": 0.9677147669848399, + "grad_norm": 0.2654595375061035, + "learning_rate": 8.566595857121902e-06, + "loss": 0.3875, + "step": 1149 + }, + { + "epoch": 0.9685569904548007, + "grad_norm": 0.24021129310131073, + "learning_rate": 8.563158201103096e-06, + "loss": 0.385, + "step": 1150 + }, + { + "epoch": 0.9693992139247614, + "grad_norm": 0.2687273919582367, + "learning_rate": 8.559717119380558e-06, + "loss": 0.3873, + "step": 1151 + }, + { + "epoch": 0.970241437394722, + "grad_norm": 0.28822076320648193, + "learning_rate": 8.556272615262623e-06, + "loss": 0.372, + "step": 1152 + }, + { + "epoch": 0.9710836608646828, + "grad_norm": 0.23251932859420776, + "learning_rate": 8.55282469206092e-06, + "loss": 0.371, + "step": 1153 + }, + { + "epoch": 0.9719258843346434, + "grad_norm": 0.231855109333992, + "learning_rate": 8.549373353090362e-06, + "loss": 0.374, + "step": 1154 + }, + { + "epoch": 0.9727681078046042, + "grad_norm": 0.2325243204832077, + "learning_rate": 8.545918601669147e-06, + "loss": 0.3623, + "step": 1155 + }, + { + "epoch": 0.9736103312745649, + "grad_norm": 0.24317185580730438, + "learning_rate": 8.542460441118756e-06, + "loss": 0.3985, + "step": 1156 + }, + { + "epoch": 0.9744525547445255, + "grad_norm": 0.2409314215183258, + "learning_rate": 8.538998874763942e-06, + "loss": 0.4072, + "step": 1157 + }, + { + "epoch": 0.9752947782144863, + "grad_norm": 0.22322307527065277, + "learning_rate": 8.535533905932739e-06, + "loss": 0.3968, + "step": 1158 + }, + { + "epoch": 0.976137001684447, + "grad_norm": 0.22121012210845947, + "learning_rate": 8.532065537956446e-06, + "loss": 0.3817, + "step": 1159 + }, + { + "epoch": 0.9769792251544076, + "grad_norm": 0.23832759261131287, + "learning_rate": 8.528593774169637e-06, + "loss": 0.4015, + "step": 1160 + }, + { + "epoch": 0.9778214486243684, + "grad_norm": 0.2314673662185669, + "learning_rate": 8.525118617910144e-06, + "loss": 0.3869, + "step": 1161 + }, + { + "epoch": 0.978663672094329, + "grad_norm": 0.23528188467025757, + "learning_rate": 8.521640072519066e-06, + "loss": 0.3786, + "step": 1162 + }, + { + "epoch": 0.9795058955642897, + "grad_norm": 0.2287149727344513, + "learning_rate": 8.518158141340755e-06, + "loss": 0.4026, + "step": 1163 + }, + { + "epoch": 0.9803481190342505, + "grad_norm": 0.2339918166399002, + "learning_rate": 8.514672827722824e-06, + "loss": 0.3845, + "step": 1164 + }, + { + "epoch": 0.9811903425042111, + "grad_norm": 0.24464187026023865, + "learning_rate": 8.511184135016134e-06, + "loss": 0.3893, + "step": 1165 + }, + { + "epoch": 0.9820325659741718, + "grad_norm": 0.2102963775396347, + "learning_rate": 8.507692066574795e-06, + "loss": 0.3766, + "step": 1166 + }, + { + "epoch": 0.9828747894441325, + "grad_norm": 0.20385922491550446, + "learning_rate": 8.504196625756166e-06, + "loss": 0.3832, + "step": 1167 + }, + { + "epoch": 0.9837170129140932, + "grad_norm": 0.2298925369977951, + "learning_rate": 8.500697815920843e-06, + "loss": 0.3788, + "step": 1168 + }, + { + "epoch": 0.9845592363840538, + "grad_norm": 0.22536714375019073, + "learning_rate": 8.497195640432664e-06, + "loss": 0.3747, + "step": 1169 + }, + { + "epoch": 0.9854014598540146, + "grad_norm": 0.240451842546463, + "learning_rate": 8.493690102658703e-06, + "loss": 0.4021, + "step": 1170 + }, + { + "epoch": 0.9862436833239753, + "grad_norm": 0.24420315027236938, + "learning_rate": 8.490181205969268e-06, + "loss": 0.4098, + "step": 1171 + }, + { + "epoch": 0.987085906793936, + "grad_norm": 0.23833104968070984, + "learning_rate": 8.486668953737891e-06, + "loss": 0.392, + "step": 1172 + }, + { + "epoch": 0.9879281302638967, + "grad_norm": 0.24562418460845947, + "learning_rate": 8.483153349341336e-06, + "loss": 0.4039, + "step": 1173 + }, + { + "epoch": 0.9887703537338574, + "grad_norm": 0.20323196053504944, + "learning_rate": 8.479634396159587e-06, + "loss": 0.3997, + "step": 1174 + }, + { + "epoch": 0.9896125772038181, + "grad_norm": 0.24542491137981415, + "learning_rate": 8.476112097575845e-06, + "loss": 0.3791, + "step": 1175 + }, + { + "epoch": 0.9904548006737788, + "grad_norm": 0.22892843186855316, + "learning_rate": 8.472586456976534e-06, + "loss": 0.3702, + "step": 1176 + }, + { + "epoch": 0.9912970241437394, + "grad_norm": 0.21064841747283936, + "learning_rate": 8.46905747775129e-06, + "loss": 0.3862, + "step": 1177 + }, + { + "epoch": 0.9921392476137002, + "grad_norm": 0.24221305549144745, + "learning_rate": 8.465525163292948e-06, + "loss": 0.3888, + "step": 1178 + }, + { + "epoch": 0.9929814710836609, + "grad_norm": 0.2127092331647873, + "learning_rate": 8.461989516997565e-06, + "loss": 0.4031, + "step": 1179 + }, + { + "epoch": 0.9938236945536215, + "grad_norm": 0.24036602675914764, + "learning_rate": 8.458450542264391e-06, + "loss": 0.3967, + "step": 1180 + }, + { + "epoch": 0.9946659180235823, + "grad_norm": 0.2219998985528946, + "learning_rate": 8.45490824249588e-06, + "loss": 0.3774, + "step": 1181 + }, + { + "epoch": 0.9955081414935429, + "grad_norm": 0.2132783681154251, + "learning_rate": 8.45136262109768e-06, + "loss": 0.3636, + "step": 1182 + }, + { + "epoch": 0.9963503649635036, + "grad_norm": 0.23521870374679565, + "learning_rate": 8.447813681478638e-06, + "loss": 0.3985, + "step": 1183 + }, + { + "epoch": 0.9971925884334644, + "grad_norm": 0.2551150918006897, + "learning_rate": 8.444261427050786e-06, + "loss": 0.4037, + "step": 1184 + }, + { + "epoch": 0.998034811903425, + "grad_norm": 0.21768134832382202, + "learning_rate": 8.440705861229344e-06, + "loss": 0.3734, + "step": 1185 + }, + { + "epoch": 0.9988770353733858, + "grad_norm": 0.22760789096355438, + "learning_rate": 8.437146987432717e-06, + "loss": 0.3958, + "step": 1186 + }, + { + "epoch": 0.9997192588433464, + "grad_norm": 0.22547492384910583, + "learning_rate": 8.43358480908249e-06, + "loss": 0.3673, + "step": 1187 + }, + { + "epoch": 1.000561482313307, + "grad_norm": 0.41303884983062744, + "learning_rate": 8.430019329603423e-06, + "loss": 0.5973, + "step": 1188 + }, + { + "epoch": 1.0014037057832679, + "grad_norm": 0.26283347606658936, + "learning_rate": 8.426450552423451e-06, + "loss": 0.3644, + "step": 1189 + }, + { + "epoch": 1.0022459292532284, + "grad_norm": 0.25336721539497375, + "learning_rate": 8.422878480973681e-06, + "loss": 0.3421, + "step": 1190 + }, + { + "epoch": 1.0030881527231892, + "grad_norm": 0.263960599899292, + "learning_rate": 8.41930311868839e-06, + "loss": 0.3677, + "step": 1191 + }, + { + "epoch": 1.00393037619315, + "grad_norm": 0.24778808653354645, + "learning_rate": 8.41572446900501e-06, + "loss": 0.3819, + "step": 1192 + }, + { + "epoch": 1.0047725996631107, + "grad_norm": 0.2429429590702057, + "learning_rate": 8.412142535364139e-06, + "loss": 0.3124, + "step": 1193 + }, + { + "epoch": 1.0056148231330713, + "grad_norm": 0.22689130902290344, + "learning_rate": 8.408557321209534e-06, + "loss": 0.365, + "step": 1194 + }, + { + "epoch": 1.006457046603032, + "grad_norm": 0.26087427139282227, + "learning_rate": 8.404968829988102e-06, + "loss": 0.3763, + "step": 1195 + }, + { + "epoch": 1.0072992700729928, + "grad_norm": 0.2285303771495819, + "learning_rate": 8.401377065149904e-06, + "loss": 0.3504, + "step": 1196 + }, + { + "epoch": 1.0081414935429533, + "grad_norm": 0.2542676031589508, + "learning_rate": 8.397782030148147e-06, + "loss": 0.3699, + "step": 1197 + }, + { + "epoch": 1.0089837170129141, + "grad_norm": 0.22694192826747894, + "learning_rate": 8.39418372843918e-06, + "loss": 0.3319, + "step": 1198 + }, + { + "epoch": 1.0098259404828749, + "grad_norm": 0.23784025013446808, + "learning_rate": 8.390582163482497e-06, + "loss": 0.3467, + "step": 1199 + }, + { + "epoch": 1.0106681639528354, + "grad_norm": 0.24005892872810364, + "learning_rate": 8.386977338740724e-06, + "loss": 0.351, + "step": 1200 + }, + { + "epoch": 1.0115103874227962, + "grad_norm": 0.2603375315666199, + "learning_rate": 8.383369257679625e-06, + "loss": 0.3975, + "step": 1201 + }, + { + "epoch": 1.012352610892757, + "grad_norm": 0.22591696679592133, + "learning_rate": 8.379757923768094e-06, + "loss": 0.3371, + "step": 1202 + }, + { + "epoch": 1.0131948343627175, + "grad_norm": 0.23937468230724335, + "learning_rate": 8.376143340478153e-06, + "loss": 0.384, + "step": 1203 + }, + { + "epoch": 1.0140370578326783, + "grad_norm": 0.25090497732162476, + "learning_rate": 8.372525511284945e-06, + "loss": 0.3625, + "step": 1204 + }, + { + "epoch": 1.014879281302639, + "grad_norm": 0.2606372535228729, + "learning_rate": 8.368904439666739e-06, + "loss": 0.3607, + "step": 1205 + }, + { + "epoch": 1.0157215047725996, + "grad_norm": 0.20606310665607452, + "learning_rate": 8.365280129104912e-06, + "loss": 0.3482, + "step": 1206 + }, + { + "epoch": 1.0165637282425604, + "grad_norm": 0.2573789060115814, + "learning_rate": 8.361652583083968e-06, + "loss": 0.3303, + "step": 1207 + }, + { + "epoch": 1.0174059517125211, + "grad_norm": 0.23522531986236572, + "learning_rate": 8.358021805091509e-06, + "loss": 0.3569, + "step": 1208 + }, + { + "epoch": 1.0182481751824817, + "grad_norm": 0.22316160798072815, + "learning_rate": 8.354387798618254e-06, + "loss": 0.3443, + "step": 1209 + }, + { + "epoch": 1.0190903986524424, + "grad_norm": 0.2838685214519501, + "learning_rate": 8.35075056715802e-06, + "loss": 0.3429, + "step": 1210 + }, + { + "epoch": 1.0199326221224032, + "grad_norm": 0.22736883163452148, + "learning_rate": 8.347110114207727e-06, + "loss": 0.3963, + "step": 1211 + }, + { + "epoch": 1.0207748455923638, + "grad_norm": 0.2243269979953766, + "learning_rate": 8.34346644326739e-06, + "loss": 0.3273, + "step": 1212 + }, + { + "epoch": 1.0216170690623245, + "grad_norm": 0.23876528441905975, + "learning_rate": 8.339819557840124e-06, + "loss": 0.392, + "step": 1213 + }, + { + "epoch": 1.0224592925322853, + "grad_norm": 0.2140982449054718, + "learning_rate": 8.336169461432125e-06, + "loss": 0.3563, + "step": 1214 + }, + { + "epoch": 1.0233015160022458, + "grad_norm": 0.24384154379367828, + "learning_rate": 8.332516157552684e-06, + "loss": 0.3981, + "step": 1215 + }, + { + "epoch": 1.0241437394722066, + "grad_norm": 0.1963784545660019, + "learning_rate": 8.328859649714171e-06, + "loss": 0.3368, + "step": 1216 + }, + { + "epoch": 1.0249859629421674, + "grad_norm": 0.23279255628585815, + "learning_rate": 8.32519994143204e-06, + "loss": 0.3863, + "step": 1217 + }, + { + "epoch": 1.025828186412128, + "grad_norm": 0.19129885733127594, + "learning_rate": 8.321537036224822e-06, + "loss": 0.3284, + "step": 1218 + }, + { + "epoch": 1.0266704098820887, + "grad_norm": 0.22419750690460205, + "learning_rate": 8.317870937614115e-06, + "loss": 0.3691, + "step": 1219 + }, + { + "epoch": 1.0275126333520495, + "grad_norm": 0.22040456533432007, + "learning_rate": 8.314201649124595e-06, + "loss": 0.3606, + "step": 1220 + }, + { + "epoch": 1.02835485682201, + "grad_norm": 0.2307283580303192, + "learning_rate": 8.310529174284004e-06, + "loss": 0.3419, + "step": 1221 + }, + { + "epoch": 1.0291970802919708, + "grad_norm": 0.23188059031963348, + "learning_rate": 8.30685351662314e-06, + "loss": 0.3544, + "step": 1222 + }, + { + "epoch": 1.0300393037619315, + "grad_norm": 0.2442709058523178, + "learning_rate": 8.30317467967587e-06, + "loss": 0.3719, + "step": 1223 + }, + { + "epoch": 1.0308815272318923, + "grad_norm": 0.26356396079063416, + "learning_rate": 8.299492666979114e-06, + "loss": 0.4023, + "step": 1224 + }, + { + "epoch": 1.0317237507018528, + "grad_norm": 0.22132344543933868, + "learning_rate": 8.295807482072842e-06, + "loss": 0.3593, + "step": 1225 + }, + { + "epoch": 1.0325659741718136, + "grad_norm": 0.23799309134483337, + "learning_rate": 8.292119128500082e-06, + "loss": 0.371, + "step": 1226 + }, + { + "epoch": 1.0334081976417744, + "grad_norm": 0.23775412142276764, + "learning_rate": 8.288427609806899e-06, + "loss": 0.3607, + "step": 1227 + }, + { + "epoch": 1.034250421111735, + "grad_norm": 0.2484200894832611, + "learning_rate": 8.28473292954241e-06, + "loss": 0.3769, + "step": 1228 + }, + { + "epoch": 1.0350926445816957, + "grad_norm": 0.2280905544757843, + "learning_rate": 8.281035091258762e-06, + "loss": 0.3716, + "step": 1229 + }, + { + "epoch": 1.0359348680516565, + "grad_norm": 0.24847450852394104, + "learning_rate": 8.277334098511147e-06, + "loss": 0.3766, + "step": 1230 + }, + { + "epoch": 1.036777091521617, + "grad_norm": 0.2286071479320526, + "learning_rate": 8.273629954857784e-06, + "loss": 0.3481, + "step": 1231 + }, + { + "epoch": 1.0376193149915778, + "grad_norm": 0.22194388508796692, + "learning_rate": 8.269922663859926e-06, + "loss": 0.3461, + "step": 1232 + }, + { + "epoch": 1.0384615384615385, + "grad_norm": 0.2565535604953766, + "learning_rate": 8.266212229081846e-06, + "loss": 0.3983, + "step": 1233 + }, + { + "epoch": 1.039303761931499, + "grad_norm": 0.23098818957805634, + "learning_rate": 8.262498654090846e-06, + "loss": 0.3492, + "step": 1234 + }, + { + "epoch": 1.0401459854014599, + "grad_norm": 0.25537002086639404, + "learning_rate": 8.258781942457244e-06, + "loss": 0.3692, + "step": 1235 + }, + { + "epoch": 1.0409882088714206, + "grad_norm": 0.2224876582622528, + "learning_rate": 8.255062097754371e-06, + "loss": 0.339, + "step": 1236 + }, + { + "epoch": 1.0418304323413812, + "grad_norm": 0.2605500817298889, + "learning_rate": 8.251339123558573e-06, + "loss": 0.3695, + "step": 1237 + }, + { + "epoch": 1.042672655811342, + "grad_norm": 0.22719444334506989, + "learning_rate": 8.247613023449209e-06, + "loss": 0.344, + "step": 1238 + }, + { + "epoch": 1.0435148792813027, + "grad_norm": 0.2446812242269516, + "learning_rate": 8.243883801008632e-06, + "loss": 0.407, + "step": 1239 + }, + { + "epoch": 1.0443571027512633, + "grad_norm": 0.21748603880405426, + "learning_rate": 8.240151459822207e-06, + "loss": 0.3408, + "step": 1240 + }, + { + "epoch": 1.045199326221224, + "grad_norm": 0.2330400049686432, + "learning_rate": 8.236416003478295e-06, + "loss": 0.3796, + "step": 1241 + }, + { + "epoch": 1.0460415496911848, + "grad_norm": 0.2480519860982895, + "learning_rate": 8.232677435568252e-06, + "loss": 0.3703, + "step": 1242 + }, + { + "epoch": 1.0468837731611453, + "grad_norm": 0.22414913773536682, + "learning_rate": 8.228935759686424e-06, + "loss": 0.3212, + "step": 1243 + }, + { + "epoch": 1.047725996631106, + "grad_norm": 0.24647359549999237, + "learning_rate": 8.225190979430145e-06, + "loss": 0.3837, + "step": 1244 + }, + { + "epoch": 1.0485682201010669, + "grad_norm": 0.24247120320796967, + "learning_rate": 8.221443098399733e-06, + "loss": 0.3617, + "step": 1245 + }, + { + "epoch": 1.0494104435710274, + "grad_norm": 0.24783526360988617, + "learning_rate": 8.217692120198492e-06, + "loss": 0.3767, + "step": 1246 + }, + { + "epoch": 1.0502526670409882, + "grad_norm": 0.23246386647224426, + "learning_rate": 8.213938048432697e-06, + "loss": 0.3665, + "step": 1247 + }, + { + "epoch": 1.051094890510949, + "grad_norm": 0.2402683049440384, + "learning_rate": 8.210180886711603e-06, + "loss": 0.379, + "step": 1248 + }, + { + "epoch": 1.0519371139809095, + "grad_norm": 0.2358441948890686, + "learning_rate": 8.206420638647433e-06, + "loss": 0.363, + "step": 1249 + }, + { + "epoch": 1.0527793374508703, + "grad_norm": 0.21942484378814697, + "learning_rate": 8.202657307855376e-06, + "loss": 0.3326, + "step": 1250 + }, + { + "epoch": 1.053621560920831, + "grad_norm": 0.22994016110897064, + "learning_rate": 8.198890897953586e-06, + "loss": 0.3677, + "step": 1251 + }, + { + "epoch": 1.0544637843907916, + "grad_norm": 0.23945726454257965, + "learning_rate": 8.19512141256318e-06, + "loss": 0.3956, + "step": 1252 + }, + { + "epoch": 1.0553060078607523, + "grad_norm": 0.20319385826587677, + "learning_rate": 8.191348855308229e-06, + "loss": 0.3523, + "step": 1253 + }, + { + "epoch": 1.0561482313307131, + "grad_norm": 0.22652249038219452, + "learning_rate": 8.187573229815757e-06, + "loss": 0.3419, + "step": 1254 + }, + { + "epoch": 1.0569904548006739, + "grad_norm": 0.2570735216140747, + "learning_rate": 8.18379453971574e-06, + "loss": 0.4001, + "step": 1255 + }, + { + "epoch": 1.0578326782706344, + "grad_norm": 0.22578343749046326, + "learning_rate": 8.180012788641097e-06, + "loss": 0.3478, + "step": 1256 + }, + { + "epoch": 1.0586749017405952, + "grad_norm": 0.2511613667011261, + "learning_rate": 8.176227980227693e-06, + "loss": 0.3678, + "step": 1257 + }, + { + "epoch": 1.059517125210556, + "grad_norm": 0.25504982471466064, + "learning_rate": 8.172440118114332e-06, + "loss": 0.3973, + "step": 1258 + }, + { + "epoch": 1.0603593486805165, + "grad_norm": 0.20504139363765717, + "learning_rate": 8.168649205942753e-06, + "loss": 0.3428, + "step": 1259 + }, + { + "epoch": 1.0612015721504773, + "grad_norm": 0.22897997498512268, + "learning_rate": 8.164855247357628e-06, + "loss": 0.3671, + "step": 1260 + }, + { + "epoch": 1.062043795620438, + "grad_norm": 0.2271280735731125, + "learning_rate": 8.161058246006558e-06, + "loss": 0.359, + "step": 1261 + }, + { + "epoch": 1.0628860190903986, + "grad_norm": 0.23936693370342255, + "learning_rate": 8.157258205540069e-06, + "loss": 0.3493, + "step": 1262 + }, + { + "epoch": 1.0637282425603594, + "grad_norm": 0.2141476571559906, + "learning_rate": 8.153455129611605e-06, + "loss": 0.3652, + "step": 1263 + }, + { + "epoch": 1.0645704660303201, + "grad_norm": 0.23102352023124695, + "learning_rate": 8.14964902187754e-06, + "loss": 0.3678, + "step": 1264 + }, + { + "epoch": 1.0654126895002807, + "grad_norm": 0.237512469291687, + "learning_rate": 8.145839885997146e-06, + "loss": 0.3335, + "step": 1265 + }, + { + "epoch": 1.0662549129702414, + "grad_norm": 0.2297290712594986, + "learning_rate": 8.142027725632622e-06, + "loss": 0.3592, + "step": 1266 + }, + { + "epoch": 1.0670971364402022, + "grad_norm": 0.25016435980796814, + "learning_rate": 8.138212544449067e-06, + "loss": 0.3785, + "step": 1267 + }, + { + "epoch": 1.0679393599101628, + "grad_norm": 0.22096030414104462, + "learning_rate": 8.134394346114486e-06, + "loss": 0.3609, + "step": 1268 + }, + { + "epoch": 1.0687815833801235, + "grad_norm": 0.26206687092781067, + "learning_rate": 8.130573134299782e-06, + "loss": 0.3661, + "step": 1269 + }, + { + "epoch": 1.0696238068500843, + "grad_norm": 0.24707172811031342, + "learning_rate": 8.126748912678757e-06, + "loss": 0.3965, + "step": 1270 + }, + { + "epoch": 1.0704660303200448, + "grad_norm": 0.21675467491149902, + "learning_rate": 8.122921684928111e-06, + "loss": 0.3157, + "step": 1271 + }, + { + "epoch": 1.0713082537900056, + "grad_norm": 0.22279198467731476, + "learning_rate": 8.119091454727427e-06, + "loss": 0.3631, + "step": 1272 + }, + { + "epoch": 1.0721504772599664, + "grad_norm": 0.2054135948419571, + "learning_rate": 8.11525822575918e-06, + "loss": 0.3735, + "step": 1273 + }, + { + "epoch": 1.072992700729927, + "grad_norm": 0.226912721991539, + "learning_rate": 8.111422001708725e-06, + "loss": 0.3479, + "step": 1274 + }, + { + "epoch": 1.0738349241998877, + "grad_norm": 0.23292496800422668, + "learning_rate": 8.107582786264299e-06, + "loss": 0.3569, + "step": 1275 + }, + { + "epoch": 1.0746771476698485, + "grad_norm": 0.2658878564834595, + "learning_rate": 8.10374058311701e-06, + "loss": 0.3859, + "step": 1276 + }, + { + "epoch": 1.075519371139809, + "grad_norm": 0.25955137610435486, + "learning_rate": 8.099895395960847e-06, + "loss": 0.3635, + "step": 1277 + }, + { + "epoch": 1.0763615946097698, + "grad_norm": 0.23224274814128876, + "learning_rate": 8.09604722849266e-06, + "loss": 0.36, + "step": 1278 + }, + { + "epoch": 1.0772038180797305, + "grad_norm": 0.26262158155441284, + "learning_rate": 8.092196084412167e-06, + "loss": 0.3731, + "step": 1279 + }, + { + "epoch": 1.078046041549691, + "grad_norm": 0.23350927233695984, + "learning_rate": 8.08834196742195e-06, + "loss": 0.3493, + "step": 1280 + }, + { + "epoch": 1.0788882650196518, + "grad_norm": 0.22659815847873688, + "learning_rate": 8.084484881227449e-06, + "loss": 0.3472, + "step": 1281 + }, + { + "epoch": 1.0797304884896126, + "grad_norm": 0.27644115686416626, + "learning_rate": 8.080624829536949e-06, + "loss": 0.3706, + "step": 1282 + }, + { + "epoch": 1.0805727119595732, + "grad_norm": 0.20871034264564514, + "learning_rate": 8.076761816061603e-06, + "loss": 0.3301, + "step": 1283 + }, + { + "epoch": 1.081414935429534, + "grad_norm": 0.2264663428068161, + "learning_rate": 8.072895844515398e-06, + "loss": 0.3711, + "step": 1284 + }, + { + "epoch": 1.0822571588994947, + "grad_norm": 0.22878475487232208, + "learning_rate": 8.069026918615173e-06, + "loss": 0.3576, + "step": 1285 + }, + { + "epoch": 1.0830993823694555, + "grad_norm": 0.2420300841331482, + "learning_rate": 8.065155042080599e-06, + "loss": 0.3634, + "step": 1286 + }, + { + "epoch": 1.083941605839416, + "grad_norm": 0.2258838415145874, + "learning_rate": 8.061280218634192e-06, + "loss": 0.3494, + "step": 1287 + }, + { + "epoch": 1.0847838293093768, + "grad_norm": 0.2405247539281845, + "learning_rate": 8.057402452001298e-06, + "loss": 0.3793, + "step": 1288 + }, + { + "epoch": 1.0856260527793375, + "grad_norm": 0.2510263919830322, + "learning_rate": 8.05352174591009e-06, + "loss": 0.3694, + "step": 1289 + }, + { + "epoch": 1.086468276249298, + "grad_norm": 0.23418664932250977, + "learning_rate": 8.049638104091575e-06, + "loss": 0.3633, + "step": 1290 + }, + { + "epoch": 1.0873104997192589, + "grad_norm": 0.27216869592666626, + "learning_rate": 8.04575153027957e-06, + "loss": 0.3878, + "step": 1291 + }, + { + "epoch": 1.0881527231892196, + "grad_norm": 0.22842253744602203, + "learning_rate": 8.041862028210725e-06, + "loss": 0.3384, + "step": 1292 + }, + { + "epoch": 1.0889949466591802, + "grad_norm": 0.24223503470420837, + "learning_rate": 8.037969601624495e-06, + "loss": 0.3674, + "step": 1293 + }, + { + "epoch": 1.089837170129141, + "grad_norm": 0.24100640416145325, + "learning_rate": 8.034074254263152e-06, + "loss": 0.3353, + "step": 1294 + }, + { + "epoch": 1.0906793935991017, + "grad_norm": 0.2577701210975647, + "learning_rate": 8.030175989871769e-06, + "loss": 0.3725, + "step": 1295 + }, + { + "epoch": 1.0915216170690623, + "grad_norm": 0.21651145815849304, + "learning_rate": 8.026274812198235e-06, + "loss": 0.3519, + "step": 1296 + }, + { + "epoch": 1.092363840539023, + "grad_norm": 0.22987744212150574, + "learning_rate": 8.022370724993229e-06, + "loss": 0.3541, + "step": 1297 + }, + { + "epoch": 1.0932060640089838, + "grad_norm": 0.221238911151886, + "learning_rate": 8.018463732010235e-06, + "loss": 0.4107, + "step": 1298 + }, + { + "epoch": 1.0940482874789443, + "grad_norm": 0.20016083121299744, + "learning_rate": 8.014553837005527e-06, + "loss": 0.305, + "step": 1299 + }, + { + "epoch": 1.094890510948905, + "grad_norm": 0.21442735195159912, + "learning_rate": 8.010641043738167e-06, + "loss": 0.371, + "step": 1300 + }, + { + "epoch": 1.0957327344188659, + "grad_norm": 0.24419724941253662, + "learning_rate": 8.006725355970008e-06, + "loss": 0.3874, + "step": 1301 + }, + { + "epoch": 1.0965749578888264, + "grad_norm": 0.2159401923418045, + "learning_rate": 8.002806777465685e-06, + "loss": 0.3563, + "step": 1302 + }, + { + "epoch": 1.0974171813587872, + "grad_norm": 0.20667701959609985, + "learning_rate": 7.99888531199261e-06, + "loss": 0.3299, + "step": 1303 + }, + { + "epoch": 1.098259404828748, + "grad_norm": 0.23032718896865845, + "learning_rate": 7.99496096332097e-06, + "loss": 0.3662, + "step": 1304 + }, + { + "epoch": 1.0991016282987085, + "grad_norm": 0.21561412513256073, + "learning_rate": 7.99103373522373e-06, + "loss": 0.3741, + "step": 1305 + }, + { + "epoch": 1.0999438517686693, + "grad_norm": 0.20311473309993744, + "learning_rate": 7.987103631476615e-06, + "loss": 0.3382, + "step": 1306 + }, + { + "epoch": 1.10078607523863, + "grad_norm": 0.2105664610862732, + "learning_rate": 7.98317065585812e-06, + "loss": 0.3733, + "step": 1307 + }, + { + "epoch": 1.1016282987085906, + "grad_norm": 0.20192131400108337, + "learning_rate": 7.9792348121495e-06, + "loss": 0.345, + "step": 1308 + }, + { + "epoch": 1.1024705221785513, + "grad_norm": 0.25903552770614624, + "learning_rate": 7.975296104134768e-06, + "loss": 0.4128, + "step": 1309 + }, + { + "epoch": 1.1033127456485121, + "grad_norm": 0.20803354680538177, + "learning_rate": 7.97135453560069e-06, + "loss": 0.3646, + "step": 1310 + }, + { + "epoch": 1.1041549691184729, + "grad_norm": 0.2241167426109314, + "learning_rate": 7.967410110336782e-06, + "loss": 0.3339, + "step": 1311 + }, + { + "epoch": 1.1049971925884334, + "grad_norm": 0.2666442096233368, + "learning_rate": 7.963462832135307e-06, + "loss": 0.4023, + "step": 1312 + }, + { + "epoch": 1.1058394160583942, + "grad_norm": 0.20622016489505768, + "learning_rate": 7.959512704791269e-06, + "loss": 0.3379, + "step": 1313 + }, + { + "epoch": 1.1066816395283547, + "grad_norm": 0.26777777075767517, + "learning_rate": 7.955559732102414e-06, + "loss": 0.405, + "step": 1314 + }, + { + "epoch": 1.1075238629983155, + "grad_norm": 0.2196749597787857, + "learning_rate": 7.951603917869223e-06, + "loss": 0.3563, + "step": 1315 + }, + { + "epoch": 1.1083660864682763, + "grad_norm": 0.22223137319087982, + "learning_rate": 7.94764526589491e-06, + "loss": 0.3455, + "step": 1316 + }, + { + "epoch": 1.109208309938237, + "grad_norm": 0.2447323054075241, + "learning_rate": 7.943683779985412e-06, + "loss": 0.3364, + "step": 1317 + }, + { + "epoch": 1.1100505334081976, + "grad_norm": 0.22005987167358398, + "learning_rate": 7.939719463949398e-06, + "loss": 0.341, + "step": 1318 + }, + { + "epoch": 1.1108927568781584, + "grad_norm": 0.2396015226840973, + "learning_rate": 7.93575232159825e-06, + "loss": 0.3537, + "step": 1319 + }, + { + "epoch": 1.1117349803481191, + "grad_norm": 0.2083560973405838, + "learning_rate": 7.931782356746076e-06, + "loss": 0.3618, + "step": 1320 + }, + { + "epoch": 1.1125772038180797, + "grad_norm": 0.24214932322502136, + "learning_rate": 7.927809573209691e-06, + "loss": 0.3566, + "step": 1321 + }, + { + "epoch": 1.1134194272880404, + "grad_norm": 0.22593237459659576, + "learning_rate": 7.923833974808622e-06, + "loss": 0.3349, + "step": 1322 + }, + { + "epoch": 1.1142616507580012, + "grad_norm": 0.22273516654968262, + "learning_rate": 7.919855565365102e-06, + "loss": 0.39, + "step": 1323 + }, + { + "epoch": 1.1151038742279618, + "grad_norm": 0.2301865667104721, + "learning_rate": 7.91587434870407e-06, + "loss": 0.3674, + "step": 1324 + }, + { + "epoch": 1.1159460976979225, + "grad_norm": 0.251726359128952, + "learning_rate": 7.911890328653156e-06, + "loss": 0.3483, + "step": 1325 + }, + { + "epoch": 1.1167883211678833, + "grad_norm": 0.21106411516666412, + "learning_rate": 7.907903509042696e-06, + "loss": 0.3813, + "step": 1326 + }, + { + "epoch": 1.1176305446378438, + "grad_norm": 0.20069539546966553, + "learning_rate": 7.903913893705706e-06, + "loss": 0.3464, + "step": 1327 + }, + { + "epoch": 1.1184727681078046, + "grad_norm": 0.2140786051750183, + "learning_rate": 7.899921486477899e-06, + "loss": 0.3796, + "step": 1328 + }, + { + "epoch": 1.1193149915777654, + "grad_norm": 0.21972276270389557, + "learning_rate": 7.895926291197667e-06, + "loss": 0.3763, + "step": 1329 + }, + { + "epoch": 1.120157215047726, + "grad_norm": 0.19542910158634186, + "learning_rate": 7.891928311706088e-06, + "loss": 0.3485, + "step": 1330 + }, + { + "epoch": 1.1209994385176867, + "grad_norm": 0.2048928141593933, + "learning_rate": 7.887927551846908e-06, + "loss": 0.3583, + "step": 1331 + }, + { + "epoch": 1.1218416619876475, + "grad_norm": 0.203351229429245, + "learning_rate": 7.883924015466554e-06, + "loss": 0.3453, + "step": 1332 + }, + { + "epoch": 1.122683885457608, + "grad_norm": 0.23957213759422302, + "learning_rate": 7.87991770641412e-06, + "loss": 0.3803, + "step": 1333 + }, + { + "epoch": 1.1235261089275688, + "grad_norm": 0.2266377955675125, + "learning_rate": 7.875908628541363e-06, + "loss": 0.3803, + "step": 1334 + }, + { + "epoch": 1.1243683323975295, + "grad_norm": 0.21484608948230743, + "learning_rate": 7.871896785702707e-06, + "loss": 0.3659, + "step": 1335 + }, + { + "epoch": 1.12521055586749, + "grad_norm": 0.21197688579559326, + "learning_rate": 7.86788218175523e-06, + "loss": 0.3423, + "step": 1336 + }, + { + "epoch": 1.1260527793374508, + "grad_norm": 0.2307911217212677, + "learning_rate": 7.863864820558669e-06, + "loss": 0.3362, + "step": 1337 + }, + { + "epoch": 1.1268950028074116, + "grad_norm": 0.22809703648090363, + "learning_rate": 7.859844705975405e-06, + "loss": 0.4024, + "step": 1338 + }, + { + "epoch": 1.1277372262773722, + "grad_norm": 0.2245447188615799, + "learning_rate": 7.855821841870472e-06, + "loss": 0.355, + "step": 1339 + }, + { + "epoch": 1.128579449747333, + "grad_norm": 0.21683157980442047, + "learning_rate": 7.851796232111546e-06, + "loss": 0.3697, + "step": 1340 + }, + { + "epoch": 1.1294216732172937, + "grad_norm": 0.22042052447795868, + "learning_rate": 7.847767880568944e-06, + "loss": 0.3597, + "step": 1341 + }, + { + "epoch": 1.1302638966872545, + "grad_norm": 0.2326141595840454, + "learning_rate": 7.843736791115614e-06, + "loss": 0.3603, + "step": 1342 + }, + { + "epoch": 1.131106120157215, + "grad_norm": 0.2275954782962799, + "learning_rate": 7.839702967627145e-06, + "loss": 0.3732, + "step": 1343 + }, + { + "epoch": 1.1319483436271758, + "grad_norm": 0.21167173981666565, + "learning_rate": 7.835666413981744e-06, + "loss": 0.3263, + "step": 1344 + }, + { + "epoch": 1.1327905670971363, + "grad_norm": 0.22949613630771637, + "learning_rate": 7.831627134060249e-06, + "loss": 0.3584, + "step": 1345 + }, + { + "epoch": 1.133632790567097, + "grad_norm": 0.24151138961315155, + "learning_rate": 7.827585131746122e-06, + "loss": 0.3904, + "step": 1346 + }, + { + "epoch": 1.1344750140370579, + "grad_norm": 0.23588694632053375, + "learning_rate": 7.823540410925434e-06, + "loss": 0.3839, + "step": 1347 + }, + { + "epoch": 1.1353172375070186, + "grad_norm": 0.2144988477230072, + "learning_rate": 7.81949297548688e-06, + "loss": 0.3653, + "step": 1348 + }, + { + "epoch": 1.1361594609769792, + "grad_norm": 0.23595893383026123, + "learning_rate": 7.815442829321754e-06, + "loss": 0.3538, + "step": 1349 + }, + { + "epoch": 1.13700168444694, + "grad_norm": 0.26966625452041626, + "learning_rate": 7.811389976323963e-06, + "loss": 0.4052, + "step": 1350 + }, + { + "epoch": 1.1378439079169007, + "grad_norm": 0.21856364607810974, + "learning_rate": 7.807334420390014e-06, + "loss": 0.3603, + "step": 1351 + }, + { + "epoch": 1.1386861313868613, + "grad_norm": 0.22447790205478668, + "learning_rate": 7.803276165419015e-06, + "loss": 0.3687, + "step": 1352 + }, + { + "epoch": 1.139528354856822, + "grad_norm": 0.2570057809352875, + "learning_rate": 7.799215215312667e-06, + "loss": 0.387, + "step": 1353 + }, + { + "epoch": 1.1403705783267828, + "grad_norm": 0.22275447845458984, + "learning_rate": 7.795151573975262e-06, + "loss": 0.345, + "step": 1354 + }, + { + "epoch": 1.1412128017967433, + "grad_norm": 0.23103633522987366, + "learning_rate": 7.79108524531368e-06, + "loss": 0.3427, + "step": 1355 + }, + { + "epoch": 1.142055025266704, + "grad_norm": 0.24295048415660858, + "learning_rate": 7.787016233237387e-06, + "loss": 0.3451, + "step": 1356 + }, + { + "epoch": 1.1428972487366649, + "grad_norm": 0.2352520227432251, + "learning_rate": 7.782944541658423e-06, + "loss": 0.3529, + "step": 1357 + }, + { + "epoch": 1.1437394722066254, + "grad_norm": 0.24101249873638153, + "learning_rate": 7.778870174491408e-06, + "loss": 0.362, + "step": 1358 + }, + { + "epoch": 1.1445816956765862, + "grad_norm": 0.22046388685703278, + "learning_rate": 7.774793135653537e-06, + "loss": 0.353, + "step": 1359 + }, + { + "epoch": 1.145423919146547, + "grad_norm": 0.220865398645401, + "learning_rate": 7.770713429064567e-06, + "loss": 0.3512, + "step": 1360 + }, + { + "epoch": 1.1462661426165075, + "grad_norm": 0.24877040088176727, + "learning_rate": 7.766631058646826e-06, + "loss": 0.3671, + "step": 1361 + }, + { + "epoch": 1.1471083660864683, + "grad_norm": 0.2663626968860626, + "learning_rate": 7.7625460283252e-06, + "loss": 0.3741, + "step": 1362 + }, + { + "epoch": 1.147950589556429, + "grad_norm": 0.21680448949337006, + "learning_rate": 7.75845834202713e-06, + "loss": 0.3703, + "step": 1363 + }, + { + "epoch": 1.1487928130263896, + "grad_norm": 0.1968405395746231, + "learning_rate": 7.754368003682617e-06, + "loss": 0.316, + "step": 1364 + }, + { + "epoch": 1.1496350364963503, + "grad_norm": 0.23480704426765442, + "learning_rate": 7.750275017224208e-06, + "loss": 0.3912, + "step": 1365 + }, + { + "epoch": 1.1504772599663111, + "grad_norm": 0.23217934370040894, + "learning_rate": 7.746179386586994e-06, + "loss": 0.3454, + "step": 1366 + }, + { + "epoch": 1.1513194834362717, + "grad_norm": 0.2343100905418396, + "learning_rate": 7.74208111570861e-06, + "loss": 0.3808, + "step": 1367 + }, + { + "epoch": 1.1521617069062324, + "grad_norm": 0.19447638094425201, + "learning_rate": 7.737980208529232e-06, + "loss": 0.3704, + "step": 1368 + }, + { + "epoch": 1.1530039303761932, + "grad_norm": 0.22011230885982513, + "learning_rate": 7.733876668991565e-06, + "loss": 0.3608, + "step": 1369 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 0.2208113670349121, + "learning_rate": 7.72977050104085e-06, + "loss": 0.3579, + "step": 1370 + }, + { + "epoch": 1.1546883773161145, + "grad_norm": 0.2177201509475708, + "learning_rate": 7.725661708624855e-06, + "loss": 0.3611, + "step": 1371 + }, + { + "epoch": 1.1555306007860753, + "grad_norm": 0.21167387068271637, + "learning_rate": 7.721550295693865e-06, + "loss": 0.3815, + "step": 1372 + }, + { + "epoch": 1.156372824256036, + "grad_norm": 0.22142820060253143, + "learning_rate": 7.71743626620069e-06, + "loss": 0.3751, + "step": 1373 + }, + { + "epoch": 1.1572150477259966, + "grad_norm": 0.25120094418525696, + "learning_rate": 7.713319624100657e-06, + "loss": 0.3277, + "step": 1374 + }, + { + "epoch": 1.1580572711959574, + "grad_norm": 0.23623235523700714, + "learning_rate": 7.7092003733516e-06, + "loss": 0.3389, + "step": 1375 + }, + { + "epoch": 1.158899494665918, + "grad_norm": 0.2505205571651459, + "learning_rate": 7.705078517913862e-06, + "loss": 0.3903, + "step": 1376 + }, + { + "epoch": 1.1597417181358787, + "grad_norm": 0.2453787922859192, + "learning_rate": 7.700954061750295e-06, + "loss": 0.3693, + "step": 1377 + }, + { + "epoch": 1.1605839416058394, + "grad_norm": 0.23586124181747437, + "learning_rate": 7.696827008826242e-06, + "loss": 0.3852, + "step": 1378 + }, + { + "epoch": 1.1614261650758002, + "grad_norm": 0.25180327892303467, + "learning_rate": 7.692697363109553e-06, + "loss": 0.3902, + "step": 1379 + }, + { + "epoch": 1.1622683885457608, + "grad_norm": 0.22355681657791138, + "learning_rate": 7.688565128570564e-06, + "loss": 0.3558, + "step": 1380 + }, + { + "epoch": 1.1631106120157215, + "grad_norm": 0.2151123583316803, + "learning_rate": 7.684430309182106e-06, + "loss": 0.3371, + "step": 1381 + }, + { + "epoch": 1.1639528354856823, + "grad_norm": 0.22661295533180237, + "learning_rate": 7.680292908919485e-06, + "loss": 0.3904, + "step": 1382 + }, + { + "epoch": 1.1647950589556428, + "grad_norm": 0.2297997623682022, + "learning_rate": 7.676152931760496e-06, + "loss": 0.3482, + "step": 1383 + }, + { + "epoch": 1.1656372824256036, + "grad_norm": 0.238944873213768, + "learning_rate": 7.672010381685416e-06, + "loss": 0.3758, + "step": 1384 + }, + { + "epoch": 1.1664795058955644, + "grad_norm": 0.2109581083059311, + "learning_rate": 7.667865262676981e-06, + "loss": 0.3397, + "step": 1385 + }, + { + "epoch": 1.167321729365525, + "grad_norm": 0.23245356976985931, + "learning_rate": 7.663717578720412e-06, + "loss": 0.377, + "step": 1386 + }, + { + "epoch": 1.1681639528354857, + "grad_norm": 0.2091844230890274, + "learning_rate": 7.659567333803386e-06, + "loss": 0.3256, + "step": 1387 + }, + { + "epoch": 1.1690061763054465, + "grad_norm": 0.2057802677154541, + "learning_rate": 7.655414531916048e-06, + "loss": 0.3674, + "step": 1388 + }, + { + "epoch": 1.169848399775407, + "grad_norm": 0.2157420814037323, + "learning_rate": 7.651259177050996e-06, + "loss": 0.3274, + "step": 1389 + }, + { + "epoch": 1.1706906232453678, + "grad_norm": 0.21391423046588898, + "learning_rate": 7.647101273203289e-06, + "loss": 0.3504, + "step": 1390 + }, + { + "epoch": 1.1715328467153285, + "grad_norm": 0.2303875833749771, + "learning_rate": 7.642940824370429e-06, + "loss": 0.3423, + "step": 1391 + }, + { + "epoch": 1.172375070185289, + "grad_norm": 0.26220864057540894, + "learning_rate": 7.638777834552372e-06, + "loss": 0.3806, + "step": 1392 + }, + { + "epoch": 1.1732172936552498, + "grad_norm": 0.20384414494037628, + "learning_rate": 7.634612307751513e-06, + "loss": 0.3344, + "step": 1393 + }, + { + "epoch": 1.1740595171252106, + "grad_norm": 0.22354431450366974, + "learning_rate": 7.630444247972688e-06, + "loss": 0.3843, + "step": 1394 + }, + { + "epoch": 1.1749017405951712, + "grad_norm": 0.21157680451869965, + "learning_rate": 7.626273659223166e-06, + "loss": 0.3636, + "step": 1395 + }, + { + "epoch": 1.175743964065132, + "grad_norm": 0.2282872051000595, + "learning_rate": 7.622100545512648e-06, + "loss": 0.3841, + "step": 1396 + }, + { + "epoch": 1.1765861875350927, + "grad_norm": 0.22645720839500427, + "learning_rate": 7.617924910853266e-06, + "loss": 0.3808, + "step": 1397 + }, + { + "epoch": 1.1774284110050532, + "grad_norm": 0.21912354230880737, + "learning_rate": 7.61374675925957e-06, + "loss": 0.3486, + "step": 1398 + }, + { + "epoch": 1.178270634475014, + "grad_norm": 0.2137390673160553, + "learning_rate": 7.609566094748535e-06, + "loss": 0.36, + "step": 1399 + }, + { + "epoch": 1.1791128579449748, + "grad_norm": 0.23046202957630157, + "learning_rate": 7.605382921339548e-06, + "loss": 0.3637, + "step": 1400 + }, + { + "epoch": 1.1799550814149353, + "grad_norm": 0.24850760400295258, + "learning_rate": 7.601197243054411e-06, + "loss": 0.3757, + "step": 1401 + }, + { + "epoch": 1.180797304884896, + "grad_norm": 0.20614586770534515, + "learning_rate": 7.597009063917333e-06, + "loss": 0.3346, + "step": 1402 + }, + { + "epoch": 1.1816395283548569, + "grad_norm": 0.2218198925256729, + "learning_rate": 7.5928183879549274e-06, + "loss": 0.3858, + "step": 1403 + }, + { + "epoch": 1.1824817518248176, + "grad_norm": 0.21687179803848267, + "learning_rate": 7.588625219196208e-06, + "loss": 0.3879, + "step": 1404 + }, + { + "epoch": 1.1833239752947782, + "grad_norm": 0.2420112043619156, + "learning_rate": 7.584429561672586e-06, + "loss": 0.3586, + "step": 1405 + }, + { + "epoch": 1.184166198764739, + "grad_norm": 0.25380006432533264, + "learning_rate": 7.580231419417863e-06, + "loss": 0.3845, + "step": 1406 + }, + { + "epoch": 1.1850084222346995, + "grad_norm": 0.21595925092697144, + "learning_rate": 7.576030796468233e-06, + "loss": 0.3644, + "step": 1407 + }, + { + "epoch": 1.1858506457046603, + "grad_norm": 0.2313537299633026, + "learning_rate": 7.571827696862274e-06, + "loss": 0.3331, + "step": 1408 + }, + { + "epoch": 1.186692869174621, + "grad_norm": 0.2036680281162262, + "learning_rate": 7.567622124640942e-06, + "loss": 0.3618, + "step": 1409 + }, + { + "epoch": 1.1875350926445818, + "grad_norm": 0.2853444814682007, + "learning_rate": 7.563414083847573e-06, + "loss": 0.4122, + "step": 1410 + }, + { + "epoch": 1.1883773161145423, + "grad_norm": 0.23605458438396454, + "learning_rate": 7.55920357852788e-06, + "loss": 0.334, + "step": 1411 + }, + { + "epoch": 1.189219539584503, + "grad_norm": 0.20309481024742126, + "learning_rate": 7.554990612729936e-06, + "loss": 0.3682, + "step": 1412 + }, + { + "epoch": 1.1900617630544639, + "grad_norm": 0.21416065096855164, + "learning_rate": 7.5507751905041885e-06, + "loss": 0.3548, + "step": 1413 + }, + { + "epoch": 1.1909039865244244, + "grad_norm": 0.2469504326581955, + "learning_rate": 7.5465573159034396e-06, + "loss": 0.3669, + "step": 1414 + }, + { + "epoch": 1.1917462099943852, + "grad_norm": 0.23954510688781738, + "learning_rate": 7.542336992982857e-06, + "loss": 0.356, + "step": 1415 + }, + { + "epoch": 1.192588433464346, + "grad_norm": 0.2099873125553131, + "learning_rate": 7.538114225799955e-06, + "loss": 0.376, + "step": 1416 + }, + { + "epoch": 1.1934306569343065, + "grad_norm": 0.21912400424480438, + "learning_rate": 7.533889018414602e-06, + "loss": 0.387, + "step": 1417 + }, + { + "epoch": 1.1942728804042673, + "grad_norm": 0.21182800829410553, + "learning_rate": 7.529661374889011e-06, + "loss": 0.3244, + "step": 1418 + }, + { + "epoch": 1.195115103874228, + "grad_norm": 0.2408171445131302, + "learning_rate": 7.525431299287737e-06, + "loss": 0.367, + "step": 1419 + }, + { + "epoch": 1.1959573273441886, + "grad_norm": 0.2102893888950348, + "learning_rate": 7.5211987956776755e-06, + "loss": 0.3645, + "step": 1420 + }, + { + "epoch": 1.1967995508141493, + "grad_norm": 0.25745439529418945, + "learning_rate": 7.516963868128054e-06, + "loss": 0.3827, + "step": 1421 + }, + { + "epoch": 1.1976417742841101, + "grad_norm": 0.22646570205688477, + "learning_rate": 7.512726520710429e-06, + "loss": 0.3589, + "step": 1422 + }, + { + "epoch": 1.1984839977540707, + "grad_norm": 0.2319384515285492, + "learning_rate": 7.508486757498687e-06, + "loss": 0.3483, + "step": 1423 + }, + { + "epoch": 1.1993262212240314, + "grad_norm": 0.2112833559513092, + "learning_rate": 7.5042445825690344e-06, + "loss": 0.3366, + "step": 1424 + }, + { + "epoch": 1.2001684446939922, + "grad_norm": 0.2383369654417038, + "learning_rate": 7.500000000000001e-06, + "loss": 0.3742, + "step": 1425 + }, + { + "epoch": 1.2010106681639527, + "grad_norm": 0.22696097195148468, + "learning_rate": 7.4957530138724245e-06, + "loss": 0.3443, + "step": 1426 + }, + { + "epoch": 1.2018528916339135, + "grad_norm": 0.21109351515769958, + "learning_rate": 7.491503628269458e-06, + "loss": 0.3698, + "step": 1427 + }, + { + "epoch": 1.2026951151038743, + "grad_norm": 0.24133235216140747, + "learning_rate": 7.4872518472765594e-06, + "loss": 0.4057, + "step": 1428 + }, + { + "epoch": 1.203537338573835, + "grad_norm": 0.21957817673683167, + "learning_rate": 7.4829976749814935e-06, + "loss": 0.3022, + "step": 1429 + }, + { + "epoch": 1.2043795620437956, + "grad_norm": 0.23803481459617615, + "learning_rate": 7.4787411154743175e-06, + "loss": 0.3377, + "step": 1430 + }, + { + "epoch": 1.2052217855137564, + "grad_norm": 0.23232616484165192, + "learning_rate": 7.474482172847391e-06, + "loss": 0.3841, + "step": 1431 + }, + { + "epoch": 1.206064008983717, + "grad_norm": 0.23307104408740997, + "learning_rate": 7.470220851195356e-06, + "loss": 0.3665, + "step": 1432 + }, + { + "epoch": 1.2069062324536777, + "grad_norm": 0.2201298177242279, + "learning_rate": 7.46595715461515e-06, + "loss": 0.3665, + "step": 1433 + }, + { + "epoch": 1.2077484559236384, + "grad_norm": 0.19988930225372314, + "learning_rate": 7.461691087205993e-06, + "loss": 0.346, + "step": 1434 + }, + { + "epoch": 1.2085906793935992, + "grad_norm": 0.2632721960544586, + "learning_rate": 7.457422653069379e-06, + "loss": 0.3814, + "step": 1435 + }, + { + "epoch": 1.2094329028635598, + "grad_norm": 0.2116016000509262, + "learning_rate": 7.45315185630908e-06, + "loss": 0.355, + "step": 1436 + }, + { + "epoch": 1.2102751263335205, + "grad_norm": 0.20903754234313965, + "learning_rate": 7.4488787010311425e-06, + "loss": 0.3569, + "step": 1437 + }, + { + "epoch": 1.211117349803481, + "grad_norm": 0.23268146812915802, + "learning_rate": 7.444603191343878e-06, + "loss": 0.3403, + "step": 1438 + }, + { + "epoch": 1.2119595732734418, + "grad_norm": 0.23514798283576965, + "learning_rate": 7.440325331357858e-06, + "loss": 0.3461, + "step": 1439 + }, + { + "epoch": 1.2128017967434026, + "grad_norm": 0.22160202264785767, + "learning_rate": 7.436045125185923e-06, + "loss": 0.375, + "step": 1440 + }, + { + "epoch": 1.2136440202133634, + "grad_norm": 0.20510455965995789, + "learning_rate": 7.431762576943157e-06, + "loss": 0.3538, + "step": 1441 + }, + { + "epoch": 1.214486243683324, + "grad_norm": 0.2111036628484726, + "learning_rate": 7.427477690746906e-06, + "loss": 0.3451, + "step": 1442 + }, + { + "epoch": 1.2153284671532847, + "grad_norm": 0.23642785847187042, + "learning_rate": 7.423190470716761e-06, + "loss": 0.3427, + "step": 1443 + }, + { + "epoch": 1.2161706906232455, + "grad_norm": 0.24378204345703125, + "learning_rate": 7.418900920974552e-06, + "loss": 0.3692, + "step": 1444 + }, + { + "epoch": 1.217012914093206, + "grad_norm": 0.23461374640464783, + "learning_rate": 7.414609045644356e-06, + "loss": 0.3889, + "step": 1445 + }, + { + "epoch": 1.2178551375631668, + "grad_norm": 0.22334659099578857, + "learning_rate": 7.4103148488524824e-06, + "loss": 0.349, + "step": 1446 + }, + { + "epoch": 1.2186973610331275, + "grad_norm": 0.2602890729904175, + "learning_rate": 7.40601833472747e-06, + "loss": 0.3786, + "step": 1447 + }, + { + "epoch": 1.219539584503088, + "grad_norm": 0.20589013397693634, + "learning_rate": 7.401719507400088e-06, + "loss": 0.3318, + "step": 1448 + }, + { + "epoch": 1.2203818079730488, + "grad_norm": 0.24295538663864136, + "learning_rate": 7.3974183710033334e-06, + "loss": 0.3628, + "step": 1449 + }, + { + "epoch": 1.2212240314430096, + "grad_norm": 0.2678720951080322, + "learning_rate": 7.393114929672414e-06, + "loss": 0.3951, + "step": 1450 + }, + { + "epoch": 1.2220662549129702, + "grad_norm": 0.22849512100219727, + "learning_rate": 7.388809187544764e-06, + "loss": 0.3606, + "step": 1451 + }, + { + "epoch": 1.222908478382931, + "grad_norm": 0.22386722266674042, + "learning_rate": 7.384501148760024e-06, + "loss": 0.3588, + "step": 1452 + }, + { + "epoch": 1.2237507018528917, + "grad_norm": 0.1996832638978958, + "learning_rate": 7.38019081746004e-06, + "loss": 0.3541, + "step": 1453 + }, + { + "epoch": 1.2245929253228522, + "grad_norm": 0.2293432205915451, + "learning_rate": 7.3758781977888684e-06, + "loss": 0.3545, + "step": 1454 + }, + { + "epoch": 1.225435148792813, + "grad_norm": 0.21567665040493011, + "learning_rate": 7.371563293892761e-06, + "loss": 0.3423, + "step": 1455 + }, + { + "epoch": 1.2262773722627738, + "grad_norm": 0.22745153307914734, + "learning_rate": 7.367246109920171e-06, + "loss": 0.3332, + "step": 1456 + }, + { + "epoch": 1.2271195957327343, + "grad_norm": 0.20745089650154114, + "learning_rate": 7.362926650021736e-06, + "loss": 0.3856, + "step": 1457 + }, + { + "epoch": 1.227961819202695, + "grad_norm": 0.24365340173244476, + "learning_rate": 7.3586049183502875e-06, + "loss": 0.391, + "step": 1458 + }, + { + "epoch": 1.2288040426726559, + "grad_norm": 0.21581348776817322, + "learning_rate": 7.354280919060839e-06, + "loss": 0.36, + "step": 1459 + }, + { + "epoch": 1.2296462661426166, + "grad_norm": 0.20025712251663208, + "learning_rate": 7.349954656310585e-06, + "loss": 0.3253, + "step": 1460 + }, + { + "epoch": 1.2304884896125772, + "grad_norm": 0.25201642513275146, + "learning_rate": 7.345626134258897e-06, + "loss": 0.3997, + "step": 1461 + }, + { + "epoch": 1.231330713082538, + "grad_norm": 0.23212586343288422, + "learning_rate": 7.341295357067315e-06, + "loss": 0.4041, + "step": 1462 + }, + { + "epoch": 1.2321729365524985, + "grad_norm": 0.20653077960014343, + "learning_rate": 7.336962328899553e-06, + "loss": 0.3153, + "step": 1463 + }, + { + "epoch": 1.2330151600224593, + "grad_norm": 0.24971120059490204, + "learning_rate": 7.3326270539214826e-06, + "loss": 0.3813, + "step": 1464 + }, + { + "epoch": 1.23385738349242, + "grad_norm": 0.2304530292749405, + "learning_rate": 7.3282895363011405e-06, + "loss": 0.3431, + "step": 1465 + }, + { + "epoch": 1.2346996069623808, + "grad_norm": 0.2110995203256607, + "learning_rate": 7.323949780208717e-06, + "loss": 0.3634, + "step": 1466 + }, + { + "epoch": 1.2355418304323413, + "grad_norm": 0.2557746171951294, + "learning_rate": 7.319607789816555e-06, + "loss": 0.3821, + "step": 1467 + }, + { + "epoch": 1.236384053902302, + "grad_norm": 0.23002184927463531, + "learning_rate": 7.315263569299147e-06, + "loss": 0.3528, + "step": 1468 + }, + { + "epoch": 1.2372262773722627, + "grad_norm": 0.21381570398807526, + "learning_rate": 7.310917122833127e-06, + "loss": 0.4101, + "step": 1469 + }, + { + "epoch": 1.2380685008422234, + "grad_norm": 0.21428106725215912, + "learning_rate": 7.306568454597269e-06, + "loss": 0.3479, + "step": 1470 + }, + { + "epoch": 1.2389107243121842, + "grad_norm": 0.25851118564605713, + "learning_rate": 7.302217568772488e-06, + "loss": 0.3517, + "step": 1471 + }, + { + "epoch": 1.239752947782145, + "grad_norm": 0.25823184847831726, + "learning_rate": 7.297864469541826e-06, + "loss": 0.3613, + "step": 1472 + }, + { + "epoch": 1.2405951712521055, + "grad_norm": 0.20786811411380768, + "learning_rate": 7.293509161090453e-06, + "loss": 0.3739, + "step": 1473 + }, + { + "epoch": 1.2414373947220663, + "grad_norm": 0.2107485681772232, + "learning_rate": 7.289151647605668e-06, + "loss": 0.3242, + "step": 1474 + }, + { + "epoch": 1.242279618192027, + "grad_norm": 0.248404860496521, + "learning_rate": 7.284791933276883e-06, + "loss": 0.3697, + "step": 1475 + }, + { + "epoch": 1.2431218416619876, + "grad_norm": 0.21293829381465912, + "learning_rate": 7.28043002229563e-06, + "loss": 0.3594, + "step": 1476 + }, + { + "epoch": 1.2439640651319483, + "grad_norm": 0.22747156023979187, + "learning_rate": 7.276065918855554e-06, + "loss": 0.3614, + "step": 1477 + }, + { + "epoch": 1.2448062886019091, + "grad_norm": 0.20419429242610931, + "learning_rate": 7.271699627152406e-06, + "loss": 0.3457, + "step": 1478 + }, + { + "epoch": 1.2456485120718697, + "grad_norm": 0.23443926870822906, + "learning_rate": 7.2673311513840395e-06, + "loss": 0.3576, + "step": 1479 + }, + { + "epoch": 1.2464907355418304, + "grad_norm": 0.25050753355026245, + "learning_rate": 7.26296049575041e-06, + "loss": 0.3875, + "step": 1480 + }, + { + "epoch": 1.2473329590117912, + "grad_norm": 0.23791559040546417, + "learning_rate": 7.2585876644535705e-06, + "loss": 0.3904, + "step": 1481 + }, + { + "epoch": 1.2481751824817517, + "grad_norm": 0.19877417385578156, + "learning_rate": 7.2542126616976596e-06, + "loss": 0.3321, + "step": 1482 + }, + { + "epoch": 1.2490174059517125, + "grad_norm": 0.21887525916099548, + "learning_rate": 7.24983549168891e-06, + "loss": 0.3517, + "step": 1483 + }, + { + "epoch": 1.2498596294216733, + "grad_norm": 0.2395276427268982, + "learning_rate": 7.2454561586356355e-06, + "loss": 0.3421, + "step": 1484 + }, + { + "epoch": 1.250701852891634, + "grad_norm": 0.21983036398887634, + "learning_rate": 7.241074666748228e-06, + "loss": 0.3496, + "step": 1485 + }, + { + "epoch": 1.2515440763615946, + "grad_norm": 0.20137713849544525, + "learning_rate": 7.236691020239157e-06, + "loss": 0.3631, + "step": 1486 + }, + { + "epoch": 1.2523862998315554, + "grad_norm": 0.22294463217258453, + "learning_rate": 7.232305223322963e-06, + "loss": 0.3653, + "step": 1487 + }, + { + "epoch": 1.253228523301516, + "grad_norm": 0.19250662624835968, + "learning_rate": 7.227917280216254e-06, + "loss": 0.33, + "step": 1488 + }, + { + "epoch": 1.2540707467714767, + "grad_norm": 0.2341872900724411, + "learning_rate": 7.2235271951377005e-06, + "loss": 0.3987, + "step": 1489 + }, + { + "epoch": 1.2549129702414374, + "grad_norm": 0.2020316869020462, + "learning_rate": 7.219134972308035e-06, + "loss": 0.3569, + "step": 1490 + }, + { + "epoch": 1.2557551937113982, + "grad_norm": 0.20235149562358856, + "learning_rate": 7.214740615950041e-06, + "loss": 0.3583, + "step": 1491 + }, + { + "epoch": 1.2565974171813588, + "grad_norm": 0.21482989192008972, + "learning_rate": 7.210344130288558e-06, + "loss": 0.3396, + "step": 1492 + }, + { + "epoch": 1.2574396406513195, + "grad_norm": 0.20860536396503448, + "learning_rate": 7.205945519550467e-06, + "loss": 0.3759, + "step": 1493 + }, + { + "epoch": 1.25828186412128, + "grad_norm": 0.20563741028308868, + "learning_rate": 7.201544787964698e-06, + "loss": 0.3644, + "step": 1494 + }, + { + "epoch": 1.2591240875912408, + "grad_norm": 0.21251443028450012, + "learning_rate": 7.197141939762217e-06, + "loss": 0.3238, + "step": 1495 + }, + { + "epoch": 1.2599663110612016, + "grad_norm": 0.24338217079639435, + "learning_rate": 7.192736979176025e-06, + "loss": 0.3891, + "step": 1496 + }, + { + "epoch": 1.2608085345311624, + "grad_norm": 0.22656427323818207, + "learning_rate": 7.188329910441154e-06, + "loss": 0.364, + "step": 1497 + }, + { + "epoch": 1.261650758001123, + "grad_norm": 0.19089765846729279, + "learning_rate": 7.183920737794663e-06, + "loss": 0.3476, + "step": 1498 + }, + { + "epoch": 1.2624929814710837, + "grad_norm": 0.2502041459083557, + "learning_rate": 7.179509465475636e-06, + "loss": 0.3563, + "step": 1499 + }, + { + "epoch": 1.2633352049410442, + "grad_norm": 0.22060425579547882, + "learning_rate": 7.175096097725169e-06, + "loss": 0.3802, + "step": 1500 + }, + { + "epoch": 1.264177428411005, + "grad_norm": 0.25501200556755066, + "learning_rate": 7.170680638786383e-06, + "loss": 0.3532, + "step": 1501 + }, + { + "epoch": 1.2650196518809658, + "grad_norm": 0.21981732547283173, + "learning_rate": 7.166263092904399e-06, + "loss": 0.3528, + "step": 1502 + }, + { + "epoch": 1.2658618753509265, + "grad_norm": 0.23550871014595032, + "learning_rate": 7.161843464326349e-06, + "loss": 0.3695, + "step": 1503 + }, + { + "epoch": 1.266704098820887, + "grad_norm": 0.24225080013275146, + "learning_rate": 7.157421757301371e-06, + "loss": 0.3578, + "step": 1504 + }, + { + "epoch": 1.2675463222908478, + "grad_norm": 0.20522600412368774, + "learning_rate": 7.1529979760805946e-06, + "loss": 0.3542, + "step": 1505 + }, + { + "epoch": 1.2683885457608086, + "grad_norm": 0.23005728423595428, + "learning_rate": 7.148572124917148e-06, + "loss": 0.3521, + "step": 1506 + }, + { + "epoch": 1.2692307692307692, + "grad_norm": 0.22665822505950928, + "learning_rate": 7.144144208066148e-06, + "loss": 0.3414, + "step": 1507 + }, + { + "epoch": 1.27007299270073, + "grad_norm": 0.21564829349517822, + "learning_rate": 7.1397142297846975e-06, + "loss": 0.3615, + "step": 1508 + }, + { + "epoch": 1.2709152161706907, + "grad_norm": 0.23631811141967773, + "learning_rate": 7.135282194331881e-06, + "loss": 0.3869, + "step": 1509 + }, + { + "epoch": 1.2717574396406512, + "grad_norm": 0.2472311556339264, + "learning_rate": 7.130848105968762e-06, + "loss": 0.3572, + "step": 1510 + }, + { + "epoch": 1.272599663110612, + "grad_norm": 0.20352278649806976, + "learning_rate": 7.126411968958374e-06, + "loss": 0.3525, + "step": 1511 + }, + { + "epoch": 1.2734418865805728, + "grad_norm": 0.24103017151355743, + "learning_rate": 7.121973787565727e-06, + "loss": 0.3856, + "step": 1512 + }, + { + "epoch": 1.2742841100505333, + "grad_norm": 0.2155137062072754, + "learning_rate": 7.1175335660577906e-06, + "loss": 0.3688, + "step": 1513 + }, + { + "epoch": 1.275126333520494, + "grad_norm": 0.22255980968475342, + "learning_rate": 7.113091308703498e-06, + "loss": 0.3707, + "step": 1514 + }, + { + "epoch": 1.2759685569904549, + "grad_norm": 0.22150452435016632, + "learning_rate": 7.1086470197737405e-06, + "loss": 0.3538, + "step": 1515 + }, + { + "epoch": 1.2768107804604156, + "grad_norm": 0.261150985956192, + "learning_rate": 7.104200703541358e-06, + "loss": 0.3684, + "step": 1516 + }, + { + "epoch": 1.2776530039303762, + "grad_norm": 0.2053704559803009, + "learning_rate": 7.099752364281147e-06, + "loss": 0.3627, + "step": 1517 + }, + { + "epoch": 1.278495227400337, + "grad_norm": 0.19474367797374725, + "learning_rate": 7.095302006269842e-06, + "loss": 0.3467, + "step": 1518 + }, + { + "epoch": 1.2793374508702975, + "grad_norm": 0.22994056344032288, + "learning_rate": 7.090849633786125e-06, + "loss": 0.3585, + "step": 1519 + }, + { + "epoch": 1.2801796743402583, + "grad_norm": 0.22903107106685638, + "learning_rate": 7.0863952511106075e-06, + "loss": 0.3829, + "step": 1520 + }, + { + "epoch": 1.281021897810219, + "grad_norm": 0.1935136318206787, + "learning_rate": 7.0819388625258385e-06, + "loss": 0.308, + "step": 1521 + }, + { + "epoch": 1.2818641212801798, + "grad_norm": 0.21748337149620056, + "learning_rate": 7.077480472316296e-06, + "loss": 0.4124, + "step": 1522 + }, + { + "epoch": 1.2827063447501403, + "grad_norm": 0.20140865445137024, + "learning_rate": 7.0730200847683795e-06, + "loss": 0.3328, + "step": 1523 + }, + { + "epoch": 1.283548568220101, + "grad_norm": 0.2253345400094986, + "learning_rate": 7.06855770417041e-06, + "loss": 0.3933, + "step": 1524 + }, + { + "epoch": 1.2843907916900617, + "grad_norm": 0.223196879029274, + "learning_rate": 7.0640933348126235e-06, + "loss": 0.358, + "step": 1525 + }, + { + "epoch": 1.2852330151600224, + "grad_norm": 0.20166324079036713, + "learning_rate": 7.059626980987172e-06, + "loss": 0.3446, + "step": 1526 + }, + { + "epoch": 1.2860752386299832, + "grad_norm": 0.230132058262825, + "learning_rate": 7.05515864698811e-06, + "loss": 0.3775, + "step": 1527 + }, + { + "epoch": 1.286917462099944, + "grad_norm": 0.22569893300533295, + "learning_rate": 7.0506883371114e-06, + "loss": 0.3603, + "step": 1528 + }, + { + "epoch": 1.2877596855699045, + "grad_norm": 0.2010277509689331, + "learning_rate": 7.046216055654902e-06, + "loss": 0.3565, + "step": 1529 + }, + { + "epoch": 1.2886019090398653, + "grad_norm": 0.22424009442329407, + "learning_rate": 7.041741806918372e-06, + "loss": 0.3822, + "step": 1530 + }, + { + "epoch": 1.2894441325098258, + "grad_norm": 0.20551267266273499, + "learning_rate": 7.0372655952034575e-06, + "loss": 0.3243, + "step": 1531 + }, + { + "epoch": 1.2902863559797866, + "grad_norm": 0.27740082144737244, + "learning_rate": 7.032787424813694e-06, + "loss": 0.3836, + "step": 1532 + }, + { + "epoch": 1.2911285794497473, + "grad_norm": 0.21728472411632538, + "learning_rate": 7.028307300054499e-06, + "loss": 0.3624, + "step": 1533 + }, + { + "epoch": 1.2919708029197081, + "grad_norm": 0.21663859486579895, + "learning_rate": 7.023825225233169e-06, + "loss": 0.3399, + "step": 1534 + }, + { + "epoch": 1.2928130263896687, + "grad_norm": 0.2152295559644699, + "learning_rate": 7.019341204658876e-06, + "loss": 0.3849, + "step": 1535 + }, + { + "epoch": 1.2936552498596294, + "grad_norm": 0.23276902735233307, + "learning_rate": 7.014855242642662e-06, + "loss": 0.3789, + "step": 1536 + }, + { + "epoch": 1.2944974733295902, + "grad_norm": 0.21636129915714264, + "learning_rate": 7.0103673434974375e-06, + "loss": 0.3439, + "step": 1537 + }, + { + "epoch": 1.2953396967995507, + "grad_norm": 0.23212984204292297, + "learning_rate": 7.0058775115379705e-06, + "loss": 0.3799, + "step": 1538 + }, + { + "epoch": 1.2961819202695115, + "grad_norm": 0.20423316955566406, + "learning_rate": 7.0013857510808934e-06, + "loss": 0.3313, + "step": 1539 + }, + { + "epoch": 1.2970241437394723, + "grad_norm": 0.20709416270256042, + "learning_rate": 6.99689206644469e-06, + "loss": 0.3255, + "step": 1540 + }, + { + "epoch": 1.2978663672094328, + "grad_norm": 0.21995380520820618, + "learning_rate": 6.992396461949693e-06, + "loss": 0.3736, + "step": 1541 + }, + { + "epoch": 1.2987085906793936, + "grad_norm": 0.21833443641662598, + "learning_rate": 6.987898941918082e-06, + "loss": 0.3641, + "step": 1542 + }, + { + "epoch": 1.2995508141493544, + "grad_norm": 0.2526569068431854, + "learning_rate": 6.9833995106738774e-06, + "loss": 0.3496, + "step": 1543 + }, + { + "epoch": 1.300393037619315, + "grad_norm": 0.23018354177474976, + "learning_rate": 6.978898172542939e-06, + "loss": 0.3682, + "step": 1544 + }, + { + "epoch": 1.3012352610892757, + "grad_norm": 0.2187051624059677, + "learning_rate": 6.974394931852957e-06, + "loss": 0.3324, + "step": 1545 + }, + { + "epoch": 1.3020774845592364, + "grad_norm": 0.23275905847549438, + "learning_rate": 6.969889792933454e-06, + "loss": 0.3668, + "step": 1546 + }, + { + "epoch": 1.3029197080291972, + "grad_norm": 0.2110632210969925, + "learning_rate": 6.965382760115775e-06, + "loss": 0.332, + "step": 1547 + }, + { + "epoch": 1.3037619314991578, + "grad_norm": 0.231571763753891, + "learning_rate": 6.960873837733089e-06, + "loss": 0.3932, + "step": 1548 + }, + { + "epoch": 1.3046041549691185, + "grad_norm": 0.21139703691005707, + "learning_rate": 6.956363030120377e-06, + "loss": 0.3755, + "step": 1549 + }, + { + "epoch": 1.305446378439079, + "grad_norm": 0.20921073853969574, + "learning_rate": 6.951850341614436e-06, + "loss": 0.3632, + "step": 1550 + }, + { + "epoch": 1.3062886019090398, + "grad_norm": 0.23165875673294067, + "learning_rate": 6.94733577655387e-06, + "loss": 0.362, + "step": 1551 + }, + { + "epoch": 1.3071308253790006, + "grad_norm": 0.23026436567306519, + "learning_rate": 6.942819339279089e-06, + "loss": 0.3451, + "step": 1552 + }, + { + "epoch": 1.3079730488489614, + "grad_norm": 0.21386614441871643, + "learning_rate": 6.9383010341323e-06, + "loss": 0.3528, + "step": 1553 + }, + { + "epoch": 1.308815272318922, + "grad_norm": 0.2159416675567627, + "learning_rate": 6.933780865457508e-06, + "loss": 0.3315, + "step": 1554 + }, + { + "epoch": 1.3096574957888827, + "grad_norm": 0.21278788149356842, + "learning_rate": 6.9292588376005095e-06, + "loss": 0.3949, + "step": 1555 + }, + { + "epoch": 1.3104997192588432, + "grad_norm": 0.20024999976158142, + "learning_rate": 6.924734954908887e-06, + "loss": 0.3288, + "step": 1556 + }, + { + "epoch": 1.311341942728804, + "grad_norm": 0.24197152256965637, + "learning_rate": 6.920209221732007e-06, + "loss": 0.4007, + "step": 1557 + }, + { + "epoch": 1.3121841661987648, + "grad_norm": 0.19979821145534515, + "learning_rate": 6.9156816424210175e-06, + "loss": 0.3112, + "step": 1558 + }, + { + "epoch": 1.3130263896687255, + "grad_norm": 0.2120949625968933, + "learning_rate": 6.911152221328837e-06, + "loss": 0.3552, + "step": 1559 + }, + { + "epoch": 1.313868613138686, + "grad_norm": 0.23370873928070068, + "learning_rate": 6.90662096281016e-06, + "loss": 0.375, + "step": 1560 + }, + { + "epoch": 1.3147108366086468, + "grad_norm": 0.21100081503391266, + "learning_rate": 6.902087871221439e-06, + "loss": 0.3498, + "step": 1561 + }, + { + "epoch": 1.3155530600786074, + "grad_norm": 0.21074166893959045, + "learning_rate": 6.897552950920898e-06, + "loss": 0.3611, + "step": 1562 + }, + { + "epoch": 1.3163952835485682, + "grad_norm": 0.22161081433296204, + "learning_rate": 6.893016206268518e-06, + "loss": 0.3671, + "step": 1563 + }, + { + "epoch": 1.317237507018529, + "grad_norm": 0.22640085220336914, + "learning_rate": 6.888477641626027e-06, + "loss": 0.329, + "step": 1564 + }, + { + "epoch": 1.3180797304884897, + "grad_norm": 0.2186802476644516, + "learning_rate": 6.88393726135691e-06, + "loss": 0.3733, + "step": 1565 + }, + { + "epoch": 1.3189219539584502, + "grad_norm": 0.2328466773033142, + "learning_rate": 6.879395069826394e-06, + "loss": 0.3746, + "step": 1566 + }, + { + "epoch": 1.319764177428411, + "grad_norm": 0.2159368246793747, + "learning_rate": 6.874851071401448e-06, + "loss": 0.3492, + "step": 1567 + }, + { + "epoch": 1.3206064008983718, + "grad_norm": 0.21284975111484528, + "learning_rate": 6.870305270450779e-06, + "loss": 0.3656, + "step": 1568 + }, + { + "epoch": 1.3214486243683323, + "grad_norm": 0.21185383200645447, + "learning_rate": 6.865757671344827e-06, + "loss": 0.3669, + "step": 1569 + }, + { + "epoch": 1.322290847838293, + "grad_norm": 0.2309747338294983, + "learning_rate": 6.861208278455759e-06, + "loss": 0.413, + "step": 1570 + }, + { + "epoch": 1.3231330713082539, + "grad_norm": 0.2102832794189453, + "learning_rate": 6.856657096157469e-06, + "loss": 0.3168, + "step": 1571 + }, + { + "epoch": 1.3239752947782144, + "grad_norm": 0.22655142843723297, + "learning_rate": 6.85210412882557e-06, + "loss": 0.3767, + "step": 1572 + }, + { + "epoch": 1.3248175182481752, + "grad_norm": 0.2309560477733612, + "learning_rate": 6.8475493808373895e-06, + "loss": 0.3696, + "step": 1573 + }, + { + "epoch": 1.325659741718136, + "grad_norm": 0.22664643824100494, + "learning_rate": 6.8429928565719724e-06, + "loss": 0.3709, + "step": 1574 + }, + { + "epoch": 1.3265019651880965, + "grad_norm": 0.2255297303199768, + "learning_rate": 6.838434560410064e-06, + "loss": 0.3626, + "step": 1575 + }, + { + "epoch": 1.3273441886580573, + "grad_norm": 0.22523364424705505, + "learning_rate": 6.833874496734122e-06, + "loss": 0.3761, + "step": 1576 + }, + { + "epoch": 1.328186412128018, + "grad_norm": 0.21672458946704865, + "learning_rate": 6.829312669928293e-06, + "loss": 0.3518, + "step": 1577 + }, + { + "epoch": 1.3290286355979788, + "grad_norm": 0.20885531604290009, + "learning_rate": 6.824749084378428e-06, + "loss": 0.3456, + "step": 1578 + }, + { + "epoch": 1.3298708590679393, + "grad_norm": 0.2158428132534027, + "learning_rate": 6.820183744472062e-06, + "loss": 0.3732, + "step": 1579 + }, + { + "epoch": 1.3307130825379, + "grad_norm": 0.22627563774585724, + "learning_rate": 6.81561665459842e-06, + "loss": 0.376, + "step": 1580 + }, + { + "epoch": 1.3315553060078607, + "grad_norm": 0.21666988730430603, + "learning_rate": 6.811047819148413e-06, + "loss": 0.3684, + "step": 1581 + }, + { + "epoch": 1.3323975294778214, + "grad_norm": 0.20151835680007935, + "learning_rate": 6.806477242514623e-06, + "loss": 0.3128, + "step": 1582 + }, + { + "epoch": 1.3332397529477822, + "grad_norm": 0.22538265585899353, + "learning_rate": 6.801904929091311e-06, + "loss": 0.3822, + "step": 1583 + }, + { + "epoch": 1.334081976417743, + "grad_norm": 0.2307758331298828, + "learning_rate": 6.7973308832744035e-06, + "loss": 0.398, + "step": 1584 + }, + { + "epoch": 1.3349241998877035, + "grad_norm": 0.21107590198516846, + "learning_rate": 6.792755109461498e-06, + "loss": 0.3483, + "step": 1585 + }, + { + "epoch": 1.3357664233576643, + "grad_norm": 0.2177739441394806, + "learning_rate": 6.78817761205185e-06, + "loss": 0.3518, + "step": 1586 + }, + { + "epoch": 1.3366086468276248, + "grad_norm": 0.20612265169620514, + "learning_rate": 6.783598395446371e-06, + "loss": 0.3495, + "step": 1587 + }, + { + "epoch": 1.3374508702975856, + "grad_norm": 0.20119178295135498, + "learning_rate": 6.779017464047629e-06, + "loss": 0.3432, + "step": 1588 + }, + { + "epoch": 1.3382930937675463, + "grad_norm": 0.22750814259052277, + "learning_rate": 6.7744348222598386e-06, + "loss": 0.3889, + "step": 1589 + }, + { + "epoch": 1.3391353172375071, + "grad_norm": 0.22598476707935333, + "learning_rate": 6.769850474488859e-06, + "loss": 0.3693, + "step": 1590 + }, + { + "epoch": 1.3399775407074677, + "grad_norm": 0.1982087939977646, + "learning_rate": 6.7652644251421875e-06, + "loss": 0.3141, + "step": 1591 + }, + { + "epoch": 1.3408197641774284, + "grad_norm": 0.22299589216709137, + "learning_rate": 6.7606766786289624e-06, + "loss": 0.3586, + "step": 1592 + }, + { + "epoch": 1.341661987647389, + "grad_norm": 0.20625364780426025, + "learning_rate": 6.756087239359948e-06, + "loss": 0.356, + "step": 1593 + }, + { + "epoch": 1.3425042111173497, + "grad_norm": 0.21917255222797394, + "learning_rate": 6.75149611174754e-06, + "loss": 0.3581, + "step": 1594 + }, + { + "epoch": 1.3433464345873105, + "grad_norm": 0.22012881934642792, + "learning_rate": 6.746903300205756e-06, + "loss": 0.3746, + "step": 1595 + }, + { + "epoch": 1.3441886580572713, + "grad_norm": 0.20556309819221497, + "learning_rate": 6.742308809150232e-06, + "loss": 0.3787, + "step": 1596 + }, + { + "epoch": 1.3450308815272318, + "grad_norm": 0.19921176135540009, + "learning_rate": 6.737712642998219e-06, + "loss": 0.3461, + "step": 1597 + }, + { + "epoch": 1.3458731049971926, + "grad_norm": 0.2157866656780243, + "learning_rate": 6.7331148061685796e-06, + "loss": 0.3814, + "step": 1598 + }, + { + "epoch": 1.3467153284671534, + "grad_norm": 0.213243767619133, + "learning_rate": 6.728515303081782e-06, + "loss": 0.3405, + "step": 1599 + }, + { + "epoch": 1.347557551937114, + "grad_norm": 0.23965325951576233, + "learning_rate": 6.723914138159895e-06, + "loss": 0.3728, + "step": 1600 + }, + { + "epoch": 1.3483997754070747, + "grad_norm": 0.22270913422107697, + "learning_rate": 6.719311315826589e-06, + "loss": 0.3781, + "step": 1601 + }, + { + "epoch": 1.3492419988770354, + "grad_norm": 0.20295816659927368, + "learning_rate": 6.714706840507122e-06, + "loss": 0.3256, + "step": 1602 + }, + { + "epoch": 1.350084222346996, + "grad_norm": 0.24144160747528076, + "learning_rate": 6.710100716628345e-06, + "loss": 0.3677, + "step": 1603 + }, + { + "epoch": 1.3509264458169568, + "grad_norm": 0.24592921137809753, + "learning_rate": 6.705492948618694e-06, + "loss": 0.3813, + "step": 1604 + }, + { + "epoch": 1.3517686692869175, + "grad_norm": 0.20310568809509277, + "learning_rate": 6.700883540908185e-06, + "loss": 0.3532, + "step": 1605 + }, + { + "epoch": 1.352610892756878, + "grad_norm": 0.30135607719421387, + "learning_rate": 6.696272497928411e-06, + "loss": 0.4034, + "step": 1606 + }, + { + "epoch": 1.3534531162268388, + "grad_norm": 0.20306457579135895, + "learning_rate": 6.691659824112535e-06, + "loss": 0.3345, + "step": 1607 + }, + { + "epoch": 1.3542953396967996, + "grad_norm": 0.1948816031217575, + "learning_rate": 6.687045523895292e-06, + "loss": 0.3221, + "step": 1608 + }, + { + "epoch": 1.3551375631667604, + "grad_norm": 0.23035724461078644, + "learning_rate": 6.682429601712976e-06, + "loss": 0.3724, + "step": 1609 + }, + { + "epoch": 1.355979786636721, + "grad_norm": 0.2039736956357956, + "learning_rate": 6.6778120620034455e-06, + "loss": 0.3477, + "step": 1610 + }, + { + "epoch": 1.3568220101066817, + "grad_norm": 0.22455595433712006, + "learning_rate": 6.673192909206109e-06, + "loss": 0.3985, + "step": 1611 + }, + { + "epoch": 1.3576642335766422, + "grad_norm": 0.20514552295207977, + "learning_rate": 6.668572147761929e-06, + "loss": 0.3361, + "step": 1612 + }, + { + "epoch": 1.358506457046603, + "grad_norm": 0.20776930451393127, + "learning_rate": 6.663949782113413e-06, + "loss": 0.3481, + "step": 1613 + }, + { + "epoch": 1.3593486805165638, + "grad_norm": 0.27368056774139404, + "learning_rate": 6.6593258167046115e-06, + "loss": 0.3668, + "step": 1614 + }, + { + "epoch": 1.3601909039865245, + "grad_norm": 0.19447094202041626, + "learning_rate": 6.654700255981115e-06, + "loss": 0.3312, + "step": 1615 + }, + { + "epoch": 1.361033127456485, + "grad_norm": 0.20037733018398285, + "learning_rate": 6.6500731043900425e-06, + "loss": 0.354, + "step": 1616 + }, + { + "epoch": 1.3618753509264458, + "grad_norm": 0.2207939624786377, + "learning_rate": 6.64544436638005e-06, + "loss": 0.3597, + "step": 1617 + }, + { + "epoch": 1.3627175743964064, + "grad_norm": 0.22954882681369781, + "learning_rate": 6.640814046401312e-06, + "loss": 0.4111, + "step": 1618 + }, + { + "epoch": 1.3635597978663672, + "grad_norm": 0.2107580602169037, + "learning_rate": 6.6361821489055275e-06, + "loss": 0.3634, + "step": 1619 + }, + { + "epoch": 1.364402021336328, + "grad_norm": 0.25928613543510437, + "learning_rate": 6.63154867834591e-06, + "loss": 0.3793, + "step": 1620 + }, + { + "epoch": 1.3652442448062887, + "grad_norm": 0.19150257110595703, + "learning_rate": 6.626913639177189e-06, + "loss": 0.3234, + "step": 1621 + }, + { + "epoch": 1.3660864682762492, + "grad_norm": 0.23382145166397095, + "learning_rate": 6.622277035855596e-06, + "loss": 0.3803, + "step": 1622 + }, + { + "epoch": 1.36692869174621, + "grad_norm": 0.21147483587265015, + "learning_rate": 6.617638872838874e-06, + "loss": 0.3307, + "step": 1623 + }, + { + "epoch": 1.3677709152161706, + "grad_norm": 0.19452020525932312, + "learning_rate": 6.61299915458626e-06, + "loss": 0.3397, + "step": 1624 + }, + { + "epoch": 1.3686131386861313, + "grad_norm": 0.21936218440532684, + "learning_rate": 6.608357885558485e-06, + "loss": 0.3613, + "step": 1625 + }, + { + "epoch": 1.369455362156092, + "grad_norm": 0.21089312434196472, + "learning_rate": 6.603715070217779e-06, + "loss": 0.367, + "step": 1626 + }, + { + "epoch": 1.3702975856260529, + "grad_norm": 0.22670963406562805, + "learning_rate": 6.599070713027849e-06, + "loss": 0.3447, + "step": 1627 + }, + { + "epoch": 1.3711398090960134, + "grad_norm": 0.21916204690933228, + "learning_rate": 6.594424818453891e-06, + "loss": 0.3479, + "step": 1628 + }, + { + "epoch": 1.3719820325659742, + "grad_norm": 0.23116189241409302, + "learning_rate": 6.589777390962575e-06, + "loss": 0.366, + "step": 1629 + }, + { + "epoch": 1.372824256035935, + "grad_norm": 0.21582533419132233, + "learning_rate": 6.58512843502205e-06, + "loss": 0.354, + "step": 1630 + }, + { + "epoch": 1.3736664795058955, + "grad_norm": 0.2179097682237625, + "learning_rate": 6.580477955101927e-06, + "loss": 0.3345, + "step": 1631 + }, + { + "epoch": 1.3745087029758563, + "grad_norm": 0.20926107466220856, + "learning_rate": 6.5758259556732896e-06, + "loss": 0.358, + "step": 1632 + }, + { + "epoch": 1.375350926445817, + "grad_norm": 0.22556179761886597, + "learning_rate": 6.571172441208678e-06, + "loss": 0.3683, + "step": 1633 + }, + { + "epoch": 1.3761931499157778, + "grad_norm": 0.21263204514980316, + "learning_rate": 6.566517416182088e-06, + "loss": 0.3358, + "step": 1634 + }, + { + "epoch": 1.3770353733857383, + "grad_norm": 0.34583309292793274, + "learning_rate": 6.561860885068972e-06, + "loss": 0.3307, + "step": 1635 + }, + { + "epoch": 1.377877596855699, + "grad_norm": 0.2124481499195099, + "learning_rate": 6.5572028523462275e-06, + "loss": 0.376, + "step": 1636 + }, + { + "epoch": 1.3787198203256597, + "grad_norm": 0.2069571316242218, + "learning_rate": 6.552543322492195e-06, + "loss": 0.3406, + "step": 1637 + }, + { + "epoch": 1.3795620437956204, + "grad_norm": 0.22916629910469055, + "learning_rate": 6.547882299986658e-06, + "loss": 0.3512, + "step": 1638 + }, + { + "epoch": 1.3804042672655812, + "grad_norm": 0.22375811636447906, + "learning_rate": 6.54321978931083e-06, + "loss": 0.3636, + "step": 1639 + }, + { + "epoch": 1.381246490735542, + "grad_norm": 0.21804262697696686, + "learning_rate": 6.53855579494736e-06, + "loss": 0.3548, + "step": 1640 + }, + { + "epoch": 1.3820887142055025, + "grad_norm": 0.2238789200782776, + "learning_rate": 6.53389032138032e-06, + "loss": 0.3794, + "step": 1641 + }, + { + "epoch": 1.3829309376754633, + "grad_norm": 0.21942567825317383, + "learning_rate": 6.5292233730952074e-06, + "loss": 0.3548, + "step": 1642 + }, + { + "epoch": 1.3837731611454238, + "grad_norm": 0.2897321581840515, + "learning_rate": 6.5245549545789335e-06, + "loss": 0.37, + "step": 1643 + }, + { + "epoch": 1.3846153846153846, + "grad_norm": 0.2234940081834793, + "learning_rate": 6.519885070319827e-06, + "loss": 0.3306, + "step": 1644 + }, + { + "epoch": 1.3854576080853453, + "grad_norm": 0.22789642214775085, + "learning_rate": 6.515213724807621e-06, + "loss": 0.3668, + "step": 1645 + }, + { + "epoch": 1.3862998315553061, + "grad_norm": 0.23737990856170654, + "learning_rate": 6.51054092253346e-06, + "loss": 0.3863, + "step": 1646 + }, + { + "epoch": 1.3871420550252667, + "grad_norm": 0.22608612477779388, + "learning_rate": 6.505866667989884e-06, + "loss": 0.3305, + "step": 1647 + }, + { + "epoch": 1.3879842784952274, + "grad_norm": 0.2412174493074417, + "learning_rate": 6.5011909656708305e-06, + "loss": 0.3769, + "step": 1648 + }, + { + "epoch": 1.388826501965188, + "grad_norm": 0.19889724254608154, + "learning_rate": 6.49651382007163e-06, + "loss": 0.3208, + "step": 1649 + }, + { + "epoch": 1.3896687254351487, + "grad_norm": 0.22150558233261108, + "learning_rate": 6.491835235688999e-06, + "loss": 0.3595, + "step": 1650 + }, + { + "epoch": 1.3905109489051095, + "grad_norm": 0.21915128827095032, + "learning_rate": 6.487155217021039e-06, + "loss": 0.3518, + "step": 1651 + }, + { + "epoch": 1.3913531723750703, + "grad_norm": 0.2163122445344925, + "learning_rate": 6.482473768567228e-06, + "loss": 0.3458, + "step": 1652 + }, + { + "epoch": 1.3921953958450308, + "grad_norm": 0.21529380977153778, + "learning_rate": 6.477790894828422e-06, + "loss": 0.3574, + "step": 1653 + }, + { + "epoch": 1.3930376193149916, + "grad_norm": 0.23291248083114624, + "learning_rate": 6.473106600306842e-06, + "loss": 0.3687, + "step": 1654 + }, + { + "epoch": 1.3938798427849521, + "grad_norm": 0.2185925394296646, + "learning_rate": 6.468420889506084e-06, + "loss": 0.3416, + "step": 1655 + }, + { + "epoch": 1.394722066254913, + "grad_norm": 0.22031629085540771, + "learning_rate": 6.463733766931096e-06, + "loss": 0.3562, + "step": 1656 + }, + { + "epoch": 1.3955642897248737, + "grad_norm": 0.24342696368694305, + "learning_rate": 6.459045237088189e-06, + "loss": 0.3705, + "step": 1657 + }, + { + "epoch": 1.3964065131948344, + "grad_norm": 0.23241513967514038, + "learning_rate": 6.454355304485024e-06, + "loss": 0.3376, + "step": 1658 + }, + { + "epoch": 1.397248736664795, + "grad_norm": 0.2287467122077942, + "learning_rate": 6.449663973630613e-06, + "loss": 0.3353, + "step": 1659 + }, + { + "epoch": 1.3980909601347558, + "grad_norm": 0.21655339002609253, + "learning_rate": 6.444971249035312e-06, + "loss": 0.3637, + "step": 1660 + }, + { + "epoch": 1.3989331836047165, + "grad_norm": 0.21061834692955017, + "learning_rate": 6.440277135210815e-06, + "loss": 0.3745, + "step": 1661 + }, + { + "epoch": 1.399775407074677, + "grad_norm": 0.2187851071357727, + "learning_rate": 6.435581636670154e-06, + "loss": 0.3574, + "step": 1662 + }, + { + "epoch": 1.4006176305446378, + "grad_norm": 0.21468669176101685, + "learning_rate": 6.43088475792769e-06, + "loss": 0.3555, + "step": 1663 + }, + { + "epoch": 1.4014598540145986, + "grad_norm": 0.21525272727012634, + "learning_rate": 6.426186503499114e-06, + "loss": 0.3474, + "step": 1664 + }, + { + "epoch": 1.4023020774845594, + "grad_norm": 0.2308477908372879, + "learning_rate": 6.421486877901436e-06, + "loss": 0.3469, + "step": 1665 + }, + { + "epoch": 1.40314430095452, + "grad_norm": 0.23242992162704468, + "learning_rate": 6.4167858856529875e-06, + "loss": 0.3822, + "step": 1666 + }, + { + "epoch": 1.4039865244244807, + "grad_norm": 0.20637556910514832, + "learning_rate": 6.412083531273411e-06, + "loss": 0.3638, + "step": 1667 + }, + { + "epoch": 1.4048287478944412, + "grad_norm": 0.2414320558309555, + "learning_rate": 6.407379819283661e-06, + "loss": 0.3518, + "step": 1668 + }, + { + "epoch": 1.405670971364402, + "grad_norm": 0.23745858669281006, + "learning_rate": 6.402674754205998e-06, + "loss": 0.3575, + "step": 1669 + }, + { + "epoch": 1.4065131948343628, + "grad_norm": 0.23913931846618652, + "learning_rate": 6.397968340563978e-06, + "loss": 0.371, + "step": 1670 + }, + { + "epoch": 1.4073554183043235, + "grad_norm": 0.22450561821460724, + "learning_rate": 6.393260582882462e-06, + "loss": 0.3219, + "step": 1671 + }, + { + "epoch": 1.408197641774284, + "grad_norm": 0.23129063844680786, + "learning_rate": 6.3885514856875945e-06, + "loss": 0.3987, + "step": 1672 + }, + { + "epoch": 1.4090398652442448, + "grad_norm": 0.24473051726818085, + "learning_rate": 6.383841053506813e-06, + "loss": 0.382, + "step": 1673 + }, + { + "epoch": 1.4098820887142054, + "grad_norm": 0.21250121295452118, + "learning_rate": 6.379129290868837e-06, + "loss": 0.3419, + "step": 1674 + }, + { + "epoch": 1.4107243121841662, + "grad_norm": 0.223237544298172, + "learning_rate": 6.3744162023036685e-06, + "loss": 0.3889, + "step": 1675 + }, + { + "epoch": 1.411566535654127, + "grad_norm": 0.22709648311138153, + "learning_rate": 6.369701792342576e-06, + "loss": 0.3552, + "step": 1676 + }, + { + "epoch": 1.4124087591240877, + "grad_norm": 0.20558980107307434, + "learning_rate": 6.364986065518106e-06, + "loss": 0.3356, + "step": 1677 + }, + { + "epoch": 1.4132509825940482, + "grad_norm": 0.23428533971309662, + "learning_rate": 6.360269026364071e-06, + "loss": 0.3917, + "step": 1678 + }, + { + "epoch": 1.414093206064009, + "grad_norm": 0.2525196671485901, + "learning_rate": 6.35555067941554e-06, + "loss": 0.3648, + "step": 1679 + }, + { + "epoch": 1.4149354295339696, + "grad_norm": 0.2390659749507904, + "learning_rate": 6.350831029208844e-06, + "loss": 0.3376, + "step": 1680 + }, + { + "epoch": 1.4157776530039303, + "grad_norm": 0.21221652626991272, + "learning_rate": 6.3461100802815625e-06, + "loss": 0.3609, + "step": 1681 + }, + { + "epoch": 1.416619876473891, + "grad_norm": 0.22451327741146088, + "learning_rate": 6.34138783717253e-06, + "loss": 0.3531, + "step": 1682 + }, + { + "epoch": 1.4174620999438519, + "grad_norm": 0.2102045863866806, + "learning_rate": 6.336664304421818e-06, + "loss": 0.3108, + "step": 1683 + }, + { + "epoch": 1.4183043234138124, + "grad_norm": 0.21954309940338135, + "learning_rate": 6.331939486570745e-06, + "loss": 0.3837, + "step": 1684 + }, + { + "epoch": 1.4191465468837732, + "grad_norm": 0.20022211968898773, + "learning_rate": 6.3272133881618596e-06, + "loss": 0.3487, + "step": 1685 + }, + { + "epoch": 1.4199887703537337, + "grad_norm": 0.22107923030853271, + "learning_rate": 6.322486013738942e-06, + "loss": 0.3499, + "step": 1686 + }, + { + "epoch": 1.4208309938236945, + "grad_norm": 0.22098489105701447, + "learning_rate": 6.317757367847005e-06, + "loss": 0.3242, + "step": 1687 + }, + { + "epoch": 1.4216732172936553, + "grad_norm": 0.2224896252155304, + "learning_rate": 6.313027455032274e-06, + "loss": 0.4122, + "step": 1688 + }, + { + "epoch": 1.422515440763616, + "grad_norm": 0.20495165884494781, + "learning_rate": 6.308296279842204e-06, + "loss": 0.3367, + "step": 1689 + }, + { + "epoch": 1.4233576642335766, + "grad_norm": 0.23125077784061432, + "learning_rate": 6.303563846825453e-06, + "loss": 0.3776, + "step": 1690 + }, + { + "epoch": 1.4241998877035373, + "grad_norm": 0.2269773632287979, + "learning_rate": 6.298830160531895e-06, + "loss": 0.3647, + "step": 1691 + }, + { + "epoch": 1.425042111173498, + "grad_norm": 0.20680685341358185, + "learning_rate": 6.294095225512604e-06, + "loss": 0.3503, + "step": 1692 + }, + { + "epoch": 1.4258843346434587, + "grad_norm": 0.22571782767772675, + "learning_rate": 6.289359046319862e-06, + "loss": 0.4192, + "step": 1693 + }, + { + "epoch": 1.4267265581134194, + "grad_norm": 0.21243885159492493, + "learning_rate": 6.2846216275071395e-06, + "loss": 0.3199, + "step": 1694 + }, + { + "epoch": 1.4275687815833802, + "grad_norm": 0.2743223309516907, + "learning_rate": 6.279882973629101e-06, + "loss": 0.3622, + "step": 1695 + }, + { + "epoch": 1.428411005053341, + "grad_norm": 0.19054637849330902, + "learning_rate": 6.275143089241603e-06, + "loss": 0.3307, + "step": 1696 + }, + { + "epoch": 1.4292532285233015, + "grad_norm": 0.27557373046875, + "learning_rate": 6.270401978901678e-06, + "loss": 0.4077, + "step": 1697 + }, + { + "epoch": 1.4300954519932623, + "grad_norm": 0.24843667447566986, + "learning_rate": 6.265659647167542e-06, + "loss": 0.369, + "step": 1698 + }, + { + "epoch": 1.4309376754632228, + "grad_norm": 0.23135574162006378, + "learning_rate": 6.260916098598584e-06, + "loss": 0.3172, + "step": 1699 + }, + { + "epoch": 1.4317798989331836, + "grad_norm": 0.19605474174022675, + "learning_rate": 6.256171337755362e-06, + "loss": 0.342, + "step": 1700 + }, + { + "epoch": 1.4326221224031443, + "grad_norm": 0.2249126434326172, + "learning_rate": 6.2514253691996e-06, + "loss": 0.3809, + "step": 1701 + }, + { + "epoch": 1.4334643458731051, + "grad_norm": 0.22616976499557495, + "learning_rate": 6.246678197494185e-06, + "loss": 0.3946, + "step": 1702 + }, + { + "epoch": 1.4343065693430657, + "grad_norm": 0.1968989074230194, + "learning_rate": 6.241929827203156e-06, + "loss": 0.3583, + "step": 1703 + }, + { + "epoch": 1.4351487928130264, + "grad_norm": 0.21847349405288696, + "learning_rate": 6.237180262891709e-06, + "loss": 0.3921, + "step": 1704 + }, + { + "epoch": 1.435991016282987, + "grad_norm": 0.19567416608333588, + "learning_rate": 6.2324295091261885e-06, + "loss": 0.3554, + "step": 1705 + }, + { + "epoch": 1.4368332397529477, + "grad_norm": 0.21993115544319153, + "learning_rate": 6.227677570474077e-06, + "loss": 0.3655, + "step": 1706 + }, + { + "epoch": 1.4376754632229085, + "grad_norm": 0.19446784257888794, + "learning_rate": 6.222924451504001e-06, + "loss": 0.3544, + "step": 1707 + }, + { + "epoch": 1.4385176866928693, + "grad_norm": 0.2094394713640213, + "learning_rate": 6.21817015678572e-06, + "loss": 0.337, + "step": 1708 + }, + { + "epoch": 1.4393599101628298, + "grad_norm": 0.22094875574111938, + "learning_rate": 6.213414690890125e-06, + "loss": 0.3941, + "step": 1709 + }, + { + "epoch": 1.4402021336327906, + "grad_norm": 0.1867690086364746, + "learning_rate": 6.208658058389232e-06, + "loss": 0.3237, + "step": 1710 + }, + { + "epoch": 1.4410443571027511, + "grad_norm": 0.2281683087348938, + "learning_rate": 6.203900263856177e-06, + "loss": 0.3747, + "step": 1711 + }, + { + "epoch": 1.441886580572712, + "grad_norm": 0.18243515491485596, + "learning_rate": 6.19914131186522e-06, + "loss": 0.3331, + "step": 1712 + }, + { + "epoch": 1.4427288040426727, + "grad_norm": 0.18891458213329315, + "learning_rate": 6.194381206991723e-06, + "loss": 0.3709, + "step": 1713 + }, + { + "epoch": 1.4435710275126334, + "grad_norm": 0.21232585608959198, + "learning_rate": 6.189619953812167e-06, + "loss": 0.3641, + "step": 1714 + }, + { + "epoch": 1.444413250982594, + "grad_norm": 0.21884340047836304, + "learning_rate": 6.184857556904129e-06, + "loss": 0.354, + "step": 1715 + }, + { + "epoch": 1.4452554744525548, + "grad_norm": 0.284342497587204, + "learning_rate": 6.180094020846291e-06, + "loss": 0.3789, + "step": 1716 + }, + { + "epoch": 1.4460976979225155, + "grad_norm": 0.1979561448097229, + "learning_rate": 6.175329350218426e-06, + "loss": 0.3294, + "step": 1717 + }, + { + "epoch": 1.446939921392476, + "grad_norm": 0.26564377546310425, + "learning_rate": 6.170563549601402e-06, + "loss": 0.3644, + "step": 1718 + }, + { + "epoch": 1.4477821448624368, + "grad_norm": 0.21076220273971558, + "learning_rate": 6.165796623577171e-06, + "loss": 0.3588, + "step": 1719 + }, + { + "epoch": 1.4486243683323976, + "grad_norm": 0.22534281015396118, + "learning_rate": 6.161028576728767e-06, + "loss": 0.3614, + "step": 1720 + }, + { + "epoch": 1.4494665918023582, + "grad_norm": 0.22138455510139465, + "learning_rate": 6.156259413640302e-06, + "loss": 0.3963, + "step": 1721 + }, + { + "epoch": 1.450308815272319, + "grad_norm": 0.20443442463874817, + "learning_rate": 6.15148913889696e-06, + "loss": 0.351, + "step": 1722 + }, + { + "epoch": 1.4511510387422797, + "grad_norm": 0.21905840933322906, + "learning_rate": 6.146717757084995e-06, + "loss": 0.4063, + "step": 1723 + }, + { + "epoch": 1.4519932622122402, + "grad_norm": 0.20653998851776123, + "learning_rate": 6.141945272791727e-06, + "loss": 0.3584, + "step": 1724 + }, + { + "epoch": 1.452835485682201, + "grad_norm": 0.1866150051355362, + "learning_rate": 6.1371716906055336e-06, + "loss": 0.3513, + "step": 1725 + }, + { + "epoch": 1.4536777091521618, + "grad_norm": 0.1922345757484436, + "learning_rate": 6.132397015115846e-06, + "loss": 0.3332, + "step": 1726 + }, + { + "epoch": 1.4545199326221225, + "grad_norm": 0.2280290275812149, + "learning_rate": 6.127621250913152e-06, + "loss": 0.389, + "step": 1727 + }, + { + "epoch": 1.455362156092083, + "grad_norm": 0.2247847467660904, + "learning_rate": 6.122844402588982e-06, + "loss": 0.3664, + "step": 1728 + }, + { + "epoch": 1.4562043795620438, + "grad_norm": 0.21930207312107086, + "learning_rate": 6.11806647473591e-06, + "loss": 0.3643, + "step": 1729 + }, + { + "epoch": 1.4570466030320044, + "grad_norm": 0.21347032487392426, + "learning_rate": 6.113287471947547e-06, + "loss": 0.3611, + "step": 1730 + }, + { + "epoch": 1.4578888265019652, + "grad_norm": 0.20544639229774475, + "learning_rate": 6.10850739881854e-06, + "loss": 0.3591, + "step": 1731 + }, + { + "epoch": 1.458731049971926, + "grad_norm": 0.20113444328308105, + "learning_rate": 6.103726259944562e-06, + "loss": 0.3545, + "step": 1732 + }, + { + "epoch": 1.4595732734418867, + "grad_norm": 0.19256292283535004, + "learning_rate": 6.098944059922311e-06, + "loss": 0.3378, + "step": 1733 + }, + { + "epoch": 1.4604154969118472, + "grad_norm": 0.22689050436019897, + "learning_rate": 6.094160803349508e-06, + "loss": 0.3587, + "step": 1734 + }, + { + "epoch": 1.461257720381808, + "grad_norm": 0.23645469546318054, + "learning_rate": 6.089376494824886e-06, + "loss": 0.4021, + "step": 1735 + }, + { + "epoch": 1.4620999438517686, + "grad_norm": 0.19880850613117218, + "learning_rate": 6.084591138948192e-06, + "loss": 0.3314, + "step": 1736 + }, + { + "epoch": 1.4629421673217293, + "grad_norm": 0.21392516791820526, + "learning_rate": 6.079804740320181e-06, + "loss": 0.3278, + "step": 1737 + }, + { + "epoch": 1.46378439079169, + "grad_norm": 0.20528754591941833, + "learning_rate": 6.075017303542605e-06, + "loss": 0.4016, + "step": 1738 + }, + { + "epoch": 1.4646266142616509, + "grad_norm": 0.19299480319023132, + "learning_rate": 6.070228833218221e-06, + "loss": 0.3255, + "step": 1739 + }, + { + "epoch": 1.4654688377316114, + "grad_norm": 0.21759140491485596, + "learning_rate": 6.065439333950776e-06, + "loss": 0.3951, + "step": 1740 + }, + { + "epoch": 1.4663110612015722, + "grad_norm": 0.20918811857700348, + "learning_rate": 6.060648810345006e-06, + "loss": 0.3355, + "step": 1741 + }, + { + "epoch": 1.4671532846715327, + "grad_norm": 0.22659289836883545, + "learning_rate": 6.055857267006631e-06, + "loss": 0.3633, + "step": 1742 + }, + { + "epoch": 1.4679955081414935, + "grad_norm": 0.2189176082611084, + "learning_rate": 6.051064708542357e-06, + "loss": 0.3777, + "step": 1743 + }, + { + "epoch": 1.4688377316114543, + "grad_norm": 0.20815801620483398, + "learning_rate": 6.046271139559859e-06, + "loss": 0.3448, + "step": 1744 + }, + { + "epoch": 1.469679955081415, + "grad_norm": 0.2091105580329895, + "learning_rate": 6.041476564667785e-06, + "loss": 0.3552, + "step": 1745 + }, + { + "epoch": 1.4705221785513756, + "grad_norm": 0.203644797205925, + "learning_rate": 6.036680988475756e-06, + "loss": 0.3248, + "step": 1746 + }, + { + "epoch": 1.4713644020213363, + "grad_norm": 0.20944565534591675, + "learning_rate": 6.031884415594347e-06, + "loss": 0.3752, + "step": 1747 + }, + { + "epoch": 1.472206625491297, + "grad_norm": 0.215575709939003, + "learning_rate": 6.0270868506351e-06, + "loss": 0.333, + "step": 1748 + }, + { + "epoch": 1.4730488489612577, + "grad_norm": 0.233964204788208, + "learning_rate": 6.022288298210502e-06, + "loss": 0.4178, + "step": 1749 + }, + { + "epoch": 1.4738910724312184, + "grad_norm": 0.22134220600128174, + "learning_rate": 6.017488762933996e-06, + "loss": 0.3831, + "step": 1750 + }, + { + "epoch": 1.4747332959011792, + "grad_norm": 0.2089032381772995, + "learning_rate": 6.012688249419966e-06, + "loss": 0.3641, + "step": 1751 + }, + { + "epoch": 1.4755755193711397, + "grad_norm": 0.2150684893131256, + "learning_rate": 6.00788676228374e-06, + "loss": 0.3739, + "step": 1752 + }, + { + "epoch": 1.4764177428411005, + "grad_norm": 0.20269526541233063, + "learning_rate": 6.003084306141579e-06, + "loss": 0.3398, + "step": 1753 + }, + { + "epoch": 1.4772599663110613, + "grad_norm": 0.20560164749622345, + "learning_rate": 5.998280885610677e-06, + "loss": 0.338, + "step": 1754 + }, + { + "epoch": 1.4781021897810218, + "grad_norm": 0.22493983805179596, + "learning_rate": 5.993476505309154e-06, + "loss": 0.4067, + "step": 1755 + }, + { + "epoch": 1.4789444132509826, + "grad_norm": 0.20303654670715332, + "learning_rate": 5.988671169856056e-06, + "loss": 0.3289, + "step": 1756 + }, + { + "epoch": 1.4797866367209433, + "grad_norm": 0.21149663627147675, + "learning_rate": 5.983864883871344e-06, + "loss": 0.3467, + "step": 1757 + }, + { + "epoch": 1.4806288601909041, + "grad_norm": 0.21716906130313873, + "learning_rate": 5.979057651975893e-06, + "loss": 0.3735, + "step": 1758 + }, + { + "epoch": 1.4814710836608647, + "grad_norm": 0.23328666388988495, + "learning_rate": 5.974249478791489e-06, + "loss": 0.3605, + "step": 1759 + }, + { + "epoch": 1.4823133071308254, + "grad_norm": 0.22171860933303833, + "learning_rate": 5.969440368940823e-06, + "loss": 0.3493, + "step": 1760 + }, + { + "epoch": 1.483155530600786, + "grad_norm": 0.1985427439212799, + "learning_rate": 5.964630327047485e-06, + "loss": 0.3656, + "step": 1761 + }, + { + "epoch": 1.4839977540707467, + "grad_norm": 0.22209234535694122, + "learning_rate": 5.9598193577359606e-06, + "loss": 0.3545, + "step": 1762 + }, + { + "epoch": 1.4848399775407075, + "grad_norm": 0.20604878664016724, + "learning_rate": 5.955007465631632e-06, + "loss": 0.348, + "step": 1763 + }, + { + "epoch": 1.4856822010106683, + "grad_norm": 0.2349865734577179, + "learning_rate": 5.9501946553607615e-06, + "loss": 0.3721, + "step": 1764 + }, + { + "epoch": 1.4865244244806288, + "grad_norm": 0.22401748597621918, + "learning_rate": 5.945380931550497e-06, + "loss": 0.3978, + "step": 1765 + }, + { + "epoch": 1.4873666479505896, + "grad_norm": 0.20538973808288574, + "learning_rate": 5.940566298828871e-06, + "loss": 0.3581, + "step": 1766 + }, + { + "epoch": 1.4882088714205501, + "grad_norm": 0.22036215662956238, + "learning_rate": 5.935750761824777e-06, + "loss": 0.3681, + "step": 1767 + }, + { + "epoch": 1.489051094890511, + "grad_norm": 0.22985632717609406, + "learning_rate": 5.93093432516799e-06, + "loss": 0.349, + "step": 1768 + }, + { + "epoch": 1.4898933183604717, + "grad_norm": 0.21334858238697052, + "learning_rate": 5.926116993489143e-06, + "loss": 0.3491, + "step": 1769 + }, + { + "epoch": 1.4907355418304324, + "grad_norm": 0.2376968264579773, + "learning_rate": 5.921298771419731e-06, + "loss": 0.3549, + "step": 1770 + }, + { + "epoch": 1.491577765300393, + "grad_norm": 0.2066209763288498, + "learning_rate": 5.916479663592107e-06, + "loss": 0.3337, + "step": 1771 + }, + { + "epoch": 1.4924199887703538, + "grad_norm": 0.21085307002067566, + "learning_rate": 5.911659674639473e-06, + "loss": 0.3653, + "step": 1772 + }, + { + "epoch": 1.4932622122403143, + "grad_norm": 0.22900329530239105, + "learning_rate": 5.906838809195879e-06, + "loss": 0.382, + "step": 1773 + }, + { + "epoch": 1.494104435710275, + "grad_norm": 0.2079591453075409, + "learning_rate": 5.90201707189622e-06, + "loss": 0.3536, + "step": 1774 + }, + { + "epoch": 1.4949466591802358, + "grad_norm": 0.21597476303577423, + "learning_rate": 5.897194467376226e-06, + "loss": 0.349, + "step": 1775 + }, + { + "epoch": 1.4957888826501966, + "grad_norm": 0.2699699401855469, + "learning_rate": 5.8923710002724595e-06, + "loss": 0.3775, + "step": 1776 + }, + { + "epoch": 1.4966311061201572, + "grad_norm": 0.2011500895023346, + "learning_rate": 5.887546675222319e-06, + "loss": 0.357, + "step": 1777 + }, + { + "epoch": 1.497473329590118, + "grad_norm": 0.22972598671913147, + "learning_rate": 5.8827214968640215e-06, + "loss": 0.3893, + "step": 1778 + }, + { + "epoch": 1.4983155530600787, + "grad_norm": 0.22463274002075195, + "learning_rate": 5.877895469836604e-06, + "loss": 0.3571, + "step": 1779 + }, + { + "epoch": 1.4991577765300392, + "grad_norm": 0.2096172422170639, + "learning_rate": 5.873068598779926e-06, + "loss": 0.3571, + "step": 1780 + }, + { + "epoch": 1.5, + "grad_norm": 0.21031080186367035, + "learning_rate": 5.8682408883346535e-06, + "loss": 0.3461, + "step": 1781 + }, + { + "epoch": 1.5008422234699608, + "grad_norm": 0.23288646340370178, + "learning_rate": 5.863412343142258e-06, + "loss": 0.356, + "step": 1782 + }, + { + "epoch": 1.5016844469399215, + "grad_norm": 0.23997929692268372, + "learning_rate": 5.858582967845018e-06, + "loss": 0.3703, + "step": 1783 + }, + { + "epoch": 1.502526670409882, + "grad_norm": 0.25117379426956177, + "learning_rate": 5.853752767086007e-06, + "loss": 0.371, + "step": 1784 + }, + { + "epoch": 1.5033688938798426, + "grad_norm": 0.20585308969020844, + "learning_rate": 5.848921745509094e-06, + "loss": 0.3594, + "step": 1785 + }, + { + "epoch": 1.5042111173498034, + "grad_norm": 0.217906191945076, + "learning_rate": 5.844089907758935e-06, + "loss": 0.3606, + "step": 1786 + }, + { + "epoch": 1.5050533408197642, + "grad_norm": 0.2501089870929718, + "learning_rate": 5.839257258480974e-06, + "loss": 0.3863, + "step": 1787 + }, + { + "epoch": 1.505895564289725, + "grad_norm": 0.25611549615859985, + "learning_rate": 5.8344238023214305e-06, + "loss": 0.3325, + "step": 1788 + }, + { + "epoch": 1.5067377877596857, + "grad_norm": 0.26314201951026917, + "learning_rate": 5.829589543927305e-06, + "loss": 0.3971, + "step": 1789 + }, + { + "epoch": 1.5075800112296462, + "grad_norm": 0.19663043320178986, + "learning_rate": 5.824754487946366e-06, + "loss": 0.3366, + "step": 1790 + }, + { + "epoch": 1.508422234699607, + "grad_norm": 0.22584089636802673, + "learning_rate": 5.819918639027149e-06, + "loss": 0.3916, + "step": 1791 + }, + { + "epoch": 1.5092644581695676, + "grad_norm": 0.21082425117492676, + "learning_rate": 5.815082001818951e-06, + "loss": 0.3225, + "step": 1792 + }, + { + "epoch": 1.5101066816395283, + "grad_norm": 0.23728309571743011, + "learning_rate": 5.8102445809718325e-06, + "loss": 0.3816, + "step": 1793 + }, + { + "epoch": 1.510948905109489, + "grad_norm": 0.2071065604686737, + "learning_rate": 5.805406381136598e-06, + "loss": 0.3668, + "step": 1794 + }, + { + "epoch": 1.5117911285794499, + "grad_norm": 0.21563392877578735, + "learning_rate": 5.80056740696481e-06, + "loss": 0.3587, + "step": 1795 + }, + { + "epoch": 1.5126333520494104, + "grad_norm": 0.22941777110099792, + "learning_rate": 5.79572766310877e-06, + "loss": 0.2965, + "step": 1796 + }, + { + "epoch": 1.5134755755193712, + "grad_norm": 0.23965167999267578, + "learning_rate": 5.790887154221521e-06, + "loss": 0.3838, + "step": 1797 + }, + { + "epoch": 1.5143177989893317, + "grad_norm": 0.23982979357242584, + "learning_rate": 5.7860458849568425e-06, + "loss": 0.3777, + "step": 1798 + }, + { + "epoch": 1.5151600224592925, + "grad_norm": 0.20600104331970215, + "learning_rate": 5.781203859969242e-06, + "loss": 0.3523, + "step": 1799 + }, + { + "epoch": 1.5160022459292533, + "grad_norm": 0.24267232418060303, + "learning_rate": 5.776361083913959e-06, + "loss": 0.3497, + "step": 1800 + }, + { + "epoch": 1.516844469399214, + "grad_norm": 0.23190760612487793, + "learning_rate": 5.771517561446949e-06, + "loss": 0.3207, + "step": 1801 + }, + { + "epoch": 1.5176866928691746, + "grad_norm": 0.21379292011260986, + "learning_rate": 5.766673297224889e-06, + "loss": 0.3448, + "step": 1802 + }, + { + "epoch": 1.5185289163391353, + "grad_norm": 0.2329062819480896, + "learning_rate": 5.7618282959051685e-06, + "loss": 0.3896, + "step": 1803 + }, + { + "epoch": 1.5193711398090959, + "grad_norm": 0.21625331044197083, + "learning_rate": 5.756982562145884e-06, + "loss": 0.3653, + "step": 1804 + }, + { + "epoch": 1.5202133632790567, + "grad_norm": 0.20603640377521515, + "learning_rate": 5.75213610060584e-06, + "loss": 0.3589, + "step": 1805 + }, + { + "epoch": 1.5210555867490174, + "grad_norm": 0.20811544358730316, + "learning_rate": 5.747288915944533e-06, + "loss": 0.3272, + "step": 1806 + }, + { + "epoch": 1.5218978102189782, + "grad_norm": 0.21714171767234802, + "learning_rate": 5.742441012822166e-06, + "loss": 0.38, + "step": 1807 + }, + { + "epoch": 1.522740033688939, + "grad_norm": 0.20149259269237518, + "learning_rate": 5.737592395899623e-06, + "loss": 0.3756, + "step": 1808 + }, + { + "epoch": 1.5235822571588995, + "grad_norm": 0.20166918635368347, + "learning_rate": 5.7327430698384775e-06, + "loss": 0.3489, + "step": 1809 + }, + { + "epoch": 1.52442448062886, + "grad_norm": 0.20611999928951263, + "learning_rate": 5.727893039300987e-06, + "loss": 0.3676, + "step": 1810 + }, + { + "epoch": 1.5252667040988208, + "grad_norm": 0.21965749561786652, + "learning_rate": 5.7230423089500845e-06, + "loss": 0.361, + "step": 1811 + }, + { + "epoch": 1.5261089275687816, + "grad_norm": 0.217516228556633, + "learning_rate": 5.718190883449373e-06, + "loss": 0.3783, + "step": 1812 + }, + { + "epoch": 1.5269511510387423, + "grad_norm": 0.19376209378242493, + "learning_rate": 5.713338767463129e-06, + "loss": 0.3281, + "step": 1813 + }, + { + "epoch": 1.5277933745087031, + "grad_norm": 0.22235795855522156, + "learning_rate": 5.708485965656291e-06, + "loss": 0.3736, + "step": 1814 + }, + { + "epoch": 1.5286355979786637, + "grad_norm": 0.2107383906841278, + "learning_rate": 5.703632482694453e-06, + "loss": 0.3482, + "step": 1815 + }, + { + "epoch": 1.5294778214486242, + "grad_norm": 0.24591943621635437, + "learning_rate": 5.698778323243871e-06, + "loss": 0.3923, + "step": 1816 + }, + { + "epoch": 1.530320044918585, + "grad_norm": 0.2085351198911667, + "learning_rate": 5.693923491971445e-06, + "loss": 0.3583, + "step": 1817 + }, + { + "epoch": 1.5311622683885457, + "grad_norm": 0.21462799608707428, + "learning_rate": 5.689067993544726e-06, + "loss": 0.3653, + "step": 1818 + }, + { + "epoch": 1.5320044918585065, + "grad_norm": 0.21209967136383057, + "learning_rate": 5.6842118326318996e-06, + "loss": 0.385, + "step": 1819 + }, + { + "epoch": 1.5328467153284673, + "grad_norm": 0.24777087569236755, + "learning_rate": 5.679355013901797e-06, + "loss": 0.3857, + "step": 1820 + }, + { + "epoch": 1.5336889387984278, + "grad_norm": 0.2122153341770172, + "learning_rate": 5.674497542023875e-06, + "loss": 0.3491, + "step": 1821 + }, + { + "epoch": 1.5345311622683886, + "grad_norm": 0.2072700560092926, + "learning_rate": 5.669639421668221e-06, + "loss": 0.3753, + "step": 1822 + }, + { + "epoch": 1.5353733857383491, + "grad_norm": 0.20608697831630707, + "learning_rate": 5.664780657505547e-06, + "loss": 0.3453, + "step": 1823 + }, + { + "epoch": 1.53621560920831, + "grad_norm": 0.23337386548519135, + "learning_rate": 5.659921254207183e-06, + "loss": 0.3751, + "step": 1824 + }, + { + "epoch": 1.5370578326782707, + "grad_norm": 0.18582665920257568, + "learning_rate": 5.65506121644507e-06, + "loss": 0.3205, + "step": 1825 + }, + { + "epoch": 1.5379000561482314, + "grad_norm": 0.2243811935186386, + "learning_rate": 5.650200548891764e-06, + "loss": 0.3701, + "step": 1826 + }, + { + "epoch": 1.538742279618192, + "grad_norm": 0.1962759792804718, + "learning_rate": 5.645339256220427e-06, + "loss": 0.3491, + "step": 1827 + }, + { + "epoch": 1.5395845030881528, + "grad_norm": 0.20144011080265045, + "learning_rate": 5.640477343104815e-06, + "loss": 0.3763, + "step": 1828 + }, + { + "epoch": 1.5404267265581133, + "grad_norm": 0.19987617433071136, + "learning_rate": 5.635614814219289e-06, + "loss": 0.3559, + "step": 1829 + }, + { + "epoch": 1.541268950028074, + "grad_norm": 0.21206045150756836, + "learning_rate": 5.630751674238796e-06, + "loss": 0.3374, + "step": 1830 + }, + { + "epoch": 1.5421111734980348, + "grad_norm": 0.22243531048297882, + "learning_rate": 5.625887927838872e-06, + "loss": 0.3783, + "step": 1831 + }, + { + "epoch": 1.5429533969679956, + "grad_norm": 0.20843222737312317, + "learning_rate": 5.6210235796956395e-06, + "loss": 0.3295, + "step": 1832 + }, + { + "epoch": 1.5437956204379562, + "grad_norm": 0.2444627583026886, + "learning_rate": 5.616158634485793e-06, + "loss": 0.3971, + "step": 1833 + }, + { + "epoch": 1.544637843907917, + "grad_norm": 0.23421430587768555, + "learning_rate": 5.61129309688661e-06, + "loss": 0.349, + "step": 1834 + }, + { + "epoch": 1.5454800673778775, + "grad_norm": 0.2181040346622467, + "learning_rate": 5.606426971575926e-06, + "loss": 0.387, + "step": 1835 + }, + { + "epoch": 1.5463222908478382, + "grad_norm": 0.18999342620372772, + "learning_rate": 5.601560263232153e-06, + "loss": 0.355, + "step": 1836 + }, + { + "epoch": 1.547164514317799, + "grad_norm": 0.21678003668785095, + "learning_rate": 5.596692976534256e-06, + "loss": 0.3697, + "step": 1837 + }, + { + "epoch": 1.5480067377877598, + "grad_norm": 0.23101647198200226, + "learning_rate": 5.591825116161758e-06, + "loss": 0.3819, + "step": 1838 + }, + { + "epoch": 1.5488489612577205, + "grad_norm": 0.2279675304889679, + "learning_rate": 5.5869566867947344e-06, + "loss": 0.3661, + "step": 1839 + }, + { + "epoch": 1.549691184727681, + "grad_norm": 0.20913884043693542, + "learning_rate": 5.582087693113808e-06, + "loss": 0.3679, + "step": 1840 + }, + { + "epoch": 1.5505334081976416, + "grad_norm": 0.21088729798793793, + "learning_rate": 5.577218139800143e-06, + "loss": 0.3428, + "step": 1841 + }, + { + "epoch": 1.5513756316676024, + "grad_norm": 0.2020246833562851, + "learning_rate": 5.572348031535442e-06, + "loss": 0.3491, + "step": 1842 + }, + { + "epoch": 1.5522178551375632, + "grad_norm": 0.21002264320850372, + "learning_rate": 5.567477373001942e-06, + "loss": 0.3529, + "step": 1843 + }, + { + "epoch": 1.553060078607524, + "grad_norm": 0.2384791225194931, + "learning_rate": 5.562606168882404e-06, + "loss": 0.3819, + "step": 1844 + }, + { + "epoch": 1.5539023020774847, + "grad_norm": 0.2081071436405182, + "learning_rate": 5.557734423860122e-06, + "loss": 0.3635, + "step": 1845 + }, + { + "epoch": 1.5547445255474452, + "grad_norm": 0.1915816217660904, + "learning_rate": 5.552862142618906e-06, + "loss": 0.3083, + "step": 1846 + }, + { + "epoch": 1.5555867490174058, + "grad_norm": 0.20684143900871277, + "learning_rate": 5.547989329843079e-06, + "loss": 0.3731, + "step": 1847 + }, + { + "epoch": 1.5564289724873666, + "grad_norm": 0.23124712705612183, + "learning_rate": 5.543115990217478e-06, + "loss": 0.3839, + "step": 1848 + }, + { + "epoch": 1.5572711959573273, + "grad_norm": 0.25232207775115967, + "learning_rate": 5.538242128427444e-06, + "loss": 0.3614, + "step": 1849 + }, + { + "epoch": 1.558113419427288, + "grad_norm": 0.21806760132312775, + "learning_rate": 5.533367749158829e-06, + "loss": 0.3809, + "step": 1850 + }, + { + "epoch": 1.5589556428972489, + "grad_norm": 0.21748806536197662, + "learning_rate": 5.528492857097966e-06, + "loss": 0.3599, + "step": 1851 + }, + { + "epoch": 1.5597978663672094, + "grad_norm": 0.229938343167305, + "learning_rate": 5.523617456931696e-06, + "loss": 0.3265, + "step": 1852 + }, + { + "epoch": 1.5606400898371702, + "grad_norm": 0.21047593653202057, + "learning_rate": 5.518741553347341e-06, + "loss": 0.3902, + "step": 1853 + }, + { + "epoch": 1.5614823133071307, + "grad_norm": 0.2067413330078125, + "learning_rate": 5.513865151032709e-06, + "loss": 0.3387, + "step": 1854 + }, + { + "epoch": 1.5623245367770915, + "grad_norm": 0.20057551562786102, + "learning_rate": 5.508988254676087e-06, + "loss": 0.3508, + "step": 1855 + }, + { + "epoch": 1.5631667602470523, + "grad_norm": 0.20652174949645996, + "learning_rate": 5.504110868966239e-06, + "loss": 0.3392, + "step": 1856 + }, + { + "epoch": 1.564008983717013, + "grad_norm": 0.200935959815979, + "learning_rate": 5.499232998592399e-06, + "loss": 0.3447, + "step": 1857 + }, + { + "epoch": 1.5648512071869736, + "grad_norm": 0.20910383760929108, + "learning_rate": 5.49435464824426e-06, + "loss": 0.3653, + "step": 1858 + }, + { + "epoch": 1.5656934306569343, + "grad_norm": 0.1886066496372223, + "learning_rate": 5.489475822611988e-06, + "loss": 0.3388, + "step": 1859 + }, + { + "epoch": 1.5665356541268949, + "grad_norm": 0.21101483702659607, + "learning_rate": 5.484596526386198e-06, + "loss": 0.3786, + "step": 1860 + }, + { + "epoch": 1.5673778775968557, + "grad_norm": 0.2070988267660141, + "learning_rate": 5.479716764257961e-06, + "loss": 0.3624, + "step": 1861 + }, + { + "epoch": 1.5682201010668164, + "grad_norm": 0.20134082436561584, + "learning_rate": 5.474836540918791e-06, + "loss": 0.3558, + "step": 1862 + }, + { + "epoch": 1.5690623245367772, + "grad_norm": 0.20253299176692963, + "learning_rate": 5.469955861060653e-06, + "loss": 0.3575, + "step": 1863 + }, + { + "epoch": 1.5699045480067377, + "grad_norm": 0.19125215709209442, + "learning_rate": 5.465074729375944e-06, + "loss": 0.3635, + "step": 1864 + }, + { + "epoch": 1.5707467714766985, + "grad_norm": 0.21241691708564758, + "learning_rate": 5.4601931505575e-06, + "loss": 0.3698, + "step": 1865 + }, + { + "epoch": 1.571588994946659, + "grad_norm": 0.1975466012954712, + "learning_rate": 5.455311129298586e-06, + "loss": 0.3218, + "step": 1866 + }, + { + "epoch": 1.5724312184166198, + "grad_norm": 0.21139660477638245, + "learning_rate": 5.450428670292889e-06, + "loss": 0.3605, + "step": 1867 + }, + { + "epoch": 1.5732734418865806, + "grad_norm": 0.22668103873729706, + "learning_rate": 5.445545778234522e-06, + "loss": 0.3883, + "step": 1868 + }, + { + "epoch": 1.5741156653565413, + "grad_norm": 0.19831670820713043, + "learning_rate": 5.44066245781801e-06, + "loss": 0.3758, + "step": 1869 + }, + { + "epoch": 1.5749578888265021, + "grad_norm": 0.2144181728363037, + "learning_rate": 5.435778713738292e-06, + "loss": 0.3292, + "step": 1870 + }, + { + "epoch": 1.5758001122964627, + "grad_norm": 0.20668131113052368, + "learning_rate": 5.430894550690714e-06, + "loss": 0.3601, + "step": 1871 + }, + { + "epoch": 1.5766423357664232, + "grad_norm": 0.1921137273311615, + "learning_rate": 5.426009973371026e-06, + "loss": 0.3536, + "step": 1872 + }, + { + "epoch": 1.577484559236384, + "grad_norm": 0.1983177214860916, + "learning_rate": 5.421124986475371e-06, + "loss": 0.3605, + "step": 1873 + }, + { + "epoch": 1.5783267827063447, + "grad_norm": 0.211525559425354, + "learning_rate": 5.416239594700294e-06, + "loss": 0.3763, + "step": 1874 + }, + { + "epoch": 1.5791690061763055, + "grad_norm": 0.2196986824274063, + "learning_rate": 5.4113538027427245e-06, + "loss": 0.387, + "step": 1875 + }, + { + "epoch": 1.5800112296462663, + "grad_norm": 0.18906791508197784, + "learning_rate": 5.4064676152999765e-06, + "loss": 0.3233, + "step": 1876 + }, + { + "epoch": 1.5808534531162268, + "grad_norm": 0.1828877031803131, + "learning_rate": 5.4015810370697445e-06, + "loss": 0.3364, + "step": 1877 + }, + { + "epoch": 1.5816956765861874, + "grad_norm": 0.23177126049995422, + "learning_rate": 5.396694072750099e-06, + "loss": 0.3752, + "step": 1878 + }, + { + "epoch": 1.5825379000561481, + "grad_norm": 0.22009316086769104, + "learning_rate": 5.391806727039484e-06, + "loss": 0.3812, + "step": 1879 + }, + { + "epoch": 1.583380123526109, + "grad_norm": 0.19118615984916687, + "learning_rate": 5.386919004636706e-06, + "loss": 0.3442, + "step": 1880 + }, + { + "epoch": 1.5842223469960697, + "grad_norm": 0.20579873025417328, + "learning_rate": 5.382030910240936e-06, + "loss": 0.3754, + "step": 1881 + }, + { + "epoch": 1.5850645704660304, + "grad_norm": 0.18276536464691162, + "learning_rate": 5.3771424485517034e-06, + "loss": 0.3288, + "step": 1882 + }, + { + "epoch": 1.585906793935991, + "grad_norm": 0.23342770338058472, + "learning_rate": 5.3722536242688895e-06, + "loss": 0.342, + "step": 1883 + }, + { + "epoch": 1.5867490174059518, + "grad_norm": 0.20647521317005157, + "learning_rate": 5.367364442092724e-06, + "loss": 0.3435, + "step": 1884 + }, + { + "epoch": 1.5875912408759123, + "grad_norm": 0.2187911421060562, + "learning_rate": 5.362474906723781e-06, + "loss": 0.3579, + "step": 1885 + }, + { + "epoch": 1.588433464345873, + "grad_norm": 0.1964806765317917, + "learning_rate": 5.357585022862977e-06, + "loss": 0.38, + "step": 1886 + }, + { + "epoch": 1.5892756878158338, + "grad_norm": 0.21945969760417938, + "learning_rate": 5.352694795211555e-06, + "loss": 0.3737, + "step": 1887 + }, + { + "epoch": 1.5901179112857946, + "grad_norm": 0.22322909533977509, + "learning_rate": 5.347804228471101e-06, + "loss": 0.3511, + "step": 1888 + }, + { + "epoch": 1.5909601347557552, + "grad_norm": 0.1914677619934082, + "learning_rate": 5.342913327343515e-06, + "loss": 0.3429, + "step": 1889 + }, + { + "epoch": 1.591802358225716, + "grad_norm": 1.419649362564087, + "learning_rate": 5.338022096531028e-06, + "loss": 0.3701, + "step": 1890 + }, + { + "epoch": 1.5926445816956765, + "grad_norm": 0.20444141328334808, + "learning_rate": 5.33313054073618e-06, + "loss": 0.3279, + "step": 1891 + }, + { + "epoch": 1.5934868051656372, + "grad_norm": 0.23807506263256073, + "learning_rate": 5.32823866466183e-06, + "loss": 0.4132, + "step": 1892 + }, + { + "epoch": 1.594329028635598, + "grad_norm": 0.2279578149318695, + "learning_rate": 5.3233464730111426e-06, + "loss": 0.3564, + "step": 1893 + }, + { + "epoch": 1.5951712521055588, + "grad_norm": 0.19392019510269165, + "learning_rate": 5.318453970487582e-06, + "loss": 0.3329, + "step": 1894 + }, + { + "epoch": 1.5960134755755195, + "grad_norm": 0.2214561104774475, + "learning_rate": 5.31356116179492e-06, + "loss": 0.3618, + "step": 1895 + }, + { + "epoch": 1.59685569904548, + "grad_norm": 0.22227728366851807, + "learning_rate": 5.308668051637213e-06, + "loss": 0.3457, + "step": 1896 + }, + { + "epoch": 1.5976979225154406, + "grad_norm": 0.22292588651180267, + "learning_rate": 5.303774644718813e-06, + "loss": 0.3571, + "step": 1897 + }, + { + "epoch": 1.5985401459854014, + "grad_norm": 0.1955156922340393, + "learning_rate": 5.298880945744356e-06, + "loss": 0.3253, + "step": 1898 + }, + { + "epoch": 1.5993823694553622, + "grad_norm": 0.21948570013046265, + "learning_rate": 5.29398695941876e-06, + "loss": 0.3733, + "step": 1899 + }, + { + "epoch": 1.600224592925323, + "grad_norm": 0.22376330196857452, + "learning_rate": 5.289092690447215e-06, + "loss": 0.3589, + "step": 1900 + }, + { + "epoch": 1.6010668163952837, + "grad_norm": 0.20499028265476227, + "learning_rate": 5.284198143535188e-06, + "loss": 0.3581, + "step": 1901 + }, + { + "epoch": 1.6019090398652442, + "grad_norm": 0.20056955516338348, + "learning_rate": 5.279303323388413e-06, + "loss": 0.3406, + "step": 1902 + }, + { + "epoch": 1.6027512633352048, + "grad_norm": 0.2071678787469864, + "learning_rate": 5.274408234712881e-06, + "loss": 0.3567, + "step": 1903 + }, + { + "epoch": 1.6035934868051656, + "grad_norm": 0.20119628310203552, + "learning_rate": 5.2695128822148466e-06, + "loss": 0.337, + "step": 1904 + }, + { + "epoch": 1.6044357102751263, + "grad_norm": 0.2180289775133133, + "learning_rate": 5.2646172706008154e-06, + "loss": 0.4007, + "step": 1905 + }, + { + "epoch": 1.605277933745087, + "grad_norm": 0.1883217990398407, + "learning_rate": 5.259721404577546e-06, + "loss": 0.3784, + "step": 1906 + }, + { + "epoch": 1.6061201572150479, + "grad_norm": 0.21434904634952545, + "learning_rate": 5.254825288852033e-06, + "loss": 0.3512, + "step": 1907 + }, + { + "epoch": 1.6069623806850084, + "grad_norm": 0.22084085643291473, + "learning_rate": 5.249928928131523e-06, + "loss": 0.3534, + "step": 1908 + }, + { + "epoch": 1.607804604154969, + "grad_norm": 0.21422971785068512, + "learning_rate": 5.245032327123488e-06, + "loss": 0.3967, + "step": 1909 + }, + { + "epoch": 1.6086468276249297, + "grad_norm": 0.21373017132282257, + "learning_rate": 5.240135490535635e-06, + "loss": 0.3509, + "step": 1910 + }, + { + "epoch": 1.6094890510948905, + "grad_norm": 0.19849702715873718, + "learning_rate": 5.235238423075899e-06, + "loss": 0.3481, + "step": 1911 + }, + { + "epoch": 1.6103312745648513, + "grad_norm": 0.20527733862400055, + "learning_rate": 5.230341129452434e-06, + "loss": 0.3659, + "step": 1912 + }, + { + "epoch": 1.611173498034812, + "grad_norm": 0.21651127934455872, + "learning_rate": 5.225443614373614e-06, + "loss": 0.3777, + "step": 1913 + }, + { + "epoch": 1.6120157215047726, + "grad_norm": 0.22285105288028717, + "learning_rate": 5.220545882548024e-06, + "loss": 0.3802, + "step": 1914 + }, + { + "epoch": 1.6128579449747333, + "grad_norm": 0.22284671664237976, + "learning_rate": 5.215647938684458e-06, + "loss": 0.3693, + "step": 1915 + }, + { + "epoch": 1.6137001684446939, + "grad_norm": 0.18833483755588531, + "learning_rate": 5.210749787491913e-06, + "loss": 0.3456, + "step": 1916 + }, + { + "epoch": 1.6145423919146547, + "grad_norm": 0.19601304829120636, + "learning_rate": 5.20585143367959e-06, + "loss": 0.3472, + "step": 1917 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 0.21516947448253632, + "learning_rate": 5.200952881956875e-06, + "loss": 0.3466, + "step": 1918 + }, + { + "epoch": 1.6162268388545762, + "grad_norm": 0.1969214677810669, + "learning_rate": 5.196054137033354e-06, + "loss": 0.3382, + "step": 1919 + }, + { + "epoch": 1.6170690623245367, + "grad_norm": 0.19827716052532196, + "learning_rate": 5.191155203618796e-06, + "loss": 0.367, + "step": 1920 + }, + { + "epoch": 1.6179112857944975, + "grad_norm": 0.19620153307914734, + "learning_rate": 5.186256086423148e-06, + "loss": 0.3505, + "step": 1921 + }, + { + "epoch": 1.618753509264458, + "grad_norm": 0.22013582289218903, + "learning_rate": 5.181356790156539e-06, + "loss": 0.3871, + "step": 1922 + }, + { + "epoch": 1.6195957327344188, + "grad_norm": 0.19816164672374725, + "learning_rate": 5.176457319529264e-06, + "loss": 0.3493, + "step": 1923 + }, + { + "epoch": 1.6204379562043796, + "grad_norm": 0.1959572434425354, + "learning_rate": 5.171557679251788e-06, + "loss": 0.3681, + "step": 1924 + }, + { + "epoch": 1.6212801796743403, + "grad_norm": 0.21222509443759918, + "learning_rate": 5.166657874034745e-06, + "loss": 0.371, + "step": 1925 + }, + { + "epoch": 1.6221224031443011, + "grad_norm": 0.19689340889453888, + "learning_rate": 5.161757908588917e-06, + "loss": 0.3541, + "step": 1926 + }, + { + "epoch": 1.6229646266142617, + "grad_norm": 0.19198477268218994, + "learning_rate": 5.156857787625249e-06, + "loss": 0.3305, + "step": 1927 + }, + { + "epoch": 1.6238068500842222, + "grad_norm": 0.24948351085186005, + "learning_rate": 5.15195751585483e-06, + "loss": 0.3904, + "step": 1928 + }, + { + "epoch": 1.624649073554183, + "grad_norm": 0.20237040519714355, + "learning_rate": 5.147057097988898e-06, + "loss": 0.3091, + "step": 1929 + }, + { + "epoch": 1.6254912970241437, + "grad_norm": 0.20857195556163788, + "learning_rate": 5.142156538738827e-06, + "loss": 0.3853, + "step": 1930 + }, + { + "epoch": 1.6263335204941045, + "grad_norm": 0.2776411175727844, + "learning_rate": 5.137255842816132e-06, + "loss": 0.3281, + "step": 1931 + }, + { + "epoch": 1.6271757439640653, + "grad_norm": 0.191249817609787, + "learning_rate": 5.132355014932455e-06, + "loss": 0.3557, + "step": 1932 + }, + { + "epoch": 1.6280179674340258, + "grad_norm": 0.19988055527210236, + "learning_rate": 5.127454059799567e-06, + "loss": 0.3846, + "step": 1933 + }, + { + "epoch": 1.6288601909039864, + "grad_norm": 0.22994700074195862, + "learning_rate": 5.122552982129362e-06, + "loss": 0.3628, + "step": 1934 + }, + { + "epoch": 1.6297024143739471, + "grad_norm": 0.20393429696559906, + "learning_rate": 5.1176517866338495e-06, + "loss": 0.329, + "step": 1935 + }, + { + "epoch": 1.630544637843908, + "grad_norm": 0.19987967610359192, + "learning_rate": 5.112750478025156e-06, + "loss": 0.348, + "step": 1936 + }, + { + "epoch": 1.6313868613138687, + "grad_norm": 0.1870167851448059, + "learning_rate": 5.1078490610155105e-06, + "loss": 0.3124, + "step": 1937 + }, + { + "epoch": 1.6322290847838294, + "grad_norm": 0.20360225439071655, + "learning_rate": 5.102947540317254e-06, + "loss": 0.3941, + "step": 1938 + }, + { + "epoch": 1.63307130825379, + "grad_norm": 0.1947534829378128, + "learning_rate": 5.09804592064282e-06, + "loss": 0.3386, + "step": 1939 + }, + { + "epoch": 1.6339135317237508, + "grad_norm": 0.21810080111026764, + "learning_rate": 5.093144206704743e-06, + "loss": 0.3688, + "step": 1940 + }, + { + "epoch": 1.6347557551937113, + "grad_norm": 0.21921783685684204, + "learning_rate": 5.088242403215644e-06, + "loss": 0.3516, + "step": 1941 + }, + { + "epoch": 1.635597978663672, + "grad_norm": 0.19858203828334808, + "learning_rate": 5.083340514888232e-06, + "loss": 0.3498, + "step": 1942 + }, + { + "epoch": 1.6364402021336328, + "grad_norm": 0.2206740379333496, + "learning_rate": 5.078438546435298e-06, + "loss": 0.4053, + "step": 1943 + }, + { + "epoch": 1.6372824256035936, + "grad_norm": 0.193548783659935, + "learning_rate": 5.073536502569708e-06, + "loss": 0.3109, + "step": 1944 + }, + { + "epoch": 1.6381246490735542, + "grad_norm": 0.22453932464122772, + "learning_rate": 5.0686343880044044e-06, + "loss": 0.3971, + "step": 1945 + }, + { + "epoch": 1.638966872543515, + "grad_norm": 0.22631575167179108, + "learning_rate": 5.063732207452391e-06, + "loss": 0.3603, + "step": 1946 + }, + { + "epoch": 1.6398090960134755, + "grad_norm": 0.21678605675697327, + "learning_rate": 5.058829965626742e-06, + "loss": 0.3326, + "step": 1947 + }, + { + "epoch": 1.6406513194834362, + "grad_norm": 0.21846479177474976, + "learning_rate": 5.053927667240585e-06, + "loss": 0.3264, + "step": 1948 + }, + { + "epoch": 1.641493542953397, + "grad_norm": 0.21924428641796112, + "learning_rate": 5.049025317007108e-06, + "loss": 0.3638, + "step": 1949 + }, + { + "epoch": 1.6423357664233578, + "grad_norm": 0.21155060827732086, + "learning_rate": 5.0441229196395416e-06, + "loss": 0.374, + "step": 1950 + }, + { + "epoch": 1.6431779898933183, + "grad_norm": 0.21062292158603668, + "learning_rate": 5.039220479851167e-06, + "loss": 0.3419, + "step": 1951 + }, + { + "epoch": 1.644020213363279, + "grad_norm": 0.2159111499786377, + "learning_rate": 5.034318002355305e-06, + "loss": 0.3643, + "step": 1952 + }, + { + "epoch": 1.6448624368332396, + "grad_norm": 0.21283845603466034, + "learning_rate": 5.029415491865311e-06, + "loss": 0.3666, + "step": 1953 + }, + { + "epoch": 1.6457046603032004, + "grad_norm": 0.19281776249408722, + "learning_rate": 5.024512953094577e-06, + "loss": 0.3391, + "step": 1954 + }, + { + "epoch": 1.6465468837731612, + "grad_norm": 0.21010302007198334, + "learning_rate": 5.019610390756513e-06, + "loss": 0.3065, + "step": 1955 + }, + { + "epoch": 1.647389107243122, + "grad_norm": 0.21600353717803955, + "learning_rate": 5.014707809564562e-06, + "loss": 0.3885, + "step": 1956 + }, + { + "epoch": 1.6482313307130827, + "grad_norm": 0.19259744882583618, + "learning_rate": 5.009805214232177e-06, + "loss": 0.3272, + "step": 1957 + }, + { + "epoch": 1.6490735541830432, + "grad_norm": 0.2523140609264374, + "learning_rate": 5.004902609472831e-06, + "loss": 0.3671, + "step": 1958 + }, + { + "epoch": 1.6499157776530038, + "grad_norm": 0.1950521618127823, + "learning_rate": 5e-06, + "loss": 0.3745, + "step": 1959 + }, + { + "epoch": 1.6507580011229646, + "grad_norm": 0.21287277340888977, + "learning_rate": 4.995097390527171e-06, + "loss": 0.3727, + "step": 1960 + }, + { + "epoch": 1.6516002245929253, + "grad_norm": 0.2051495909690857, + "learning_rate": 4.990194785767824e-06, + "loss": 0.3648, + "step": 1961 + }, + { + "epoch": 1.652442448062886, + "grad_norm": 0.20224712789058685, + "learning_rate": 4.98529219043544e-06, + "loss": 0.3372, + "step": 1962 + }, + { + "epoch": 1.6532846715328469, + "grad_norm": 0.21337299048900604, + "learning_rate": 4.980389609243488e-06, + "loss": 0.3896, + "step": 1963 + }, + { + "epoch": 1.6541268950028074, + "grad_norm": 0.20423704385757446, + "learning_rate": 4.975487046905426e-06, + "loss": 0.3244, + "step": 1964 + }, + { + "epoch": 1.654969118472768, + "grad_norm": 0.22124212980270386, + "learning_rate": 4.97058450813469e-06, + "loss": 0.3616, + "step": 1965 + }, + { + "epoch": 1.6558113419427287, + "grad_norm": 0.20289303362369537, + "learning_rate": 4.9656819976446975e-06, + "loss": 0.3701, + "step": 1966 + }, + { + "epoch": 1.6566535654126895, + "grad_norm": 0.2149975448846817, + "learning_rate": 4.960779520148835e-06, + "loss": 0.3912, + "step": 1967 + }, + { + "epoch": 1.6574957888826503, + "grad_norm": 0.18951010704040527, + "learning_rate": 4.955877080360462e-06, + "loss": 0.3175, + "step": 1968 + }, + { + "epoch": 1.658338012352611, + "grad_norm": 0.22500726580619812, + "learning_rate": 4.950974682992894e-06, + "loss": 0.4012, + "step": 1969 + }, + { + "epoch": 1.6591802358225716, + "grad_norm": 0.19183504581451416, + "learning_rate": 4.9460723327594175e-06, + "loss": 0.3442, + "step": 1970 + }, + { + "epoch": 1.6600224592925323, + "grad_norm": 0.18563278019428253, + "learning_rate": 4.94117003437326e-06, + "loss": 0.345, + "step": 1971 + }, + { + "epoch": 1.6608646827624929, + "grad_norm": 0.2061086893081665, + "learning_rate": 4.9362677925476124e-06, + "loss": 0.3471, + "step": 1972 + }, + { + "epoch": 1.6617069062324537, + "grad_norm": 0.21592488884925842, + "learning_rate": 4.931365611995598e-06, + "loss": 0.374, + "step": 1973 + }, + { + "epoch": 1.6625491297024144, + "grad_norm": 0.19126929342746735, + "learning_rate": 4.926463497430293e-06, + "loss": 0.3394, + "step": 1974 + }, + { + "epoch": 1.6633913531723752, + "grad_norm": 0.2073042094707489, + "learning_rate": 4.921561453564704e-06, + "loss": 0.3551, + "step": 1975 + }, + { + "epoch": 1.6642335766423357, + "grad_norm": 0.20550045371055603, + "learning_rate": 4.9166594851117696e-06, + "loss": 0.349, + "step": 1976 + }, + { + "epoch": 1.6650758001122965, + "grad_norm": 0.23410096764564514, + "learning_rate": 4.911757596784358e-06, + "loss": 0.375, + "step": 1977 + }, + { + "epoch": 1.665918023582257, + "grad_norm": 0.19714966416358948, + "learning_rate": 4.906855793295259e-06, + "loss": 0.3579, + "step": 1978 + }, + { + "epoch": 1.6667602470522178, + "grad_norm": 0.21757762134075165, + "learning_rate": 4.901954079357182e-06, + "loss": 0.2941, + "step": 1979 + }, + { + "epoch": 1.6676024705221786, + "grad_norm": 0.2046147584915161, + "learning_rate": 4.897052459682749e-06, + "loss": 0.3857, + "step": 1980 + }, + { + "epoch": 1.6684446939921393, + "grad_norm": 0.21447475254535675, + "learning_rate": 4.892150938984491e-06, + "loss": 0.3776, + "step": 1981 + }, + { + "epoch": 1.6692869174621, + "grad_norm": 0.2030257284641266, + "learning_rate": 4.887249521974848e-06, + "loss": 0.3783, + "step": 1982 + }, + { + "epoch": 1.6701291409320607, + "grad_norm": 0.1929410845041275, + "learning_rate": 4.882348213366152e-06, + "loss": 0.3402, + "step": 1983 + }, + { + "epoch": 1.6709713644020212, + "grad_norm": 0.2044433206319809, + "learning_rate": 4.8774470178706405e-06, + "loss": 0.3749, + "step": 1984 + }, + { + "epoch": 1.671813587871982, + "grad_norm": 0.18149399757385254, + "learning_rate": 4.872545940200435e-06, + "loss": 0.3396, + "step": 1985 + }, + { + "epoch": 1.6726558113419427, + "grad_norm": 0.22244606912136078, + "learning_rate": 4.867644985067548e-06, + "loss": 0.3704, + "step": 1986 + }, + { + "epoch": 1.6734980348119035, + "grad_norm": 0.2242031842470169, + "learning_rate": 4.862744157183869e-06, + "loss": 0.379, + "step": 1987 + }, + { + "epoch": 1.6743402582818643, + "grad_norm": 0.22645780444145203, + "learning_rate": 4.857843461261176e-06, + "loss": 0.3882, + "step": 1988 + }, + { + "epoch": 1.6751824817518248, + "grad_norm": 0.18309590220451355, + "learning_rate": 4.8529429020111035e-06, + "loss": 0.3464, + "step": 1989 + }, + { + "epoch": 1.6760247052217854, + "grad_norm": 0.19142934679985046, + "learning_rate": 4.8480424841451725e-06, + "loss": 0.3794, + "step": 1990 + }, + { + "epoch": 1.6768669286917461, + "grad_norm": 0.2454836666584015, + "learning_rate": 4.8431422123747524e-06, + "loss": 0.3674, + "step": 1991 + }, + { + "epoch": 1.677709152161707, + "grad_norm": 0.20532433688640594, + "learning_rate": 4.838242091411085e-06, + "loss": 0.3232, + "step": 1992 + }, + { + "epoch": 1.6785513756316677, + "grad_norm": 0.21484729647636414, + "learning_rate": 4.833342125965257e-06, + "loss": 0.3612, + "step": 1993 + }, + { + "epoch": 1.6793935991016284, + "grad_norm": 0.20386093854904175, + "learning_rate": 4.828442320748213e-06, + "loss": 0.3322, + "step": 1994 + }, + { + "epoch": 1.680235822571589, + "grad_norm": 0.20775291323661804, + "learning_rate": 4.823542680470738e-06, + "loss": 0.329, + "step": 1995 + }, + { + "epoch": 1.6810780460415495, + "grad_norm": 0.2114599198102951, + "learning_rate": 4.818643209843463e-06, + "loss": 0.3909, + "step": 1996 + }, + { + "epoch": 1.6819202695115103, + "grad_norm": 0.19673332571983337, + "learning_rate": 4.813743913576852e-06, + "loss": 0.3436, + "step": 1997 + }, + { + "epoch": 1.682762492981471, + "grad_norm": 0.2197994440793991, + "learning_rate": 4.808844796381205e-06, + "loss": 0.3784, + "step": 1998 + }, + { + "epoch": 1.6836047164514318, + "grad_norm": 0.18247802555561066, + "learning_rate": 4.803945862966646e-06, + "loss": 0.3212, + "step": 1999 + }, + { + "epoch": 1.6844469399213926, + "grad_norm": 0.20470093190670013, + "learning_rate": 4.799047118043126e-06, + "loss": 0.3754, + "step": 2000 + }, + { + "epoch": 1.6852891633913532, + "grad_norm": 0.19941620528697968, + "learning_rate": 4.794148566320412e-06, + "loss": 0.3699, + "step": 2001 + }, + { + "epoch": 1.686131386861314, + "grad_norm": 0.18958649039268494, + "learning_rate": 4.789250212508088e-06, + "loss": 0.3426, + "step": 2002 + }, + { + "epoch": 1.6869736103312745, + "grad_norm": 0.2122664898633957, + "learning_rate": 4.7843520613155434e-06, + "loss": 0.3775, + "step": 2003 + }, + { + "epoch": 1.6878158338012352, + "grad_norm": 0.198108971118927, + "learning_rate": 4.779454117451978e-06, + "loss": 0.3637, + "step": 2004 + }, + { + "epoch": 1.688658057271196, + "grad_norm": 0.1983402669429779, + "learning_rate": 4.774556385626386e-06, + "loss": 0.3666, + "step": 2005 + }, + { + "epoch": 1.6895002807411568, + "grad_norm": 0.1770869344472885, + "learning_rate": 4.769658870547567e-06, + "loss": 0.3214, + "step": 2006 + }, + { + "epoch": 1.6903425042111173, + "grad_norm": 0.19760024547576904, + "learning_rate": 4.7647615769241e-06, + "loss": 0.3505, + "step": 2007 + }, + { + "epoch": 1.691184727681078, + "grad_norm": 0.1891362965106964, + "learning_rate": 4.759864509464366e-06, + "loss": 0.344, + "step": 2008 + }, + { + "epoch": 1.6920269511510386, + "grad_norm": 0.19179244339466095, + "learning_rate": 4.754967672876513e-06, + "loss": 0.3583, + "step": 2009 + }, + { + "epoch": 1.6928691746209994, + "grad_norm": 0.21558877825737, + "learning_rate": 4.750071071868478e-06, + "loss": 0.3713, + "step": 2010 + }, + { + "epoch": 1.6937113980909602, + "grad_norm": 0.20262889564037323, + "learning_rate": 4.745174711147967e-06, + "loss": 0.357, + "step": 2011 + }, + { + "epoch": 1.694553621560921, + "grad_norm": 0.21164673566818237, + "learning_rate": 4.7402785954224565e-06, + "loss": 0.37, + "step": 2012 + }, + { + "epoch": 1.6953958450308815, + "grad_norm": 0.20908448100090027, + "learning_rate": 4.7353827293991845e-06, + "loss": 0.368, + "step": 2013 + }, + { + "epoch": 1.6962380685008422, + "grad_norm": 0.2034013271331787, + "learning_rate": 4.730487117785155e-06, + "loss": 0.3431, + "step": 2014 + }, + { + "epoch": 1.6970802919708028, + "grad_norm": 0.22839775681495667, + "learning_rate": 4.725591765287119e-06, + "loss": 0.3753, + "step": 2015 + }, + { + "epoch": 1.6979225154407636, + "grad_norm": 0.22359232604503632, + "learning_rate": 4.720696676611589e-06, + "loss": 0.3214, + "step": 2016 + }, + { + "epoch": 1.6987647389107243, + "grad_norm": 0.22633424401283264, + "learning_rate": 4.715801856464812e-06, + "loss": 0.3904, + "step": 2017 + }, + { + "epoch": 1.699606962380685, + "grad_norm": 0.20711787045001984, + "learning_rate": 4.710907309552787e-06, + "loss": 0.3368, + "step": 2018 + }, + { + "epoch": 1.7004491858506459, + "grad_norm": 0.23692451417446136, + "learning_rate": 4.706013040581242e-06, + "loss": 0.375, + "step": 2019 + }, + { + "epoch": 1.7012914093206064, + "grad_norm": 0.2271483689546585, + "learning_rate": 4.701119054255646e-06, + "loss": 0.3536, + "step": 2020 + }, + { + "epoch": 1.702133632790567, + "grad_norm": 0.21336886286735535, + "learning_rate": 4.6962253552811885e-06, + "loss": 0.3488, + "step": 2021 + }, + { + "epoch": 1.7029758562605277, + "grad_norm": 0.1952803134918213, + "learning_rate": 4.691331948362789e-06, + "loss": 0.3107, + "step": 2022 + }, + { + "epoch": 1.7038180797304885, + "grad_norm": 0.2554273307323456, + "learning_rate": 4.6864388382050804e-06, + "loss": 0.3854, + "step": 2023 + }, + { + "epoch": 1.7046603032004493, + "grad_norm": 0.21529453992843628, + "learning_rate": 4.6815460295124185e-06, + "loss": 0.3762, + "step": 2024 + }, + { + "epoch": 1.70550252667041, + "grad_norm": 0.199816033244133, + "learning_rate": 4.676653526988858e-06, + "loss": 0.3406, + "step": 2025 + }, + { + "epoch": 1.7063447501403706, + "grad_norm": 0.2005997598171234, + "learning_rate": 4.671761335338171e-06, + "loss": 0.3618, + "step": 2026 + }, + { + "epoch": 1.7071869736103311, + "grad_norm": 0.22079318761825562, + "learning_rate": 4.666869459263821e-06, + "loss": 0.3761, + "step": 2027 + }, + { + "epoch": 1.7080291970802919, + "grad_norm": 0.21227099001407623, + "learning_rate": 4.661977903468974e-06, + "loss": 0.3523, + "step": 2028 + }, + { + "epoch": 1.7088714205502527, + "grad_norm": 0.19983810186386108, + "learning_rate": 4.657086672656486e-06, + "loss": 0.3221, + "step": 2029 + }, + { + "epoch": 1.7097136440202134, + "grad_norm": 0.2068592607975006, + "learning_rate": 4.652195771528901e-06, + "loss": 0.3619, + "step": 2030 + }, + { + "epoch": 1.7105558674901742, + "grad_norm": 0.21467489004135132, + "learning_rate": 4.647305204788445e-06, + "loss": 0.3632, + "step": 2031 + }, + { + "epoch": 1.7113980909601347, + "grad_norm": 0.20065774023532867, + "learning_rate": 4.642414977137026e-06, + "loss": 0.3784, + "step": 2032 + }, + { + "epoch": 1.7122403144300955, + "grad_norm": 0.19884279370307922, + "learning_rate": 4.63752509327622e-06, + "loss": 0.3419, + "step": 2033 + }, + { + "epoch": 1.713082537900056, + "grad_norm": 0.2171352505683899, + "learning_rate": 4.632635557907277e-06, + "loss": 0.352, + "step": 2034 + }, + { + "epoch": 1.7139247613700168, + "grad_norm": 0.22259283065795898, + "learning_rate": 4.627746375731112e-06, + "loss": 0.4014, + "step": 2035 + }, + { + "epoch": 1.7147669848399776, + "grad_norm": 0.22084489464759827, + "learning_rate": 4.622857551448297e-06, + "loss": 0.3455, + "step": 2036 + }, + { + "epoch": 1.7156092083099383, + "grad_norm": 0.2158217579126358, + "learning_rate": 4.617969089759066e-06, + "loss": 0.3451, + "step": 2037 + }, + { + "epoch": 1.716451431779899, + "grad_norm": 0.22856098413467407, + "learning_rate": 4.613080995363296e-06, + "loss": 0.3886, + "step": 2038 + }, + { + "epoch": 1.7172936552498597, + "grad_norm": 0.2910906970500946, + "learning_rate": 4.608193272960519e-06, + "loss": 0.3484, + "step": 2039 + }, + { + "epoch": 1.7181358787198202, + "grad_norm": 0.21710596978664398, + "learning_rate": 4.603305927249902e-06, + "loss": 0.3511, + "step": 2040 + }, + { + "epoch": 1.718978102189781, + "grad_norm": 0.21317730844020844, + "learning_rate": 4.598418962930258e-06, + "loss": 0.3707, + "step": 2041 + }, + { + "epoch": 1.7198203256597417, + "grad_norm": 0.20781542360782623, + "learning_rate": 4.593532384700026e-06, + "loss": 0.4028, + "step": 2042 + }, + { + "epoch": 1.7206625491297025, + "grad_norm": 0.2237594872713089, + "learning_rate": 4.588646197257278e-06, + "loss": 0.3914, + "step": 2043 + }, + { + "epoch": 1.721504772599663, + "grad_norm": 0.18635544180870056, + "learning_rate": 4.583760405299707e-06, + "loss": 0.3365, + "step": 2044 + }, + { + "epoch": 1.7223469960696238, + "grad_norm": 0.20491984486579895, + "learning_rate": 4.57887501352463e-06, + "loss": 0.3303, + "step": 2045 + }, + { + "epoch": 1.7231892195395844, + "grad_norm": 0.1990174651145935, + "learning_rate": 4.573990026628976e-06, + "loss": 0.3646, + "step": 2046 + }, + { + "epoch": 1.7240314430095451, + "grad_norm": 0.1963694989681244, + "learning_rate": 4.569105449309289e-06, + "loss": 0.3729, + "step": 2047 + }, + { + "epoch": 1.724873666479506, + "grad_norm": 0.18767237663269043, + "learning_rate": 4.564221286261709e-06, + "loss": 0.3316, + "step": 2048 + }, + { + "epoch": 1.7257158899494667, + "grad_norm": 0.23530213534832, + "learning_rate": 4.559337542181993e-06, + "loss": 0.3956, + "step": 2049 + }, + { + "epoch": 1.7265581134194274, + "grad_norm": 0.18765990436077118, + "learning_rate": 4.554454221765479e-06, + "loss": 0.3267, + "step": 2050 + }, + { + "epoch": 1.727400336889388, + "grad_norm": 0.22365470230579376, + "learning_rate": 4.549571329707113e-06, + "loss": 0.3459, + "step": 2051 + }, + { + "epoch": 1.7282425603593485, + "grad_norm": 0.21409985423088074, + "learning_rate": 4.544688870701416e-06, + "loss": 0.3421, + "step": 2052 + }, + { + "epoch": 1.7290847838293093, + "grad_norm": 0.21623964607715607, + "learning_rate": 4.539806849442501e-06, + "loss": 0.3462, + "step": 2053 + }, + { + "epoch": 1.72992700729927, + "grad_norm": 0.22499880194664001, + "learning_rate": 4.534925270624057e-06, + "loss": 0.3901, + "step": 2054 + }, + { + "epoch": 1.7307692307692308, + "grad_norm": 0.19443631172180176, + "learning_rate": 4.53004413893935e-06, + "loss": 0.3437, + "step": 2055 + }, + { + "epoch": 1.7316114542391916, + "grad_norm": 0.21552076935768127, + "learning_rate": 4.52516345908121e-06, + "loss": 0.3848, + "step": 2056 + }, + { + "epoch": 1.7324536777091522, + "grad_norm": 0.2129429429769516, + "learning_rate": 4.520283235742042e-06, + "loss": 0.3924, + "step": 2057 + }, + { + "epoch": 1.7332959011791127, + "grad_norm": 0.1930941939353943, + "learning_rate": 4.5154034736138035e-06, + "loss": 0.3357, + "step": 2058 + }, + { + "epoch": 1.7341381246490735, + "grad_norm": 0.20328223705291748, + "learning_rate": 4.510524177388014e-06, + "loss": 0.3711, + "step": 2059 + }, + { + "epoch": 1.7349803481190342, + "grad_norm": 0.1964006870985031, + "learning_rate": 4.505645351755741e-06, + "loss": 0.3599, + "step": 2060 + }, + { + "epoch": 1.735822571588995, + "grad_norm": 0.19247199594974518, + "learning_rate": 4.500767001407604e-06, + "loss": 0.3902, + "step": 2061 + }, + { + "epoch": 1.7366647950589558, + "grad_norm": 0.19466297328472137, + "learning_rate": 4.495889131033762e-06, + "loss": 0.351, + "step": 2062 + }, + { + "epoch": 1.7375070185289163, + "grad_norm": 0.20538896322250366, + "learning_rate": 4.491011745323914e-06, + "loss": 0.3439, + "step": 2063 + }, + { + "epoch": 1.738349241998877, + "grad_norm": 0.21315395832061768, + "learning_rate": 4.486134848967292e-06, + "loss": 0.3412, + "step": 2064 + }, + { + "epoch": 1.7391914654688376, + "grad_norm": 0.20313483476638794, + "learning_rate": 4.481258446652662e-06, + "loss": 0.3713, + "step": 2065 + }, + { + "epoch": 1.7400336889387984, + "grad_norm": 0.19733814895153046, + "learning_rate": 4.4763825430683055e-06, + "loss": 0.3602, + "step": 2066 + }, + { + "epoch": 1.7408759124087592, + "grad_norm": 0.19734057784080505, + "learning_rate": 4.471507142902036e-06, + "loss": 0.3461, + "step": 2067 + }, + { + "epoch": 1.74171813587872, + "grad_norm": 0.20122112333774567, + "learning_rate": 4.466632250841173e-06, + "loss": 0.3566, + "step": 2068 + }, + { + "epoch": 1.7425603593486805, + "grad_norm": 0.1987045258283615, + "learning_rate": 4.4617578715725565e-06, + "loss": 0.3677, + "step": 2069 + }, + { + "epoch": 1.7434025828186412, + "grad_norm": 0.2019500732421875, + "learning_rate": 4.4568840097825225e-06, + "loss": 0.3665, + "step": 2070 + }, + { + "epoch": 1.7442448062886018, + "grad_norm": 0.1923629343509674, + "learning_rate": 4.452010670156922e-06, + "loss": 0.3234, + "step": 2071 + }, + { + "epoch": 1.7450870297585626, + "grad_norm": 0.22685682773590088, + "learning_rate": 4.447137857381095e-06, + "loss": 0.3724, + "step": 2072 + }, + { + "epoch": 1.7459292532285233, + "grad_norm": 0.19463816285133362, + "learning_rate": 4.4422655761398785e-06, + "loss": 0.3531, + "step": 2073 + }, + { + "epoch": 1.746771476698484, + "grad_norm": 0.22609847784042358, + "learning_rate": 4.437393831117596e-06, + "loss": 0.4006, + "step": 2074 + }, + { + "epoch": 1.7476137001684446, + "grad_norm": 0.2065419703722, + "learning_rate": 4.432522626998061e-06, + "loss": 0.3708, + "step": 2075 + }, + { + "epoch": 1.7484559236384054, + "grad_norm": 0.18311607837677002, + "learning_rate": 4.427651968464559e-06, + "loss": 0.3563, + "step": 2076 + }, + { + "epoch": 1.749298147108366, + "grad_norm": 0.18132922053337097, + "learning_rate": 4.4227818601998575e-06, + "loss": 0.3192, + "step": 2077 + }, + { + "epoch": 1.7501403705783267, + "grad_norm": 0.2064925730228424, + "learning_rate": 4.417912306886192e-06, + "loss": 0.3578, + "step": 2078 + }, + { + "epoch": 1.7509825940482875, + "grad_norm": 0.21537601947784424, + "learning_rate": 4.413043313205266e-06, + "loss": 0.3696, + "step": 2079 + }, + { + "epoch": 1.7518248175182483, + "grad_norm": 0.1957070529460907, + "learning_rate": 4.408174883838243e-06, + "loss": 0.3298, + "step": 2080 + }, + { + "epoch": 1.752667040988209, + "grad_norm": 0.2095799446105957, + "learning_rate": 4.403307023465746e-06, + "loss": 0.3488, + "step": 2081 + }, + { + "epoch": 1.7535092644581696, + "grad_norm": 0.2147350013256073, + "learning_rate": 4.3984397367678475e-06, + "loss": 0.3798, + "step": 2082 + }, + { + "epoch": 1.7543514879281301, + "grad_norm": 0.2071705162525177, + "learning_rate": 4.393573028424075e-06, + "loss": 0.3623, + "step": 2083 + }, + { + "epoch": 1.7551937113980909, + "grad_norm": 0.2035631388425827, + "learning_rate": 4.388706903113391e-06, + "loss": 0.3553, + "step": 2084 + }, + { + "epoch": 1.7560359348680517, + "grad_norm": 0.20198489725589752, + "learning_rate": 4.383841365514208e-06, + "loss": 0.3733, + "step": 2085 + }, + { + "epoch": 1.7568781583380124, + "grad_norm": 0.2736494541168213, + "learning_rate": 4.378976420304361e-06, + "loss": 0.3946, + "step": 2086 + }, + { + "epoch": 1.7577203818079732, + "grad_norm": 0.21576163172721863, + "learning_rate": 4.374112072161129e-06, + "loss": 0.357, + "step": 2087 + }, + { + "epoch": 1.7585626052779337, + "grad_norm": 0.2382166087627411, + "learning_rate": 4.369248325761205e-06, + "loss": 0.3318, + "step": 2088 + }, + { + "epoch": 1.7594048287478943, + "grad_norm": 0.21679919958114624, + "learning_rate": 4.364385185780712e-06, + "loss": 0.3622, + "step": 2089 + }, + { + "epoch": 1.760247052217855, + "grad_norm": 0.1898692101240158, + "learning_rate": 4.359522656895185e-06, + "loss": 0.2974, + "step": 2090 + }, + { + "epoch": 1.7610892756878158, + "grad_norm": 0.2273748368024826, + "learning_rate": 4.354660743779575e-06, + "loss": 0.3993, + "step": 2091 + }, + { + "epoch": 1.7619314991577766, + "grad_norm": 0.23613716661930084, + "learning_rate": 4.349799451108236e-06, + "loss": 0.3663, + "step": 2092 + }, + { + "epoch": 1.7627737226277373, + "grad_norm": 0.20020337402820587, + "learning_rate": 4.3449387835549305e-06, + "loss": 0.3606, + "step": 2093 + }, + { + "epoch": 1.763615946097698, + "grad_norm": 0.18953366577625275, + "learning_rate": 4.340078745792818e-06, + "loss": 0.3435, + "step": 2094 + }, + { + "epoch": 1.7644581695676587, + "grad_norm": 0.20416444540023804, + "learning_rate": 4.3352193424944535e-06, + "loss": 0.3636, + "step": 2095 + }, + { + "epoch": 1.7653003930376192, + "grad_norm": 0.19791902601718903, + "learning_rate": 4.3303605783317794e-06, + "loss": 0.3573, + "step": 2096 + }, + { + "epoch": 1.76614261650758, + "grad_norm": 0.22840075194835663, + "learning_rate": 4.325502457976126e-06, + "loss": 0.3499, + "step": 2097 + }, + { + "epoch": 1.7669848399775407, + "grad_norm": 0.20278143882751465, + "learning_rate": 4.320644986098204e-06, + "loss": 0.396, + "step": 2098 + }, + { + "epoch": 1.7678270634475015, + "grad_norm": 2.3698928356170654, + "learning_rate": 4.315788167368102e-06, + "loss": 0.3408, + "step": 2099 + }, + { + "epoch": 1.768669286917462, + "grad_norm": 0.21950553357601166, + "learning_rate": 4.310932006455276e-06, + "loss": 0.3197, + "step": 2100 + }, + { + "epoch": 1.7695115103874228, + "grad_norm": 0.19593040645122528, + "learning_rate": 4.306076508028557e-06, + "loss": 0.344, + "step": 2101 + }, + { + "epoch": 1.7703537338573834, + "grad_norm": 0.2054298222064972, + "learning_rate": 4.301221676756129e-06, + "loss": 0.3511, + "step": 2102 + }, + { + "epoch": 1.7711959573273441, + "grad_norm": 0.20474767684936523, + "learning_rate": 4.296367517305548e-06, + "loss": 0.3553, + "step": 2103 + }, + { + "epoch": 1.772038180797305, + "grad_norm": 0.1904682219028473, + "learning_rate": 4.29151403434371e-06, + "loss": 0.322, + "step": 2104 + }, + { + "epoch": 1.7728804042672657, + "grad_norm": 0.21345622837543488, + "learning_rate": 4.286661232536873e-06, + "loss": 0.3496, + "step": 2105 + }, + { + "epoch": 1.7737226277372264, + "grad_norm": 0.2012915164232254, + "learning_rate": 4.281809116550629e-06, + "loss": 0.3608, + "step": 2106 + }, + { + "epoch": 1.774564851207187, + "grad_norm": 0.18038402497768402, + "learning_rate": 4.276957691049917e-06, + "loss": 0.3155, + "step": 2107 + }, + { + "epoch": 1.7754070746771475, + "grad_norm": 0.20159615576267242, + "learning_rate": 4.272106960699015e-06, + "loss": 0.392, + "step": 2108 + }, + { + "epoch": 1.7762492981471083, + "grad_norm": 0.19055233895778656, + "learning_rate": 4.267256930161523e-06, + "loss": 0.3248, + "step": 2109 + }, + { + "epoch": 1.777091521617069, + "grad_norm": 0.20288807153701782, + "learning_rate": 4.2624076041003794e-06, + "loss": 0.3645, + "step": 2110 + }, + { + "epoch": 1.7779337450870298, + "grad_norm": 0.19328925013542175, + "learning_rate": 4.257558987177835e-06, + "loss": 0.3432, + "step": 2111 + }, + { + "epoch": 1.7787759685569906, + "grad_norm": 0.20823122560977936, + "learning_rate": 4.252711084055468e-06, + "loss": 0.3514, + "step": 2112 + }, + { + "epoch": 1.7796181920269512, + "grad_norm": 0.1911734789609909, + "learning_rate": 4.247863899394162e-06, + "loss": 0.3436, + "step": 2113 + }, + { + "epoch": 1.7804604154969117, + "grad_norm": 0.2023228406906128, + "learning_rate": 4.243017437854117e-06, + "loss": 0.38, + "step": 2114 + }, + { + "epoch": 1.7813026389668725, + "grad_norm": 0.19723080098628998, + "learning_rate": 4.238171704094833e-06, + "loss": 0.3404, + "step": 2115 + }, + { + "epoch": 1.7821448624368332, + "grad_norm": 0.19566403329372406, + "learning_rate": 4.2333267027751125e-06, + "loss": 0.339, + "step": 2116 + }, + { + "epoch": 1.782987085906794, + "grad_norm": 0.23212406039237976, + "learning_rate": 4.228482438553052e-06, + "loss": 0.3674, + "step": 2117 + }, + { + "epoch": 1.7838293093767548, + "grad_norm": 0.1956307590007782, + "learning_rate": 4.223638916086044e-06, + "loss": 0.3553, + "step": 2118 + }, + { + "epoch": 1.7846715328467153, + "grad_norm": 0.18204696476459503, + "learning_rate": 4.218796140030759e-06, + "loss": 0.3242, + "step": 2119 + }, + { + "epoch": 1.7855137563166759, + "grad_norm": 0.1948997676372528, + "learning_rate": 4.21395411504316e-06, + "loss": 0.3601, + "step": 2120 + }, + { + "epoch": 1.7863559797866366, + "grad_norm": 0.1899755448102951, + "learning_rate": 4.209112845778481e-06, + "loss": 0.3525, + "step": 2121 + }, + { + "epoch": 1.7871982032565974, + "grad_norm": 0.212238147854805, + "learning_rate": 4.204272336891232e-06, + "loss": 0.353, + "step": 2122 + }, + { + "epoch": 1.7880404267265582, + "grad_norm": 0.1929008811712265, + "learning_rate": 4.199432593035192e-06, + "loss": 0.3801, + "step": 2123 + }, + { + "epoch": 1.788882650196519, + "grad_norm": 0.1923249065876007, + "learning_rate": 4.194593618863404e-06, + "loss": 0.3367, + "step": 2124 + }, + { + "epoch": 1.7897248736664795, + "grad_norm": 0.21146802604198456, + "learning_rate": 4.189755419028169e-06, + "loss": 0.3914, + "step": 2125 + }, + { + "epoch": 1.7905670971364402, + "grad_norm": 0.21049576997756958, + "learning_rate": 4.1849179981810506e-06, + "loss": 0.3775, + "step": 2126 + }, + { + "epoch": 1.7914093206064008, + "grad_norm": 0.18470361828804016, + "learning_rate": 4.180081360972852e-06, + "loss": 0.3293, + "step": 2127 + }, + { + "epoch": 1.7922515440763616, + "grad_norm": 0.19915157556533813, + "learning_rate": 4.175245512053637e-06, + "loss": 0.3579, + "step": 2128 + }, + { + "epoch": 1.7930937675463223, + "grad_norm": 0.19717763364315033, + "learning_rate": 4.1704104560726955e-06, + "loss": 0.3564, + "step": 2129 + }, + { + "epoch": 1.793935991016283, + "grad_norm": 0.21021628379821777, + "learning_rate": 4.165576197678571e-06, + "loss": 0.3781, + "step": 2130 + }, + { + "epoch": 1.7947782144862436, + "grad_norm": 0.1901855617761612, + "learning_rate": 4.160742741519028e-06, + "loss": 0.3428, + "step": 2131 + }, + { + "epoch": 1.7956204379562044, + "grad_norm": 0.21265830099582672, + "learning_rate": 4.1559100922410665e-06, + "loss": 0.3477, + "step": 2132 + }, + { + "epoch": 1.796462661426165, + "grad_norm": 0.19887612760066986, + "learning_rate": 4.151078254490908e-06, + "loss": 0.3552, + "step": 2133 + }, + { + "epoch": 1.7973048848961257, + "grad_norm": 0.22088709473609924, + "learning_rate": 4.146247232913996e-06, + "loss": 0.3513, + "step": 2134 + }, + { + "epoch": 1.7981471083660865, + "grad_norm": 0.1965794712305069, + "learning_rate": 4.141417032154984e-06, + "loss": 0.3753, + "step": 2135 + }, + { + "epoch": 1.7989893318360473, + "grad_norm": 0.357808381319046, + "learning_rate": 4.136587656857744e-06, + "loss": 0.3688, + "step": 2136 + }, + { + "epoch": 1.799831555306008, + "grad_norm": 0.20281732082366943, + "learning_rate": 4.131759111665349e-06, + "loss": 0.3474, + "step": 2137 + }, + { + "epoch": 1.8006737787759686, + "grad_norm": 0.20766642689704895, + "learning_rate": 4.126931401220075e-06, + "loss": 0.3685, + "step": 2138 + }, + { + "epoch": 1.8015160022459291, + "grad_norm": 0.2085840106010437, + "learning_rate": 4.122104530163397e-06, + "loss": 0.3612, + "step": 2139 + }, + { + "epoch": 1.8023582257158899, + "grad_norm": 0.2228211760520935, + "learning_rate": 4.117278503135981e-06, + "loss": 0.3581, + "step": 2140 + }, + { + "epoch": 1.8032004491858507, + "grad_norm": 0.22840428352355957, + "learning_rate": 4.112453324777683e-06, + "loss": 0.3242, + "step": 2141 + }, + { + "epoch": 1.8040426726558114, + "grad_norm": 0.22499187290668488, + "learning_rate": 4.107628999727542e-06, + "loss": 0.4048, + "step": 2142 + }, + { + "epoch": 1.8048848961257722, + "grad_norm": 0.1917266845703125, + "learning_rate": 4.102805532623775e-06, + "loss": 0.3339, + "step": 2143 + }, + { + "epoch": 1.8057271195957327, + "grad_norm": 0.2015596628189087, + "learning_rate": 4.097982928103782e-06, + "loss": 0.362, + "step": 2144 + }, + { + "epoch": 1.8065693430656933, + "grad_norm": 0.24050593376159668, + "learning_rate": 4.09316119080412e-06, + "loss": 0.3884, + "step": 2145 + }, + { + "epoch": 1.807411566535654, + "grad_norm": 0.18397676944732666, + "learning_rate": 4.088340325360529e-06, + "loss": 0.3047, + "step": 2146 + }, + { + "epoch": 1.8082537900056148, + "grad_norm": 0.20920848846435547, + "learning_rate": 4.083520336407894e-06, + "loss": 0.365, + "step": 2147 + }, + { + "epoch": 1.8090960134755756, + "grad_norm": 0.1963934749364853, + "learning_rate": 4.0787012285802695e-06, + "loss": 0.3462, + "step": 2148 + }, + { + "epoch": 1.8099382369455363, + "grad_norm": 0.19430825114250183, + "learning_rate": 4.073883006510858e-06, + "loss": 0.3678, + "step": 2149 + }, + { + "epoch": 1.810780460415497, + "grad_norm": 0.197347491979599, + "learning_rate": 4.069065674832011e-06, + "loss": 0.3406, + "step": 2150 + }, + { + "epoch": 1.8116226838854577, + "grad_norm": 0.22697080671787262, + "learning_rate": 4.064249238175223e-06, + "loss": 0.39, + "step": 2151 + }, + { + "epoch": 1.8124649073554182, + "grad_norm": 0.19796526432037354, + "learning_rate": 4.059433701171131e-06, + "loss": 0.3796, + "step": 2152 + }, + { + "epoch": 1.813307130825379, + "grad_norm": 0.19998793303966522, + "learning_rate": 4.054619068449502e-06, + "loss": 0.3628, + "step": 2153 + }, + { + "epoch": 1.8141493542953397, + "grad_norm": 0.21857374906539917, + "learning_rate": 4.04980534463924e-06, + "loss": 0.3922, + "step": 2154 + }, + { + "epoch": 1.8149915777653005, + "grad_norm": 0.27835121750831604, + "learning_rate": 4.044992534368369e-06, + "loss": 0.343, + "step": 2155 + }, + { + "epoch": 1.815833801235261, + "grad_norm": 0.22096095979213715, + "learning_rate": 4.04018064226404e-06, + "loss": 0.3341, + "step": 2156 + }, + { + "epoch": 1.8166760247052218, + "grad_norm": 0.21028409898281097, + "learning_rate": 4.035369672952516e-06, + "loss": 0.346, + "step": 2157 + }, + { + "epoch": 1.8175182481751824, + "grad_norm": 0.19172050058841705, + "learning_rate": 4.030559631059179e-06, + "loss": 0.3601, + "step": 2158 + }, + { + "epoch": 1.8183604716451431, + "grad_norm": 0.20794759690761566, + "learning_rate": 4.025750521208512e-06, + "loss": 0.3691, + "step": 2159 + }, + { + "epoch": 1.819202695115104, + "grad_norm": 0.22366368770599365, + "learning_rate": 4.020942348024108e-06, + "loss": 0.396, + "step": 2160 + }, + { + "epoch": 1.8200449185850647, + "grad_norm": 0.20643411576747894, + "learning_rate": 4.016135116128656e-06, + "loss": 0.3429, + "step": 2161 + }, + { + "epoch": 1.8208871420550252, + "grad_norm": 0.2257109135389328, + "learning_rate": 4.011328830143945e-06, + "loss": 0.326, + "step": 2162 + }, + { + "epoch": 1.821729365524986, + "grad_norm": 0.18669435381889343, + "learning_rate": 4.0065234946908456e-06, + "loss": 0.3354, + "step": 2163 + }, + { + "epoch": 1.8225715889949465, + "grad_norm": 0.20572814345359802, + "learning_rate": 4.001719114389325e-06, + "loss": 0.3673, + "step": 2164 + }, + { + "epoch": 1.8234138124649073, + "grad_norm": 0.2180599868297577, + "learning_rate": 3.996915693858422e-06, + "loss": 0.3897, + "step": 2165 + }, + { + "epoch": 1.824256035934868, + "grad_norm": 0.19118410348892212, + "learning_rate": 3.992113237716261e-06, + "loss": 0.3432, + "step": 2166 + }, + { + "epoch": 1.8250982594048288, + "grad_norm": 0.20306925475597382, + "learning_rate": 3.987311750580035e-06, + "loss": 0.3452, + "step": 2167 + }, + { + "epoch": 1.8259404828747896, + "grad_norm": 0.19168779253959656, + "learning_rate": 3.9825112370660055e-06, + "loss": 0.3373, + "step": 2168 + }, + { + "epoch": 1.8267827063447502, + "grad_norm": 0.19850198924541473, + "learning_rate": 3.977711701789499e-06, + "loss": 0.3646, + "step": 2169 + }, + { + "epoch": 1.8276249298147107, + "grad_norm": 0.20444419980049133, + "learning_rate": 3.972913149364902e-06, + "loss": 0.3841, + "step": 2170 + }, + { + "epoch": 1.8284671532846715, + "grad_norm": 0.1842080056667328, + "learning_rate": 3.9681155844056525e-06, + "loss": 0.3444, + "step": 2171 + }, + { + "epoch": 1.8293093767546322, + "grad_norm": 0.18652406334877014, + "learning_rate": 3.963319011524246e-06, + "loss": 0.3487, + "step": 2172 + }, + { + "epoch": 1.830151600224593, + "grad_norm": 0.1885792762041092, + "learning_rate": 3.9585234353322155e-06, + "loss": 0.3647, + "step": 2173 + }, + { + "epoch": 1.8309938236945538, + "grad_norm": 0.2128867506980896, + "learning_rate": 3.953728860440144e-06, + "loss": 0.3386, + "step": 2174 + }, + { + "epoch": 1.8318360471645143, + "grad_norm": 0.1972573846578598, + "learning_rate": 3.948935291457645e-06, + "loss": 0.3125, + "step": 2175 + }, + { + "epoch": 1.8326782706344749, + "grad_norm": 0.21564050018787384, + "learning_rate": 3.94414273299337e-06, + "loss": 0.3508, + "step": 2176 + }, + { + "epoch": 1.8335204941044356, + "grad_norm": 0.2020157277584076, + "learning_rate": 3.939351189654996e-06, + "loss": 0.3616, + "step": 2177 + }, + { + "epoch": 1.8343627175743964, + "grad_norm": 0.21299459040164948, + "learning_rate": 3.934560666049226e-06, + "loss": 0.3513, + "step": 2178 + }, + { + "epoch": 1.8352049410443572, + "grad_norm": 0.18165704607963562, + "learning_rate": 3.929771166781781e-06, + "loss": 0.3205, + "step": 2179 + }, + { + "epoch": 1.836047164514318, + "grad_norm": 0.21832971274852753, + "learning_rate": 3.9249826964573965e-06, + "loss": 0.3825, + "step": 2180 + }, + { + "epoch": 1.8368893879842785, + "grad_norm": 0.19859738647937775, + "learning_rate": 3.920195259679822e-06, + "loss": 0.3642, + "step": 2181 + }, + { + "epoch": 1.8377316114542392, + "grad_norm": 0.1959538608789444, + "learning_rate": 3.915408861051809e-06, + "loss": 0.3482, + "step": 2182 + }, + { + "epoch": 1.8385738349241998, + "grad_norm": 0.20815512537956238, + "learning_rate": 3.910623505175116e-06, + "loss": 0.3406, + "step": 2183 + }, + { + "epoch": 1.8394160583941606, + "grad_norm": 0.19060750305652618, + "learning_rate": 3.905839196650494e-06, + "loss": 0.3643, + "step": 2184 + }, + { + "epoch": 1.8402582818641213, + "grad_norm": 0.20968878269195557, + "learning_rate": 3.901055940077691e-06, + "loss": 0.3546, + "step": 2185 + }, + { + "epoch": 1.841100505334082, + "grad_norm": 0.23598092794418335, + "learning_rate": 3.8962737400554395e-06, + "loss": 0.3687, + "step": 2186 + }, + { + "epoch": 1.8419427288040426, + "grad_norm": 0.1882329136133194, + "learning_rate": 3.891492601181462e-06, + "loss": 0.3631, + "step": 2187 + }, + { + "epoch": 1.8427849522740034, + "grad_norm": 0.18433696031570435, + "learning_rate": 3.8867125280524535e-06, + "loss": 0.3527, + "step": 2188 + }, + { + "epoch": 1.843627175743964, + "grad_norm": 0.20385466516017914, + "learning_rate": 3.881933525264092e-06, + "loss": 0.336, + "step": 2189 + }, + { + "epoch": 1.8444693992139247, + "grad_norm": 0.22509856522083282, + "learning_rate": 3.877155597411019e-06, + "loss": 0.3987, + "step": 2190 + }, + { + "epoch": 1.8453116226838855, + "grad_norm": 0.19194535911083221, + "learning_rate": 3.87237874908685e-06, + "loss": 0.3319, + "step": 2191 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 0.23621989786624908, + "learning_rate": 3.867602984884155e-06, + "loss": 0.3902, + "step": 2192 + }, + { + "epoch": 1.8469960696238068, + "grad_norm": 0.19871260225772858, + "learning_rate": 3.862828309394469e-06, + "loss": 0.3398, + "step": 2193 + }, + { + "epoch": 1.8478382930937676, + "grad_norm": 0.19352886080741882, + "learning_rate": 3.8580547272082746e-06, + "loss": 0.3259, + "step": 2194 + }, + { + "epoch": 1.8486805165637281, + "grad_norm": 0.19926981627941132, + "learning_rate": 3.853282242915007e-06, + "loss": 0.3544, + "step": 2195 + }, + { + "epoch": 1.8495227400336889, + "grad_norm": 0.22587287425994873, + "learning_rate": 3.8485108611030415e-06, + "loss": 0.3695, + "step": 2196 + }, + { + "epoch": 1.8503649635036497, + "grad_norm": 0.20771846175193787, + "learning_rate": 3.843740586359701e-06, + "loss": 0.3504, + "step": 2197 + }, + { + "epoch": 1.8512071869736104, + "grad_norm": 0.1965019553899765, + "learning_rate": 3.8389714232712346e-06, + "loss": 0.3503, + "step": 2198 + }, + { + "epoch": 1.8520494104435712, + "grad_norm": 0.21217331290245056, + "learning_rate": 3.834203376422831e-06, + "loss": 0.3286, + "step": 2199 + }, + { + "epoch": 1.8528916339135317, + "grad_norm": 0.19798773527145386, + "learning_rate": 3.829436450398599e-06, + "loss": 0.3543, + "step": 2200 + }, + { + "epoch": 1.8537338573834923, + "grad_norm": 0.2020939290523529, + "learning_rate": 3.824670649781576e-06, + "loss": 0.376, + "step": 2201 + }, + { + "epoch": 1.854576080853453, + "grad_norm": 0.19974105060100555, + "learning_rate": 3.8199059791537105e-06, + "loss": 0.3364, + "step": 2202 + }, + { + "epoch": 1.8554183043234138, + "grad_norm": 0.1852501779794693, + "learning_rate": 3.815142443095873e-06, + "loss": 0.365, + "step": 2203 + }, + { + "epoch": 1.8562605277933746, + "grad_norm": 0.20142561197280884, + "learning_rate": 3.8103800461878344e-06, + "loss": 0.3541, + "step": 2204 + }, + { + "epoch": 1.8571027512633353, + "grad_norm": 0.21110711991786957, + "learning_rate": 3.805618793008279e-06, + "loss": 0.3517, + "step": 2205 + }, + { + "epoch": 1.857944974733296, + "grad_norm": 0.2029833346605301, + "learning_rate": 3.8008586881347815e-06, + "loss": 0.3593, + "step": 2206 + }, + { + "epoch": 1.8587871982032564, + "grad_norm": 0.1974041759967804, + "learning_rate": 3.7960997361438235e-06, + "loss": 0.324, + "step": 2207 + }, + { + "epoch": 1.8596294216732172, + "grad_norm": 0.19707661867141724, + "learning_rate": 3.7913419416107692e-06, + "loss": 0.358, + "step": 2208 + }, + { + "epoch": 1.860471645143178, + "grad_norm": 0.18754833936691284, + "learning_rate": 3.786585309109877e-06, + "loss": 0.3383, + "step": 2209 + }, + { + "epoch": 1.8613138686131387, + "grad_norm": 0.20792751014232635, + "learning_rate": 3.7818298432142814e-06, + "loss": 0.3767, + "step": 2210 + }, + { + "epoch": 1.8621560920830995, + "grad_norm": 0.2125100940465927, + "learning_rate": 3.777075548496001e-06, + "loss": 0.3741, + "step": 2211 + }, + { + "epoch": 1.86299831555306, + "grad_norm": 0.20231522619724274, + "learning_rate": 3.7723224295259247e-06, + "loss": 0.3425, + "step": 2212 + }, + { + "epoch": 1.8638405390230208, + "grad_norm": 0.19012172520160675, + "learning_rate": 3.7675704908738136e-06, + "loss": 0.3723, + "step": 2213 + }, + { + "epoch": 1.8646827624929814, + "grad_norm": 0.18218067288398743, + "learning_rate": 3.7628197371082916e-06, + "loss": 0.3243, + "step": 2214 + }, + { + "epoch": 1.8655249859629421, + "grad_norm": 0.1938510239124298, + "learning_rate": 3.758070172796846e-06, + "loss": 0.3692, + "step": 2215 + }, + { + "epoch": 1.866367209432903, + "grad_norm": 0.20477046072483063, + "learning_rate": 3.753321802505817e-06, + "loss": 0.4046, + "step": 2216 + }, + { + "epoch": 1.8672094329028637, + "grad_norm": 0.19240835309028625, + "learning_rate": 3.7485746308004013e-06, + "loss": 0.3404, + "step": 2217 + }, + { + "epoch": 1.8680516563728242, + "grad_norm": 0.17858603596687317, + "learning_rate": 3.743828662244639e-06, + "loss": 0.3911, + "step": 2218 + }, + { + "epoch": 1.868893879842785, + "grad_norm": 0.18475256860256195, + "learning_rate": 3.739083901401418e-06, + "loss": 0.3246, + "step": 2219 + }, + { + "epoch": 1.8697361033127455, + "grad_norm": 0.19433172047138214, + "learning_rate": 3.7343403528324574e-06, + "loss": 0.3648, + "step": 2220 + }, + { + "epoch": 1.8705783267827063, + "grad_norm": 0.20487923920154572, + "learning_rate": 3.7295980210983233e-06, + "loss": 0.3835, + "step": 2221 + }, + { + "epoch": 1.871420550252667, + "grad_norm": 0.19962626695632935, + "learning_rate": 3.7248569107583976e-06, + "loss": 0.3572, + "step": 2222 + }, + { + "epoch": 1.8722627737226278, + "grad_norm": 0.18625806272029877, + "learning_rate": 3.7201170263709004e-06, + "loss": 0.3539, + "step": 2223 + }, + { + "epoch": 1.8731049971925884, + "grad_norm": 0.1880733072757721, + "learning_rate": 3.7153783724928617e-06, + "loss": 0.338, + "step": 2224 + }, + { + "epoch": 1.8739472206625492, + "grad_norm": 0.22529815137386322, + "learning_rate": 3.71064095368014e-06, + "loss": 0.3857, + "step": 2225 + }, + { + "epoch": 1.8747894441325097, + "grad_norm": 0.19889788329601288, + "learning_rate": 3.705904774487396e-06, + "loss": 0.3197, + "step": 2226 + }, + { + "epoch": 1.8756316676024705, + "grad_norm": 0.22110477089881897, + "learning_rate": 3.7011698394681075e-06, + "loss": 0.3915, + "step": 2227 + }, + { + "epoch": 1.8764738910724312, + "grad_norm": 0.18708573281764984, + "learning_rate": 3.696436153174548e-06, + "loss": 0.3457, + "step": 2228 + }, + { + "epoch": 1.877316114542392, + "grad_norm": 0.1997976154088974, + "learning_rate": 3.6917037201577977e-06, + "loss": 0.3416, + "step": 2229 + }, + { + "epoch": 1.8781583380123528, + "grad_norm": 0.19073273241519928, + "learning_rate": 3.6869725449677254e-06, + "loss": 0.3595, + "step": 2230 + }, + { + "epoch": 1.8790005614823133, + "grad_norm": 0.204877570271492, + "learning_rate": 3.6822426321529967e-06, + "loss": 0.3675, + "step": 2231 + }, + { + "epoch": 1.8798427849522739, + "grad_norm": 0.19971223175525665, + "learning_rate": 3.6775139862610577e-06, + "loss": 0.3554, + "step": 2232 + }, + { + "epoch": 1.8806850084222346, + "grad_norm": 0.18837471306324005, + "learning_rate": 3.672786611838142e-06, + "loss": 0.3191, + "step": 2233 + }, + { + "epoch": 1.8815272318921954, + "grad_norm": 0.18985477089881897, + "learning_rate": 3.668060513429256e-06, + "loss": 0.3756, + "step": 2234 + }, + { + "epoch": 1.8823694553621562, + "grad_norm": 0.32632893323898315, + "learning_rate": 3.6633356955781827e-06, + "loss": 0.3784, + "step": 2235 + }, + { + "epoch": 1.883211678832117, + "grad_norm": 0.22074228525161743, + "learning_rate": 3.658612162827472e-06, + "loss": 0.3536, + "step": 2236 + }, + { + "epoch": 1.8840539023020775, + "grad_norm": 0.20336230099201202, + "learning_rate": 3.653889919718439e-06, + "loss": 0.3252, + "step": 2237 + }, + { + "epoch": 1.884896125772038, + "grad_norm": 0.20199203491210938, + "learning_rate": 3.649168970791157e-06, + "loss": 0.3427, + "step": 2238 + }, + { + "epoch": 1.8857383492419988, + "grad_norm": 0.18671944737434387, + "learning_rate": 3.644449320584462e-06, + "loss": 0.3506, + "step": 2239 + }, + { + "epoch": 1.8865805727119596, + "grad_norm": 0.21687693893909454, + "learning_rate": 3.639730973635929e-06, + "loss": 0.3996, + "step": 2240 + }, + { + "epoch": 1.8874227961819203, + "grad_norm": 0.19979308545589447, + "learning_rate": 3.635013934481895e-06, + "loss": 0.3454, + "step": 2241 + }, + { + "epoch": 1.888265019651881, + "grad_norm": 0.18458348512649536, + "learning_rate": 3.6302982076574244e-06, + "loss": 0.3344, + "step": 2242 + }, + { + "epoch": 1.8891072431218416, + "grad_norm": 0.21379539370536804, + "learning_rate": 3.6255837976963336e-06, + "loss": 0.3483, + "step": 2243 + }, + { + "epoch": 1.8899494665918024, + "grad_norm": 0.1934133619070053, + "learning_rate": 3.620870709131163e-06, + "loss": 0.36, + "step": 2244 + }, + { + "epoch": 1.890791690061763, + "grad_norm": 0.20163948833942413, + "learning_rate": 3.616158946493188e-06, + "loss": 0.3736, + "step": 2245 + }, + { + "epoch": 1.8916339135317237, + "grad_norm": 0.19730351865291595, + "learning_rate": 3.6114485143124068e-06, + "loss": 0.3244, + "step": 2246 + }, + { + "epoch": 1.8924761370016845, + "grad_norm": 0.21036989986896515, + "learning_rate": 3.6067394171175397e-06, + "loss": 0.3571, + "step": 2247 + }, + { + "epoch": 1.8933183604716453, + "grad_norm": 0.19844365119934082, + "learning_rate": 3.602031659436022e-06, + "loss": 0.3596, + "step": 2248 + }, + { + "epoch": 1.8941605839416058, + "grad_norm": 0.20970189571380615, + "learning_rate": 3.5973252457940034e-06, + "loss": 0.3491, + "step": 2249 + }, + { + "epoch": 1.8950028074115666, + "grad_norm": 0.20957820117473602, + "learning_rate": 3.5926201807163384e-06, + "loss": 0.3727, + "step": 2250 + }, + { + "epoch": 1.8958450308815271, + "grad_norm": 0.20133230090141296, + "learning_rate": 3.58791646872659e-06, + "loss": 0.3719, + "step": 2251 + }, + { + "epoch": 1.8966872543514879, + "grad_norm": 0.2148762345314026, + "learning_rate": 3.5832141143470146e-06, + "loss": 0.351, + "step": 2252 + }, + { + "epoch": 1.8975294778214487, + "grad_norm": 0.19707302749156952, + "learning_rate": 3.578513122098566e-06, + "loss": 0.323, + "step": 2253 + }, + { + "epoch": 1.8983717012914094, + "grad_norm": 0.22635704278945923, + "learning_rate": 3.5738134965008885e-06, + "loss": 0.3896, + "step": 2254 + }, + { + "epoch": 1.89921392476137, + "grad_norm": 0.4272863268852234, + "learning_rate": 3.5691152420723115e-06, + "loss": 0.3598, + "step": 2255 + }, + { + "epoch": 1.9000561482313307, + "grad_norm": 0.1947227418422699, + "learning_rate": 3.564418363329848e-06, + "loss": 0.3521, + "step": 2256 + }, + { + "epoch": 1.9008983717012913, + "grad_norm": 0.2008480727672577, + "learning_rate": 3.559722864789187e-06, + "loss": 0.3246, + "step": 2257 + }, + { + "epoch": 1.901740595171252, + "grad_norm": 0.22643692791461945, + "learning_rate": 3.5550287509646902e-06, + "loss": 0.3798, + "step": 2258 + }, + { + "epoch": 1.9025828186412128, + "grad_norm": 0.18489694595336914, + "learning_rate": 3.5503360263693887e-06, + "loss": 0.3446, + "step": 2259 + }, + { + "epoch": 1.9034250421111736, + "grad_norm": 0.19398696720600128, + "learning_rate": 3.5456446955149783e-06, + "loss": 0.3824, + "step": 2260 + }, + { + "epoch": 1.9042672655811343, + "grad_norm": 0.2685796916484833, + "learning_rate": 3.5409547629118124e-06, + "loss": 0.3266, + "step": 2261 + }, + { + "epoch": 1.905109489051095, + "grad_norm": 0.18848860263824463, + "learning_rate": 3.5362662330689067e-06, + "loss": 0.3595, + "step": 2262 + }, + { + "epoch": 1.9059517125210554, + "grad_norm": 0.20824608206748962, + "learning_rate": 3.531579110493917e-06, + "loss": 0.3746, + "step": 2263 + }, + { + "epoch": 1.9067939359910162, + "grad_norm": 0.20500995218753815, + "learning_rate": 3.5268933996931596e-06, + "loss": 0.3158, + "step": 2264 + }, + { + "epoch": 1.907636159460977, + "grad_norm": 0.18048281967639923, + "learning_rate": 3.5222091051715803e-06, + "loss": 0.3517, + "step": 2265 + }, + { + "epoch": 1.9084783829309377, + "grad_norm": 0.21019864082336426, + "learning_rate": 3.517526231432775e-06, + "loss": 0.3784, + "step": 2266 + }, + { + "epoch": 1.9093206064008985, + "grad_norm": 0.1963644176721573, + "learning_rate": 3.512844782978963e-06, + "loss": 0.3428, + "step": 2267 + }, + { + "epoch": 1.910162829870859, + "grad_norm": 0.2139882594347, + "learning_rate": 3.5081647643110028e-06, + "loss": 0.3766, + "step": 2268 + }, + { + "epoch": 1.9110050533408196, + "grad_norm": 0.17365844547748566, + "learning_rate": 3.5034861799283713e-06, + "loss": 0.3341, + "step": 2269 + }, + { + "epoch": 1.9118472768107804, + "grad_norm": 0.18427762389183044, + "learning_rate": 3.498809034329171e-06, + "loss": 0.3435, + "step": 2270 + }, + { + "epoch": 1.9126895002807411, + "grad_norm": 0.1972460150718689, + "learning_rate": 3.4941333320101173e-06, + "loss": 0.3402, + "step": 2271 + }, + { + "epoch": 1.913531723750702, + "grad_norm": 0.22721323370933533, + "learning_rate": 3.4894590774665414e-06, + "loss": 0.3715, + "step": 2272 + }, + { + "epoch": 1.9143739472206627, + "grad_norm": 0.204066202044487, + "learning_rate": 3.48478627519238e-06, + "loss": 0.3372, + "step": 2273 + }, + { + "epoch": 1.9152161706906232, + "grad_norm": 0.21919427812099457, + "learning_rate": 3.480114929680176e-06, + "loss": 0.3593, + "step": 2274 + }, + { + "epoch": 1.916058394160584, + "grad_norm": 0.24482612311840057, + "learning_rate": 3.4754450454210686e-06, + "loss": 0.3878, + "step": 2275 + }, + { + "epoch": 1.9169006176305445, + "grad_norm": 0.21800611913204193, + "learning_rate": 3.470776626904795e-06, + "loss": 0.3533, + "step": 2276 + }, + { + "epoch": 1.9177428411005053, + "grad_norm": 0.21003371477127075, + "learning_rate": 3.466109678619681e-06, + "loss": 0.3648, + "step": 2277 + }, + { + "epoch": 1.918585064570466, + "grad_norm": 0.20837385952472687, + "learning_rate": 3.4614442050526424e-06, + "loss": 0.3303, + "step": 2278 + }, + { + "epoch": 1.9194272880404268, + "grad_norm": 0.25349435210227966, + "learning_rate": 3.4567802106891724e-06, + "loss": 0.3891, + "step": 2279 + }, + { + "epoch": 1.9202695115103874, + "grad_norm": 0.23070620000362396, + "learning_rate": 3.4521177000133456e-06, + "loss": 0.3497, + "step": 2280 + }, + { + "epoch": 1.9211117349803482, + "grad_norm": 0.21101213991641998, + "learning_rate": 3.4474566775078055e-06, + "loss": 0.3595, + "step": 2281 + }, + { + "epoch": 1.9219539584503087, + "grad_norm": 0.18892095983028412, + "learning_rate": 3.442797147653776e-06, + "loss": 0.3136, + "step": 2282 + }, + { + "epoch": 1.9227961819202695, + "grad_norm": 0.22585587203502655, + "learning_rate": 3.4381391149310294e-06, + "loss": 0.3791, + "step": 2283 + }, + { + "epoch": 1.9236384053902302, + "grad_norm": 0.21867422759532928, + "learning_rate": 3.4334825838179143e-06, + "loss": 0.3722, + "step": 2284 + }, + { + "epoch": 1.924480628860191, + "grad_norm": 0.20635460317134857, + "learning_rate": 3.4288275587913235e-06, + "loss": 0.3633, + "step": 2285 + }, + { + "epoch": 1.9253228523301515, + "grad_norm": 0.18385303020477295, + "learning_rate": 3.4241740443267112e-06, + "loss": 0.3453, + "step": 2286 + }, + { + "epoch": 1.9261650758001123, + "grad_norm": 0.2156953364610672, + "learning_rate": 3.419522044898073e-06, + "loss": 0.3443, + "step": 2287 + }, + { + "epoch": 1.9270072992700729, + "grad_norm": 0.20479954779148102, + "learning_rate": 3.414871564977951e-06, + "loss": 0.3783, + "step": 2288 + }, + { + "epoch": 1.9278495227400336, + "grad_norm": 0.1958267241716385, + "learning_rate": 3.4102226090374246e-06, + "loss": 0.3814, + "step": 2289 + }, + { + "epoch": 1.9286917462099944, + "grad_norm": 0.20148569345474243, + "learning_rate": 3.4055751815461102e-06, + "loss": 0.341, + "step": 2290 + }, + { + "epoch": 1.9295339696799552, + "grad_norm": 0.20417562127113342, + "learning_rate": 3.4009292869721516e-06, + "loss": 0.3586, + "step": 2291 + }, + { + "epoch": 1.930376193149916, + "grad_norm": 0.22307829558849335, + "learning_rate": 3.3962849297822225e-06, + "loss": 0.3653, + "step": 2292 + }, + { + "epoch": 1.9312184166198765, + "grad_norm": 0.2065296322107315, + "learning_rate": 3.3916421144415146e-06, + "loss": 0.3578, + "step": 2293 + }, + { + "epoch": 1.932060640089837, + "grad_norm": 0.17637860774993896, + "learning_rate": 3.387000845413742e-06, + "loss": 0.3092, + "step": 2294 + }, + { + "epoch": 1.9329028635597978, + "grad_norm": 0.2161666601896286, + "learning_rate": 3.3823611271611266e-06, + "loss": 0.3947, + "step": 2295 + }, + { + "epoch": 1.9337450870297586, + "grad_norm": 0.19415435194969177, + "learning_rate": 3.377722964144405e-06, + "loss": 0.3314, + "step": 2296 + }, + { + "epoch": 1.9345873104997193, + "grad_norm": 0.26362645626068115, + "learning_rate": 3.3730863608228125e-06, + "loss": 0.3795, + "step": 2297 + }, + { + "epoch": 1.93542953396968, + "grad_norm": 0.192129984498024, + "learning_rate": 3.368451321654091e-06, + "loss": 0.3675, + "step": 2298 + }, + { + "epoch": 1.9362717574396406, + "grad_norm": 0.19030117988586426, + "learning_rate": 3.363817851094473e-06, + "loss": 0.3225, + "step": 2299 + }, + { + "epoch": 1.9371139809096012, + "grad_norm": 0.2569832503795624, + "learning_rate": 3.3591859535986894e-06, + "loss": 0.3636, + "step": 2300 + }, + { + "epoch": 1.937956204379562, + "grad_norm": 0.19425804913043976, + "learning_rate": 3.35455563361995e-06, + "loss": 0.3706, + "step": 2301 + }, + { + "epoch": 1.9387984278495227, + "grad_norm": 0.18746623396873474, + "learning_rate": 3.3499268956099583e-06, + "loss": 0.3475, + "step": 2302 + }, + { + "epoch": 1.9396406513194835, + "grad_norm": 0.20306524634361267, + "learning_rate": 3.345299744018886e-06, + "loss": 0.3673, + "step": 2303 + }, + { + "epoch": 1.9404828747894443, + "grad_norm": 0.20163676142692566, + "learning_rate": 3.3406741832953893e-06, + "loss": 0.36, + "step": 2304 + }, + { + "epoch": 1.9413250982594048, + "grad_norm": 0.23225265741348267, + "learning_rate": 3.336050217886588e-06, + "loss": 0.3633, + "step": 2305 + }, + { + "epoch": 1.9421673217293656, + "grad_norm": 0.19788064062595367, + "learning_rate": 3.331427852238073e-06, + "loss": 0.3652, + "step": 2306 + }, + { + "epoch": 1.9430095451993261, + "grad_norm": 0.19441500306129456, + "learning_rate": 3.3268070907938915e-06, + "loss": 0.3412, + "step": 2307 + }, + { + "epoch": 1.9438517686692869, + "grad_norm": 0.18719305098056793, + "learning_rate": 3.3221879379965553e-06, + "loss": 0.3476, + "step": 2308 + }, + { + "epoch": 1.9446939921392477, + "grad_norm": 0.1911689043045044, + "learning_rate": 3.3175703982870232e-06, + "loss": 0.3511, + "step": 2309 + }, + { + "epoch": 1.9455362156092084, + "grad_norm": 0.18693746626377106, + "learning_rate": 3.3129544761047093e-06, + "loss": 0.3135, + "step": 2310 + }, + { + "epoch": 1.946378439079169, + "grad_norm": 0.2037518471479416, + "learning_rate": 3.3083401758874655e-06, + "loss": 0.3657, + "step": 2311 + }, + { + "epoch": 1.9472206625491297, + "grad_norm": 0.21990913152694702, + "learning_rate": 3.303727502071591e-06, + "loss": 0.3737, + "step": 2312 + }, + { + "epoch": 1.9480628860190903, + "grad_norm": 0.17429234087467194, + "learning_rate": 3.2991164590918162e-06, + "loss": 0.331, + "step": 2313 + }, + { + "epoch": 1.948905109489051, + "grad_norm": 0.18694309890270233, + "learning_rate": 3.2945070513813082e-06, + "loss": 0.3524, + "step": 2314 + }, + { + "epoch": 1.9497473329590118, + "grad_norm": 0.2149062603712082, + "learning_rate": 3.289899283371657e-06, + "loss": 0.3825, + "step": 2315 + }, + { + "epoch": 1.9505895564289726, + "grad_norm": 0.19212117791175842, + "learning_rate": 3.2852931594928804e-06, + "loss": 0.3433, + "step": 2316 + }, + { + "epoch": 1.9514317798989333, + "grad_norm": 0.1927662193775177, + "learning_rate": 3.280688684173412e-06, + "loss": 0.3673, + "step": 2317 + }, + { + "epoch": 1.952274003368894, + "grad_norm": 0.20035819709300995, + "learning_rate": 3.276085861840106e-06, + "loss": 0.3546, + "step": 2318 + }, + { + "epoch": 1.9531162268388544, + "grad_norm": 0.20155903697013855, + "learning_rate": 3.271484696918218e-06, + "loss": 0.3196, + "step": 2319 + }, + { + "epoch": 1.9539584503088152, + "grad_norm": 0.2146608829498291, + "learning_rate": 3.2668851938314217e-06, + "loss": 0.3786, + "step": 2320 + }, + { + "epoch": 1.954800673778776, + "grad_norm": 0.20258279144763947, + "learning_rate": 3.262287357001781e-06, + "loss": 0.3656, + "step": 2321 + }, + { + "epoch": 1.9556428972487367, + "grad_norm": 0.20564958453178406, + "learning_rate": 3.2576911908497695e-06, + "loss": 0.3539, + "step": 2322 + }, + { + "epoch": 1.9564851207186975, + "grad_norm": 0.20844140648841858, + "learning_rate": 3.253096699794245e-06, + "loss": 0.3663, + "step": 2323 + }, + { + "epoch": 1.957327344188658, + "grad_norm": 0.20116738975048065, + "learning_rate": 3.248503888252461e-06, + "loss": 0.3497, + "step": 2324 + }, + { + "epoch": 1.9581695676586186, + "grad_norm": 0.18670654296875, + "learning_rate": 3.2439127606400546e-06, + "loss": 0.3396, + "step": 2325 + }, + { + "epoch": 1.9590117911285794, + "grad_norm": 0.2075943797826767, + "learning_rate": 3.239323321371039e-06, + "loss": 0.3482, + "step": 2326 + }, + { + "epoch": 1.9598540145985401, + "grad_norm": 0.21039381623268127, + "learning_rate": 3.2347355748578134e-06, + "loss": 0.4028, + "step": 2327 + }, + { + "epoch": 1.960696238068501, + "grad_norm": 0.1902557611465454, + "learning_rate": 3.2301495255111426e-06, + "loss": 0.3305, + "step": 2328 + }, + { + "epoch": 1.9615384615384617, + "grad_norm": 0.210167795419693, + "learning_rate": 3.225565177740163e-06, + "loss": 0.3658, + "step": 2329 + }, + { + "epoch": 1.9623806850084222, + "grad_norm": 0.18592344224452972, + "learning_rate": 3.2209825359523717e-06, + "loss": 0.3383, + "step": 2330 + }, + { + "epoch": 1.9632229084783828, + "grad_norm": 0.19514751434326172, + "learning_rate": 3.2164016045536306e-06, + "loss": 0.3599, + "step": 2331 + }, + { + "epoch": 1.9640651319483435, + "grad_norm": 0.18985408544540405, + "learning_rate": 3.2118223879481525e-06, + "loss": 0.3548, + "step": 2332 + }, + { + "epoch": 1.9649073554183043, + "grad_norm": 0.19115465879440308, + "learning_rate": 3.2072448905385046e-06, + "loss": 0.3573, + "step": 2333 + }, + { + "epoch": 1.965749578888265, + "grad_norm": 0.20761296153068542, + "learning_rate": 3.202669116725598e-06, + "loss": 0.3612, + "step": 2334 + }, + { + "epoch": 1.9665918023582258, + "grad_norm": 0.20256203413009644, + "learning_rate": 3.1980950709086923e-06, + "loss": 0.3644, + "step": 2335 + }, + { + "epoch": 1.9674340258281864, + "grad_norm": 0.20262305438518524, + "learning_rate": 3.193522757485378e-06, + "loss": 0.3643, + "step": 2336 + }, + { + "epoch": 1.9682762492981472, + "grad_norm": 0.18650726974010468, + "learning_rate": 3.1889521808515888e-06, + "loss": 0.3278, + "step": 2337 + }, + { + "epoch": 1.9691184727681077, + "grad_norm": 0.18252995610237122, + "learning_rate": 3.1843833454015804e-06, + "loss": 0.357, + "step": 2338 + }, + { + "epoch": 1.9699606962380685, + "grad_norm": 0.2056460976600647, + "learning_rate": 3.179816255527941e-06, + "loss": 0.3342, + "step": 2339 + }, + { + "epoch": 1.9708029197080292, + "grad_norm": 0.21376261115074158, + "learning_rate": 3.1752509156215738e-06, + "loss": 0.3602, + "step": 2340 + }, + { + "epoch": 1.97164514317799, + "grad_norm": 0.18783794343471527, + "learning_rate": 3.1706873300717094e-06, + "loss": 0.3479, + "step": 2341 + }, + { + "epoch": 1.9724873666479505, + "grad_norm": 0.18589752912521362, + "learning_rate": 3.16612550326588e-06, + "loss": 0.3471, + "step": 2342 + }, + { + "epoch": 1.9733295901179113, + "grad_norm": 0.2014445960521698, + "learning_rate": 3.1615654395899377e-06, + "loss": 0.3662, + "step": 2343 + }, + { + "epoch": 1.9741718135878719, + "grad_norm": 0.19668903946876526, + "learning_rate": 3.1570071434280292e-06, + "loss": 0.3346, + "step": 2344 + }, + { + "epoch": 1.9750140370578326, + "grad_norm": 0.20105549693107605, + "learning_rate": 3.152450619162612e-06, + "loss": 0.3417, + "step": 2345 + }, + { + "epoch": 1.9758562605277934, + "grad_norm": 0.1995558738708496, + "learning_rate": 3.1478958711744324e-06, + "loss": 0.3672, + "step": 2346 + }, + { + "epoch": 1.9766984839977542, + "grad_norm": 0.19072867929935455, + "learning_rate": 3.1433429038425334e-06, + "loss": 0.3369, + "step": 2347 + }, + { + "epoch": 1.977540707467715, + "grad_norm": 0.20282776653766632, + "learning_rate": 3.1387917215442427e-06, + "loss": 0.3741, + "step": 2348 + }, + { + "epoch": 1.9783829309376755, + "grad_norm": 0.19809283316135406, + "learning_rate": 3.1342423286551756e-06, + "loss": 0.3668, + "step": 2349 + }, + { + "epoch": 1.979225154407636, + "grad_norm": 0.21220876276493073, + "learning_rate": 3.1296947295492226e-06, + "loss": 0.3635, + "step": 2350 + }, + { + "epoch": 1.9800673778775968, + "grad_norm": 0.21924105286598206, + "learning_rate": 3.125148928598554e-06, + "loss": 0.3749, + "step": 2351 + }, + { + "epoch": 1.9809096013475576, + "grad_norm": 0.19084849953651428, + "learning_rate": 3.120604930173608e-06, + "loss": 0.3156, + "step": 2352 + }, + { + "epoch": 1.9817518248175183, + "grad_norm": 0.2195017784833908, + "learning_rate": 3.116062738643092e-06, + "loss": 0.3913, + "step": 2353 + }, + { + "epoch": 1.982594048287479, + "grad_norm": 0.18741287291049957, + "learning_rate": 3.1115223583739746e-06, + "loss": 0.3305, + "step": 2354 + }, + { + "epoch": 1.9834362717574396, + "grad_norm": 0.226642906665802, + "learning_rate": 3.1069837937314846e-06, + "loss": 0.3552, + "step": 2355 + }, + { + "epoch": 1.9842784952274002, + "grad_norm": 0.2061690092086792, + "learning_rate": 3.1024470490791027e-06, + "loss": 0.3734, + "step": 2356 + }, + { + "epoch": 1.985120718697361, + "grad_norm": 0.18772165477275848, + "learning_rate": 3.097912128778563e-06, + "loss": 0.3295, + "step": 2357 + }, + { + "epoch": 1.9859629421673217, + "grad_norm": 0.18409810960292816, + "learning_rate": 3.093379037189842e-06, + "loss": 0.3459, + "step": 2358 + }, + { + "epoch": 1.9868051656372825, + "grad_norm": 0.1886821687221527, + "learning_rate": 3.0888477786711646e-06, + "loss": 0.3526, + "step": 2359 + }, + { + "epoch": 1.9876473891072433, + "grad_norm": 0.2152273952960968, + "learning_rate": 3.0843183575789824e-06, + "loss": 0.3898, + "step": 2360 + }, + { + "epoch": 1.9884896125772038, + "grad_norm": 0.17739993333816528, + "learning_rate": 3.0797907782679944e-06, + "loss": 0.3301, + "step": 2361 + }, + { + "epoch": 1.9893318360471643, + "grad_norm": 0.2642650306224823, + "learning_rate": 3.075265045091114e-06, + "loss": 0.3241, + "step": 2362 + }, + { + "epoch": 1.9901740595171251, + "grad_norm": 0.20823827385902405, + "learning_rate": 3.070741162399492e-06, + "loss": 0.415, + "step": 2363 + }, + { + "epoch": 1.9910162829870859, + "grad_norm": 0.20513373613357544, + "learning_rate": 3.0662191345424925e-06, + "loss": 0.3595, + "step": 2364 + }, + { + "epoch": 1.9918585064570467, + "grad_norm": 0.20934171974658966, + "learning_rate": 3.061698965867701e-06, + "loss": 0.3682, + "step": 2365 + }, + { + "epoch": 1.9927007299270074, + "grad_norm": 0.1961492896080017, + "learning_rate": 3.057180660720912e-06, + "loss": 0.3486, + "step": 2366 + }, + { + "epoch": 1.993542953396968, + "grad_norm": 0.1857389658689499, + "learning_rate": 3.0526642234461313e-06, + "loss": 0.3278, + "step": 2367 + }, + { + "epoch": 1.9943851768669287, + "grad_norm": 0.21698589622974396, + "learning_rate": 3.048149658385565e-06, + "loss": 0.3702, + "step": 2368 + }, + { + "epoch": 1.9952274003368893, + "grad_norm": 0.18934296071529388, + "learning_rate": 3.043636969879625e-06, + "loss": 0.359, + "step": 2369 + }, + { + "epoch": 1.99606962380685, + "grad_norm": 0.1986536830663681, + "learning_rate": 3.039126162266912e-06, + "loss": 0.3645, + "step": 2370 + }, + { + "epoch": 1.9969118472768108, + "grad_norm": 0.18482689559459686, + "learning_rate": 3.0346172398842254e-06, + "loss": 0.3557, + "step": 2371 + }, + { + "epoch": 1.9977540707467716, + "grad_norm": 0.19014687836170197, + "learning_rate": 3.0301102070665466e-06, + "loss": 0.3637, + "step": 2372 + }, + { + "epoch": 1.9985962942167321, + "grad_norm": 0.18836571276187897, + "learning_rate": 3.0256050681470446e-06, + "loss": 0.3267, + "step": 2373 + }, + { + "epoch": 1.999438517686693, + "grad_norm": 0.21261648833751678, + "learning_rate": 3.0211018274570625e-06, + "loss": 0.3664, + "step": 2374 + }, + { + "epoch": 2.0002807411566534, + "grad_norm": 0.4335678815841675, + "learning_rate": 3.0166004893261247e-06, + "loss": 0.5585, + "step": 2375 + }, + { + "epoch": 2.001122964626614, + "grad_norm": 0.18163718283176422, + "learning_rate": 3.012101058081919e-06, + "loss": 0.2978, + "step": 2376 + }, + { + "epoch": 2.001965188096575, + "grad_norm": 0.18798938393592834, + "learning_rate": 3.007603538050309e-06, + "loss": 0.3536, + "step": 2377 + }, + { + "epoch": 2.0028074115665357, + "grad_norm": 0.19393697381019592, + "learning_rate": 3.0031079335553097e-06, + "loss": 0.2928, + "step": 2378 + }, + { + "epoch": 2.0036496350364965, + "grad_norm": 0.22345486283302307, + "learning_rate": 2.9986142489191074e-06, + "loss": 0.3708, + "step": 2379 + }, + { + "epoch": 2.004491858506457, + "grad_norm": 0.20450833439826965, + "learning_rate": 2.994122488462029e-06, + "loss": 0.3271, + "step": 2380 + }, + { + "epoch": 2.0053340819764176, + "grad_norm": 0.18012557923793793, + "learning_rate": 2.989632656502564e-06, + "loss": 0.3147, + "step": 2381 + }, + { + "epoch": 2.0061763054463784, + "grad_norm": 0.1874862015247345, + "learning_rate": 2.9851447573573383e-06, + "loss": 0.3424, + "step": 2382 + }, + { + "epoch": 2.007018528916339, + "grad_norm": 0.20006859302520752, + "learning_rate": 2.980658795341125e-06, + "loss": 0.3219, + "step": 2383 + }, + { + "epoch": 2.0078607523863, + "grad_norm": 0.20945049822330475, + "learning_rate": 2.9761747747668314e-06, + "loss": 0.3499, + "step": 2384 + }, + { + "epoch": 2.0087029758562607, + "grad_norm": 0.20440226793289185, + "learning_rate": 2.971692699945502e-06, + "loss": 0.3626, + "step": 2385 + }, + { + "epoch": 2.0095451993262214, + "grad_norm": 0.19525331258773804, + "learning_rate": 2.9672125751863067e-06, + "loss": 0.3229, + "step": 2386 + }, + { + "epoch": 2.0103874227961818, + "grad_norm": 0.1915249079465866, + "learning_rate": 2.9627344047965433e-06, + "loss": 0.3042, + "step": 2387 + }, + { + "epoch": 2.0112296462661425, + "grad_norm": 0.2285671979188919, + "learning_rate": 2.958258193081629e-06, + "loss": 0.3975, + "step": 2388 + }, + { + "epoch": 2.0120718697361033, + "grad_norm": 0.17777401208877563, + "learning_rate": 2.9537839443451e-06, + "loss": 0.294, + "step": 2389 + }, + { + "epoch": 2.012914093206064, + "grad_norm": 0.21443428099155426, + "learning_rate": 2.949311662888601e-06, + "loss": 0.322, + "step": 2390 + }, + { + "epoch": 2.013756316676025, + "grad_norm": 0.205448180437088, + "learning_rate": 2.9448413530118912e-06, + "loss": 0.3371, + "step": 2391 + }, + { + "epoch": 2.0145985401459856, + "grad_norm": 0.2116105556488037, + "learning_rate": 2.94037301901283e-06, + "loss": 0.3382, + "step": 2392 + }, + { + "epoch": 2.015440763615946, + "grad_norm": 0.18419043719768524, + "learning_rate": 2.935906665187378e-06, + "loss": 0.3019, + "step": 2393 + }, + { + "epoch": 2.0162829870859067, + "grad_norm": 0.21632736921310425, + "learning_rate": 2.9314422958295906e-06, + "loss": 0.336, + "step": 2394 + }, + { + "epoch": 2.0171252105558675, + "grad_norm": 0.20205894112586975, + "learning_rate": 2.9269799152316226e-06, + "loss": 0.3463, + "step": 2395 + }, + { + "epoch": 2.0179674340258282, + "grad_norm": 0.20102480053901672, + "learning_rate": 2.922519527683706e-06, + "loss": 0.3536, + "step": 2396 + }, + { + "epoch": 2.018809657495789, + "grad_norm": 0.18754532933235168, + "learning_rate": 2.9180611374741623e-06, + "loss": 0.308, + "step": 2397 + }, + { + "epoch": 2.0196518809657498, + "grad_norm": 0.17986422777175903, + "learning_rate": 2.913604748889395e-06, + "loss": 0.3044, + "step": 2398 + }, + { + "epoch": 2.02049410443571, + "grad_norm": 0.20123037695884705, + "learning_rate": 2.9091503662138764e-06, + "loss": 0.3692, + "step": 2399 + }, + { + "epoch": 2.021336327905671, + "grad_norm": 0.1836530864238739, + "learning_rate": 2.904697993730159e-06, + "loss": 0.3328, + "step": 2400 + }, + { + "epoch": 2.0221785513756316, + "grad_norm": 0.19825614988803864, + "learning_rate": 2.900247635718856e-06, + "loss": 0.3433, + "step": 2401 + }, + { + "epoch": 2.0230207748455924, + "grad_norm": 0.17435987293720245, + "learning_rate": 2.8957992964586445e-06, + "loss": 0.2942, + "step": 2402 + }, + { + "epoch": 2.023862998315553, + "grad_norm": 0.19725970923900604, + "learning_rate": 2.891352980226262e-06, + "loss": 0.342, + "step": 2403 + }, + { + "epoch": 2.024705221785514, + "grad_norm": 0.16922511160373688, + "learning_rate": 2.886908691296504e-06, + "loss": 0.288, + "step": 2404 + }, + { + "epoch": 2.0255474452554743, + "grad_norm": 0.20383970439434052, + "learning_rate": 2.8824664339422115e-06, + "loss": 0.322, + "step": 2405 + }, + { + "epoch": 2.026389668725435, + "grad_norm": 0.1849295198917389, + "learning_rate": 2.8780262124342755e-06, + "loss": 0.2902, + "step": 2406 + }, + { + "epoch": 2.027231892195396, + "grad_norm": 0.20193511247634888, + "learning_rate": 2.873588031041627e-06, + "loss": 0.3766, + "step": 2407 + }, + { + "epoch": 2.0280741156653566, + "grad_norm": 0.19835780560970306, + "learning_rate": 2.8691518940312413e-06, + "loss": 0.3577, + "step": 2408 + }, + { + "epoch": 2.0289163391353173, + "grad_norm": 0.1865420639514923, + "learning_rate": 2.8647178056681197e-06, + "loss": 0.3205, + "step": 2409 + }, + { + "epoch": 2.029758562605278, + "grad_norm": 0.19542618095874786, + "learning_rate": 2.8602857702153054e-06, + "loss": 0.3441, + "step": 2410 + }, + { + "epoch": 2.0306007860752384, + "grad_norm": 0.20656581223011017, + "learning_rate": 2.8558557919338537e-06, + "loss": 0.3295, + "step": 2411 + }, + { + "epoch": 2.031443009545199, + "grad_norm": 0.20013107359409332, + "learning_rate": 2.8514278750828537e-06, + "loss": 0.3104, + "step": 2412 + }, + { + "epoch": 2.03228523301516, + "grad_norm": 0.19680936634540558, + "learning_rate": 2.847002023919406e-06, + "loss": 0.3641, + "step": 2413 + }, + { + "epoch": 2.0331274564851207, + "grad_norm": 0.20290818810462952, + "learning_rate": 2.8425782426986304e-06, + "loss": 0.3606, + "step": 2414 + }, + { + "epoch": 2.0339696799550815, + "grad_norm": 0.2051219791173935, + "learning_rate": 2.838156535673652e-06, + "loss": 0.3302, + "step": 2415 + }, + { + "epoch": 2.0348119034250423, + "grad_norm": 0.1948593556880951, + "learning_rate": 2.833736907095604e-06, + "loss": 0.3358, + "step": 2416 + }, + { + "epoch": 2.035654126895003, + "grad_norm": 0.19389194250106812, + "learning_rate": 2.8293193612136183e-06, + "loss": 0.3352, + "step": 2417 + }, + { + "epoch": 2.0364963503649633, + "grad_norm": 0.19726473093032837, + "learning_rate": 2.8249039022748315e-06, + "loss": 0.3391, + "step": 2418 + }, + { + "epoch": 2.037338573834924, + "grad_norm": 0.19745996594429016, + "learning_rate": 2.8204905345243664e-06, + "loss": 0.3313, + "step": 2419 + }, + { + "epoch": 2.038180797304885, + "grad_norm": 0.18009109795093536, + "learning_rate": 2.816079262205339e-06, + "loss": 0.3175, + "step": 2420 + }, + { + "epoch": 2.0390230207748457, + "grad_norm": 0.18453653156757355, + "learning_rate": 2.8116700895588473e-06, + "loss": 0.3362, + "step": 2421 + }, + { + "epoch": 2.0398652442448064, + "grad_norm": 0.18108515441417694, + "learning_rate": 2.807263020823977e-06, + "loss": 0.3319, + "step": 2422 + }, + { + "epoch": 2.040707467714767, + "grad_norm": 0.18968205153942108, + "learning_rate": 2.8028580602377852e-06, + "loss": 0.3198, + "step": 2423 + }, + { + "epoch": 2.0415496911847275, + "grad_norm": 0.20090173184871674, + "learning_rate": 2.798455212035305e-06, + "loss": 0.3436, + "step": 2424 + }, + { + "epoch": 2.0423919146546883, + "grad_norm": 0.2093648612499237, + "learning_rate": 2.7940544804495345e-06, + "loss": 0.3239, + "step": 2425 + }, + { + "epoch": 2.043234138124649, + "grad_norm": 0.18286184966564178, + "learning_rate": 2.789655869711445e-06, + "loss": 0.298, + "step": 2426 + }, + { + "epoch": 2.04407636159461, + "grad_norm": 0.21552641689777374, + "learning_rate": 2.785259384049959e-06, + "loss": 0.3602, + "step": 2427 + }, + { + "epoch": 2.0449185850645706, + "grad_norm": 0.1874585598707199, + "learning_rate": 2.780865027691968e-06, + "loss": 0.3115, + "step": 2428 + }, + { + "epoch": 2.0457608085345313, + "grad_norm": 0.18681229650974274, + "learning_rate": 2.7764728048623003e-06, + "loss": 0.348, + "step": 2429 + }, + { + "epoch": 2.0466030320044917, + "grad_norm": 0.18479189276695251, + "learning_rate": 2.7720827197837475e-06, + "loss": 0.3516, + "step": 2430 + }, + { + "epoch": 2.0474452554744524, + "grad_norm": 0.20195545256137848, + "learning_rate": 2.7676947766770367e-06, + "loss": 0.3282, + "step": 2431 + }, + { + "epoch": 2.048287478944413, + "grad_norm": 0.19953538477420807, + "learning_rate": 2.7633089797608435e-06, + "loss": 0.3537, + "step": 2432 + }, + { + "epoch": 2.049129702414374, + "grad_norm": 0.17841529846191406, + "learning_rate": 2.7589253332517736e-06, + "loss": 0.314, + "step": 2433 + }, + { + "epoch": 2.0499719258843347, + "grad_norm": 0.2024816870689392, + "learning_rate": 2.7545438413643666e-06, + "loss": 0.3343, + "step": 2434 + }, + { + "epoch": 2.0508141493542955, + "grad_norm": 0.2072976976633072, + "learning_rate": 2.7501645083110893e-06, + "loss": 0.3357, + "step": 2435 + }, + { + "epoch": 2.051656372824256, + "grad_norm": 0.19924525916576385, + "learning_rate": 2.745787338302341e-06, + "loss": 0.3382, + "step": 2436 + }, + { + "epoch": 2.0524985962942166, + "grad_norm": 0.18899208307266235, + "learning_rate": 2.741412335546431e-06, + "loss": 0.3391, + "step": 2437 + }, + { + "epoch": 2.0533408197641774, + "grad_norm": 0.183904767036438, + "learning_rate": 2.7370395042495913e-06, + "loss": 0.3212, + "step": 2438 + }, + { + "epoch": 2.054183043234138, + "grad_norm": 0.17699691653251648, + "learning_rate": 2.7326688486159613e-06, + "loss": 0.3036, + "step": 2439 + }, + { + "epoch": 2.055025266704099, + "grad_norm": 0.19395457208156586, + "learning_rate": 2.7283003728475952e-06, + "loss": 0.3218, + "step": 2440 + }, + { + "epoch": 2.0558674901740597, + "grad_norm": 0.19687485694885254, + "learning_rate": 2.7239340811444476e-06, + "loss": 0.3498, + "step": 2441 + }, + { + "epoch": 2.05670971364402, + "grad_norm": 0.18807673454284668, + "learning_rate": 2.7195699777043723e-06, + "loss": 0.3553, + "step": 2442 + }, + { + "epoch": 2.0575519371139808, + "grad_norm": 0.1877565085887909, + "learning_rate": 2.7152080667231185e-06, + "loss": 0.3251, + "step": 2443 + }, + { + "epoch": 2.0583941605839415, + "grad_norm": 0.19361265003681183, + "learning_rate": 2.710848352394334e-06, + "loss": 0.3404, + "step": 2444 + }, + { + "epoch": 2.0592363840539023, + "grad_norm": 0.1828143149614334, + "learning_rate": 2.706490838909547e-06, + "loss": 0.2938, + "step": 2445 + }, + { + "epoch": 2.060078607523863, + "grad_norm": 0.20654404163360596, + "learning_rate": 2.7021355304581765e-06, + "loss": 0.3361, + "step": 2446 + }, + { + "epoch": 2.060920830993824, + "grad_norm": 0.2480696141719818, + "learning_rate": 2.6977824312275123e-06, + "loss": 0.3332, + "step": 2447 + }, + { + "epoch": 2.0617630544637846, + "grad_norm": 0.18790100514888763, + "learning_rate": 2.6934315454027323e-06, + "loss": 0.3492, + "step": 2448 + }, + { + "epoch": 2.062605277933745, + "grad_norm": 0.18553946912288666, + "learning_rate": 2.6890828771668742e-06, + "loss": 0.3542, + "step": 2449 + }, + { + "epoch": 2.0634475014037057, + "grad_norm": 0.1775999516248703, + "learning_rate": 2.684736430700854e-06, + "loss": 0.3335, + "step": 2450 + }, + { + "epoch": 2.0642897248736665, + "grad_norm": 0.1797809600830078, + "learning_rate": 2.680392210183446e-06, + "loss": 0.324, + "step": 2451 + }, + { + "epoch": 2.0651319483436272, + "grad_norm": 0.1839604526758194, + "learning_rate": 2.6760502197912842e-06, + "loss": 0.3312, + "step": 2452 + }, + { + "epoch": 2.065974171813588, + "grad_norm": 0.20131711661815643, + "learning_rate": 2.671710463698859e-06, + "loss": 0.3376, + "step": 2453 + }, + { + "epoch": 2.0668163952835488, + "grad_norm": 0.1908835917711258, + "learning_rate": 2.6673729460785174e-06, + "loss": 0.2927, + "step": 2454 + }, + { + "epoch": 2.067658618753509, + "grad_norm": 0.1810787320137024, + "learning_rate": 2.663037671100448e-06, + "loss": 0.3633, + "step": 2455 + }, + { + "epoch": 2.06850084222347, + "grad_norm": 0.17869506776332855, + "learning_rate": 2.6587046429326855e-06, + "loss": 0.3156, + "step": 2456 + }, + { + "epoch": 2.0693430656934306, + "grad_norm": 0.17782184481620789, + "learning_rate": 2.6543738657411033e-06, + "loss": 0.3061, + "step": 2457 + }, + { + "epoch": 2.0701852891633914, + "grad_norm": 0.19159172475337982, + "learning_rate": 2.6500453436894157e-06, + "loss": 0.3426, + "step": 2458 + }, + { + "epoch": 2.071027512633352, + "grad_norm": 0.1826389580965042, + "learning_rate": 2.6457190809391627e-06, + "loss": 0.3113, + "step": 2459 + }, + { + "epoch": 2.071869736103313, + "grad_norm": 0.17592735588550568, + "learning_rate": 2.6413950816497146e-06, + "loss": 0.344, + "step": 2460 + }, + { + "epoch": 2.0727119595732733, + "grad_norm": 0.18362398445606232, + "learning_rate": 2.6370733499782654e-06, + "loss": 0.3353, + "step": 2461 + }, + { + "epoch": 2.073554183043234, + "grad_norm": 0.18338549137115479, + "learning_rate": 2.6327538900798306e-06, + "loss": 0.2975, + "step": 2462 + }, + { + "epoch": 2.074396406513195, + "grad_norm": 0.18559111654758453, + "learning_rate": 2.628436706107238e-06, + "loss": 0.3489, + "step": 2463 + }, + { + "epoch": 2.0752386299831556, + "grad_norm": 0.20324550569057465, + "learning_rate": 2.6241218022111336e-06, + "loss": 0.3407, + "step": 2464 + }, + { + "epoch": 2.0760808534531163, + "grad_norm": 0.19184359908103943, + "learning_rate": 2.6198091825399606e-06, + "loss": 0.3328, + "step": 2465 + }, + { + "epoch": 2.076923076923077, + "grad_norm": 0.19564636051654816, + "learning_rate": 2.6154988512399784e-06, + "loss": 0.3096, + "step": 2466 + }, + { + "epoch": 2.0777653003930374, + "grad_norm": 0.18796582520008087, + "learning_rate": 2.6111908124552355e-06, + "loss": 0.3161, + "step": 2467 + }, + { + "epoch": 2.078607523862998, + "grad_norm": 0.18963384628295898, + "learning_rate": 2.6068850703275856e-06, + "loss": 0.3167, + "step": 2468 + }, + { + "epoch": 2.079449747332959, + "grad_norm": 0.20601922273635864, + "learning_rate": 2.6025816289966703e-06, + "loss": 0.3675, + "step": 2469 + }, + { + "epoch": 2.0802919708029197, + "grad_norm": 0.17115628719329834, + "learning_rate": 2.598280492599913e-06, + "loss": 0.3148, + "step": 2470 + }, + { + "epoch": 2.0811341942728805, + "grad_norm": 0.1847679615020752, + "learning_rate": 2.5939816652725324e-06, + "loss": 0.337, + "step": 2471 + }, + { + "epoch": 2.0819764177428413, + "grad_norm": 0.1880049705505371, + "learning_rate": 2.5896851511475184e-06, + "loss": 0.3443, + "step": 2472 + }, + { + "epoch": 2.0828186412128016, + "grad_norm": 0.1756199449300766, + "learning_rate": 2.5853909543556444e-06, + "loss": 0.2916, + "step": 2473 + }, + { + "epoch": 2.0836608646827623, + "grad_norm": 0.19857759773731232, + "learning_rate": 2.5810990790254486e-06, + "loss": 0.352, + "step": 2474 + }, + { + "epoch": 2.084503088152723, + "grad_norm": 0.17780746519565582, + "learning_rate": 2.5768095292832412e-06, + "loss": 0.3154, + "step": 2475 + }, + { + "epoch": 2.085345311622684, + "grad_norm": 0.1851738542318344, + "learning_rate": 2.5725223092530937e-06, + "loss": 0.33, + "step": 2476 + }, + { + "epoch": 2.0861875350926447, + "grad_norm": 0.18334592878818512, + "learning_rate": 2.568237423056844e-06, + "loss": 0.3127, + "step": 2477 + }, + { + "epoch": 2.0870297585626054, + "grad_norm": 0.19777873158454895, + "learning_rate": 2.5639548748140803e-06, + "loss": 0.3236, + "step": 2478 + }, + { + "epoch": 2.087871982032566, + "grad_norm": 0.19690145552158356, + "learning_rate": 2.5596746686421436e-06, + "loss": 0.3272, + "step": 2479 + }, + { + "epoch": 2.0887142055025265, + "grad_norm": 0.20011959969997406, + "learning_rate": 2.5553968086561244e-06, + "loss": 0.3663, + "step": 2480 + }, + { + "epoch": 2.0895564289724873, + "grad_norm": 0.18417225778102875, + "learning_rate": 2.5511212989688587e-06, + "loss": 0.2957, + "step": 2481 + }, + { + "epoch": 2.090398652442448, + "grad_norm": 0.18653437495231628, + "learning_rate": 2.546848143690922e-06, + "loss": 0.3384, + "step": 2482 + }, + { + "epoch": 2.091240875912409, + "grad_norm": 0.1805136352777481, + "learning_rate": 2.5425773469306247e-06, + "loss": 0.3047, + "step": 2483 + }, + { + "epoch": 2.0920830993823696, + "grad_norm": 0.19548816978931427, + "learning_rate": 2.5383089127940087e-06, + "loss": 0.3162, + "step": 2484 + }, + { + "epoch": 2.0929253228523303, + "grad_norm": 0.1792110949754715, + "learning_rate": 2.534042845384851e-06, + "loss": 0.2979, + "step": 2485 + }, + { + "epoch": 2.0937675463222907, + "grad_norm": 0.19322973489761353, + "learning_rate": 2.5297791488046445e-06, + "loss": 0.3261, + "step": 2486 + }, + { + "epoch": 2.0946097697922514, + "grad_norm": 0.19382603466510773, + "learning_rate": 2.525517827152614e-06, + "loss": 0.3536, + "step": 2487 + }, + { + "epoch": 2.095451993262212, + "grad_norm": 0.18523024022579193, + "learning_rate": 2.5212588845256837e-06, + "loss": 0.3117, + "step": 2488 + }, + { + "epoch": 2.096294216732173, + "grad_norm": 0.2027691900730133, + "learning_rate": 2.517002325018508e-06, + "loss": 0.32, + "step": 2489 + }, + { + "epoch": 2.0971364402021337, + "grad_norm": 0.19315862655639648, + "learning_rate": 2.5127481527234397e-06, + "loss": 0.3182, + "step": 2490 + }, + { + "epoch": 2.0979786636720945, + "grad_norm": 0.20692229270935059, + "learning_rate": 2.508496371730543e-06, + "loss": 0.4019, + "step": 2491 + }, + { + "epoch": 2.098820887142055, + "grad_norm": 0.16789032518863678, + "learning_rate": 2.5042469861275768e-06, + "loss": 0.2814, + "step": 2492 + }, + { + "epoch": 2.0996631106120156, + "grad_norm": 0.1847231537103653, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.3051, + "step": 2493 + }, + { + "epoch": 2.1005053340819764, + "grad_norm": 0.1905345767736435, + "learning_rate": 2.4957554174309655e-06, + "loss": 0.3466, + "step": 2494 + }, + { + "epoch": 2.101347557551937, + "grad_norm": 0.20040138065814972, + "learning_rate": 2.491513242501315e-06, + "loss": 0.3337, + "step": 2495 + }, + { + "epoch": 2.102189781021898, + "grad_norm": 0.1926167607307434, + "learning_rate": 2.487273479289574e-06, + "loss": 0.31, + "step": 2496 + }, + { + "epoch": 2.1030320044918587, + "grad_norm": 0.18961243331432343, + "learning_rate": 2.4830361318719493e-06, + "loss": 0.343, + "step": 2497 + }, + { + "epoch": 2.103874227961819, + "grad_norm": 0.16875000298023224, + "learning_rate": 2.4788012043223253e-06, + "loss": 0.2956, + "step": 2498 + }, + { + "epoch": 2.1047164514317798, + "grad_norm": 0.23073846101760864, + "learning_rate": 2.4745687007122636e-06, + "loss": 0.3803, + "step": 2499 + }, + { + "epoch": 2.1055586749017405, + "grad_norm": 0.2010485678911209, + "learning_rate": 2.470338625110991e-06, + "loss": 0.3403, + "step": 2500 + }, + { + "epoch": 2.1064008983717013, + "grad_norm": 0.18841859698295593, + "learning_rate": 2.4661109815854005e-06, + "loss": 0.3306, + "step": 2501 + }, + { + "epoch": 2.107243121841662, + "grad_norm": 0.19118495285511017, + "learning_rate": 2.4618857742000463e-06, + "loss": 0.3179, + "step": 2502 + }, + { + "epoch": 2.108085345311623, + "grad_norm": 0.19626981019973755, + "learning_rate": 2.4576630070171447e-06, + "loss": 0.3228, + "step": 2503 + }, + { + "epoch": 2.108927568781583, + "grad_norm": 0.18817634880542755, + "learning_rate": 2.4534426840965604e-06, + "loss": 0.3178, + "step": 2504 + }, + { + "epoch": 2.109769792251544, + "grad_norm": 0.1974536031484604, + "learning_rate": 2.449224809495815e-06, + "loss": 0.3492, + "step": 2505 + }, + { + "epoch": 2.1106120157215047, + "grad_norm": 0.17316868901252747, + "learning_rate": 2.4450093872700648e-06, + "loss": 0.3129, + "step": 2506 + }, + { + "epoch": 2.1114542391914655, + "grad_norm": 0.1934424191713333, + "learning_rate": 2.440796421472122e-06, + "loss": 0.341, + "step": 2507 + }, + { + "epoch": 2.1122964626614262, + "grad_norm": 0.17776821553707123, + "learning_rate": 2.436585916152426e-06, + "loss": 0.336, + "step": 2508 + }, + { + "epoch": 2.113138686131387, + "grad_norm": 0.18335790932178497, + "learning_rate": 2.4323778753590582e-06, + "loss": 0.356, + "step": 2509 + }, + { + "epoch": 2.1139809096013478, + "grad_norm": 0.18221352994441986, + "learning_rate": 2.4281723031377275e-06, + "loss": 0.3176, + "step": 2510 + }, + { + "epoch": 2.114823133071308, + "grad_norm": 0.19676458835601807, + "learning_rate": 2.423969203531768e-06, + "loss": 0.3308, + "step": 2511 + }, + { + "epoch": 2.115665356541269, + "grad_norm": 0.18670789897441864, + "learning_rate": 2.419768580582137e-06, + "loss": 0.3109, + "step": 2512 + }, + { + "epoch": 2.1165075800112296, + "grad_norm": 0.19073185324668884, + "learning_rate": 2.4155704383274154e-06, + "loss": 0.343, + "step": 2513 + }, + { + "epoch": 2.1173498034811904, + "grad_norm": 0.20621013641357422, + "learning_rate": 2.411374780803793e-06, + "loss": 0.36, + "step": 2514 + }, + { + "epoch": 2.118192026951151, + "grad_norm": 0.19015243649482727, + "learning_rate": 2.4071816120450742e-06, + "loss": 0.3454, + "step": 2515 + }, + { + "epoch": 2.119034250421112, + "grad_norm": 0.17132285237312317, + "learning_rate": 2.402990936082667e-06, + "loss": 0.3016, + "step": 2516 + }, + { + "epoch": 2.1198764738910723, + "grad_norm": 0.18261852860450745, + "learning_rate": 2.3988027569455895e-06, + "loss": 0.355, + "step": 2517 + }, + { + "epoch": 2.120718697361033, + "grad_norm": 0.18323764204978943, + "learning_rate": 2.3946170786604526e-06, + "loss": 0.3355, + "step": 2518 + }, + { + "epoch": 2.121560920830994, + "grad_norm": 0.18560270965099335, + "learning_rate": 2.390433905251467e-06, + "loss": 0.3491, + "step": 2519 + }, + { + "epoch": 2.1224031443009546, + "grad_norm": 0.19938194751739502, + "learning_rate": 2.3862532407404306e-06, + "loss": 0.3359, + "step": 2520 + }, + { + "epoch": 2.1232453677709153, + "grad_norm": 0.17980343103408813, + "learning_rate": 2.3820750891467355e-06, + "loss": 0.3232, + "step": 2521 + }, + { + "epoch": 2.124087591240876, + "grad_norm": 0.19313830137252808, + "learning_rate": 2.377899454487351e-06, + "loss": 0.328, + "step": 2522 + }, + { + "epoch": 2.1249298147108364, + "grad_norm": 0.17931406199932098, + "learning_rate": 2.373726340776837e-06, + "loss": 0.3344, + "step": 2523 + }, + { + "epoch": 2.125772038180797, + "grad_norm": 0.20262593030929565, + "learning_rate": 2.369555752027313e-06, + "loss": 0.3525, + "step": 2524 + }, + { + "epoch": 2.126614261650758, + "grad_norm": 0.20063352584838867, + "learning_rate": 2.365387692248488e-06, + "loss": 0.3515, + "step": 2525 + }, + { + "epoch": 2.1274564851207187, + "grad_norm": 0.2100335657596588, + "learning_rate": 2.361222165447628e-06, + "loss": 0.3118, + "step": 2526 + }, + { + "epoch": 2.1282987085906795, + "grad_norm": 0.1892918199300766, + "learning_rate": 2.3570591756295717e-06, + "loss": 0.3397, + "step": 2527 + }, + { + "epoch": 2.1291409320606403, + "grad_norm": 0.1774381697177887, + "learning_rate": 2.3528987267967135e-06, + "loss": 0.2957, + "step": 2528 + }, + { + "epoch": 2.1299831555306006, + "grad_norm": 0.1967756599187851, + "learning_rate": 2.348740822949006e-06, + "loss": 0.3509, + "step": 2529 + }, + { + "epoch": 2.1308253790005613, + "grad_norm": 0.1812041848897934, + "learning_rate": 2.3445854680839534e-06, + "loss": 0.3119, + "step": 2530 + }, + { + "epoch": 2.131667602470522, + "grad_norm": 0.16971538960933685, + "learning_rate": 2.3404326661966148e-06, + "loss": 0.3074, + "step": 2531 + }, + { + "epoch": 2.132509825940483, + "grad_norm": 0.18924599885940552, + "learning_rate": 2.33628242127959e-06, + "loss": 0.3577, + "step": 2532 + }, + { + "epoch": 2.1333520494104437, + "grad_norm": 0.18778833746910095, + "learning_rate": 2.33213473732302e-06, + "loss": 0.3468, + "step": 2533 + }, + { + "epoch": 2.1341942728804044, + "grad_norm": 0.1919918656349182, + "learning_rate": 2.3279896183145857e-06, + "loss": 0.3034, + "step": 2534 + }, + { + "epoch": 2.1350364963503647, + "grad_norm": 0.180246502161026, + "learning_rate": 2.323847068239504e-06, + "loss": 0.3239, + "step": 2535 + }, + { + "epoch": 2.1358787198203255, + "grad_norm": 0.1909414827823639, + "learning_rate": 2.319707091080517e-06, + "loss": 0.3546, + "step": 2536 + }, + { + "epoch": 2.1367209432902863, + "grad_norm": 0.20118744671344757, + "learning_rate": 2.3155696908178974e-06, + "loss": 0.328, + "step": 2537 + }, + { + "epoch": 2.137563166760247, + "grad_norm": 0.18613122403621674, + "learning_rate": 2.3114348714294355e-06, + "loss": 0.3143, + "step": 2538 + }, + { + "epoch": 2.138405390230208, + "grad_norm": 0.19210347533226013, + "learning_rate": 2.3073026368904478e-06, + "loss": 0.3156, + "step": 2539 + }, + { + "epoch": 2.1392476137001686, + "grad_norm": 0.17713823914527893, + "learning_rate": 2.3031729911737576e-06, + "loss": 0.3152, + "step": 2540 + }, + { + "epoch": 2.1400898371701293, + "grad_norm": 0.19556507468223572, + "learning_rate": 2.2990459382497086e-06, + "loss": 0.3505, + "step": 2541 + }, + { + "epoch": 2.1409320606400897, + "grad_norm": 0.18178249895572662, + "learning_rate": 2.2949214820861403e-06, + "loss": 0.3466, + "step": 2542 + }, + { + "epoch": 2.1417742841100504, + "grad_norm": 0.17022491991519928, + "learning_rate": 2.290799626648402e-06, + "loss": 0.3005, + "step": 2543 + }, + { + "epoch": 2.142616507580011, + "grad_norm": 0.20459839701652527, + "learning_rate": 2.2866803758993446e-06, + "loss": 0.3601, + "step": 2544 + }, + { + "epoch": 2.143458731049972, + "grad_norm": 0.19675502181053162, + "learning_rate": 2.2825637337993094e-06, + "loss": 0.3271, + "step": 2545 + }, + { + "epoch": 2.1443009545199327, + "grad_norm": 0.21750758588314056, + "learning_rate": 2.2784497043061384e-06, + "loss": 0.3476, + "step": 2546 + }, + { + "epoch": 2.1451431779898935, + "grad_norm": 0.18072174489498138, + "learning_rate": 2.274338291375147e-06, + "loss": 0.3143, + "step": 2547 + }, + { + "epoch": 2.145985401459854, + "grad_norm": 0.1931690126657486, + "learning_rate": 2.2702294989591513e-06, + "loss": 0.362, + "step": 2548 + }, + { + "epoch": 2.1468276249298146, + "grad_norm": 0.18842744827270508, + "learning_rate": 2.266123331008436e-06, + "loss": 0.3127, + "step": 2549 + }, + { + "epoch": 2.1476698483997754, + "grad_norm": 0.22671958804130554, + "learning_rate": 2.262019791470772e-06, + "loss": 0.3548, + "step": 2550 + }, + { + "epoch": 2.148512071869736, + "grad_norm": 0.19913806021213531, + "learning_rate": 2.257918884291392e-06, + "loss": 0.3126, + "step": 2551 + }, + { + "epoch": 2.149354295339697, + "grad_norm": 0.17924943566322327, + "learning_rate": 2.253820613413009e-06, + "loss": 0.3431, + "step": 2552 + }, + { + "epoch": 2.1501965188096577, + "grad_norm": 0.19935238361358643, + "learning_rate": 2.2497249827757933e-06, + "loss": 0.3337, + "step": 2553 + }, + { + "epoch": 2.151038742279618, + "grad_norm": 0.18192946910858154, + "learning_rate": 2.245631996317384e-06, + "loss": 0.2829, + "step": 2554 + }, + { + "epoch": 2.1518809657495788, + "grad_norm": 0.20926830172538757, + "learning_rate": 2.2415416579728714e-06, + "loss": 0.37, + "step": 2555 + }, + { + "epoch": 2.1527231892195395, + "grad_norm": 0.18061847984790802, + "learning_rate": 2.2374539716748034e-06, + "loss": 0.3359, + "step": 2556 + }, + { + "epoch": 2.1535654126895003, + "grad_norm": 0.1798250526189804, + "learning_rate": 2.233368941353175e-06, + "loss": 0.3111, + "step": 2557 + }, + { + "epoch": 2.154407636159461, + "grad_norm": 0.1831042319536209, + "learning_rate": 2.2292865709354346e-06, + "loss": 0.337, + "step": 2558 + }, + { + "epoch": 2.155249859629422, + "grad_norm": 0.18504156172275543, + "learning_rate": 2.225206864346465e-06, + "loss": 0.331, + "step": 2559 + }, + { + "epoch": 2.156092083099382, + "grad_norm": 0.2094961702823639, + "learning_rate": 2.221129825508593e-06, + "loss": 0.3424, + "step": 2560 + }, + { + "epoch": 2.156934306569343, + "grad_norm": 0.16761691868305206, + "learning_rate": 2.2170554583415782e-06, + "loss": 0.2776, + "step": 2561 + }, + { + "epoch": 2.1577765300393037, + "grad_norm": 0.17078322172164917, + "learning_rate": 2.2129837667626147e-06, + "loss": 0.348, + "step": 2562 + }, + { + "epoch": 2.1586187535092645, + "grad_norm": 0.18788252770900726, + "learning_rate": 2.2089147546863187e-06, + "loss": 0.3638, + "step": 2563 + }, + { + "epoch": 2.1594609769792252, + "grad_norm": 0.1982835978269577, + "learning_rate": 2.20484842602474e-06, + "loss": 0.3234, + "step": 2564 + }, + { + "epoch": 2.160303200449186, + "grad_norm": 0.18814495205879211, + "learning_rate": 2.2007847846873342e-06, + "loss": 0.3536, + "step": 2565 + }, + { + "epoch": 2.1611454239191463, + "grad_norm": 0.16890032589435577, + "learning_rate": 2.196723834580987e-06, + "loss": 0.305, + "step": 2566 + }, + { + "epoch": 2.161987647389107, + "grad_norm": 0.1856263428926468, + "learning_rate": 2.1926655796099873e-06, + "loss": 0.3459, + "step": 2567 + }, + { + "epoch": 2.162829870859068, + "grad_norm": 0.19124473631381989, + "learning_rate": 2.188610023676041e-06, + "loss": 0.3144, + "step": 2568 + }, + { + "epoch": 2.1636720943290286, + "grad_norm": 0.21970148384571075, + "learning_rate": 2.1845571706782486e-06, + "loss": 0.3411, + "step": 2569 + }, + { + "epoch": 2.1645143177989894, + "grad_norm": 0.20076794922351837, + "learning_rate": 2.1805070245131234e-06, + "loss": 0.3265, + "step": 2570 + }, + { + "epoch": 2.16535654126895, + "grad_norm": 0.18565763533115387, + "learning_rate": 2.176459589074566e-06, + "loss": 0.2936, + "step": 2571 + }, + { + "epoch": 2.166198764738911, + "grad_norm": 0.20779716968536377, + "learning_rate": 2.17241486825388e-06, + "loss": 0.3521, + "step": 2572 + }, + { + "epoch": 2.1670409882088713, + "grad_norm": 0.18811194598674774, + "learning_rate": 2.1683728659397517e-06, + "loss": 0.2978, + "step": 2573 + }, + { + "epoch": 2.167883211678832, + "grad_norm": 0.19322511553764343, + "learning_rate": 2.164333586018259e-06, + "loss": 0.3225, + "step": 2574 + }, + { + "epoch": 2.168725435148793, + "grad_norm": 0.19798986613750458, + "learning_rate": 2.160297032372857e-06, + "loss": 0.3435, + "step": 2575 + }, + { + "epoch": 2.1695676586187536, + "grad_norm": 0.19660674035549164, + "learning_rate": 2.156263208884386e-06, + "loss": 0.3775, + "step": 2576 + }, + { + "epoch": 2.1704098820887143, + "grad_norm": 0.18910837173461914, + "learning_rate": 2.1522321194310577e-06, + "loss": 0.3182, + "step": 2577 + }, + { + "epoch": 2.171252105558675, + "grad_norm": 0.18349318206310272, + "learning_rate": 2.148203767888455e-06, + "loss": 0.353, + "step": 2578 + }, + { + "epoch": 2.1720943290286354, + "grad_norm": 0.1835123598575592, + "learning_rate": 2.1441781581295286e-06, + "loss": 0.3201, + "step": 2579 + }, + { + "epoch": 2.172936552498596, + "grad_norm": 0.18625721335411072, + "learning_rate": 2.1401552940245962e-06, + "loss": 0.3305, + "step": 2580 + }, + { + "epoch": 2.173778775968557, + "grad_norm": 0.18136513233184814, + "learning_rate": 2.1361351794413334e-06, + "loss": 0.3301, + "step": 2581 + }, + { + "epoch": 2.1746209994385177, + "grad_norm": 0.18543168902397156, + "learning_rate": 2.132117818244771e-06, + "loss": 0.325, + "step": 2582 + }, + { + "epoch": 2.1754632229084785, + "grad_norm": 0.20783573389053345, + "learning_rate": 2.1281032142972933e-06, + "loss": 0.3577, + "step": 2583 + }, + { + "epoch": 2.1763054463784393, + "grad_norm": 0.18684588372707367, + "learning_rate": 2.124091371458638e-06, + "loss": 0.3422, + "step": 2584 + }, + { + "epoch": 2.1771476698483996, + "grad_norm": 0.21282611787319183, + "learning_rate": 2.1200822935858807e-06, + "loss": 0.3752, + "step": 2585 + }, + { + "epoch": 2.1779898933183603, + "grad_norm": 0.17147836089134216, + "learning_rate": 2.1160759845334483e-06, + "loss": 0.3291, + "step": 2586 + }, + { + "epoch": 2.178832116788321, + "grad_norm": 0.17594896256923676, + "learning_rate": 2.1120724481530937e-06, + "loss": 0.3099, + "step": 2587 + }, + { + "epoch": 2.179674340258282, + "grad_norm": 0.1904868483543396, + "learning_rate": 2.1080716882939145e-06, + "loss": 0.3487, + "step": 2588 + }, + { + "epoch": 2.1805165637282427, + "grad_norm": 0.19627432525157928, + "learning_rate": 2.1040737088023323e-06, + "loss": 0.3376, + "step": 2589 + }, + { + "epoch": 2.1813587871982034, + "grad_norm": 0.18196117877960205, + "learning_rate": 2.100078513522102e-06, + "loss": 0.2858, + "step": 2590 + }, + { + "epoch": 2.182201010668164, + "grad_norm": 0.1847255825996399, + "learning_rate": 2.0960861062942956e-06, + "loss": 0.3203, + "step": 2591 + }, + { + "epoch": 2.1830432341381245, + "grad_norm": 0.1873438060283661, + "learning_rate": 2.0920964909573065e-06, + "loss": 0.3488, + "step": 2592 + }, + { + "epoch": 2.1838854576080853, + "grad_norm": 0.18644773960113525, + "learning_rate": 2.0881096713468435e-06, + "loss": 0.3249, + "step": 2593 + }, + { + "epoch": 2.184727681078046, + "grad_norm": 0.1977187842130661, + "learning_rate": 2.0841256512959314e-06, + "loss": 0.3654, + "step": 2594 + }, + { + "epoch": 2.185569904548007, + "grad_norm": 0.18865470588207245, + "learning_rate": 2.080144434634898e-06, + "loss": 0.3108, + "step": 2595 + }, + { + "epoch": 2.1864121280179676, + "grad_norm": 0.19848677515983582, + "learning_rate": 2.0761660251913795e-06, + "loss": 0.3296, + "step": 2596 + }, + { + "epoch": 2.187254351487928, + "grad_norm": 0.3032154142856598, + "learning_rate": 2.0721904267903097e-06, + "loss": 0.3087, + "step": 2597 + }, + { + "epoch": 2.1880965749578887, + "grad_norm": 0.21433380246162415, + "learning_rate": 2.068217643253925e-06, + "loss": 0.3281, + "step": 2598 + }, + { + "epoch": 2.1889387984278494, + "grad_norm": 0.20142246782779694, + "learning_rate": 2.0642476784017507e-06, + "loss": 0.3495, + "step": 2599 + }, + { + "epoch": 2.18978102189781, + "grad_norm": 0.18407846987247467, + "learning_rate": 2.0602805360506044e-06, + "loss": 0.3514, + "step": 2600 + }, + { + "epoch": 2.190623245367771, + "grad_norm": 0.18453073501586914, + "learning_rate": 2.056316220014588e-06, + "loss": 0.3087, + "step": 2601 + }, + { + "epoch": 2.1914654688377317, + "grad_norm": 0.18886882066726685, + "learning_rate": 2.0523547341050913e-06, + "loss": 0.3196, + "step": 2602 + }, + { + "epoch": 2.1923076923076925, + "grad_norm": 0.18989959359169006, + "learning_rate": 2.0483960821307757e-06, + "loss": 0.3228, + "step": 2603 + }, + { + "epoch": 2.193149915777653, + "grad_norm": 0.20251639187335968, + "learning_rate": 2.0444402678975876e-06, + "loss": 0.3731, + "step": 2604 + }, + { + "epoch": 2.1939921392476136, + "grad_norm": 0.19764868915081024, + "learning_rate": 2.040487295208732e-06, + "loss": 0.344, + "step": 2605 + }, + { + "epoch": 2.1948343627175744, + "grad_norm": 0.19254057109355927, + "learning_rate": 2.036537167864695e-06, + "loss": 0.3603, + "step": 2606 + }, + { + "epoch": 2.195676586187535, + "grad_norm": 0.18026889860630035, + "learning_rate": 2.0325898896632178e-06, + "loss": 0.3185, + "step": 2607 + }, + { + "epoch": 2.196518809657496, + "grad_norm": 0.1734847128391266, + "learning_rate": 2.0286454643993097e-06, + "loss": 0.3448, + "step": 2608 + }, + { + "epoch": 2.1973610331274567, + "grad_norm": 0.18341229856014252, + "learning_rate": 2.024703895865232e-06, + "loss": 0.3319, + "step": 2609 + }, + { + "epoch": 2.198203256597417, + "grad_norm": 0.19103066623210907, + "learning_rate": 2.0207651878505e-06, + "loss": 0.3184, + "step": 2610 + }, + { + "epoch": 2.1990454800673778, + "grad_norm": 0.18200139701366425, + "learning_rate": 2.0168293441418798e-06, + "loss": 0.3245, + "step": 2611 + }, + { + "epoch": 2.1998877035373385, + "grad_norm": 0.21664761006832123, + "learning_rate": 2.012896368523386e-06, + "loss": 0.3056, + "step": 2612 + }, + { + "epoch": 2.2007299270072993, + "grad_norm": 0.20730926096439362, + "learning_rate": 2.0089662647762716e-06, + "loss": 0.3537, + "step": 2613 + }, + { + "epoch": 2.20157215047726, + "grad_norm": 0.18876250088214874, + "learning_rate": 2.0050390366790307e-06, + "loss": 0.3144, + "step": 2614 + }, + { + "epoch": 2.202414373947221, + "grad_norm": 0.191916361451149, + "learning_rate": 2.001114688007393e-06, + "loss": 0.3202, + "step": 2615 + }, + { + "epoch": 2.203256597417181, + "grad_norm": 0.18677039444446564, + "learning_rate": 1.997193222534316e-06, + "loss": 0.339, + "step": 2616 + }, + { + "epoch": 2.204098820887142, + "grad_norm": 0.18321190774440765, + "learning_rate": 1.9932746440299926e-06, + "loss": 0.3164, + "step": 2617 + }, + { + "epoch": 2.2049410443571027, + "grad_norm": 0.18801504373550415, + "learning_rate": 1.989358956261835e-06, + "loss": 0.3168, + "step": 2618 + }, + { + "epoch": 2.2057832678270635, + "grad_norm": 0.2072179764509201, + "learning_rate": 1.9854461629944764e-06, + "loss": 0.3332, + "step": 2619 + }, + { + "epoch": 2.2066254912970242, + "grad_norm": 0.1983804851770401, + "learning_rate": 1.981536267989766e-06, + "loss": 0.342, + "step": 2620 + }, + { + "epoch": 2.207467714766985, + "grad_norm": 0.20064833760261536, + "learning_rate": 1.977629275006772e-06, + "loss": 0.3352, + "step": 2621 + }, + { + "epoch": 2.2083099382369458, + "grad_norm": 0.19819088280200958, + "learning_rate": 1.9737251878017678e-06, + "loss": 0.312, + "step": 2622 + }, + { + "epoch": 2.209152161706906, + "grad_norm": 0.18463195860385895, + "learning_rate": 1.969824010128233e-06, + "loss": 0.3287, + "step": 2623 + }, + { + "epoch": 2.209994385176867, + "grad_norm": 0.18770988285541534, + "learning_rate": 1.9659257457368503e-06, + "loss": 0.3288, + "step": 2624 + }, + { + "epoch": 2.2108366086468276, + "grad_norm": 0.20165736973285675, + "learning_rate": 1.962030398375506e-06, + "loss": 0.352, + "step": 2625 + }, + { + "epoch": 2.2116788321167884, + "grad_norm": 0.21232005953788757, + "learning_rate": 1.9581379717892748e-06, + "loss": 0.3418, + "step": 2626 + }, + { + "epoch": 2.212521055586749, + "grad_norm": 0.17897962033748627, + "learning_rate": 1.954248469720431e-06, + "loss": 0.3076, + "step": 2627 + }, + { + "epoch": 2.2133632790567095, + "grad_norm": 0.19365070760250092, + "learning_rate": 1.950361895908427e-06, + "loss": 0.333, + "step": 2628 + }, + { + "epoch": 2.2142055025266703, + "grad_norm": 0.1850166767835617, + "learning_rate": 1.946478254089911e-06, + "loss": 0.3242, + "step": 2629 + }, + { + "epoch": 2.215047725996631, + "grad_norm": 0.18275009095668793, + "learning_rate": 1.942597547998703e-06, + "loss": 0.2998, + "step": 2630 + }, + { + "epoch": 2.215889949466592, + "grad_norm": 0.21025508642196655, + "learning_rate": 1.9387197813658092e-06, + "loss": 0.3675, + "step": 2631 + }, + { + "epoch": 2.2167321729365526, + "grad_norm": 0.20004096627235413, + "learning_rate": 1.934844957919403e-06, + "loss": 0.3285, + "step": 2632 + }, + { + "epoch": 2.2175743964065133, + "grad_norm": 0.18686963617801666, + "learning_rate": 1.9309730813848302e-06, + "loss": 0.3258, + "step": 2633 + }, + { + "epoch": 2.218416619876474, + "grad_norm": 0.17905724048614502, + "learning_rate": 1.927104155484602e-06, + "loss": 0.3276, + "step": 2634 + }, + { + "epoch": 2.2192588433464344, + "grad_norm": 0.18085958063602448, + "learning_rate": 1.923238183938398e-06, + "loss": 0.3042, + "step": 2635 + }, + { + "epoch": 2.220101066816395, + "grad_norm": 0.21374529600143433, + "learning_rate": 1.919375170463052e-06, + "loss": 0.3559, + "step": 2636 + }, + { + "epoch": 2.220943290286356, + "grad_norm": 0.18827268481254578, + "learning_rate": 1.915515118772555e-06, + "loss": 0.3277, + "step": 2637 + }, + { + "epoch": 2.2217855137563167, + "grad_norm": 0.18231862783432007, + "learning_rate": 1.9116580325780505e-06, + "loss": 0.3211, + "step": 2638 + }, + { + "epoch": 2.2226277372262775, + "grad_norm": 0.20695286989212036, + "learning_rate": 1.9078039155878338e-06, + "loss": 0.3148, + "step": 2639 + }, + { + "epoch": 2.2234699606962383, + "grad_norm": 0.17526499927043915, + "learning_rate": 1.9039527715073424e-06, + "loss": 0.2952, + "step": 2640 + }, + { + "epoch": 2.2243121841661986, + "grad_norm": 0.17523185908794403, + "learning_rate": 1.9001046040391558e-06, + "loss": 0.3309, + "step": 2641 + }, + { + "epoch": 2.2251544076361593, + "grad_norm": 0.1895107626914978, + "learning_rate": 1.8962594168829907e-06, + "loss": 0.3431, + "step": 2642 + }, + { + "epoch": 2.22599663110612, + "grad_norm": 0.19150540232658386, + "learning_rate": 1.8924172137357038e-06, + "loss": 0.3294, + "step": 2643 + }, + { + "epoch": 2.226838854576081, + "grad_norm": 0.277058482170105, + "learning_rate": 1.8885779982912756e-06, + "loss": 0.3089, + "step": 2644 + }, + { + "epoch": 2.2276810780460417, + "grad_norm": 0.18266412615776062, + "learning_rate": 1.884741774240823e-06, + "loss": 0.3234, + "step": 2645 + }, + { + "epoch": 2.2285233015160024, + "grad_norm": 0.17445224523544312, + "learning_rate": 1.8809085452725744e-06, + "loss": 0.3406, + "step": 2646 + }, + { + "epoch": 2.2293655249859627, + "grad_norm": 0.18090897798538208, + "learning_rate": 1.8770783150718913e-06, + "loss": 0.3168, + "step": 2647 + }, + { + "epoch": 2.2302077484559235, + "grad_norm": 0.1880083978176117, + "learning_rate": 1.8732510873212428e-06, + "loss": 0.3188, + "step": 2648 + }, + { + "epoch": 2.2310499719258843, + "grad_norm": 0.19110964238643646, + "learning_rate": 1.8694268657002197e-06, + "loss": 0.2974, + "step": 2649 + }, + { + "epoch": 2.231892195395845, + "grad_norm": 0.18959249556064606, + "learning_rate": 1.865605653885516e-06, + "loss": 0.3262, + "step": 2650 + }, + { + "epoch": 2.232734418865806, + "grad_norm": 0.22355952858924866, + "learning_rate": 1.8617874555509342e-06, + "loss": 0.3323, + "step": 2651 + }, + { + "epoch": 2.2335766423357666, + "grad_norm": 0.21287395060062408, + "learning_rate": 1.8579722743673773e-06, + "loss": 0.3184, + "step": 2652 + }, + { + "epoch": 2.2344188658057273, + "grad_norm": 0.17951267957687378, + "learning_rate": 1.8541601140028542e-06, + "loss": 0.3534, + "step": 2653 + }, + { + "epoch": 2.2352610892756877, + "grad_norm": 0.1638944149017334, + "learning_rate": 1.8503509781224627e-06, + "loss": 0.308, + "step": 2654 + }, + { + "epoch": 2.2361033127456484, + "grad_norm": 0.1907835453748703, + "learning_rate": 1.8465448703883959e-06, + "loss": 0.3423, + "step": 2655 + }, + { + "epoch": 2.236945536215609, + "grad_norm": 0.21052592992782593, + "learning_rate": 1.8427417944599325e-06, + "loss": 0.3469, + "step": 2656 + }, + { + "epoch": 2.23778775968557, + "grad_norm": 0.16829697787761688, + "learning_rate": 1.8389417539934428e-06, + "loss": 0.2978, + "step": 2657 + }, + { + "epoch": 2.2386299831555307, + "grad_norm": 0.19448906183242798, + "learning_rate": 1.8351447526423728e-06, + "loss": 0.3445, + "step": 2658 + }, + { + "epoch": 2.239472206625491, + "grad_norm": 0.1964830756187439, + "learning_rate": 1.8313507940572477e-06, + "loss": 0.3412, + "step": 2659 + }, + { + "epoch": 2.240314430095452, + "grad_norm": 0.18357017636299133, + "learning_rate": 1.8275598818856682e-06, + "loss": 0.3451, + "step": 2660 + }, + { + "epoch": 2.2411566535654126, + "grad_norm": 0.1971561759710312, + "learning_rate": 1.8237720197723075e-06, + "loss": 0.3274, + "step": 2661 + }, + { + "epoch": 2.2419988770353734, + "grad_norm": 0.19698187708854675, + "learning_rate": 1.819987211358903e-06, + "loss": 0.3322, + "step": 2662 + }, + { + "epoch": 2.242841100505334, + "grad_norm": 0.18066270649433136, + "learning_rate": 1.8162054602842621e-06, + "loss": 0.3321, + "step": 2663 + }, + { + "epoch": 2.243683323975295, + "grad_norm": 0.19735048711299896, + "learning_rate": 1.812426770184243e-06, + "loss": 0.2899, + "step": 2664 + }, + { + "epoch": 2.2445255474452557, + "grad_norm": 0.16894842684268951, + "learning_rate": 1.8086511446917715e-06, + "loss": 0.3217, + "step": 2665 + }, + { + "epoch": 2.245367770915216, + "grad_norm": 0.18202324211597443, + "learning_rate": 1.8048785874368191e-06, + "loss": 0.3128, + "step": 2666 + }, + { + "epoch": 2.2462099943851768, + "grad_norm": 0.20821116864681244, + "learning_rate": 1.8011091020464138e-06, + "loss": 0.3589, + "step": 2667 + }, + { + "epoch": 2.2470522178551375, + "grad_norm": 0.17096319794654846, + "learning_rate": 1.7973426921446258e-06, + "loss": 0.3224, + "step": 2668 + }, + { + "epoch": 2.2478944413250983, + "grad_norm": 0.1881159394979477, + "learning_rate": 1.7935793613525693e-06, + "loss": 0.336, + "step": 2669 + }, + { + "epoch": 2.248736664795059, + "grad_norm": 0.17406538128852844, + "learning_rate": 1.789819113288397e-06, + "loss": 0.3212, + "step": 2670 + }, + { + "epoch": 2.24957888826502, + "grad_norm": 0.1771821230649948, + "learning_rate": 1.7860619515673034e-06, + "loss": 0.3411, + "step": 2671 + }, + { + "epoch": 2.25042111173498, + "grad_norm": 0.1870764195919037, + "learning_rate": 1.7823078798015098e-06, + "loss": 0.3485, + "step": 2672 + }, + { + "epoch": 2.251263335204941, + "grad_norm": 0.19147291779518127, + "learning_rate": 1.7785569016002686e-06, + "loss": 0.2787, + "step": 2673 + }, + { + "epoch": 2.2521055586749017, + "grad_norm": 0.19009776413440704, + "learning_rate": 1.7748090205698565e-06, + "loss": 0.3568, + "step": 2674 + }, + { + "epoch": 2.2529477821448625, + "grad_norm": 0.18951989710330963, + "learning_rate": 1.7710642403135768e-06, + "loss": 0.3371, + "step": 2675 + }, + { + "epoch": 2.2537900056148232, + "grad_norm": 0.18312712013721466, + "learning_rate": 1.7673225644317487e-06, + "loss": 0.3404, + "step": 2676 + }, + { + "epoch": 2.254632229084784, + "grad_norm": 0.20596158504486084, + "learning_rate": 1.7635839965217055e-06, + "loss": 0.3215, + "step": 2677 + }, + { + "epoch": 2.2554744525547443, + "grad_norm": 0.20948167145252228, + "learning_rate": 1.7598485401777932e-06, + "loss": 0.3267, + "step": 2678 + }, + { + "epoch": 2.256316676024705, + "grad_norm": 0.20294427871704102, + "learning_rate": 1.75611619899137e-06, + "loss": 0.3713, + "step": 2679 + }, + { + "epoch": 2.257158899494666, + "grad_norm": 0.1707998812198639, + "learning_rate": 1.7523869765507928e-06, + "loss": 0.3182, + "step": 2680 + }, + { + "epoch": 2.2580011229646266, + "grad_norm": 0.17670342326164246, + "learning_rate": 1.748660876441428e-06, + "loss": 0.3587, + "step": 2681 + }, + { + "epoch": 2.2588433464345874, + "grad_norm": 0.18761256337165833, + "learning_rate": 1.7449379022456297e-06, + "loss": 0.3356, + "step": 2682 + }, + { + "epoch": 2.259685569904548, + "grad_norm": 0.19832022488117218, + "learning_rate": 1.7412180575427572e-06, + "loss": 0.3478, + "step": 2683 + }, + { + "epoch": 2.260527793374509, + "grad_norm": 0.19921515882015228, + "learning_rate": 1.7375013459091529e-06, + "loss": 0.3493, + "step": 2684 + }, + { + "epoch": 2.2613700168444693, + "grad_norm": 0.18138949573040009, + "learning_rate": 1.7337877709181527e-06, + "loss": 0.3108, + "step": 2685 + }, + { + "epoch": 2.26221224031443, + "grad_norm": 0.19637282192707062, + "learning_rate": 1.7300773361400746e-06, + "loss": 0.3033, + "step": 2686 + }, + { + "epoch": 2.263054463784391, + "grad_norm": 0.20320606231689453, + "learning_rate": 1.7263700451422166e-06, + "loss": 0.3553, + "step": 2687 + }, + { + "epoch": 2.2638966872543516, + "grad_norm": 0.3034162223339081, + "learning_rate": 1.7226659014888548e-06, + "loss": 0.3312, + "step": 2688 + }, + { + "epoch": 2.2647389107243123, + "grad_norm": 0.18244308233261108, + "learning_rate": 1.7189649087412385e-06, + "loss": 0.3552, + "step": 2689 + }, + { + "epoch": 2.2655811341942727, + "grad_norm": 0.1824118047952652, + "learning_rate": 1.7152670704575919e-06, + "loss": 0.3138, + "step": 2690 + }, + { + "epoch": 2.2664233576642334, + "grad_norm": 0.1936587542295456, + "learning_rate": 1.711572390193102e-06, + "loss": 0.3388, + "step": 2691 + }, + { + "epoch": 2.267265581134194, + "grad_norm": 0.2071102261543274, + "learning_rate": 1.7078808714999207e-06, + "loss": 0.3675, + "step": 2692 + }, + { + "epoch": 2.268107804604155, + "grad_norm": 0.19295620918273926, + "learning_rate": 1.7041925179271584e-06, + "loss": 0.3218, + "step": 2693 + }, + { + "epoch": 2.2689500280741157, + "grad_norm": 0.18506266176700592, + "learning_rate": 1.7005073330208881e-06, + "loss": 0.3429, + "step": 2694 + }, + { + "epoch": 2.2697922515440765, + "grad_norm": 0.17563672363758087, + "learning_rate": 1.696825320324132e-06, + "loss": 0.339, + "step": 2695 + }, + { + "epoch": 2.2706344750140373, + "grad_norm": 0.17654356360435486, + "learning_rate": 1.6931464833768624e-06, + "loss": 0.2947, + "step": 2696 + }, + { + "epoch": 2.2714766984839976, + "grad_norm": 0.19551007449626923, + "learning_rate": 1.689470825715998e-06, + "loss": 0.3435, + "step": 2697 + }, + { + "epoch": 2.2723189219539583, + "grad_norm": 0.1912021040916443, + "learning_rate": 1.6857983508754056e-06, + "loss": 0.3226, + "step": 2698 + }, + { + "epoch": 2.273161145423919, + "grad_norm": 0.17643921077251434, + "learning_rate": 1.6821290623858865e-06, + "loss": 0.3279, + "step": 2699 + }, + { + "epoch": 2.27400336889388, + "grad_norm": 0.20375093817710876, + "learning_rate": 1.6784629637751814e-06, + "loss": 0.3578, + "step": 2700 + }, + { + "epoch": 2.2748455923638407, + "grad_norm": 0.19104832410812378, + "learning_rate": 1.6748000585679602e-06, + "loss": 0.328, + "step": 2701 + }, + { + "epoch": 2.2756878158338014, + "grad_norm": 0.19647930562496185, + "learning_rate": 1.6711403502858302e-06, + "loss": 0.3139, + "step": 2702 + }, + { + "epoch": 2.2765300393037617, + "grad_norm": 0.1835135668516159, + "learning_rate": 1.6674838424473172e-06, + "loss": 0.3185, + "step": 2703 + }, + { + "epoch": 2.2773722627737225, + "grad_norm": 0.1910325586795807, + "learning_rate": 1.6638305385678783e-06, + "loss": 0.3367, + "step": 2704 + }, + { + "epoch": 2.2782144862436833, + "grad_norm": 0.17093531787395477, + "learning_rate": 1.6601804421598787e-06, + "loss": 0.2972, + "step": 2705 + }, + { + "epoch": 2.279056709713644, + "grad_norm": 0.19414487481117249, + "learning_rate": 1.6565335567326112e-06, + "loss": 0.372, + "step": 2706 + }, + { + "epoch": 2.279898933183605, + "grad_norm": 0.1806740164756775, + "learning_rate": 1.6528898857922747e-06, + "loss": 0.2987, + "step": 2707 + }, + { + "epoch": 2.2807411566535656, + "grad_norm": 0.19277624785900116, + "learning_rate": 1.6492494328419816e-06, + "loss": 0.355, + "step": 2708 + }, + { + "epoch": 2.281583380123526, + "grad_norm": 0.1879538744688034, + "learning_rate": 1.6456122013817477e-06, + "loss": 0.3711, + "step": 2709 + }, + { + "epoch": 2.2824256035934867, + "grad_norm": 0.26613113284111023, + "learning_rate": 1.6419781949084928e-06, + "loss": 0.3437, + "step": 2710 + }, + { + "epoch": 2.2832678270634474, + "grad_norm": 0.18551747500896454, + "learning_rate": 1.6383474169160334e-06, + "loss": 0.3551, + "step": 2711 + }, + { + "epoch": 2.284110050533408, + "grad_norm": 0.18406280875205994, + "learning_rate": 1.6347198708950884e-06, + "loss": 0.327, + "step": 2712 + }, + { + "epoch": 2.284952274003369, + "grad_norm": 0.1676313877105713, + "learning_rate": 1.631095560333264e-06, + "loss": 0.3267, + "step": 2713 + }, + { + "epoch": 2.2857944974733297, + "grad_norm": 0.18933825194835663, + "learning_rate": 1.6274744887150562e-06, + "loss": 0.3433, + "step": 2714 + }, + { + "epoch": 2.2866367209432905, + "grad_norm": 0.19097904860973358, + "learning_rate": 1.6238566595218475e-06, + "loss": 0.3314, + "step": 2715 + }, + { + "epoch": 2.287478944413251, + "grad_norm": 0.1931888610124588, + "learning_rate": 1.6202420762319065e-06, + "loss": 0.3265, + "step": 2716 + }, + { + "epoch": 2.2883211678832116, + "grad_norm": 0.18155337870121002, + "learning_rate": 1.6166307423203765e-06, + "loss": 0.326, + "step": 2717 + }, + { + "epoch": 2.2891633913531724, + "grad_norm": 0.196844682097435, + "learning_rate": 1.6130226612592787e-06, + "loss": 0.3373, + "step": 2718 + }, + { + "epoch": 2.290005614823133, + "grad_norm": 0.18781322240829468, + "learning_rate": 1.6094178365175044e-06, + "loss": 0.3457, + "step": 2719 + }, + { + "epoch": 2.290847838293094, + "grad_norm": 0.17167329788208008, + "learning_rate": 1.6058162715608205e-06, + "loss": 0.3206, + "step": 2720 + }, + { + "epoch": 2.2916900617630542, + "grad_norm": 0.16969625651836395, + "learning_rate": 1.6022179698518525e-06, + "loss": 0.3458, + "step": 2721 + }, + { + "epoch": 2.292532285233015, + "grad_norm": 0.1915416568517685, + "learning_rate": 1.598622934850097e-06, + "loss": 0.3676, + "step": 2722 + }, + { + "epoch": 2.2933745087029758, + "grad_norm": 0.1874096542596817, + "learning_rate": 1.595031170011898e-06, + "loss": 0.3355, + "step": 2723 + }, + { + "epoch": 2.2942167321729365, + "grad_norm": 0.1758255809545517, + "learning_rate": 1.591442678790467e-06, + "loss": 0.3155, + "step": 2724 + }, + { + "epoch": 2.2950589556428973, + "grad_norm": 0.18766756355762482, + "learning_rate": 1.5878574646358608e-06, + "loss": 0.3806, + "step": 2725 + }, + { + "epoch": 2.295901179112858, + "grad_norm": 0.46558520197868347, + "learning_rate": 1.584275530994991e-06, + "loss": 0.317, + "step": 2726 + }, + { + "epoch": 2.296743402582819, + "grad_norm": 0.17401887476444244, + "learning_rate": 1.580696881311611e-06, + "loss": 0.3636, + "step": 2727 + }, + { + "epoch": 2.297585626052779, + "grad_norm": 0.1759478896856308, + "learning_rate": 1.5771215190263183e-06, + "loss": 0.301, + "step": 2728 + }, + { + "epoch": 2.29842784952274, + "grad_norm": 0.1894080638885498, + "learning_rate": 1.573549447576549e-06, + "loss": 0.3249, + "step": 2729 + }, + { + "epoch": 2.2992700729927007, + "grad_norm": 0.1710214465856552, + "learning_rate": 1.5699806703965787e-06, + "loss": 0.3388, + "step": 2730 + }, + { + "epoch": 2.3001122964626615, + "grad_norm": 0.18094199895858765, + "learning_rate": 1.5664151909175124e-06, + "loss": 0.3442, + "step": 2731 + }, + { + "epoch": 2.3009545199326222, + "grad_norm": 0.19348318874835968, + "learning_rate": 1.5628530125672848e-06, + "loss": 0.3502, + "step": 2732 + }, + { + "epoch": 2.301796743402583, + "grad_norm": 0.1840953230857849, + "learning_rate": 1.5592941387706562e-06, + "loss": 0.3152, + "step": 2733 + }, + { + "epoch": 2.3026389668725433, + "grad_norm": 0.17521828413009644, + "learning_rate": 1.555738572949214e-06, + "loss": 0.3359, + "step": 2734 + }, + { + "epoch": 2.303481190342504, + "grad_norm": 0.2008764147758484, + "learning_rate": 1.5521863185213626e-06, + "loss": 0.4031, + "step": 2735 + }, + { + "epoch": 2.304323413812465, + "grad_norm": 0.16677844524383545, + "learning_rate": 1.5486373789023206e-06, + "loss": 0.2943, + "step": 2736 + }, + { + "epoch": 2.3051656372824256, + "grad_norm": 0.19051140546798706, + "learning_rate": 1.5450917575041209e-06, + "loss": 0.3354, + "step": 2737 + }, + { + "epoch": 2.3060078607523864, + "grad_norm": 0.16995146870613098, + "learning_rate": 1.54154945773561e-06, + "loss": 0.3129, + "step": 2738 + }, + { + "epoch": 2.306850084222347, + "grad_norm": 0.17699390649795532, + "learning_rate": 1.538010483002435e-06, + "loss": 0.3423, + "step": 2739 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.16745799779891968, + "learning_rate": 1.5344748367070534e-06, + "loss": 0.3244, + "step": 2740 + }, + { + "epoch": 2.3085345311622683, + "grad_norm": 0.193995401263237, + "learning_rate": 1.5309425222487119e-06, + "loss": 0.3781, + "step": 2741 + }, + { + "epoch": 2.309376754632229, + "grad_norm": 0.19723963737487793, + "learning_rate": 1.5274135430234654e-06, + "loss": 0.3255, + "step": 2742 + }, + { + "epoch": 2.31021897810219, + "grad_norm": 0.19136905670166016, + "learning_rate": 1.5238879024241544e-06, + "loss": 0.3166, + "step": 2743 + }, + { + "epoch": 2.3110612015721506, + "grad_norm": 0.1685858815908432, + "learning_rate": 1.5203656038404146e-06, + "loss": 0.3125, + "step": 2744 + }, + { + "epoch": 2.3119034250421113, + "grad_norm": 0.17219972610473633, + "learning_rate": 1.5168466506586654e-06, + "loss": 0.3216, + "step": 2745 + }, + { + "epoch": 2.312745648512072, + "grad_norm": 0.19580884277820587, + "learning_rate": 1.5133310462621103e-06, + "loss": 0.3772, + "step": 2746 + }, + { + "epoch": 2.3135878719820324, + "grad_norm": 0.1875855177640915, + "learning_rate": 1.509818794030733e-06, + "loss": 0.3521, + "step": 2747 + }, + { + "epoch": 2.314430095451993, + "grad_norm": 0.17141541838645935, + "learning_rate": 1.506309897341297e-06, + "loss": 0.2875, + "step": 2748 + }, + { + "epoch": 2.315272318921954, + "grad_norm": 0.19664089381694794, + "learning_rate": 1.502804359567337e-06, + "loss": 0.3451, + "step": 2749 + }, + { + "epoch": 2.3161145423919147, + "grad_norm": 0.18293555080890656, + "learning_rate": 1.499302184079159e-06, + "loss": 0.3167, + "step": 2750 + }, + { + "epoch": 2.3169567658618755, + "grad_norm": 0.18555696308612823, + "learning_rate": 1.4958033742438348e-06, + "loss": 0.3243, + "step": 2751 + }, + { + "epoch": 2.317798989331836, + "grad_norm": 0.16527746617794037, + "learning_rate": 1.492307933425205e-06, + "loss": 0.299, + "step": 2752 + }, + { + "epoch": 2.3186412128017966, + "grad_norm": 0.18836185336112976, + "learning_rate": 1.4888158649838675e-06, + "loss": 0.3353, + "step": 2753 + }, + { + "epoch": 2.3194834362717573, + "grad_norm": 0.18642380833625793, + "learning_rate": 1.4853271722771772e-06, + "loss": 0.3335, + "step": 2754 + }, + { + "epoch": 2.320325659741718, + "grad_norm": 0.1658937633037567, + "learning_rate": 1.4818418586592448e-06, + "loss": 0.3213, + "step": 2755 + }, + { + "epoch": 2.321167883211679, + "grad_norm": 0.1841190606355667, + "learning_rate": 1.478359927480935e-06, + "loss": 0.3417, + "step": 2756 + }, + { + "epoch": 2.3220101066816397, + "grad_norm": 0.17181068658828735, + "learning_rate": 1.4748813820898554e-06, + "loss": 0.3131, + "step": 2757 + }, + { + "epoch": 2.3228523301516004, + "grad_norm": 0.17926974594593048, + "learning_rate": 1.4714062258303653e-06, + "loss": 0.3535, + "step": 2758 + }, + { + "epoch": 2.3236945536215607, + "grad_norm": 0.18430732190608978, + "learning_rate": 1.4679344620435543e-06, + "loss": 0.3513, + "step": 2759 + }, + { + "epoch": 2.3245367770915215, + "grad_norm": 0.18636822700500488, + "learning_rate": 1.4644660940672628e-06, + "loss": 0.3289, + "step": 2760 + }, + { + "epoch": 2.3253790005614823, + "grad_norm": 0.22061777114868164, + "learning_rate": 1.4610011252360594e-06, + "loss": 0.3499, + "step": 2761 + }, + { + "epoch": 2.326221224031443, + "grad_norm": 0.16940012574195862, + "learning_rate": 1.4575395588812452e-06, + "loss": 0.3058, + "step": 2762 + }, + { + "epoch": 2.327063447501404, + "grad_norm": 0.17286397516727448, + "learning_rate": 1.454081398330855e-06, + "loss": 0.3498, + "step": 2763 + }, + { + "epoch": 2.3279056709713646, + "grad_norm": 0.17675434052944183, + "learning_rate": 1.450626646909639e-06, + "loss": 0.3084, + "step": 2764 + }, + { + "epoch": 2.328747894441325, + "grad_norm": 0.18519286811351776, + "learning_rate": 1.4471753079390815e-06, + "loss": 0.3194, + "step": 2765 + }, + { + "epoch": 2.3295901179112857, + "grad_norm": 0.18685199320316315, + "learning_rate": 1.4437273847373778e-06, + "loss": 0.3492, + "step": 2766 + }, + { + "epoch": 2.3304323413812464, + "grad_norm": 0.1702100932598114, + "learning_rate": 1.4402828806194436e-06, + "loss": 0.3163, + "step": 2767 + }, + { + "epoch": 2.331274564851207, + "grad_norm": 0.15651482343673706, + "learning_rate": 1.4368417988969058e-06, + "loss": 0.2848, + "step": 2768 + }, + { + "epoch": 2.332116788321168, + "grad_norm": 0.20350609719753265, + "learning_rate": 1.4334041428781003e-06, + "loss": 0.3811, + "step": 2769 + }, + { + "epoch": 2.3329590117911287, + "grad_norm": 0.18207013607025146, + "learning_rate": 1.429969915868068e-06, + "loss": 0.3147, + "step": 2770 + }, + { + "epoch": 2.333801235261089, + "grad_norm": 0.17690841853618622, + "learning_rate": 1.4265391211685597e-06, + "loss": 0.3298, + "step": 2771 + }, + { + "epoch": 2.33464345873105, + "grad_norm": 0.18768583238124847, + "learning_rate": 1.4231117620780188e-06, + "loss": 0.318, + "step": 2772 + }, + { + "epoch": 2.3354856822010106, + "grad_norm": 0.18087123334407806, + "learning_rate": 1.4196878418915894e-06, + "loss": 0.3227, + "step": 2773 + }, + { + "epoch": 2.3363279056709714, + "grad_norm": 0.19178766012191772, + "learning_rate": 1.4162673639011065e-06, + "loss": 0.3507, + "step": 2774 + }, + { + "epoch": 2.337170129140932, + "grad_norm": 0.2115650326013565, + "learning_rate": 1.4128503313951008e-06, + "loss": 0.3659, + "step": 2775 + }, + { + "epoch": 2.338012352610893, + "grad_norm": 0.1869487762451172, + "learning_rate": 1.4094367476587867e-06, + "loss": 0.3332, + "step": 2776 + }, + { + "epoch": 2.3388545760808537, + "grad_norm": 0.19197297096252441, + "learning_rate": 1.4060266159740627e-06, + "loss": 0.3117, + "step": 2777 + }, + { + "epoch": 2.339696799550814, + "grad_norm": 0.16871175169944763, + "learning_rate": 1.4026199396195078e-06, + "loss": 0.3073, + "step": 2778 + }, + { + "epoch": 2.3405390230207748, + "grad_norm": 0.1799178570508957, + "learning_rate": 1.399216721870384e-06, + "loss": 0.364, + "step": 2779 + }, + { + "epoch": 2.3413812464907355, + "grad_norm": 0.17059600353240967, + "learning_rate": 1.3958169659986204e-06, + "loss": 0.3244, + "step": 2780 + }, + { + "epoch": 2.3422234699606963, + "grad_norm": 0.184001624584198, + "learning_rate": 1.3924206752728282e-06, + "loss": 0.3267, + "step": 2781 + }, + { + "epoch": 2.343065693430657, + "grad_norm": 0.1990918219089508, + "learning_rate": 1.389027852958273e-06, + "loss": 0.3968, + "step": 2782 + }, + { + "epoch": 2.3439079169006174, + "grad_norm": 0.17149695754051208, + "learning_rate": 1.385638502316899e-06, + "loss": 0.3008, + "step": 2783 + }, + { + "epoch": 2.344750140370578, + "grad_norm": 0.1957639753818512, + "learning_rate": 1.3822526266073044e-06, + "loss": 0.3502, + "step": 2784 + }, + { + "epoch": 2.345592363840539, + "grad_norm": 0.1924923062324524, + "learning_rate": 1.3788702290847517e-06, + "loss": 0.2741, + "step": 2785 + }, + { + "epoch": 2.3464345873104997, + "grad_norm": 0.20024049282073975, + "learning_rate": 1.3754913130011566e-06, + "loss": 0.3951, + "step": 2786 + }, + { + "epoch": 2.3472768107804605, + "grad_norm": 0.1801765114068985, + "learning_rate": 1.3721158816050872e-06, + "loss": 0.2943, + "step": 2787 + }, + { + "epoch": 2.3481190342504212, + "grad_norm": 0.20196901261806488, + "learning_rate": 1.3687439381417616e-06, + "loss": 0.3332, + "step": 2788 + }, + { + "epoch": 2.348961257720382, + "grad_norm": 0.18237796425819397, + "learning_rate": 1.3653754858530477e-06, + "loss": 0.3503, + "step": 2789 + }, + { + "epoch": 2.3498034811903423, + "grad_norm": 0.18183031678199768, + "learning_rate": 1.3620105279774532e-06, + "loss": 0.327, + "step": 2790 + }, + { + "epoch": 2.350645704660303, + "grad_norm": 0.1969723403453827, + "learning_rate": 1.3586490677501269e-06, + "loss": 0.3508, + "step": 2791 + }, + { + "epoch": 2.351487928130264, + "grad_norm": 0.18273185193538666, + "learning_rate": 1.3552911084028536e-06, + "loss": 0.3099, + "step": 2792 + }, + { + "epoch": 2.3523301516002246, + "grad_norm": 0.18478451669216156, + "learning_rate": 1.3519366531640589e-06, + "loss": 0.3326, + "step": 2793 + }, + { + "epoch": 2.3531723750701854, + "grad_norm": 0.19646549224853516, + "learning_rate": 1.3485857052587908e-06, + "loss": 0.3583, + "step": 2794 + }, + { + "epoch": 2.354014598540146, + "grad_norm": 0.17265410721302032, + "learning_rate": 1.3452382679087307e-06, + "loss": 0.3372, + "step": 2795 + }, + { + "epoch": 2.3548568220101065, + "grad_norm": 0.19773848354816437, + "learning_rate": 1.3418943443321807e-06, + "loss": 0.3436, + "step": 2796 + }, + { + "epoch": 2.3556990454800673, + "grad_norm": 0.17929218709468842, + "learning_rate": 1.3385539377440709e-06, + "loss": 0.2815, + "step": 2797 + }, + { + "epoch": 2.356541268950028, + "grad_norm": 0.19509731233119965, + "learning_rate": 1.3352170513559432e-06, + "loss": 0.3589, + "step": 2798 + }, + { + "epoch": 2.357383492419989, + "grad_norm": 0.189386248588562, + "learning_rate": 1.3318836883759634e-06, + "loss": 0.3374, + "step": 2799 + }, + { + "epoch": 2.3582257158899496, + "grad_norm": 0.18874704837799072, + "learning_rate": 1.3285538520088976e-06, + "loss": 0.3145, + "step": 2800 + }, + { + "epoch": 2.3590679393599103, + "grad_norm": 0.19876275956630707, + "learning_rate": 1.3252275454561337e-06, + "loss": 0.3612, + "step": 2801 + }, + { + "epoch": 2.3599101628298707, + "grad_norm": 0.17174991965293884, + "learning_rate": 1.3219047719156575e-06, + "loss": 0.3259, + "step": 2802 + }, + { + "epoch": 2.3607523862998314, + "grad_norm": 0.16203835606575012, + "learning_rate": 1.318585534582064e-06, + "loss": 0.309, + "step": 2803 + }, + { + "epoch": 2.361594609769792, + "grad_norm": 0.1940368413925171, + "learning_rate": 1.3152698366465449e-06, + "loss": 0.3591, + "step": 2804 + }, + { + "epoch": 2.362436833239753, + "grad_norm": 0.1763688176870346, + "learning_rate": 1.3119576812968893e-06, + "loss": 0.3434, + "step": 2805 + }, + { + "epoch": 2.3632790567097137, + "grad_norm": 0.1753963679075241, + "learning_rate": 1.30864907171748e-06, + "loss": 0.3003, + "step": 2806 + }, + { + "epoch": 2.3641212801796745, + "grad_norm": 0.1836460530757904, + "learning_rate": 1.305344011089294e-06, + "loss": 0.3296, + "step": 2807 + }, + { + "epoch": 2.3649635036496353, + "grad_norm": 0.18466545641422272, + "learning_rate": 1.3020425025898926e-06, + "loss": 0.3264, + "step": 2808 + }, + { + "epoch": 2.3658057271195956, + "grad_norm": 0.18666084110736847, + "learning_rate": 1.2987445493934236e-06, + "loss": 0.3428, + "step": 2809 + }, + { + "epoch": 2.3666479505895563, + "grad_norm": 0.17960001528263092, + "learning_rate": 1.295450154670615e-06, + "loss": 0.3471, + "step": 2810 + }, + { + "epoch": 2.367490174059517, + "grad_norm": 0.18435019254684448, + "learning_rate": 1.292159321588778e-06, + "loss": 0.3479, + "step": 2811 + }, + { + "epoch": 2.368332397529478, + "grad_norm": 0.16246698796749115, + "learning_rate": 1.288872053311795e-06, + "loss": 0.3057, + "step": 2812 + }, + { + "epoch": 2.3691746209994387, + "grad_norm": 0.1754060536623001, + "learning_rate": 1.2855883530001228e-06, + "loss": 0.3296, + "step": 2813 + }, + { + "epoch": 2.370016844469399, + "grad_norm": 0.3555665612220764, + "learning_rate": 1.282308223810786e-06, + "loss": 0.3046, + "step": 2814 + }, + { + "epoch": 2.3708590679393597, + "grad_norm": 0.17933470010757446, + "learning_rate": 1.2790316688973809e-06, + "loss": 0.3344, + "step": 2815 + }, + { + "epoch": 2.3717012914093205, + "grad_norm": 0.18554142117500305, + "learning_rate": 1.2757586914100612e-06, + "loss": 0.3695, + "step": 2816 + }, + { + "epoch": 2.3725435148792813, + "grad_norm": 0.17822647094726562, + "learning_rate": 1.272489294495548e-06, + "loss": 0.3418, + "step": 2817 + }, + { + "epoch": 2.373385738349242, + "grad_norm": 0.18284878134727478, + "learning_rate": 1.2692234812971106e-06, + "loss": 0.3372, + "step": 2818 + }, + { + "epoch": 2.374227961819203, + "grad_norm": 0.18310308456420898, + "learning_rate": 1.265961254954583e-06, + "loss": 0.3264, + "step": 2819 + }, + { + "epoch": 2.3750701852891636, + "grad_norm": 0.17785710096359253, + "learning_rate": 1.2627026186043423e-06, + "loss": 0.3195, + "step": 2820 + }, + { + "epoch": 2.375912408759124, + "grad_norm": 0.17308805882930756, + "learning_rate": 1.2594475753793211e-06, + "loss": 0.3216, + "step": 2821 + }, + { + "epoch": 2.3767546322290847, + "grad_norm": 0.19008275866508484, + "learning_rate": 1.256196128408993e-06, + "loss": 0.3308, + "step": 2822 + }, + { + "epoch": 2.3775968556990454, + "grad_norm": 0.18448509275913239, + "learning_rate": 1.252948280819375e-06, + "loss": 0.3341, + "step": 2823 + }, + { + "epoch": 2.378439079169006, + "grad_norm": 0.201051726937294, + "learning_rate": 1.249704035733022e-06, + "loss": 0.3335, + "step": 2824 + }, + { + "epoch": 2.379281302638967, + "grad_norm": 0.19201770424842834, + "learning_rate": 1.2464633962690304e-06, + "loss": 0.3422, + "step": 2825 + }, + { + "epoch": 2.3801235261089277, + "grad_norm": 0.1856694221496582, + "learning_rate": 1.243226365543026e-06, + "loss": 0.3521, + "step": 2826 + }, + { + "epoch": 2.3809657495788885, + "grad_norm": 0.17718330025672913, + "learning_rate": 1.239992946667165e-06, + "loss": 0.3353, + "step": 2827 + }, + { + "epoch": 2.381807973048849, + "grad_norm": 0.18037939071655273, + "learning_rate": 1.2367631427501308e-06, + "loss": 0.3386, + "step": 2828 + }, + { + "epoch": 2.3826501965188096, + "grad_norm": 0.18260610103607178, + "learning_rate": 1.2335369568971362e-06, + "loss": 0.3399, + "step": 2829 + }, + { + "epoch": 2.3834924199887704, + "grad_norm": 0.18438413739204407, + "learning_rate": 1.2303143922099092e-06, + "loss": 0.3626, + "step": 2830 + }, + { + "epoch": 2.384334643458731, + "grad_norm": 0.18501873314380646, + "learning_rate": 1.2270954517867e-06, + "loss": 0.3314, + "step": 2831 + }, + { + "epoch": 2.385176866928692, + "grad_norm": 0.1844661831855774, + "learning_rate": 1.2238801387222716e-06, + "loss": 0.3208, + "step": 2832 + }, + { + "epoch": 2.3860190903986522, + "grad_norm": 0.17237085103988647, + "learning_rate": 1.2206684561079035e-06, + "loss": 0.299, + "step": 2833 + }, + { + "epoch": 2.386861313868613, + "grad_norm": 0.1830216497182846, + "learning_rate": 1.2174604070313811e-06, + "loss": 0.3126, + "step": 2834 + }, + { + "epoch": 2.3877035373385738, + "grad_norm": 0.17193448543548584, + "learning_rate": 1.2142559945769995e-06, + "loss": 0.3232, + "step": 2835 + }, + { + "epoch": 2.3885457608085345, + "grad_norm": 0.18704761564731598, + "learning_rate": 1.211055221825554e-06, + "loss": 0.3653, + "step": 2836 + }, + { + "epoch": 2.3893879842784953, + "grad_norm": 0.18288898468017578, + "learning_rate": 1.207858091854342e-06, + "loss": 0.3709, + "step": 2837 + }, + { + "epoch": 2.390230207748456, + "grad_norm": 0.17680326104164124, + "learning_rate": 1.2046646077371615e-06, + "loss": 0.2964, + "step": 2838 + }, + { + "epoch": 2.391072431218417, + "grad_norm": 0.18553590774536133, + "learning_rate": 1.2014747725443004e-06, + "loss": 0.3341, + "step": 2839 + }, + { + "epoch": 2.391914654688377, + "grad_norm": 0.18301990628242493, + "learning_rate": 1.1982885893425455e-06, + "loss": 0.3643, + "step": 2840 + }, + { + "epoch": 2.392756878158338, + "grad_norm": 0.1753537952899933, + "learning_rate": 1.1951060611951615e-06, + "loss": 0.3195, + "step": 2841 + }, + { + "epoch": 2.3935991016282987, + "grad_norm": 0.184483602643013, + "learning_rate": 1.1919271911619106e-06, + "loss": 0.3384, + "step": 2842 + }, + { + "epoch": 2.3944413250982595, + "grad_norm": 0.19095274806022644, + "learning_rate": 1.1887519822990296e-06, + "loss": 0.3437, + "step": 2843 + }, + { + "epoch": 2.3952835485682202, + "grad_norm": 0.17648278176784515, + "learning_rate": 1.185580437659241e-06, + "loss": 0.3209, + "step": 2844 + }, + { + "epoch": 2.3961257720381806, + "grad_norm": 0.18717674911022186, + "learning_rate": 1.1824125602917414e-06, + "loss": 0.3459, + "step": 2845 + }, + { + "epoch": 2.3969679955081413, + "grad_norm": 0.18629314005374908, + "learning_rate": 1.1792483532422021e-06, + "loss": 0.3246, + "step": 2846 + }, + { + "epoch": 2.397810218978102, + "grad_norm": 0.19330480694770813, + "learning_rate": 1.1760878195527642e-06, + "loss": 0.3543, + "step": 2847 + }, + { + "epoch": 2.398652442448063, + "grad_norm": 0.17509174346923828, + "learning_rate": 1.1729309622620422e-06, + "loss": 0.3044, + "step": 2848 + }, + { + "epoch": 2.3994946659180236, + "grad_norm": 0.18171800673007965, + "learning_rate": 1.1697777844051105e-06, + "loss": 0.3116, + "step": 2849 + }, + { + "epoch": 2.4003368893879844, + "grad_norm": 0.19177602231502533, + "learning_rate": 1.1666282890135083e-06, + "loss": 0.3456, + "step": 2850 + }, + { + "epoch": 2.401179112857945, + "grad_norm": 0.18695443868637085, + "learning_rate": 1.1634824791152334e-06, + "loss": 0.3575, + "step": 2851 + }, + { + "epoch": 2.4020213363279055, + "grad_norm": 0.1723720133304596, + "learning_rate": 1.1603403577347434e-06, + "loss": 0.2962, + "step": 2852 + }, + { + "epoch": 2.4028635597978663, + "grad_norm": 0.18195010721683502, + "learning_rate": 1.1572019278929457e-06, + "loss": 0.3296, + "step": 2853 + }, + { + "epoch": 2.403705783267827, + "grad_norm": 0.1848120242357254, + "learning_rate": 1.1540671926072012e-06, + "loss": 0.3466, + "step": 2854 + }, + { + "epoch": 2.404548006737788, + "grad_norm": 0.17766617238521576, + "learning_rate": 1.1509361548913151e-06, + "loss": 0.3489, + "step": 2855 + }, + { + "epoch": 2.4053902302077486, + "grad_norm": 0.17193879187107086, + "learning_rate": 1.147808817755544e-06, + "loss": 0.3216, + "step": 2856 + }, + { + "epoch": 2.4062324536777093, + "grad_norm": 0.18416596949100494, + "learning_rate": 1.1446851842065804e-06, + "loss": 0.3361, + "step": 2857 + }, + { + "epoch": 2.40707467714767, + "grad_norm": 0.18768543004989624, + "learning_rate": 1.1415652572475628e-06, + "loss": 0.3183, + "step": 2858 + }, + { + "epoch": 2.4079169006176304, + "grad_norm": 0.1932467818260193, + "learning_rate": 1.1384490398780563e-06, + "loss": 0.3496, + "step": 2859 + }, + { + "epoch": 2.408759124087591, + "grad_norm": 0.18695873022079468, + "learning_rate": 1.1353365350940688e-06, + "loss": 0.3069, + "step": 2860 + }, + { + "epoch": 2.409601347557552, + "grad_norm": 0.17973104119300842, + "learning_rate": 1.1322277458880337e-06, + "loss": 0.3243, + "step": 2861 + }, + { + "epoch": 2.4104435710275127, + "grad_norm": 0.18068546056747437, + "learning_rate": 1.129122675248816e-06, + "loss": 0.342, + "step": 2862 + }, + { + "epoch": 2.4112857944974735, + "grad_norm": 0.16292357444763184, + "learning_rate": 1.1260213261617015e-06, + "loss": 0.3167, + "step": 2863 + }, + { + "epoch": 2.412128017967434, + "grad_norm": 0.18908433616161346, + "learning_rate": 1.1229237016084005e-06, + "loss": 0.3719, + "step": 2864 + }, + { + "epoch": 2.4129702414373946, + "grad_norm": 0.1725766956806183, + "learning_rate": 1.1198298045670402e-06, + "loss": 0.298, + "step": 2865 + }, + { + "epoch": 2.4138124649073553, + "grad_norm": 0.19303034245967865, + "learning_rate": 1.116739638012168e-06, + "loss": 0.3337, + "step": 2866 + }, + { + "epoch": 2.414654688377316, + "grad_norm": 0.17117536067962646, + "learning_rate": 1.113653204914742e-06, + "loss": 0.3324, + "step": 2867 + }, + { + "epoch": 2.415496911847277, + "grad_norm": 0.17073644697666168, + "learning_rate": 1.1105705082421303e-06, + "loss": 0.3238, + "step": 2868 + }, + { + "epoch": 2.4163391353172377, + "grad_norm": 0.1811438351869583, + "learning_rate": 1.1074915509581086e-06, + "loss": 0.3241, + "step": 2869 + }, + { + "epoch": 2.4171813587871984, + "grad_norm": 0.18533632159233093, + "learning_rate": 1.104416336022861e-06, + "loss": 0.3437, + "step": 2870 + }, + { + "epoch": 2.4180235822571587, + "grad_norm": 0.17663425207138062, + "learning_rate": 1.1013448663929704e-06, + "loss": 0.3603, + "step": 2871 + }, + { + "epoch": 2.4188658057271195, + "grad_norm": 0.1756037026643753, + "learning_rate": 1.0982771450214197e-06, + "loss": 0.3411, + "step": 2872 + }, + { + "epoch": 2.4197080291970803, + "grad_norm": 0.18769052624702454, + "learning_rate": 1.0952131748575855e-06, + "loss": 0.3106, + "step": 2873 + }, + { + "epoch": 2.420550252667041, + "grad_norm": 0.18603095412254333, + "learning_rate": 1.0921529588472446e-06, + "loss": 0.334, + "step": 2874 + }, + { + "epoch": 2.421392476137002, + "grad_norm": 0.19260810315608978, + "learning_rate": 1.0890964999325566e-06, + "loss": 0.3317, + "step": 2875 + }, + { + "epoch": 2.422234699606962, + "grad_norm": 0.17378182709217072, + "learning_rate": 1.0860438010520773e-06, + "loss": 0.3022, + "step": 2876 + }, + { + "epoch": 2.423076923076923, + "grad_norm": 0.18065233528614044, + "learning_rate": 1.0829948651407374e-06, + "loss": 0.3406, + "step": 2877 + }, + { + "epoch": 2.4239191465468837, + "grad_norm": 0.16788865625858307, + "learning_rate": 1.0799496951298595e-06, + "loss": 0.3158, + "step": 2878 + }, + { + "epoch": 2.4247613700168444, + "grad_norm": 0.16910141706466675, + "learning_rate": 1.0769082939471382e-06, + "loss": 0.3344, + "step": 2879 + }, + { + "epoch": 2.425603593486805, + "grad_norm": 0.1839236170053482, + "learning_rate": 1.0738706645166508e-06, + "loss": 0.3453, + "step": 2880 + }, + { + "epoch": 2.426445816956766, + "grad_norm": 0.16786138713359833, + "learning_rate": 1.0708368097588435e-06, + "loss": 0.3309, + "step": 2881 + }, + { + "epoch": 2.4272880404267267, + "grad_norm": 0.17052896320819855, + "learning_rate": 1.0678067325905362e-06, + "loss": 0.3456, + "step": 2882 + }, + { + "epoch": 2.428130263896687, + "grad_norm": 0.17799879610538483, + "learning_rate": 1.0647804359249143e-06, + "loss": 0.319, + "step": 2883 + }, + { + "epoch": 2.428972487366648, + "grad_norm": 0.1701173037290573, + "learning_rate": 1.0617579226715324e-06, + "loss": 0.3113, + "step": 2884 + }, + { + "epoch": 2.4298147108366086, + "grad_norm": 0.18232646584510803, + "learning_rate": 1.0587391957363053e-06, + "loss": 0.3778, + "step": 2885 + }, + { + "epoch": 2.4306569343065694, + "grad_norm": 0.15923668444156647, + "learning_rate": 1.0557242580215066e-06, + "loss": 0.3241, + "step": 2886 + }, + { + "epoch": 2.43149915777653, + "grad_norm": 0.17628398537635803, + "learning_rate": 1.0527131124257677e-06, + "loss": 0.3057, + "step": 2887 + }, + { + "epoch": 2.432341381246491, + "grad_norm": 0.18149569630622864, + "learning_rate": 1.0497057618440765e-06, + "loss": 0.3219, + "step": 2888 + }, + { + "epoch": 2.4331836047164517, + "grad_norm": 0.18324145674705505, + "learning_rate": 1.0467022091677692e-06, + "loss": 0.3294, + "step": 2889 + }, + { + "epoch": 2.434025828186412, + "grad_norm": 0.1819029152393341, + "learning_rate": 1.0437024572845317e-06, + "loss": 0.3265, + "step": 2890 + }, + { + "epoch": 2.4348680516563728, + "grad_norm": 0.18516305088996887, + "learning_rate": 1.040706509078394e-06, + "loss": 0.342, + "step": 2891 + }, + { + "epoch": 2.4357102751263335, + "grad_norm": 0.1963118612766266, + "learning_rate": 1.037714367429734e-06, + "loss": 0.3462, + "step": 2892 + }, + { + "epoch": 2.4365524985962943, + "grad_norm": 0.17552661895751953, + "learning_rate": 1.0347260352152644e-06, + "loss": 0.2975, + "step": 2893 + }, + { + "epoch": 2.437394722066255, + "grad_norm": 0.18327851593494415, + "learning_rate": 1.0317415153080406e-06, + "loss": 0.3481, + "step": 2894 + }, + { + "epoch": 2.4382369455362154, + "grad_norm": 0.17430095374584198, + "learning_rate": 1.0287608105774456e-06, + "loss": 0.3583, + "step": 2895 + }, + { + "epoch": 2.439079169006176, + "grad_norm": 0.1779869645833969, + "learning_rate": 1.025783923889202e-06, + "loss": 0.3114, + "step": 2896 + }, + { + "epoch": 2.439921392476137, + "grad_norm": 0.1845777928829193, + "learning_rate": 1.0228108581053565e-06, + "loss": 0.354, + "step": 2897 + }, + { + "epoch": 2.4407636159460977, + "grad_norm": 0.1726304590702057, + "learning_rate": 1.019841616084286e-06, + "loss": 0.323, + "step": 2898 + }, + { + "epoch": 2.4416058394160585, + "grad_norm": 0.17444516718387604, + "learning_rate": 1.0168762006806886e-06, + "loss": 0.3091, + "step": 2899 + }, + { + "epoch": 2.4424480628860192, + "grad_norm": 0.1818244457244873, + "learning_rate": 1.0139146147455842e-06, + "loss": 0.3464, + "step": 2900 + }, + { + "epoch": 2.44329028635598, + "grad_norm": 0.17454013228416443, + "learning_rate": 1.0109568611263094e-06, + "loss": 0.3061, + "step": 2901 + }, + { + "epoch": 2.4441325098259403, + "grad_norm": 0.17296692728996277, + "learning_rate": 1.0080029426665194e-06, + "loss": 0.3122, + "step": 2902 + }, + { + "epoch": 2.444974733295901, + "grad_norm": 0.17483364045619965, + "learning_rate": 1.0050528622061805e-06, + "loss": 0.3076, + "step": 2903 + }, + { + "epoch": 2.445816956765862, + "grad_norm": 0.1877381056547165, + "learning_rate": 1.002106622581569e-06, + "loss": 0.3669, + "step": 2904 + }, + { + "epoch": 2.4466591802358226, + "grad_norm": 0.1751059740781784, + "learning_rate": 9.991642266252672e-07, + "loss": 0.3014, + "step": 2905 + }, + { + "epoch": 2.4475014037057834, + "grad_norm": 0.19321346282958984, + "learning_rate": 9.96225677166166e-07, + "loss": 0.3562, + "step": 2906 + }, + { + "epoch": 2.4483436271757437, + "grad_norm": 0.1792708784341812, + "learning_rate": 9.932909770294542e-07, + "loss": 0.3156, + "step": 2907 + }, + { + "epoch": 2.4491858506457045, + "grad_norm": 0.1754908710718155, + "learning_rate": 9.903601290366217e-07, + "loss": 0.3201, + "step": 2908 + }, + { + "epoch": 2.4500280741156653, + "grad_norm": 0.18347208201885223, + "learning_rate": 9.87433136005454e-07, + "loss": 0.3294, + "step": 2909 + }, + { + "epoch": 2.450870297585626, + "grad_norm": 0.19159957766532898, + "learning_rate": 9.845100007500292e-07, + "loss": 0.3835, + "step": 2910 + }, + { + "epoch": 2.451712521055587, + "grad_norm": 0.16591417789459229, + "learning_rate": 9.81590726080721e-07, + "loss": 0.2709, + "step": 2911 + }, + { + "epoch": 2.4525547445255476, + "grad_norm": 0.19382108747959137, + "learning_rate": 9.786753148041871e-07, + "loss": 0.3401, + "step": 2912 + }, + { + "epoch": 2.4533969679955083, + "grad_norm": 0.1861683875322342, + "learning_rate": 9.757637697233723e-07, + "loss": 0.3328, + "step": 2913 + }, + { + "epoch": 2.4542391914654687, + "grad_norm": 0.1860036998987198, + "learning_rate": 9.728560936375032e-07, + "loss": 0.3403, + "step": 2914 + }, + { + "epoch": 2.4550814149354294, + "grad_norm": 0.19895556569099426, + "learning_rate": 9.699522893420894e-07, + "loss": 0.3314, + "step": 2915 + }, + { + "epoch": 2.45592363840539, + "grad_norm": 0.18061065673828125, + "learning_rate": 9.670523596289138e-07, + "loss": 0.3118, + "step": 2916 + }, + { + "epoch": 2.456765861875351, + "grad_norm": 0.17831942439079285, + "learning_rate": 9.641563072860416e-07, + "loss": 0.3282, + "step": 2917 + }, + { + "epoch": 2.4576080853453117, + "grad_norm": 0.1895962655544281, + "learning_rate": 9.61264135097799e-07, + "loss": 0.3429, + "step": 2918 + }, + { + "epoch": 2.4584503088152725, + "grad_norm": 0.18186987936496735, + "learning_rate": 9.58375845844793e-07, + "loss": 0.322, + "step": 2919 + }, + { + "epoch": 2.4592925322852333, + "grad_norm": 0.1867487132549286, + "learning_rate": 9.55491442303889e-07, + "loss": 0.3304, + "step": 2920 + }, + { + "epoch": 2.4601347557551936, + "grad_norm": 0.1775023490190506, + "learning_rate": 9.526109272482237e-07, + "loss": 0.3296, + "step": 2921 + }, + { + "epoch": 2.4609769792251543, + "grad_norm": 0.18398189544677734, + "learning_rate": 9.497343034471896e-07, + "loss": 0.3703, + "step": 2922 + }, + { + "epoch": 2.461819202695115, + "grad_norm": 0.16970793902873993, + "learning_rate": 9.468615736664405e-07, + "loss": 0.3224, + "step": 2923 + }, + { + "epoch": 2.462661426165076, + "grad_norm": 0.18678072094917297, + "learning_rate": 9.439927406678845e-07, + "loss": 0.3703, + "step": 2924 + }, + { + "epoch": 2.4635036496350367, + "grad_norm": 0.17588737607002258, + "learning_rate": 9.41127807209688e-07, + "loss": 0.35, + "step": 2925 + }, + { + "epoch": 2.464345873104997, + "grad_norm": 0.15737545490264893, + "learning_rate": 9.382667760462633e-07, + "loss": 0.2765, + "step": 2926 + }, + { + "epoch": 2.4651880965749577, + "grad_norm": 0.19520333409309387, + "learning_rate": 9.354096499282728e-07, + "loss": 0.3557, + "step": 2927 + }, + { + "epoch": 2.4660303200449185, + "grad_norm": 0.17047427594661713, + "learning_rate": 9.325564316026236e-07, + "loss": 0.2859, + "step": 2928 + }, + { + "epoch": 2.4668725435148793, + "grad_norm": 0.19578218460083008, + "learning_rate": 9.297071238124683e-07, + "loss": 0.3369, + "step": 2929 + }, + { + "epoch": 2.46771476698484, + "grad_norm": 0.19017136096954346, + "learning_rate": 9.268617292971982e-07, + "loss": 0.3502, + "step": 2930 + }, + { + "epoch": 2.468556990454801, + "grad_norm": 0.17374084889888763, + "learning_rate": 9.240202507924412e-07, + "loss": 0.3248, + "step": 2931 + }, + { + "epoch": 2.4693992139247616, + "grad_norm": 0.18185202777385712, + "learning_rate": 9.211826910300598e-07, + "loss": 0.3337, + "step": 2932 + }, + { + "epoch": 2.470241437394722, + "grad_norm": 0.17019344866275787, + "learning_rate": 9.183490527381539e-07, + "loss": 0.3557, + "step": 2933 + }, + { + "epoch": 2.4710836608646827, + "grad_norm": 0.17223379015922546, + "learning_rate": 9.155193386410466e-07, + "loss": 0.3242, + "step": 2934 + }, + { + "epoch": 2.4719258843346434, + "grad_norm": 0.18503350019454956, + "learning_rate": 9.126935514592949e-07, + "loss": 0.3253, + "step": 2935 + }, + { + "epoch": 2.472768107804604, + "grad_norm": 0.1900034248828888, + "learning_rate": 9.098716939096719e-07, + "loss": 0.3339, + "step": 2936 + }, + { + "epoch": 2.473610331274565, + "grad_norm": 0.20346327126026154, + "learning_rate": 9.070537687051817e-07, + "loss": 0.376, + "step": 2937 + }, + { + "epoch": 2.4744525547445253, + "grad_norm": 0.18149623274803162, + "learning_rate": 9.042397785550405e-07, + "loss": 0.333, + "step": 2938 + }, + { + "epoch": 2.475294778214486, + "grad_norm": 0.18477939069271088, + "learning_rate": 9.014297261646876e-07, + "loss": 0.3282, + "step": 2939 + }, + { + "epoch": 2.476137001684447, + "grad_norm": 0.18960389494895935, + "learning_rate": 8.986236142357707e-07, + "loss": 0.31, + "step": 2940 + }, + { + "epoch": 2.4769792251544076, + "grad_norm": 0.1833685338497162, + "learning_rate": 8.958214454661529e-07, + "loss": 0.356, + "step": 2941 + }, + { + "epoch": 2.4778214486243684, + "grad_norm": 0.17379865050315857, + "learning_rate": 8.930232225499025e-07, + "loss": 0.3437, + "step": 2942 + }, + { + "epoch": 2.478663672094329, + "grad_norm": 0.1800137162208557, + "learning_rate": 8.902289481772996e-07, + "loss": 0.3334, + "step": 2943 + }, + { + "epoch": 2.47950589556429, + "grad_norm": 0.17582203447818756, + "learning_rate": 8.874386250348232e-07, + "loss": 0.3356, + "step": 2944 + }, + { + "epoch": 2.4803481190342502, + "grad_norm": 0.17575836181640625, + "learning_rate": 8.846522558051563e-07, + "loss": 0.3388, + "step": 2945 + }, + { + "epoch": 2.481190342504211, + "grad_norm": 0.2217385172843933, + "learning_rate": 8.818698431671774e-07, + "loss": 0.3411, + "step": 2946 + }, + { + "epoch": 2.4820325659741718, + "grad_norm": 0.20173339545726776, + "learning_rate": 8.790913897959663e-07, + "loss": 0.3125, + "step": 2947 + }, + { + "epoch": 2.4828747894441325, + "grad_norm": 0.1942809671163559, + "learning_rate": 8.763168983627912e-07, + "loss": 0.3455, + "step": 2948 + }, + { + "epoch": 2.4837170129140933, + "grad_norm": 0.1882442831993103, + "learning_rate": 8.735463715351139e-07, + "loss": 0.2995, + "step": 2949 + }, + { + "epoch": 2.484559236384054, + "grad_norm": 0.1855713129043579, + "learning_rate": 8.70779811976582e-07, + "loss": 0.3468, + "step": 2950 + }, + { + "epoch": 2.485401459854015, + "grad_norm": 0.41547971963882446, + "learning_rate": 8.680172223470329e-07, + "loss": 0.3343, + "step": 2951 + }, + { + "epoch": 2.486243683323975, + "grad_norm": 0.18178273737430573, + "learning_rate": 8.652586053024836e-07, + "loss": 0.3185, + "step": 2952 + }, + { + "epoch": 2.487085906793936, + "grad_norm": 0.20105557143688202, + "learning_rate": 8.625039634951354e-07, + "loss": 0.3473, + "step": 2953 + }, + { + "epoch": 2.4879281302638967, + "grad_norm": 0.1699233204126358, + "learning_rate": 8.597532995733615e-07, + "loss": 0.2968, + "step": 2954 + }, + { + "epoch": 2.4887703537338575, + "grad_norm": 0.1811775267124176, + "learning_rate": 8.570066161817176e-07, + "loss": 0.3389, + "step": 2955 + }, + { + "epoch": 2.4896125772038182, + "grad_norm": 0.18310898542404175, + "learning_rate": 8.542639159609278e-07, + "loss": 0.333, + "step": 2956 + }, + { + "epoch": 2.4904548006737786, + "grad_norm": 0.173427015542984, + "learning_rate": 8.515252015478915e-07, + "loss": 0.3211, + "step": 2957 + }, + { + "epoch": 2.4912970241437393, + "grad_norm": 0.19226166605949402, + "learning_rate": 8.487904755756676e-07, + "loss": 0.3345, + "step": 2958 + }, + { + "epoch": 2.4921392476137, + "grad_norm": 0.17834272980690002, + "learning_rate": 8.460597406734905e-07, + "loss": 0.3325, + "step": 2959 + }, + { + "epoch": 2.492981471083661, + "grad_norm": 0.17401964962482452, + "learning_rate": 8.433329994667488e-07, + "loss": 0.3249, + "step": 2960 + }, + { + "epoch": 2.4938236945536216, + "grad_norm": 0.19734671711921692, + "learning_rate": 8.406102545769989e-07, + "loss": 0.3586, + "step": 2961 + }, + { + "epoch": 2.4946659180235824, + "grad_norm": 0.1604025810956955, + "learning_rate": 8.378915086219497e-07, + "loss": 0.2906, + "step": 2962 + }, + { + "epoch": 2.495508141493543, + "grad_norm": 0.17842164635658264, + "learning_rate": 8.351767642154673e-07, + "loss": 0.336, + "step": 2963 + }, + { + "epoch": 2.4963503649635035, + "grad_norm": 0.38500332832336426, + "learning_rate": 8.324660239675697e-07, + "loss": 0.3368, + "step": 2964 + }, + { + "epoch": 2.4971925884334643, + "grad_norm": 0.17705510556697845, + "learning_rate": 8.297592904844282e-07, + "loss": 0.3123, + "step": 2965 + }, + { + "epoch": 2.498034811903425, + "grad_norm": 0.18042810261249542, + "learning_rate": 8.270565663683583e-07, + "loss": 0.3285, + "step": 2966 + }, + { + "epoch": 2.498877035373386, + "grad_norm": 0.18032227456569672, + "learning_rate": 8.243578542178227e-07, + "loss": 0.3032, + "step": 2967 + }, + { + "epoch": 2.4997192588433466, + "grad_norm": 0.19498011469841003, + "learning_rate": 8.216631566274252e-07, + "loss": 0.3297, + "step": 2968 + }, + { + "epoch": 2.500561482313307, + "grad_norm": 0.17469584941864014, + "learning_rate": 8.189724761879131e-07, + "loss": 0.3529, + "step": 2969 + }, + { + "epoch": 2.501403705783268, + "grad_norm": 0.1745612919330597, + "learning_rate": 8.16285815486168e-07, + "loss": 0.3364, + "step": 2970 + }, + { + "epoch": 2.5022459292532284, + "grad_norm": 0.18163642287254333, + "learning_rate": 8.13603177105211e-07, + "loss": 0.3452, + "step": 2971 + }, + { + "epoch": 2.503088152723189, + "grad_norm": 0.1809900403022766, + "learning_rate": 8.109245636241892e-07, + "loss": 0.3404, + "step": 2972 + }, + { + "epoch": 2.50393037619315, + "grad_norm": 0.16917380690574646, + "learning_rate": 8.082499776183883e-07, + "loss": 0.2734, + "step": 2973 + }, + { + "epoch": 2.5047725996631107, + "grad_norm": 0.19013775885105133, + "learning_rate": 8.05579421659215e-07, + "loss": 0.3715, + "step": 2974 + }, + { + "epoch": 2.5056148231330715, + "grad_norm": 0.18309292197227478, + "learning_rate": 8.029128983142076e-07, + "loss": 0.3175, + "step": 2975 + }, + { + "epoch": 2.506457046603032, + "grad_norm": 0.17404715716838837, + "learning_rate": 8.002504101470204e-07, + "loss": 0.2973, + "step": 2976 + }, + { + "epoch": 2.5072992700729926, + "grad_norm": 0.17378441989421844, + "learning_rate": 7.975919597174342e-07, + "loss": 0.3214, + "step": 2977 + }, + { + "epoch": 2.5081414935429533, + "grad_norm": 0.17642168700695038, + "learning_rate": 7.949375495813439e-07, + "loss": 0.3262, + "step": 2978 + }, + { + "epoch": 2.508983717012914, + "grad_norm": 0.1785106658935547, + "learning_rate": 7.922871822907641e-07, + "loss": 0.3236, + "step": 2979 + }, + { + "epoch": 2.509825940482875, + "grad_norm": 0.1916131228208542, + "learning_rate": 7.896408603938194e-07, + "loss": 0.2964, + "step": 2980 + }, + { + "epoch": 2.5106681639528357, + "grad_norm": 0.2003284990787506, + "learning_rate": 7.869985864347424e-07, + "loss": 0.3688, + "step": 2981 + }, + { + "epoch": 2.5115103874227964, + "grad_norm": 0.1730331927537918, + "learning_rate": 7.843603629538804e-07, + "loss": 0.3459, + "step": 2982 + }, + { + "epoch": 2.5123526108927567, + "grad_norm": 0.1806933879852295, + "learning_rate": 7.817261924876812e-07, + "loss": 0.2955, + "step": 2983 + }, + { + "epoch": 2.5131948343627175, + "grad_norm": 0.18728403747081757, + "learning_rate": 7.790960775687001e-07, + "loss": 0.3404, + "step": 2984 + }, + { + "epoch": 2.5140370578326783, + "grad_norm": 0.19027812778949738, + "learning_rate": 7.764700207255904e-07, + "loss": 0.3419, + "step": 2985 + }, + { + "epoch": 2.514879281302639, + "grad_norm": 0.17174813151359558, + "learning_rate": 7.738480244831042e-07, + "loss": 0.3349, + "step": 2986 + }, + { + "epoch": 2.5157215047726, + "grad_norm": 0.2719394266605377, + "learning_rate": 7.71230091362089e-07, + "loss": 0.332, + "step": 2987 + }, + { + "epoch": 2.51656372824256, + "grad_norm": 0.1689806878566742, + "learning_rate": 7.686162238794898e-07, + "loss": 0.314, + "step": 2988 + }, + { + "epoch": 2.517405951712521, + "grad_norm": 0.18157345056533813, + "learning_rate": 7.660064245483384e-07, + "loss": 0.3358, + "step": 2989 + }, + { + "epoch": 2.5182481751824817, + "grad_norm": 0.16696105897426605, + "learning_rate": 7.634006958777568e-07, + "loss": 0.3185, + "step": 2990 + }, + { + "epoch": 2.5190903986524424, + "grad_norm": 0.17663368582725525, + "learning_rate": 7.607990403729526e-07, + "loss": 0.3406, + "step": 2991 + }, + { + "epoch": 2.519932622122403, + "grad_norm": 0.18124790489673615, + "learning_rate": 7.582014605352206e-07, + "loss": 0.3177, + "step": 2992 + }, + { + "epoch": 2.520774845592364, + "grad_norm": 0.18290624022483826, + "learning_rate": 7.556079588619341e-07, + "loss": 0.3303, + "step": 2993 + }, + { + "epoch": 2.5216170690623247, + "grad_norm": 0.16427665948867798, + "learning_rate": 7.530185378465459e-07, + "loss": 0.3084, + "step": 2994 + }, + { + "epoch": 2.522459292532285, + "grad_norm": 0.19871890544891357, + "learning_rate": 7.504331999785852e-07, + "loss": 0.3531, + "step": 2995 + }, + { + "epoch": 2.523301516002246, + "grad_norm": 0.17493316531181335, + "learning_rate": 7.47851947743658e-07, + "loss": 0.3122, + "step": 2996 + }, + { + "epoch": 2.5241437394722066, + "grad_norm": 0.1823892593383789, + "learning_rate": 7.452747836234392e-07, + "loss": 0.333, + "step": 2997 + }, + { + "epoch": 2.5249859629421674, + "grad_norm": 0.1649978756904602, + "learning_rate": 7.427017100956779e-07, + "loss": 0.2861, + "step": 2998 + }, + { + "epoch": 2.525828186412128, + "grad_norm": 0.17946523427963257, + "learning_rate": 7.401327296341826e-07, + "loss": 0.3261, + "step": 2999 + }, + { + "epoch": 2.5266704098820885, + "grad_norm": 0.18829745054244995, + "learning_rate": 7.375678447088347e-07, + "loss": 0.3552, + "step": 3000 + }, + { + "epoch": 2.5275126333520497, + "grad_norm": 0.18125388026237488, + "learning_rate": 7.350070577855716e-07, + "loss": 0.34, + "step": 3001 + }, + { + "epoch": 2.52835485682201, + "grad_norm": 0.18347607553005219, + "learning_rate": 7.324503713263975e-07, + "loss": 0.3344, + "step": 3002 + }, + { + "epoch": 2.5291970802919708, + "grad_norm": 0.16962610185146332, + "learning_rate": 7.298977877893688e-07, + "loss": 0.3393, + "step": 3003 + }, + { + "epoch": 2.5300393037619315, + "grad_norm": 0.17049525678157806, + "learning_rate": 7.273493096285989e-07, + "loss": 0.3219, + "step": 3004 + }, + { + "epoch": 2.5308815272318923, + "grad_norm": 0.18462485074996948, + "learning_rate": 7.24804939294253e-07, + "loss": 0.3143, + "step": 3005 + }, + { + "epoch": 2.531723750701853, + "grad_norm": 0.16474902629852295, + "learning_rate": 7.222646792325516e-07, + "loss": 0.3203, + "step": 3006 + }, + { + "epoch": 2.5325659741718134, + "grad_norm": 0.17993924021720886, + "learning_rate": 7.197285318857584e-07, + "loss": 0.3542, + "step": 3007 + }, + { + "epoch": 2.533408197641774, + "grad_norm": 0.20460404455661774, + "learning_rate": 7.171964996921848e-07, + "loss": 0.3391, + "step": 3008 + }, + { + "epoch": 2.534250421111735, + "grad_norm": 0.1803237944841385, + "learning_rate": 7.146685850861851e-07, + "loss": 0.3286, + "step": 3009 + }, + { + "epoch": 2.5350926445816957, + "grad_norm": 0.19286656379699707, + "learning_rate": 7.121447904981571e-07, + "loss": 0.3516, + "step": 3010 + }, + { + "epoch": 2.5359348680516565, + "grad_norm": 0.1730838119983673, + "learning_rate": 7.096251183545355e-07, + "loss": 0.3244, + "step": 3011 + }, + { + "epoch": 2.5367770915216172, + "grad_norm": 0.18125079572200775, + "learning_rate": 7.071095710777925e-07, + "loss": 0.3491, + "step": 3012 + }, + { + "epoch": 2.537619314991578, + "grad_norm": 0.18218089640140533, + "learning_rate": 7.045981510864319e-07, + "loss": 0.3676, + "step": 3013 + }, + { + "epoch": 2.5384615384615383, + "grad_norm": 0.17780695855617523, + "learning_rate": 7.02090860794995e-07, + "loss": 0.3324, + "step": 3014 + }, + { + "epoch": 2.539303761931499, + "grad_norm": 0.1743364930152893, + "learning_rate": 6.995877026140468e-07, + "loss": 0.3368, + "step": 3015 + }, + { + "epoch": 2.54014598540146, + "grad_norm": 0.1723746955394745, + "learning_rate": 6.970886789501851e-07, + "loss": 0.3074, + "step": 3016 + }, + { + "epoch": 2.5409882088714206, + "grad_norm": 0.19902728497982025, + "learning_rate": 6.945937922060259e-07, + "loss": 0.3647, + "step": 3017 + }, + { + "epoch": 2.5418304323413814, + "grad_norm": 0.19043061137199402, + "learning_rate": 6.921030447802146e-07, + "loss": 0.3365, + "step": 3018 + }, + { + "epoch": 2.5426726558113417, + "grad_norm": 0.1770019680261612, + "learning_rate": 6.896164390674125e-07, + "loss": 0.3303, + "step": 3019 + }, + { + "epoch": 2.5435148792813025, + "grad_norm": 0.17343956232070923, + "learning_rate": 6.871339774583025e-07, + "loss": 0.3198, + "step": 3020 + }, + { + "epoch": 2.5443571027512633, + "grad_norm": 0.19765575230121613, + "learning_rate": 6.846556623395795e-07, + "loss": 0.3465, + "step": 3021 + }, + { + "epoch": 2.545199326221224, + "grad_norm": 0.1678420454263687, + "learning_rate": 6.821814960939549e-07, + "loss": 0.3295, + "step": 3022 + }, + { + "epoch": 2.546041549691185, + "grad_norm": 0.16188831627368927, + "learning_rate": 6.797114811001482e-07, + "loss": 0.305, + "step": 3023 + }, + { + "epoch": 2.5468837731611456, + "grad_norm": 0.1842898577451706, + "learning_rate": 6.772456197328919e-07, + "loss": 0.372, + "step": 3024 + }, + { + "epoch": 2.5477259966311063, + "grad_norm": 0.18573446571826935, + "learning_rate": 6.74783914362922e-07, + "loss": 0.333, + "step": 3025 + }, + { + "epoch": 2.5485682201010667, + "grad_norm": 0.16520562767982483, + "learning_rate": 6.723263673569796e-07, + "loss": 0.2889, + "step": 3026 + }, + { + "epoch": 2.5494104435710274, + "grad_norm": 0.19662410020828247, + "learning_rate": 6.698729810778065e-07, + "loss": 0.3318, + "step": 3027 + }, + { + "epoch": 2.550252667040988, + "grad_norm": 0.18073995411396027, + "learning_rate": 6.674237578841486e-07, + "loss": 0.3625, + "step": 3028 + }, + { + "epoch": 2.551094890510949, + "grad_norm": 0.17097055912017822, + "learning_rate": 6.649787001307451e-07, + "loss": 0.2885, + "step": 3029 + }, + { + "epoch": 2.5519371139809097, + "grad_norm": 0.18093878030776978, + "learning_rate": 6.625378101683317e-07, + "loss": 0.338, + "step": 3030 + }, + { + "epoch": 2.55277933745087, + "grad_norm": 0.17633871734142303, + "learning_rate": 6.601010903436355e-07, + "loss": 0.3234, + "step": 3031 + }, + { + "epoch": 2.5536215609208313, + "grad_norm": 0.17590993642807007, + "learning_rate": 6.57668542999379e-07, + "loss": 0.3093, + "step": 3032 + }, + { + "epoch": 2.5544637843907916, + "grad_norm": 0.1865670382976532, + "learning_rate": 6.552401704742678e-07, + "loss": 0.3262, + "step": 3033 + }, + { + "epoch": 2.5553060078607523, + "grad_norm": 0.18153560161590576, + "learning_rate": 6.528159751029988e-07, + "loss": 0.3753, + "step": 3034 + }, + { + "epoch": 2.556148231330713, + "grad_norm": 0.1866806298494339, + "learning_rate": 6.503959592162468e-07, + "loss": 0.3234, + "step": 3035 + }, + { + "epoch": 2.556990454800674, + "grad_norm": 0.18803523480892181, + "learning_rate": 6.479801251406748e-07, + "loss": 0.3966, + "step": 3036 + }, + { + "epoch": 2.5578326782706347, + "grad_norm": 0.1652144193649292, + "learning_rate": 6.455684751989194e-07, + "loss": 0.33, + "step": 3037 + }, + { + "epoch": 2.558674901740595, + "grad_norm": 0.18305109441280365, + "learning_rate": 6.431610117095999e-07, + "loss": 0.3395, + "step": 3038 + }, + { + "epoch": 2.5595171252105557, + "grad_norm": 0.21174503862857819, + "learning_rate": 6.40757736987307e-07, + "loss": 0.327, + "step": 3039 + }, + { + "epoch": 2.5603593486805165, + "grad_norm": 0.17883852124214172, + "learning_rate": 6.383586533426051e-07, + "loss": 0.3276, + "step": 3040 + }, + { + "epoch": 2.5612015721504773, + "grad_norm": 0.17874382436275482, + "learning_rate": 6.359637630820292e-07, + "loss": 0.3482, + "step": 3041 + }, + { + "epoch": 2.562043795620438, + "grad_norm": 0.17511144280433655, + "learning_rate": 6.335730685080838e-07, + "loss": 0.3241, + "step": 3042 + }, + { + "epoch": 2.562886019090399, + "grad_norm": 0.18171054124832153, + "learning_rate": 6.311865719192384e-07, + "loss": 0.3565, + "step": 3043 + }, + { + "epoch": 2.5637282425603596, + "grad_norm": 0.16010089218616486, + "learning_rate": 6.28804275609926e-07, + "loss": 0.3, + "step": 3044 + }, + { + "epoch": 2.56457046603032, + "grad_norm": 0.17939692735671997, + "learning_rate": 6.26426181870542e-07, + "loss": 0.3338, + "step": 3045 + }, + { + "epoch": 2.5654126895002807, + "grad_norm": 0.18418268859386444, + "learning_rate": 6.24052292987442e-07, + "loss": 0.3296, + "step": 3046 + }, + { + "epoch": 2.5662549129702414, + "grad_norm": 0.1782510131597519, + "learning_rate": 6.216826112429391e-07, + "loss": 0.3392, + "step": 3047 + }, + { + "epoch": 2.567097136440202, + "grad_norm": 0.17586378753185272, + "learning_rate": 6.193171389152996e-07, + "loss": 0.3384, + "step": 3048 + }, + { + "epoch": 2.567939359910163, + "grad_norm": 0.19140559434890747, + "learning_rate": 6.169558782787438e-07, + "loss": 0.3345, + "step": 3049 + }, + { + "epoch": 2.5687815833801233, + "grad_norm": 0.18451711535453796, + "learning_rate": 6.145988316034441e-07, + "loss": 0.3388, + "step": 3050 + }, + { + "epoch": 2.569623806850084, + "grad_norm": 0.16443663835525513, + "learning_rate": 6.122460011555187e-07, + "loss": 0.3415, + "step": 3051 + }, + { + "epoch": 2.570466030320045, + "grad_norm": 0.17171230912208557, + "learning_rate": 6.098973891970373e-07, + "loss": 0.3185, + "step": 3052 + }, + { + "epoch": 2.5713082537900056, + "grad_norm": 0.17013221979141235, + "learning_rate": 6.075529979860068e-07, + "loss": 0.3115, + "step": 3053 + }, + { + "epoch": 2.5721504772599664, + "grad_norm": 0.18915019929409027, + "learning_rate": 6.052128297763804e-07, + "loss": 0.3331, + "step": 3054 + }, + { + "epoch": 2.572992700729927, + "grad_norm": 0.17950762808322906, + "learning_rate": 6.028768868180523e-07, + "loss": 0.3327, + "step": 3055 + }, + { + "epoch": 2.573834924199888, + "grad_norm": 0.17014242708683014, + "learning_rate": 6.005451713568505e-07, + "loss": 0.3283, + "step": 3056 + }, + { + "epoch": 2.5746771476698482, + "grad_norm": 0.1866944283246994, + "learning_rate": 5.982176856345445e-07, + "loss": 0.3299, + "step": 3057 + }, + { + "epoch": 2.575519371139809, + "grad_norm": 0.18184182047843933, + "learning_rate": 5.958944318888287e-07, + "loss": 0.3335, + "step": 3058 + }, + { + "epoch": 2.5763615946097698, + "grad_norm": 0.19136478006839752, + "learning_rate": 5.935754123533378e-07, + "loss": 0.3218, + "step": 3059 + }, + { + "epoch": 2.5772038180797305, + "grad_norm": 0.16770581901073456, + "learning_rate": 5.912606292576284e-07, + "loss": 0.3255, + "step": 3060 + }, + { + "epoch": 2.5780460415496913, + "grad_norm": 0.18040478229522705, + "learning_rate": 5.889500848271901e-07, + "loss": 0.3051, + "step": 3061 + }, + { + "epoch": 2.5788882650196516, + "grad_norm": 0.17083778977394104, + "learning_rate": 5.866437812834325e-07, + "loss": 0.3578, + "step": 3062 + }, + { + "epoch": 2.579730488489613, + "grad_norm": 0.18398484587669373, + "learning_rate": 5.843417208436908e-07, + "loss": 0.3524, + "step": 3063 + }, + { + "epoch": 2.580572711959573, + "grad_norm": 0.17296041548252106, + "learning_rate": 5.82043905721218e-07, + "loss": 0.345, + "step": 3064 + }, + { + "epoch": 2.581414935429534, + "grad_norm": 0.18131209909915924, + "learning_rate": 5.797503381251896e-07, + "loss": 0.334, + "step": 3065 + }, + { + "epoch": 2.5822571588994947, + "grad_norm": 0.17383840680122375, + "learning_rate": 5.774610202606939e-07, + "loss": 0.3408, + "step": 3066 + }, + { + "epoch": 2.5830993823694555, + "grad_norm": 0.17166505753993988, + "learning_rate": 5.751759543287355e-07, + "loss": 0.3521, + "step": 3067 + }, + { + "epoch": 2.5839416058394162, + "grad_norm": 0.17635107040405273, + "learning_rate": 5.728951425262292e-07, + "loss": 0.2984, + "step": 3068 + }, + { + "epoch": 2.5847838293093766, + "grad_norm": 0.16817434132099152, + "learning_rate": 5.706185870460018e-07, + "loss": 0.3215, + "step": 3069 + }, + { + "epoch": 2.5856260527793373, + "grad_norm": 0.1779753714799881, + "learning_rate": 5.683462900767873e-07, + "loss": 0.3313, + "step": 3070 + }, + { + "epoch": 2.586468276249298, + "grad_norm": 0.17176592350006104, + "learning_rate": 5.660782538032245e-07, + "loss": 0.3281, + "step": 3071 + }, + { + "epoch": 2.587310499719259, + "grad_norm": 0.18446749448776245, + "learning_rate": 5.63814480405856e-07, + "loss": 0.3245, + "step": 3072 + }, + { + "epoch": 2.5881527231892196, + "grad_norm": 0.17266134917736053, + "learning_rate": 5.61554972061128e-07, + "loss": 0.3175, + "step": 3073 + }, + { + "epoch": 2.5889949466591804, + "grad_norm": 0.17002305388450623, + "learning_rate": 5.592997309413834e-07, + "loss": 0.3193, + "step": 3074 + }, + { + "epoch": 2.589837170129141, + "grad_norm": 0.19935524463653564, + "learning_rate": 5.570487592148666e-07, + "loss": 0.3508, + "step": 3075 + }, + { + "epoch": 2.5906793935991015, + "grad_norm": 0.15772496163845062, + "learning_rate": 5.548020590457098e-07, + "loss": 0.3333, + "step": 3076 + }, + { + "epoch": 2.5915216170690623, + "grad_norm": 0.1764228343963623, + "learning_rate": 5.525596325939469e-07, + "loss": 0.3406, + "step": 3077 + }, + { + "epoch": 2.592363840539023, + "grad_norm": 0.18433359265327454, + "learning_rate": 5.503214820154979e-07, + "loss": 0.3054, + "step": 3078 + }, + { + "epoch": 2.593206064008984, + "grad_norm": 0.17767508327960968, + "learning_rate": 5.480876094621734e-07, + "loss": 0.319, + "step": 3079 + }, + { + "epoch": 2.5940482874789446, + "grad_norm": 0.17714276909828186, + "learning_rate": 5.458580170816713e-07, + "loss": 0.3555, + "step": 3080 + }, + { + "epoch": 2.594890510948905, + "grad_norm": 0.16562534868717194, + "learning_rate": 5.436327070175729e-07, + "loss": 0.2983, + "step": 3081 + }, + { + "epoch": 2.5957327344188657, + "grad_norm": 0.2066238671541214, + "learning_rate": 5.414116814093434e-07, + "loss": 0.3616, + "step": 3082 + }, + { + "epoch": 2.5965749578888264, + "grad_norm": 0.18519745767116547, + "learning_rate": 5.391949423923298e-07, + "loss": 0.3018, + "step": 3083 + }, + { + "epoch": 2.597417181358787, + "grad_norm": 0.18363292515277863, + "learning_rate": 5.369824920977567e-07, + "loss": 0.3116, + "step": 3084 + }, + { + "epoch": 2.598259404828748, + "grad_norm": 0.18081538379192352, + "learning_rate": 5.347743326527255e-07, + "loss": 0.3443, + "step": 3085 + }, + { + "epoch": 2.5991016282987087, + "grad_norm": 0.185759037733078, + "learning_rate": 5.325704661802106e-07, + "loss": 0.353, + "step": 3086 + }, + { + "epoch": 2.5999438517686695, + "grad_norm": 0.1803555190563202, + "learning_rate": 5.303708947990638e-07, + "loss": 0.3493, + "step": 3087 + }, + { + "epoch": 2.60078607523863, + "grad_norm": 0.1740846186876297, + "learning_rate": 5.281756206240035e-07, + "loss": 0.2993, + "step": 3088 + }, + { + "epoch": 2.6016282987085906, + "grad_norm": 0.17752055823802948, + "learning_rate": 5.25984645765617e-07, + "loss": 0.347, + "step": 3089 + }, + { + "epoch": 2.6024705221785513, + "grad_norm": 0.17252019047737122, + "learning_rate": 5.237979723303582e-07, + "loss": 0.3308, + "step": 3090 + }, + { + "epoch": 2.603312745648512, + "grad_norm": 0.1650722175836563, + "learning_rate": 5.216156024205482e-07, + "loss": 0.3362, + "step": 3091 + }, + { + "epoch": 2.604154969118473, + "grad_norm": 0.16832077503204346, + "learning_rate": 5.194375381343664e-07, + "loss": 0.3179, + "step": 3092 + }, + { + "epoch": 2.604997192588433, + "grad_norm": 0.1857997328042984, + "learning_rate": 5.172637815658583e-07, + "loss": 0.3459, + "step": 3093 + }, + { + "epoch": 2.6058394160583944, + "grad_norm": 0.17449375987052917, + "learning_rate": 5.150943348049198e-07, + "loss": 0.3144, + "step": 3094 + }, + { + "epoch": 2.6066816395283547, + "grad_norm": 0.19014085829257965, + "learning_rate": 5.129291999373109e-07, + "loss": 0.3837, + "step": 3095 + }, + { + "epoch": 2.6075238629983155, + "grad_norm": 0.1807759404182434, + "learning_rate": 5.107683790446411e-07, + "loss": 0.3193, + "step": 3096 + }, + { + "epoch": 2.6083660864682763, + "grad_norm": 0.17734181880950928, + "learning_rate": 5.086118742043761e-07, + "loss": 0.3575, + "step": 3097 + }, + { + "epoch": 2.609208309938237, + "grad_norm": 0.16949152946472168, + "learning_rate": 5.064596874898292e-07, + "loss": 0.293, + "step": 3098 + }, + { + "epoch": 2.610050533408198, + "grad_norm": 0.17707063257694244, + "learning_rate": 5.04311820970163e-07, + "loss": 0.3613, + "step": 3099 + }, + { + "epoch": 2.610892756878158, + "grad_norm": 0.18580542504787445, + "learning_rate": 5.021682767103858e-07, + "loss": 0.3379, + "step": 3100 + }, + { + "epoch": 2.611734980348119, + "grad_norm": 0.18288764357566833, + "learning_rate": 5.000290567713533e-07, + "loss": 0.3342, + "step": 3101 + }, + { + "epoch": 2.6125772038180797, + "grad_norm": 0.19285543262958527, + "learning_rate": 4.978941632097612e-07, + "loss": 0.3157, + "step": 3102 + }, + { + "epoch": 2.6134194272880404, + "grad_norm": 0.17595289647579193, + "learning_rate": 4.957635980781445e-07, + "loss": 0.3196, + "step": 3103 + }, + { + "epoch": 2.614261650758001, + "grad_norm": 0.18875975906848907, + "learning_rate": 4.936373634248792e-07, + "loss": 0.3137, + "step": 3104 + }, + { + "epoch": 2.615103874227962, + "grad_norm": 0.17160099744796753, + "learning_rate": 4.915154612941781e-07, + "loss": 0.312, + "step": 3105 + }, + { + "epoch": 2.6159460976979227, + "grad_norm": 0.1704033613204956, + "learning_rate": 4.893978937260868e-07, + "loss": 0.3431, + "step": 3106 + }, + { + "epoch": 2.616788321167883, + "grad_norm": 0.18089723587036133, + "learning_rate": 4.872846627564842e-07, + "loss": 0.3543, + "step": 3107 + }, + { + "epoch": 2.617630544637844, + "grad_norm": 0.1861312836408615, + "learning_rate": 4.851757704170796e-07, + "loss": 0.3494, + "step": 3108 + }, + { + "epoch": 2.6184727681078046, + "grad_norm": 0.19477856159210205, + "learning_rate": 4.830712187354125e-07, + "loss": 0.3508, + "step": 3109 + }, + { + "epoch": 2.6193149915777654, + "grad_norm": 0.1828373372554779, + "learning_rate": 4.809710097348469e-07, + "loss": 0.2927, + "step": 3110 + }, + { + "epoch": 2.620157215047726, + "grad_norm": 0.19689728319644928, + "learning_rate": 4.788751454345763e-07, + "loss": 0.3149, + "step": 3111 + }, + { + "epoch": 2.6209994385176865, + "grad_norm": 0.19670797884464264, + "learning_rate": 4.767836278496085e-07, + "loss": 0.3386, + "step": 3112 + }, + { + "epoch": 2.6218416619876472, + "grad_norm": 0.16965490579605103, + "learning_rate": 4.7469645899078153e-07, + "loss": 0.3201, + "step": 3113 + }, + { + "epoch": 2.622683885457608, + "grad_norm": 0.1843360811471939, + "learning_rate": 4.726136408647464e-07, + "loss": 0.332, + "step": 3114 + }, + { + "epoch": 2.6235261089275688, + "grad_norm": 0.17019793391227722, + "learning_rate": 4.7053517547397454e-07, + "loss": 0.3233, + "step": 3115 + }, + { + "epoch": 2.6243683323975295, + "grad_norm": 0.16619959473609924, + "learning_rate": 4.6846106481675035e-07, + "loss": 0.3347, + "step": 3116 + }, + { + "epoch": 2.6252105558674903, + "grad_norm": 0.1637767255306244, + "learning_rate": 4.663913108871726e-07, + "loss": 0.3206, + "step": 3117 + }, + { + "epoch": 2.626052779337451, + "grad_norm": 0.17986620962619781, + "learning_rate": 4.643259156751506e-07, + "loss": 0.3438, + "step": 3118 + }, + { + "epoch": 2.6268950028074114, + "grad_norm": 0.18076659739017487, + "learning_rate": 4.622648811664049e-07, + "loss": 0.3557, + "step": 3119 + }, + { + "epoch": 2.627737226277372, + "grad_norm": 0.16573551297187805, + "learning_rate": 4.60208209342462e-07, + "loss": 0.3173, + "step": 3120 + }, + { + "epoch": 2.628579449747333, + "grad_norm": 0.19350749254226685, + "learning_rate": 4.581559021806542e-07, + "loss": 0.3453, + "step": 3121 + }, + { + "epoch": 2.6294216732172937, + "grad_norm": 0.18189962208271027, + "learning_rate": 4.561079616541164e-07, + "loss": 0.3085, + "step": 3122 + }, + { + "epoch": 2.6302638966872545, + "grad_norm": 0.18365921080112457, + "learning_rate": 4.540643897317887e-07, + "loss": 0.3258, + "step": 3123 + }, + { + "epoch": 2.631106120157215, + "grad_norm": 0.16131576895713806, + "learning_rate": 4.520251883784077e-07, + "loss": 0.3052, + "step": 3124 + }, + { + "epoch": 2.631948343627176, + "grad_norm": 0.1853943169116974, + "learning_rate": 4.4999035955450964e-07, + "loss": 0.3786, + "step": 3125 + }, + { + "epoch": 2.6327905670971363, + "grad_norm": 0.1730596274137497, + "learning_rate": 4.4795990521642684e-07, + "loss": 0.322, + "step": 3126 + }, + { + "epoch": 2.633632790567097, + "grad_norm": 0.18700529634952545, + "learning_rate": 4.459338273162844e-07, + "loss": 0.3538, + "step": 3127 + }, + { + "epoch": 2.634475014037058, + "grad_norm": 0.17436066269874573, + "learning_rate": 4.439121278020031e-07, + "loss": 0.3491, + "step": 3128 + }, + { + "epoch": 2.6353172375070186, + "grad_norm": 0.1638750582933426, + "learning_rate": 4.4189480861729137e-07, + "loss": 0.3038, + "step": 3129 + }, + { + "epoch": 2.6361594609769794, + "grad_norm": 0.16320183873176575, + "learning_rate": 4.3988187170164673e-07, + "loss": 0.3137, + "step": 3130 + }, + { + "epoch": 2.6370016844469397, + "grad_norm": 0.16656161844730377, + "learning_rate": 4.378733189903528e-07, + "loss": 0.3385, + "step": 3131 + }, + { + "epoch": 2.6378439079169005, + "grad_norm": 0.1839463710784912, + "learning_rate": 4.35869152414482e-07, + "loss": 0.3445, + "step": 3132 + }, + { + "epoch": 2.6386861313868613, + "grad_norm": 0.18246237933635712, + "learning_rate": 4.3386937390088366e-07, + "loss": 0.3685, + "step": 3133 + }, + { + "epoch": 2.639528354856822, + "grad_norm": 0.1700492799282074, + "learning_rate": 4.3187398537219593e-07, + "loss": 0.3194, + "step": 3134 + }, + { + "epoch": 2.640370578326783, + "grad_norm": 0.17533659934997559, + "learning_rate": 4.2988298874682754e-07, + "loss": 0.3096, + "step": 3135 + }, + { + "epoch": 2.6412128017967436, + "grad_norm": 0.17206905782222748, + "learning_rate": 4.278963859389723e-07, + "loss": 0.3508, + "step": 3136 + }, + { + "epoch": 2.6420550252667043, + "grad_norm": 0.20786888897418976, + "learning_rate": 4.259141788585947e-07, + "loss": 0.3397, + "step": 3137 + }, + { + "epoch": 2.6428972487366647, + "grad_norm": 0.1743757575750351, + "learning_rate": 4.239363694114368e-07, + "loss": 0.3344, + "step": 3138 + }, + { + "epoch": 2.6437394722066254, + "grad_norm": 0.16983045637607574, + "learning_rate": 4.2196295949901044e-07, + "loss": 0.3174, + "step": 3139 + }, + { + "epoch": 2.644581695676586, + "grad_norm": 0.1640872210264206, + "learning_rate": 4.1999395101859796e-07, + "loss": 0.3219, + "step": 3140 + }, + { + "epoch": 2.645423919146547, + "grad_norm": 0.17452681064605713, + "learning_rate": 4.1802934586324897e-07, + "loss": 0.3523, + "step": 3141 + }, + { + "epoch": 2.6462661426165077, + "grad_norm": 0.18185783922672272, + "learning_rate": 4.160691459217825e-07, + "loss": 0.3166, + "step": 3142 + }, + { + "epoch": 2.647108366086468, + "grad_norm": 0.1847081035375595, + "learning_rate": 4.1411335307878056e-07, + "loss": 0.3256, + "step": 3143 + }, + { + "epoch": 2.647950589556429, + "grad_norm": 0.17259185016155243, + "learning_rate": 4.1216196921458786e-07, + "loss": 0.333, + "step": 3144 + }, + { + "epoch": 2.6487928130263896, + "grad_norm": 0.18488125503063202, + "learning_rate": 4.102149962053098e-07, + "loss": 0.3293, + "step": 3145 + }, + { + "epoch": 2.6496350364963503, + "grad_norm": 0.16416329145431519, + "learning_rate": 4.0827243592281294e-07, + "loss": 0.3229, + "step": 3146 + }, + { + "epoch": 2.650477259966311, + "grad_norm": 0.17134909331798553, + "learning_rate": 4.0633429023472004e-07, + "loss": 0.3599, + "step": 3147 + }, + { + "epoch": 2.651319483436272, + "grad_norm": 0.16451704502105713, + "learning_rate": 4.044005610044094e-07, + "loss": 0.3238, + "step": 3148 + }, + { + "epoch": 2.6521617069062327, + "grad_norm": 0.17452752590179443, + "learning_rate": 4.0247125009101275e-07, + "loss": 0.3491, + "step": 3149 + }, + { + "epoch": 2.653003930376193, + "grad_norm": 0.19579990208148956, + "learning_rate": 4.0054635934941633e-07, + "loss": 0.3426, + "step": 3150 + }, + { + "epoch": 2.6538461538461537, + "grad_norm": 0.16347888112068176, + "learning_rate": 3.986258906302543e-07, + "loss": 0.3198, + "step": 3151 + }, + { + "epoch": 2.6546883773161145, + "grad_norm": 0.17911840975284576, + "learning_rate": 3.967098457799118e-07, + "loss": 0.3578, + "step": 3152 + }, + { + "epoch": 2.6555306007860753, + "grad_norm": 0.16491587460041046, + "learning_rate": 3.947982266405159e-07, + "loss": 0.3344, + "step": 3153 + }, + { + "epoch": 2.656372824256036, + "grad_norm": 0.17907950282096863, + "learning_rate": 3.928910350499454e-07, + "loss": 0.345, + "step": 3154 + }, + { + "epoch": 2.6572150477259964, + "grad_norm": 0.16243170201778412, + "learning_rate": 3.9098827284181683e-07, + "loss": 0.3067, + "step": 3155 + }, + { + "epoch": 2.6580572711959576, + "grad_norm": 0.19042733311653137, + "learning_rate": 3.890899418454913e-07, + "loss": 0.333, + "step": 3156 + }, + { + "epoch": 2.658899494665918, + "grad_norm": 0.18333835899829865, + "learning_rate": 3.871960438860689e-07, + "loss": 0.3522, + "step": 3157 + }, + { + "epoch": 2.6597417181358787, + "grad_norm": 0.17561234533786774, + "learning_rate": 3.8530658078438754e-07, + "loss": 0.3267, + "step": 3158 + }, + { + "epoch": 2.6605839416058394, + "grad_norm": 0.17840707302093506, + "learning_rate": 3.834215543570191e-07, + "loss": 0.33, + "step": 3159 + }, + { + "epoch": 2.6614261650758, + "grad_norm": 0.17847512662410736, + "learning_rate": 3.81540966416275e-07, + "loss": 0.3327, + "step": 3160 + }, + { + "epoch": 2.662268388545761, + "grad_norm": 0.16171397268772125, + "learning_rate": 3.796648187701957e-07, + "loss": 0.319, + "step": 3161 + }, + { + "epoch": 2.6631106120157213, + "grad_norm": 0.1658945530653, + "learning_rate": 3.777931132225526e-07, + "loss": 0.3392, + "step": 3162 + }, + { + "epoch": 2.663952835485682, + "grad_norm": 0.20843006670475006, + "learning_rate": 3.75925851572847e-07, + "loss": 0.3263, + "step": 3163 + }, + { + "epoch": 2.664795058955643, + "grad_norm": 0.17359398305416107, + "learning_rate": 3.7406303561630996e-07, + "loss": 0.3257, + "step": 3164 + }, + { + "epoch": 2.6656372824256036, + "grad_norm": 0.17524690926074982, + "learning_rate": 3.72204667143895e-07, + "loss": 0.3084, + "step": 3165 + }, + { + "epoch": 2.6664795058955644, + "grad_norm": 0.1768205612897873, + "learning_rate": 3.703507479422813e-07, + "loss": 0.3398, + "step": 3166 + }, + { + "epoch": 2.667321729365525, + "grad_norm": 0.18176667392253876, + "learning_rate": 3.6850127979386917e-07, + "loss": 0.3399, + "step": 3167 + }, + { + "epoch": 2.668163952835486, + "grad_norm": 0.18099865317344666, + "learning_rate": 3.666562644767824e-07, + "loss": 0.3444, + "step": 3168 + }, + { + "epoch": 2.6690061763054462, + "grad_norm": 0.1720365285873413, + "learning_rate": 3.648157037648598e-07, + "loss": 0.3564, + "step": 3169 + }, + { + "epoch": 2.669848399775407, + "grad_norm": 0.18111947178840637, + "learning_rate": 3.6297959942766303e-07, + "loss": 0.3414, + "step": 3170 + }, + { + "epoch": 2.6706906232453678, + "grad_norm": 0.17587114870548248, + "learning_rate": 3.611479532304618e-07, + "loss": 0.2977, + "step": 3171 + }, + { + "epoch": 2.6715328467153285, + "grad_norm": 0.16626040637493134, + "learning_rate": 3.593207669342463e-07, + "loss": 0.3231, + "step": 3172 + }, + { + "epoch": 2.6723750701852893, + "grad_norm": 0.6724367737770081, + "learning_rate": 3.574980422957147e-07, + "loss": 0.3574, + "step": 3173 + }, + { + "epoch": 2.6732172936552496, + "grad_norm": 0.17822889983654022, + "learning_rate": 3.556797810672785e-07, + "loss": 0.3476, + "step": 3174 + }, + { + "epoch": 2.6740595171252104, + "grad_norm": 0.17662286758422852, + "learning_rate": 3.538659849970555e-07, + "loss": 0.3207, + "step": 3175 + }, + { + "epoch": 2.674901740595171, + "grad_norm": 0.17599961161613464, + "learning_rate": 3.5205665582887296e-07, + "loss": 0.3628, + "step": 3176 + }, + { + "epoch": 2.675743964065132, + "grad_norm": 0.17218033969402313, + "learning_rate": 3.5025179530225995e-07, + "loss": 0.3065, + "step": 3177 + }, + { + "epoch": 2.6765861875350927, + "grad_norm": 0.17550210654735565, + "learning_rate": 3.484514051524546e-07, + "loss": 0.3106, + "step": 3178 + }, + { + "epoch": 2.6774284110050535, + "grad_norm": 0.1761777400970459, + "learning_rate": 3.466554871103922e-07, + "loss": 0.3175, + "step": 3179 + }, + { + "epoch": 2.6782706344750142, + "grad_norm": 0.17350716888904572, + "learning_rate": 3.4486404290271115e-07, + "loss": 0.3517, + "step": 3180 + }, + { + "epoch": 2.6791128579449746, + "grad_norm": 0.16766268014907837, + "learning_rate": 3.43077074251747e-07, + "loss": 0.3212, + "step": 3181 + }, + { + "epoch": 2.6799550814149353, + "grad_norm": 0.19454942643642426, + "learning_rate": 3.4129458287553487e-07, + "loss": 0.3421, + "step": 3182 + }, + { + "epoch": 2.680797304884896, + "grad_norm": 0.1776033639907837, + "learning_rate": 3.395165704878023e-07, + "loss": 0.3429, + "step": 3183 + }, + { + "epoch": 2.681639528354857, + "grad_norm": 0.1730322688817978, + "learning_rate": 3.3774303879797297e-07, + "loss": 0.3139, + "step": 3184 + }, + { + "epoch": 2.6824817518248176, + "grad_norm": 0.16911497712135315, + "learning_rate": 3.359739895111602e-07, + "loss": 0.3132, + "step": 3185 + }, + { + "epoch": 2.683323975294778, + "grad_norm": 0.17976872622966766, + "learning_rate": 3.3420942432817127e-07, + "loss": 0.3274, + "step": 3186 + }, + { + "epoch": 2.684166198764739, + "grad_norm": 0.19177532196044922, + "learning_rate": 3.324493449454991e-07, + "loss": 0.3444, + "step": 3187 + }, + { + "epoch": 2.6850084222346995, + "grad_norm": 0.17109844088554382, + "learning_rate": 3.3069375305532725e-07, + "loss": 0.3241, + "step": 3188 + }, + { + "epoch": 2.6858506457046603, + "grad_norm": 0.18712511658668518, + "learning_rate": 3.289426503455201e-07, + "loss": 0.3379, + "step": 3189 + }, + { + "epoch": 2.686692869174621, + "grad_norm": 0.1836530715227127, + "learning_rate": 3.271960384996309e-07, + "loss": 0.3118, + "step": 3190 + }, + { + "epoch": 2.687535092644582, + "grad_norm": 0.18208006024360657, + "learning_rate": 3.2545391919689193e-07, + "loss": 0.3328, + "step": 3191 + }, + { + "epoch": 2.6883773161145426, + "grad_norm": 0.18471787869930267, + "learning_rate": 3.237162941122185e-07, + "loss": 0.336, + "step": 3192 + }, + { + "epoch": 2.689219539584503, + "grad_norm": 0.16804207861423492, + "learning_rate": 3.2198316491620305e-07, + "loss": 0.3044, + "step": 3193 + }, + { + "epoch": 2.6900617630544637, + "grad_norm": 0.18219810724258423, + "learning_rate": 3.202545332751178e-07, + "loss": 0.3453, + "step": 3194 + }, + { + "epoch": 2.6909039865244244, + "grad_norm": 0.17537109553813934, + "learning_rate": 3.185304008509077e-07, + "loss": 0.3089, + "step": 3195 + }, + { + "epoch": 2.691746209994385, + "grad_norm": 0.1631346195936203, + "learning_rate": 3.1681076930119626e-07, + "loss": 0.3375, + "step": 3196 + }, + { + "epoch": 2.692588433464346, + "grad_norm": 0.18209043145179749, + "learning_rate": 3.150956402792765e-07, + "loss": 0.3504, + "step": 3197 + }, + { + "epoch": 2.6934306569343067, + "grad_norm": 0.1706717163324356, + "learning_rate": 3.133850154341139e-07, + "loss": 0.3147, + "step": 3198 + }, + { + "epoch": 2.6942728804042675, + "grad_norm": 0.16930033266544342, + "learning_rate": 3.116788964103429e-07, + "loss": 0.3296, + "step": 3199 + }, + { + "epoch": 2.695115103874228, + "grad_norm": 0.17827804386615753, + "learning_rate": 3.099772848482657e-07, + "loss": 0.3239, + "step": 3200 + }, + { + "epoch": 2.6959573273441886, + "grad_norm": 0.1688344031572342, + "learning_rate": 3.082801823838527e-07, + "loss": 0.3129, + "step": 3201 + }, + { + "epoch": 2.6967995508141493, + "grad_norm": 0.18355657160282135, + "learning_rate": 3.0658759064873755e-07, + "loss": 0.3664, + "step": 3202 + }, + { + "epoch": 2.69764177428411, + "grad_norm": 0.17502044141292572, + "learning_rate": 3.0489951127021744e-07, + "loss": 0.3255, + "step": 3203 + }, + { + "epoch": 2.698483997754071, + "grad_norm": 0.17082899808883667, + "learning_rate": 3.0321594587125083e-07, + "loss": 0.2905, + "step": 3204 + }, + { + "epoch": 2.699326221224031, + "grad_norm": 0.15576323866844177, + "learning_rate": 3.015368960704584e-07, + "loss": 0.329, + "step": 3205 + }, + { + "epoch": 2.700168444693992, + "grad_norm": 0.17618028819561005, + "learning_rate": 2.9986236348211684e-07, + "loss": 0.351, + "step": 3206 + }, + { + "epoch": 2.7010106681639527, + "grad_norm": 0.15897849202156067, + "learning_rate": 2.9819234971616154e-07, + "loss": 0.3138, + "step": 3207 + }, + { + "epoch": 2.7018528916339135, + "grad_norm": 0.17301517724990845, + "learning_rate": 2.9652685637818147e-07, + "loss": 0.3238, + "step": 3208 + }, + { + "epoch": 2.7026951151038743, + "grad_norm": 0.18447841703891754, + "learning_rate": 2.9486588506942303e-07, + "loss": 0.3684, + "step": 3209 + }, + { + "epoch": 2.703537338573835, + "grad_norm": 0.19456276297569275, + "learning_rate": 2.932094373867811e-07, + "loss": 0.3428, + "step": 3210 + }, + { + "epoch": 2.704379562043796, + "grad_norm": 0.1816757172346115, + "learning_rate": 2.915575149228056e-07, + "loss": 0.3259, + "step": 3211 + }, + { + "epoch": 2.705221785513756, + "grad_norm": 0.20598746836185455, + "learning_rate": 2.8991011926569003e-07, + "loss": 0.3233, + "step": 3212 + }, + { + "epoch": 2.706064008983717, + "grad_norm": 0.16217656433582306, + "learning_rate": 2.882672519992824e-07, + "loss": 0.3118, + "step": 3213 + }, + { + "epoch": 2.7069062324536777, + "grad_norm": 0.17557232081890106, + "learning_rate": 2.8662891470307154e-07, + "loss": 0.3467, + "step": 3214 + }, + { + "epoch": 2.7077484559236384, + "grad_norm": 0.17025545239448547, + "learning_rate": 2.8499510895219464e-07, + "loss": 0.308, + "step": 3215 + }, + { + "epoch": 2.708590679393599, + "grad_norm": 0.16908609867095947, + "learning_rate": 2.833658363174302e-07, + "loss": 0.3198, + "step": 3216 + }, + { + "epoch": 2.7094329028635595, + "grad_norm": 0.16965502500534058, + "learning_rate": 2.817410983651997e-07, + "loss": 0.3376, + "step": 3217 + }, + { + "epoch": 2.7102751263335207, + "grad_norm": 0.17812147736549377, + "learning_rate": 2.80120896657563e-07, + "loss": 0.3437, + "step": 3218 + }, + { + "epoch": 2.711117349803481, + "grad_norm": 0.17128512263298035, + "learning_rate": 2.785052327522214e-07, + "loss": 0.3224, + "step": 3219 + }, + { + "epoch": 2.711959573273442, + "grad_norm": 0.1748804748058319, + "learning_rate": 2.768941082025112e-07, + "loss": 0.34, + "step": 3220 + }, + { + "epoch": 2.7128017967434026, + "grad_norm": 0.17742173373699188, + "learning_rate": 2.7528752455740606e-07, + "loss": 0.3349, + "step": 3221 + }, + { + "epoch": 2.7136440202133634, + "grad_norm": 0.1696932166814804, + "learning_rate": 2.73685483361511e-07, + "loss": 0.3446, + "step": 3222 + }, + { + "epoch": 2.714486243683324, + "grad_norm": 0.1748046725988388, + "learning_rate": 2.720879861550685e-07, + "loss": 0.3052, + "step": 3223 + }, + { + "epoch": 2.7153284671532845, + "grad_norm": 0.1804862767457962, + "learning_rate": 2.7049503447394874e-07, + "loss": 0.3297, + "step": 3224 + }, + { + "epoch": 2.7161706906232452, + "grad_norm": 0.18347153067588806, + "learning_rate": 2.6890662984965234e-07, + "loss": 0.3496, + "step": 3225 + }, + { + "epoch": 2.717012914093206, + "grad_norm": 0.1759885549545288, + "learning_rate": 2.6732277380930873e-07, + "loss": 0.3362, + "step": 3226 + }, + { + "epoch": 2.7178551375631668, + "grad_norm": 0.16716201603412628, + "learning_rate": 2.657434678756754e-07, + "loss": 0.2997, + "step": 3227 + }, + { + "epoch": 2.7186973610331275, + "grad_norm": 0.18232782185077667, + "learning_rate": 2.6416871356713224e-07, + "loss": 0.352, + "step": 3228 + }, + { + "epoch": 2.7195395845030883, + "grad_norm": 0.185092493891716, + "learning_rate": 2.625985123976876e-07, + "loss": 0.3423, + "step": 3229 + }, + { + "epoch": 2.720381807973049, + "grad_norm": 0.4390159249305725, + "learning_rate": 2.6103286587696674e-07, + "loss": 0.3557, + "step": 3230 + }, + { + "epoch": 2.7212240314430094, + "grad_norm": 0.16824904084205627, + "learning_rate": 2.594717755102205e-07, + "loss": 0.3386, + "step": 3231 + }, + { + "epoch": 2.72206625491297, + "grad_norm": 0.17320628464221954, + "learning_rate": 2.5791524279831613e-07, + "loss": 0.3302, + "step": 3232 + }, + { + "epoch": 2.722908478382931, + "grad_norm": 0.18359090387821198, + "learning_rate": 2.5636326923774325e-07, + "loss": 0.3163, + "step": 3233 + }, + { + "epoch": 2.7237507018528917, + "grad_norm": 0.17598529160022736, + "learning_rate": 2.548158563206038e-07, + "loss": 0.2999, + "step": 3234 + }, + { + "epoch": 2.7245929253228525, + "grad_norm": 0.17214341461658478, + "learning_rate": 2.532730055346172e-07, + "loss": 0.3218, + "step": 3235 + }, + { + "epoch": 2.725435148792813, + "grad_norm": 0.17373663187026978, + "learning_rate": 2.517347183631158e-07, + "loss": 0.3266, + "step": 3236 + }, + { + "epoch": 2.7262773722627736, + "grad_norm": 0.17507827281951904, + "learning_rate": 2.5020099628504603e-07, + "loss": 0.3468, + "step": 3237 + }, + { + "epoch": 2.7271195957327343, + "grad_norm": 0.1673845499753952, + "learning_rate": 2.4867184077496333e-07, + "loss": 0.3034, + "step": 3238 + }, + { + "epoch": 2.727961819202695, + "grad_norm": 0.16584512591362, + "learning_rate": 2.471472533030339e-07, + "loss": 0.3437, + "step": 3239 + }, + { + "epoch": 2.728804042672656, + "grad_norm": 0.16631992161273956, + "learning_rate": 2.4562723533503084e-07, + "loss": 0.2933, + "step": 3240 + }, + { + "epoch": 2.7296462661426166, + "grad_norm": 0.1780492663383484, + "learning_rate": 2.441117883323374e-07, + "loss": 0.345, + "step": 3241 + }, + { + "epoch": 2.7304884896125774, + "grad_norm": 0.17436213791370392, + "learning_rate": 2.426009137519375e-07, + "loss": 0.3417, + "step": 3242 + }, + { + "epoch": 2.7313307130825377, + "grad_norm": 0.16060450673103333, + "learning_rate": 2.4109461304642254e-07, + "loss": 0.3304, + "step": 3243 + }, + { + "epoch": 2.7321729365524985, + "grad_norm": 0.16995938122272491, + "learning_rate": 2.395928876639847e-07, + "loss": 0.3158, + "step": 3244 + }, + { + "epoch": 2.7330151600224593, + "grad_norm": 0.1702856421470642, + "learning_rate": 2.3809573904841844e-07, + "loss": 0.3076, + "step": 3245 + }, + { + "epoch": 2.73385738349242, + "grad_norm": 0.1806149184703827, + "learning_rate": 2.3660316863911682e-07, + "loss": 0.3237, + "step": 3246 + }, + { + "epoch": 2.734699606962381, + "grad_norm": 0.18080653250217438, + "learning_rate": 2.3511517787107363e-07, + "loss": 0.3547, + "step": 3247 + }, + { + "epoch": 2.735541830432341, + "grad_norm": 0.1690075546503067, + "learning_rate": 2.336317681748751e-07, + "loss": 0.3174, + "step": 3248 + }, + { + "epoch": 2.7363840539023023, + "grad_norm": 0.16987837851047516, + "learning_rate": 2.3215294097670927e-07, + "loss": 0.3013, + "step": 3249 + }, + { + "epoch": 2.7372262773722627, + "grad_norm": 0.16689546406269073, + "learning_rate": 2.3067869769835215e-07, + "loss": 0.3377, + "step": 3250 + }, + { + "epoch": 2.7380685008422234, + "grad_norm": 0.18353791534900665, + "learning_rate": 2.292090397571789e-07, + "loss": 0.3565, + "step": 3251 + }, + { + "epoch": 2.738910724312184, + "grad_norm": 0.17699800431728363, + "learning_rate": 2.277439685661509e-07, + "loss": 0.2995, + "step": 3252 + }, + { + "epoch": 2.739752947782145, + "grad_norm": 0.1799086481332779, + "learning_rate": 2.262834855338225e-07, + "loss": 0.3096, + "step": 3253 + }, + { + "epoch": 2.7405951712521057, + "grad_norm": 0.18038147687911987, + "learning_rate": 2.2482759206433613e-07, + "loss": 0.34, + "step": 3254 + }, + { + "epoch": 2.741437394722066, + "grad_norm": 0.18209418654441833, + "learning_rate": 2.2337628955742263e-07, + "loss": 0.3362, + "step": 3255 + }, + { + "epoch": 2.742279618192027, + "grad_norm": 0.175188347697258, + "learning_rate": 2.21929579408397e-07, + "loss": 0.3516, + "step": 3256 + }, + { + "epoch": 2.7431218416619876, + "grad_norm": 0.1681680530309677, + "learning_rate": 2.204874630081616e-07, + "loss": 0.3223, + "step": 3257 + }, + { + "epoch": 2.7439640651319483, + "grad_norm": 0.1675475686788559, + "learning_rate": 2.1904994174319903e-07, + "loss": 0.3522, + "step": 3258 + }, + { + "epoch": 2.744806288601909, + "grad_norm": 0.17860925197601318, + "learning_rate": 2.1761701699557824e-07, + "loss": 0.3132, + "step": 3259 + }, + { + "epoch": 2.74564851207187, + "grad_norm": 0.1875511258840561, + "learning_rate": 2.1618869014294498e-07, + "loss": 0.3268, + "step": 3260 + }, + { + "epoch": 2.7464907355418307, + "grad_norm": 0.1863333284854889, + "learning_rate": 2.1476496255852685e-07, + "loss": 0.3233, + "step": 3261 + }, + { + "epoch": 2.747332959011791, + "grad_norm": 0.1790715754032135, + "learning_rate": 2.1334583561112786e-07, + "loss": 0.3351, + "step": 3262 + }, + { + "epoch": 2.7481751824817517, + "grad_norm": 0.16683176159858704, + "learning_rate": 2.1193131066513107e-07, + "loss": 0.3015, + "step": 3263 + }, + { + "epoch": 2.7490174059517125, + "grad_norm": 0.16594216227531433, + "learning_rate": 2.1052138908049303e-07, + "loss": 0.3334, + "step": 3264 + }, + { + "epoch": 2.7498596294216733, + "grad_norm": 0.15618757903575897, + "learning_rate": 2.091160722127472e-07, + "loss": 0.303, + "step": 3265 + }, + { + "epoch": 2.750701852891634, + "grad_norm": 0.16715048253536224, + "learning_rate": 2.0771536141299565e-07, + "loss": 0.3438, + "step": 3266 + }, + { + "epoch": 2.7515440763615944, + "grad_norm": 0.16841080784797668, + "learning_rate": 2.0631925802791608e-07, + "loss": 0.3173, + "step": 3267 + }, + { + "epoch": 2.7523862998315556, + "grad_norm": 0.1783439964056015, + "learning_rate": 2.0492776339975374e-07, + "loss": 0.3484, + "step": 3268 + }, + { + "epoch": 2.753228523301516, + "grad_norm": 0.18109945952892303, + "learning_rate": 2.0354087886632623e-07, + "loss": 0.3232, + "step": 3269 + }, + { + "epoch": 2.7540707467714767, + "grad_norm": 0.1643245667219162, + "learning_rate": 2.0215860576101532e-07, + "loss": 0.323, + "step": 3270 + }, + { + "epoch": 2.7549129702414374, + "grad_norm": 0.1697859913110733, + "learning_rate": 2.0078094541277016e-07, + "loss": 0.323, + "step": 3271 + }, + { + "epoch": 2.755755193711398, + "grad_norm": 0.17192378640174866, + "learning_rate": 1.9940789914610682e-07, + "loss": 0.3397, + "step": 3272 + }, + { + "epoch": 2.756597417181359, + "grad_norm": 0.16300977766513824, + "learning_rate": 1.9803946828110376e-07, + "loss": 0.3215, + "step": 3273 + }, + { + "epoch": 2.7574396406513193, + "grad_norm": 0.17897097766399384, + "learning_rate": 1.966756541334025e-07, + "loss": 0.3411, + "step": 3274 + }, + { + "epoch": 2.75828186412128, + "grad_norm": 0.17692428827285767, + "learning_rate": 1.953164580142064e-07, + "loss": 0.3413, + "step": 3275 + }, + { + "epoch": 2.759124087591241, + "grad_norm": 0.17106205224990845, + "learning_rate": 1.9396188123027736e-07, + "loss": 0.3171, + "step": 3276 + }, + { + "epoch": 2.7599663110612016, + "grad_norm": 0.16745461523532867, + "learning_rate": 1.9261192508393755e-07, + "loss": 0.2923, + "step": 3277 + }, + { + "epoch": 2.7608085345311624, + "grad_norm": 0.19608838856220245, + "learning_rate": 1.912665908730671e-07, + "loss": 0.344, + "step": 3278 + }, + { + "epoch": 2.7616507580011227, + "grad_norm": 0.18816998600959778, + "learning_rate": 1.8992587989110133e-07, + "loss": 0.3349, + "step": 3279 + }, + { + "epoch": 2.762492981471084, + "grad_norm": 0.16697731614112854, + "learning_rate": 1.8858979342703088e-07, + "loss": 0.3318, + "step": 3280 + }, + { + "epoch": 2.7633352049410442, + "grad_norm": 0.16499722003936768, + "learning_rate": 1.8725833276540095e-07, + "loss": 0.32, + "step": 3281 + }, + { + "epoch": 2.764177428411005, + "grad_norm": 0.17865091562271118, + "learning_rate": 1.8593149918630927e-07, + "loss": 0.363, + "step": 3282 + }, + { + "epoch": 2.7650196518809658, + "grad_norm": 0.18328674137592316, + "learning_rate": 1.8460929396540428e-07, + "loss": 0.3149, + "step": 3283 + }, + { + "epoch": 2.7658618753509265, + "grad_norm": 0.1787954419851303, + "learning_rate": 1.8329171837388527e-07, + "loss": 0.3257, + "step": 3284 + }, + { + "epoch": 2.7667040988208873, + "grad_norm": 0.1814635545015335, + "learning_rate": 1.8197877367849948e-07, + "loss": 0.3336, + "step": 3285 + }, + { + "epoch": 2.7675463222908476, + "grad_norm": 0.1731037199497223, + "learning_rate": 1.8067046114154386e-07, + "loss": 0.3036, + "step": 3286 + }, + { + "epoch": 2.7683885457608084, + "grad_norm": 0.17906337976455688, + "learning_rate": 1.7936678202085945e-07, + "loss": 0.3525, + "step": 3287 + }, + { + "epoch": 2.769230769230769, + "grad_norm": 0.17718879878520966, + "learning_rate": 1.7806773756983641e-07, + "loss": 0.3007, + "step": 3288 + }, + { + "epoch": 2.77007299270073, + "grad_norm": 0.19208289682865143, + "learning_rate": 1.7677332903740296e-07, + "loss": 0.3397, + "step": 3289 + }, + { + "epoch": 2.7709152161706907, + "grad_norm": 0.16983190178871155, + "learning_rate": 1.7548355766803638e-07, + "loss": 0.327, + "step": 3290 + }, + { + "epoch": 2.7717574396406515, + "grad_norm": 0.1772148162126541, + "learning_rate": 1.7419842470175196e-07, + "loss": 0.3454, + "step": 3291 + }, + { + "epoch": 2.7725996631106122, + "grad_norm": 0.16711288690567017, + "learning_rate": 1.7291793137410695e-07, + "loss": 0.3494, + "step": 3292 + }, + { + "epoch": 2.7734418865805726, + "grad_norm": 1.8478301763534546, + "learning_rate": 1.7164207891619823e-07, + "loss": 0.3289, + "step": 3293 + }, + { + "epoch": 2.7742841100505333, + "grad_norm": 0.17004255950450897, + "learning_rate": 1.7037086855465902e-07, + "loss": 0.335, + "step": 3294 + }, + { + "epoch": 2.775126333520494, + "grad_norm": 0.1759692281484604, + "learning_rate": 1.6910430151166058e-07, + "loss": 0.3325, + "step": 3295 + }, + { + "epoch": 2.775968556990455, + "grad_norm": 1.8058511018753052, + "learning_rate": 1.6784237900491163e-07, + "loss": 0.3419, + "step": 3296 + }, + { + "epoch": 2.7768107804604156, + "grad_norm": 0.1898031383752823, + "learning_rate": 1.6658510224765333e-07, + "loss": 0.3411, + "step": 3297 + }, + { + "epoch": 2.777653003930376, + "grad_norm": 0.1677130162715912, + "learning_rate": 1.6533247244866102e-07, + "loss": 0.2952, + "step": 3298 + }, + { + "epoch": 2.778495227400337, + "grad_norm": 0.17073382437229156, + "learning_rate": 1.6408449081224131e-07, + "loss": 0.3464, + "step": 3299 + }, + { + "epoch": 2.7793374508702975, + "grad_norm": 0.1733926236629486, + "learning_rate": 1.6284115853823445e-07, + "loss": 0.3215, + "step": 3300 + }, + { + "epoch": 2.7801796743402583, + "grad_norm": 0.16632869839668274, + "learning_rate": 1.6160247682200813e-07, + "loss": 0.3229, + "step": 3301 + }, + { + "epoch": 2.781021897810219, + "grad_norm": 0.18543268740177155, + "learning_rate": 1.6036844685446084e-07, + "loss": 0.3336, + "step": 3302 + }, + { + "epoch": 2.78186412128018, + "grad_norm": 0.15792086720466614, + "learning_rate": 1.5913906982201744e-07, + "loss": 0.2924, + "step": 3303 + }, + { + "epoch": 2.7827063447501406, + "grad_norm": 0.18348781764507294, + "learning_rate": 1.5791434690662966e-07, + "loss": 0.3631, + "step": 3304 + }, + { + "epoch": 2.783548568220101, + "grad_norm": 0.1935282051563263, + "learning_rate": 1.566942792857745e-07, + "loss": 0.3336, + "step": 3305 + }, + { + "epoch": 2.7843907916900617, + "grad_norm": 0.1989465206861496, + "learning_rate": 1.554788681324554e-07, + "loss": 0.347, + "step": 3306 + }, + { + "epoch": 2.7852330151600224, + "grad_norm": 0.17033520340919495, + "learning_rate": 1.5426811461519419e-07, + "loss": 0.3008, + "step": 3307 + }, + { + "epoch": 2.786075238629983, + "grad_norm": 0.16638170182704926, + "learning_rate": 1.530620198980398e-07, + "loss": 0.3154, + "step": 3308 + }, + { + "epoch": 2.786917462099944, + "grad_norm": 0.18002824485301971, + "learning_rate": 1.5186058514055912e-07, + "loss": 0.3428, + "step": 3309 + }, + { + "epoch": 2.7877596855699043, + "grad_norm": 0.17526303231716156, + "learning_rate": 1.506638114978398e-07, + "loss": 0.296, + "step": 3310 + }, + { + "epoch": 2.7886019090398655, + "grad_norm": 0.18935514986515045, + "learning_rate": 1.4947170012048872e-07, + "loss": 0.3352, + "step": 3311 + }, + { + "epoch": 2.789444132509826, + "grad_norm": 0.17067907750606537, + "learning_rate": 1.482842521546285e-07, + "loss": 0.3025, + "step": 3312 + }, + { + "epoch": 2.7902863559797866, + "grad_norm": 0.18163152039051056, + "learning_rate": 1.471014687418998e-07, + "loss": 0.3591, + "step": 3313 + }, + { + "epoch": 2.7911285794497473, + "grad_norm": 0.19213518500328064, + "learning_rate": 1.4592335101945855e-07, + "loss": 0.3448, + "step": 3314 + }, + { + "epoch": 2.791970802919708, + "grad_norm": 0.17021815478801727, + "learning_rate": 1.447499001199748e-07, + "loss": 0.3027, + "step": 3315 + }, + { + "epoch": 2.792813026389669, + "grad_norm": 0.18658633530139923, + "learning_rate": 1.435811171716317e-07, + "loss": 0.3625, + "step": 3316 + }, + { + "epoch": 2.793655249859629, + "grad_norm": 0.17915920913219452, + "learning_rate": 1.4241700329812368e-07, + "loss": 0.3426, + "step": 3317 + }, + { + "epoch": 2.79449747332959, + "grad_norm": 0.16904447972774506, + "learning_rate": 1.4125755961865827e-07, + "loss": 0.32, + "step": 3318 + }, + { + "epoch": 2.7953396967995507, + "grad_norm": 0.17508530616760254, + "learning_rate": 1.4010278724795157e-07, + "loss": 0.303, + "step": 3319 + }, + { + "epoch": 2.7961819202695115, + "grad_norm": 0.2018783986568451, + "learning_rate": 1.3895268729622824e-07, + "loss": 0.3717, + "step": 3320 + }, + { + "epoch": 2.7970241437394723, + "grad_norm": 0.17020922899246216, + "learning_rate": 1.3780726086922103e-07, + "loss": 0.3199, + "step": 3321 + }, + { + "epoch": 2.797866367209433, + "grad_norm": 0.17910431325435638, + "learning_rate": 1.366665090681707e-07, + "loss": 0.3188, + "step": 3322 + }, + { + "epoch": 2.798708590679394, + "grad_norm": 0.19550931453704834, + "learning_rate": 1.355304329898216e-07, + "loss": 0.3448, + "step": 3323 + }, + { + "epoch": 2.799550814149354, + "grad_norm": 0.17510132491588593, + "learning_rate": 1.3439903372642615e-07, + "loss": 0.3019, + "step": 3324 + }, + { + "epoch": 2.800393037619315, + "grad_norm": 0.16419118642807007, + "learning_rate": 1.332723123657348e-07, + "loss": 0.3421, + "step": 3325 + }, + { + "epoch": 2.8012352610892757, + "grad_norm": 0.16772206127643585, + "learning_rate": 1.3215026999100655e-07, + "loss": 0.3579, + "step": 3326 + }, + { + "epoch": 2.8020774845592364, + "grad_norm": 0.153725266456604, + "learning_rate": 1.3103290768099796e-07, + "loss": 0.3065, + "step": 3327 + }, + { + "epoch": 2.802919708029197, + "grad_norm": 0.19064374268054962, + "learning_rate": 1.299202265099675e-07, + "loss": 0.339, + "step": 3328 + }, + { + "epoch": 2.8037619314991575, + "grad_norm": 0.18251188099384308, + "learning_rate": 1.288122275476733e-07, + "loss": 0.3348, + "step": 3329 + }, + { + "epoch": 2.8046041549691187, + "grad_norm": 0.17828452587127686, + "learning_rate": 1.2770891185937106e-07, + "loss": 0.3311, + "step": 3330 + }, + { + "epoch": 2.805446378439079, + "grad_norm": 0.16462412476539612, + "learning_rate": 1.2661028050581446e-07, + "loss": 0.3522, + "step": 3331 + }, + { + "epoch": 2.80628860190904, + "grad_norm": 0.2014075666666031, + "learning_rate": 1.2551633454325362e-07, + "loss": 0.332, + "step": 3332 + }, + { + "epoch": 2.8071308253790006, + "grad_norm": 0.169560968875885, + "learning_rate": 1.244270750234333e-07, + "loss": 0.341, + "step": 3333 + }, + { + "epoch": 2.8079730488489614, + "grad_norm": 0.16761159896850586, + "learning_rate": 1.2334250299359362e-07, + "loss": 0.3033, + "step": 3334 + }, + { + "epoch": 2.808815272318922, + "grad_norm": 0.1883648782968521, + "learning_rate": 1.2226261949646656e-07, + "loss": 0.355, + "step": 3335 + }, + { + "epoch": 2.8096574957888825, + "grad_norm": 0.16675567626953125, + "learning_rate": 1.2118742557027885e-07, + "loss": 0.3185, + "step": 3336 + }, + { + "epoch": 2.8104997192588432, + "grad_norm": 0.17546634376049042, + "learning_rate": 1.201169222487464e-07, + "loss": 0.328, + "step": 3337 + }, + { + "epoch": 2.811341942728804, + "grad_norm": 0.16184347867965698, + "learning_rate": 1.1905111056107644e-07, + "loss": 0.3366, + "step": 3338 + }, + { + "epoch": 2.8121841661987648, + "grad_norm": 0.1763359010219574, + "learning_rate": 1.1798999153196433e-07, + "loss": 0.3356, + "step": 3339 + }, + { + "epoch": 2.8130263896687255, + "grad_norm": 0.1889248639345169, + "learning_rate": 1.1693356618159568e-07, + "loss": 0.3127, + "step": 3340 + }, + { + "epoch": 2.813868613138686, + "grad_norm": 0.18146924674510956, + "learning_rate": 1.1588183552564247e-07, + "loss": 0.3352, + "step": 3341 + }, + { + "epoch": 2.814710836608647, + "grad_norm": 0.17269383370876312, + "learning_rate": 1.1483480057526364e-07, + "loss": 0.3399, + "step": 3342 + }, + { + "epoch": 2.8155530600786074, + "grad_norm": 0.16894157230854034, + "learning_rate": 1.1379246233710172e-07, + "loss": 0.3349, + "step": 3343 + }, + { + "epoch": 2.816395283548568, + "grad_norm": 0.1881352812051773, + "learning_rate": 1.1275482181328568e-07, + "loss": 0.3627, + "step": 3344 + }, + { + "epoch": 2.817237507018529, + "grad_norm": 0.19253571331501007, + "learning_rate": 1.1172188000142803e-07, + "loss": 0.3266, + "step": 3345 + }, + { + "epoch": 2.8180797304884897, + "grad_norm": 0.17829467356204987, + "learning_rate": 1.1069363789462273e-07, + "loss": 0.3348, + "step": 3346 + }, + { + "epoch": 2.8189219539584505, + "grad_norm": 0.16123495995998383, + "learning_rate": 1.0967009648144621e-07, + "loss": 0.2821, + "step": 3347 + }, + { + "epoch": 2.819764177428411, + "grad_norm": 0.18511976301670074, + "learning_rate": 1.0865125674595467e-07, + "loss": 0.3455, + "step": 3348 + }, + { + "epoch": 2.8206064008983716, + "grad_norm": 0.18319936096668243, + "learning_rate": 1.0763711966768453e-07, + "loss": 0.3471, + "step": 3349 + }, + { + "epoch": 2.8214486243683323, + "grad_norm": 0.16767950356006622, + "learning_rate": 1.0662768622165087e-07, + "loss": 0.3123, + "step": 3350 + }, + { + "epoch": 2.822290847838293, + "grad_norm": 0.1737443506717682, + "learning_rate": 1.0562295737834738e-07, + "loss": 0.3511, + "step": 3351 + }, + { + "epoch": 2.823133071308254, + "grad_norm": 0.184846431016922, + "learning_rate": 1.0462293410374303e-07, + "loss": 0.3405, + "step": 3352 + }, + { + "epoch": 2.8239752947782146, + "grad_norm": 0.1793210804462433, + "learning_rate": 1.0362761735928372e-07, + "loss": 0.3326, + "step": 3353 + }, + { + "epoch": 2.8248175182481754, + "grad_norm": 0.17455455660820007, + "learning_rate": 1.026370081018907e-07, + "loss": 0.2945, + "step": 3354 + }, + { + "epoch": 2.8256597417181357, + "grad_norm": 0.17478932440280914, + "learning_rate": 1.0165110728395878e-07, + "loss": 0.3273, + "step": 3355 + }, + { + "epoch": 2.8265019651880965, + "grad_norm": 0.16787944734096527, + "learning_rate": 1.0066991585335583e-07, + "loss": 0.3345, + "step": 3356 + }, + { + "epoch": 2.8273441886580573, + "grad_norm": 0.16388723254203796, + "learning_rate": 9.969343475342285e-08, + "loss": 0.3339, + "step": 3357 + }, + { + "epoch": 2.828186412128018, + "grad_norm": 0.1678238958120346, + "learning_rate": 9.872166492297052e-08, + "loss": 0.3409, + "step": 3358 + }, + { + "epoch": 2.829028635597979, + "grad_norm": 0.16698604822158813, + "learning_rate": 9.775460729628262e-08, + "loss": 0.3203, + "step": 3359 + }, + { + "epoch": 2.829870859067939, + "grad_norm": 0.1685776263475418, + "learning_rate": 9.679226280310982e-08, + "loss": 0.317, + "step": 3360 + }, + { + "epoch": 2.8307130825379003, + "grad_norm": 0.17047767341136932, + "learning_rate": 9.583463236867318e-08, + "loss": 0.3277, + "step": 3361 + }, + { + "epoch": 2.8315553060078607, + "grad_norm": 0.18030725419521332, + "learning_rate": 9.48817169136601e-08, + "loss": 0.3193, + "step": 3362 + }, + { + "epoch": 2.8323975294778214, + "grad_norm": 0.18441860377788544, + "learning_rate": 9.393351735422773e-08, + "loss": 0.3523, + "step": 3363 + }, + { + "epoch": 2.833239752947782, + "grad_norm": 0.15680761635303497, + "learning_rate": 9.299003460199519e-08, + "loss": 0.3235, + "step": 3364 + }, + { + "epoch": 2.834081976417743, + "grad_norm": 0.17292484641075134, + "learning_rate": 9.205126956405075e-08, + "loss": 0.3305, + "step": 3365 + }, + { + "epoch": 2.8349241998877037, + "grad_norm": 0.1728994995355606, + "learning_rate": 9.111722314294358e-08, + "loss": 0.3431, + "step": 3366 + }, + { + "epoch": 2.835766423357664, + "grad_norm": 0.18224498629570007, + "learning_rate": 9.018789623668866e-08, + "loss": 0.3662, + "step": 3367 + }, + { + "epoch": 2.836608646827625, + "grad_norm": 0.19175554811954498, + "learning_rate": 8.926328973876242e-08, + "loss": 0.3096, + "step": 3368 + }, + { + "epoch": 2.8374508702975856, + "grad_norm": 0.1898619830608368, + "learning_rate": 8.834340453810375e-08, + "loss": 0.3613, + "step": 3369 + }, + { + "epoch": 2.8382930937675463, + "grad_norm": 0.17442317306995392, + "learning_rate": 8.742824151911022e-08, + "loss": 0.3078, + "step": 3370 + }, + { + "epoch": 2.839135317237507, + "grad_norm": 0.1779947429895401, + "learning_rate": 8.651780156164302e-08, + "loss": 0.3168, + "step": 3371 + }, + { + "epoch": 2.8399775407074674, + "grad_norm": 0.17783810198307037, + "learning_rate": 8.561208554101863e-08, + "loss": 0.3499, + "step": 3372 + }, + { + "epoch": 2.8408197641774287, + "grad_norm": 0.16697177290916443, + "learning_rate": 8.471109432801494e-08, + "loss": 0.3265, + "step": 3373 + }, + { + "epoch": 2.841661987647389, + "grad_norm": 0.160588338971138, + "learning_rate": 8.381482878886571e-08, + "loss": 0.3002, + "step": 3374 + }, + { + "epoch": 2.8425042111173497, + "grad_norm": 0.1759585589170456, + "learning_rate": 8.29232897852611e-08, + "loss": 0.397, + "step": 3375 + }, + { + "epoch": 2.8433464345873105, + "grad_norm": 0.16448669135570526, + "learning_rate": 8.203647817434823e-08, + "loss": 0.3006, + "step": 3376 + }, + { + "epoch": 2.8441886580572713, + "grad_norm": 0.17329882085323334, + "learning_rate": 8.11543948087279e-08, + "loss": 0.293, + "step": 3377 + }, + { + "epoch": 2.845030881527232, + "grad_norm": 0.18138037621974945, + "learning_rate": 8.027704053645613e-08, + "loss": 0.3608, + "step": 3378 + }, + { + "epoch": 2.8458731049971924, + "grad_norm": 0.1575583964586258, + "learning_rate": 7.94044162010421e-08, + "loss": 0.3175, + "step": 3379 + }, + { + "epoch": 2.846715328467153, + "grad_norm": 0.17390009760856628, + "learning_rate": 7.85365226414464e-08, + "loss": 0.3437, + "step": 3380 + }, + { + "epoch": 2.847557551937114, + "grad_norm": 0.17777961492538452, + "learning_rate": 7.76733606920832e-08, + "loss": 0.3743, + "step": 3381 + }, + { + "epoch": 2.8483997754070747, + "grad_norm": 0.1718461662530899, + "learning_rate": 7.681493118281646e-08, + "loss": 0.3202, + "step": 3382 + }, + { + "epoch": 2.8492419988770354, + "grad_norm": 0.15905161201953888, + "learning_rate": 7.59612349389599e-08, + "loss": 0.2956, + "step": 3383 + }, + { + "epoch": 2.850084222346996, + "grad_norm": 0.18780933320522308, + "learning_rate": 7.511227278127697e-08, + "loss": 0.3396, + "step": 3384 + }, + { + "epoch": 2.850926445816957, + "grad_norm": 0.17712104320526123, + "learning_rate": 7.426804552598088e-08, + "loss": 0.3616, + "step": 3385 + }, + { + "epoch": 2.8517686692869173, + "grad_norm": 0.1937163770198822, + "learning_rate": 7.342855398472958e-08, + "loss": 0.3452, + "step": 3386 + }, + { + "epoch": 2.852610892756878, + "grad_norm": 0.16116411983966827, + "learning_rate": 7.259379896463248e-08, + "loss": 0.3204, + "step": 3387 + }, + { + "epoch": 2.853453116226839, + "grad_norm": 0.16988983750343323, + "learning_rate": 7.176378126824035e-08, + "loss": 0.3418, + "step": 3388 + }, + { + "epoch": 2.8542953396967996, + "grad_norm": 0.16600120067596436, + "learning_rate": 7.093850169355266e-08, + "loss": 0.3093, + "step": 3389 + }, + { + "epoch": 2.8551375631667604, + "grad_norm": 0.16801238059997559, + "learning_rate": 7.011796103401192e-08, + "loss": 0.335, + "step": 3390 + }, + { + "epoch": 2.8559797866367207, + "grad_norm": 0.18370668590068817, + "learning_rate": 6.930216007850598e-08, + "loss": 0.3361, + "step": 3391 + }, + { + "epoch": 2.856822010106682, + "grad_norm": 0.18052010238170624, + "learning_rate": 6.849109961136468e-08, + "loss": 0.3241, + "step": 3392 + }, + { + "epoch": 2.8576642335766422, + "grad_norm": 0.16566221415996552, + "learning_rate": 6.768478041236037e-08, + "loss": 0.3119, + "step": 3393 + }, + { + "epoch": 2.858506457046603, + "grad_norm": 0.15954545140266418, + "learning_rate": 6.688320325670628e-08, + "loss": 0.3244, + "step": 3394 + }, + { + "epoch": 2.8593486805165638, + "grad_norm": 0.16784389317035675, + "learning_rate": 6.608636891505982e-08, + "loss": 0.3279, + "step": 3395 + }, + { + "epoch": 2.8601909039865245, + "grad_norm": 0.18025429546833038, + "learning_rate": 6.529427815351374e-08, + "loss": 0.3239, + "step": 3396 + }, + { + "epoch": 2.8610331274564853, + "grad_norm": 0.17966340482234955, + "learning_rate": 6.450693173360445e-08, + "loss": 0.3264, + "step": 3397 + }, + { + "epoch": 2.8618753509264456, + "grad_norm": 0.16547337174415588, + "learning_rate": 6.372433041230364e-08, + "loss": 0.3351, + "step": 3398 + }, + { + "epoch": 2.8627175743964064, + "grad_norm": 0.16610006988048553, + "learning_rate": 6.294647494202444e-08, + "loss": 0.3254, + "step": 3399 + }, + { + "epoch": 2.863559797866367, + "grad_norm": 0.1672658920288086, + "learning_rate": 6.217336607061364e-08, + "loss": 0.3208, + "step": 3400 + }, + { + "epoch": 2.864402021336328, + "grad_norm": 0.18397913873195648, + "learning_rate": 6.140500454135668e-08, + "loss": 0.3249, + "step": 3401 + }, + { + "epoch": 2.8652442448062887, + "grad_norm": 0.19070270657539368, + "learning_rate": 6.064139109297485e-08, + "loss": 0.3895, + "step": 3402 + }, + { + "epoch": 2.866086468276249, + "grad_norm": 0.17317475378513336, + "learning_rate": 5.988252645962367e-08, + "loss": 0.3131, + "step": 3403 + }, + { + "epoch": 2.8669286917462102, + "grad_norm": 0.16814860701560974, + "learning_rate": 5.912841137089287e-08, + "loss": 0.3339, + "step": 3404 + }, + { + "epoch": 2.8677709152161706, + "grad_norm": 0.17419366538524628, + "learning_rate": 5.8379046551807486e-08, + "loss": 0.3121, + "step": 3405 + }, + { + "epoch": 2.8686131386861313, + "grad_norm": 0.16927020251750946, + "learning_rate": 5.7634432722822875e-08, + "loss": 0.3349, + "step": 3406 + }, + { + "epoch": 2.869455362156092, + "grad_norm": 0.16771690547466278, + "learning_rate": 5.6894570599829726e-08, + "loss": 0.3062, + "step": 3407 + }, + { + "epoch": 2.870297585626053, + "grad_norm": 0.18303582072257996, + "learning_rate": 5.615946089414737e-08, + "loss": 0.3431, + "step": 3408 + }, + { + "epoch": 2.8711398090960136, + "grad_norm": 0.1693166345357895, + "learning_rate": 5.542910431252935e-08, + "loss": 0.3123, + "step": 3409 + }, + { + "epoch": 2.871982032565974, + "grad_norm": 0.17601299285888672, + "learning_rate": 5.470350155715565e-08, + "loss": 0.3492, + "step": 3410 + }, + { + "epoch": 2.8728242560359347, + "grad_norm": 0.17169730365276337, + "learning_rate": 5.398265332563935e-08, + "loss": 0.3386, + "step": 3411 + }, + { + "epoch": 2.8736664795058955, + "grad_norm": 0.17456136643886566, + "learning_rate": 5.32665603110194e-08, + "loss": 0.3393, + "step": 3412 + }, + { + "epoch": 2.8745087029758563, + "grad_norm": 0.16657783091068268, + "learning_rate": 5.255522320176565e-08, + "loss": 0.3061, + "step": 3413 + }, + { + "epoch": 2.875350926445817, + "grad_norm": 0.1727527678012848, + "learning_rate": 5.1848642681773254e-08, + "loss": 0.3089, + "step": 3414 + }, + { + "epoch": 2.876193149915778, + "grad_norm": 0.17237474024295807, + "learning_rate": 5.114681943036603e-08, + "loss": 0.3215, + "step": 3415 + }, + { + "epoch": 2.8770353733857386, + "grad_norm": 0.17007453739643097, + "learning_rate": 5.0449754122292585e-08, + "loss": 0.368, + "step": 3416 + }, + { + "epoch": 2.877877596855699, + "grad_norm": 0.1603289097547531, + "learning_rate": 4.975744742772848e-08, + "loss": 0.3295, + "step": 3417 + }, + { + "epoch": 2.8787198203256597, + "grad_norm": 0.18231481313705444, + "learning_rate": 4.906990001227296e-08, + "loss": 0.3121, + "step": 3418 + }, + { + "epoch": 2.8795620437956204, + "grad_norm": 0.1824287474155426, + "learning_rate": 4.838711253695061e-08, + "loss": 0.3582, + "step": 3419 + }, + { + "epoch": 2.880404267265581, + "grad_norm": 0.17323684692382812, + "learning_rate": 4.770908565820964e-08, + "loss": 0.3265, + "step": 3420 + }, + { + "epoch": 2.881246490735542, + "grad_norm": 0.17707312107086182, + "learning_rate": 4.7035820027920284e-08, + "loss": 0.2921, + "step": 3421 + }, + { + "epoch": 2.8820887142055023, + "grad_norm": 0.1917116492986679, + "learning_rate": 4.636731629337587e-08, + "loss": 0.3445, + "step": 3422 + }, + { + "epoch": 2.8829309376754635, + "grad_norm": 0.1672520935535431, + "learning_rate": 4.5703575097292286e-08, + "loss": 0.327, + "step": 3423 + }, + { + "epoch": 2.883773161145424, + "grad_norm": 0.168136328458786, + "learning_rate": 4.5044597077805175e-08, + "loss": 0.3227, + "step": 3424 + }, + { + "epoch": 2.8846153846153846, + "grad_norm": 0.1622181236743927, + "learning_rate": 4.439038286847164e-08, + "loss": 0.322, + "step": 3425 + }, + { + "epoch": 2.8854576080853453, + "grad_norm": 0.16611912846565247, + "learning_rate": 4.37409330982691e-08, + "loss": 0.3496, + "step": 3426 + }, + { + "epoch": 2.886299831555306, + "grad_norm": 0.17439675331115723, + "learning_rate": 4.309624839159254e-08, + "loss": 0.3544, + "step": 3427 + }, + { + "epoch": 2.887142055025267, + "grad_norm": 0.17138072848320007, + "learning_rate": 4.245632936825783e-08, + "loss": 0.3419, + "step": 3428 + }, + { + "epoch": 2.887984278495227, + "grad_norm": 0.17312675714492798, + "learning_rate": 4.182117664349783e-08, + "loss": 0.2944, + "step": 3429 + }, + { + "epoch": 2.888826501965188, + "grad_norm": 0.18604117631912231, + "learning_rate": 4.119079082796351e-08, + "loss": 0.3626, + "step": 3430 + }, + { + "epoch": 2.8896687254351487, + "grad_norm": 0.16157913208007812, + "learning_rate": 4.056517252772229e-08, + "loss": 0.3313, + "step": 3431 + }, + { + "epoch": 2.8905109489051095, + "grad_norm": 0.1615222543478012, + "learning_rate": 3.99443223442586e-08, + "loss": 0.3249, + "step": 3432 + }, + { + "epoch": 2.8913531723750703, + "grad_norm": 0.19957011938095093, + "learning_rate": 3.9328240874471624e-08, + "loss": 0.3522, + "step": 3433 + }, + { + "epoch": 2.892195395845031, + "grad_norm": 0.17030540108680725, + "learning_rate": 3.871692871067756e-08, + "loss": 0.3024, + "step": 3434 + }, + { + "epoch": 2.893037619314992, + "grad_norm": 0.17600853741168976, + "learning_rate": 3.8110386440605164e-08, + "loss": 0.3452, + "step": 3435 + }, + { + "epoch": 2.893879842784952, + "grad_norm": 0.1735491305589676, + "learning_rate": 3.750861464739908e-08, + "loss": 0.3244, + "step": 3436 + }, + { + "epoch": 2.894722066254913, + "grad_norm": 0.17435042560100555, + "learning_rate": 3.6911613909616505e-08, + "loss": 0.3274, + "step": 3437 + }, + { + "epoch": 2.8955642897248737, + "grad_norm": 0.28229641914367676, + "learning_rate": 3.631938480122777e-08, + "loss": 0.3657, + "step": 3438 + }, + { + "epoch": 2.8964065131948344, + "grad_norm": 0.1704999953508377, + "learning_rate": 3.573192789161628e-08, + "loss": 0.297, + "step": 3439 + }, + { + "epoch": 2.897248736664795, + "grad_norm": 0.17650829255580902, + "learning_rate": 3.514924374557638e-08, + "loss": 0.3409, + "step": 3440 + }, + { + "epoch": 2.8980909601347555, + "grad_norm": 0.18400131165981293, + "learning_rate": 3.457133292331494e-08, + "loss": 0.3751, + "step": 3441 + }, + { + "epoch": 2.8989331836047163, + "grad_norm": 0.17719951272010803, + "learning_rate": 3.3998195980448065e-08, + "loss": 0.3225, + "step": 3442 + }, + { + "epoch": 2.899775407074677, + "grad_norm": 0.177615225315094, + "learning_rate": 3.342983346800388e-08, + "loss": 0.3262, + "step": 3443 + }, + { + "epoch": 2.900617630544638, + "grad_norm": 0.16130474209785461, + "learning_rate": 3.2866245932418606e-08, + "loss": 0.2945, + "step": 3444 + }, + { + "epoch": 2.9014598540145986, + "grad_norm": 0.16819952428340912, + "learning_rate": 3.230743391553881e-08, + "loss": 0.3536, + "step": 3445 + }, + { + "epoch": 2.9023020774845594, + "grad_norm": 0.16620105504989624, + "learning_rate": 3.175339795462029e-08, + "loss": 0.3125, + "step": 3446 + }, + { + "epoch": 2.90314430095452, + "grad_norm": 0.1779838651418686, + "learning_rate": 3.120413858232474e-08, + "loss": 0.3397, + "step": 3447 + }, + { + "epoch": 2.9039865244244805, + "grad_norm": 0.17595644295215607, + "learning_rate": 3.0659656326724186e-08, + "loss": 0.3416, + "step": 3448 + }, + { + "epoch": 2.9048287478944412, + "grad_norm": 0.17794086039066315, + "learning_rate": 3.011995171129545e-08, + "loss": 0.3403, + "step": 3449 + }, + { + "epoch": 2.905670971364402, + "grad_norm": 0.16569305956363678, + "learning_rate": 2.9585025254924572e-08, + "loss": 0.3333, + "step": 3450 + }, + { + "epoch": 2.9065131948343628, + "grad_norm": 0.17164397239685059, + "learning_rate": 2.9054877471901277e-08, + "loss": 0.3338, + "step": 3451 + }, + { + "epoch": 2.9073554183043235, + "grad_norm": 0.1742820292711258, + "learning_rate": 2.852950887192285e-08, + "loss": 0.2963, + "step": 3452 + }, + { + "epoch": 2.908197641774284, + "grad_norm": 0.1825890839099884, + "learning_rate": 2.8008919960090253e-08, + "loss": 0.355, + "step": 3453 + }, + { + "epoch": 2.909039865244245, + "grad_norm": 0.15666228532791138, + "learning_rate": 2.7493111236909787e-08, + "loss": 0.2954, + "step": 3454 + }, + { + "epoch": 2.9098820887142054, + "grad_norm": 0.17284969985485077, + "learning_rate": 2.6982083198293096e-08, + "loss": 0.3231, + "step": 3455 + }, + { + "epoch": 2.910724312184166, + "grad_norm": 0.16500769555568695, + "learning_rate": 2.6475836335553838e-08, + "loss": 0.3283, + "step": 3456 + }, + { + "epoch": 2.911566535654127, + "grad_norm": 0.17349562048912048, + "learning_rate": 2.5974371135408792e-08, + "loss": 0.3398, + "step": 3457 + }, + { + "epoch": 2.9124087591240877, + "grad_norm": 0.18464182317256927, + "learning_rate": 2.5477688079979522e-08, + "loss": 0.3366, + "step": 3458 + }, + { + "epoch": 2.9132509825940485, + "grad_norm": 0.16741546988487244, + "learning_rate": 2.4985787646788497e-08, + "loss": 0.3306, + "step": 3459 + }, + { + "epoch": 2.914093206064009, + "grad_norm": 0.16009648144245148, + "learning_rate": 2.4498670308760742e-08, + "loss": 0.3007, + "step": 3460 + }, + { + "epoch": 2.9149354295339696, + "grad_norm": 0.17391559481620789, + "learning_rate": 2.401633653422053e-08, + "loss": 0.3207, + "step": 3461 + }, + { + "epoch": 2.9157776530039303, + "grad_norm": 0.1760876476764679, + "learning_rate": 2.3538786786896918e-08, + "loss": 0.35, + "step": 3462 + }, + { + "epoch": 2.916619876473891, + "grad_norm": 0.16919660568237305, + "learning_rate": 2.306602152591597e-08, + "loss": 0.3329, + "step": 3463 + }, + { + "epoch": 2.917462099943852, + "grad_norm": 0.17137733101844788, + "learning_rate": 2.2598041205806333e-08, + "loss": 0.3125, + "step": 3464 + }, + { + "epoch": 2.9183043234138126, + "grad_norm": 0.1716642528772354, + "learning_rate": 2.2134846276494205e-08, + "loss": 0.3861, + "step": 3465 + }, + { + "epoch": 2.9191465468837734, + "grad_norm": 0.1804467886686325, + "learning_rate": 2.1676437183306697e-08, + "loss": 0.3229, + "step": 3466 + }, + { + "epoch": 2.9199887703537337, + "grad_norm": 0.17677058279514313, + "learning_rate": 2.1222814366969048e-08, + "loss": 0.3313, + "step": 3467 + }, + { + "epoch": 2.9208309938236945, + "grad_norm": 0.170046865940094, + "learning_rate": 2.0773978263605164e-08, + "loss": 0.3434, + "step": 3468 + }, + { + "epoch": 2.9216732172936553, + "grad_norm": 0.18440905213356018, + "learning_rate": 2.032992930473543e-08, + "loss": 0.3705, + "step": 3469 + }, + { + "epoch": 2.922515440763616, + "grad_norm": 0.16616974771022797, + "learning_rate": 1.9890667917280006e-08, + "loss": 0.3014, + "step": 3470 + }, + { + "epoch": 2.923357664233577, + "grad_norm": 0.1813036948442459, + "learning_rate": 1.9456194523554404e-08, + "loss": 0.3493, + "step": 3471 + }, + { + "epoch": 2.924199887703537, + "grad_norm": 0.17219789326190948, + "learning_rate": 1.9026509541272276e-08, + "loss": 0.3366, + "step": 3472 + }, + { + "epoch": 2.925042111173498, + "grad_norm": 0.17453472316265106, + "learning_rate": 1.860161338354205e-08, + "loss": 0.3411, + "step": 3473 + }, + { + "epoch": 2.9258843346434587, + "grad_norm": 0.17616593837738037, + "learning_rate": 1.8181506458869735e-08, + "loss": 0.3178, + "step": 3474 + }, + { + "epoch": 2.9267265581134194, + "grad_norm": 0.1713724583387375, + "learning_rate": 1.7766189171154468e-08, + "loss": 0.305, + "step": 3475 + }, + { + "epoch": 2.92756878158338, + "grad_norm": 0.16618001461029053, + "learning_rate": 1.7355661919693513e-08, + "loss": 0.3304, + "step": 3476 + }, + { + "epoch": 2.928411005053341, + "grad_norm": 0.16214674711227417, + "learning_rate": 1.69499250991767e-08, + "loss": 0.3337, + "step": 3477 + }, + { + "epoch": 2.9292532285233017, + "grad_norm": 0.18076547980308533, + "learning_rate": 1.654897909968922e-08, + "loss": 0.3099, + "step": 3478 + }, + { + "epoch": 2.930095451993262, + "grad_norm": 0.17332997918128967, + "learning_rate": 1.6152824306709392e-08, + "loss": 0.3344, + "step": 3479 + }, + { + "epoch": 2.930937675463223, + "grad_norm": 0.18455633521080017, + "learning_rate": 1.576146110111032e-08, + "loss": 0.3437, + "step": 3480 + }, + { + "epoch": 2.9317798989331836, + "grad_norm": 0.1677646040916443, + "learning_rate": 1.5374889859157137e-08, + "loss": 0.3167, + "step": 3481 + }, + { + "epoch": 2.9326221224031443, + "grad_norm": 0.17141175270080566, + "learning_rate": 1.4993110952509215e-08, + "loss": 0.3195, + "step": 3482 + }, + { + "epoch": 2.933464345873105, + "grad_norm": 0.17640945315361023, + "learning_rate": 1.4616124748217387e-08, + "loss": 0.3656, + "step": 3483 + }, + { + "epoch": 2.9343065693430654, + "grad_norm": 0.17052777111530304, + "learning_rate": 1.424393160872506e-08, + "loss": 0.3364, + "step": 3484 + }, + { + "epoch": 2.9351487928130267, + "grad_norm": 0.19747313857078552, + "learning_rate": 1.3876531891867106e-08, + "loss": 0.2958, + "step": 3485 + }, + { + "epoch": 2.935991016282987, + "grad_norm": 0.1779537945985794, + "learning_rate": 1.351392595087042e-08, + "loss": 0.2915, + "step": 3486 + }, + { + "epoch": 2.9368332397529477, + "grad_norm": 0.18276581168174744, + "learning_rate": 1.3156114134352805e-08, + "loss": 0.3353, + "step": 3487 + }, + { + "epoch": 2.9376754632229085, + "grad_norm": 0.17492914199829102, + "learning_rate": 1.2803096786323521e-08, + "loss": 0.3233, + "step": 3488 + }, + { + "epoch": 2.9385176866928693, + "grad_norm": 0.17329710721969604, + "learning_rate": 1.2454874246181081e-08, + "loss": 0.3359, + "step": 3489 + }, + { + "epoch": 2.93935991016283, + "grad_norm": 0.17337341606616974, + "learning_rate": 1.2111446848714347e-08, + "loss": 0.3201, + "step": 3490 + }, + { + "epoch": 2.9402021336327904, + "grad_norm": 0.18885523080825806, + "learning_rate": 1.1772814924103649e-08, + "loss": 0.3499, + "step": 3491 + }, + { + "epoch": 2.941044357102751, + "grad_norm": 0.16768944263458252, + "learning_rate": 1.1438978797916888e-08, + "loss": 0.2999, + "step": 3492 + }, + { + "epoch": 2.941886580572712, + "grad_norm": 0.16767726838588715, + "learning_rate": 1.1109938791112328e-08, + "loss": 0.3321, + "step": 3493 + }, + { + "epoch": 2.9427288040426727, + "grad_norm": 0.1708662509918213, + "learning_rate": 1.0785695220035809e-08, + "loss": 0.3277, + "step": 3494 + }, + { + "epoch": 2.9435710275126334, + "grad_norm": 0.1800270676612854, + "learning_rate": 1.0466248396424072e-08, + "loss": 0.3193, + "step": 3495 + }, + { + "epoch": 2.944413250982594, + "grad_norm": 0.17361044883728027, + "learning_rate": 1.0151598627399784e-08, + "loss": 0.3189, + "step": 3496 + }, + { + "epoch": 2.945255474452555, + "grad_norm": 0.1791125237941742, + "learning_rate": 9.841746215474845e-09, + "loss": 0.3096, + "step": 3497 + }, + { + "epoch": 2.9460976979225153, + "grad_norm": 0.1758781522512436, + "learning_rate": 9.536691458548741e-09, + "loss": 0.3276, + "step": 3498 + }, + { + "epoch": 2.946939921392476, + "grad_norm": 0.16645842790603638, + "learning_rate": 9.236434649908532e-09, + "loss": 0.3395, + "step": 3499 + }, + { + "epoch": 2.947782144862437, + "grad_norm": 0.17300006747245789, + "learning_rate": 8.940976078227193e-09, + "loss": 0.3613, + "step": 3500 + }, + { + "epoch": 2.9486243683323976, + "grad_norm": 0.16995273530483246, + "learning_rate": 8.650316027566386e-09, + "loss": 0.3028, + "step": 3501 + }, + { + "epoch": 2.9494665918023584, + "grad_norm": 0.18239375948905945, + "learning_rate": 8.364454777373132e-09, + "loss": 0.3571, + "step": 3502 + }, + { + "epoch": 2.9503088152723187, + "grad_norm": 0.17194171249866486, + "learning_rate": 8.083392602481477e-09, + "loss": 0.3263, + "step": 3503 + }, + { + "epoch": 2.9511510387422795, + "grad_norm": 0.1683514416217804, + "learning_rate": 7.807129773110822e-09, + "loss": 0.3286, + "step": 3504 + }, + { + "epoch": 2.9519932622122402, + "grad_norm": 0.17565521597862244, + "learning_rate": 7.535666554866483e-09, + "loss": 0.3318, + "step": 3505 + }, + { + "epoch": 2.952835485682201, + "grad_norm": 0.1656530350446701, + "learning_rate": 7.269003208740244e-09, + "loss": 0.3311, + "step": 3506 + }, + { + "epoch": 2.9536777091521618, + "grad_norm": 0.1599675863981247, + "learning_rate": 7.007139991108136e-09, + "loss": 0.3183, + "step": 3507 + }, + { + "epoch": 2.9545199326221225, + "grad_norm": 0.17979347705841064, + "learning_rate": 6.750077153731549e-09, + "loss": 0.3415, + "step": 3508 + }, + { + "epoch": 2.9553621560920833, + "grad_norm": 0.1761392205953598, + "learning_rate": 6.497814943756675e-09, + "loss": 0.3236, + "step": 3509 + }, + { + "epoch": 2.9562043795620436, + "grad_norm": 0.17616896331310272, + "learning_rate": 6.25035360371451e-09, + "loss": 0.3507, + "step": 3510 + }, + { + "epoch": 2.9570466030320044, + "grad_norm": 0.1791347861289978, + "learning_rate": 6.00769337151974e-09, + "loss": 0.3461, + "step": 3511 + }, + { + "epoch": 2.957888826501965, + "grad_norm": 0.17160698771476746, + "learning_rate": 5.769834480472414e-09, + "loss": 0.3022, + "step": 3512 + }, + { + "epoch": 2.958731049971926, + "grad_norm": 0.17749731242656708, + "learning_rate": 5.536777159254603e-09, + "loss": 0.3267, + "step": 3513 + }, + { + "epoch": 2.9595732734418867, + "grad_norm": 0.1847856491804123, + "learning_rate": 5.308521631934294e-09, + "loss": 0.3781, + "step": 3514 + }, + { + "epoch": 2.960415496911847, + "grad_norm": 0.16491389274597168, + "learning_rate": 5.08506811796039e-09, + "loss": 0.3129, + "step": 3515 + }, + { + "epoch": 2.9612577203818082, + "grad_norm": 0.17291562259197235, + "learning_rate": 4.866416832167153e-09, + "loss": 0.3353, + "step": 3516 + }, + { + "epoch": 2.9620999438517686, + "grad_norm": 0.17772185802459717, + "learning_rate": 4.652567984770873e-09, + "loss": 0.3455, + "step": 3517 + }, + { + "epoch": 2.9629421673217293, + "grad_norm": 0.18350841104984283, + "learning_rate": 4.443521781370974e-09, + "loss": 0.3165, + "step": 3518 + }, + { + "epoch": 2.96378439079169, + "grad_norm": 0.18145957589149475, + "learning_rate": 4.239278422948911e-09, + "loss": 0.359, + "step": 3519 + }, + { + "epoch": 2.964626614261651, + "grad_norm": 0.17118234932422638, + "learning_rate": 4.0398381058692755e-09, + "loss": 0.3054, + "step": 3520 + }, + { + "epoch": 2.9654688377316116, + "grad_norm": 0.16678743064403534, + "learning_rate": 3.845201021879241e-09, + "loss": 0.3201, + "step": 3521 + }, + { + "epoch": 2.966311061201572, + "grad_norm": 0.1834692806005478, + "learning_rate": 3.655367358106343e-09, + "loss": 0.3251, + "step": 3522 + }, + { + "epoch": 2.9671532846715327, + "grad_norm": 0.17456305027008057, + "learning_rate": 3.470337297062365e-09, + "loss": 0.3449, + "step": 3523 + }, + { + "epoch": 2.9679955081414935, + "grad_norm": 0.18448472023010254, + "learning_rate": 3.290111016638342e-09, + "loss": 0.3214, + "step": 3524 + }, + { + "epoch": 2.9688377316114543, + "grad_norm": 0.15986642241477966, + "learning_rate": 3.1146886901090024e-09, + "loss": 0.3225, + "step": 3525 + }, + { + "epoch": 2.969679955081415, + "grad_norm": 0.15964114665985107, + "learning_rate": 2.9440704861288804e-09, + "loss": 0.3156, + "step": 3526 + }, + { + "epoch": 2.970522178551376, + "grad_norm": 0.17358997464179993, + "learning_rate": 2.7782565687339836e-09, + "loss": 0.3403, + "step": 3527 + }, + { + "epoch": 2.9713644020213366, + "grad_norm": 0.17132069170475006, + "learning_rate": 2.617247097342901e-09, + "loss": 0.3299, + "step": 3528 + }, + { + "epoch": 2.972206625491297, + "grad_norm": 0.16908591985702515, + "learning_rate": 2.461042226752919e-09, + "loss": 0.3103, + "step": 3529 + }, + { + "epoch": 2.9730488489612577, + "grad_norm": 0.16565345227718353, + "learning_rate": 2.3096421071433508e-09, + "loss": 0.35, + "step": 3530 + }, + { + "epoch": 2.9738910724312184, + "grad_norm": 0.18640905618667603, + "learning_rate": 2.1630468840738716e-09, + "loss": 0.322, + "step": 3531 + }, + { + "epoch": 2.974733295901179, + "grad_norm": 0.1704546958208084, + "learning_rate": 2.0212566984845194e-09, + "loss": 0.3314, + "step": 3532 + }, + { + "epoch": 2.97557551937114, + "grad_norm": 0.16451433300971985, + "learning_rate": 1.8842716866956935e-09, + "loss": 0.3142, + "step": 3533 + }, + { + "epoch": 2.9764177428411003, + "grad_norm": 0.18481574952602386, + "learning_rate": 1.7520919804075997e-09, + "loss": 0.359, + "step": 3534 + }, + { + "epoch": 2.977259966311061, + "grad_norm": 0.16199319064617157, + "learning_rate": 1.624717706701917e-09, + "loss": 0.2873, + "step": 3535 + }, + { + "epoch": 2.978102189781022, + "grad_norm": 0.1770927757024765, + "learning_rate": 1.5021489880384653e-09, + "loss": 0.348, + "step": 3536 + }, + { + "epoch": 2.9789444132509826, + "grad_norm": 0.15922176837921143, + "learning_rate": 1.3843859422574269e-09, + "loss": 0.3204, + "step": 3537 + }, + { + "epoch": 2.9797866367209433, + "grad_norm": 0.17661967873573303, + "learning_rate": 1.2714286825793453e-09, + "loss": 0.36, + "step": 3538 + }, + { + "epoch": 2.980628860190904, + "grad_norm": 0.17170146107673645, + "learning_rate": 1.163277317604572e-09, + "loss": 0.3373, + "step": 3539 + }, + { + "epoch": 2.981471083660865, + "grad_norm": 0.1677255630493164, + "learning_rate": 1.0599319513115992e-09, + "loss": 0.3, + "step": 3540 + }, + { + "epoch": 2.982313307130825, + "grad_norm": 0.1660800576210022, + "learning_rate": 9.613926830587262e-10, + "loss": 0.2874, + "step": 3541 + }, + { + "epoch": 2.983155530600786, + "grad_norm": 0.19125357270240784, + "learning_rate": 8.676596075851696e-10, + "loss": 0.3728, + "step": 3542 + }, + { + "epoch": 2.9839977540707467, + "grad_norm": 0.1575414538383484, + "learning_rate": 7.787328150071771e-10, + "loss": 0.3193, + "step": 3543 + }, + { + "epoch": 2.9848399775407075, + "grad_norm": 0.17667099833488464, + "learning_rate": 6.946123908208036e-10, + "loss": 0.3592, + "step": 3544 + }, + { + "epoch": 2.9856822010106683, + "grad_norm": 0.16778381168842316, + "learning_rate": 6.152984159024655e-10, + "loss": 0.3025, + "step": 3545 + }, + { + "epoch": 2.9865244244806286, + "grad_norm": 0.17686714231967926, + "learning_rate": 5.40790966505611e-10, + "loss": 0.3393, + "step": 3546 + }, + { + "epoch": 2.98736664795059, + "grad_norm": 0.1615176796913147, + "learning_rate": 4.710901142634949e-10, + "loss": 0.2975, + "step": 3547 + }, + { + "epoch": 2.98820887142055, + "grad_norm": 0.17146608233451843, + "learning_rate": 4.061959261886239e-10, + "loss": 0.3541, + "step": 3548 + }, + { + "epoch": 2.989051094890511, + "grad_norm": 0.18955619633197784, + "learning_rate": 3.4610846467109106e-10, + "loss": 0.3537, + "step": 3549 + }, + { + "epoch": 2.9898933183604717, + "grad_norm": 0.17441588640213013, + "learning_rate": 2.9082778748135146e-10, + "loss": 0.3191, + "step": 3550 + }, + { + "epoch": 2.9907355418304324, + "grad_norm": 0.1792432814836502, + "learning_rate": 2.403539477668915e-10, + "loss": 0.308, + "step": 3551 + }, + { + "epoch": 2.991577765300393, + "grad_norm": 0.186254620552063, + "learning_rate": 1.9468699405444936e-10, + "loss": 0.3485, + "step": 3552 + }, + { + "epoch": 2.9924199887703535, + "grad_norm": 0.17337492108345032, + "learning_rate": 1.538269702494599e-10, + "loss": 0.3396, + "step": 3553 + }, + { + "epoch": 2.9932622122403143, + "grad_norm": 0.15600022673606873, + "learning_rate": 1.1777391563549956e-10, + "loss": 0.2946, + "step": 3554 + }, + { + "epoch": 2.994104435710275, + "grad_norm": 0.18066754937171936, + "learning_rate": 8.652786487484133e-11, + "loss": 0.3236, + "step": 3555 + }, + { + "epoch": 2.994946659180236, + "grad_norm": 0.18084211647510529, + "learning_rate": 6.008884800845494e-11, + "loss": 0.3694, + "step": 3556 + }, + { + "epoch": 2.9957888826501966, + "grad_norm": 0.16585589945316315, + "learning_rate": 3.8456890455451646e-11, + "loss": 0.3137, + "step": 3557 + }, + { + "epoch": 2.9966311061201574, + "grad_norm": 0.159687802195549, + "learning_rate": 2.1632013013084265e-11, + "loss": 0.3002, + "step": 3558 + }, + { + "epoch": 2.997473329590118, + "grad_norm": 0.16455858945846558, + "learning_rate": 9.614231857302258e-12, + "loss": 0.3421, + "step": 3559 + }, + { + "epoch": 2.9983155530600785, + "grad_norm": 0.15924279391765594, + "learning_rate": 2.403558542196649e-12, + "loss": 0.3125, + "step": 3560 + }, + { + "epoch": 2.9991577765300392, + "grad_norm": 0.1690434068441391, + "learning_rate": 0.0, + "loss": 0.335, + "step": 3561 + }, + { + "epoch": 2.9991577765300392, + "step": 3561, + "total_flos": 1.5509698741010432e+16, + "train_loss": 0.37121493492157265, + "train_runtime": 260447.3904, + "train_samples_per_second": 1.313, + "train_steps_per_second": 0.014 + } + ], + "logging_steps": 1.0, + "max_steps": 3561, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.5509698741010432e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}