diff --git "a/epoch_9/checkpoint-1100/trainer_state.json" "b/epoch_9/checkpoint-1100/trainer_state.json" new file mode 100644--- /dev/null +++ "b/epoch_9/checkpoint-1100/trainer_state.json" @@ -0,0 +1,7734 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.25, + "eval_steps": 500, + "global_step": 1100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00022727272727272727, + "grad_norm": 2.8016968868818073, + "learning_rate": 1.25e-06, + "loss": 0.1255, + "step": 1 + }, + { + "epoch": 0.00045454545454545455, + "grad_norm": 4.112971111605968, + "learning_rate": 1.249999840689502e-06, + "loss": 0.1102, + "step": 2 + }, + { + "epoch": 0.0006818181818181819, + "grad_norm": 2.284279232933533, + "learning_rate": 1.2499993627580887e-06, + "loss": 0.0922, + "step": 3 + }, + { + "epoch": 0.0009090909090909091, + "grad_norm": 3.302173277719191, + "learning_rate": 1.2499985662060041e-06, + "loss": 0.1276, + "step": 4 + }, + { + "epoch": 0.0011363636363636363, + "grad_norm": 2.0594478547564146, + "learning_rate": 1.249997451033654e-06, + "loss": 0.1148, + "step": 5 + }, + { + "epoch": 0.0013636363636363637, + "grad_norm": 3.286789038531438, + "learning_rate": 1.249996017241607e-06, + "loss": 0.1415, + "step": 6 + }, + { + "epoch": 0.001590909090909091, + "grad_norm": 4.263909691089189, + "learning_rate": 1.249994264830594e-06, + "loss": 0.1536, + "step": 7 + }, + { + "epoch": 0.0018181818181818182, + "grad_norm": 3.1395386531922522, + "learning_rate": 1.2499921938015086e-06, + "loss": 0.1243, + "step": 8 + }, + { + "epoch": 0.0020454545454545456, + "grad_norm": 1.50090695643886, + "learning_rate": 1.2499898041554066e-06, + "loss": 0.0773, + "step": 9 + }, + { + "epoch": 0.0022727272727272726, + "grad_norm": 2.187826181591493, + "learning_rate": 1.2499870958935056e-06, + "loss": 0.1215, + "step": 10 + }, + { + "epoch": 0.0025, + "grad_norm": 2.5804134571029755, + "learning_rate": 1.2499840690171872e-06, + "loss": 0.1421, + "step": 11 + }, + { + "epoch": 0.0027272727272727275, + "grad_norm": 2.3184080898064656, + "learning_rate": 1.2499807235279937e-06, + "loss": 0.0589, + "step": 12 + }, + { + "epoch": 0.0029545454545454545, + "grad_norm": 1.537495944168739, + "learning_rate": 1.249977059427631e-06, + "loss": 0.1289, + "step": 13 + }, + { + "epoch": 0.003181818181818182, + "grad_norm": 2.4564551204506397, + "learning_rate": 1.2499730767179668e-06, + "loss": 0.1534, + "step": 14 + }, + { + "epoch": 0.003409090909090909, + "grad_norm": 1.6481162661194857, + "learning_rate": 1.2499687754010318e-06, + "loss": 0.0966, + "step": 15 + }, + { + "epoch": 0.0036363636363636364, + "grad_norm": 2.656047394405215, + "learning_rate": 1.2499641554790185e-06, + "loss": 0.1401, + "step": 16 + }, + { + "epoch": 0.003863636363636364, + "grad_norm": 2.9301601048990182, + "learning_rate": 1.2499592169542823e-06, + "loss": 0.1214, + "step": 17 + }, + { + "epoch": 0.004090909090909091, + "grad_norm": 2.109199870348404, + "learning_rate": 1.2499539598293406e-06, + "loss": 0.069, + "step": 18 + }, + { + "epoch": 0.004318181818181818, + "grad_norm": 3.6280019518536424, + "learning_rate": 1.2499483841068736e-06, + "loss": 0.1424, + "step": 19 + }, + { + "epoch": 0.004545454545454545, + "grad_norm": 2.558709531356719, + "learning_rate": 1.2499424897897237e-06, + "loss": 0.0929, + "step": 20 + }, + { + "epoch": 0.004772727272727273, + "grad_norm": 3.9209490023886957, + "learning_rate": 1.2499362768808958e-06, + "loss": 0.103, + "step": 21 + }, + { + "epoch": 0.005, + "grad_norm": 2.0035355427055377, + "learning_rate": 1.2499297453835574e-06, + "loss": 0.1737, + "step": 22 + }, + { + "epoch": 0.005227272727272727, + "grad_norm": 1.366643171398265, + "learning_rate": 1.2499228953010379e-06, + "loss": 0.0567, + "step": 23 + }, + { + "epoch": 0.005454545454545455, + "grad_norm": 2.4159855117324445, + "learning_rate": 1.2499157266368298e-06, + "loss": 0.0655, + "step": 24 + }, + { + "epoch": 0.005681818181818182, + "grad_norm": 1.2622281145845518, + "learning_rate": 1.2499082393945871e-06, + "loss": 0.0899, + "step": 25 + }, + { + "epoch": 0.005909090909090909, + "grad_norm": 5.088549448239914, + "learning_rate": 1.2499004335781272e-06, + "loss": 0.2048, + "step": 26 + }, + { + "epoch": 0.006136363636363636, + "grad_norm": 2.7766264540697447, + "learning_rate": 1.2498923091914293e-06, + "loss": 0.1138, + "step": 27 + }, + { + "epoch": 0.006363636363636364, + "grad_norm": 2.828792441583351, + "learning_rate": 1.249883866238635e-06, + "loss": 0.1093, + "step": 28 + }, + { + "epoch": 0.006590909090909091, + "grad_norm": 2.08913381466705, + "learning_rate": 1.2498751047240488e-06, + "loss": 0.1088, + "step": 29 + }, + { + "epoch": 0.006818181818181818, + "grad_norm": 0.9662882916535072, + "learning_rate": 1.2498660246521371e-06, + "loss": 0.0487, + "step": 30 + }, + { + "epoch": 0.007045454545454546, + "grad_norm": 2.851062180418608, + "learning_rate": 1.2498566260275289e-06, + "loss": 0.144, + "step": 31 + }, + { + "epoch": 0.007272727272727273, + "grad_norm": 1.315393713532606, + "learning_rate": 1.2498469088550156e-06, + "loss": 0.097, + "step": 32 + }, + { + "epoch": 0.0075, + "grad_norm": 3.8481047595563336, + "learning_rate": 1.2498368731395507e-06, + "loss": 0.1813, + "step": 33 + }, + { + "epoch": 0.007727272727272728, + "grad_norm": 2.4923456141583458, + "learning_rate": 1.2498265188862505e-06, + "loss": 0.1761, + "step": 34 + }, + { + "epoch": 0.007954545454545454, + "grad_norm": 2.787587786820959, + "learning_rate": 1.2498158461003935e-06, + "loss": 0.1797, + "step": 35 + }, + { + "epoch": 0.008181818181818182, + "grad_norm": 2.2179956854849574, + "learning_rate": 1.2498048547874208e-06, + "loss": 0.0693, + "step": 36 + }, + { + "epoch": 0.00840909090909091, + "grad_norm": 2.600913547903702, + "learning_rate": 1.2497935449529355e-06, + "loss": 0.1528, + "step": 37 + }, + { + "epoch": 0.008636363636363636, + "grad_norm": 2.380118719029788, + "learning_rate": 1.2497819166027035e-06, + "loss": 0.066, + "step": 38 + }, + { + "epoch": 0.008863636363636363, + "grad_norm": 2.9431380076793725, + "learning_rate": 1.2497699697426523e-06, + "loss": 0.0855, + "step": 39 + }, + { + "epoch": 0.00909090909090909, + "grad_norm": 2.334877499989268, + "learning_rate": 1.2497577043788732e-06, + "loss": 0.0867, + "step": 40 + }, + { + "epoch": 0.009318181818181817, + "grad_norm": 2.3855494996020385, + "learning_rate": 1.2497451205176183e-06, + "loss": 0.1264, + "step": 41 + }, + { + "epoch": 0.009545454545454546, + "grad_norm": 2.1156951000772866, + "learning_rate": 1.2497322181653032e-06, + "loss": 0.1359, + "step": 42 + }, + { + "epoch": 0.009772727272727273, + "grad_norm": 2.507116569738136, + "learning_rate": 1.249718997328505e-06, + "loss": 0.1447, + "step": 43 + }, + { + "epoch": 0.01, + "grad_norm": 2.4708856549531997, + "learning_rate": 1.2497054580139642e-06, + "loss": 0.0941, + "step": 44 + }, + { + "epoch": 0.010227272727272727, + "grad_norm": 3.9405320029202033, + "learning_rate": 1.2496916002285823e-06, + "loss": 0.1265, + "step": 45 + }, + { + "epoch": 0.010454545454545454, + "grad_norm": 2.63791984901363, + "learning_rate": 1.2496774239794246e-06, + "loss": 0.1248, + "step": 46 + }, + { + "epoch": 0.010681818181818181, + "grad_norm": 2.5345907881408136, + "learning_rate": 1.2496629292737176e-06, + "loss": 0.1683, + "step": 47 + }, + { + "epoch": 0.01090909090909091, + "grad_norm": 2.5907297598441246, + "learning_rate": 1.249648116118851e-06, + "loss": 0.0838, + "step": 48 + }, + { + "epoch": 0.011136363636363637, + "grad_norm": 1.657849698521552, + "learning_rate": 1.2496329845223759e-06, + "loss": 0.1263, + "step": 49 + }, + { + "epoch": 0.011363636363636364, + "grad_norm": 1.790591345862795, + "learning_rate": 1.2496175344920069e-06, + "loss": 0.0675, + "step": 50 + }, + { + "epoch": 0.011590909090909091, + "grad_norm": 2.9769824377999266, + "learning_rate": 1.24960176603562e-06, + "loss": 0.0776, + "step": 51 + }, + { + "epoch": 0.011818181818181818, + "grad_norm": 4.09324300622227, + "learning_rate": 1.2495856791612538e-06, + "loss": 0.1775, + "step": 52 + }, + { + "epoch": 0.012045454545454545, + "grad_norm": 1.9505514146615008, + "learning_rate": 1.2495692738771095e-06, + "loss": 0.1311, + "step": 53 + }, + { + "epoch": 0.012272727272727272, + "grad_norm": 3.340965635650713, + "learning_rate": 1.2495525501915503e-06, + "loss": 0.119, + "step": 54 + }, + { + "epoch": 0.0125, + "grad_norm": 2.590211579571482, + "learning_rate": 1.2495355081131017e-06, + "loss": 0.1047, + "step": 55 + }, + { + "epoch": 0.012727272727272728, + "grad_norm": 1.7126546389525177, + "learning_rate": 1.249518147650452e-06, + "loss": 0.0781, + "step": 56 + }, + { + "epoch": 0.012954545454545455, + "grad_norm": 1.6791481403257953, + "learning_rate": 1.249500468812451e-06, + "loss": 0.104, + "step": 57 + }, + { + "epoch": 0.013181818181818182, + "grad_norm": 2.772601988807807, + "learning_rate": 1.2494824716081117e-06, + "loss": 0.1044, + "step": 58 + }, + { + "epoch": 0.013409090909090909, + "grad_norm": 2.4287061769206924, + "learning_rate": 1.2494641560466087e-06, + "loss": 0.1063, + "step": 59 + }, + { + "epoch": 0.013636363636363636, + "grad_norm": 4.931772367793536, + "learning_rate": 1.249445522137279e-06, + "loss": 0.1982, + "step": 60 + }, + { + "epoch": 0.013863636363636364, + "grad_norm": 1.5683802428051412, + "learning_rate": 1.2494265698896224e-06, + "loss": 0.0687, + "step": 61 + }, + { + "epoch": 0.014090909090909091, + "grad_norm": 3.6622125210898666, + "learning_rate": 1.2494072993133005e-06, + "loss": 0.1115, + "step": 62 + }, + { + "epoch": 0.014318181818181818, + "grad_norm": 1.5981830241199848, + "learning_rate": 1.2493877104181373e-06, + "loss": 0.0918, + "step": 63 + }, + { + "epoch": 0.014545454545454545, + "grad_norm": 2.2461126255277444, + "learning_rate": 1.249367803214119e-06, + "loss": 0.0734, + "step": 64 + }, + { + "epoch": 0.014772727272727272, + "grad_norm": 2.9922052531051264, + "learning_rate": 1.2493475777113945e-06, + "loss": 0.0737, + "step": 65 + }, + { + "epoch": 0.015, + "grad_norm": 3.0304712723495446, + "learning_rate": 1.2493270339202742e-06, + "loss": 0.2087, + "step": 66 + }, + { + "epoch": 0.015227272727272726, + "grad_norm": 3.616756934691463, + "learning_rate": 1.2493061718512314e-06, + "loss": 0.0935, + "step": 67 + }, + { + "epoch": 0.015454545454545455, + "grad_norm": 3.4906124774385083, + "learning_rate": 1.2492849915149013e-06, + "loss": 0.1423, + "step": 68 + }, + { + "epoch": 0.015681818181818182, + "grad_norm": 3.038765170731685, + "learning_rate": 1.2492634929220817e-06, + "loss": 0.1127, + "step": 69 + }, + { + "epoch": 0.015909090909090907, + "grad_norm": 3.2114710879274244, + "learning_rate": 1.2492416760837326e-06, + "loss": 0.0896, + "step": 70 + }, + { + "epoch": 0.016136363636363636, + "grad_norm": 3.3772458882062177, + "learning_rate": 1.2492195410109757e-06, + "loss": 0.2259, + "step": 71 + }, + { + "epoch": 0.016363636363636365, + "grad_norm": 1.2392666587572057, + "learning_rate": 1.2491970877150955e-06, + "loss": 0.0739, + "step": 72 + }, + { + "epoch": 0.01659090909090909, + "grad_norm": 3.7352926778664997, + "learning_rate": 1.2491743162075384e-06, + "loss": 0.0809, + "step": 73 + }, + { + "epoch": 0.01681818181818182, + "grad_norm": 3.7495487663408498, + "learning_rate": 1.2491512264999135e-06, + "loss": 0.1764, + "step": 74 + }, + { + "epoch": 0.017045454545454544, + "grad_norm": 4.148932597238052, + "learning_rate": 1.2491278186039916e-06, + "loss": 0.1807, + "step": 75 + }, + { + "epoch": 0.017272727272727273, + "grad_norm": 3.382982224916496, + "learning_rate": 1.2491040925317057e-06, + "loss": 0.1111, + "step": 76 + }, + { + "epoch": 0.0175, + "grad_norm": 2.9498608287299772, + "learning_rate": 1.2490800482951515e-06, + "loss": 0.1479, + "step": 77 + }, + { + "epoch": 0.017727272727272727, + "grad_norm": 2.2171068439041144, + "learning_rate": 1.2490556859065865e-06, + "loss": 0.1084, + "step": 78 + }, + { + "epoch": 0.017954545454545456, + "grad_norm": 2.404748491524079, + "learning_rate": 1.2490310053784301e-06, + "loss": 0.1175, + "step": 79 + }, + { + "epoch": 0.01818181818181818, + "grad_norm": 3.213366108512472, + "learning_rate": 1.249006006723265e-06, + "loss": 0.0861, + "step": 80 + }, + { + "epoch": 0.01840909090909091, + "grad_norm": 2.2487279541462746, + "learning_rate": 1.2489806899538349e-06, + "loss": 0.0871, + "step": 81 + }, + { + "epoch": 0.018636363636363635, + "grad_norm": 1.3990597415449715, + "learning_rate": 1.2489550550830462e-06, + "loss": 0.0725, + "step": 82 + }, + { + "epoch": 0.018863636363636364, + "grad_norm": 2.957544631567143, + "learning_rate": 1.2489291021239674e-06, + "loss": 0.1963, + "step": 83 + }, + { + "epoch": 0.019090909090909092, + "grad_norm": 2.8956582173986587, + "learning_rate": 1.2489028310898293e-06, + "loss": 0.0843, + "step": 84 + }, + { + "epoch": 0.019318181818181818, + "grad_norm": 2.6354376087607654, + "learning_rate": 1.2488762419940244e-06, + "loss": 0.1849, + "step": 85 + }, + { + "epoch": 0.019545454545454546, + "grad_norm": 3.219587107200409, + "learning_rate": 1.248849334850108e-06, + "loss": 0.1339, + "step": 86 + }, + { + "epoch": 0.01977272727272727, + "grad_norm": 1.4032569908965096, + "learning_rate": 1.2488221096717967e-06, + "loss": 0.0918, + "step": 87 + }, + { + "epoch": 0.02, + "grad_norm": 2.8863054724096378, + "learning_rate": 1.2487945664729703e-06, + "loss": 0.1386, + "step": 88 + }, + { + "epoch": 0.020227272727272726, + "grad_norm": 2.6708806807596623, + "learning_rate": 1.2487667052676699e-06, + "loss": 0.181, + "step": 89 + }, + { + "epoch": 0.020454545454545454, + "grad_norm": 2.6673794745766406, + "learning_rate": 1.2487385260700987e-06, + "loss": 0.1636, + "step": 90 + }, + { + "epoch": 0.020681818181818183, + "grad_norm": 1.6036753171548173, + "learning_rate": 1.2487100288946228e-06, + "loss": 0.1055, + "step": 91 + }, + { + "epoch": 0.02090909090909091, + "grad_norm": 2.4371313674557507, + "learning_rate": 1.2486812137557693e-06, + "loss": 0.0817, + "step": 92 + }, + { + "epoch": 0.021136363636363637, + "grad_norm": 3.7090598436546007, + "learning_rate": 1.2486520806682283e-06, + "loss": 0.1273, + "step": 93 + }, + { + "epoch": 0.021363636363636362, + "grad_norm": 2.537560508386459, + "learning_rate": 1.248622629646852e-06, + "loss": 0.114, + "step": 94 + }, + { + "epoch": 0.02159090909090909, + "grad_norm": 3.4628601759308153, + "learning_rate": 1.2485928607066537e-06, + "loss": 0.1155, + "step": 95 + }, + { + "epoch": 0.02181818181818182, + "grad_norm": 3.399846064780786, + "learning_rate": 1.2485627738628097e-06, + "loss": 0.2044, + "step": 96 + }, + { + "epoch": 0.022045454545454545, + "grad_norm": 2.0772256662629176, + "learning_rate": 1.248532369130658e-06, + "loss": 0.0768, + "step": 97 + }, + { + "epoch": 0.022272727272727274, + "grad_norm": 2.7271493410189582, + "learning_rate": 1.2485016465256987e-06, + "loss": 0.0805, + "step": 98 + }, + { + "epoch": 0.0225, + "grad_norm": 3.9454937508078767, + "learning_rate": 1.2484706060635945e-06, + "loss": 0.0976, + "step": 99 + }, + { + "epoch": 0.022727272727272728, + "grad_norm": 4.287036939763489, + "learning_rate": 1.248439247760169e-06, + "loss": 0.1484, + "step": 100 + }, + { + "epoch": 0.022954545454545453, + "grad_norm": 3.621650790974889, + "learning_rate": 1.2484075716314085e-06, + "loss": 0.0815, + "step": 101 + }, + { + "epoch": 0.023181818181818182, + "grad_norm": 4.2363895224865, + "learning_rate": 1.2483755776934616e-06, + "loss": 0.2861, + "step": 102 + }, + { + "epoch": 0.02340909090909091, + "grad_norm": 3.3518543639439966, + "learning_rate": 1.2483432659626384e-06, + "loss": 0.1313, + "step": 103 + }, + { + "epoch": 0.023636363636363636, + "grad_norm": 1.6851817813430572, + "learning_rate": 1.2483106364554115e-06, + "loss": 0.1106, + "step": 104 + }, + { + "epoch": 0.023863636363636365, + "grad_norm": 3.7558796733114153, + "learning_rate": 1.248277689188415e-06, + "loss": 0.0833, + "step": 105 + }, + { + "epoch": 0.02409090909090909, + "grad_norm": 2.5185027449534556, + "learning_rate": 1.248244424178445e-06, + "loss": 0.2037, + "step": 106 + }, + { + "epoch": 0.02431818181818182, + "grad_norm": 4.483827071898472, + "learning_rate": 1.2482108414424602e-06, + "loss": 0.0956, + "step": 107 + }, + { + "epoch": 0.024545454545454544, + "grad_norm": 2.3022194497893427, + "learning_rate": 1.2481769409975805e-06, + "loss": 0.0865, + "step": 108 + }, + { + "epoch": 0.024772727272727273, + "grad_norm": 4.509365227731544, + "learning_rate": 1.2481427228610881e-06, + "loss": 0.1261, + "step": 109 + }, + { + "epoch": 0.025, + "grad_norm": 1.9078702077884082, + "learning_rate": 1.2481081870504278e-06, + "loss": 0.0878, + "step": 110 + }, + { + "epoch": 0.025227272727272727, + "grad_norm": 3.1598412687896937, + "learning_rate": 1.2480733335832052e-06, + "loss": 0.0829, + "step": 111 + }, + { + "epoch": 0.025454545454545455, + "grad_norm": 3.6971181252606655, + "learning_rate": 1.2480381624771882e-06, + "loss": 0.0914, + "step": 112 + }, + { + "epoch": 0.02568181818181818, + "grad_norm": 2.008987440641116, + "learning_rate": 1.2480026737503073e-06, + "loss": 0.1008, + "step": 113 + }, + { + "epoch": 0.02590909090909091, + "grad_norm": 2.99919645772555, + "learning_rate": 1.2479668674206543e-06, + "loss": 0.1245, + "step": 114 + }, + { + "epoch": 0.026136363636363635, + "grad_norm": 3.8748833281129578, + "learning_rate": 1.2479307435064827e-06, + "loss": 0.0976, + "step": 115 + }, + { + "epoch": 0.026363636363636363, + "grad_norm": 2.122199281855629, + "learning_rate": 1.2478943020262087e-06, + "loss": 0.1251, + "step": 116 + }, + { + "epoch": 0.026590909090909092, + "grad_norm": 2.492267421311055, + "learning_rate": 1.2478575429984097e-06, + "loss": 0.152, + "step": 117 + }, + { + "epoch": 0.026818181818181817, + "grad_norm": 2.653566603024868, + "learning_rate": 1.2478204664418254e-06, + "loss": 0.0991, + "step": 118 + }, + { + "epoch": 0.027045454545454546, + "grad_norm": 4.198276247734506, + "learning_rate": 1.2477830723753567e-06, + "loss": 0.1579, + "step": 119 + }, + { + "epoch": 0.02727272727272727, + "grad_norm": 3.899807537109214, + "learning_rate": 1.2477453608180673e-06, + "loss": 0.1107, + "step": 120 + }, + { + "epoch": 0.0275, + "grad_norm": 2.3833320390188555, + "learning_rate": 1.2477073317891822e-06, + "loss": 0.1554, + "step": 121 + }, + { + "epoch": 0.02772727272727273, + "grad_norm": 3.8212587869189245, + "learning_rate": 1.2476689853080883e-06, + "loss": 0.0865, + "step": 122 + }, + { + "epoch": 0.027954545454545454, + "grad_norm": 1.9136882810464502, + "learning_rate": 1.2476303213943346e-06, + "loss": 0.0787, + "step": 123 + }, + { + "epoch": 0.028181818181818183, + "grad_norm": 1.9864512670945367, + "learning_rate": 1.2475913400676314e-06, + "loss": 0.0679, + "step": 124 + }, + { + "epoch": 0.028409090909090908, + "grad_norm": 3.057361348148449, + "learning_rate": 1.2475520413478516e-06, + "loss": 0.1558, + "step": 125 + }, + { + "epoch": 0.028636363636363637, + "grad_norm": 2.4559864848992423, + "learning_rate": 1.247512425255029e-06, + "loss": 0.0568, + "step": 126 + }, + { + "epoch": 0.028863636363636362, + "grad_norm": 2.3064987947510125, + "learning_rate": 1.2474724918093594e-06, + "loss": 0.0874, + "step": 127 + }, + { + "epoch": 0.02909090909090909, + "grad_norm": 3.4630333466988286, + "learning_rate": 1.2474322410312012e-06, + "loss": 0.1433, + "step": 128 + }, + { + "epoch": 0.02931818181818182, + "grad_norm": 3.391975827415409, + "learning_rate": 1.247391672941074e-06, + "loss": 0.0829, + "step": 129 + }, + { + "epoch": 0.029545454545454545, + "grad_norm": 3.2248882073989957, + "learning_rate": 1.2473507875596586e-06, + "loss": 0.1843, + "step": 130 + }, + { + "epoch": 0.029772727272727274, + "grad_norm": 4.118551536256316, + "learning_rate": 1.2473095849077984e-06, + "loss": 0.1819, + "step": 131 + }, + { + "epoch": 0.03, + "grad_norm": 4.137811432545649, + "learning_rate": 1.2472680650064984e-06, + "loss": 0.1778, + "step": 132 + }, + { + "epoch": 0.030227272727272728, + "grad_norm": 3.6247559276452463, + "learning_rate": 1.247226227876925e-06, + "loss": 0.161, + "step": 133 + }, + { + "epoch": 0.030454545454545453, + "grad_norm": 1.7953374315892745, + "learning_rate": 1.2471840735404066e-06, + "loss": 0.0475, + "step": 134 + }, + { + "epoch": 0.03068181818181818, + "grad_norm": 3.9170817361284764, + "learning_rate": 1.2471416020184332e-06, + "loss": 0.1046, + "step": 135 + }, + { + "epoch": 0.03090909090909091, + "grad_norm": 3.193892805213982, + "learning_rate": 1.2470988133326564e-06, + "loss": 0.0783, + "step": 136 + }, + { + "epoch": 0.031136363636363636, + "grad_norm": 2.392755985657823, + "learning_rate": 1.2470557075048897e-06, + "loss": 0.114, + "step": 137 + }, + { + "epoch": 0.031363636363636364, + "grad_norm": 4.884480413611041, + "learning_rate": 1.2470122845571081e-06, + "loss": 0.1408, + "step": 138 + }, + { + "epoch": 0.03159090909090909, + "grad_norm": 2.3704649582296926, + "learning_rate": 1.2469685445114486e-06, + "loss": 0.0848, + "step": 139 + }, + { + "epoch": 0.031818181818181815, + "grad_norm": 2.2985560540167653, + "learning_rate": 1.2469244873902089e-06, + "loss": 0.1029, + "step": 140 + }, + { + "epoch": 0.032045454545454544, + "grad_norm": 5.319831122659349, + "learning_rate": 1.2468801132158499e-06, + "loss": 0.2105, + "step": 141 + }, + { + "epoch": 0.03227272727272727, + "grad_norm": 2.7266103565527855, + "learning_rate": 1.2468354220109926e-06, + "loss": 0.1006, + "step": 142 + }, + { + "epoch": 0.0325, + "grad_norm": 2.744775358082282, + "learning_rate": 1.2467904137984208e-06, + "loss": 0.125, + "step": 143 + }, + { + "epoch": 0.03272727272727273, + "grad_norm": 4.2252693306222975, + "learning_rate": 1.246745088601079e-06, + "loss": 0.0937, + "step": 144 + }, + { + "epoch": 0.03295454545454545, + "grad_norm": 1.8301450973591429, + "learning_rate": 1.246699446442074e-06, + "loss": 0.0974, + "step": 145 + }, + { + "epoch": 0.03318181818181818, + "grad_norm": 2.178641124699276, + "learning_rate": 1.2466534873446738e-06, + "loss": 0.1036, + "step": 146 + }, + { + "epoch": 0.03340909090909091, + "grad_norm": 3.2479690123414993, + "learning_rate": 1.246607211332308e-06, + "loss": 0.143, + "step": 147 + }, + { + "epoch": 0.03363636363636364, + "grad_norm": 2.7161801022330376, + "learning_rate": 1.2465606184285679e-06, + "loss": 0.0784, + "step": 148 + }, + { + "epoch": 0.03386363636363637, + "grad_norm": 2.946252975196419, + "learning_rate": 1.2465137086572057e-06, + "loss": 0.1087, + "step": 149 + }, + { + "epoch": 0.03409090909090909, + "grad_norm": 3.7646888424049143, + "learning_rate": 1.2464664820421365e-06, + "loss": 0.2077, + "step": 150 + }, + { + "epoch": 0.03431818181818182, + "grad_norm": 2.0046468126086787, + "learning_rate": 1.246418938607436e-06, + "loss": 0.1261, + "step": 151 + }, + { + "epoch": 0.034545454545454546, + "grad_norm": 1.4259818662884742, + "learning_rate": 1.246371078377341e-06, + "loss": 0.0542, + "step": 152 + }, + { + "epoch": 0.034772727272727275, + "grad_norm": 2.8762613449347647, + "learning_rate": 1.246322901376251e-06, + "loss": 0.1723, + "step": 153 + }, + { + "epoch": 0.035, + "grad_norm": 3.9290196607015493, + "learning_rate": 1.2462744076287257e-06, + "loss": 0.1381, + "step": 154 + }, + { + "epoch": 0.035227272727272725, + "grad_norm": 3.164945939472189, + "learning_rate": 1.2462255971594874e-06, + "loss": 0.0871, + "step": 155 + }, + { + "epoch": 0.035454545454545454, + "grad_norm": 2.3422239909264024, + "learning_rate": 1.2461764699934192e-06, + "loss": 0.0733, + "step": 156 + }, + { + "epoch": 0.03568181818181818, + "grad_norm": 3.1067031772787477, + "learning_rate": 1.2461270261555657e-06, + "loss": 0.1047, + "step": 157 + }, + { + "epoch": 0.03590909090909091, + "grad_norm": 4.2162583523635115, + "learning_rate": 1.246077265671133e-06, + "loss": 0.1415, + "step": 158 + }, + { + "epoch": 0.03613636363636363, + "grad_norm": 2.7951422230559486, + "learning_rate": 1.2460271885654891e-06, + "loss": 0.0885, + "step": 159 + }, + { + "epoch": 0.03636363636363636, + "grad_norm": 2.048658317870938, + "learning_rate": 1.2459767948641626e-06, + "loss": 0.0835, + "step": 160 + }, + { + "epoch": 0.03659090909090909, + "grad_norm": 3.1707408130065264, + "learning_rate": 1.2459260845928438e-06, + "loss": 0.1229, + "step": 161 + }, + { + "epoch": 0.03681818181818182, + "grad_norm": 3.0335907524673154, + "learning_rate": 1.245875057777385e-06, + "loss": 0.1188, + "step": 162 + }, + { + "epoch": 0.03704545454545455, + "grad_norm": 2.3200226209196955, + "learning_rate": 1.245823714443799e-06, + "loss": 0.064, + "step": 163 + }, + { + "epoch": 0.03727272727272727, + "grad_norm": 1.9954265011198742, + "learning_rate": 1.24577205461826e-06, + "loss": 0.0638, + "step": 164 + }, + { + "epoch": 0.0375, + "grad_norm": 2.236806781948131, + "learning_rate": 1.2457200783271044e-06, + "loss": 0.0863, + "step": 165 + }, + { + "epoch": 0.03772727272727273, + "grad_norm": 3.0742733194769123, + "learning_rate": 1.245667785596829e-06, + "loss": 0.2, + "step": 166 + }, + { + "epoch": 0.037954545454545456, + "grad_norm": 2.2520226325508643, + "learning_rate": 1.2456151764540924e-06, + "loss": 0.0809, + "step": 167 + }, + { + "epoch": 0.038181818181818185, + "grad_norm": 2.15663011362523, + "learning_rate": 1.2455622509257147e-06, + "loss": 0.0831, + "step": 168 + }, + { + "epoch": 0.03840909090909091, + "grad_norm": 3.7997476231270535, + "learning_rate": 1.2455090090386765e-06, + "loss": 0.1603, + "step": 169 + }, + { + "epoch": 0.038636363636363635, + "grad_norm": 3.9530380374838114, + "learning_rate": 1.2454554508201205e-06, + "loss": 0.075, + "step": 170 + }, + { + "epoch": 0.038863636363636364, + "grad_norm": 2.4758478414591374, + "learning_rate": 1.2454015762973505e-06, + "loss": 0.1126, + "step": 171 + }, + { + "epoch": 0.03909090909090909, + "grad_norm": 2.9280923203486258, + "learning_rate": 1.2453473854978307e-06, + "loss": 0.1125, + "step": 172 + }, + { + "epoch": 0.03931818181818182, + "grad_norm": 1.857509727053121, + "learning_rate": 1.2452928784491877e-06, + "loss": 0.1214, + "step": 173 + }, + { + "epoch": 0.03954545454545454, + "grad_norm": 1.7565974252145562, + "learning_rate": 1.245238055179209e-06, + "loss": 0.0941, + "step": 174 + }, + { + "epoch": 0.03977272727272727, + "grad_norm": 3.214045178112143, + "learning_rate": 1.245182915715843e-06, + "loss": 0.1067, + "step": 175 + }, + { + "epoch": 0.04, + "grad_norm": 3.1459596309788425, + "learning_rate": 1.2451274600871991e-06, + "loss": 0.0903, + "step": 176 + }, + { + "epoch": 0.04022727272727273, + "grad_norm": 2.717741153219287, + "learning_rate": 1.245071688321549e-06, + "loss": 0.0933, + "step": 177 + }, + { + "epoch": 0.04045454545454545, + "grad_norm": 2.433999370162766, + "learning_rate": 1.2450156004473238e-06, + "loss": 0.1544, + "step": 178 + }, + { + "epoch": 0.04068181818181818, + "grad_norm": 1.522448767303929, + "learning_rate": 1.2449591964931173e-06, + "loss": 0.1366, + "step": 179 + }, + { + "epoch": 0.04090909090909091, + "grad_norm": 3.3266491447077904, + "learning_rate": 1.2449024764876841e-06, + "loss": 0.1176, + "step": 180 + }, + { + "epoch": 0.04113636363636364, + "grad_norm": 4.043282241976147, + "learning_rate": 1.2448454404599393e-06, + "loss": 0.2645, + "step": 181 + }, + { + "epoch": 0.041363636363636366, + "grad_norm": 3.4155863239617363, + "learning_rate": 1.2447880884389597e-06, + "loss": 0.0803, + "step": 182 + }, + { + "epoch": 0.04159090909090909, + "grad_norm": 2.9942111557293773, + "learning_rate": 1.2447304204539827e-06, + "loss": 0.0674, + "step": 183 + }, + { + "epoch": 0.04181818181818182, + "grad_norm": 2.418264089227234, + "learning_rate": 1.2446724365344076e-06, + "loss": 0.0624, + "step": 184 + }, + { + "epoch": 0.042045454545454546, + "grad_norm": 3.0587436770158467, + "learning_rate": 1.2446141367097936e-06, + "loss": 0.0967, + "step": 185 + }, + { + "epoch": 0.042272727272727274, + "grad_norm": 4.4237769530614175, + "learning_rate": 1.244555521009862e-06, + "loss": 0.1264, + "step": 186 + }, + { + "epoch": 0.0425, + "grad_norm": 3.1564699048311935, + "learning_rate": 1.2444965894644946e-06, + "loss": 0.0967, + "step": 187 + }, + { + "epoch": 0.042727272727272725, + "grad_norm": 6.667224811522091, + "learning_rate": 1.2444373421037345e-06, + "loss": 0.1933, + "step": 188 + }, + { + "epoch": 0.042954545454545454, + "grad_norm": 3.4612803721893286, + "learning_rate": 1.2443777789577852e-06, + "loss": 0.1177, + "step": 189 + }, + { + "epoch": 0.04318181818181818, + "grad_norm": 3.0645036281205753, + "learning_rate": 1.244317900057012e-06, + "loss": 0.0872, + "step": 190 + }, + { + "epoch": 0.04340909090909091, + "grad_norm": 2.7678849500637432, + "learning_rate": 1.2442577054319405e-06, + "loss": 0.1107, + "step": 191 + }, + { + "epoch": 0.04363636363636364, + "grad_norm": 2.429853550112927, + "learning_rate": 1.2441971951132578e-06, + "loss": 0.0728, + "step": 192 + }, + { + "epoch": 0.04386363636363636, + "grad_norm": 3.36528155987506, + "learning_rate": 1.2441363691318114e-06, + "loss": 0.1021, + "step": 193 + }, + { + "epoch": 0.04409090909090909, + "grad_norm": 2.92737772200131, + "learning_rate": 1.2440752275186102e-06, + "loss": 0.0724, + "step": 194 + }, + { + "epoch": 0.04431818181818182, + "grad_norm": 1.680881694384273, + "learning_rate": 1.244013770304824e-06, + "loss": 0.0557, + "step": 195 + }, + { + "epoch": 0.04454545454545455, + "grad_norm": 1.3839129700771302, + "learning_rate": 1.2439519975217828e-06, + "loss": 0.0334, + "step": 196 + }, + { + "epoch": 0.04477272727272727, + "grad_norm": 3.580271334258035, + "learning_rate": 1.2438899092009783e-06, + "loss": 0.0899, + "step": 197 + }, + { + "epoch": 0.045, + "grad_norm": 3.6568327278776147, + "learning_rate": 1.2438275053740624e-06, + "loss": 0.1196, + "step": 198 + }, + { + "epoch": 0.04522727272727273, + "grad_norm": 2.5412787688165275, + "learning_rate": 1.2437647860728487e-06, + "loss": 0.0895, + "step": 199 + }, + { + "epoch": 0.045454545454545456, + "grad_norm": 3.3934172415051176, + "learning_rate": 1.2437017513293107e-06, + "loss": 0.1361, + "step": 200 + }, + { + "epoch": 0.045681818181818185, + "grad_norm": 2.555977878825042, + "learning_rate": 1.243638401175583e-06, + "loss": 0.1095, + "step": 201 + }, + { + "epoch": 0.045909090909090906, + "grad_norm": 3.716546846964998, + "learning_rate": 1.2435747356439614e-06, + "loss": 0.2058, + "step": 202 + }, + { + "epoch": 0.046136363636363635, + "grad_norm": 2.4595525452664946, + "learning_rate": 1.2435107547669022e-06, + "loss": 0.0859, + "step": 203 + }, + { + "epoch": 0.046363636363636364, + "grad_norm": 2.6578162895974224, + "learning_rate": 1.2434464585770226e-06, + "loss": 0.0536, + "step": 204 + }, + { + "epoch": 0.04659090909090909, + "grad_norm": 3.897584603854884, + "learning_rate": 1.2433818471070998e-06, + "loss": 0.1557, + "step": 205 + }, + { + "epoch": 0.04681818181818182, + "grad_norm": 6.822349149466114, + "learning_rate": 1.2433169203900726e-06, + "loss": 0.266, + "step": 206 + }, + { + "epoch": 0.04704545454545454, + "grad_norm": 3.4167912842651313, + "learning_rate": 1.2432516784590405e-06, + "loss": 0.0961, + "step": 207 + }, + { + "epoch": 0.04727272727272727, + "grad_norm": 3.4982383483847217, + "learning_rate": 1.243186121347263e-06, + "loss": 0.1032, + "step": 208 + }, + { + "epoch": 0.0475, + "grad_norm": 2.9308534972205136, + "learning_rate": 1.243120249088161e-06, + "loss": 0.0921, + "step": 209 + }, + { + "epoch": 0.04772727272727273, + "grad_norm": 2.1650949857926864, + "learning_rate": 1.2430540617153156e-06, + "loss": 0.07, + "step": 210 + }, + { + "epoch": 0.04795454545454545, + "grad_norm": 1.877123728582618, + "learning_rate": 1.2429875592624685e-06, + "loss": 0.1097, + "step": 211 + }, + { + "epoch": 0.04818181818181818, + "grad_norm": 2.8812892658842917, + "learning_rate": 1.2429207417635226e-06, + "loss": 0.0954, + "step": 212 + }, + { + "epoch": 0.04840909090909091, + "grad_norm": 1.9447089447685935, + "learning_rate": 1.242853609252541e-06, + "loss": 0.0868, + "step": 213 + }, + { + "epoch": 0.04863636363636364, + "grad_norm": 4.572871957317649, + "learning_rate": 1.2427861617637472e-06, + "loss": 0.1737, + "step": 214 + }, + { + "epoch": 0.048863636363636366, + "grad_norm": 2.6223510240500882, + "learning_rate": 1.2427183993315256e-06, + "loss": 0.0562, + "step": 215 + }, + { + "epoch": 0.04909090909090909, + "grad_norm": 3.0747367957710385, + "learning_rate": 1.2426503219904213e-06, + "loss": 0.1262, + "step": 216 + }, + { + "epoch": 0.04931818181818182, + "grad_norm": 2.390349193340806, + "learning_rate": 1.242581929775139e-06, + "loss": 0.0824, + "step": 217 + }, + { + "epoch": 0.049545454545454545, + "grad_norm": 2.547736820961635, + "learning_rate": 1.2425132227205456e-06, + "loss": 0.0705, + "step": 218 + }, + { + "epoch": 0.049772727272727274, + "grad_norm": 1.012061665511356, + "learning_rate": 1.2424442008616667e-06, + "loss": 0.0371, + "step": 219 + }, + { + "epoch": 0.05, + "grad_norm": 3.7563787988146093, + "learning_rate": 1.2423748642336894e-06, + "loss": 0.0995, + "step": 220 + }, + { + "epoch": 0.050227272727272725, + "grad_norm": 2.260009759505115, + "learning_rate": 1.2423052128719611e-06, + "loss": 0.0758, + "step": 221 + }, + { + "epoch": 0.05045454545454545, + "grad_norm": 1.3375603854760587, + "learning_rate": 1.24223524681199e-06, + "loss": 0.0628, + "step": 222 + }, + { + "epoch": 0.05068181818181818, + "grad_norm": 2.6771117922046397, + "learning_rate": 1.2421649660894438e-06, + "loss": 0.1246, + "step": 223 + }, + { + "epoch": 0.05090909090909091, + "grad_norm": 3.7619917996055863, + "learning_rate": 1.2420943707401514e-06, + "loss": 0.0976, + "step": 224 + }, + { + "epoch": 0.05113636363636364, + "grad_norm": 3.0848514275552144, + "learning_rate": 1.2420234608001017e-06, + "loss": 0.1937, + "step": 225 + }, + { + "epoch": 0.05136363636363636, + "grad_norm": 2.4374680912316053, + "learning_rate": 1.2419522363054446e-06, + "loss": 0.0848, + "step": 226 + }, + { + "epoch": 0.05159090909090909, + "grad_norm": 3.7191848287996025, + "learning_rate": 1.2418806972924893e-06, + "loss": 0.0953, + "step": 227 + }, + { + "epoch": 0.05181818181818182, + "grad_norm": 2.66335814077733, + "learning_rate": 1.2418088437977063e-06, + "loss": 0.1009, + "step": 228 + }, + { + "epoch": 0.05204545454545455, + "grad_norm": 2.412809406877248, + "learning_rate": 1.241736675857726e-06, + "loss": 0.0675, + "step": 229 + }, + { + "epoch": 0.05227272727272727, + "grad_norm": 4.65044604462548, + "learning_rate": 1.241664193509339e-06, + "loss": 0.096, + "step": 230 + }, + { + "epoch": 0.0525, + "grad_norm": 5.09363480299026, + "learning_rate": 1.2415913967894966e-06, + "loss": 0.2421, + "step": 231 + }, + { + "epoch": 0.05272727272727273, + "grad_norm": 4.056815516837104, + "learning_rate": 1.2415182857353098e-06, + "loss": 0.1771, + "step": 232 + }, + { + "epoch": 0.052954545454545456, + "grad_norm": 2.4409832544314236, + "learning_rate": 1.2414448603840504e-06, + "loss": 0.1381, + "step": 233 + }, + { + "epoch": 0.053181818181818184, + "grad_norm": 3.30516224546615, + "learning_rate": 1.24137112077315e-06, + "loss": 0.2239, + "step": 234 + }, + { + "epoch": 0.053409090909090906, + "grad_norm": 4.750181242796731, + "learning_rate": 1.2412970669402005e-06, + "loss": 0.3172, + "step": 235 + }, + { + "epoch": 0.053636363636363635, + "grad_norm": 3.3363001291418883, + "learning_rate": 1.2412226989229542e-06, + "loss": 0.1871, + "step": 236 + }, + { + "epoch": 0.053863636363636364, + "grad_norm": 4.902123827255867, + "learning_rate": 1.2411480167593237e-06, + "loss": 0.1167, + "step": 237 + }, + { + "epoch": 0.05409090909090909, + "grad_norm": 2.140890406636027, + "learning_rate": 1.241073020487381e-06, + "loss": 0.0991, + "step": 238 + }, + { + "epoch": 0.05431818181818182, + "grad_norm": 1.460656936347126, + "learning_rate": 1.2409977101453591e-06, + "loss": 0.0585, + "step": 239 + }, + { + "epoch": 0.05454545454545454, + "grad_norm": 3.870074973045016, + "learning_rate": 1.2409220857716506e-06, + "loss": 0.1517, + "step": 240 + }, + { + "epoch": 0.05477272727272727, + "grad_norm": 3.159943045639632, + "learning_rate": 1.2408461474048083e-06, + "loss": 0.1196, + "step": 241 + }, + { + "epoch": 0.055, + "grad_norm": 4.913069714809017, + "learning_rate": 1.240769895083545e-06, + "loss": 0.1065, + "step": 242 + }, + { + "epoch": 0.05522727272727273, + "grad_norm": 2.699614594241355, + "learning_rate": 1.2406933288467337e-06, + "loss": 0.163, + "step": 243 + }, + { + "epoch": 0.05545454545454546, + "grad_norm": 2.587714516355004, + "learning_rate": 1.2406164487334077e-06, + "loss": 0.1627, + "step": 244 + }, + { + "epoch": 0.05568181818181818, + "grad_norm": 3.600462258550555, + "learning_rate": 1.2405392547827594e-06, + "loss": 0.1118, + "step": 245 + }, + { + "epoch": 0.05590909090909091, + "grad_norm": 2.1014376490107316, + "learning_rate": 1.2404617470341423e-06, + "loss": 0.063, + "step": 246 + }, + { + "epoch": 0.05613636363636364, + "grad_norm": 3.921834652165585, + "learning_rate": 1.2403839255270693e-06, + "loss": 0.1024, + "step": 247 + }, + { + "epoch": 0.056363636363636366, + "grad_norm": 2.633424805673825, + "learning_rate": 1.2403057903012128e-06, + "loss": 0.1147, + "step": 248 + }, + { + "epoch": 0.05659090909090909, + "grad_norm": 2.774498349726601, + "learning_rate": 1.240227341396406e-06, + "loss": 0.0734, + "step": 249 + }, + { + "epoch": 0.056818181818181816, + "grad_norm": 3.1093286010353913, + "learning_rate": 1.2401485788526418e-06, + "loss": 0.2027, + "step": 250 + }, + { + "epoch": 0.057045454545454545, + "grad_norm": 2.6714784300800973, + "learning_rate": 1.2400695027100725e-06, + "loss": 0.1348, + "step": 251 + }, + { + "epoch": 0.057272727272727274, + "grad_norm": 2.841094984583613, + "learning_rate": 1.2399901130090112e-06, + "loss": 0.1093, + "step": 252 + }, + { + "epoch": 0.0575, + "grad_norm": 3.9926828089880555, + "learning_rate": 1.2399104097899295e-06, + "loss": 0.1566, + "step": 253 + }, + { + "epoch": 0.057727272727272724, + "grad_norm": 2.0426673539860234, + "learning_rate": 1.2398303930934601e-06, + "loss": 0.0719, + "step": 254 + }, + { + "epoch": 0.05795454545454545, + "grad_norm": 3.4022810910147174, + "learning_rate": 1.2397500629603948e-06, + "loss": 0.1013, + "step": 255 + }, + { + "epoch": 0.05818181818181818, + "grad_norm": 3.079425326642514, + "learning_rate": 1.2396694194316851e-06, + "loss": 0.0766, + "step": 256 + }, + { + "epoch": 0.05840909090909091, + "grad_norm": 2.267428155252032, + "learning_rate": 1.2395884625484433e-06, + "loss": 0.1396, + "step": 257 + }, + { + "epoch": 0.05863636363636364, + "grad_norm": 4.2457069865097665, + "learning_rate": 1.2395071923519403e-06, + "loss": 0.1329, + "step": 258 + }, + { + "epoch": 0.05886363636363636, + "grad_norm": 4.326598150136959, + "learning_rate": 1.2394256088836069e-06, + "loss": 0.1462, + "step": 259 + }, + { + "epoch": 0.05909090909090909, + "grad_norm": 1.8812568816427564, + "learning_rate": 1.2393437121850342e-06, + "loss": 0.1002, + "step": 260 + }, + { + "epoch": 0.05931818181818182, + "grad_norm": 3.046227516360263, + "learning_rate": 1.2392615022979723e-06, + "loss": 0.108, + "step": 261 + }, + { + "epoch": 0.05954545454545455, + "grad_norm": 3.241942095267392, + "learning_rate": 1.2391789792643317e-06, + "loss": 0.103, + "step": 262 + }, + { + "epoch": 0.059772727272727276, + "grad_norm": 5.181675389134145, + "learning_rate": 1.2390961431261814e-06, + "loss": 0.2546, + "step": 263 + }, + { + "epoch": 0.06, + "grad_norm": 2.4263236911686374, + "learning_rate": 1.2390129939257515e-06, + "loss": 0.1358, + "step": 264 + }, + { + "epoch": 0.060227272727272727, + "grad_norm": 3.8158275451730352, + "learning_rate": 1.2389295317054306e-06, + "loss": 0.1787, + "step": 265 + }, + { + "epoch": 0.060454545454545455, + "grad_norm": 3.4937371000310717, + "learning_rate": 1.238845756507767e-06, + "loss": 0.1129, + "step": 266 + }, + { + "epoch": 0.060681818181818184, + "grad_norm": 5.087808693070465, + "learning_rate": 1.2387616683754691e-06, + "loss": 0.1218, + "step": 267 + }, + { + "epoch": 0.060909090909090906, + "grad_norm": 3.3657773309614973, + "learning_rate": 1.2386772673514044e-06, + "loss": 0.0644, + "step": 268 + }, + { + "epoch": 0.061136363636363635, + "grad_norm": 3.765264072041106, + "learning_rate": 1.2385925534786e-06, + "loss": 0.1217, + "step": 269 + }, + { + "epoch": 0.06136363636363636, + "grad_norm": 1.986880990295197, + "learning_rate": 1.2385075268002423e-06, + "loss": 0.0744, + "step": 270 + }, + { + "epoch": 0.06159090909090909, + "grad_norm": 1.654849985156512, + "learning_rate": 1.2384221873596775e-06, + "loss": 0.0575, + "step": 271 + }, + { + "epoch": 0.06181818181818182, + "grad_norm": 3.6900447222147807, + "learning_rate": 1.2383365352004111e-06, + "loss": 0.1439, + "step": 272 + }, + { + "epoch": 0.06204545454545454, + "grad_norm": 2.4405793751846394, + "learning_rate": 1.238250570366108e-06, + "loss": 0.1299, + "step": 273 + }, + { + "epoch": 0.06227272727272727, + "grad_norm": 2.6310308209955564, + "learning_rate": 1.2381642929005927e-06, + "loss": 0.0983, + "step": 274 + }, + { + "epoch": 0.0625, + "grad_norm": 4.5925254895336725, + "learning_rate": 1.238077702847849e-06, + "loss": 0.1707, + "step": 275 + }, + { + "epoch": 0.06272727272727273, + "grad_norm": 2.9535456356683905, + "learning_rate": 1.2379908002520191e-06, + "loss": 0.1576, + "step": 276 + }, + { + "epoch": 0.06295454545454546, + "grad_norm": 3.785953958589258, + "learning_rate": 1.2379035851574063e-06, + "loss": 0.1612, + "step": 277 + }, + { + "epoch": 0.06318181818181819, + "grad_norm": 2.7382228340177357, + "learning_rate": 1.237816057608472e-06, + "loss": 0.1532, + "step": 278 + }, + { + "epoch": 0.06340909090909091, + "grad_norm": 2.7532145912398005, + "learning_rate": 1.2377282176498371e-06, + "loss": 0.0581, + "step": 279 + }, + { + "epoch": 0.06363636363636363, + "grad_norm": 4.383515501701478, + "learning_rate": 1.2376400653262817e-06, + "loss": 0.1983, + "step": 280 + }, + { + "epoch": 0.06386363636363636, + "grad_norm": 2.8425813883365993, + "learning_rate": 1.237551600682746e-06, + "loss": 0.1144, + "step": 281 + }, + { + "epoch": 0.06409090909090909, + "grad_norm": 2.350939161092161, + "learning_rate": 1.237462823764328e-06, + "loss": 0.0955, + "step": 282 + }, + { + "epoch": 0.06431818181818182, + "grad_norm": 1.996724867356844, + "learning_rate": 1.2373737346162857e-06, + "loss": 0.0639, + "step": 283 + }, + { + "epoch": 0.06454545454545454, + "grad_norm": 1.9085859990147678, + "learning_rate": 1.2372843332840364e-06, + "loss": 0.0502, + "step": 284 + }, + { + "epoch": 0.06477272727272727, + "grad_norm": 3.51832542966258, + "learning_rate": 1.2371946198131563e-06, + "loss": 0.0803, + "step": 285 + }, + { + "epoch": 0.065, + "grad_norm": 1.7601967545837591, + "learning_rate": 1.2371045942493804e-06, + "loss": 0.0946, + "step": 286 + }, + { + "epoch": 0.06522727272727273, + "grad_norm": 2.2038956569787316, + "learning_rate": 1.2370142566386038e-06, + "loss": 0.1052, + "step": 287 + }, + { + "epoch": 0.06545454545454546, + "grad_norm": 3.1887431312405856, + "learning_rate": 1.2369236070268795e-06, + "loss": 0.0991, + "step": 288 + }, + { + "epoch": 0.06568181818181819, + "grad_norm": 1.4097208525041756, + "learning_rate": 1.2368326454604201e-06, + "loss": 0.0828, + "step": 289 + }, + { + "epoch": 0.0659090909090909, + "grad_norm": 2.536442931661013, + "learning_rate": 1.2367413719855976e-06, + "loss": 0.0763, + "step": 290 + }, + { + "epoch": 0.06613636363636363, + "grad_norm": 1.7252748106496696, + "learning_rate": 1.2366497866489423e-06, + "loss": 0.0558, + "step": 291 + }, + { + "epoch": 0.06636363636363636, + "grad_norm": 2.3801214982066408, + "learning_rate": 1.236557889497144e-06, + "loss": 0.0983, + "step": 292 + }, + { + "epoch": 0.06659090909090909, + "grad_norm": 2.5914386333247177, + "learning_rate": 1.236465680577051e-06, + "loss": 0.091, + "step": 293 + }, + { + "epoch": 0.06681818181818182, + "grad_norm": 2.1788870392905118, + "learning_rate": 1.2363731599356712e-06, + "loss": 0.1222, + "step": 294 + }, + { + "epoch": 0.06704545454545455, + "grad_norm": 3.3445059388136555, + "learning_rate": 1.2362803276201709e-06, + "loss": 0.1815, + "step": 295 + }, + { + "epoch": 0.06727272727272728, + "grad_norm": 3.6062837096175286, + "learning_rate": 1.2361871836778755e-06, + "loss": 0.0819, + "step": 296 + }, + { + "epoch": 0.0675, + "grad_norm": 3.1353860677297267, + "learning_rate": 1.236093728156269e-06, + "loss": 0.0996, + "step": 297 + }, + { + "epoch": 0.06772727272727273, + "grad_norm": 3.795202115844352, + "learning_rate": 1.2359999611029944e-06, + "loss": 0.1258, + "step": 298 + }, + { + "epoch": 0.06795454545454545, + "grad_norm": 4.4425120996211644, + "learning_rate": 1.2359058825658534e-06, + "loss": 0.1118, + "step": 299 + }, + { + "epoch": 0.06818181818181818, + "grad_norm": 3.060625425498013, + "learning_rate": 1.2358114925928073e-06, + "loss": 0.085, + "step": 300 + }, + { + "epoch": 0.0684090909090909, + "grad_norm": 2.541572581911574, + "learning_rate": 1.2357167912319747e-06, + "loss": 0.0788, + "step": 301 + }, + { + "epoch": 0.06863636363636363, + "grad_norm": 3.611123902709692, + "learning_rate": 1.2356217785316344e-06, + "loss": 0.0738, + "step": 302 + }, + { + "epoch": 0.06886363636363636, + "grad_norm": 2.57826970052294, + "learning_rate": 1.235526454540223e-06, + "loss": 0.1558, + "step": 303 + }, + { + "epoch": 0.06909090909090909, + "grad_norm": 1.405756388923129, + "learning_rate": 1.2354308193063358e-06, + "loss": 0.0558, + "step": 304 + }, + { + "epoch": 0.06931818181818182, + "grad_norm": 4.892966445443843, + "learning_rate": 1.2353348728787274e-06, + "loss": 0.1939, + "step": 305 + }, + { + "epoch": 0.06954545454545455, + "grad_norm": 3.405266279984527, + "learning_rate": 1.2352386153063107e-06, + "loss": 0.0782, + "step": 306 + }, + { + "epoch": 0.06977272727272728, + "grad_norm": 4.65382172183155, + "learning_rate": 1.2351420466381566e-06, + "loss": 0.1097, + "step": 307 + }, + { + "epoch": 0.07, + "grad_norm": 2.6483561501934108, + "learning_rate": 1.235045166923496e-06, + "loss": 0.0952, + "step": 308 + }, + { + "epoch": 0.07022727272727272, + "grad_norm": 2.492520472658006, + "learning_rate": 1.2349479762117171e-06, + "loss": 0.1081, + "step": 309 + }, + { + "epoch": 0.07045454545454545, + "grad_norm": 4.661738484788085, + "learning_rate": 1.2348504745523673e-06, + "loss": 0.2239, + "step": 310 + }, + { + "epoch": 0.07068181818181818, + "grad_norm": 2.836454965603772, + "learning_rate": 1.2347526619951523e-06, + "loss": 0.1213, + "step": 311 + }, + { + "epoch": 0.07090909090909091, + "grad_norm": 4.58311528505282, + "learning_rate": 1.2346545385899358e-06, + "loss": 0.1464, + "step": 312 + }, + { + "epoch": 0.07113636363636364, + "grad_norm": 3.5048453466304923, + "learning_rate": 1.2345561043867413e-06, + "loss": 0.1603, + "step": 313 + }, + { + "epoch": 0.07136363636363637, + "grad_norm": 3.561473338236458, + "learning_rate": 1.2344573594357493e-06, + "loss": 0.0841, + "step": 314 + }, + { + "epoch": 0.0715909090909091, + "grad_norm": 7.099554539977735, + "learning_rate": 1.2343583037872998e-06, + "loss": 0.1376, + "step": 315 + }, + { + "epoch": 0.07181818181818182, + "grad_norm": 2.0454245684933086, + "learning_rate": 1.2342589374918905e-06, + "loss": 0.1454, + "step": 316 + }, + { + "epoch": 0.07204545454545455, + "grad_norm": 2.690682928495695, + "learning_rate": 1.2341592606001777e-06, + "loss": 0.151, + "step": 317 + }, + { + "epoch": 0.07227272727272727, + "grad_norm": 1.9473548981882711, + "learning_rate": 1.2340592731629758e-06, + "loss": 0.0663, + "step": 318 + }, + { + "epoch": 0.0725, + "grad_norm": 1.9817106310961383, + "learning_rate": 1.2339589752312581e-06, + "loss": 0.0755, + "step": 319 + }, + { + "epoch": 0.07272727272727272, + "grad_norm": 3.5702400423961294, + "learning_rate": 1.233858366856156e-06, + "loss": 0.1301, + "step": 320 + }, + { + "epoch": 0.07295454545454545, + "grad_norm": 2.8060422993346554, + "learning_rate": 1.2337574480889585e-06, + "loss": 0.1066, + "step": 321 + }, + { + "epoch": 0.07318181818181818, + "grad_norm": 3.309057271529764, + "learning_rate": 1.2336562189811138e-06, + "loss": 0.0561, + "step": 322 + }, + { + "epoch": 0.07340909090909091, + "grad_norm": 4.600936564276701, + "learning_rate": 1.2335546795842276e-06, + "loss": 0.1313, + "step": 323 + }, + { + "epoch": 0.07363636363636364, + "grad_norm": 5.1141235062756305, + "learning_rate": 1.233452829950064e-06, + "loss": 0.1269, + "step": 324 + }, + { + "epoch": 0.07386363636363637, + "grad_norm": 2.677110953995895, + "learning_rate": 1.2333506701305453e-06, + "loss": 0.0729, + "step": 325 + }, + { + "epoch": 0.0740909090909091, + "grad_norm": 2.7724924929723316, + "learning_rate": 1.2332482001777522e-06, + "loss": 0.1107, + "step": 326 + }, + { + "epoch": 0.07431818181818182, + "grad_norm": 5.232846583422687, + "learning_rate": 1.233145420143923e-06, + "loss": 0.1248, + "step": 327 + }, + { + "epoch": 0.07454545454545454, + "grad_norm": 2.7835450341603054, + "learning_rate": 1.2330423300814542e-06, + "loss": 0.118, + "step": 328 + }, + { + "epoch": 0.07477272727272727, + "grad_norm": 3.274121466447511, + "learning_rate": 1.2329389300429008e-06, + "loss": 0.1509, + "step": 329 + }, + { + "epoch": 0.075, + "grad_norm": 2.8890066579436304, + "learning_rate": 1.232835220080975e-06, + "loss": 0.1061, + "step": 330 + }, + { + "epoch": 0.07522727272727273, + "grad_norm": 1.3461200298680585, + "learning_rate": 1.232731200248548e-06, + "loss": 0.0633, + "step": 331 + }, + { + "epoch": 0.07545454545454545, + "grad_norm": 5.7546563462800595, + "learning_rate": 1.232626870598648e-06, + "loss": 0.1391, + "step": 332 + }, + { + "epoch": 0.07568181818181818, + "grad_norm": 3.0275877351098917, + "learning_rate": 1.2325222311844617e-06, + "loss": 0.1363, + "step": 333 + }, + { + "epoch": 0.07590909090909091, + "grad_norm": 4.366747450337461, + "learning_rate": 1.2324172820593339e-06, + "loss": 0.1209, + "step": 334 + }, + { + "epoch": 0.07613636363636364, + "grad_norm": 3.7087005681939518, + "learning_rate": 1.2323120232767667e-06, + "loss": 0.1229, + "step": 335 + }, + { + "epoch": 0.07636363636363637, + "grad_norm": 3.775294884503768, + "learning_rate": 1.2322064548904202e-06, + "loss": 0.1068, + "step": 336 + }, + { + "epoch": 0.07659090909090908, + "grad_norm": 3.7584262607116914, + "learning_rate": 1.232100576954113e-06, + "loss": 0.0952, + "step": 337 + }, + { + "epoch": 0.07681818181818181, + "grad_norm": 1.7891425379053958, + "learning_rate": 1.2319943895218205e-06, + "loss": 0.0993, + "step": 338 + }, + { + "epoch": 0.07704545454545454, + "grad_norm": 1.827195549561059, + "learning_rate": 1.2318878926476765e-06, + "loss": 0.1345, + "step": 339 + }, + { + "epoch": 0.07727272727272727, + "grad_norm": 2.0304662719547046, + "learning_rate": 1.2317810863859728e-06, + "loss": 0.0881, + "step": 340 + }, + { + "epoch": 0.0775, + "grad_norm": 2.603856208796641, + "learning_rate": 1.231673970791158e-06, + "loss": 0.0989, + "step": 341 + }, + { + "epoch": 0.07772727272727273, + "grad_norm": 2.753960531619496, + "learning_rate": 1.231566545917839e-06, + "loss": 0.1393, + "step": 342 + }, + { + "epoch": 0.07795454545454546, + "grad_norm": 3.670105337059318, + "learning_rate": 1.2314588118207808e-06, + "loss": 0.1411, + "step": 343 + }, + { + "epoch": 0.07818181818181819, + "grad_norm": 3.0453426153803855, + "learning_rate": 1.2313507685549054e-06, + "loss": 0.0669, + "step": 344 + }, + { + "epoch": 0.07840909090909091, + "grad_norm": 3.64975550268497, + "learning_rate": 1.231242416175292e-06, + "loss": 0.0967, + "step": 345 + }, + { + "epoch": 0.07863636363636364, + "grad_norm": 3.7460249859795924, + "learning_rate": 1.2311337547371785e-06, + "loss": 0.1316, + "step": 346 + }, + { + "epoch": 0.07886363636363636, + "grad_norm": 2.5110205934468532, + "learning_rate": 1.2310247842959597e-06, + "loss": 0.1347, + "step": 347 + }, + { + "epoch": 0.07909090909090909, + "grad_norm": 2.578783845598259, + "learning_rate": 1.230915504907188e-06, + "loss": 0.0859, + "step": 348 + }, + { + "epoch": 0.07931818181818182, + "grad_norm": 2.904523661612725, + "learning_rate": 1.2308059166265734e-06, + "loss": 0.1034, + "step": 349 + }, + { + "epoch": 0.07954545454545454, + "grad_norm": 3.8463393430603903, + "learning_rate": 1.2306960195099833e-06, + "loss": 0.0773, + "step": 350 + }, + { + "epoch": 0.07977272727272727, + "grad_norm": 3.4874678834509516, + "learning_rate": 1.2305858136134422e-06, + "loss": 0.1871, + "step": 351 + }, + { + "epoch": 0.08, + "grad_norm": 3.3233687777324112, + "learning_rate": 1.2304752989931327e-06, + "loss": 0.1294, + "step": 352 + }, + { + "epoch": 0.08022727272727273, + "grad_norm": 2.5143806178353683, + "learning_rate": 1.2303644757053945e-06, + "loss": 0.1002, + "step": 353 + }, + { + "epoch": 0.08045454545454546, + "grad_norm": 2.5100887020522977, + "learning_rate": 1.2302533438067247e-06, + "loss": 0.1097, + "step": 354 + }, + { + "epoch": 0.08068181818181819, + "grad_norm": 2.6722235964082803, + "learning_rate": 1.230141903353777e-06, + "loss": 0.0936, + "step": 355 + }, + { + "epoch": 0.0809090909090909, + "grad_norm": 1.7718009120749822, + "learning_rate": 1.2300301544033636e-06, + "loss": 0.0527, + "step": 356 + }, + { + "epoch": 0.08113636363636363, + "grad_norm": 1.915075876701033, + "learning_rate": 1.2299180970124533e-06, + "loss": 0.0978, + "step": 357 + }, + { + "epoch": 0.08136363636363636, + "grad_norm": 2.1280843016236988, + "learning_rate": 1.2298057312381723e-06, + "loss": 0.0686, + "step": 358 + }, + { + "epoch": 0.08159090909090909, + "grad_norm": 4.459716743720089, + "learning_rate": 1.2296930571378035e-06, + "loss": 0.1183, + "step": 359 + }, + { + "epoch": 0.08181818181818182, + "grad_norm": 2.9168537440591225, + "learning_rate": 1.229580074768788e-06, + "loss": 0.1402, + "step": 360 + }, + { + "epoch": 0.08204545454545455, + "grad_norm": 2.38220639956826, + "learning_rate": 1.2294667841887234e-06, + "loss": 0.0956, + "step": 361 + }, + { + "epoch": 0.08227272727272728, + "grad_norm": 3.3941245990914704, + "learning_rate": 1.2293531854553642e-06, + "loss": 0.069, + "step": 362 + }, + { + "epoch": 0.0825, + "grad_norm": 2.6982014435057007, + "learning_rate": 1.2292392786266225e-06, + "loss": 0.1295, + "step": 363 + }, + { + "epoch": 0.08272727272727273, + "grad_norm": 2.5429887774673605, + "learning_rate": 1.2291250637605672e-06, + "loss": 0.0649, + "step": 364 + }, + { + "epoch": 0.08295454545454546, + "grad_norm": 4.541719412333879, + "learning_rate": 1.2290105409154244e-06, + "loss": 0.076, + "step": 365 + }, + { + "epoch": 0.08318181818181818, + "grad_norm": 3.46953041168632, + "learning_rate": 1.2288957101495772e-06, + "loss": 0.0955, + "step": 366 + }, + { + "epoch": 0.0834090909090909, + "grad_norm": 2.833747952271272, + "learning_rate": 1.2287805715215651e-06, + "loss": 0.0979, + "step": 367 + }, + { + "epoch": 0.08363636363636363, + "grad_norm": 2.4688973245010732, + "learning_rate": 1.2286651250900858e-06, + "loss": 0.0702, + "step": 368 + }, + { + "epoch": 0.08386363636363636, + "grad_norm": 3.6174807701051046, + "learning_rate": 1.2285493709139925e-06, + "loss": 0.1192, + "step": 369 + }, + { + "epoch": 0.08409090909090909, + "grad_norm": 3.1746195503233454, + "learning_rate": 1.2284333090522962e-06, + "loss": 0.0805, + "step": 370 + }, + { + "epoch": 0.08431818181818182, + "grad_norm": 3.6365274354450974, + "learning_rate": 1.2283169395641647e-06, + "loss": 0.0965, + "step": 371 + }, + { + "epoch": 0.08454545454545455, + "grad_norm": 3.1701725473834976, + "learning_rate": 1.228200262508922e-06, + "loss": 0.1219, + "step": 372 + }, + { + "epoch": 0.08477272727272728, + "grad_norm": 6.611704569027101, + "learning_rate": 1.2280832779460494e-06, + "loss": 0.1599, + "step": 373 + }, + { + "epoch": 0.085, + "grad_norm": 2.3590785477124863, + "learning_rate": 1.2279659859351853e-06, + "loss": 0.1025, + "step": 374 + }, + { + "epoch": 0.08522727272727272, + "grad_norm": 4.919619134503741, + "learning_rate": 1.2278483865361239e-06, + "loss": 0.1255, + "step": 375 + }, + { + "epoch": 0.08545454545454545, + "grad_norm": 1.6390680275869847, + "learning_rate": 1.227730479808817e-06, + "loss": 0.037, + "step": 376 + }, + { + "epoch": 0.08568181818181818, + "grad_norm": 3.7054155554055317, + "learning_rate": 1.2276122658133723e-06, + "loss": 0.0795, + "step": 377 + }, + { + "epoch": 0.08590909090909091, + "grad_norm": 3.3710877295891146, + "learning_rate": 1.2274937446100548e-06, + "loss": 0.1017, + "step": 378 + }, + { + "epoch": 0.08613636363636364, + "grad_norm": 4.049749549786513, + "learning_rate": 1.227374916259286e-06, + "loss": 0.0977, + "step": 379 + }, + { + "epoch": 0.08636363636363636, + "grad_norm": 2.8237932036299034, + "learning_rate": 1.2272557808216433e-06, + "loss": 0.082, + "step": 380 + }, + { + "epoch": 0.0865909090909091, + "grad_norm": 6.472982642514373, + "learning_rate": 1.2271363383578619e-06, + "loss": 0.1111, + "step": 381 + }, + { + "epoch": 0.08681818181818182, + "grad_norm": 2.872276084205482, + "learning_rate": 1.2270165889288325e-06, + "loss": 0.1416, + "step": 382 + }, + { + "epoch": 0.08704545454545455, + "grad_norm": 3.1121878136277763, + "learning_rate": 1.2268965325956022e-06, + "loss": 0.0709, + "step": 383 + }, + { + "epoch": 0.08727272727272728, + "grad_norm": 4.289430038043764, + "learning_rate": 1.2267761694193752e-06, + "loss": 0.1046, + "step": 384 + }, + { + "epoch": 0.0875, + "grad_norm": 3.7226223021658016, + "learning_rate": 1.2266554994615121e-06, + "loss": 0.0881, + "step": 385 + }, + { + "epoch": 0.08772727272727272, + "grad_norm": 2.432238318963267, + "learning_rate": 1.2265345227835295e-06, + "loss": 0.1079, + "step": 386 + }, + { + "epoch": 0.08795454545454545, + "grad_norm": 1.7074767331576395, + "learning_rate": 1.2264132394471007e-06, + "loss": 0.0657, + "step": 387 + }, + { + "epoch": 0.08818181818181818, + "grad_norm": 1.9745542140283514, + "learning_rate": 1.226291649514055e-06, + "loss": 0.0548, + "step": 388 + }, + { + "epoch": 0.08840909090909091, + "grad_norm": 3.486509875354997, + "learning_rate": 1.226169753046378e-06, + "loss": 0.1422, + "step": 389 + }, + { + "epoch": 0.08863636363636364, + "grad_norm": 2.105720275426089, + "learning_rate": 1.2260475501062121e-06, + "loss": 0.0753, + "step": 390 + }, + { + "epoch": 0.08886363636363637, + "grad_norm": 1.6908207330851532, + "learning_rate": 1.2259250407558553e-06, + "loss": 0.0423, + "step": 391 + }, + { + "epoch": 0.0890909090909091, + "grad_norm": 1.9536413900829848, + "learning_rate": 1.2258022250577622e-06, + "loss": 0.0651, + "step": 392 + }, + { + "epoch": 0.08931818181818182, + "grad_norm": 2.107555535166247, + "learning_rate": 1.2256791030745434e-06, + "loss": 0.0767, + "step": 393 + }, + { + "epoch": 0.08954545454545454, + "grad_norm": 3.2413360732050918, + "learning_rate": 1.225555674868966e-06, + "loss": 0.1408, + "step": 394 + }, + { + "epoch": 0.08977272727272727, + "grad_norm": 5.098748129148887, + "learning_rate": 1.2254319405039524e-06, + "loss": 0.148, + "step": 395 + }, + { + "epoch": 0.09, + "grad_norm": 2.9137716556988416, + "learning_rate": 1.2253079000425818e-06, + "loss": 0.0858, + "step": 396 + }, + { + "epoch": 0.09022727272727273, + "grad_norm": 2.306099527359283, + "learning_rate": 1.2251835535480895e-06, + "loss": 0.1314, + "step": 397 + }, + { + "epoch": 0.09045454545454545, + "grad_norm": 3.233107573064104, + "learning_rate": 1.2250589010838662e-06, + "loss": 0.1313, + "step": 398 + }, + { + "epoch": 0.09068181818181818, + "grad_norm": 3.6199998689552584, + "learning_rate": 1.224933942713459e-06, + "loss": 0.1645, + "step": 399 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 2.8815298806137113, + "learning_rate": 1.2248086785005709e-06, + "loss": 0.0814, + "step": 400 + }, + { + "epoch": 0.09113636363636364, + "grad_norm": 2.12725818411472, + "learning_rate": 1.2246831085090611e-06, + "loss": 0.0677, + "step": 401 + }, + { + "epoch": 0.09136363636363637, + "grad_norm": 2.068540185403787, + "learning_rate": 1.2245572328029438e-06, + "loss": 0.0766, + "step": 402 + }, + { + "epoch": 0.0915909090909091, + "grad_norm": 3.3787139639022477, + "learning_rate": 1.22443105144639e-06, + "loss": 0.1528, + "step": 403 + }, + { + "epoch": 0.09181818181818181, + "grad_norm": 3.6734313614551706, + "learning_rate": 1.224304564503726e-06, + "loss": 0.2006, + "step": 404 + }, + { + "epoch": 0.09204545454545454, + "grad_norm": 2.3773106183493704, + "learning_rate": 1.224177772039434e-06, + "loss": 0.0819, + "step": 405 + }, + { + "epoch": 0.09227272727272727, + "grad_norm": 4.479594338984166, + "learning_rate": 1.2240506741181523e-06, + "loss": 0.1965, + "step": 406 + }, + { + "epoch": 0.0925, + "grad_norm": 4.599512341378346, + "learning_rate": 1.2239232708046745e-06, + "loss": 0.117, + "step": 407 + }, + { + "epoch": 0.09272727272727273, + "grad_norm": 4.65201743851853, + "learning_rate": 1.2237955621639496e-06, + "loss": 0.2369, + "step": 408 + }, + { + "epoch": 0.09295454545454546, + "grad_norm": 2.7824021376543677, + "learning_rate": 1.223667548261083e-06, + "loss": 0.0941, + "step": 409 + }, + { + "epoch": 0.09318181818181819, + "grad_norm": 4.289778277836561, + "learning_rate": 1.2235392291613353e-06, + "loss": 0.1326, + "step": 410 + }, + { + "epoch": 0.09340909090909091, + "grad_norm": 3.9360665164441055, + "learning_rate": 1.2234106049301228e-06, + "loss": 0.1095, + "step": 411 + }, + { + "epoch": 0.09363636363636364, + "grad_norm": 4.915602973483901, + "learning_rate": 1.2232816756330173e-06, + "loss": 0.1522, + "step": 412 + }, + { + "epoch": 0.09386363636363636, + "grad_norm": 3.440478039172773, + "learning_rate": 1.223152441335746e-06, + "loss": 0.1086, + "step": 413 + }, + { + "epoch": 0.09409090909090909, + "grad_norm": 3.163613020768659, + "learning_rate": 1.223022902104192e-06, + "loss": 0.1091, + "step": 414 + }, + { + "epoch": 0.09431818181818181, + "grad_norm": 2.7771510795806793, + "learning_rate": 1.2228930580043931e-06, + "loss": 0.0769, + "step": 415 + }, + { + "epoch": 0.09454545454545454, + "grad_norm": 4.646937062796584, + "learning_rate": 1.2227629091025437e-06, + "loss": 0.1422, + "step": 416 + }, + { + "epoch": 0.09477272727272727, + "grad_norm": 4.410836996121542, + "learning_rate": 1.2226324554649921e-06, + "loss": 0.1196, + "step": 417 + }, + { + "epoch": 0.095, + "grad_norm": 2.5231288313876727, + "learning_rate": 1.222501697158243e-06, + "loss": 0.0799, + "step": 418 + }, + { + "epoch": 0.09522727272727273, + "grad_norm": 3.360112046907082, + "learning_rate": 1.2223706342489565e-06, + "loss": 0.0994, + "step": 419 + }, + { + "epoch": 0.09545454545454546, + "grad_norm": 4.946888659745919, + "learning_rate": 1.222239266803947e-06, + "loss": 0.1911, + "step": 420 + }, + { + "epoch": 0.09568181818181819, + "grad_norm": 1.4239021451720755, + "learning_rate": 1.2221075948901856e-06, + "loss": 0.0615, + "step": 421 + }, + { + "epoch": 0.0959090909090909, + "grad_norm": 2.7296283333268834, + "learning_rate": 1.221975618574797e-06, + "loss": 0.1013, + "step": 422 + }, + { + "epoch": 0.09613636363636363, + "grad_norm": 4.974673641627054, + "learning_rate": 1.2218433379250623e-06, + "loss": 0.129, + "step": 423 + }, + { + "epoch": 0.09636363636363636, + "grad_norm": 2.6052782154944776, + "learning_rate": 1.2217107530084174e-06, + "loss": 0.1174, + "step": 424 + }, + { + "epoch": 0.09659090909090909, + "grad_norm": 2.4871443771102544, + "learning_rate": 1.2215778638924527e-06, + "loss": 0.0799, + "step": 425 + }, + { + "epoch": 0.09681818181818182, + "grad_norm": 3.86113905482254, + "learning_rate": 1.221444670644915e-06, + "loss": 0.0932, + "step": 426 + }, + { + "epoch": 0.09704545454545455, + "grad_norm": 4.279554503418284, + "learning_rate": 1.221311173333705e-06, + "loss": 0.101, + "step": 427 + }, + { + "epoch": 0.09727272727272727, + "grad_norm": 1.7158239249526541, + "learning_rate": 1.2211773720268784e-06, + "loss": 0.0557, + "step": 428 + }, + { + "epoch": 0.0975, + "grad_norm": 1.9719435011250115, + "learning_rate": 1.2210432667926467e-06, + "loss": 0.0636, + "step": 429 + }, + { + "epoch": 0.09772727272727273, + "grad_norm": 2.800044602484537, + "learning_rate": 1.2209088576993757e-06, + "loss": 0.1438, + "step": 430 + }, + { + "epoch": 0.09795454545454546, + "grad_norm": 2.2459968059258495, + "learning_rate": 1.2207741448155867e-06, + "loss": 0.0585, + "step": 431 + }, + { + "epoch": 0.09818181818181818, + "grad_norm": 2.737923788075119, + "learning_rate": 1.220639128209955e-06, + "loss": 0.1149, + "step": 432 + }, + { + "epoch": 0.0984090909090909, + "grad_norm": 2.160504662100839, + "learning_rate": 1.2205038079513113e-06, + "loss": 0.0957, + "step": 433 + }, + { + "epoch": 0.09863636363636363, + "grad_norm": 2.3449320078814058, + "learning_rate": 1.2203681841086409e-06, + "loss": 0.0677, + "step": 434 + }, + { + "epoch": 0.09886363636363636, + "grad_norm": 1.7561253489930528, + "learning_rate": 1.2202322567510843e-06, + "loss": 0.0613, + "step": 435 + }, + { + "epoch": 0.09909090909090909, + "grad_norm": 4.3714833019328845, + "learning_rate": 1.2200960259479362e-06, + "loss": 0.1448, + "step": 436 + }, + { + "epoch": 0.09931818181818182, + "grad_norm": 5.241134950935839, + "learning_rate": 1.219959491768646e-06, + "loss": 0.1817, + "step": 437 + }, + { + "epoch": 0.09954545454545455, + "grad_norm": 4.511533086825861, + "learning_rate": 1.2198226542828183e-06, + "loss": 0.1153, + "step": 438 + }, + { + "epoch": 0.09977272727272728, + "grad_norm": 1.9872464735723243, + "learning_rate": 1.219685513560212e-06, + "loss": 0.067, + "step": 439 + }, + { + "epoch": 0.1, + "grad_norm": 1.966477682233784, + "learning_rate": 1.2195480696707401e-06, + "loss": 0.0565, + "step": 440 + }, + { + "epoch": 0.10022727272727272, + "grad_norm": 4.920514774069939, + "learning_rate": 1.2194103226844711e-06, + "loss": 0.2292, + "step": 441 + }, + { + "epoch": 0.10045454545454545, + "grad_norm": 3.3871390478027137, + "learning_rate": 1.2192722726716272e-06, + "loss": 0.1924, + "step": 442 + }, + { + "epoch": 0.10068181818181818, + "grad_norm": 2.0351508051987826, + "learning_rate": 1.2191339197025857e-06, + "loss": 0.0862, + "step": 443 + }, + { + "epoch": 0.1009090909090909, + "grad_norm": 2.88571125465246, + "learning_rate": 1.2189952638478778e-06, + "loss": 0.1076, + "step": 444 + }, + { + "epoch": 0.10113636363636364, + "grad_norm": 3.9220428041413, + "learning_rate": 1.2188563051781894e-06, + "loss": 0.0921, + "step": 445 + }, + { + "epoch": 0.10136363636363636, + "grad_norm": 2.308630021933964, + "learning_rate": 1.2187170437643608e-06, + "loss": 0.0786, + "step": 446 + }, + { + "epoch": 0.10159090909090909, + "grad_norm": 1.7021008056728288, + "learning_rate": 1.2185774796773864e-06, + "loss": 0.0489, + "step": 447 + }, + { + "epoch": 0.10181818181818182, + "grad_norm": 2.8366182130249156, + "learning_rate": 1.2184376129884156e-06, + "loss": 0.0711, + "step": 448 + }, + { + "epoch": 0.10204545454545455, + "grad_norm": 2.494654489597956, + "learning_rate": 1.2182974437687512e-06, + "loss": 0.077, + "step": 449 + }, + { + "epoch": 0.10227272727272728, + "grad_norm": 1.8789971456688084, + "learning_rate": 1.2181569720898503e-06, + "loss": 0.0603, + "step": 450 + }, + { + "epoch": 0.1025, + "grad_norm": 7.572598925909453, + "learning_rate": 1.2180161980233249e-06, + "loss": 0.2909, + "step": 451 + }, + { + "epoch": 0.10272727272727272, + "grad_norm": 3.251456224316626, + "learning_rate": 1.2178751216409404e-06, + "loss": 0.1156, + "step": 452 + }, + { + "epoch": 0.10295454545454545, + "grad_norm": 2.8783591202595775, + "learning_rate": 1.217733743014617e-06, + "loss": 0.0812, + "step": 453 + }, + { + "epoch": 0.10318181818181818, + "grad_norm": 1.779008544511726, + "learning_rate": 1.2175920622164284e-06, + "loss": 0.0822, + "step": 454 + }, + { + "epoch": 0.10340909090909091, + "grad_norm": 3.151791231016802, + "learning_rate": 1.2174500793186024e-06, + "loss": 0.1666, + "step": 455 + }, + { + "epoch": 0.10363636363636364, + "grad_norm": 6.270849489622784, + "learning_rate": 1.2173077943935212e-06, + "loss": 0.2308, + "step": 456 + }, + { + "epoch": 0.10386363636363637, + "grad_norm": 3.9063075477089595, + "learning_rate": 1.2171652075137209e-06, + "loss": 0.0935, + "step": 457 + }, + { + "epoch": 0.1040909090909091, + "grad_norm": 3.1178755438985934, + "learning_rate": 1.2170223187518908e-06, + "loss": 0.1215, + "step": 458 + }, + { + "epoch": 0.10431818181818182, + "grad_norm": 3.108509861096335, + "learning_rate": 1.216879128180875e-06, + "loss": 0.0726, + "step": 459 + }, + { + "epoch": 0.10454545454545454, + "grad_norm": 2.471974295276308, + "learning_rate": 1.2167356358736714e-06, + "loss": 0.0618, + "step": 460 + }, + { + "epoch": 0.10477272727272727, + "grad_norm": 2.0732406776270715, + "learning_rate": 1.2165918419034312e-06, + "loss": 0.0728, + "step": 461 + }, + { + "epoch": 0.105, + "grad_norm": 3.043747201893837, + "learning_rate": 1.2164477463434599e-06, + "loss": 0.1112, + "step": 462 + }, + { + "epoch": 0.10522727272727272, + "grad_norm": 3.2055267418794124, + "learning_rate": 1.216303349267216e-06, + "loss": 0.0791, + "step": 463 + }, + { + "epoch": 0.10545454545454545, + "grad_norm": 3.383368019795033, + "learning_rate": 1.2161586507483126e-06, + "loss": 0.1577, + "step": 464 + }, + { + "epoch": 0.10568181818181818, + "grad_norm": 3.9789816443070873, + "learning_rate": 1.2160136508605156e-06, + "loss": 0.0989, + "step": 465 + }, + { + "epoch": 0.10590909090909091, + "grad_norm": 2.500431701045704, + "learning_rate": 1.2158683496777457e-06, + "loss": 0.0839, + "step": 466 + }, + { + "epoch": 0.10613636363636364, + "grad_norm": 2.657039985296244, + "learning_rate": 1.215722747274076e-06, + "loss": 0.0796, + "step": 467 + }, + { + "epoch": 0.10636363636363637, + "grad_norm": 2.4937388081922007, + "learning_rate": 1.2155768437237342e-06, + "loss": 0.1086, + "step": 468 + }, + { + "epoch": 0.1065909090909091, + "grad_norm": 2.5615750491399636, + "learning_rate": 1.2154306391011003e-06, + "loss": 0.1228, + "step": 469 + }, + { + "epoch": 0.10681818181818181, + "grad_norm": 2.1009193972019267, + "learning_rate": 1.215284133480709e-06, + "loss": 0.0656, + "step": 470 + }, + { + "epoch": 0.10704545454545454, + "grad_norm": 4.025654861290522, + "learning_rate": 1.2151373269372476e-06, + "loss": 0.1972, + "step": 471 + }, + { + "epoch": 0.10727272727272727, + "grad_norm": 1.7091636419981506, + "learning_rate": 1.2149902195455574e-06, + "loss": 0.0759, + "step": 472 + }, + { + "epoch": 0.1075, + "grad_norm": 2.4698413978210603, + "learning_rate": 1.2148428113806326e-06, + "loss": 0.1257, + "step": 473 + }, + { + "epoch": 0.10772727272727273, + "grad_norm": 2.882577018820717, + "learning_rate": 1.214695102517621e-06, + "loss": 0.1052, + "step": 474 + }, + { + "epoch": 0.10795454545454546, + "grad_norm": 5.414616020041416, + "learning_rate": 1.214547093031824e-06, + "loss": 0.1681, + "step": 475 + }, + { + "epoch": 0.10818181818181818, + "grad_norm": 3.2634120664356967, + "learning_rate": 1.2143987829986953e-06, + "loss": 0.1293, + "step": 476 + }, + { + "epoch": 0.10840909090909091, + "grad_norm": 2.2416444836113842, + "learning_rate": 1.2142501724938425e-06, + "loss": 0.1309, + "step": 477 + }, + { + "epoch": 0.10863636363636364, + "grad_norm": 1.466887546469956, + "learning_rate": 1.2141012615930266e-06, + "loss": 0.0467, + "step": 478 + }, + { + "epoch": 0.10886363636363636, + "grad_norm": 3.2563521071124, + "learning_rate": 1.2139520503721614e-06, + "loss": 0.2101, + "step": 479 + }, + { + "epoch": 0.10909090909090909, + "grad_norm": 3.873460990896338, + "learning_rate": 1.2138025389073133e-06, + "loss": 0.1078, + "step": 480 + }, + { + "epoch": 0.10931818181818181, + "grad_norm": 3.8669089729214634, + "learning_rate": 1.2136527272747032e-06, + "loss": 0.0754, + "step": 481 + }, + { + "epoch": 0.10954545454545454, + "grad_norm": 6.2375879337062035, + "learning_rate": 1.2135026155507033e-06, + "loss": 0.1833, + "step": 482 + }, + { + "epoch": 0.10977272727272727, + "grad_norm": 3.3459487445038585, + "learning_rate": 1.2133522038118398e-06, + "loss": 0.0923, + "step": 483 + }, + { + "epoch": 0.11, + "grad_norm": 4.466574087179955, + "learning_rate": 1.2132014921347917e-06, + "loss": 0.1096, + "step": 484 + }, + { + "epoch": 0.11022727272727273, + "grad_norm": 2.167393131020655, + "learning_rate": 1.213050480596391e-06, + "loss": 0.1058, + "step": 485 + }, + { + "epoch": 0.11045454545454546, + "grad_norm": 2.8914726679452816, + "learning_rate": 1.2128991692736223e-06, + "loss": 0.0797, + "step": 486 + }, + { + "epoch": 0.11068181818181819, + "grad_norm": 2.1755687565891066, + "learning_rate": 1.2127475582436232e-06, + "loss": 0.079, + "step": 487 + }, + { + "epoch": 0.11090909090909092, + "grad_norm": 3.492802551825463, + "learning_rate": 1.2125956475836837e-06, + "loss": 0.138, + "step": 488 + }, + { + "epoch": 0.11113636363636363, + "grad_norm": 1.7902508501797942, + "learning_rate": 1.2124434373712473e-06, + "loss": 0.0908, + "step": 489 + }, + { + "epoch": 0.11136363636363636, + "grad_norm": 4.034809847891115, + "learning_rate": 1.2122909276839095e-06, + "loss": 0.1272, + "step": 490 + }, + { + "epoch": 0.11159090909090909, + "grad_norm": 3.1685367135864575, + "learning_rate": 1.2121381185994192e-06, + "loss": 0.0878, + "step": 491 + }, + { + "epoch": 0.11181818181818182, + "grad_norm": 1.589573409739006, + "learning_rate": 1.211985010195677e-06, + "loss": 0.0479, + "step": 492 + }, + { + "epoch": 0.11204545454545455, + "grad_norm": 2.504739016351825, + "learning_rate": 1.2118316025507369e-06, + "loss": 0.0802, + "step": 493 + }, + { + "epoch": 0.11227272727272727, + "grad_norm": 1.9540195021173163, + "learning_rate": 1.2116778957428046e-06, + "loss": 0.1181, + "step": 494 + }, + { + "epoch": 0.1125, + "grad_norm": 3.3334987373499483, + "learning_rate": 1.2115238898502395e-06, + "loss": 0.0742, + "step": 495 + }, + { + "epoch": 0.11272727272727273, + "grad_norm": 3.025530391416926, + "learning_rate": 1.2113695849515527e-06, + "loss": 0.192, + "step": 496 + }, + { + "epoch": 0.11295454545454546, + "grad_norm": 6.741411762616305, + "learning_rate": 1.2112149811254076e-06, + "loss": 0.1643, + "step": 497 + }, + { + "epoch": 0.11318181818181818, + "grad_norm": 2.9075771835692916, + "learning_rate": 1.2110600784506202e-06, + "loss": 0.0944, + "step": 498 + }, + { + "epoch": 0.1134090909090909, + "grad_norm": 2.3717464191846527, + "learning_rate": 1.2109048770061593e-06, + "loss": 0.0694, + "step": 499 + }, + { + "epoch": 0.11363636363636363, + "grad_norm": 2.831854491797305, + "learning_rate": 1.2107493768711453e-06, + "loss": 0.0985, + "step": 500 + }, + { + "epoch": 0.11386363636363636, + "grad_norm": 1.9860156119638017, + "learning_rate": 1.2105935781248513e-06, + "loss": 0.0867, + "step": 501 + }, + { + "epoch": 0.11409090909090909, + "grad_norm": 5.228951709825794, + "learning_rate": 1.2104374808467023e-06, + "loss": 0.2457, + "step": 502 + }, + { + "epoch": 0.11431818181818182, + "grad_norm": 3.510768700023396, + "learning_rate": 1.2102810851162758e-06, + "loss": 0.1022, + "step": 503 + }, + { + "epoch": 0.11454545454545455, + "grad_norm": 3.5140110574002477, + "learning_rate": 1.2101243910133013e-06, + "loss": 0.0812, + "step": 504 + }, + { + "epoch": 0.11477272727272728, + "grad_norm": 2.554267446591213, + "learning_rate": 1.2099673986176604e-06, + "loss": 0.136, + "step": 505 + }, + { + "epoch": 0.115, + "grad_norm": 2.518498587543057, + "learning_rate": 1.2098101080093873e-06, + "loss": 0.1217, + "step": 506 + }, + { + "epoch": 0.11522727272727273, + "grad_norm": 3.8573725853503675, + "learning_rate": 1.2096525192686673e-06, + "loss": 0.2324, + "step": 507 + }, + { + "epoch": 0.11545454545454545, + "grad_norm": 5.743189687413489, + "learning_rate": 1.209494632475838e-06, + "loss": 0.1057, + "step": 508 + }, + { + "epoch": 0.11568181818181818, + "grad_norm": 2.959400513674917, + "learning_rate": 1.2093364477113893e-06, + "loss": 0.0781, + "step": 509 + }, + { + "epoch": 0.1159090909090909, + "grad_norm": 4.277580657761236, + "learning_rate": 1.2091779650559628e-06, + "loss": 0.0977, + "step": 510 + }, + { + "epoch": 0.11613636363636363, + "grad_norm": 2.517412718608715, + "learning_rate": 1.209019184590352e-06, + "loss": 0.0927, + "step": 511 + }, + { + "epoch": 0.11636363636363636, + "grad_norm": 4.4850972401394715, + "learning_rate": 1.2088601063955018e-06, + "loss": 0.1107, + "step": 512 + }, + { + "epoch": 0.11659090909090909, + "grad_norm": 4.277166175095006, + "learning_rate": 1.20870073055251e-06, + "loss": 0.1196, + "step": 513 + }, + { + "epoch": 0.11681818181818182, + "grad_norm": 3.3188059291020373, + "learning_rate": 1.2085410571426244e-06, + "loss": 0.1557, + "step": 514 + }, + { + "epoch": 0.11704545454545455, + "grad_norm": 2.835621253981015, + "learning_rate": 1.208381086247246e-06, + "loss": 0.1512, + "step": 515 + }, + { + "epoch": 0.11727272727272728, + "grad_norm": 2.5522170088661316, + "learning_rate": 1.2082208179479272e-06, + "loss": 0.1309, + "step": 516 + }, + { + "epoch": 0.1175, + "grad_norm": 5.645393854993232, + "learning_rate": 1.2080602523263715e-06, + "loss": 0.1793, + "step": 517 + }, + { + "epoch": 0.11772727272727272, + "grad_norm": 2.902480831513025, + "learning_rate": 1.207899389464434e-06, + "loss": 0.0973, + "step": 518 + }, + { + "epoch": 0.11795454545454545, + "grad_norm": 3.0474449053407677, + "learning_rate": 1.2077382294441218e-06, + "loss": 0.0922, + "step": 519 + }, + { + "epoch": 0.11818181818181818, + "grad_norm": 2.1972480331566917, + "learning_rate": 1.2075767723475932e-06, + "loss": 0.0479, + "step": 520 + }, + { + "epoch": 0.11840909090909091, + "grad_norm": 4.718271313739516, + "learning_rate": 1.2074150182571579e-06, + "loss": 0.1399, + "step": 521 + }, + { + "epoch": 0.11863636363636364, + "grad_norm": 3.1548176286505325, + "learning_rate": 1.2072529672552771e-06, + "loss": 0.1138, + "step": 522 + }, + { + "epoch": 0.11886363636363637, + "grad_norm": 3.5207935721656973, + "learning_rate": 1.2070906194245634e-06, + "loss": 0.1088, + "step": 523 + }, + { + "epoch": 0.1190909090909091, + "grad_norm": 3.0581878473011614, + "learning_rate": 1.2069279748477812e-06, + "loss": 0.0786, + "step": 524 + }, + { + "epoch": 0.11931818181818182, + "grad_norm": 2.5403240998391845, + "learning_rate": 1.206765033607845e-06, + "loss": 0.1118, + "step": 525 + }, + { + "epoch": 0.11954545454545455, + "grad_norm": 2.581209186135896, + "learning_rate": 1.2066017957878212e-06, + "loss": 0.1475, + "step": 526 + }, + { + "epoch": 0.11977272727272727, + "grad_norm": 2.7978694227102583, + "learning_rate": 1.2064382614709276e-06, + "loss": 0.1583, + "step": 527 + }, + { + "epoch": 0.12, + "grad_norm": 3.230977957726528, + "learning_rate": 1.206274430740533e-06, + "loss": 0.1672, + "step": 528 + }, + { + "epoch": 0.12022727272727272, + "grad_norm": 4.19842301957619, + "learning_rate": 1.2061103036801573e-06, + "loss": 0.1482, + "step": 529 + }, + { + "epoch": 0.12045454545454545, + "grad_norm": 5.770691163267776, + "learning_rate": 1.2059458803734712e-06, + "loss": 0.1298, + "step": 530 + }, + { + "epoch": 0.12068181818181818, + "grad_norm": 3.972768667356933, + "learning_rate": 1.2057811609042968e-06, + "loss": 0.0998, + "step": 531 + }, + { + "epoch": 0.12090909090909091, + "grad_norm": 5.080522193929101, + "learning_rate": 1.205616145356607e-06, + "loss": 0.1786, + "step": 532 + }, + { + "epoch": 0.12113636363636364, + "grad_norm": 3.6416215519669968, + "learning_rate": 1.2054508338145257e-06, + "loss": 0.1879, + "step": 533 + }, + { + "epoch": 0.12136363636363637, + "grad_norm": 3.3554603219618446, + "learning_rate": 1.2052852263623274e-06, + "loss": 0.1129, + "step": 534 + }, + { + "epoch": 0.1215909090909091, + "grad_norm": 2.9806298846905546, + "learning_rate": 1.2051193230844382e-06, + "loss": 0.0715, + "step": 535 + }, + { + "epoch": 0.12181818181818181, + "grad_norm": 3.233094123117659, + "learning_rate": 1.2049531240654343e-06, + "loss": 0.1178, + "step": 536 + }, + { + "epoch": 0.12204545454545454, + "grad_norm": 3.708896515288204, + "learning_rate": 1.2047866293900428e-06, + "loss": 0.1783, + "step": 537 + }, + { + "epoch": 0.12227272727272727, + "grad_norm": 2.6602901122102716, + "learning_rate": 1.2046198391431415e-06, + "loss": 0.1418, + "step": 538 + }, + { + "epoch": 0.1225, + "grad_norm": 4.669634772140572, + "learning_rate": 1.2044527534097595e-06, + "loss": 0.173, + "step": 539 + }, + { + "epoch": 0.12272727272727273, + "grad_norm": 1.5028169649693566, + "learning_rate": 1.2042853722750756e-06, + "loss": 0.039, + "step": 540 + }, + { + "epoch": 0.12295454545454546, + "grad_norm": 2.904497335692425, + "learning_rate": 1.2041176958244197e-06, + "loss": 0.0888, + "step": 541 + }, + { + "epoch": 0.12318181818181818, + "grad_norm": 2.6451929963797713, + "learning_rate": 1.2039497241432724e-06, + "loss": 0.0682, + "step": 542 + }, + { + "epoch": 0.12340909090909091, + "grad_norm": 3.7170549013934435, + "learning_rate": 1.2037814573172642e-06, + "loss": 0.1411, + "step": 543 + }, + { + "epoch": 0.12363636363636364, + "grad_norm": 2.474660313130358, + "learning_rate": 1.2036128954321768e-06, + "loss": 0.0947, + "step": 544 + }, + { + "epoch": 0.12386363636363637, + "grad_norm": 3.1889690497542715, + "learning_rate": 1.2034440385739418e-06, + "loss": 0.0817, + "step": 545 + }, + { + "epoch": 0.12409090909090909, + "grad_norm": 4.537820626453625, + "learning_rate": 1.2032748868286415e-06, + "loss": 0.2305, + "step": 546 + }, + { + "epoch": 0.12431818181818181, + "grad_norm": 3.4406941290200383, + "learning_rate": 1.2031054402825082e-06, + "loss": 0.0741, + "step": 547 + }, + { + "epoch": 0.12454545454545454, + "grad_norm": 2.195149120400146, + "learning_rate": 1.2029356990219248e-06, + "loss": 0.1076, + "step": 548 + }, + { + "epoch": 0.12477272727272727, + "grad_norm": 2.060858796740431, + "learning_rate": 1.2027656631334242e-06, + "loss": 0.1402, + "step": 549 + }, + { + "epoch": 0.125, + "grad_norm": 2.8721644779044535, + "learning_rate": 1.2025953327036897e-06, + "loss": 0.1092, + "step": 550 + }, + { + "epoch": 0.12522727272727271, + "grad_norm": 3.367360775893672, + "learning_rate": 1.2024247078195542e-06, + "loss": 0.1058, + "step": 551 + }, + { + "epoch": 0.12545454545454546, + "grad_norm": 7.355200822564602, + "learning_rate": 1.202253788568002e-06, + "loss": 0.1409, + "step": 552 + }, + { + "epoch": 0.12568181818181817, + "grad_norm": 3.223527339373556, + "learning_rate": 1.202082575036166e-06, + "loss": 0.1908, + "step": 553 + }, + { + "epoch": 0.12590909090909091, + "grad_norm": 2.5935790347418846, + "learning_rate": 1.2019110673113302e-06, + "loss": 0.126, + "step": 554 + }, + { + "epoch": 0.12613636363636363, + "grad_norm": 2.6788096848560268, + "learning_rate": 1.2017392654809278e-06, + "loss": 0.0734, + "step": 555 + }, + { + "epoch": 0.12636363636363637, + "grad_norm": 2.3283744556449935, + "learning_rate": 1.2015671696325423e-06, + "loss": 0.1331, + "step": 556 + }, + { + "epoch": 0.1265909090909091, + "grad_norm": 5.27073941861621, + "learning_rate": 1.2013947798539073e-06, + "loss": 0.0878, + "step": 557 + }, + { + "epoch": 0.12681818181818183, + "grad_norm": 2.606289849041831, + "learning_rate": 1.2012220962329058e-06, + "loss": 0.0542, + "step": 558 + }, + { + "epoch": 0.12704545454545454, + "grad_norm": 6.526314652415761, + "learning_rate": 1.201049118857571e-06, + "loss": 0.233, + "step": 559 + }, + { + "epoch": 0.12727272727272726, + "grad_norm": 2.2175172650303594, + "learning_rate": 1.2008758478160853e-06, + "loss": 0.0774, + "step": 560 + }, + { + "epoch": 0.1275, + "grad_norm": 3.6763556159095567, + "learning_rate": 1.2007022831967813e-06, + "loss": 0.1373, + "step": 561 + }, + { + "epoch": 0.12772727272727272, + "grad_norm": 2.8295415297405615, + "learning_rate": 1.2005284250881417e-06, + "loss": 0.1133, + "step": 562 + }, + { + "epoch": 0.12795454545454546, + "grad_norm": 2.974517601517034, + "learning_rate": 1.2003542735787973e-06, + "loss": 0.1333, + "step": 563 + }, + { + "epoch": 0.12818181818181817, + "grad_norm": 2.715704825781891, + "learning_rate": 1.20017982875753e-06, + "loss": 0.0901, + "step": 564 + }, + { + "epoch": 0.12840909090909092, + "grad_norm": 1.9739259407297394, + "learning_rate": 1.2000050907132705e-06, + "loss": 0.067, + "step": 565 + }, + { + "epoch": 0.12863636363636363, + "grad_norm": 2.906735651308808, + "learning_rate": 1.1998300595350993e-06, + "loss": 0.1435, + "step": 566 + }, + { + "epoch": 0.12886363636363637, + "grad_norm": 3.1949368752448484, + "learning_rate": 1.1996547353122461e-06, + "loss": 0.0796, + "step": 567 + }, + { + "epoch": 0.1290909090909091, + "grad_norm": 3.8738935296067205, + "learning_rate": 1.1994791181340897e-06, + "loss": 0.14, + "step": 568 + }, + { + "epoch": 0.1293181818181818, + "grad_norm": 2.9128094894337924, + "learning_rate": 1.1993032080901593e-06, + "loss": 0.1087, + "step": 569 + }, + { + "epoch": 0.12954545454545455, + "grad_norm": 2.1283511235172416, + "learning_rate": 1.1991270052701323e-06, + "loss": 0.0727, + "step": 570 + }, + { + "epoch": 0.12977272727272726, + "grad_norm": 2.95662718032164, + "learning_rate": 1.1989505097638357e-06, + "loss": 0.1825, + "step": 571 + }, + { + "epoch": 0.13, + "grad_norm": 3.0152576822935546, + "learning_rate": 1.198773721661246e-06, + "loss": 0.1685, + "step": 572 + }, + { + "epoch": 0.13022727272727272, + "grad_norm": 3.716459362719905, + "learning_rate": 1.1985966410524883e-06, + "loss": 0.1062, + "step": 573 + }, + { + "epoch": 0.13045454545454546, + "grad_norm": 2.3425653658909917, + "learning_rate": 1.1984192680278376e-06, + "loss": 0.0576, + "step": 574 + }, + { + "epoch": 0.13068181818181818, + "grad_norm": 3.298436531129224, + "learning_rate": 1.1982416026777172e-06, + "loss": 0.1288, + "step": 575 + }, + { + "epoch": 0.13090909090909092, + "grad_norm": 2.6354717815457924, + "learning_rate": 1.1980636450926999e-06, + "loss": 0.0695, + "step": 576 + }, + { + "epoch": 0.13113636363636363, + "grad_norm": 1.4220237634639286, + "learning_rate": 1.1978853953635074e-06, + "loss": 0.0528, + "step": 577 + }, + { + "epoch": 0.13136363636363638, + "grad_norm": 2.468609110409853, + "learning_rate": 1.1977068535810101e-06, + "loss": 0.104, + "step": 578 + }, + { + "epoch": 0.1315909090909091, + "grad_norm": 4.366562461588067, + "learning_rate": 1.1975280198362276e-06, + "loss": 0.1798, + "step": 579 + }, + { + "epoch": 0.1318181818181818, + "grad_norm": 3.202193742998132, + "learning_rate": 1.1973488942203282e-06, + "loss": 0.0977, + "step": 580 + }, + { + "epoch": 0.13204545454545455, + "grad_norm": 2.5513005244661504, + "learning_rate": 1.197169476824629e-06, + "loss": 0.0861, + "step": 581 + }, + { + "epoch": 0.13227272727272726, + "grad_norm": 3.5563258599113, + "learning_rate": 1.1969897677405956e-06, + "loss": 0.1155, + "step": 582 + }, + { + "epoch": 0.1325, + "grad_norm": 2.768239718969493, + "learning_rate": 1.1968097670598428e-06, + "loss": 0.1535, + "step": 583 + }, + { + "epoch": 0.13272727272727272, + "grad_norm": 4.5655239624813495, + "learning_rate": 1.1966294748741336e-06, + "loss": 0.0901, + "step": 584 + }, + { + "epoch": 0.13295454545454546, + "grad_norm": 2.5958170708040393, + "learning_rate": 1.19644889127538e-06, + "loss": 0.078, + "step": 585 + }, + { + "epoch": 0.13318181818181818, + "grad_norm": 3.1883648289427304, + "learning_rate": 1.1962680163556424e-06, + "loss": 0.0712, + "step": 586 + }, + { + "epoch": 0.13340909090909092, + "grad_norm": 2.1320009706723733, + "learning_rate": 1.1960868502071294e-06, + "loss": 0.1475, + "step": 587 + }, + { + "epoch": 0.13363636363636364, + "grad_norm": 1.9940879792713833, + "learning_rate": 1.1959053929221984e-06, + "loss": 0.0981, + "step": 588 + }, + { + "epoch": 0.13386363636363635, + "grad_norm": 3.4463318823874394, + "learning_rate": 1.1957236445933553e-06, + "loss": 0.1088, + "step": 589 + }, + { + "epoch": 0.1340909090909091, + "grad_norm": 1.8900755034083492, + "learning_rate": 1.1955416053132542e-06, + "loss": 0.0494, + "step": 590 + }, + { + "epoch": 0.1343181818181818, + "grad_norm": 3.9000513472596174, + "learning_rate": 1.1953592751746976e-06, + "loss": 0.1265, + "step": 591 + }, + { + "epoch": 0.13454545454545455, + "grad_norm": 4.291095102811638, + "learning_rate": 1.1951766542706362e-06, + "loss": 0.1225, + "step": 592 + }, + { + "epoch": 0.13477272727272727, + "grad_norm": 2.6405027569368666, + "learning_rate": 1.1949937426941689e-06, + "loss": 0.0992, + "step": 593 + }, + { + "epoch": 0.135, + "grad_norm": 2.6465513102605644, + "learning_rate": 1.1948105405385428e-06, + "loss": 0.1436, + "step": 594 + }, + { + "epoch": 0.13522727272727272, + "grad_norm": 2.259803623482083, + "learning_rate": 1.1946270478971533e-06, + "loss": 0.1057, + "step": 595 + }, + { + "epoch": 0.13545454545454547, + "grad_norm": 3.068275739482322, + "learning_rate": 1.194443264863544e-06, + "loss": 0.1208, + "step": 596 + }, + { + "epoch": 0.13568181818181818, + "grad_norm": 3.3234464117570095, + "learning_rate": 1.1942591915314058e-06, + "loss": 0.1977, + "step": 597 + }, + { + "epoch": 0.1359090909090909, + "grad_norm": 4.1562159510577, + "learning_rate": 1.1940748279945784e-06, + "loss": 0.128, + "step": 598 + }, + { + "epoch": 0.13613636363636364, + "grad_norm": 1.4529094160225136, + "learning_rate": 1.1938901743470494e-06, + "loss": 0.0663, + "step": 599 + }, + { + "epoch": 0.13636363636363635, + "grad_norm": 3.815547251224526, + "learning_rate": 1.1937052306829534e-06, + "loss": 0.0906, + "step": 600 + }, + { + "epoch": 0.1365909090909091, + "grad_norm": 2.129072993389739, + "learning_rate": 1.1935199970965741e-06, + "loss": 0.0693, + "step": 601 + }, + { + "epoch": 0.1368181818181818, + "grad_norm": 4.248435434354939, + "learning_rate": 1.193334473682342e-06, + "loss": 0.1476, + "step": 602 + }, + { + "epoch": 0.13704545454545455, + "grad_norm": 2.5575392272255626, + "learning_rate": 1.193148660534836e-06, + "loss": 0.1503, + "step": 603 + }, + { + "epoch": 0.13727272727272727, + "grad_norm": 3.010421683808598, + "learning_rate": 1.1929625577487825e-06, + "loss": 0.1423, + "step": 604 + }, + { + "epoch": 0.1375, + "grad_norm": 2.944071062469372, + "learning_rate": 1.1927761654190552e-06, + "loss": 0.0726, + "step": 605 + }, + { + "epoch": 0.13772727272727273, + "grad_norm": 3.832747957363785, + "learning_rate": 1.192589483640676e-06, + "loss": 0.1073, + "step": 606 + }, + { + "epoch": 0.13795454545454544, + "grad_norm": 3.856733174427242, + "learning_rate": 1.1924025125088138e-06, + "loss": 0.1112, + "step": 607 + }, + { + "epoch": 0.13818181818181818, + "grad_norm": 2.751887610692139, + "learning_rate": 1.1922152521187854e-06, + "loss": 0.1334, + "step": 608 + }, + { + "epoch": 0.1384090909090909, + "grad_norm": 5.0873022883095285, + "learning_rate": 1.1920277025660553e-06, + "loss": 0.1435, + "step": 609 + }, + { + "epoch": 0.13863636363636364, + "grad_norm": 4.8249303136466475, + "learning_rate": 1.1918398639462345e-06, + "loss": 0.1195, + "step": 610 + }, + { + "epoch": 0.13886363636363636, + "grad_norm": 4.874472213239946, + "learning_rate": 1.1916517363550821e-06, + "loss": 0.1632, + "step": 611 + }, + { + "epoch": 0.1390909090909091, + "grad_norm": 2.481069373476785, + "learning_rate": 1.1914633198885047e-06, + "loss": 0.1337, + "step": 612 + }, + { + "epoch": 0.1393181818181818, + "grad_norm": 2.556171296030753, + "learning_rate": 1.1912746146425555e-06, + "loss": 0.0844, + "step": 613 + }, + { + "epoch": 0.13954545454545456, + "grad_norm": 3.73896252712052, + "learning_rate": 1.1910856207134352e-06, + "loss": 0.1516, + "step": 614 + }, + { + "epoch": 0.13977272727272727, + "grad_norm": 6.2522296867232505, + "learning_rate": 1.1908963381974916e-06, + "loss": 0.1254, + "step": 615 + }, + { + "epoch": 0.14, + "grad_norm": 2.687889077036656, + "learning_rate": 1.19070676719122e-06, + "loss": 0.2058, + "step": 616 + }, + { + "epoch": 0.14022727272727273, + "grad_norm": 3.0307348123486886, + "learning_rate": 1.1905169077912623e-06, + "loss": 0.1165, + "step": 617 + }, + { + "epoch": 0.14045454545454544, + "grad_norm": 2.4821093454838543, + "learning_rate": 1.1903267600944077e-06, + "loss": 0.0626, + "step": 618 + }, + { + "epoch": 0.14068181818181819, + "grad_norm": 4.121754502209153, + "learning_rate": 1.1901363241975921e-06, + "loss": 0.0986, + "step": 619 + }, + { + "epoch": 0.1409090909090909, + "grad_norm": 3.071974785711444, + "learning_rate": 1.1899456001978987e-06, + "loss": 0.1503, + "step": 620 + }, + { + "epoch": 0.14113636363636364, + "grad_norm": 3.947611533293796, + "learning_rate": 1.1897545881925573e-06, + "loss": 0.1389, + "step": 621 + }, + { + "epoch": 0.14136363636363636, + "grad_norm": 2.076988193267275, + "learning_rate": 1.1895632882789447e-06, + "loss": 0.0882, + "step": 622 + }, + { + "epoch": 0.1415909090909091, + "grad_norm": 2.136166779141532, + "learning_rate": 1.1893717005545843e-06, + "loss": 0.0876, + "step": 623 + }, + { + "epoch": 0.14181818181818182, + "grad_norm": 5.3865638412573835, + "learning_rate": 1.189179825117146e-06, + "loss": 0.1935, + "step": 624 + }, + { + "epoch": 0.14204545454545456, + "grad_norm": 3.5245336234055045, + "learning_rate": 1.1889876620644472e-06, + "loss": 0.0805, + "step": 625 + }, + { + "epoch": 0.14227272727272727, + "grad_norm": 4.160973452396278, + "learning_rate": 1.1887952114944509e-06, + "loss": 0.0884, + "step": 626 + }, + { + "epoch": 0.1425, + "grad_norm": 1.4391339653094455, + "learning_rate": 1.1886024735052676e-06, + "loss": 0.076, + "step": 627 + }, + { + "epoch": 0.14272727272727273, + "grad_norm": 2.868684131769496, + "learning_rate": 1.1884094481951535e-06, + "loss": 0.1337, + "step": 628 + }, + { + "epoch": 0.14295454545454545, + "grad_norm": 6.437272841990759, + "learning_rate": 1.1882161356625122e-06, + "loss": 0.1887, + "step": 629 + }, + { + "epoch": 0.1431818181818182, + "grad_norm": 2.6513885033687927, + "learning_rate": 1.1880225360058925e-06, + "loss": 0.1202, + "step": 630 + }, + { + "epoch": 0.1434090909090909, + "grad_norm": 3.5666999999288826, + "learning_rate": 1.1878286493239907e-06, + "loss": 0.1429, + "step": 631 + }, + { + "epoch": 0.14363636363636365, + "grad_norm": 1.919266408920944, + "learning_rate": 1.187634475715649e-06, + "loss": 0.07, + "step": 632 + }, + { + "epoch": 0.14386363636363636, + "grad_norm": 2.570698788827823, + "learning_rate": 1.1874400152798557e-06, + "loss": 0.0937, + "step": 633 + }, + { + "epoch": 0.1440909090909091, + "grad_norm": 2.6570945083915496, + "learning_rate": 1.1872452681157453e-06, + "loss": 0.0692, + "step": 634 + }, + { + "epoch": 0.14431818181818182, + "grad_norm": 2.464540582208454, + "learning_rate": 1.1870502343225992e-06, + "loss": 0.0699, + "step": 635 + }, + { + "epoch": 0.14454545454545453, + "grad_norm": 2.2956454633742513, + "learning_rate": 1.186854913999844e-06, + "loss": 0.1185, + "step": 636 + }, + { + "epoch": 0.14477272727272728, + "grad_norm": 3.2561941076718606, + "learning_rate": 1.1866593072470527e-06, + "loss": 0.1012, + "step": 637 + }, + { + "epoch": 0.145, + "grad_norm": 2.77513438199878, + "learning_rate": 1.1864634141639448e-06, + "loss": 0.0852, + "step": 638 + }, + { + "epoch": 0.14522727272727273, + "grad_norm": 3.332667452259652, + "learning_rate": 1.1862672348503848e-06, + "loss": 0.1335, + "step": 639 + }, + { + "epoch": 0.14545454545454545, + "grad_norm": 1.4736108733640751, + "learning_rate": 1.186070769406384e-06, + "loss": 0.0483, + "step": 640 + }, + { + "epoch": 0.1456818181818182, + "grad_norm": 4.5338067314683235, + "learning_rate": 1.185874017932099e-06, + "loss": 0.1104, + "step": 641 + }, + { + "epoch": 0.1459090909090909, + "grad_norm": 2.581622979177194, + "learning_rate": 1.1856769805278327e-06, + "loss": 0.0758, + "step": 642 + }, + { + "epoch": 0.14613636363636365, + "grad_norm": 4.8451302603373225, + "learning_rate": 1.1854796572940332e-06, + "loss": 0.255, + "step": 643 + }, + { + "epoch": 0.14636363636363636, + "grad_norm": 3.4060902404925764, + "learning_rate": 1.1852820483312951e-06, + "loss": 0.165, + "step": 644 + }, + { + "epoch": 0.14659090909090908, + "grad_norm": 3.7697134812372486, + "learning_rate": 1.1850841537403577e-06, + "loss": 0.0841, + "step": 645 + }, + { + "epoch": 0.14681818181818182, + "grad_norm": 4.04555361809533, + "learning_rate": 1.1848859736221062e-06, + "loss": 0.1852, + "step": 646 + }, + { + "epoch": 0.14704545454545453, + "grad_norm": 5.292624605883491, + "learning_rate": 1.1846875080775724e-06, + "loss": 0.1396, + "step": 647 + }, + { + "epoch": 0.14727272727272728, + "grad_norm": 2.9010815097955596, + "learning_rate": 1.1844887572079322e-06, + "loss": 0.0809, + "step": 648 + }, + { + "epoch": 0.1475, + "grad_norm": 3.0154321976547265, + "learning_rate": 1.1842897211145075e-06, + "loss": 0.1082, + "step": 649 + }, + { + "epoch": 0.14772727272727273, + "grad_norm": 3.603164933951076, + "learning_rate": 1.1840903998987657e-06, + "loss": 0.0904, + "step": 650 + }, + { + "epoch": 0.14795454545454545, + "grad_norm": 2.611926077443849, + "learning_rate": 1.1838907936623196e-06, + "loss": 0.0923, + "step": 651 + }, + { + "epoch": 0.1481818181818182, + "grad_norm": 3.5045755536602647, + "learning_rate": 1.183690902506927e-06, + "loss": 0.1367, + "step": 652 + }, + { + "epoch": 0.1484090909090909, + "grad_norm": 2.079001413436961, + "learning_rate": 1.1834907265344913e-06, + "loss": 0.1189, + "step": 653 + }, + { + "epoch": 0.14863636363636365, + "grad_norm": 1.927037422831208, + "learning_rate": 1.1832902658470608e-06, + "loss": 0.115, + "step": 654 + }, + { + "epoch": 0.14886363636363636, + "grad_norm": 1.97236561610899, + "learning_rate": 1.1830895205468293e-06, + "loss": 0.06, + "step": 655 + }, + { + "epoch": 0.14909090909090908, + "grad_norm": 2.8828111328774493, + "learning_rate": 1.182888490736135e-06, + "loss": 0.0996, + "step": 656 + }, + { + "epoch": 0.14931818181818182, + "grad_norm": 4.106350195454931, + "learning_rate": 1.1826871765174622e-06, + "loss": 0.1652, + "step": 657 + }, + { + "epoch": 0.14954545454545454, + "grad_norm": 4.5075997872634925, + "learning_rate": 1.1824855779934392e-06, + "loss": 0.1068, + "step": 658 + }, + { + "epoch": 0.14977272727272728, + "grad_norm": 2.7307866883608933, + "learning_rate": 1.1822836952668397e-06, + "loss": 0.129, + "step": 659 + }, + { + "epoch": 0.15, + "grad_norm": 1.8969430815297914, + "learning_rate": 1.182081528440582e-06, + "loss": 0.0927, + "step": 660 + }, + { + "epoch": 0.15022727272727274, + "grad_norm": 2.222251072329309, + "learning_rate": 1.18187907761773e-06, + "loss": 0.0806, + "step": 661 + }, + { + "epoch": 0.15045454545454545, + "grad_norm": 2.855738831847418, + "learning_rate": 1.1816763429014917e-06, + "loss": 0.1133, + "step": 662 + }, + { + "epoch": 0.1506818181818182, + "grad_norm": 4.137168631966571, + "learning_rate": 1.1814733243952193e-06, + "loss": 0.1729, + "step": 663 + }, + { + "epoch": 0.1509090909090909, + "grad_norm": 5.250074138141893, + "learning_rate": 1.1812700222024111e-06, + "loss": 0.2671, + "step": 664 + }, + { + "epoch": 0.15113636363636362, + "grad_norm": 4.095009204321318, + "learning_rate": 1.1810664364267092e-06, + "loss": 0.0975, + "step": 665 + }, + { + "epoch": 0.15136363636363637, + "grad_norm": 1.9872727288340575, + "learning_rate": 1.1808625671718999e-06, + "loss": 0.0559, + "step": 666 + }, + { + "epoch": 0.15159090909090908, + "grad_norm": 2.9314970455180593, + "learning_rate": 1.1806584145419144e-06, + "loss": 0.1494, + "step": 667 + }, + { + "epoch": 0.15181818181818182, + "grad_norm": 3.22472741979199, + "learning_rate": 1.1804539786408292e-06, + "loss": 0.1201, + "step": 668 + }, + { + "epoch": 0.15204545454545454, + "grad_norm": 2.8590813055772557, + "learning_rate": 1.1802492595728634e-06, + "loss": 0.0895, + "step": 669 + }, + { + "epoch": 0.15227272727272728, + "grad_norm": 2.4103185687009576, + "learning_rate": 1.1800442574423823e-06, + "loss": 0.0843, + "step": 670 + }, + { + "epoch": 0.1525, + "grad_norm": 3.191758442329034, + "learning_rate": 1.1798389723538942e-06, + "loss": 0.0784, + "step": 671 + }, + { + "epoch": 0.15272727272727274, + "grad_norm": 3.8816280749453114, + "learning_rate": 1.1796334044120522e-06, + "loss": 0.153, + "step": 672 + }, + { + "epoch": 0.15295454545454545, + "grad_norm": 2.7772880110025238, + "learning_rate": 1.1794275537216534e-06, + "loss": 0.1308, + "step": 673 + }, + { + "epoch": 0.15318181818181817, + "grad_norm": 1.7311480217251134, + "learning_rate": 1.1792214203876396e-06, + "loss": 0.0612, + "step": 674 + }, + { + "epoch": 0.1534090909090909, + "grad_norm": 1.8326204714498646, + "learning_rate": 1.1790150045150958e-06, + "loss": 0.0819, + "step": 675 + }, + { + "epoch": 0.15363636363636363, + "grad_norm": 3.7804068167345344, + "learning_rate": 1.1788083062092518e-06, + "loss": 0.0889, + "step": 676 + }, + { + "epoch": 0.15386363636363637, + "grad_norm": 2.429048128257739, + "learning_rate": 1.1786013255754808e-06, + "loss": 0.1183, + "step": 677 + }, + { + "epoch": 0.15409090909090908, + "grad_norm": 3.6912740206894132, + "learning_rate": 1.1783940627193002e-06, + "loss": 0.1342, + "step": 678 + }, + { + "epoch": 0.15431818181818183, + "grad_norm": 2.315919697767897, + "learning_rate": 1.1781865177463717e-06, + "loss": 0.0948, + "step": 679 + }, + { + "epoch": 0.15454545454545454, + "grad_norm": 4.732313717049891, + "learning_rate": 1.1779786907625e-06, + "loss": 0.2081, + "step": 680 + }, + { + "epoch": 0.15477272727272728, + "grad_norm": 3.0643026784385503, + "learning_rate": 1.177770581873634e-06, + "loss": 0.1308, + "step": 681 + }, + { + "epoch": 0.155, + "grad_norm": 3.596674366098283, + "learning_rate": 1.1775621911858665e-06, + "loss": 0.1835, + "step": 682 + }, + { + "epoch": 0.1552272727272727, + "grad_norm": 2.7629336340919957, + "learning_rate": 1.1773535188054336e-06, + "loss": 0.0822, + "step": 683 + }, + { + "epoch": 0.15545454545454546, + "grad_norm": 3.3873361914536924, + "learning_rate": 1.177144564838715e-06, + "loss": 0.1179, + "step": 684 + }, + { + "epoch": 0.15568181818181817, + "grad_norm": 2.0404782084317112, + "learning_rate": 1.1769353293922341e-06, + "loss": 0.1189, + "step": 685 + }, + { + "epoch": 0.1559090909090909, + "grad_norm": 2.833626094610276, + "learning_rate": 1.1767258125726584e-06, + "loss": 0.0793, + "step": 686 + }, + { + "epoch": 0.15613636363636363, + "grad_norm": 3.78825191552253, + "learning_rate": 1.1765160144867972e-06, + "loss": 0.1063, + "step": 687 + }, + { + "epoch": 0.15636363636363637, + "grad_norm": 2.6397145292980664, + "learning_rate": 1.176305935241605e-06, + "loss": 0.0804, + "step": 688 + }, + { + "epoch": 0.1565909090909091, + "grad_norm": 3.8775177404721846, + "learning_rate": 1.1760955749441786e-06, + "loss": 0.0998, + "step": 689 + }, + { + "epoch": 0.15681818181818183, + "grad_norm": 1.7480890862718244, + "learning_rate": 1.1758849337017587e-06, + "loss": 0.0372, + "step": 690 + }, + { + "epoch": 0.15704545454545454, + "grad_norm": 4.230959165744064, + "learning_rate": 1.175674011621728e-06, + "loss": 0.2064, + "step": 691 + }, + { + "epoch": 0.1572727272727273, + "grad_norm": 3.6828290600360227, + "learning_rate": 1.1754628088116138e-06, + "loss": 0.086, + "step": 692 + }, + { + "epoch": 0.1575, + "grad_norm": 2.425606630823805, + "learning_rate": 1.1752513253790861e-06, + "loss": 0.0977, + "step": 693 + }, + { + "epoch": 0.15772727272727272, + "grad_norm": 4.4212649317439485, + "learning_rate": 1.1750395614319576e-06, + "loss": 0.1031, + "step": 694 + }, + { + "epoch": 0.15795454545454546, + "grad_norm": 4.531721414793234, + "learning_rate": 1.174827517078184e-06, + "loss": 0.1318, + "step": 695 + }, + { + "epoch": 0.15818181818181817, + "grad_norm": 1.2425254927354163, + "learning_rate": 1.1746151924258644e-06, + "loss": 0.046, + "step": 696 + }, + { + "epoch": 0.15840909090909092, + "grad_norm": 4.317477038748653, + "learning_rate": 1.1744025875832405e-06, + "loss": 0.1605, + "step": 697 + }, + { + "epoch": 0.15863636363636363, + "grad_norm": 2.2894514497196545, + "learning_rate": 1.174189702658697e-06, + "loss": 0.0667, + "step": 698 + }, + { + "epoch": 0.15886363636363637, + "grad_norm": 3.1747794315777975, + "learning_rate": 1.173976537760761e-06, + "loss": 0.178, + "step": 699 + }, + { + "epoch": 0.1590909090909091, + "grad_norm": 3.6397940252033054, + "learning_rate": 1.1737630929981026e-06, + "loss": 0.1234, + "step": 700 + }, + { + "epoch": 0.15931818181818183, + "grad_norm": 3.600394223025497, + "learning_rate": 1.1735493684795348e-06, + "loss": 0.1283, + "step": 701 + }, + { + "epoch": 0.15954545454545455, + "grad_norm": 2.1914776151398097, + "learning_rate": 1.173335364314013e-06, + "loss": 0.0861, + "step": 702 + }, + { + "epoch": 0.15977272727272726, + "grad_norm": 4.23410141845482, + "learning_rate": 1.173121080610635e-06, + "loss": 0.1418, + "step": 703 + }, + { + "epoch": 0.16, + "grad_norm": 2.8336906213405406, + "learning_rate": 1.1729065174786414e-06, + "loss": 0.0814, + "step": 704 + }, + { + "epoch": 0.16022727272727272, + "grad_norm": 1.324797775805725, + "learning_rate": 1.1726916750274148e-06, + "loss": 0.073, + "step": 705 + }, + { + "epoch": 0.16045454545454546, + "grad_norm": 1.1113270460751639, + "learning_rate": 1.1724765533664808e-06, + "loss": 0.0325, + "step": 706 + }, + { + "epoch": 0.16068181818181818, + "grad_norm": 3.329670829994521, + "learning_rate": 1.1722611526055073e-06, + "loss": 0.0846, + "step": 707 + }, + { + "epoch": 0.16090909090909092, + "grad_norm": 4.423260181219768, + "learning_rate": 1.1720454728543034e-06, + "loss": 0.0811, + "step": 708 + }, + { + "epoch": 0.16113636363636363, + "grad_norm": 3.861828831504347, + "learning_rate": 1.171829514222822e-06, + "loss": 0.1481, + "step": 709 + }, + { + "epoch": 0.16136363636363638, + "grad_norm": 3.63402240520416, + "learning_rate": 1.1716132768211572e-06, + "loss": 0.1154, + "step": 710 + }, + { + "epoch": 0.1615909090909091, + "grad_norm": 2.37915520220192, + "learning_rate": 1.1713967607595455e-06, + "loss": 0.0578, + "step": 711 + }, + { + "epoch": 0.1618181818181818, + "grad_norm": 4.41815181178646, + "learning_rate": 1.1711799661483653e-06, + "loss": 0.1123, + "step": 712 + }, + { + "epoch": 0.16204545454545455, + "grad_norm": 4.986293158191178, + "learning_rate": 1.170962893098137e-06, + "loss": 0.1236, + "step": 713 + }, + { + "epoch": 0.16227272727272726, + "grad_norm": 2.908333267858207, + "learning_rate": 1.1707455417195231e-06, + "loss": 0.1289, + "step": 714 + }, + { + "epoch": 0.1625, + "grad_norm": 3.5560662000352004, + "learning_rate": 1.170527912123328e-06, + "loss": 0.0964, + "step": 715 + }, + { + "epoch": 0.16272727272727272, + "grad_norm": 4.497015016872193, + "learning_rate": 1.1703100044204984e-06, + "loss": 0.1092, + "step": 716 + }, + { + "epoch": 0.16295454545454546, + "grad_norm": 2.8435936401692947, + "learning_rate": 1.1700918187221214e-06, + "loss": 0.1399, + "step": 717 + }, + { + "epoch": 0.16318181818181818, + "grad_norm": 3.4768244142021674, + "learning_rate": 1.169873355139427e-06, + "loss": 0.1007, + "step": 718 + }, + { + "epoch": 0.16340909090909092, + "grad_norm": 4.156552840769318, + "learning_rate": 1.1696546137837865e-06, + "loss": 0.1812, + "step": 719 + }, + { + "epoch": 0.16363636363636364, + "grad_norm": 3.6314627894350417, + "learning_rate": 1.169435594766713e-06, + "loss": 0.111, + "step": 720 + }, + { + "epoch": 0.16386363636363635, + "grad_norm": 2.248908961353641, + "learning_rate": 1.1692162981998608e-06, + "loss": 0.155, + "step": 721 + }, + { + "epoch": 0.1640909090909091, + "grad_norm": 3.5815672986259783, + "learning_rate": 1.1689967241950263e-06, + "loss": 0.105, + "step": 722 + }, + { + "epoch": 0.1643181818181818, + "grad_norm": 1.8662592496151242, + "learning_rate": 1.168776872864146e-06, + "loss": 0.0845, + "step": 723 + }, + { + "epoch": 0.16454545454545455, + "grad_norm": 3.5355589094258724, + "learning_rate": 1.1685567443192996e-06, + "loss": 0.1147, + "step": 724 + }, + { + "epoch": 0.16477272727272727, + "grad_norm": 2.7247318067219592, + "learning_rate": 1.168336338672707e-06, + "loss": 0.1253, + "step": 725 + }, + { + "epoch": 0.165, + "grad_norm": 3.5270674462195535, + "learning_rate": 1.1681156560367296e-06, + "loss": 0.1317, + "step": 726 + }, + { + "epoch": 0.16522727272727272, + "grad_norm": 3.8362477631577714, + "learning_rate": 1.1678946965238697e-06, + "loss": 0.2071, + "step": 727 + }, + { + "epoch": 0.16545454545454547, + "grad_norm": 2.445468298940002, + "learning_rate": 1.1676734602467713e-06, + "loss": 0.0797, + "step": 728 + }, + { + "epoch": 0.16568181818181818, + "grad_norm": 2.663027101554758, + "learning_rate": 1.1674519473182192e-06, + "loss": 0.0547, + "step": 729 + }, + { + "epoch": 0.16590909090909092, + "grad_norm": 3.9377492640314444, + "learning_rate": 1.1672301578511392e-06, + "loss": 0.1422, + "step": 730 + }, + { + "epoch": 0.16613636363636364, + "grad_norm": 3.1811680461685614, + "learning_rate": 1.167008091958598e-06, + "loss": 0.0796, + "step": 731 + }, + { + "epoch": 0.16636363636363635, + "grad_norm": 2.375864838080215, + "learning_rate": 1.1667857497538037e-06, + "loss": 0.0606, + "step": 732 + }, + { + "epoch": 0.1665909090909091, + "grad_norm": 2.5147347072402586, + "learning_rate": 1.166563131350105e-06, + "loss": 0.1209, + "step": 733 + }, + { + "epoch": 0.1668181818181818, + "grad_norm": 3.0045790328544033, + "learning_rate": 1.166340236860991e-06, + "loss": 0.2008, + "step": 734 + }, + { + "epoch": 0.16704545454545455, + "grad_norm": 2.217801589556193, + "learning_rate": 1.166117066400092e-06, + "loss": 0.1056, + "step": 735 + }, + { + "epoch": 0.16727272727272727, + "grad_norm": 5.587745484163016, + "learning_rate": 1.1658936200811789e-06, + "loss": 0.1688, + "step": 736 + }, + { + "epoch": 0.1675, + "grad_norm": 2.250124039431649, + "learning_rate": 1.1656698980181633e-06, + "loss": 0.0974, + "step": 737 + }, + { + "epoch": 0.16772727272727272, + "grad_norm": 3.80028598184578, + "learning_rate": 1.1654459003250971e-06, + "loss": 0.0992, + "step": 738 + }, + { + "epoch": 0.16795454545454547, + "grad_norm": 1.439158858739717, + "learning_rate": 1.1652216271161728e-06, + "loss": 0.0319, + "step": 739 + }, + { + "epoch": 0.16818181818181818, + "grad_norm": 2.2130946895334658, + "learning_rate": 1.1649970785057238e-06, + "loss": 0.1201, + "step": 740 + }, + { + "epoch": 0.1684090909090909, + "grad_norm": 2.7739842772717784, + "learning_rate": 1.1647722546082232e-06, + "loss": 0.0701, + "step": 741 + }, + { + "epoch": 0.16863636363636364, + "grad_norm": 2.609228118435818, + "learning_rate": 1.164547155538285e-06, + "loss": 0.0932, + "step": 742 + }, + { + "epoch": 0.16886363636363635, + "grad_norm": 3.703820173450133, + "learning_rate": 1.1643217814106633e-06, + "loss": 0.0899, + "step": 743 + }, + { + "epoch": 0.1690909090909091, + "grad_norm": 3.1694303813113804, + "learning_rate": 1.1640961323402522e-06, + "loss": 0.0868, + "step": 744 + }, + { + "epoch": 0.1693181818181818, + "grad_norm": 2.8568845461742995, + "learning_rate": 1.163870208442086e-06, + "loss": 0.1187, + "step": 745 + }, + { + "epoch": 0.16954545454545455, + "grad_norm": 3.797729399162994, + "learning_rate": 1.1636440098313398e-06, + "loss": 0.1722, + "step": 746 + }, + { + "epoch": 0.16977272727272727, + "grad_norm": 2.8670068838207574, + "learning_rate": 1.1634175366233278e-06, + "loss": 0.1619, + "step": 747 + }, + { + "epoch": 0.17, + "grad_norm": 3.087548193361291, + "learning_rate": 1.1631907889335046e-06, + "loss": 0.0692, + "step": 748 + }, + { + "epoch": 0.17022727272727273, + "grad_norm": 4.414850992047754, + "learning_rate": 1.162963766877465e-06, + "loss": 0.0881, + "step": 749 + }, + { + "epoch": 0.17045454545454544, + "grad_norm": 2.411365953758672, + "learning_rate": 1.1627364705709428e-06, + "loss": 0.1214, + "step": 750 + }, + { + "epoch": 0.17068181818181818, + "grad_norm": 3.2669502739528653, + "learning_rate": 1.1625089001298129e-06, + "loss": 0.1274, + "step": 751 + }, + { + "epoch": 0.1709090909090909, + "grad_norm": 5.155420414197394, + "learning_rate": 1.1622810556700889e-06, + "loss": 0.2368, + "step": 752 + }, + { + "epoch": 0.17113636363636364, + "grad_norm": 2.2393607761538825, + "learning_rate": 1.1620529373079246e-06, + "loss": 0.0885, + "step": 753 + }, + { + "epoch": 0.17136363636363636, + "grad_norm": 3.6007275076022167, + "learning_rate": 1.1618245451596128e-06, + "loss": 0.084, + "step": 754 + }, + { + "epoch": 0.1715909090909091, + "grad_norm": 2.6165665043670323, + "learning_rate": 1.161595879341587e-06, + "loss": 0.1014, + "step": 755 + }, + { + "epoch": 0.17181818181818181, + "grad_norm": 3.6672242287695846, + "learning_rate": 1.1613669399704192e-06, + "loss": 0.1334, + "step": 756 + }, + { + "epoch": 0.17204545454545456, + "grad_norm": 3.8755698559282745, + "learning_rate": 1.1611377271628213e-06, + "loss": 0.1531, + "step": 757 + }, + { + "epoch": 0.17227272727272727, + "grad_norm": 3.4636367504088352, + "learning_rate": 1.1609082410356447e-06, + "loss": 0.0729, + "step": 758 + }, + { + "epoch": 0.1725, + "grad_norm": 4.159111572875205, + "learning_rate": 1.1606784817058797e-06, + "loss": 0.1439, + "step": 759 + }, + { + "epoch": 0.17272727272727273, + "grad_norm": 6.150329364181485, + "learning_rate": 1.1604484492906562e-06, + "loss": 0.2479, + "step": 760 + }, + { + "epoch": 0.17295454545454544, + "grad_norm": 2.840827652346105, + "learning_rate": 1.1602181439072432e-06, + "loss": 0.0529, + "step": 761 + }, + { + "epoch": 0.1731818181818182, + "grad_norm": 3.4976650338811326, + "learning_rate": 1.1599875656730492e-06, + "loss": 0.1441, + "step": 762 + }, + { + "epoch": 0.1734090909090909, + "grad_norm": 3.644483649886925, + "learning_rate": 1.159756714705621e-06, + "loss": 0.1627, + "step": 763 + }, + { + "epoch": 0.17363636363636364, + "grad_norm": 4.6658927901270575, + "learning_rate": 1.1595255911226456e-06, + "loss": 0.1154, + "step": 764 + }, + { + "epoch": 0.17386363636363636, + "grad_norm": 3.3625632086878956, + "learning_rate": 1.1592941950419475e-06, + "loss": 0.1272, + "step": 765 + }, + { + "epoch": 0.1740909090909091, + "grad_norm": 3.155381762903906, + "learning_rate": 1.1590625265814918e-06, + "loss": 0.1759, + "step": 766 + }, + { + "epoch": 0.17431818181818182, + "grad_norm": 2.4035789035627073, + "learning_rate": 1.1588305858593811e-06, + "loss": 0.083, + "step": 767 + }, + { + "epoch": 0.17454545454545456, + "grad_norm": 3.8800065252955807, + "learning_rate": 1.1585983729938575e-06, + "loss": 0.098, + "step": 768 + }, + { + "epoch": 0.17477272727272727, + "grad_norm": 3.2133140224754166, + "learning_rate": 1.1583658881033013e-06, + "loss": 0.1116, + "step": 769 + }, + { + "epoch": 0.175, + "grad_norm": 5.6308258883857665, + "learning_rate": 1.158133131306232e-06, + "loss": 0.1108, + "step": 770 + }, + { + "epoch": 0.17522727272727273, + "grad_norm": 3.34568115493396, + "learning_rate": 1.1579001027213078e-06, + "loss": 0.0828, + "step": 771 + }, + { + "epoch": 0.17545454545454545, + "grad_norm": 2.6258599364639466, + "learning_rate": 1.1576668024673248e-06, + "loss": 0.0705, + "step": 772 + }, + { + "epoch": 0.1756818181818182, + "grad_norm": 3.1555688975176004, + "learning_rate": 1.157433230663218e-06, + "loss": 0.0651, + "step": 773 + }, + { + "epoch": 0.1759090909090909, + "grad_norm": 3.611867108975687, + "learning_rate": 1.1571993874280611e-06, + "loss": 0.1436, + "step": 774 + }, + { + "epoch": 0.17613636363636365, + "grad_norm": 2.408343539432884, + "learning_rate": 1.1569652728810658e-06, + "loss": 0.0916, + "step": 775 + }, + { + "epoch": 0.17636363636363636, + "grad_norm": 4.976882099384663, + "learning_rate": 1.156730887141582e-06, + "loss": 0.1282, + "step": 776 + }, + { + "epoch": 0.1765909090909091, + "grad_norm": 5.294480823644898, + "learning_rate": 1.156496230329098e-06, + "loss": 0.2772, + "step": 777 + }, + { + "epoch": 0.17681818181818182, + "grad_norm": 4.742007157127061, + "learning_rate": 1.1562613025632406e-06, + "loss": 0.1196, + "step": 778 + }, + { + "epoch": 0.17704545454545453, + "grad_norm": 2.07528997472403, + "learning_rate": 1.1560261039637744e-06, + "loss": 0.0664, + "step": 779 + }, + { + "epoch": 0.17727272727272728, + "grad_norm": 2.4312830817964133, + "learning_rate": 1.1557906346506021e-06, + "loss": 0.0795, + "step": 780 + }, + { + "epoch": 0.1775, + "grad_norm": 3.2831149389763197, + "learning_rate": 1.1555548947437642e-06, + "loss": 0.1186, + "step": 781 + }, + { + "epoch": 0.17772727272727273, + "grad_norm": 4.073389939948538, + "learning_rate": 1.1553188843634399e-06, + "loss": 0.1068, + "step": 782 + }, + { + "epoch": 0.17795454545454545, + "grad_norm": 3.431137122030362, + "learning_rate": 1.1550826036299455e-06, + "loss": 0.2235, + "step": 783 + }, + { + "epoch": 0.1781818181818182, + "grad_norm": 2.1508864223211717, + "learning_rate": 1.1548460526637354e-06, + "loss": 0.0728, + "step": 784 + }, + { + "epoch": 0.1784090909090909, + "grad_norm": 2.542600883021502, + "learning_rate": 1.1546092315854017e-06, + "loss": 0.0713, + "step": 785 + }, + { + "epoch": 0.17863636363636365, + "grad_norm": 2.4077832811127644, + "learning_rate": 1.1543721405156744e-06, + "loss": 0.1331, + "step": 786 + }, + { + "epoch": 0.17886363636363636, + "grad_norm": 2.7882544425249454, + "learning_rate": 1.154134779575421e-06, + "loss": 0.1374, + "step": 787 + }, + { + "epoch": 0.17909090909090908, + "grad_norm": 4.0339750921784665, + "learning_rate": 1.1538971488856465e-06, + "loss": 0.1363, + "step": 788 + }, + { + "epoch": 0.17931818181818182, + "grad_norm": 1.7731148396393088, + "learning_rate": 1.153659248567494e-06, + "loss": 0.0533, + "step": 789 + }, + { + "epoch": 0.17954545454545454, + "grad_norm": 3.745684753145709, + "learning_rate": 1.1534210787422425e-06, + "loss": 0.1091, + "step": 790 + }, + { + "epoch": 0.17977272727272728, + "grad_norm": 2.9989434091372162, + "learning_rate": 1.1531826395313104e-06, + "loss": 0.0749, + "step": 791 + }, + { + "epoch": 0.18, + "grad_norm": 5.215485933310487, + "learning_rate": 1.152943931056252e-06, + "loss": 0.2935, + "step": 792 + }, + { + "epoch": 0.18022727272727274, + "grad_norm": 2.5129589816791875, + "learning_rate": 1.1527049534387595e-06, + "loss": 0.1564, + "step": 793 + }, + { + "epoch": 0.18045454545454545, + "grad_norm": 3.506011025397826, + "learning_rate": 1.1524657068006622e-06, + "loss": 0.0851, + "step": 794 + }, + { + "epoch": 0.1806818181818182, + "grad_norm": 1.7827392041353078, + "learning_rate": 1.1522261912639266e-06, + "loss": 0.0586, + "step": 795 + }, + { + "epoch": 0.1809090909090909, + "grad_norm": 2.6359555068261318, + "learning_rate": 1.151986406950656e-06, + "loss": 0.0985, + "step": 796 + }, + { + "epoch": 0.18113636363636362, + "grad_norm": 3.131388496467777, + "learning_rate": 1.1517463539830908e-06, + "loss": 0.1576, + "step": 797 + }, + { + "epoch": 0.18136363636363637, + "grad_norm": 2.6525424346183093, + "learning_rate": 1.1515060324836088e-06, + "loss": 0.0994, + "step": 798 + }, + { + "epoch": 0.18159090909090908, + "grad_norm": 1.071077855106917, + "learning_rate": 1.151265442574724e-06, + "loss": 0.066, + "step": 799 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 5.438824680739619, + "learning_rate": 1.151024584379088e-06, + "loss": 0.1352, + "step": 800 + }, + { + "epoch": 0.18204545454545454, + "grad_norm": 2.1428067404342173, + "learning_rate": 1.1507834580194883e-06, + "loss": 0.0991, + "step": 801 + }, + { + "epoch": 0.18227272727272728, + "grad_norm": 2.584642668107492, + "learning_rate": 1.15054206361885e-06, + "loss": 0.0615, + "step": 802 + }, + { + "epoch": 0.1825, + "grad_norm": 3.139534261632662, + "learning_rate": 1.1503004013002341e-06, + "loss": 0.0801, + "step": 803 + }, + { + "epoch": 0.18272727272727274, + "grad_norm": 4.277335163917085, + "learning_rate": 1.1500584711868388e-06, + "loss": 0.0937, + "step": 804 + }, + { + "epoch": 0.18295454545454545, + "grad_norm": 3.3885257254631673, + "learning_rate": 1.1498162734019983e-06, + "loss": 0.0878, + "step": 805 + }, + { + "epoch": 0.1831818181818182, + "grad_norm": 3.299063354174504, + "learning_rate": 1.1495738080691834e-06, + "loss": 0.1017, + "step": 806 + }, + { + "epoch": 0.1834090909090909, + "grad_norm": 1.5696364401183154, + "learning_rate": 1.1493310753120017e-06, + "loss": 0.067, + "step": 807 + }, + { + "epoch": 0.18363636363636363, + "grad_norm": 3.2180129149672325, + "learning_rate": 1.1490880752541967e-06, + "loss": 0.1305, + "step": 808 + }, + { + "epoch": 0.18386363636363637, + "grad_norm": 2.6011996481905824, + "learning_rate": 1.148844808019648e-06, + "loss": 0.1157, + "step": 809 + }, + { + "epoch": 0.18409090909090908, + "grad_norm": 3.4288903997019036, + "learning_rate": 1.1486012737323718e-06, + "loss": 0.0795, + "step": 810 + }, + { + "epoch": 0.18431818181818183, + "grad_norm": 2.681352230471287, + "learning_rate": 1.1483574725165207e-06, + "loss": 0.1187, + "step": 811 + }, + { + "epoch": 0.18454545454545454, + "grad_norm": 1.44422465743348, + "learning_rate": 1.1481134044963825e-06, + "loss": 0.0535, + "step": 812 + }, + { + "epoch": 0.18477272727272728, + "grad_norm": 2.2931329758990957, + "learning_rate": 1.1478690697963814e-06, + "loss": 0.0903, + "step": 813 + }, + { + "epoch": 0.185, + "grad_norm": 2.799907754892058, + "learning_rate": 1.1476244685410784e-06, + "loss": 0.0669, + "step": 814 + }, + { + "epoch": 0.18522727272727274, + "grad_norm": 3.5617296862325074, + "learning_rate": 1.147379600855169e-06, + "loss": 0.1832, + "step": 815 + }, + { + "epoch": 0.18545454545454546, + "grad_norm": 2.4880736924318243, + "learning_rate": 1.1471344668634854e-06, + "loss": 0.112, + "step": 816 + }, + { + "epoch": 0.18568181818181817, + "grad_norm": 2.4444501349390304, + "learning_rate": 1.1468890666909954e-06, + "loss": 0.1155, + "step": 817 + }, + { + "epoch": 0.1859090909090909, + "grad_norm": 3.813853261716941, + "learning_rate": 1.1466434004628023e-06, + "loss": 0.1429, + "step": 818 + }, + { + "epoch": 0.18613636363636363, + "grad_norm": 5.421205135165792, + "learning_rate": 1.1463974683041455e-06, + "loss": 0.2755, + "step": 819 + }, + { + "epoch": 0.18636363636363637, + "grad_norm": 3.609600274274101, + "learning_rate": 1.1461512703403992e-06, + "loss": 0.1184, + "step": 820 + }, + { + "epoch": 0.18659090909090909, + "grad_norm": 1.8615631846224745, + "learning_rate": 1.1459048066970736e-06, + "loss": 0.0818, + "step": 821 + }, + { + "epoch": 0.18681818181818183, + "grad_norm": 2.4289052703794325, + "learning_rate": 1.1456580774998146e-06, + "loss": 0.0764, + "step": 822 + }, + { + "epoch": 0.18704545454545454, + "grad_norm": 2.713828246740441, + "learning_rate": 1.1454110828744027e-06, + "loss": 0.0912, + "step": 823 + }, + { + "epoch": 0.18727272727272729, + "grad_norm": 2.8685763209763637, + "learning_rate": 1.1451638229467547e-06, + "loss": 0.0738, + "step": 824 + }, + { + "epoch": 0.1875, + "grad_norm": 3.3705690705593923, + "learning_rate": 1.1449162978429218e-06, + "loss": 0.0788, + "step": 825 + }, + { + "epoch": 0.18772727272727271, + "grad_norm": 3.254820549961461, + "learning_rate": 1.144668507689091e-06, + "loss": 0.0681, + "step": 826 + }, + { + "epoch": 0.18795454545454546, + "grad_norm": 2.1933541978238735, + "learning_rate": 1.1444204526115837e-06, + "loss": 0.0646, + "step": 827 + }, + { + "epoch": 0.18818181818181817, + "grad_norm": 4.211851612231775, + "learning_rate": 1.144172132736857e-06, + "loss": 0.1317, + "step": 828 + }, + { + "epoch": 0.18840909090909091, + "grad_norm": 3.8238801846582278, + "learning_rate": 1.1439235481915028e-06, + "loss": 0.0938, + "step": 829 + }, + { + "epoch": 0.18863636363636363, + "grad_norm": 8.657468174143784, + "learning_rate": 1.1436746991022479e-06, + "loss": 0.1045, + "step": 830 + }, + { + "epoch": 0.18886363636363637, + "grad_norm": 3.10943093139228, + "learning_rate": 1.143425585595954e-06, + "loss": 0.1194, + "step": 831 + }, + { + "epoch": 0.1890909090909091, + "grad_norm": 2.3018430914505403, + "learning_rate": 1.1431762077996174e-06, + "loss": 0.1219, + "step": 832 + }, + { + "epoch": 0.18931818181818183, + "grad_norm": 4.427509015662018, + "learning_rate": 1.1429265658403698e-06, + "loss": 0.0977, + "step": 833 + }, + { + "epoch": 0.18954545454545454, + "grad_norm": 3.684326637673178, + "learning_rate": 1.1426766598454768e-06, + "loss": 0.0846, + "step": 834 + }, + { + "epoch": 0.18977272727272726, + "grad_norm": 5.163030810979085, + "learning_rate": 1.1424264899423383e-06, + "loss": 0.1853, + "step": 835 + }, + { + "epoch": 0.19, + "grad_norm": 3.3060632481428494, + "learning_rate": 1.1421760562584901e-06, + "loss": 0.1056, + "step": 836 + }, + { + "epoch": 0.19022727272727272, + "grad_norm": 3.9361669806006496, + "learning_rate": 1.1419253589216012e-06, + "loss": 0.2202, + "step": 837 + }, + { + "epoch": 0.19045454545454546, + "grad_norm": 2.8331931002753183, + "learning_rate": 1.1416743980594758e-06, + "loss": 0.0619, + "step": 838 + }, + { + "epoch": 0.19068181818181817, + "grad_norm": 4.096429435206798, + "learning_rate": 1.1414231738000521e-06, + "loss": 0.1873, + "step": 839 + }, + { + "epoch": 0.19090909090909092, + "grad_norm": 2.16119928523465, + "learning_rate": 1.1411716862714027e-06, + "loss": 0.0545, + "step": 840 + }, + { + "epoch": 0.19113636363636363, + "grad_norm": 5.550131065652608, + "learning_rate": 1.1409199356017339e-06, + "loss": 0.1225, + "step": 841 + }, + { + "epoch": 0.19136363636363637, + "grad_norm": 2.4101277608947926, + "learning_rate": 1.140667921919387e-06, + "loss": 0.1466, + "step": 842 + }, + { + "epoch": 0.1915909090909091, + "grad_norm": 3.795644548891534, + "learning_rate": 1.1404156453528367e-06, + "loss": 0.1144, + "step": 843 + }, + { + "epoch": 0.1918181818181818, + "grad_norm": 2.8182678499370364, + "learning_rate": 1.1401631060306921e-06, + "loss": 0.0598, + "step": 844 + }, + { + "epoch": 0.19204545454545455, + "grad_norm": 2.6353881625700155, + "learning_rate": 1.1399103040816963e-06, + "loss": 0.0938, + "step": 845 + }, + { + "epoch": 0.19227272727272726, + "grad_norm": 3.6059390754335956, + "learning_rate": 1.1396572396347257e-06, + "loss": 0.162, + "step": 846 + }, + { + "epoch": 0.1925, + "grad_norm": 2.8364878586325517, + "learning_rate": 1.1394039128187914e-06, + "loss": 0.1038, + "step": 847 + }, + { + "epoch": 0.19272727272727272, + "grad_norm": 3.6802409394806546, + "learning_rate": 1.1391503237630375e-06, + "loss": 0.1348, + "step": 848 + }, + { + "epoch": 0.19295454545454546, + "grad_norm": 1.838195768985246, + "learning_rate": 1.1388964725967423e-06, + "loss": 0.0659, + "step": 849 + }, + { + "epoch": 0.19318181818181818, + "grad_norm": 2.1533771310840946, + "learning_rate": 1.138642359449317e-06, + "loss": 0.0548, + "step": 850 + }, + { + "epoch": 0.19340909090909092, + "grad_norm": 2.1550316935790157, + "learning_rate": 1.1383879844503073e-06, + "loss": 0.0543, + "step": 851 + }, + { + "epoch": 0.19363636363636363, + "grad_norm": 3.316083219459905, + "learning_rate": 1.1381333477293918e-06, + "loss": 0.1225, + "step": 852 + }, + { + "epoch": 0.19386363636363638, + "grad_norm": 2.2512138983381855, + "learning_rate": 1.137878449416383e-06, + "loss": 0.074, + "step": 853 + }, + { + "epoch": 0.1940909090909091, + "grad_norm": 1.960701390128428, + "learning_rate": 1.137623289641226e-06, + "loss": 0.0941, + "step": 854 + }, + { + "epoch": 0.1943181818181818, + "grad_norm": 3.3429001180943536, + "learning_rate": 1.1373678685339994e-06, + "loss": 0.1151, + "step": 855 + }, + { + "epoch": 0.19454545454545455, + "grad_norm": 1.4766082260499962, + "learning_rate": 1.137112186224916e-06, + "loss": 0.0827, + "step": 856 + }, + { + "epoch": 0.19477272727272726, + "grad_norm": 5.0914055375258584, + "learning_rate": 1.1368562428443205e-06, + "loss": 0.1422, + "step": 857 + }, + { + "epoch": 0.195, + "grad_norm": 3.6649189418484878, + "learning_rate": 1.1366000385226913e-06, + "loss": 0.0883, + "step": 858 + }, + { + "epoch": 0.19522727272727272, + "grad_norm": 3.0223492961070133, + "learning_rate": 1.1363435733906398e-06, + "loss": 0.0727, + "step": 859 + }, + { + "epoch": 0.19545454545454546, + "grad_norm": 4.775911421467254, + "learning_rate": 1.13608684757891e-06, + "loss": 0.1403, + "step": 860 + }, + { + "epoch": 0.19568181818181818, + "grad_norm": 1.8306464787393781, + "learning_rate": 1.1358298612183793e-06, + "loss": 0.0519, + "step": 861 + }, + { + "epoch": 0.19590909090909092, + "grad_norm": 2.560261151483981, + "learning_rate": 1.135572614440058e-06, + "loss": 0.1326, + "step": 862 + }, + { + "epoch": 0.19613636363636364, + "grad_norm": 4.361080293809697, + "learning_rate": 1.1353151073750882e-06, + "loss": 0.1122, + "step": 863 + }, + { + "epoch": 0.19636363636363635, + "grad_norm": 5.780141713415176, + "learning_rate": 1.1350573401547457e-06, + "loss": 0.1673, + "step": 864 + }, + { + "epoch": 0.1965909090909091, + "grad_norm": 5.839517715657577, + "learning_rate": 1.1347993129104386e-06, + "loss": 0.1308, + "step": 865 + }, + { + "epoch": 0.1968181818181818, + "grad_norm": 5.86249892283722, + "learning_rate": 1.1345410257737078e-06, + "loss": 0.1348, + "step": 866 + }, + { + "epoch": 0.19704545454545455, + "grad_norm": 2.4773347048259873, + "learning_rate": 1.1342824788762258e-06, + "loss": 0.086, + "step": 867 + }, + { + "epoch": 0.19727272727272727, + "grad_norm": 2.1062215228256287, + "learning_rate": 1.1340236723497985e-06, + "loss": 0.0828, + "step": 868 + }, + { + "epoch": 0.1975, + "grad_norm": 2.322401525588964, + "learning_rate": 1.133764606326364e-06, + "loss": 0.0634, + "step": 869 + }, + { + "epoch": 0.19772727272727272, + "grad_norm": 3.0304941191038757, + "learning_rate": 1.1335052809379921e-06, + "loss": 0.0856, + "step": 870 + }, + { + "epoch": 0.19795454545454547, + "grad_norm": 2.826869993927377, + "learning_rate": 1.1332456963168854e-06, + "loss": 0.1027, + "step": 871 + }, + { + "epoch": 0.19818181818181818, + "grad_norm": 3.894930692754439, + "learning_rate": 1.1329858525953785e-06, + "loss": 0.1637, + "step": 872 + }, + { + "epoch": 0.1984090909090909, + "grad_norm": 3.554701139789984, + "learning_rate": 1.132725749905938e-06, + "loss": 0.0967, + "step": 873 + }, + { + "epoch": 0.19863636363636364, + "grad_norm": 4.171259007668478, + "learning_rate": 1.132465388381163e-06, + "loss": 0.1455, + "step": 874 + }, + { + "epoch": 0.19886363636363635, + "grad_norm": 3.9600508087933113, + "learning_rate": 1.1322047681537834e-06, + "loss": 0.0719, + "step": 875 + }, + { + "epoch": 0.1990909090909091, + "grad_norm": 5.976129345335904, + "learning_rate": 1.131943889356662e-06, + "loss": 0.1777, + "step": 876 + }, + { + "epoch": 0.1993181818181818, + "grad_norm": 2.275263782788514, + "learning_rate": 1.1316827521227935e-06, + "loss": 0.0681, + "step": 877 + }, + { + "epoch": 0.19954545454545455, + "grad_norm": 3.5336963821913736, + "learning_rate": 1.1314213565853036e-06, + "loss": 0.0928, + "step": 878 + }, + { + "epoch": 0.19977272727272727, + "grad_norm": 3.3253983060704693, + "learning_rate": 1.1311597028774503e-06, + "loss": 0.1077, + "step": 879 + }, + { + "epoch": 0.2, + "grad_norm": 3.408373012497816, + "learning_rate": 1.1308977911326229e-06, + "loss": 0.1048, + "step": 880 + }, + { + "epoch": 0.20022727272727273, + "grad_norm": 1.1404142454775106, + "learning_rate": 1.1306356214843423e-06, + "loss": 0.0502, + "step": 881 + }, + { + "epoch": 0.20045454545454544, + "grad_norm": 2.7407103948086107, + "learning_rate": 1.1303731940662608e-06, + "loss": 0.0768, + "step": 882 + }, + { + "epoch": 0.20068181818181818, + "grad_norm": 2.373689445874234, + "learning_rate": 1.1301105090121624e-06, + "loss": 0.1118, + "step": 883 + }, + { + "epoch": 0.2009090909090909, + "grad_norm": 2.523364804875991, + "learning_rate": 1.1298475664559622e-06, + "loss": 0.0987, + "step": 884 + }, + { + "epoch": 0.20113636363636364, + "grad_norm": 2.917457869309103, + "learning_rate": 1.1295843665317067e-06, + "loss": 0.0812, + "step": 885 + }, + { + "epoch": 0.20136363636363636, + "grad_norm": 2.91517100930304, + "learning_rate": 1.1293209093735732e-06, + "loss": 0.0878, + "step": 886 + }, + { + "epoch": 0.2015909090909091, + "grad_norm": 3.307951197493292, + "learning_rate": 1.129057195115871e-06, + "loss": 0.0955, + "step": 887 + }, + { + "epoch": 0.2018181818181818, + "grad_norm": 3.2338834501345723, + "learning_rate": 1.1287932238930397e-06, + "loss": 0.095, + "step": 888 + }, + { + "epoch": 0.20204545454545456, + "grad_norm": 2.0160635713962636, + "learning_rate": 1.12852899583965e-06, + "loss": 0.0944, + "step": 889 + }, + { + "epoch": 0.20227272727272727, + "grad_norm": 2.0417187168564435, + "learning_rate": 1.1282645110904036e-06, + "loss": 0.0848, + "step": 890 + }, + { + "epoch": 0.2025, + "grad_norm": 2.068173970557142, + "learning_rate": 1.1279997697801334e-06, + "loss": 0.0877, + "step": 891 + }, + { + "epoch": 0.20272727272727273, + "grad_norm": 1.5156010004576486, + "learning_rate": 1.1277347720438028e-06, + "loss": 0.0769, + "step": 892 + }, + { + "epoch": 0.20295454545454544, + "grad_norm": 2.080558521114398, + "learning_rate": 1.127469518016506e-06, + "loss": 0.0711, + "step": 893 + }, + { + "epoch": 0.20318181818181819, + "grad_norm": 2.7548962731781175, + "learning_rate": 1.1272040078334675e-06, + "loss": 0.1318, + "step": 894 + }, + { + "epoch": 0.2034090909090909, + "grad_norm": 2.166531975684643, + "learning_rate": 1.1269382416300429e-06, + "loss": 0.0744, + "step": 895 + }, + { + "epoch": 0.20363636363636364, + "grad_norm": 4.859934246879939, + "learning_rate": 1.126672219541718e-06, + "loss": 0.1879, + "step": 896 + }, + { + "epoch": 0.20386363636363636, + "grad_norm": 3.165261085320356, + "learning_rate": 1.1264059417041095e-06, + "loss": 0.1116, + "step": 897 + }, + { + "epoch": 0.2040909090909091, + "grad_norm": 2.581609984811572, + "learning_rate": 1.1261394082529638e-06, + "loss": 0.0747, + "step": 898 + }, + { + "epoch": 0.20431818181818182, + "grad_norm": 7.595407532118096, + "learning_rate": 1.1258726193241578e-06, + "loss": 0.1331, + "step": 899 + }, + { + "epoch": 0.20454545454545456, + "grad_norm": 2.460239281518681, + "learning_rate": 1.1256055750536992e-06, + "loss": 0.0569, + "step": 900 + }, + { + "epoch": 0.20477272727272727, + "grad_norm": 3.9890344248979335, + "learning_rate": 1.1253382755777253e-06, + "loss": 0.1031, + "step": 901 + }, + { + "epoch": 0.205, + "grad_norm": 2.303899054124794, + "learning_rate": 1.1250707210325036e-06, + "loss": 0.1064, + "step": 902 + }, + { + "epoch": 0.20522727272727273, + "grad_norm": 3.7644569912194763, + "learning_rate": 1.1248029115544319e-06, + "loss": 0.1531, + "step": 903 + }, + { + "epoch": 0.20545454545454545, + "grad_norm": 2.361898802494317, + "learning_rate": 1.1245348472800372e-06, + "loss": 0.1254, + "step": 904 + }, + { + "epoch": 0.2056818181818182, + "grad_norm": 3.2077710240271395, + "learning_rate": 1.1242665283459775e-06, + "loss": 0.1178, + "step": 905 + }, + { + "epoch": 0.2059090909090909, + "grad_norm": 4.5728122439497465, + "learning_rate": 1.1239979548890398e-06, + "loss": 0.1427, + "step": 906 + }, + { + "epoch": 0.20613636363636365, + "grad_norm": 2.143135780195684, + "learning_rate": 1.1237291270461413e-06, + "loss": 0.0906, + "step": 907 + }, + { + "epoch": 0.20636363636363636, + "grad_norm": 2.377923485082702, + "learning_rate": 1.1234600449543288e-06, + "loss": 0.0928, + "step": 908 + }, + { + "epoch": 0.2065909090909091, + "grad_norm": 3.95466741020973, + "learning_rate": 1.123190708750778e-06, + "loss": 0.1844, + "step": 909 + }, + { + "epoch": 0.20681818181818182, + "grad_norm": 3.6946488094845544, + "learning_rate": 1.1229211185727957e-06, + "loss": 0.1023, + "step": 910 + }, + { + "epoch": 0.20704545454545453, + "grad_norm": 4.3259911216107705, + "learning_rate": 1.1226512745578166e-06, + "loss": 0.061, + "step": 911 + }, + { + "epoch": 0.20727272727272728, + "grad_norm": 4.401604638829085, + "learning_rate": 1.1223811768434057e-06, + "loss": 0.1589, + "step": 912 + }, + { + "epoch": 0.2075, + "grad_norm": 1.6671112651180322, + "learning_rate": 1.122110825567257e-06, + "loss": 0.0733, + "step": 913 + }, + { + "epoch": 0.20772727272727273, + "grad_norm": 3.6022673624665527, + "learning_rate": 1.1218402208671938e-06, + "loss": 0.0915, + "step": 914 + }, + { + "epoch": 0.20795454545454545, + "grad_norm": 3.7529649858485694, + "learning_rate": 1.1215693628811688e-06, + "loss": 0.1332, + "step": 915 + }, + { + "epoch": 0.2081818181818182, + "grad_norm": 3.4313784464106734, + "learning_rate": 1.1212982517472636e-06, + "loss": 0.1177, + "step": 916 + }, + { + "epoch": 0.2084090909090909, + "grad_norm": 3.204234481204852, + "learning_rate": 1.1210268876036888e-06, + "loss": 0.071, + "step": 917 + }, + { + "epoch": 0.20863636363636365, + "grad_norm": 5.550411836893469, + "learning_rate": 1.1207552705887841e-06, + "loss": 0.1424, + "step": 918 + }, + { + "epoch": 0.20886363636363636, + "grad_norm": 3.796166977580573, + "learning_rate": 1.1204834008410184e-06, + "loss": 0.1275, + "step": 919 + }, + { + "epoch": 0.20909090909090908, + "grad_norm": 4.448753427467119, + "learning_rate": 1.1202112784989891e-06, + "loss": 0.1215, + "step": 920 + }, + { + "epoch": 0.20931818181818182, + "grad_norm": 3.603697902104071, + "learning_rate": 1.1199389037014221e-06, + "loss": 0.1349, + "step": 921 + }, + { + "epoch": 0.20954545454545453, + "grad_norm": 2.6656067554834175, + "learning_rate": 1.1196662765871725e-06, + "loss": 0.0758, + "step": 922 + }, + { + "epoch": 0.20977272727272728, + "grad_norm": 2.522427090552048, + "learning_rate": 1.119393397295224e-06, + "loss": 0.1295, + "step": 923 + }, + { + "epoch": 0.21, + "grad_norm": 2.5500355457890977, + "learning_rate": 1.1191202659646883e-06, + "loss": 0.1163, + "step": 924 + }, + { + "epoch": 0.21022727272727273, + "grad_norm": 3.314367017367459, + "learning_rate": 1.1188468827348066e-06, + "loss": 0.2032, + "step": 925 + }, + { + "epoch": 0.21045454545454545, + "grad_norm": 3.1745714374101732, + "learning_rate": 1.1185732477449475e-06, + "loss": 0.0621, + "step": 926 + }, + { + "epoch": 0.2106818181818182, + "grad_norm": 2.6629434499497235, + "learning_rate": 1.1182993611346084e-06, + "loss": 0.0951, + "step": 927 + }, + { + "epoch": 0.2109090909090909, + "grad_norm": 2.109660815852733, + "learning_rate": 1.1180252230434151e-06, + "loss": 0.096, + "step": 928 + }, + { + "epoch": 0.21113636363636365, + "grad_norm": 2.3764709838216462, + "learning_rate": 1.1177508336111215e-06, + "loss": 0.081, + "step": 929 + }, + { + "epoch": 0.21136363636363636, + "grad_norm": 4.423531101074121, + "learning_rate": 1.1174761929776094e-06, + "loss": 0.1349, + "step": 930 + }, + { + "epoch": 0.21159090909090908, + "grad_norm": 6.046686138198189, + "learning_rate": 1.1172013012828887e-06, + "loss": 0.214, + "step": 931 + }, + { + "epoch": 0.21181818181818182, + "grad_norm": 3.1481418952455025, + "learning_rate": 1.1169261586670976e-06, + "loss": 0.0763, + "step": 932 + }, + { + "epoch": 0.21204545454545454, + "grad_norm": 2.1896468943616814, + "learning_rate": 1.1166507652705021e-06, + "loss": 0.0732, + "step": 933 + }, + { + "epoch": 0.21227272727272728, + "grad_norm": 1.612292051343508, + "learning_rate": 1.1163751212334962e-06, + "loss": 0.0627, + "step": 934 + }, + { + "epoch": 0.2125, + "grad_norm": 3.317046111551942, + "learning_rate": 1.116099226696601e-06, + "loss": 0.086, + "step": 935 + }, + { + "epoch": 0.21272727272727274, + "grad_norm": 2.4966447432976633, + "learning_rate": 1.1158230818004656e-06, + "loss": 0.1291, + "step": 936 + }, + { + "epoch": 0.21295454545454545, + "grad_norm": 2.6833283960400474, + "learning_rate": 1.1155466866858677e-06, + "loss": 0.0992, + "step": 937 + }, + { + "epoch": 0.2131818181818182, + "grad_norm": 2.679797743777146, + "learning_rate": 1.1152700414937111e-06, + "loss": 0.1006, + "step": 938 + }, + { + "epoch": 0.2134090909090909, + "grad_norm": 2.4526717287017776, + "learning_rate": 1.114993146365028e-06, + "loss": 0.1083, + "step": 939 + }, + { + "epoch": 0.21363636363636362, + "grad_norm": 4.997802297801097, + "learning_rate": 1.1147160014409779e-06, + "loss": 0.148, + "step": 940 + }, + { + "epoch": 0.21386363636363637, + "grad_norm": 3.631739963959465, + "learning_rate": 1.1144386068628472e-06, + "loss": 0.0986, + "step": 941 + }, + { + "epoch": 0.21409090909090908, + "grad_norm": 4.673512937040969, + "learning_rate": 1.1141609627720501e-06, + "loss": 0.122, + "step": 942 + }, + { + "epoch": 0.21431818181818182, + "grad_norm": 6.356064996925455, + "learning_rate": 1.1138830693101277e-06, + "loss": 0.1312, + "step": 943 + }, + { + "epoch": 0.21454545454545454, + "grad_norm": 2.247979228235356, + "learning_rate": 1.1136049266187481e-06, + "loss": 0.0574, + "step": 944 + }, + { + "epoch": 0.21477272727272728, + "grad_norm": 2.4398860326798513, + "learning_rate": 1.113326534839707e-06, + "loss": 0.0656, + "step": 945 + }, + { + "epoch": 0.215, + "grad_norm": 2.944609555841561, + "learning_rate": 1.1130478941149268e-06, + "loss": 0.1414, + "step": 946 + }, + { + "epoch": 0.21522727272727274, + "grad_norm": 1.8780586828444517, + "learning_rate": 1.1127690045864561e-06, + "loss": 0.0642, + "step": 947 + }, + { + "epoch": 0.21545454545454545, + "grad_norm": 2.0624939432771487, + "learning_rate": 1.1124898663964718e-06, + "loss": 0.0558, + "step": 948 + }, + { + "epoch": 0.21568181818181817, + "grad_norm": 2.047645031880203, + "learning_rate": 1.1122104796872763e-06, + "loss": 0.0826, + "step": 949 + }, + { + "epoch": 0.2159090909090909, + "grad_norm": 2.4564226331161354, + "learning_rate": 1.1119308446012993e-06, + "loss": 0.1094, + "step": 950 + }, + { + "epoch": 0.21613636363636363, + "grad_norm": 2.9508930203500405, + "learning_rate": 1.111650961281097e-06, + "loss": 0.0956, + "step": 951 + }, + { + "epoch": 0.21636363636363637, + "grad_norm": 2.584735875741647, + "learning_rate": 1.111370829869352e-06, + "loss": 0.0672, + "step": 952 + }, + { + "epoch": 0.21659090909090908, + "grad_norm": 2.3544927742886177, + "learning_rate": 1.1110904505088738e-06, + "loss": 0.1015, + "step": 953 + }, + { + "epoch": 0.21681818181818183, + "grad_norm": 2.0923619070138004, + "learning_rate": 1.1108098233425977e-06, + "loss": 0.0789, + "step": 954 + }, + { + "epoch": 0.21704545454545454, + "grad_norm": 2.827344856451054, + "learning_rate": 1.1105289485135855e-06, + "loss": 0.1264, + "step": 955 + }, + { + "epoch": 0.21727272727272728, + "grad_norm": 2.7625200370114693, + "learning_rate": 1.1102478261650258e-06, + "loss": 0.1385, + "step": 956 + }, + { + "epoch": 0.2175, + "grad_norm": 4.273223319040278, + "learning_rate": 1.1099664564402327e-06, + "loss": 0.093, + "step": 957 + }, + { + "epoch": 0.2177272727272727, + "grad_norm": 3.957015148241719, + "learning_rate": 1.1096848394826467e-06, + "loss": 0.0937, + "step": 958 + }, + { + "epoch": 0.21795454545454546, + "grad_norm": 2.6039014203138406, + "learning_rate": 1.1094029754358343e-06, + "loss": 0.0893, + "step": 959 + }, + { + "epoch": 0.21818181818181817, + "grad_norm": 4.337016858549236, + "learning_rate": 1.1091208644434882e-06, + "loss": 0.0853, + "step": 960 + }, + { + "epoch": 0.2184090909090909, + "grad_norm": 2.7783412449963016, + "learning_rate": 1.1088385066494267e-06, + "loss": 0.1019, + "step": 961 + }, + { + "epoch": 0.21863636363636363, + "grad_norm": 4.567262705167418, + "learning_rate": 1.1085559021975937e-06, + "loss": 0.162, + "step": 962 + }, + { + "epoch": 0.21886363636363637, + "grad_norm": 2.7701678941373875, + "learning_rate": 1.1082730512320597e-06, + "loss": 0.0924, + "step": 963 + }, + { + "epoch": 0.2190909090909091, + "grad_norm": 4.266085656521103, + "learning_rate": 1.1079899538970196e-06, + "loss": 0.2061, + "step": 964 + }, + { + "epoch": 0.21931818181818183, + "grad_norm": 4.47817591540772, + "learning_rate": 1.1077066103367954e-06, + "loss": 0.1175, + "step": 965 + }, + { + "epoch": 0.21954545454545454, + "grad_norm": 3.883728443912027, + "learning_rate": 1.1074230206958332e-06, + "loss": 0.1186, + "step": 966 + }, + { + "epoch": 0.2197727272727273, + "grad_norm": 3.4387024504468067, + "learning_rate": 1.1071391851187052e-06, + "loss": 0.1316, + "step": 967 + }, + { + "epoch": 0.22, + "grad_norm": 4.109298772287061, + "learning_rate": 1.1068551037501093e-06, + "loss": 0.2123, + "step": 968 + }, + { + "epoch": 0.22022727272727272, + "grad_norm": 7.9804556432768585, + "learning_rate": 1.1065707767348684e-06, + "loss": 0.2646, + "step": 969 + }, + { + "epoch": 0.22045454545454546, + "grad_norm": 3.2426825766170913, + "learning_rate": 1.1062862042179302e-06, + "loss": 0.0921, + "step": 970 + }, + { + "epoch": 0.22068181818181817, + "grad_norm": 2.654212119535768, + "learning_rate": 1.1060013863443683e-06, + "loss": 0.0832, + "step": 971 + }, + { + "epoch": 0.22090909090909092, + "grad_norm": 2.3292926977665838, + "learning_rate": 1.1057163232593808e-06, + "loss": 0.0583, + "step": 972 + }, + { + "epoch": 0.22113636363636363, + "grad_norm": 7.754932245788428, + "learning_rate": 1.1054310151082913e-06, + "loss": 0.3362, + "step": 973 + }, + { + "epoch": 0.22136363636363637, + "grad_norm": 1.848455410558129, + "learning_rate": 1.1051454620365475e-06, + "loss": 0.0732, + "step": 974 + }, + { + "epoch": 0.2215909090909091, + "grad_norm": 4.175374511241659, + "learning_rate": 1.1048596641897233e-06, + "loss": 0.1147, + "step": 975 + }, + { + "epoch": 0.22181818181818183, + "grad_norm": 2.775054721276452, + "learning_rate": 1.104573621713516e-06, + "loss": 0.2448, + "step": 976 + }, + { + "epoch": 0.22204545454545455, + "grad_norm": 3.0619541038648164, + "learning_rate": 1.1042873347537485e-06, + "loss": 0.0676, + "step": 977 + }, + { + "epoch": 0.22227272727272726, + "grad_norm": 2.3963200464235466, + "learning_rate": 1.1040008034563682e-06, + "loss": 0.0595, + "step": 978 + }, + { + "epoch": 0.2225, + "grad_norm": 2.724124197152781, + "learning_rate": 1.1037140279674468e-06, + "loss": 0.0919, + "step": 979 + }, + { + "epoch": 0.22272727272727272, + "grad_norm": 2.2618149150379936, + "learning_rate": 1.1034270084331803e-06, + "loss": 0.0934, + "step": 980 + }, + { + "epoch": 0.22295454545454546, + "grad_norm": 2.4538484431914247, + "learning_rate": 1.1031397449998896e-06, + "loss": 0.1762, + "step": 981 + }, + { + "epoch": 0.22318181818181818, + "grad_norm": 3.1096558602754, + "learning_rate": 1.10285223781402e-06, + "loss": 0.0903, + "step": 982 + }, + { + "epoch": 0.22340909090909092, + "grad_norm": 2.6710422364710147, + "learning_rate": 1.1025644870221405e-06, + "loss": 0.1155, + "step": 983 + }, + { + "epoch": 0.22363636363636363, + "grad_norm": 4.404314169230577, + "learning_rate": 1.1022764927709447e-06, + "loss": 0.0618, + "step": 984 + }, + { + "epoch": 0.22386363636363638, + "grad_norm": 3.1080682317581116, + "learning_rate": 1.1019882552072502e-06, + "loss": 0.1129, + "step": 985 + }, + { + "epoch": 0.2240909090909091, + "grad_norm": 3.4982391770158126, + "learning_rate": 1.101699774477999e-06, + "loss": 0.0943, + "step": 986 + }, + { + "epoch": 0.2243181818181818, + "grad_norm": 3.7455969246134586, + "learning_rate": 1.1014110507302563e-06, + "loss": 0.0829, + "step": 987 + }, + { + "epoch": 0.22454545454545455, + "grad_norm": 3.089772830261481, + "learning_rate": 1.1011220841112118e-06, + "loss": 0.0916, + "step": 988 + }, + { + "epoch": 0.22477272727272726, + "grad_norm": 3.685996926681144, + "learning_rate": 1.1008328747681788e-06, + "loss": 0.125, + "step": 989 + }, + { + "epoch": 0.225, + "grad_norm": 5.5029006850618245, + "learning_rate": 1.1005434228485945e-06, + "loss": 0.1287, + "step": 990 + }, + { + "epoch": 0.22522727272727272, + "grad_norm": 2.503227367309242, + "learning_rate": 1.1002537285000196e-06, + "loss": 0.074, + "step": 991 + }, + { + "epoch": 0.22545454545454546, + "grad_norm": 3.663678384190874, + "learning_rate": 1.099963791870138e-06, + "loss": 0.1679, + "step": 992 + }, + { + "epoch": 0.22568181818181818, + "grad_norm": 3.6473105532007906, + "learning_rate": 1.099673613106758e-06, + "loss": 0.1851, + "step": 993 + }, + { + "epoch": 0.22590909090909092, + "grad_norm": 3.376502978870639, + "learning_rate": 1.0993831923578107e-06, + "loss": 0.1261, + "step": 994 + }, + { + "epoch": 0.22613636363636364, + "grad_norm": 2.050111521397145, + "learning_rate": 1.099092529771351e-06, + "loss": 0.1158, + "step": 995 + }, + { + "epoch": 0.22636363636363635, + "grad_norm": 3.524690286226471, + "learning_rate": 1.0988016254955565e-06, + "loss": 0.079, + "step": 996 + }, + { + "epoch": 0.2265909090909091, + "grad_norm": 3.5070904166489933, + "learning_rate": 1.0985104796787285e-06, + "loss": 0.0929, + "step": 997 + }, + { + "epoch": 0.2268181818181818, + "grad_norm": 4.456391490112464, + "learning_rate": 1.0982190924692911e-06, + "loss": 0.1171, + "step": 998 + }, + { + "epoch": 0.22704545454545455, + "grad_norm": 2.9034448771170953, + "learning_rate": 1.0979274640157919e-06, + "loss": 0.0891, + "step": 999 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 2.317303423206001, + "learning_rate": 1.097635594466901e-06, + "loss": 0.0746, + "step": 1000 + }, + { + "epoch": 0.2275, + "grad_norm": 2.3898207317284226, + "learning_rate": 1.0973434839714116e-06, + "loss": 0.1337, + "step": 1001 + }, + { + "epoch": 0.22772727272727272, + "grad_norm": 4.5542207241038195, + "learning_rate": 1.0970511326782398e-06, + "loss": 0.1, + "step": 1002 + }, + { + "epoch": 0.22795454545454547, + "grad_norm": 2.5962593883287775, + "learning_rate": 1.0967585407364246e-06, + "loss": 0.1203, + "step": 1003 + }, + { + "epoch": 0.22818181818181818, + "grad_norm": 1.4589261347771723, + "learning_rate": 1.0964657082951274e-06, + "loss": 0.0855, + "step": 1004 + }, + { + "epoch": 0.22840909090909092, + "grad_norm": 4.005718643782005, + "learning_rate": 1.0961726355036324e-06, + "loss": 0.1141, + "step": 1005 + }, + { + "epoch": 0.22863636363636364, + "grad_norm": 3.6227520455795013, + "learning_rate": 1.0958793225113459e-06, + "loss": 0.2075, + "step": 1006 + }, + { + "epoch": 0.22886363636363635, + "grad_norm": 3.352441562049467, + "learning_rate": 1.0955857694677971e-06, + "loss": 0.1172, + "step": 1007 + }, + { + "epoch": 0.2290909090909091, + "grad_norm": 3.5771013991275, + "learning_rate": 1.0952919765226378e-06, + "loss": 0.1096, + "step": 1008 + }, + { + "epoch": 0.2293181818181818, + "grad_norm": 2.820476737214537, + "learning_rate": 1.0949979438256415e-06, + "loss": 0.1502, + "step": 1009 + }, + { + "epoch": 0.22954545454545455, + "grad_norm": 3.422637080217908, + "learning_rate": 1.0947036715267039e-06, + "loss": 0.079, + "step": 1010 + }, + { + "epoch": 0.22977272727272727, + "grad_norm": 2.7288425802149234, + "learning_rate": 1.0944091597758438e-06, + "loss": 0.087, + "step": 1011 + }, + { + "epoch": 0.23, + "grad_norm": 2.431064862153476, + "learning_rate": 1.0941144087232008e-06, + "loss": 0.1365, + "step": 1012 + }, + { + "epoch": 0.23022727272727272, + "grad_norm": 3.5215545806808373, + "learning_rate": 1.0938194185190374e-06, + "loss": 0.0754, + "step": 1013 + }, + { + "epoch": 0.23045454545454547, + "grad_norm": 2.44728324136204, + "learning_rate": 1.0935241893137376e-06, + "loss": 0.0632, + "step": 1014 + }, + { + "epoch": 0.23068181818181818, + "grad_norm": 5.0499786599015835, + "learning_rate": 1.0932287212578075e-06, + "loss": 0.1379, + "step": 1015 + }, + { + "epoch": 0.2309090909090909, + "grad_norm": 3.5265344835599852, + "learning_rate": 1.0929330145018747e-06, + "loss": 0.0807, + "step": 1016 + }, + { + "epoch": 0.23113636363636364, + "grad_norm": 3.177449426677741, + "learning_rate": 1.0926370691966883e-06, + "loss": 0.1184, + "step": 1017 + }, + { + "epoch": 0.23136363636363635, + "grad_norm": 2.799778091924247, + "learning_rate": 1.0923408854931202e-06, + "loss": 0.0815, + "step": 1018 + }, + { + "epoch": 0.2315909090909091, + "grad_norm": 3.5022357220937517, + "learning_rate": 1.0920444635421622e-06, + "loss": 0.1677, + "step": 1019 + }, + { + "epoch": 0.2318181818181818, + "grad_norm": 2.409623073624271, + "learning_rate": 1.0917478034949285e-06, + "loss": 0.0712, + "step": 1020 + }, + { + "epoch": 0.23204545454545455, + "grad_norm": 2.825568763157153, + "learning_rate": 1.0914509055026545e-06, + "loss": 0.0888, + "step": 1021 + }, + { + "epoch": 0.23227272727272727, + "grad_norm": 2.446935650086231, + "learning_rate": 1.0911537697166967e-06, + "loss": 0.0814, + "step": 1022 + }, + { + "epoch": 0.2325, + "grad_norm": 3.209321781263389, + "learning_rate": 1.0908563962885337e-06, + "loss": 0.1928, + "step": 1023 + }, + { + "epoch": 0.23272727272727273, + "grad_norm": 3.1835675972038335, + "learning_rate": 1.0905587853697638e-06, + "loss": 0.1046, + "step": 1024 + }, + { + "epoch": 0.23295454545454544, + "grad_norm": 3.250106120972125, + "learning_rate": 1.0902609371121076e-06, + "loss": 0.112, + "step": 1025 + }, + { + "epoch": 0.23318181818181818, + "grad_norm": 2.5655596038091613, + "learning_rate": 1.089962851667406e-06, + "loss": 0.1314, + "step": 1026 + }, + { + "epoch": 0.2334090909090909, + "grad_norm": 3.0928004192484386, + "learning_rate": 1.089664529187621e-06, + "loss": 0.0922, + "step": 1027 + }, + { + "epoch": 0.23363636363636364, + "grad_norm": 3.3630779647276983, + "learning_rate": 1.0893659698248358e-06, + "loss": 0.1259, + "step": 1028 + }, + { + "epoch": 0.23386363636363636, + "grad_norm": 3.0639726100068145, + "learning_rate": 1.0890671737312538e-06, + "loss": 0.1063, + "step": 1029 + }, + { + "epoch": 0.2340909090909091, + "grad_norm": 1.928986513851524, + "learning_rate": 1.0887681410591994e-06, + "loss": 0.0987, + "step": 1030 + }, + { + "epoch": 0.23431818181818181, + "grad_norm": 3.3290930983318745, + "learning_rate": 1.0884688719611176e-06, + "loss": 0.0727, + "step": 1031 + }, + { + "epoch": 0.23454545454545456, + "grad_norm": 2.340235596266924, + "learning_rate": 1.0881693665895737e-06, + "loss": 0.0935, + "step": 1032 + }, + { + "epoch": 0.23477272727272727, + "grad_norm": 2.6097010578731443, + "learning_rate": 1.0878696250972536e-06, + "loss": 0.0742, + "step": 1033 + }, + { + "epoch": 0.235, + "grad_norm": 2.361225734558857, + "learning_rate": 1.087569647636964e-06, + "loss": 0.0858, + "step": 1034 + }, + { + "epoch": 0.23522727272727273, + "grad_norm": 2.0453254421623206, + "learning_rate": 1.0872694343616312e-06, + "loss": 0.066, + "step": 1035 + }, + { + "epoch": 0.23545454545454544, + "grad_norm": 5.213027010474639, + "learning_rate": 1.0869689854243019e-06, + "loss": 0.2446, + "step": 1036 + }, + { + "epoch": 0.2356818181818182, + "grad_norm": 1.1104112070804562, + "learning_rate": 1.0866683009781432e-06, + "loss": 0.068, + "step": 1037 + }, + { + "epoch": 0.2359090909090909, + "grad_norm": 4.448468896953293, + "learning_rate": 1.0863673811764419e-06, + "loss": 0.1695, + "step": 1038 + }, + { + "epoch": 0.23613636363636364, + "grad_norm": 1.9053478319797195, + "learning_rate": 1.0860662261726054e-06, + "loss": 0.0442, + "step": 1039 + }, + { + "epoch": 0.23636363636363636, + "grad_norm": 2.3912744153621093, + "learning_rate": 1.0857648361201603e-06, + "loss": 0.1416, + "step": 1040 + }, + { + "epoch": 0.2365909090909091, + "grad_norm": 6.142496178131957, + "learning_rate": 1.085463211172753e-06, + "loss": 0.1945, + "step": 1041 + }, + { + "epoch": 0.23681818181818182, + "grad_norm": 3.4672080243967924, + "learning_rate": 1.0851613514841508e-06, + "loss": 0.111, + "step": 1042 + }, + { + "epoch": 0.23704545454545456, + "grad_norm": 2.639038515311767, + "learning_rate": 1.0848592572082391e-06, + "loss": 0.0829, + "step": 1043 + }, + { + "epoch": 0.23727272727272727, + "grad_norm": 2.2188111340714682, + "learning_rate": 1.084556928499024e-06, + "loss": 0.0577, + "step": 1044 + }, + { + "epoch": 0.2375, + "grad_norm": 3.3735275365083957, + "learning_rate": 1.0842543655106305e-06, + "loss": 0.1386, + "step": 1045 + }, + { + "epoch": 0.23772727272727273, + "grad_norm": 2.8783031695202452, + "learning_rate": 1.0839515683973035e-06, + "loss": 0.094, + "step": 1046 + }, + { + "epoch": 0.23795454545454545, + "grad_norm": 2.4681897745323265, + "learning_rate": 1.083648537313407e-06, + "loss": 0.0631, + "step": 1047 + }, + { + "epoch": 0.2381818181818182, + "grad_norm": 3.5409382848693745, + "learning_rate": 1.083345272413424e-06, + "loss": 0.1112, + "step": 1048 + }, + { + "epoch": 0.2384090909090909, + "grad_norm": 3.431382674528093, + "learning_rate": 1.0830417738519575e-06, + "loss": 0.168, + "step": 1049 + }, + { + "epoch": 0.23863636363636365, + "grad_norm": 2.623520023331604, + "learning_rate": 1.0827380417837287e-06, + "loss": 0.0954, + "step": 1050 + }, + { + "epoch": 0.23886363636363636, + "grad_norm": 2.6900299562450614, + "learning_rate": 1.0824340763635785e-06, + "loss": 0.0637, + "step": 1051 + }, + { + "epoch": 0.2390909090909091, + "grad_norm": 3.3988135284447307, + "learning_rate": 1.0821298777464665e-06, + "loss": 0.1049, + "step": 1052 + }, + { + "epoch": 0.23931818181818182, + "grad_norm": 1.4388463452619982, + "learning_rate": 1.081825446087471e-06, + "loss": 0.0443, + "step": 1053 + }, + { + "epoch": 0.23954545454545453, + "grad_norm": 4.453600232712395, + "learning_rate": 1.0815207815417894e-06, + "loss": 0.1187, + "step": 1054 + }, + { + "epoch": 0.23977272727272728, + "grad_norm": 2.813752236794351, + "learning_rate": 1.081215884264738e-06, + "loss": 0.0641, + "step": 1055 + }, + { + "epoch": 0.24, + "grad_norm": 3.5103600005398614, + "learning_rate": 1.0809107544117511e-06, + "loss": 0.1622, + "step": 1056 + }, + { + "epoch": 0.24022727272727273, + "grad_norm": 3.0614316400223434, + "learning_rate": 1.0806053921383823e-06, + "loss": 0.0805, + "step": 1057 + }, + { + "epoch": 0.24045454545454545, + "grad_norm": 4.599454155105708, + "learning_rate": 1.0802997976003031e-06, + "loss": 0.1113, + "step": 1058 + }, + { + "epoch": 0.2406818181818182, + "grad_norm": 2.946255517287608, + "learning_rate": 1.0799939709533036e-06, + "loss": 0.1171, + "step": 1059 + }, + { + "epoch": 0.2409090909090909, + "grad_norm": 2.313539654888859, + "learning_rate": 1.0796879123532924e-06, + "loss": 0.1487, + "step": 1060 + }, + { + "epoch": 0.24113636363636365, + "grad_norm": 2.5859760865279218, + "learning_rate": 1.0793816219562963e-06, + "loss": 0.0851, + "step": 1061 + }, + { + "epoch": 0.24136363636363636, + "grad_norm": 2.730372275838902, + "learning_rate": 1.0790750999184598e-06, + "loss": 0.1449, + "step": 1062 + }, + { + "epoch": 0.24159090909090908, + "grad_norm": 3.1106198042263378, + "learning_rate": 1.0787683463960462e-06, + "loss": 0.0985, + "step": 1063 + }, + { + "epoch": 0.24181818181818182, + "grad_norm": 2.4312389201931373, + "learning_rate": 1.0784613615454365e-06, + "loss": 0.0903, + "step": 1064 + }, + { + "epoch": 0.24204545454545454, + "grad_norm": 2.618279878946786, + "learning_rate": 1.0781541455231294e-06, + "loss": 0.1378, + "step": 1065 + }, + { + "epoch": 0.24227272727272728, + "grad_norm": 6.440838252004829, + "learning_rate": 1.077846698485742e-06, + "loss": 0.1975, + "step": 1066 + }, + { + "epoch": 0.2425, + "grad_norm": 2.8602768490988733, + "learning_rate": 1.0775390205900084e-06, + "loss": 0.0743, + "step": 1067 + }, + { + "epoch": 0.24272727272727274, + "grad_norm": 2.426755601088994, + "learning_rate": 1.0772311119927808e-06, + "loss": 0.0998, + "step": 1068 + }, + { + "epoch": 0.24295454545454545, + "grad_norm": 4.9494508884835, + "learning_rate": 1.0769229728510298e-06, + "loss": 0.1687, + "step": 1069 + }, + { + "epoch": 0.2431818181818182, + "grad_norm": 3.221663570718401, + "learning_rate": 1.0766146033218417e-06, + "loss": 0.2123, + "step": 1070 + }, + { + "epoch": 0.2434090909090909, + "grad_norm": 2.7965004443252273, + "learning_rate": 1.076306003562422e-06, + "loss": 0.1214, + "step": 1071 + }, + { + "epoch": 0.24363636363636362, + "grad_norm": 2.6322221145774916, + "learning_rate": 1.0759971737300928e-06, + "loss": 0.0976, + "step": 1072 + }, + { + "epoch": 0.24386363636363637, + "grad_norm": 2.6699556056360527, + "learning_rate": 1.0756881139822934e-06, + "loss": 0.0987, + "step": 1073 + }, + { + "epoch": 0.24409090909090908, + "grad_norm": 1.7075151884408495, + "learning_rate": 1.0753788244765805e-06, + "loss": 0.0704, + "step": 1074 + }, + { + "epoch": 0.24431818181818182, + "grad_norm": 3.713544106708803, + "learning_rate": 1.0750693053706282e-06, + "loss": 0.1216, + "step": 1075 + }, + { + "epoch": 0.24454545454545454, + "grad_norm": 2.839568732276037, + "learning_rate": 1.0747595568222268e-06, + "loss": 0.1676, + "step": 1076 + }, + { + "epoch": 0.24477272727272728, + "grad_norm": 5.436580193861712, + "learning_rate": 1.0744495789892848e-06, + "loss": 0.1536, + "step": 1077 + }, + { + "epoch": 0.245, + "grad_norm": 2.166952315452957, + "learning_rate": 1.0741393720298263e-06, + "loss": 0.0918, + "step": 1078 + }, + { + "epoch": 0.24522727272727274, + "grad_norm": 2.2368545994346896, + "learning_rate": 1.073828936101993e-06, + "loss": 0.1229, + "step": 1079 + }, + { + "epoch": 0.24545454545454545, + "grad_norm": 2.856030235398001, + "learning_rate": 1.0735182713640436e-06, + "loss": 0.1058, + "step": 1080 + }, + { + "epoch": 0.2456818181818182, + "grad_norm": 3.705020687317192, + "learning_rate": 1.0732073779743523e-06, + "loss": 0.1313, + "step": 1081 + }, + { + "epoch": 0.2459090909090909, + "grad_norm": 3.369565605292207, + "learning_rate": 1.0728962560914108e-06, + "loss": 0.1278, + "step": 1082 + }, + { + "epoch": 0.24613636363636363, + "grad_norm": 3.2753023462687723, + "learning_rate": 1.0725849058738274e-06, + "loss": 0.0791, + "step": 1083 + }, + { + "epoch": 0.24636363636363637, + "grad_norm": 2.679693942002681, + "learning_rate": 1.0722733274803261e-06, + "loss": 0.1241, + "step": 1084 + }, + { + "epoch": 0.24659090909090908, + "grad_norm": 2.3801136174936413, + "learning_rate": 1.0719615210697476e-06, + "loss": 0.0766, + "step": 1085 + }, + { + "epoch": 0.24681818181818183, + "grad_norm": 2.449329195243545, + "learning_rate": 1.0716494868010488e-06, + "loss": 0.0994, + "step": 1086 + }, + { + "epoch": 0.24704545454545454, + "grad_norm": 3.017857292121828, + "learning_rate": 1.071337224833303e-06, + "loss": 0.092, + "step": 1087 + }, + { + "epoch": 0.24727272727272728, + "grad_norm": 4.834907623969626, + "learning_rate": 1.0710247353256988e-06, + "loss": 0.1069, + "step": 1088 + }, + { + "epoch": 0.2475, + "grad_norm": 2.817122076216917, + "learning_rate": 1.0707120184375422e-06, + "loss": 0.0953, + "step": 1089 + }, + { + "epoch": 0.24772727272727274, + "grad_norm": 2.6444140932984186, + "learning_rate": 1.0703990743282534e-06, + "loss": 0.0778, + "step": 1090 + }, + { + "epoch": 0.24795454545454546, + "grad_norm": 3.0131058641087263, + "learning_rate": 1.07008590315737e-06, + "loss": 0.1316, + "step": 1091 + }, + { + "epoch": 0.24818181818181817, + "grad_norm": 2.650454667338594, + "learning_rate": 1.069772505084544e-06, + "loss": 0.1216, + "step": 1092 + }, + { + "epoch": 0.2484090909090909, + "grad_norm": 1.3364406391602255, + "learning_rate": 1.0694588802695443e-06, + "loss": 0.0565, + "step": 1093 + }, + { + "epoch": 0.24863636363636363, + "grad_norm": 3.0543730423049102, + "learning_rate": 1.0691450288722545e-06, + "loss": 0.1239, + "step": 1094 + }, + { + "epoch": 0.24886363636363637, + "grad_norm": 3.187648535671965, + "learning_rate": 1.0688309510526742e-06, + "loss": 0.0896, + "step": 1095 + }, + { + "epoch": 0.24909090909090909, + "grad_norm": 3.1151811182727296, + "learning_rate": 1.0685166469709181e-06, + "loss": 0.1363, + "step": 1096 + }, + { + "epoch": 0.24931818181818183, + "grad_norm": 2.882473555473901, + "learning_rate": 1.0682021167872166e-06, + "loss": 0.0681, + "step": 1097 + }, + { + "epoch": 0.24954545454545454, + "grad_norm": 4.8287176838478105, + "learning_rate": 1.0678873606619152e-06, + "loss": 0.2607, + "step": 1098 + }, + { + "epoch": 0.24977272727272729, + "grad_norm": 2.7638840844991512, + "learning_rate": 1.0675723787554743e-06, + "loss": 0.0877, + "step": 1099 + }, + { + "epoch": 0.25, + "grad_norm": 4.966417813914996, + "learning_rate": 1.0672571712284697e-06, + "loss": 0.1078, + "step": 1100 + } + ], + "logging_steps": 1.0, + "max_steps": 4400, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3406754807808.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}