diff --git "a/checkpoint-4000/trainer_state.json" "b/checkpoint-4000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-4000/trainer_state.json" @@ -0,0 +1,28034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.4461315979754157, + "eval_steps": 500, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00036153289949385393, + "grad_norm": 16.378230043688863, + "learning_rate": 0.0, + "loss": 2.7969, + "step": 1 + }, + { + "epoch": 0.0007230657989877079, + "grad_norm": 20.346007517382972, + "learning_rate": 3.6101083032490976e-08, + "loss": 3.3438, + "step": 2 + }, + { + "epoch": 0.0010845986984815619, + "grad_norm": 15.570725543802201, + "learning_rate": 7.220216606498195e-08, + "loss": 2.3281, + "step": 3 + }, + { + "epoch": 0.0014461315979754157, + "grad_norm": 19.614861247448776, + "learning_rate": 1.0830324909747293e-07, + "loss": 3.0469, + "step": 4 + }, + { + "epoch": 0.0018076644974692696, + "grad_norm": 14.07223001327079, + "learning_rate": 1.444043321299639e-07, + "loss": 2.8594, + "step": 5 + }, + { + "epoch": 0.0021691973969631237, + "grad_norm": 10.855103899835822, + "learning_rate": 1.805054151624549e-07, + "loss": 2.5938, + "step": 6 + }, + { + "epoch": 0.0025307302964569776, + "grad_norm": 19.615919281705462, + "learning_rate": 2.1660649819494586e-07, + "loss": 3.1094, + "step": 7 + }, + { + "epoch": 0.0028922631959508315, + "grad_norm": 16.15631512680797, + "learning_rate": 2.527075812274368e-07, + "loss": 2.9062, + "step": 8 + }, + { + "epoch": 0.0032537960954446853, + "grad_norm": 12.593512715910386, + "learning_rate": 2.888086642599278e-07, + "loss": 2.2344, + "step": 9 + }, + { + "epoch": 0.0036153289949385392, + "grad_norm": 11.932709361035554, + "learning_rate": 3.2490974729241875e-07, + "loss": 2.3594, + "step": 10 + }, + { + "epoch": 0.0039768618944323935, + "grad_norm": 12.934565956593698, + "learning_rate": 3.610108303249098e-07, + "loss": 2.3125, + "step": 11 + }, + { + "epoch": 0.004338394793926247, + "grad_norm": 9.558893454723844, + "learning_rate": 3.971119133574008e-07, + "loss": 2.5156, + "step": 12 + }, + { + "epoch": 0.004699927693420101, + "grad_norm": 12.719008162031765, + "learning_rate": 4.332129963898917e-07, + "loss": 2.2656, + "step": 13 + }, + { + "epoch": 0.005061460592913955, + "grad_norm": 17.141146114413075, + "learning_rate": 4.693140794223827e-07, + "loss": 2.6406, + "step": 14 + }, + { + "epoch": 0.005422993492407809, + "grad_norm": 13.558554817872825, + "learning_rate": 5.054151624548736e-07, + "loss": 2.4531, + "step": 15 + }, + { + "epoch": 0.005784526391901663, + "grad_norm": 14.151248934945002, + "learning_rate": 5.415162454873646e-07, + "loss": 2.9375, + "step": 16 + }, + { + "epoch": 0.006146059291395517, + "grad_norm": 19.75551537485207, + "learning_rate": 5.776173285198556e-07, + "loss": 3.25, + "step": 17 + }, + { + "epoch": 0.006507592190889371, + "grad_norm": 25.75515856935179, + "learning_rate": 6.137184115523466e-07, + "loss": 3.0938, + "step": 18 + }, + { + "epoch": 0.006869125090383225, + "grad_norm": 10.333406755191781, + "learning_rate": 6.498194945848375e-07, + "loss": 2.375, + "step": 19 + }, + { + "epoch": 0.0072306579898770785, + "grad_norm": 16.593935429043416, + "learning_rate": 6.859205776173286e-07, + "loss": 2.9844, + "step": 20 + }, + { + "epoch": 0.007592190889370932, + "grad_norm": 10.944654092395592, + "learning_rate": 7.220216606498196e-07, + "loss": 2.4062, + "step": 21 + }, + { + "epoch": 0.007953723788864787, + "grad_norm": 21.478534076268854, + "learning_rate": 7.581227436823105e-07, + "loss": 2.5625, + "step": 22 + }, + { + "epoch": 0.008315256688358641, + "grad_norm": 15.858202557779267, + "learning_rate": 7.942238267148016e-07, + "loss": 2.9375, + "step": 23 + }, + { + "epoch": 0.008676789587852495, + "grad_norm": 11.674940718521329, + "learning_rate": 8.303249097472924e-07, + "loss": 3.0, + "step": 24 + }, + { + "epoch": 0.009038322487346349, + "grad_norm": 21.40736685701313, + "learning_rate": 8.664259927797834e-07, + "loss": 3.1094, + "step": 25 + }, + { + "epoch": 0.009399855386840203, + "grad_norm": 15.138083726498593, + "learning_rate": 9.025270758122745e-07, + "loss": 2.6094, + "step": 26 + }, + { + "epoch": 0.009761388286334056, + "grad_norm": 15.517794313449986, + "learning_rate": 9.386281588447654e-07, + "loss": 2.7812, + "step": 27 + }, + { + "epoch": 0.01012292118582791, + "grad_norm": 17.821719343927366, + "learning_rate": 9.747292418772564e-07, + "loss": 3.3438, + "step": 28 + }, + { + "epoch": 0.010484454085321764, + "grad_norm": 18.187632969721424, + "learning_rate": 1.0108303249097473e-06, + "loss": 3.0, + "step": 29 + }, + { + "epoch": 0.010845986984815618, + "grad_norm": 18.455063260589476, + "learning_rate": 1.0469314079422384e-06, + "loss": 2.5625, + "step": 30 + }, + { + "epoch": 0.011207519884309472, + "grad_norm": 18.521440410819366, + "learning_rate": 1.0830324909747293e-06, + "loss": 2.7344, + "step": 31 + }, + { + "epoch": 0.011569052783803326, + "grad_norm": 18.76025247329995, + "learning_rate": 1.1191335740072204e-06, + "loss": 2.7656, + "step": 32 + }, + { + "epoch": 0.01193058568329718, + "grad_norm": 17.358748841418983, + "learning_rate": 1.1552346570397112e-06, + "loss": 2.6875, + "step": 33 + }, + { + "epoch": 0.012292118582791034, + "grad_norm": 18.66613255084333, + "learning_rate": 1.1913357400722023e-06, + "loss": 3.0625, + "step": 34 + }, + { + "epoch": 0.012653651482284888, + "grad_norm": 22.836443306980634, + "learning_rate": 1.2274368231046932e-06, + "loss": 3.1406, + "step": 35 + }, + { + "epoch": 0.013015184381778741, + "grad_norm": 16.943019995473996, + "learning_rate": 1.263537906137184e-06, + "loss": 3.0, + "step": 36 + }, + { + "epoch": 0.013376717281272595, + "grad_norm": 44.77080142354421, + "learning_rate": 1.299638989169675e-06, + "loss": 4.7188, + "step": 37 + }, + { + "epoch": 0.01373825018076645, + "grad_norm": 15.733465239818814, + "learning_rate": 1.3357400722021663e-06, + "loss": 2.6094, + "step": 38 + }, + { + "epoch": 0.014099783080260303, + "grad_norm": 17.49965684949101, + "learning_rate": 1.3718411552346572e-06, + "loss": 2.5781, + "step": 39 + }, + { + "epoch": 0.014461315979754157, + "grad_norm": 15.492691827770097, + "learning_rate": 1.407942238267148e-06, + "loss": 2.1562, + "step": 40 + }, + { + "epoch": 0.01482284887924801, + "grad_norm": 20.144161329542747, + "learning_rate": 1.4440433212996392e-06, + "loss": 2.8125, + "step": 41 + }, + { + "epoch": 0.015184381778741865, + "grad_norm": 14.548606563326674, + "learning_rate": 1.48014440433213e-06, + "loss": 2.1406, + "step": 42 + }, + { + "epoch": 0.01554591467823572, + "grad_norm": 16.877128826110077, + "learning_rate": 1.516245487364621e-06, + "loss": 1.875, + "step": 43 + }, + { + "epoch": 0.015907447577729574, + "grad_norm": 17.28674853180433, + "learning_rate": 1.5523465703971122e-06, + "loss": 2.8906, + "step": 44 + }, + { + "epoch": 0.016268980477223426, + "grad_norm": 15.792683404212282, + "learning_rate": 1.5884476534296031e-06, + "loss": 2.0625, + "step": 45 + }, + { + "epoch": 0.016630513376717282, + "grad_norm": 13.844101786820161, + "learning_rate": 1.624548736462094e-06, + "loss": 2.2656, + "step": 46 + }, + { + "epoch": 0.016992046276211134, + "grad_norm": 17.104110127841544, + "learning_rate": 1.6606498194945849e-06, + "loss": 2.3594, + "step": 47 + }, + { + "epoch": 0.01735357917570499, + "grad_norm": 13.732703614758588, + "learning_rate": 1.696750902527076e-06, + "loss": 1.9219, + "step": 48 + }, + { + "epoch": 0.017715112075198842, + "grad_norm": 18.208759791734153, + "learning_rate": 1.7328519855595669e-06, + "loss": 2.125, + "step": 49 + }, + { + "epoch": 0.018076644974692697, + "grad_norm": 16.805658008075675, + "learning_rate": 1.7689530685920577e-06, + "loss": 2.2812, + "step": 50 + }, + { + "epoch": 0.01843817787418655, + "grad_norm": 22.188843633301392, + "learning_rate": 1.805054151624549e-06, + "loss": 1.8672, + "step": 51 + }, + { + "epoch": 0.018799710773680405, + "grad_norm": 13.658962672131516, + "learning_rate": 1.84115523465704e-06, + "loss": 1.9922, + "step": 52 + }, + { + "epoch": 0.019161243673174257, + "grad_norm": 13.706594693824512, + "learning_rate": 1.8772563176895308e-06, + "loss": 2.5312, + "step": 53 + }, + { + "epoch": 0.019522776572668113, + "grad_norm": 16.683444243869, + "learning_rate": 1.9133574007220217e-06, + "loss": 1.9531, + "step": 54 + }, + { + "epoch": 0.019884309472161965, + "grad_norm": 16.220488310090612, + "learning_rate": 1.949458483754513e-06, + "loss": 1.625, + "step": 55 + }, + { + "epoch": 0.02024584237165582, + "grad_norm": 15.167548782612744, + "learning_rate": 1.9855595667870035e-06, + "loss": 1.7031, + "step": 56 + }, + { + "epoch": 0.020607375271149676, + "grad_norm": 19.182032021794537, + "learning_rate": 2.0216606498194946e-06, + "loss": 2.0469, + "step": 57 + }, + { + "epoch": 0.02096890817064353, + "grad_norm": 12.735654027594608, + "learning_rate": 2.0577617328519857e-06, + "loss": 2.2656, + "step": 58 + }, + { + "epoch": 0.021330441070137384, + "grad_norm": 17.432912572115512, + "learning_rate": 2.0938628158844768e-06, + "loss": 1.2109, + "step": 59 + }, + { + "epoch": 0.021691973969631236, + "grad_norm": 24.516441644574627, + "learning_rate": 2.129963898916968e-06, + "loss": 1.4531, + "step": 60 + }, + { + "epoch": 0.022053506869125092, + "grad_norm": 19.310451481141417, + "learning_rate": 2.1660649819494585e-06, + "loss": 1.1094, + "step": 61 + }, + { + "epoch": 0.022415039768618944, + "grad_norm": 18.182868262067963, + "learning_rate": 2.2021660649819496e-06, + "loss": 7.7188, + "step": 62 + }, + { + "epoch": 0.0227765726681128, + "grad_norm": 19.718129890985534, + "learning_rate": 2.2382671480144407e-06, + "loss": 1.1641, + "step": 63 + }, + { + "epoch": 0.023138105567606652, + "grad_norm": 19.462628231016243, + "learning_rate": 2.274368231046932e-06, + "loss": 0.75, + "step": 64 + }, + { + "epoch": 0.023499638467100507, + "grad_norm": 23.569701643103294, + "learning_rate": 2.3104693140794225e-06, + "loss": 0.8867, + "step": 65 + }, + { + "epoch": 0.02386117136659436, + "grad_norm": 23.545576315739147, + "learning_rate": 2.3465703971119136e-06, + "loss": 0.2285, + "step": 66 + }, + { + "epoch": 0.024222704266088215, + "grad_norm": 10.16551728074609, + "learning_rate": 2.3826714801444047e-06, + "loss": 0.1738, + "step": 67 + }, + { + "epoch": 0.024584237165582067, + "grad_norm": 16.816170957671652, + "learning_rate": 2.4187725631768953e-06, + "loss": 0.1157, + "step": 68 + }, + { + "epoch": 0.024945770065075923, + "grad_norm": 1.8820747546694334, + "learning_rate": 2.4548736462093864e-06, + "loss": 0.0179, + "step": 69 + }, + { + "epoch": 0.025307302964569775, + "grad_norm": 8.418765006561033, + "learning_rate": 2.4909747292418775e-06, + "loss": 0.2021, + "step": 70 + }, + { + "epoch": 0.02566883586406363, + "grad_norm": 9.402718487200802, + "learning_rate": 2.527075812274368e-06, + "loss": 0.3145, + "step": 71 + }, + { + "epoch": 0.026030368763557483, + "grad_norm": 0.0545781506552934, + "learning_rate": 2.5631768953068593e-06, + "loss": 0.0013, + "step": 72 + }, + { + "epoch": 0.02639190166305134, + "grad_norm": 3.536630005002213, + "learning_rate": 2.59927797833935e-06, + "loss": 0.0659, + "step": 73 + }, + { + "epoch": 0.02675343456254519, + "grad_norm": 1.1986029006399808, + "learning_rate": 2.6353790613718415e-06, + "loss": 0.0083, + "step": 74 + }, + { + "epoch": 0.027114967462039046, + "grad_norm": 3.628418711813289, + "learning_rate": 2.6714801444043326e-06, + "loss": 0.5039, + "step": 75 + }, + { + "epoch": 0.0274765003615329, + "grad_norm": 6.706459901368315, + "learning_rate": 2.7075812274368237e-06, + "loss": 0.2852, + "step": 76 + }, + { + "epoch": 0.027838033261026754, + "grad_norm": 2.5610138264775033, + "learning_rate": 2.7436823104693144e-06, + "loss": 0.043, + "step": 77 + }, + { + "epoch": 0.028199566160520606, + "grad_norm": 0.5856128288603193, + "learning_rate": 2.7797833935018055e-06, + "loss": 0.0164, + "step": 78 + }, + { + "epoch": 0.02856109906001446, + "grad_norm": 1.7702819388633455, + "learning_rate": 2.815884476534296e-06, + "loss": 0.054, + "step": 79 + }, + { + "epoch": 0.028922631959508314, + "grad_norm": 16.58486441094387, + "learning_rate": 2.8519855595667872e-06, + "loss": 0.1348, + "step": 80 + }, + { + "epoch": 0.02928416485900217, + "grad_norm": 0.8499828759100255, + "learning_rate": 2.8880866425992783e-06, + "loss": 0.0139, + "step": 81 + }, + { + "epoch": 0.02964569775849602, + "grad_norm": 2.4298899157197247, + "learning_rate": 2.924187725631769e-06, + "loss": 0.0549, + "step": 82 + }, + { + "epoch": 0.030007230657989877, + "grad_norm": 2.3807677635825395, + "learning_rate": 2.96028880866426e-06, + "loss": 0.0649, + "step": 83 + }, + { + "epoch": 0.03036876355748373, + "grad_norm": 2.816730479355395, + "learning_rate": 2.996389891696751e-06, + "loss": 0.0923, + "step": 84 + }, + { + "epoch": 0.030730296456977585, + "grad_norm": 0.34709271020697946, + "learning_rate": 3.032490974729242e-06, + "loss": 0.0076, + "step": 85 + }, + { + "epoch": 0.03109182935647144, + "grad_norm": 2.076506240888657, + "learning_rate": 3.068592057761733e-06, + "loss": 0.061, + "step": 86 + }, + { + "epoch": 0.031453362255965296, + "grad_norm": 1.1804417562115723, + "learning_rate": 3.1046931407942245e-06, + "loss": 0.0215, + "step": 87 + }, + { + "epoch": 0.03181489515545915, + "grad_norm": 0.026071314077138672, + "learning_rate": 3.140794223826715e-06, + "loss": 0.0007, + "step": 88 + }, + { + "epoch": 0.032176428054953, + "grad_norm": 0.11085105626627612, + "learning_rate": 3.1768953068592062e-06, + "loss": 0.0032, + "step": 89 + }, + { + "epoch": 0.03253796095444685, + "grad_norm": 0.3796014388328141, + "learning_rate": 3.2129963898916973e-06, + "loss": 0.01, + "step": 90 + }, + { + "epoch": 0.03289949385394071, + "grad_norm": 3.921589813252878, + "learning_rate": 3.249097472924188e-06, + "loss": 0.106, + "step": 91 + }, + { + "epoch": 0.033261026753434564, + "grad_norm": 0.01580216449580874, + "learning_rate": 3.285198555956679e-06, + "loss": 0.0004, + "step": 92 + }, + { + "epoch": 0.033622559652928416, + "grad_norm": 5.879458171253627, + "learning_rate": 3.3212996389891698e-06, + "loss": 0.1719, + "step": 93 + }, + { + "epoch": 0.03398409255242227, + "grad_norm": 4.195774139406421, + "learning_rate": 3.357400722021661e-06, + "loss": 0.5586, + "step": 94 + }, + { + "epoch": 0.03434562545191613, + "grad_norm": 0.9597823881453733, + "learning_rate": 3.393501805054152e-06, + "loss": 0.0201, + "step": 95 + }, + { + "epoch": 0.03470715835140998, + "grad_norm": 0.11857239985953921, + "learning_rate": 3.4296028880866426e-06, + "loss": 0.0023, + "step": 96 + }, + { + "epoch": 0.03506869125090383, + "grad_norm": 0.0164437410925149, + "learning_rate": 3.4657039711191337e-06, + "loss": 0.0006, + "step": 97 + }, + { + "epoch": 0.035430224150397684, + "grad_norm": 0.7422026981465785, + "learning_rate": 3.501805054151625e-06, + "loss": 0.0173, + "step": 98 + }, + { + "epoch": 0.03579175704989154, + "grad_norm": 0.18929723174099133, + "learning_rate": 3.5379061371841155e-06, + "loss": 0.0039, + "step": 99 + }, + { + "epoch": 0.036153289949385395, + "grad_norm": 0.016315786558297775, + "learning_rate": 3.574007220216607e-06, + "loss": 0.0005, + "step": 100 + }, + { + "epoch": 0.03651482284887925, + "grad_norm": 2.159275354770509, + "learning_rate": 3.610108303249098e-06, + "loss": 0.0894, + "step": 101 + }, + { + "epoch": 0.0368763557483731, + "grad_norm": 0.023643402065539196, + "learning_rate": 3.6462093862815888e-06, + "loss": 0.0007, + "step": 102 + }, + { + "epoch": 0.03723788864786696, + "grad_norm": 3.8455143587195653, + "learning_rate": 3.68231046931408e-06, + "loss": 0.377, + "step": 103 + }, + { + "epoch": 0.03759942154736081, + "grad_norm": 0.047224464026884384, + "learning_rate": 3.718411552346571e-06, + "loss": 0.0014, + "step": 104 + }, + { + "epoch": 0.03796095444685466, + "grad_norm": 0.03373228615702662, + "learning_rate": 3.7545126353790616e-06, + "loss": 0.001, + "step": 105 + }, + { + "epoch": 0.038322487346348515, + "grad_norm": 0.7833237127945889, + "learning_rate": 3.7906137184115527e-06, + "loss": 0.0162, + "step": 106 + }, + { + "epoch": 0.038684020245842374, + "grad_norm": 1.392968407470076, + "learning_rate": 3.826714801444043e-06, + "loss": 0.0371, + "step": 107 + }, + { + "epoch": 0.039045553145336226, + "grad_norm": 1.0855845328868676, + "learning_rate": 3.862815884476535e-06, + "loss": 0.0233, + "step": 108 + }, + { + "epoch": 0.03940708604483008, + "grad_norm": 21.96686906948574, + "learning_rate": 3.898916967509026e-06, + "loss": 2.6875, + "step": 109 + }, + { + "epoch": 0.03976861894432393, + "grad_norm": 0.030581161436268975, + "learning_rate": 3.935018050541516e-06, + "loss": 0.0009, + "step": 110 + }, + { + "epoch": 0.04013015184381779, + "grad_norm": 1.854139651268432, + "learning_rate": 3.971119133574007e-06, + "loss": 0.0447, + "step": 111 + }, + { + "epoch": 0.04049168474331164, + "grad_norm": 0.031223763235547056, + "learning_rate": 4.0072202166064985e-06, + "loss": 0.0009, + "step": 112 + }, + { + "epoch": 0.040853217642805494, + "grad_norm": 5.698413473083582, + "learning_rate": 4.043321299638989e-06, + "loss": 0.2441, + "step": 113 + }, + { + "epoch": 0.04121475054229935, + "grad_norm": 0.5186747892178895, + "learning_rate": 4.079422382671481e-06, + "loss": 0.01, + "step": 114 + }, + { + "epoch": 0.041576283441793205, + "grad_norm": 4.046283266156677, + "learning_rate": 4.115523465703971e-06, + "loss": 0.105, + "step": 115 + }, + { + "epoch": 0.04193781634128706, + "grad_norm": 3.340765562061972, + "learning_rate": 4.151624548736463e-06, + "loss": 0.4082, + "step": 116 + }, + { + "epoch": 0.04229934924078091, + "grad_norm": 2.8891903883067593, + "learning_rate": 4.1877256317689535e-06, + "loss": 0.0532, + "step": 117 + }, + { + "epoch": 0.04266088214027477, + "grad_norm": 1.3500871675646395, + "learning_rate": 4.223826714801444e-06, + "loss": 0.0259, + "step": 118 + }, + { + "epoch": 0.04302241503976862, + "grad_norm": 4.414310533551601, + "learning_rate": 4.259927797833936e-06, + "loss": 0.1299, + "step": 119 + }, + { + "epoch": 0.04338394793926247, + "grad_norm": 0.9763602923216815, + "learning_rate": 4.296028880866426e-06, + "loss": 0.0214, + "step": 120 + }, + { + "epoch": 0.043745480838756325, + "grad_norm": 1.1773219805592248, + "learning_rate": 4.332129963898917e-06, + "loss": 0.0242, + "step": 121 + }, + { + "epoch": 0.044107013738250184, + "grad_norm": 4.683173792560477, + "learning_rate": 4.3682310469314086e-06, + "loss": 0.2461, + "step": 122 + }, + { + "epoch": 0.044468546637744036, + "grad_norm": 0.3781549576256878, + "learning_rate": 4.404332129963899e-06, + "loss": 0.0048, + "step": 123 + }, + { + "epoch": 0.04483007953723789, + "grad_norm": 1.2396498987204665, + "learning_rate": 4.44043321299639e-06, + "loss": 0.0133, + "step": 124 + }, + { + "epoch": 0.04519161243673174, + "grad_norm": 0.31058087017176383, + "learning_rate": 4.4765342960288814e-06, + "loss": 0.0061, + "step": 125 + }, + { + "epoch": 0.0455531453362256, + "grad_norm": 0.7135550168451511, + "learning_rate": 4.512635379061372e-06, + "loss": 0.0164, + "step": 126 + }, + { + "epoch": 0.04591467823571945, + "grad_norm": 3.8563031394075993, + "learning_rate": 4.548736462093864e-06, + "loss": 0.4082, + "step": 127 + }, + { + "epoch": 0.046276211135213303, + "grad_norm": 0.5287497030757917, + "learning_rate": 4.584837545126354e-06, + "loss": 0.012, + "step": 128 + }, + { + "epoch": 0.046637744034707156, + "grad_norm": 0.490496847779193, + "learning_rate": 4.620938628158845e-06, + "loss": 0.0118, + "step": 129 + }, + { + "epoch": 0.046999276934201015, + "grad_norm": 0.2889672738303459, + "learning_rate": 4.6570397111913365e-06, + "loss": 0.0085, + "step": 130 + }, + { + "epoch": 0.04736080983369487, + "grad_norm": 0.33226807751735343, + "learning_rate": 4.693140794223827e-06, + "loss": 0.0074, + "step": 131 + }, + { + "epoch": 0.04772234273318872, + "grad_norm": 1.045000334518343, + "learning_rate": 4.729241877256318e-06, + "loss": 0.0209, + "step": 132 + }, + { + "epoch": 0.04808387563268257, + "grad_norm": 0.04879141640301132, + "learning_rate": 4.765342960288809e-06, + "loss": 0.0009, + "step": 133 + }, + { + "epoch": 0.04844540853217643, + "grad_norm": 0.9979938405826598, + "learning_rate": 4.8014440433213e-06, + "loss": 0.0208, + "step": 134 + }, + { + "epoch": 0.04880694143167028, + "grad_norm": 0.08015734980023463, + "learning_rate": 4.837545126353791e-06, + "loss": 0.0014, + "step": 135 + }, + { + "epoch": 0.049168474331164135, + "grad_norm": 0.014553034078573968, + "learning_rate": 4.873646209386282e-06, + "loss": 0.0004, + "step": 136 + }, + { + "epoch": 0.04953000723065799, + "grad_norm": 0.03199195582100914, + "learning_rate": 4.909747292418773e-06, + "loss": 0.0008, + "step": 137 + }, + { + "epoch": 0.049891540130151846, + "grad_norm": 0.7425653773610343, + "learning_rate": 4.9458483754512636e-06, + "loss": 0.0106, + "step": 138 + }, + { + "epoch": 0.0502530730296457, + "grad_norm": 0.030589346779087898, + "learning_rate": 4.981949458483755e-06, + "loss": 0.0008, + "step": 139 + }, + { + "epoch": 0.05061460592913955, + "grad_norm": 0.05139917797136386, + "learning_rate": 5.018050541516246e-06, + "loss": 0.0009, + "step": 140 + }, + { + "epoch": 0.0509761388286334, + "grad_norm": 0.2804896424768487, + "learning_rate": 5.054151624548736e-06, + "loss": 0.0052, + "step": 141 + }, + { + "epoch": 0.05133767172812726, + "grad_norm": 0.5647779260982275, + "learning_rate": 5.090252707581228e-06, + "loss": 0.0072, + "step": 142 + }, + { + "epoch": 0.05169920462762111, + "grad_norm": 0.059338827874302294, + "learning_rate": 5.126353790613719e-06, + "loss": 0.0015, + "step": 143 + }, + { + "epoch": 0.052060737527114966, + "grad_norm": 0.016833181042069142, + "learning_rate": 5.16245487364621e-06, + "loss": 0.0004, + "step": 144 + }, + { + "epoch": 0.052422270426608825, + "grad_norm": 0.020720247942236526, + "learning_rate": 5.1985559566787e-06, + "loss": 0.0005, + "step": 145 + }, + { + "epoch": 0.05278380332610268, + "grad_norm": 0.8008133381436261, + "learning_rate": 5.2346570397111915e-06, + "loss": 0.0149, + "step": 146 + }, + { + "epoch": 0.05314533622559653, + "grad_norm": 0.024568300138572405, + "learning_rate": 5.270758122743683e-06, + "loss": 0.0008, + "step": 147 + }, + { + "epoch": 0.05350686912509038, + "grad_norm": 7.186612499123365, + "learning_rate": 5.306859205776174e-06, + "loss": 0.8867, + "step": 148 + }, + { + "epoch": 0.05386840202458424, + "grad_norm": 6.536804833826344, + "learning_rate": 5.342960288808665e-06, + "loss": 0.373, + "step": 149 + }, + { + "epoch": 0.05422993492407809, + "grad_norm": 0.5178145224343003, + "learning_rate": 5.379061371841156e-06, + "loss": 0.0118, + "step": 150 + }, + { + "epoch": 0.054591467823571944, + "grad_norm": 6.650397080235223, + "learning_rate": 5.415162454873647e-06, + "loss": 0.4141, + "step": 151 + }, + { + "epoch": 0.0549530007230658, + "grad_norm": 3.0712831772151943, + "learning_rate": 5.451263537906137e-06, + "loss": 0.0483, + "step": 152 + }, + { + "epoch": 0.055314533622559656, + "grad_norm": 3.604339045190966, + "learning_rate": 5.487364620938629e-06, + "loss": 0.2559, + "step": 153 + }, + { + "epoch": 0.05567606652205351, + "grad_norm": 0.012121491207441971, + "learning_rate": 5.523465703971119e-06, + "loss": 0.0004, + "step": 154 + }, + { + "epoch": 0.05603759942154736, + "grad_norm": 0.034223848359539254, + "learning_rate": 5.559566787003611e-06, + "loss": 0.0008, + "step": 155 + }, + { + "epoch": 0.05639913232104121, + "grad_norm": 3.9654390558842603, + "learning_rate": 5.595667870036101e-06, + "loss": 0.1289, + "step": 156 + }, + { + "epoch": 0.05676066522053507, + "grad_norm": 0.038791200207217776, + "learning_rate": 5.631768953068592e-06, + "loss": 0.0012, + "step": 157 + }, + { + "epoch": 0.05712219812002892, + "grad_norm": 0.09187800852767856, + "learning_rate": 5.667870036101083e-06, + "loss": 0.002, + "step": 158 + }, + { + "epoch": 0.057483731019522775, + "grad_norm": 4.098298750095974, + "learning_rate": 5.7039711191335744e-06, + "loss": 0.1113, + "step": 159 + }, + { + "epoch": 0.05784526391901663, + "grad_norm": 0.32843717950066514, + "learning_rate": 5.740072202166066e-06, + "loss": 0.006, + "step": 160 + }, + { + "epoch": 0.05820679681851049, + "grad_norm": 0.5138400939822615, + "learning_rate": 5.776173285198557e-06, + "loss": 0.0096, + "step": 161 + }, + { + "epoch": 0.05856832971800434, + "grad_norm": 3.592042606942738, + "learning_rate": 5.812274368231048e-06, + "loss": 0.332, + "step": 162 + }, + { + "epoch": 0.05892986261749819, + "grad_norm": 5.143449789293266, + "learning_rate": 5.848375451263538e-06, + "loss": 0.2559, + "step": 163 + }, + { + "epoch": 0.05929139551699204, + "grad_norm": 0.09388629524105364, + "learning_rate": 5.8844765342960295e-06, + "loss": 0.0026, + "step": 164 + }, + { + "epoch": 0.0596529284164859, + "grad_norm": 0.16515966458949058, + "learning_rate": 5.92057761732852e-06, + "loss": 0.0046, + "step": 165 + }, + { + "epoch": 0.060014461315979754, + "grad_norm": 0.013324025732606205, + "learning_rate": 5.956678700361012e-06, + "loss": 0.0004, + "step": 166 + }, + { + "epoch": 0.060375994215473607, + "grad_norm": 0.13868821867218076, + "learning_rate": 5.992779783393502e-06, + "loss": 0.0048, + "step": 167 + }, + { + "epoch": 0.06073752711496746, + "grad_norm": 0.5660957228047933, + "learning_rate": 6.028880866425994e-06, + "loss": 0.0133, + "step": 168 + }, + { + "epoch": 0.06109906001446132, + "grad_norm": 0.03535681677711619, + "learning_rate": 6.064981949458484e-06, + "loss": 0.0011, + "step": 169 + }, + { + "epoch": 0.06146059291395517, + "grad_norm": 0.12719170419848488, + "learning_rate": 6.101083032490975e-06, + "loss": 0.0036, + "step": 170 + }, + { + "epoch": 0.06182212581344902, + "grad_norm": 0.2694115224378066, + "learning_rate": 6.137184115523466e-06, + "loss": 0.0066, + "step": 171 + }, + { + "epoch": 0.06218365871294288, + "grad_norm": 1.7698235583209558, + "learning_rate": 6.173285198555957e-06, + "loss": 0.053, + "step": 172 + }, + { + "epoch": 0.06254519161243673, + "grad_norm": 3.086752442430962, + "learning_rate": 6.209386281588449e-06, + "loss": 0.2344, + "step": 173 + }, + { + "epoch": 0.06290672451193059, + "grad_norm": 4.267661494026807, + "learning_rate": 6.245487364620939e-06, + "loss": 0.2891, + "step": 174 + }, + { + "epoch": 0.06326825741142444, + "grad_norm": 0.18842777542052794, + "learning_rate": 6.28158844765343e-06, + "loss": 0.0051, + "step": 175 + }, + { + "epoch": 0.0636297903109183, + "grad_norm": 0.7684215463920879, + "learning_rate": 6.317689530685921e-06, + "loss": 0.0134, + "step": 176 + }, + { + "epoch": 0.06399132321041215, + "grad_norm": 2.647893531869819, + "learning_rate": 6.3537906137184125e-06, + "loss": 0.1035, + "step": 177 + }, + { + "epoch": 0.064352856109906, + "grad_norm": 0.01896560828633322, + "learning_rate": 6.389891696750903e-06, + "loss": 0.0006, + "step": 178 + }, + { + "epoch": 0.06471438900939985, + "grad_norm": 0.8974127668173799, + "learning_rate": 6.425992779783395e-06, + "loss": 0.0233, + "step": 179 + }, + { + "epoch": 0.0650759219088937, + "grad_norm": 3.4011783287520365, + "learning_rate": 6.4620938628158845e-06, + "loss": 0.1553, + "step": 180 + }, + { + "epoch": 0.06543745480838756, + "grad_norm": 0.18016951111675264, + "learning_rate": 6.498194945848376e-06, + "loss": 0.0033, + "step": 181 + }, + { + "epoch": 0.06579898770788142, + "grad_norm": 4.199136779964373, + "learning_rate": 6.534296028880867e-06, + "loss": 0.3477, + "step": 182 + }, + { + "epoch": 0.06616052060737528, + "grad_norm": 0.0545247222875143, + "learning_rate": 6.570397111913358e-06, + "loss": 0.0013, + "step": 183 + }, + { + "epoch": 0.06652205350686913, + "grad_norm": 0.2444466610707507, + "learning_rate": 6.606498194945848e-06, + "loss": 0.0054, + "step": 184 + }, + { + "epoch": 0.06688358640636298, + "grad_norm": 0.053230393817154185, + "learning_rate": 6.6425992779783395e-06, + "loss": 0.0007, + "step": 185 + }, + { + "epoch": 0.06724511930585683, + "grad_norm": 3.8423491179433693, + "learning_rate": 6.678700361010831e-06, + "loss": 0.1748, + "step": 186 + }, + { + "epoch": 0.06760665220535068, + "grad_norm": 4.901826856157424, + "learning_rate": 6.714801444043322e-06, + "loss": 0.5, + "step": 187 + }, + { + "epoch": 0.06796818510484454, + "grad_norm": 0.0850299338155856, + "learning_rate": 6.750902527075813e-06, + "loss": 0.0018, + "step": 188 + }, + { + "epoch": 0.06832971800433839, + "grad_norm": 2.247049204963777, + "learning_rate": 6.787003610108304e-06, + "loss": 0.053, + "step": 189 + }, + { + "epoch": 0.06869125090383225, + "grad_norm": 3.9569114296125254, + "learning_rate": 6.8231046931407954e-06, + "loss": 0.2461, + "step": 190 + }, + { + "epoch": 0.0690527838033261, + "grad_norm": 0.06165285509650954, + "learning_rate": 6.859205776173285e-06, + "loss": 0.0019, + "step": 191 + }, + { + "epoch": 0.06941431670281996, + "grad_norm": 0.0671984414111177, + "learning_rate": 6.895306859205777e-06, + "loss": 0.0018, + "step": 192 + }, + { + "epoch": 0.06977584960231381, + "grad_norm": 3.534383742955296, + "learning_rate": 6.9314079422382674e-06, + "loss": 0.2891, + "step": 193 + }, + { + "epoch": 0.07013738250180766, + "grad_norm": 0.18946956582570323, + "learning_rate": 6.967509025270759e-06, + "loss": 0.005, + "step": 194 + }, + { + "epoch": 0.07049891540130152, + "grad_norm": 0.8381777451055672, + "learning_rate": 7.00361010830325e-06, + "loss": 0.032, + "step": 195 + }, + { + "epoch": 0.07086044830079537, + "grad_norm": 0.1054178657204655, + "learning_rate": 7.039711191335741e-06, + "loss": 0.0029, + "step": 196 + }, + { + "epoch": 0.07122198120028922, + "grad_norm": 3.10024954004179, + "learning_rate": 7.075812274368231e-06, + "loss": 0.2891, + "step": 197 + }, + { + "epoch": 0.07158351409978309, + "grad_norm": 0.48860110693984543, + "learning_rate": 7.1119133574007225e-06, + "loss": 0.0166, + "step": 198 + }, + { + "epoch": 0.07194504699927694, + "grad_norm": 0.049906549586449356, + "learning_rate": 7.148014440433214e-06, + "loss": 0.0013, + "step": 199 + }, + { + "epoch": 0.07230657989877079, + "grad_norm": 0.487116899508749, + "learning_rate": 7.184115523465705e-06, + "loss": 0.0133, + "step": 200 + }, + { + "epoch": 0.07266811279826464, + "grad_norm": 2.6198489406481538, + "learning_rate": 7.220216606498196e-06, + "loss": 0.1191, + "step": 201 + }, + { + "epoch": 0.0730296456977585, + "grad_norm": 1.5581617254328661, + "learning_rate": 7.256317689530686e-06, + "loss": 0.0583, + "step": 202 + }, + { + "epoch": 0.07339117859725235, + "grad_norm": 0.045759918158191255, + "learning_rate": 7.2924187725631776e-06, + "loss": 0.0015, + "step": 203 + }, + { + "epoch": 0.0737527114967462, + "grad_norm": 3.218360068286435, + "learning_rate": 7.328519855595668e-06, + "loss": 0.2988, + "step": 204 + }, + { + "epoch": 0.07411424439624006, + "grad_norm": 0.15308851577181853, + "learning_rate": 7.36462093862816e-06, + "loss": 0.0043, + "step": 205 + }, + { + "epoch": 0.07447577729573392, + "grad_norm": 4.320599196449289, + "learning_rate": 7.40072202166065e-06, + "loss": 0.0579, + "step": 206 + }, + { + "epoch": 0.07483731019522777, + "grad_norm": 0.0884685078670022, + "learning_rate": 7.436823104693142e-06, + "loss": 0.0026, + "step": 207 + }, + { + "epoch": 0.07519884309472162, + "grad_norm": 1.3042229202888276, + "learning_rate": 7.472924187725632e-06, + "loss": 0.0435, + "step": 208 + }, + { + "epoch": 0.07556037599421547, + "grad_norm": 0.9640668919770726, + "learning_rate": 7.509025270758123e-06, + "loss": 0.0356, + "step": 209 + }, + { + "epoch": 0.07592190889370933, + "grad_norm": 0.9362655433095011, + "learning_rate": 7.545126353790614e-06, + "loss": 0.0292, + "step": 210 + }, + { + "epoch": 0.07628344179320318, + "grad_norm": 0.09739863703063652, + "learning_rate": 7.5812274368231055e-06, + "loss": 0.003, + "step": 211 + }, + { + "epoch": 0.07664497469269703, + "grad_norm": 0.10337767617825858, + "learning_rate": 7.617328519855596e-06, + "loss": 0.0032, + "step": 212 + }, + { + "epoch": 0.0770065075921909, + "grad_norm": 1.6771133350503034, + "learning_rate": 7.653429602888087e-06, + "loss": 0.0757, + "step": 213 + }, + { + "epoch": 0.07736804049168475, + "grad_norm": 3.190526664419477, + "learning_rate": 7.68953068592058e-06, + "loss": 0.1836, + "step": 214 + }, + { + "epoch": 0.0777295733911786, + "grad_norm": 1.145983458633985, + "learning_rate": 7.72563176895307e-06, + "loss": 0.0437, + "step": 215 + }, + { + "epoch": 0.07809110629067245, + "grad_norm": 1.1874020871931577, + "learning_rate": 7.76173285198556e-06, + "loss": 0.0396, + "step": 216 + }, + { + "epoch": 0.0784526391901663, + "grad_norm": 0.09667530715788139, + "learning_rate": 7.797833935018051e-06, + "loss": 0.0032, + "step": 217 + }, + { + "epoch": 0.07881417208966016, + "grad_norm": 0.14971574916679362, + "learning_rate": 7.833935018050542e-06, + "loss": 0.004, + "step": 218 + }, + { + "epoch": 0.07917570498915401, + "grad_norm": 0.14346174349325796, + "learning_rate": 7.870036101083033e-06, + "loss": 0.0035, + "step": 219 + }, + { + "epoch": 0.07953723788864786, + "grad_norm": 2.481901750392619, + "learning_rate": 7.906137184115525e-06, + "loss": 0.1738, + "step": 220 + }, + { + "epoch": 0.07989877078814173, + "grad_norm": 0.08530324779274066, + "learning_rate": 7.942238267148014e-06, + "loss": 0.0032, + "step": 221 + }, + { + "epoch": 0.08026030368763558, + "grad_norm": 1.9989738165637332, + "learning_rate": 7.978339350180506e-06, + "loss": 0.0967, + "step": 222 + }, + { + "epoch": 0.08062183658712943, + "grad_norm": 0.024130592016423744, + "learning_rate": 8.014440433212997e-06, + "loss": 0.0008, + "step": 223 + }, + { + "epoch": 0.08098336948662328, + "grad_norm": 0.19182586572054083, + "learning_rate": 8.050541516245488e-06, + "loss": 0.006, + "step": 224 + }, + { + "epoch": 0.08134490238611713, + "grad_norm": 0.9397048975599661, + "learning_rate": 8.086642599277978e-06, + "loss": 0.0439, + "step": 225 + }, + { + "epoch": 0.08170643528561099, + "grad_norm": 1.1818664621782111, + "learning_rate": 8.12274368231047e-06, + "loss": 0.021, + "step": 226 + }, + { + "epoch": 0.08206796818510484, + "grad_norm": 0.08554097136049325, + "learning_rate": 8.158844765342961e-06, + "loss": 0.0029, + "step": 227 + }, + { + "epoch": 0.0824295010845987, + "grad_norm": 0.23334854423481197, + "learning_rate": 8.194945848375452e-06, + "loss": 0.0083, + "step": 228 + }, + { + "epoch": 0.08279103398409256, + "grad_norm": 2.8762114301501196, + "learning_rate": 8.231046931407943e-06, + "loss": 0.2891, + "step": 229 + }, + { + "epoch": 0.08315256688358641, + "grad_norm": 1.3414522615764344, + "learning_rate": 8.267148014440433e-06, + "loss": 0.0583, + "step": 230 + }, + { + "epoch": 0.08351409978308026, + "grad_norm": 1.036931388256486, + "learning_rate": 8.303249097472926e-06, + "loss": 0.0292, + "step": 231 + }, + { + "epoch": 0.08387563268257411, + "grad_norm": 0.07904431683961233, + "learning_rate": 8.339350180505416e-06, + "loss": 0.0029, + "step": 232 + }, + { + "epoch": 0.08423716558206797, + "grad_norm": 0.3068159802992688, + "learning_rate": 8.375451263537907e-06, + "loss": 0.0118, + "step": 233 + }, + { + "epoch": 0.08459869848156182, + "grad_norm": 0.04838165880656883, + "learning_rate": 8.411552346570398e-06, + "loss": 0.0017, + "step": 234 + }, + { + "epoch": 0.08496023138105567, + "grad_norm": 2.8443633884106183, + "learning_rate": 8.447653429602888e-06, + "loss": 0.3105, + "step": 235 + }, + { + "epoch": 0.08532176428054954, + "grad_norm": 0.2513637901735174, + "learning_rate": 8.483754512635379e-06, + "loss": 0.0066, + "step": 236 + }, + { + "epoch": 0.08568329718004339, + "grad_norm": 0.06554057203201265, + "learning_rate": 8.519855595667871e-06, + "loss": 0.0023, + "step": 237 + }, + { + "epoch": 0.08604483007953724, + "grad_norm": 0.7832296827109824, + "learning_rate": 8.55595667870036e-06, + "loss": 0.0292, + "step": 238 + }, + { + "epoch": 0.08640636297903109, + "grad_norm": 1.4556368808015117, + "learning_rate": 8.592057761732853e-06, + "loss": 0.0583, + "step": 239 + }, + { + "epoch": 0.08676789587852494, + "grad_norm": 0.9267592435944044, + "learning_rate": 8.628158844765343e-06, + "loss": 0.0233, + "step": 240 + }, + { + "epoch": 0.0871294287780188, + "grad_norm": 2.415443560847249, + "learning_rate": 8.664259927797834e-06, + "loss": 0.2891, + "step": 241 + }, + { + "epoch": 0.08749096167751265, + "grad_norm": 1.4193740524322997, + "learning_rate": 8.700361010830326e-06, + "loss": 0.0157, + "step": 242 + }, + { + "epoch": 0.0878524945770065, + "grad_norm": 0.730447028648764, + "learning_rate": 8.736462093862817e-06, + "loss": 0.0259, + "step": 243 + }, + { + "epoch": 0.08821402747650037, + "grad_norm": 0.061658468148718275, + "learning_rate": 8.772563176895308e-06, + "loss": 0.0023, + "step": 244 + }, + { + "epoch": 0.08857556037599422, + "grad_norm": 1.8914458476876634, + "learning_rate": 8.808664259927798e-06, + "loss": 0.0889, + "step": 245 + }, + { + "epoch": 0.08893709327548807, + "grad_norm": 0.2609911074528374, + "learning_rate": 8.84476534296029e-06, + "loss": 0.0093, + "step": 246 + }, + { + "epoch": 0.08929862617498192, + "grad_norm": 2.814904006788487, + "learning_rate": 8.88086642599278e-06, + "loss": 0.2129, + "step": 247 + }, + { + "epoch": 0.08966015907447578, + "grad_norm": 0.15002763783288844, + "learning_rate": 8.916967509025272e-06, + "loss": 0.0058, + "step": 248 + }, + { + "epoch": 0.09002169197396963, + "grad_norm": 0.12150146971805226, + "learning_rate": 8.953068592057763e-06, + "loss": 0.0032, + "step": 249 + }, + { + "epoch": 0.09038322487346348, + "grad_norm": 1.3273031557627246, + "learning_rate": 8.989169675090254e-06, + "loss": 0.0635, + "step": 250 + }, + { + "epoch": 0.09074475777295733, + "grad_norm": 1.4599298303917096, + "learning_rate": 9.025270758122744e-06, + "loss": 0.0437, + "step": 251 + }, + { + "epoch": 0.0911062906724512, + "grad_norm": 2.7121747068134745, + "learning_rate": 9.061371841155235e-06, + "loss": 0.1553, + "step": 252 + }, + { + "epoch": 0.09146782357194505, + "grad_norm": 2.2240935825880297, + "learning_rate": 9.097472924187727e-06, + "loss": 0.1367, + "step": 253 + }, + { + "epoch": 0.0918293564714389, + "grad_norm": 0.5473702864775827, + "learning_rate": 9.133574007220218e-06, + "loss": 0.0186, + "step": 254 + }, + { + "epoch": 0.09219088937093275, + "grad_norm": 0.27094416187059606, + "learning_rate": 9.169675090252709e-06, + "loss": 0.0079, + "step": 255 + }, + { + "epoch": 0.09255242227042661, + "grad_norm": 1.5769391518989624, + "learning_rate": 9.2057761732852e-06, + "loss": 0.0757, + "step": 256 + }, + { + "epoch": 0.09291395516992046, + "grad_norm": 0.5168941456457659, + "learning_rate": 9.24187725631769e-06, + "loss": 0.0184, + "step": 257 + }, + { + "epoch": 0.09327548806941431, + "grad_norm": 1.634925034542276, + "learning_rate": 9.27797833935018e-06, + "loss": 0.0698, + "step": 258 + }, + { + "epoch": 0.09363702096890818, + "grad_norm": 1.9197779540019488, + "learning_rate": 9.314079422382673e-06, + "loss": 0.0957, + "step": 259 + }, + { + "epoch": 0.09399855386840203, + "grad_norm": 1.1238788845153576, + "learning_rate": 9.350180505415164e-06, + "loss": 0.0525, + "step": 260 + }, + { + "epoch": 0.09436008676789588, + "grad_norm": 0.05737233371242335, + "learning_rate": 9.386281588447654e-06, + "loss": 0.0016, + "step": 261 + }, + { + "epoch": 0.09472161966738973, + "grad_norm": 2.067478407629818, + "learning_rate": 9.422382671480145e-06, + "loss": 0.1113, + "step": 262 + }, + { + "epoch": 0.09508315256688359, + "grad_norm": 0.751162172739565, + "learning_rate": 9.458483754512636e-06, + "loss": 0.0288, + "step": 263 + }, + { + "epoch": 0.09544468546637744, + "grad_norm": 0.2666004582883668, + "learning_rate": 9.494584837545126e-06, + "loss": 0.0083, + "step": 264 + }, + { + "epoch": 0.09580621836587129, + "grad_norm": 0.95719441630325, + "learning_rate": 9.530685920577619e-06, + "loss": 0.0354, + "step": 265 + }, + { + "epoch": 0.09616775126536514, + "grad_norm": 1.2323211653595565, + "learning_rate": 9.56678700361011e-06, + "loss": 0.0393, + "step": 266 + }, + { + "epoch": 0.09652928416485901, + "grad_norm": 0.033296448246986486, + "learning_rate": 9.6028880866426e-06, + "loss": 0.001, + "step": 267 + }, + { + "epoch": 0.09689081706435286, + "grad_norm": 1.112467152059722, + "learning_rate": 9.63898916967509e-06, + "loss": 0.0393, + "step": 268 + }, + { + "epoch": 0.09725234996384671, + "grad_norm": 1.7489973332977498, + "learning_rate": 9.675090252707581e-06, + "loss": 0.0825, + "step": 269 + }, + { + "epoch": 0.09761388286334056, + "grad_norm": 2.551038742989138, + "learning_rate": 9.711191335740074e-06, + "loss": 0.0957, + "step": 270 + }, + { + "epoch": 0.09797541576283442, + "grad_norm": 1.3633599525864275, + "learning_rate": 9.747292418772564e-06, + "loss": 0.0476, + "step": 271 + }, + { + "epoch": 0.09833694866232827, + "grad_norm": 2.0366537061538583, + "learning_rate": 9.783393501805055e-06, + "loss": 0.0825, + "step": 272 + }, + { + "epoch": 0.09869848156182212, + "grad_norm": 1.2513027286009084, + "learning_rate": 9.819494584837546e-06, + "loss": 0.0432, + "step": 273 + }, + { + "epoch": 0.09906001446131597, + "grad_norm": 0.021652428286847006, + "learning_rate": 9.855595667870036e-06, + "loss": 0.0006, + "step": 274 + }, + { + "epoch": 0.09942154736080984, + "grad_norm": 0.17083654687327912, + "learning_rate": 9.891696750902527e-06, + "loss": 0.0041, + "step": 275 + }, + { + "epoch": 0.09978308026030369, + "grad_norm": 3.1413789279394364, + "learning_rate": 9.92779783393502e-06, + "loss": 0.2031, + "step": 276 + }, + { + "epoch": 0.10014461315979754, + "grad_norm": 0.3347972975601623, + "learning_rate": 9.96389891696751e-06, + "loss": 0.0107, + "step": 277 + }, + { + "epoch": 0.1005061460592914, + "grad_norm": 0.07575888343417547, + "learning_rate": 1e-05, + "loss": 0.0022, + "step": 278 + }, + { + "epoch": 0.10086767895878525, + "grad_norm": 2.307848812459562, + "learning_rate": 9.999999106500529e-06, + "loss": 0.1738, + "step": 279 + }, + { + "epoch": 0.1012292118582791, + "grad_norm": 0.19818716084815294, + "learning_rate": 9.99999642600243e-06, + "loss": 0.0049, + "step": 280 + }, + { + "epoch": 0.10159074475777295, + "grad_norm": 1.2362318031036525, + "learning_rate": 9.999991958506665e-06, + "loss": 0.0322, + "step": 281 + }, + { + "epoch": 0.1019522776572668, + "grad_norm": 4.211319169306802, + "learning_rate": 9.999985704014829e-06, + "loss": 0.4082, + "step": 282 + }, + { + "epoch": 0.10231381055676067, + "grad_norm": 2.164035968364779, + "learning_rate": 9.999977662529157e-06, + "loss": 0.1035, + "step": 283 + }, + { + "epoch": 0.10267534345625452, + "grad_norm": 0.32339819399614206, + "learning_rate": 9.999967834052524e-06, + "loss": 0.0107, + "step": 284 + }, + { + "epoch": 0.10303687635574837, + "grad_norm": 1.0531068684480898, + "learning_rate": 9.999956218588443e-06, + "loss": 0.0393, + "step": 285 + }, + { + "epoch": 0.10339840925524223, + "grad_norm": 2.9095965698117503, + "learning_rate": 9.999942816141063e-06, + "loss": 0.1738, + "step": 286 + }, + { + "epoch": 0.10375994215473608, + "grad_norm": 0.622768154078758, + "learning_rate": 9.999927626715178e-06, + "loss": 0.0208, + "step": 287 + }, + { + "epoch": 0.10412147505422993, + "grad_norm": 1.3799139526743438, + "learning_rate": 9.999910650316214e-06, + "loss": 0.0435, + "step": 288 + }, + { + "epoch": 0.10448300795372378, + "grad_norm": 0.15061625754345834, + "learning_rate": 9.999891886950236e-06, + "loss": 0.0051, + "step": 289 + }, + { + "epoch": 0.10484454085321765, + "grad_norm": 0.2907579480307444, + "learning_rate": 9.999871336623956e-06, + "loss": 0.0084, + "step": 290 + }, + { + "epoch": 0.1052060737527115, + "grad_norm": 0.09611118039793587, + "learning_rate": 9.999848999344714e-06, + "loss": 0.0037, + "step": 291 + }, + { + "epoch": 0.10556760665220535, + "grad_norm": 0.17430597088763278, + "learning_rate": 9.999824875120495e-06, + "loss": 0.0074, + "step": 292 + }, + { + "epoch": 0.1059291395516992, + "grad_norm": 0.19408667083234127, + "learning_rate": 9.99979896395992e-06, + "loss": 0.006, + "step": 293 + }, + { + "epoch": 0.10629067245119306, + "grad_norm": 3.0153203300364413, + "learning_rate": 9.99977126587225e-06, + "loss": 0.2559, + "step": 294 + }, + { + "epoch": 0.10665220535068691, + "grad_norm": 0.15124752622899423, + "learning_rate": 9.999741780867388e-06, + "loss": 0.0059, + "step": 295 + }, + { + "epoch": 0.10701373825018076, + "grad_norm": 2.3951393812298756, + "learning_rate": 9.999710508955866e-06, + "loss": 0.2891, + "step": 296 + }, + { + "epoch": 0.10737527114967461, + "grad_norm": 0.09939555290150033, + "learning_rate": 9.999677450148864e-06, + "loss": 0.0033, + "step": 297 + }, + { + "epoch": 0.10773680404916848, + "grad_norm": 1.8570134019514948, + "learning_rate": 9.999642604458196e-06, + "loss": 0.1113, + "step": 298 + }, + { + "epoch": 0.10809833694866233, + "grad_norm": 2.224204966191308, + "learning_rate": 9.999605971896317e-06, + "loss": 0.2676, + "step": 299 + }, + { + "epoch": 0.10845986984815618, + "grad_norm": 2.9932694984389476, + "learning_rate": 9.999567552476318e-06, + "loss": 0.2773, + "step": 300 + }, + { + "epoch": 0.10882140274765004, + "grad_norm": 0.06989102954684646, + "learning_rate": 9.99952734621193e-06, + "loss": 0.0029, + "step": 301 + }, + { + "epoch": 0.10918293564714389, + "grad_norm": 1.663823397194662, + "learning_rate": 9.999485353117526e-06, + "loss": 0.1113, + "step": 302 + }, + { + "epoch": 0.10954446854663774, + "grad_norm": 0.15146679077030742, + "learning_rate": 9.99944157320811e-06, + "loss": 0.0065, + "step": 303 + }, + { + "epoch": 0.1099060014461316, + "grad_norm": 1.8765733610697186, + "learning_rate": 9.999396006499331e-06, + "loss": 0.2129, + "step": 304 + }, + { + "epoch": 0.11026753434562545, + "grad_norm": 1.9193674842067656, + "learning_rate": 9.999348653007475e-06, + "loss": 0.2461, + "step": 305 + }, + { + "epoch": 0.11062906724511931, + "grad_norm": 1.901435739123263, + "learning_rate": 9.999299512749465e-06, + "loss": 0.1836, + "step": 306 + }, + { + "epoch": 0.11099060014461316, + "grad_norm": 0.5358434682238613, + "learning_rate": 9.999248585742865e-06, + "loss": 0.0186, + "step": 307 + }, + { + "epoch": 0.11135213304410702, + "grad_norm": 0.17659686190907536, + "learning_rate": 9.999195872005874e-06, + "loss": 0.0036, + "step": 308 + }, + { + "epoch": 0.11171366594360087, + "grad_norm": 0.6503133299381231, + "learning_rate": 9.999141371557334e-06, + "loss": 0.0393, + "step": 309 + }, + { + "epoch": 0.11207519884309472, + "grad_norm": 0.46132729770372066, + "learning_rate": 9.999085084416724e-06, + "loss": 0.0233, + "step": 310 + }, + { + "epoch": 0.11243673174258857, + "grad_norm": 0.2813434720200595, + "learning_rate": 9.999027010604159e-06, + "loss": 0.0147, + "step": 311 + }, + { + "epoch": 0.11279826464208242, + "grad_norm": 0.42822035985425355, + "learning_rate": 9.998967150140395e-06, + "loss": 0.0288, + "step": 312 + }, + { + "epoch": 0.11315979754157629, + "grad_norm": 0.7424693922985044, + "learning_rate": 9.998905503046827e-06, + "loss": 0.0435, + "step": 313 + }, + { + "epoch": 0.11352133044107014, + "grad_norm": 0.8239571573350664, + "learning_rate": 9.998842069345486e-06, + "loss": 0.063, + "step": 314 + }, + { + "epoch": 0.113882863340564, + "grad_norm": 1.0932701007567336, + "learning_rate": 9.998776849059046e-06, + "loss": 0.0579, + "step": 315 + }, + { + "epoch": 0.11424439624005785, + "grad_norm": 1.1567518292841692, + "learning_rate": 9.998709842210815e-06, + "loss": 0.063, + "step": 316 + }, + { + "epoch": 0.1146059291395517, + "grad_norm": 0.9469316488126628, + "learning_rate": 9.998641048824741e-06, + "loss": 0.0635, + "step": 317 + }, + { + "epoch": 0.11496746203904555, + "grad_norm": 1.7322101891720085, + "learning_rate": 9.998570468925411e-06, + "loss": 0.1035, + "step": 318 + }, + { + "epoch": 0.1153289949385394, + "grad_norm": 0.4322525532572112, + "learning_rate": 9.99849810253805e-06, + "loss": 0.0258, + "step": 319 + }, + { + "epoch": 0.11569052783803326, + "grad_norm": 0.9680734047795788, + "learning_rate": 9.998423949688523e-06, + "loss": 0.063, + "step": 320 + }, + { + "epoch": 0.11605206073752712, + "grad_norm": 0.46624124096683134, + "learning_rate": 9.99834801040333e-06, + "loss": 0.0288, + "step": 321 + }, + { + "epoch": 0.11641359363702097, + "grad_norm": 0.7138223067139348, + "learning_rate": 9.998270284709612e-06, + "loss": 0.0322, + "step": 322 + }, + { + "epoch": 0.11677512653651483, + "grad_norm": 2.7811488718602475, + "learning_rate": 9.998190772635151e-06, + "loss": 0.1748, + "step": 323 + }, + { + "epoch": 0.11713665943600868, + "grad_norm": 1.1538656823249716, + "learning_rate": 9.998109474208363e-06, + "loss": 0.082, + "step": 324 + }, + { + "epoch": 0.11749819233550253, + "grad_norm": 0.3509561185224214, + "learning_rate": 9.998026389458301e-06, + "loss": 0.0077, + "step": 325 + }, + { + "epoch": 0.11785972523499638, + "grad_norm": 0.5438588200791205, + "learning_rate": 9.997941518414665e-06, + "loss": 0.0288, + "step": 326 + }, + { + "epoch": 0.11822125813449023, + "grad_norm": 0.5839906517848371, + "learning_rate": 9.997854861107786e-06, + "loss": 0.0187, + "step": 327 + }, + { + "epoch": 0.11858279103398409, + "grad_norm": 2.7255758331397955, + "learning_rate": 9.99776641756863e-06, + "loss": 0.0571, + "step": 328 + }, + { + "epoch": 0.11894432393347795, + "grad_norm": 1.6107264390738565, + "learning_rate": 9.997676187828816e-06, + "loss": 0.0762, + "step": 329 + }, + { + "epoch": 0.1193058568329718, + "grad_norm": 0.435608438291499, + "learning_rate": 9.997584171920583e-06, + "loss": 0.0259, + "step": 330 + }, + { + "epoch": 0.11966738973246566, + "grad_norm": 0.7858403975672837, + "learning_rate": 9.997490369876823e-06, + "loss": 0.0354, + "step": 331 + }, + { + "epoch": 0.12002892263195951, + "grad_norm": 0.6685600930012832, + "learning_rate": 9.99739478173106e-06, + "loss": 0.0286, + "step": 332 + }, + { + "epoch": 0.12039045553145336, + "grad_norm": 0.2886078995945223, + "learning_rate": 9.997297407517456e-06, + "loss": 0.0131, + "step": 333 + }, + { + "epoch": 0.12075198843094721, + "grad_norm": 0.2661523462016586, + "learning_rate": 9.997198247270816e-06, + "loss": 0.0131, + "step": 334 + }, + { + "epoch": 0.12111352133044107, + "grad_norm": 0.47282885593472657, + "learning_rate": 9.997097301026573e-06, + "loss": 0.0259, + "step": 335 + }, + { + "epoch": 0.12147505422993492, + "grad_norm": 0.8831976163828206, + "learning_rate": 9.996994568820811e-06, + "loss": 0.0435, + "step": 336 + }, + { + "epoch": 0.12183658712942878, + "grad_norm": 0.9036289389078619, + "learning_rate": 9.996890050690246e-06, + "loss": 0.0476, + "step": 337 + }, + { + "epoch": 0.12219812002892264, + "grad_norm": 0.262025326474556, + "learning_rate": 9.996783746672229e-06, + "loss": 0.0131, + "step": 338 + }, + { + "epoch": 0.12255965292841649, + "grad_norm": 0.1724784045137916, + "learning_rate": 9.996675656804757e-06, + "loss": 0.0082, + "step": 339 + }, + { + "epoch": 0.12292118582791034, + "grad_norm": 0.22858339508053407, + "learning_rate": 9.99656578112646e-06, + "loss": 0.0104, + "step": 340 + }, + { + "epoch": 0.12328271872740419, + "grad_norm": 0.16505136768113032, + "learning_rate": 9.996454119676607e-06, + "loss": 0.0082, + "step": 341 + }, + { + "epoch": 0.12364425162689804, + "grad_norm": 0.459694777947965, + "learning_rate": 9.996340672495104e-06, + "loss": 0.0208, + "step": 342 + }, + { + "epoch": 0.1240057845263919, + "grad_norm": 0.11355992573406032, + "learning_rate": 9.996225439622501e-06, + "loss": 0.0058, + "step": 343 + }, + { + "epoch": 0.12436731742588576, + "grad_norm": 1.1752018385365521, + "learning_rate": 9.99610842109998e-06, + "loss": 0.0525, + "step": 344 + }, + { + "epoch": 0.12472885032537961, + "grad_norm": 0.04182879225414766, + "learning_rate": 9.995989616969363e-06, + "loss": 0.0018, + "step": 345 + }, + { + "epoch": 0.12509038322487345, + "grad_norm": 0.030197175765298442, + "learning_rate": 9.995869027273113e-06, + "loss": 0.0014, + "step": 346 + }, + { + "epoch": 0.12545191612436732, + "grad_norm": 0.19975034118749982, + "learning_rate": 9.995746652054325e-06, + "loss": 0.0082, + "step": 347 + }, + { + "epoch": 0.12581344902386118, + "grad_norm": 0.06870109103087477, + "learning_rate": 9.99562249135674e-06, + "loss": 0.0026, + "step": 348 + }, + { + "epoch": 0.12617498192335502, + "grad_norm": 3.303640245827721, + "learning_rate": 9.995496545224729e-06, + "loss": 0.2246, + "step": 349 + }, + { + "epoch": 0.1265365148228489, + "grad_norm": 0.07371886267660688, + "learning_rate": 9.995368813703307e-06, + "loss": 0.0029, + "step": 350 + }, + { + "epoch": 0.12689804772234273, + "grad_norm": 0.26070871984773253, + "learning_rate": 9.995239296838126e-06, + "loss": 0.0117, + "step": 351 + }, + { + "epoch": 0.1272595806218366, + "grad_norm": 0.01781240267593001, + "learning_rate": 9.995107994675475e-06, + "loss": 0.0007, + "step": 352 + }, + { + "epoch": 0.12762111352133043, + "grad_norm": 2.446669033652435, + "learning_rate": 9.99497490726228e-06, + "loss": 0.2676, + "step": 353 + }, + { + "epoch": 0.1279826464208243, + "grad_norm": 0.06438658644020795, + "learning_rate": 9.994840034646108e-06, + "loss": 0.0025, + "step": 354 + }, + { + "epoch": 0.12834417932031814, + "grad_norm": 2.516979141834116, + "learning_rate": 9.99470337687516e-06, + "loss": 0.2246, + "step": 355 + }, + { + "epoch": 0.128705712219812, + "grad_norm": 1.7282743562511258, + "learning_rate": 9.994564933998281e-06, + "loss": 0.0635, + "step": 356 + }, + { + "epoch": 0.12906724511930587, + "grad_norm": 0.41956657456686597, + "learning_rate": 9.994424706064946e-06, + "loss": 0.0149, + "step": 357 + }, + { + "epoch": 0.1294287780187997, + "grad_norm": 0.07604661642564217, + "learning_rate": 9.99428269312528e-06, + "loss": 0.0025, + "step": 358 + }, + { + "epoch": 0.12979031091829357, + "grad_norm": 0.03855120683611115, + "learning_rate": 9.994138895230029e-06, + "loss": 0.0016, + "step": 359 + }, + { + "epoch": 0.1301518438177874, + "grad_norm": 0.12383023649387166, + "learning_rate": 9.993993312430592e-06, + "loss": 0.0035, + "step": 360 + }, + { + "epoch": 0.13051337671728128, + "grad_norm": 0.06922233127432541, + "learning_rate": 9.993845944779e-06, + "loss": 0.0019, + "step": 361 + }, + { + "epoch": 0.13087490961677511, + "grad_norm": 0.030988780900421516, + "learning_rate": 9.99369679232792e-06, + "loss": 0.0014, + "step": 362 + }, + { + "epoch": 0.13123644251626898, + "grad_norm": 0.08624473779458197, + "learning_rate": 9.993545855130662e-06, + "loss": 0.0032, + "step": 363 + }, + { + "epoch": 0.13159797541576285, + "grad_norm": 0.5862845366653742, + "learning_rate": 9.993393133241167e-06, + "loss": 0.0208, + "step": 364 + }, + { + "epoch": 0.13195950831525669, + "grad_norm": 42.43895800682535, + "learning_rate": 9.993238626714021e-06, + "loss": 4.6875, + "step": 365 + }, + { + "epoch": 0.13232104121475055, + "grad_norm": 0.07248243671168537, + "learning_rate": 9.993082335604445e-06, + "loss": 0.0028, + "step": 366 + }, + { + "epoch": 0.1326825741142444, + "grad_norm": 2.338059417226217, + "learning_rate": 9.992924259968292e-06, + "loss": 0.1553, + "step": 367 + }, + { + "epoch": 0.13304410701373826, + "grad_norm": 0.43557959981756056, + "learning_rate": 9.992764399862067e-06, + "loss": 0.0132, + "step": 368 + }, + { + "epoch": 0.1334056399132321, + "grad_norm": 1.7087812999590692, + "learning_rate": 9.992602755342896e-06, + "loss": 0.063, + "step": 369 + }, + { + "epoch": 0.13376717281272596, + "grad_norm": 0.24065197301691285, + "learning_rate": 9.992439326468554e-06, + "loss": 0.0117, + "step": 370 + }, + { + "epoch": 0.13412870571221983, + "grad_norm": 0.1578913401345198, + "learning_rate": 9.992274113297453e-06, + "loss": 0.0065, + "step": 371 + }, + { + "epoch": 0.13449023861171366, + "grad_norm": 0.06616995082700143, + "learning_rate": 9.992107115888637e-06, + "loss": 0.0028, + "step": 372 + }, + { + "epoch": 0.13485177151120753, + "grad_norm": 0.06150121994102503, + "learning_rate": 9.991938334301789e-06, + "loss": 0.0025, + "step": 373 + }, + { + "epoch": 0.13521330441070137, + "grad_norm": 0.0328531562846898, + "learning_rate": 9.991767768597233e-06, + "loss": 0.001, + "step": 374 + }, + { + "epoch": 0.13557483731019523, + "grad_norm": 0.10580118134140508, + "learning_rate": 9.991595418835933e-06, + "loss": 0.003, + "step": 375 + }, + { + "epoch": 0.13593637020968907, + "grad_norm": 0.17639524378024277, + "learning_rate": 9.991421285079484e-06, + "loss": 0.0042, + "step": 376 + }, + { + "epoch": 0.13629790310918294, + "grad_norm": 0.12879156883848938, + "learning_rate": 9.991245367390119e-06, + "loss": 0.003, + "step": 377 + }, + { + "epoch": 0.13665943600867678, + "grad_norm": 4.163461836261817, + "learning_rate": 9.991067665830714e-06, + "loss": 0.2988, + "step": 378 + }, + { + "epoch": 0.13702096890817064, + "grad_norm": 0.025231981084502864, + "learning_rate": 9.990888180464777e-06, + "loss": 0.0008, + "step": 379 + }, + { + "epoch": 0.1373825018076645, + "grad_norm": 2.497859751262197, + "learning_rate": 9.990706911356459e-06, + "loss": 0.1641, + "step": 380 + }, + { + "epoch": 0.13774403470715835, + "grad_norm": 2.677318697173203, + "learning_rate": 9.990523858570544e-06, + "loss": 0.2891, + "step": 381 + }, + { + "epoch": 0.1381055676066522, + "grad_norm": 0.9349017283625811, + "learning_rate": 9.990339022172454e-06, + "loss": 0.0288, + "step": 382 + }, + { + "epoch": 0.13846710050614605, + "grad_norm": 0.05530169428794182, + "learning_rate": 9.990152402228252e-06, + "loss": 0.0022, + "step": 383 + }, + { + "epoch": 0.13882863340563992, + "grad_norm": 0.05292275521669027, + "learning_rate": 9.989963998804636e-06, + "loss": 0.0016, + "step": 384 + }, + { + "epoch": 0.13919016630513376, + "grad_norm": 2.1648860083332124, + "learning_rate": 9.989773811968938e-06, + "loss": 0.2461, + "step": 385 + }, + { + "epoch": 0.13955169920462762, + "grad_norm": 0.6325774946770601, + "learning_rate": 9.989581841789132e-06, + "loss": 0.0288, + "step": 386 + }, + { + "epoch": 0.1399132321041215, + "grad_norm": 0.24631044671490648, + "learning_rate": 9.989388088333829e-06, + "loss": 0.0059, + "step": 387 + }, + { + "epoch": 0.14027476500361533, + "grad_norm": 0.5895276305669332, + "learning_rate": 9.989192551672278e-06, + "loss": 0.0233, + "step": 388 + }, + { + "epoch": 0.1406362979031092, + "grad_norm": 0.4507703851966112, + "learning_rate": 9.98899523187436e-06, + "loss": 0.0147, + "step": 389 + }, + { + "epoch": 0.14099783080260303, + "grad_norm": 0.7916507896431818, + "learning_rate": 9.9887961290106e-06, + "loss": 0.0208, + "step": 390 + }, + { + "epoch": 0.1413593637020969, + "grad_norm": 0.0893560718598237, + "learning_rate": 9.988595243152155e-06, + "loss": 0.004, + "step": 391 + }, + { + "epoch": 0.14172089660159073, + "grad_norm": 0.12514958438381923, + "learning_rate": 9.988392574370825e-06, + "loss": 0.0034, + "step": 392 + }, + { + "epoch": 0.1420824295010846, + "grad_norm": 0.9481150017564033, + "learning_rate": 9.988188122739039e-06, + "loss": 0.0289, + "step": 393 + }, + { + "epoch": 0.14244396240057844, + "grad_norm": 0.7259539300518246, + "learning_rate": 9.987981888329874e-06, + "loss": 0.0135, + "step": 394 + }, + { + "epoch": 0.1428054953000723, + "grad_norm": 1.5102339451384743, + "learning_rate": 9.987773871217033e-06, + "loss": 0.0435, + "step": 395 + }, + { + "epoch": 0.14316702819956617, + "grad_norm": 0.09102611125652296, + "learning_rate": 9.987564071474862e-06, + "loss": 0.0023, + "step": 396 + }, + { + "epoch": 0.14352856109906, + "grad_norm": 0.379819809094871, + "learning_rate": 9.987352489178346e-06, + "loss": 0.0131, + "step": 397 + }, + { + "epoch": 0.14389009399855388, + "grad_norm": 0.10437601978957425, + "learning_rate": 9.987139124403102e-06, + "loss": 0.0052, + "step": 398 + }, + { + "epoch": 0.1442516268980477, + "grad_norm": 0.05127813869826543, + "learning_rate": 9.986923977225388e-06, + "loss": 0.002, + "step": 399 + }, + { + "epoch": 0.14461315979754158, + "grad_norm": 0.11368312526004382, + "learning_rate": 9.986707047722097e-06, + "loss": 0.0036, + "step": 400 + }, + { + "epoch": 0.14497469269703542, + "grad_norm": 0.06788182022060937, + "learning_rate": 9.986488335970759e-06, + "loss": 0.0028, + "step": 401 + }, + { + "epoch": 0.14533622559652928, + "grad_norm": 1.7577125281269546, + "learning_rate": 9.986267842049542e-06, + "loss": 0.0693, + "step": 402 + }, + { + "epoch": 0.14569775849602315, + "grad_norm": 2.334205973148439, + "learning_rate": 9.986045566037252e-06, + "loss": 0.2129, + "step": 403 + }, + { + "epoch": 0.146059291395517, + "grad_norm": 0.1177060826172035, + "learning_rate": 9.985821508013327e-06, + "loss": 0.0036, + "step": 404 + }, + { + "epoch": 0.14642082429501085, + "grad_norm": 2.1065448687989647, + "learning_rate": 9.985595668057848e-06, + "loss": 0.3418, + "step": 405 + }, + { + "epoch": 0.1467823571945047, + "grad_norm": 0.49914981259827085, + "learning_rate": 9.98536804625153e-06, + "loss": 0.0208, + "step": 406 + }, + { + "epoch": 0.14714389009399856, + "grad_norm": 2.0151532169238924, + "learning_rate": 9.985138642675723e-06, + "loss": 0.2891, + "step": 407 + }, + { + "epoch": 0.1475054229934924, + "grad_norm": 2.688077339152591, + "learning_rate": 9.984907457412419e-06, + "loss": 0.3105, + "step": 408 + }, + { + "epoch": 0.14786695589298626, + "grad_norm": 0.12843924280405408, + "learning_rate": 9.98467449054424e-06, + "loss": 0.0057, + "step": 409 + }, + { + "epoch": 0.14822848879248013, + "grad_norm": 0.055064860954369146, + "learning_rate": 9.98443974215445e-06, + "loss": 0.0019, + "step": 410 + }, + { + "epoch": 0.14859002169197397, + "grad_norm": 2.526015187216938, + "learning_rate": 9.98420321232695e-06, + "loss": 0.1836, + "step": 411 + }, + { + "epoch": 0.14895155459146783, + "grad_norm": 0.04951435269137093, + "learning_rate": 9.983964901146272e-06, + "loss": 0.0013, + "step": 412 + }, + { + "epoch": 0.14931308749096167, + "grad_norm": 2.138977857899028, + "learning_rate": 9.983724808697591e-06, + "loss": 0.1367, + "step": 413 + }, + { + "epoch": 0.14967462039045554, + "grad_norm": 0.07835634503588947, + "learning_rate": 9.983482935066716e-06, + "loss": 0.0018, + "step": 414 + }, + { + "epoch": 0.15003615328994938, + "grad_norm": 2.22771798387388, + "learning_rate": 9.98323928034009e-06, + "loss": 0.2246, + "step": 415 + }, + { + "epoch": 0.15039768618944324, + "grad_norm": 0.7255073124376292, + "learning_rate": 9.982993844604799e-06, + "loss": 0.0231, + "step": 416 + }, + { + "epoch": 0.15075921908893708, + "grad_norm": 1.3729498024268494, + "learning_rate": 9.982746627948556e-06, + "loss": 0.1191, + "step": 417 + }, + { + "epoch": 0.15112075198843095, + "grad_norm": 0.17463876459862285, + "learning_rate": 9.982497630459723e-06, + "loss": 0.0041, + "step": 418 + }, + { + "epoch": 0.1514822848879248, + "grad_norm": 1.0584591339083746, + "learning_rate": 9.982246852227287e-06, + "loss": 0.0688, + "step": 419 + }, + { + "epoch": 0.15184381778741865, + "grad_norm": 1.4376539617313584, + "learning_rate": 9.981994293340878e-06, + "loss": 0.0815, + "step": 420 + }, + { + "epoch": 0.15220535068691252, + "grad_norm": 1.5571016830467734, + "learning_rate": 9.98173995389076e-06, + "loss": 0.0889, + "step": 421 + }, + { + "epoch": 0.15256688358640635, + "grad_norm": 3.0748892638100234, + "learning_rate": 9.981483833967833e-06, + "loss": 0.2461, + "step": 422 + }, + { + "epoch": 0.15292841648590022, + "grad_norm": 0.6682961997740948, + "learning_rate": 9.981225933663634e-06, + "loss": 0.0206, + "step": 423 + }, + { + "epoch": 0.15328994938539406, + "grad_norm": 1.5408642946782047, + "learning_rate": 9.98096625307034e-06, + "loss": 0.1035, + "step": 424 + }, + { + "epoch": 0.15365148228488792, + "grad_norm": 0.06343721837980008, + "learning_rate": 9.980704792280758e-06, + "loss": 0.0006, + "step": 425 + }, + { + "epoch": 0.1540130151843818, + "grad_norm": 1.3406892036110756, + "learning_rate": 9.980441551388332e-06, + "loss": 0.0957, + "step": 426 + }, + { + "epoch": 0.15437454808387563, + "grad_norm": 1.0123355501546634, + "learning_rate": 9.980176530487149e-06, + "loss": 0.0393, + "step": 427 + }, + { + "epoch": 0.1547360809833695, + "grad_norm": 0.61598710937598, + "learning_rate": 9.979909729671923e-06, + "loss": 0.0476, + "step": 428 + }, + { + "epoch": 0.15509761388286333, + "grad_norm": 1.1377340604038666, + "learning_rate": 9.979641149038013e-06, + "loss": 0.0815, + "step": 429 + }, + { + "epoch": 0.1554591467823572, + "grad_norm": 2.1982798708997544, + "learning_rate": 9.979370788681406e-06, + "loss": 0.1367, + "step": 430 + }, + { + "epoch": 0.15582067968185104, + "grad_norm": 0.9426994429716684, + "learning_rate": 9.979098648698731e-06, + "loss": 0.0815, + "step": 431 + }, + { + "epoch": 0.1561822125813449, + "grad_norm": 0.5784624848794515, + "learning_rate": 9.978824729187248e-06, + "loss": 0.0391, + "step": 432 + }, + { + "epoch": 0.15654374548083877, + "grad_norm": 3.611519415501974, + "learning_rate": 9.978549030244858e-06, + "loss": 0.2559, + "step": 433 + }, + { + "epoch": 0.1569052783803326, + "grad_norm": 0.5147753033868753, + "learning_rate": 9.978271551970095e-06, + "loss": 0.0354, + "step": 434 + }, + { + "epoch": 0.15726681127982647, + "grad_norm": 0.40714515652015076, + "learning_rate": 9.97799229446213e-06, + "loss": 0.0286, + "step": 435 + }, + { + "epoch": 0.1576283441793203, + "grad_norm": 0.5301876853816998, + "learning_rate": 9.977711257820772e-06, + "loss": 0.0354, + "step": 436 + }, + { + "epoch": 0.15798987707881418, + "grad_norm": 0.44517903290671373, + "learning_rate": 9.977428442146459e-06, + "loss": 0.032, + "step": 437 + }, + { + "epoch": 0.15835140997830802, + "grad_norm": 0.45263812026260025, + "learning_rate": 9.977143847540272e-06, + "loss": 0.0206, + "step": 438 + }, + { + "epoch": 0.15871294287780188, + "grad_norm": 0.4104264526653474, + "learning_rate": 9.976857474103922e-06, + "loss": 0.0148, + "step": 439 + }, + { + "epoch": 0.15907447577729572, + "grad_norm": 0.1522075667665534, + "learning_rate": 9.976569321939763e-06, + "loss": 0.0047, + "step": 440 + }, + { + "epoch": 0.1594360086767896, + "grad_norm": 0.34003918178469245, + "learning_rate": 9.976279391150778e-06, + "loss": 0.0148, + "step": 441 + }, + { + "epoch": 0.15979754157628345, + "grad_norm": 0.28105466921810707, + "learning_rate": 9.975987681840589e-06, + "loss": 0.0165, + "step": 442 + }, + { + "epoch": 0.1601590744757773, + "grad_norm": 0.2636203457160436, + "learning_rate": 9.975694194113452e-06, + "loss": 0.0104, + "step": 443 + }, + { + "epoch": 0.16052060737527116, + "grad_norm": 15.185832818852065, + "learning_rate": 9.975398928074262e-06, + "loss": 3.6094, + "step": 444 + }, + { + "epoch": 0.160882140274765, + "grad_norm": 0.3692698616755233, + "learning_rate": 9.975101883828543e-06, + "loss": 0.0148, + "step": 445 + }, + { + "epoch": 0.16124367317425886, + "grad_norm": 1.0692395389121925, + "learning_rate": 9.97480306148246e-06, + "loss": 0.0393, + "step": 446 + }, + { + "epoch": 0.1616052060737527, + "grad_norm": 0.2823320807890828, + "learning_rate": 9.974502461142815e-06, + "loss": 0.0184, + "step": 447 + }, + { + "epoch": 0.16196673897324657, + "grad_norm": 0.1471658995207907, + "learning_rate": 9.97420008291704e-06, + "loss": 0.0033, + "step": 448 + }, + { + "epoch": 0.16232827187274043, + "grad_norm": 0.06165878324816701, + "learning_rate": 9.973895926913203e-06, + "loss": 0.0028, + "step": 449 + }, + { + "epoch": 0.16268980477223427, + "grad_norm": 0.35910451332200616, + "learning_rate": 9.973589993240015e-06, + "loss": 0.0147, + "step": 450 + }, + { + "epoch": 0.16305133767172814, + "grad_norm": 0.20219038827849165, + "learning_rate": 9.973282282006812e-06, + "loss": 0.0103, + "step": 451 + }, + { + "epoch": 0.16341287057122197, + "grad_norm": 0.19826055597016617, + "learning_rate": 9.972972793323568e-06, + "loss": 0.0117, + "step": 452 + }, + { + "epoch": 0.16377440347071584, + "grad_norm": 0.1093004758106876, + "learning_rate": 9.9726615273009e-06, + "loss": 0.0029, + "step": 453 + }, + { + "epoch": 0.16413593637020968, + "grad_norm": 0.08252338302503337, + "learning_rate": 9.97234848405005e-06, + "loss": 0.004, + "step": 454 + }, + { + "epoch": 0.16449746926970354, + "grad_norm": 3.431346259605653, + "learning_rate": 9.9720336636829e-06, + "loss": 0.1934, + "step": 455 + }, + { + "epoch": 0.1648590021691974, + "grad_norm": 0.23246100599526007, + "learning_rate": 9.971717066311971e-06, + "loss": 0.0093, + "step": 456 + }, + { + "epoch": 0.16522053506869125, + "grad_norm": 0.0936435658832973, + "learning_rate": 9.971398692050411e-06, + "loss": 0.0029, + "step": 457 + }, + { + "epoch": 0.16558206796818511, + "grad_norm": 1.2832323009527982, + "learning_rate": 9.971078541012007e-06, + "loss": 0.0576, + "step": 458 + }, + { + "epoch": 0.16594360086767895, + "grad_norm": 0.08115023904340299, + "learning_rate": 9.97075661331118e-06, + "loss": 0.004, + "step": 459 + }, + { + "epoch": 0.16630513376717282, + "grad_norm": 0.07518720033544407, + "learning_rate": 9.97043290906299e-06, + "loss": 0.0036, + "step": 460 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 1.3939107537837845, + "learning_rate": 9.970107428383129e-06, + "loss": 0.0476, + "step": 461 + }, + { + "epoch": 0.16702819956616052, + "grad_norm": 1.7300994289058766, + "learning_rate": 9.969780171387919e-06, + "loss": 0.2559, + "step": 462 + }, + { + "epoch": 0.16738973246565436, + "grad_norm": 0.0908035082773377, + "learning_rate": 9.969451138194326e-06, + "loss": 0.0036, + "step": 463 + }, + { + "epoch": 0.16775126536514823, + "grad_norm": 1.162484852623909, + "learning_rate": 9.969120328919946e-06, + "loss": 0.0688, + "step": 464 + }, + { + "epoch": 0.1681127982646421, + "grad_norm": 0.5198427146427821, + "learning_rate": 9.968787743683008e-06, + "loss": 0.0206, + "step": 465 + }, + { + "epoch": 0.16847433116413593, + "grad_norm": 0.11392572516016286, + "learning_rate": 9.968453382602378e-06, + "loss": 0.004, + "step": 466 + }, + { + "epoch": 0.1688358640636298, + "grad_norm": 2.323359677877996, + "learning_rate": 9.968117245797559e-06, + "loss": 0.2891, + "step": 467 + }, + { + "epoch": 0.16919739696312364, + "grad_norm": 0.05464193724627972, + "learning_rate": 9.967779333388682e-06, + "loss": 0.0023, + "step": 468 + }, + { + "epoch": 0.1695589298626175, + "grad_norm": 0.06341204461732629, + "learning_rate": 9.967439645496523e-06, + "loss": 0.0028, + "step": 469 + }, + { + "epoch": 0.16992046276211134, + "grad_norm": 1.7826015678620226, + "learning_rate": 9.967098182242482e-06, + "loss": 0.0576, + "step": 470 + }, + { + "epoch": 0.1702819956616052, + "grad_norm": 19.422942324445362, + "learning_rate": 9.9667549437486e-06, + "loss": 2.2344, + "step": 471 + }, + { + "epoch": 0.17064352856109907, + "grad_norm": 0.09566344182831575, + "learning_rate": 9.966409930137548e-06, + "loss": 0.0058, + "step": 472 + }, + { + "epoch": 0.1710050614605929, + "grad_norm": 1.8691218698361811, + "learning_rate": 9.966063141532634e-06, + "loss": 0.2988, + "step": 473 + }, + { + "epoch": 0.17136659436008678, + "grad_norm": 2.396309525267796, + "learning_rate": 9.965714578057803e-06, + "loss": 0.1641, + "step": 474 + }, + { + "epoch": 0.17172812725958062, + "grad_norm": 0.15975633157753547, + "learning_rate": 9.965364239837629e-06, + "loss": 0.0073, + "step": 475 + }, + { + "epoch": 0.17208966015907448, + "grad_norm": 0.10384674100879272, + "learning_rate": 9.965012126997325e-06, + "loss": 0.0057, + "step": 476 + }, + { + "epoch": 0.17245119305856832, + "grad_norm": 0.06150704212880218, + "learning_rate": 9.964658239662734e-06, + "loss": 0.0032, + "step": 477 + }, + { + "epoch": 0.17281272595806219, + "grad_norm": 0.8807045540479174, + "learning_rate": 9.964302577960334e-06, + "loss": 0.0391, + "step": 478 + }, + { + "epoch": 0.17317425885755602, + "grad_norm": 0.0896988197433753, + "learning_rate": 9.963945142017241e-06, + "loss": 0.0045, + "step": 479 + }, + { + "epoch": 0.1735357917570499, + "grad_norm": 1.032575293565955, + "learning_rate": 9.963585931961203e-06, + "loss": 0.0435, + "step": 480 + }, + { + "epoch": 0.17389732465654376, + "grad_norm": 0.037717204085536256, + "learning_rate": 9.963224947920601e-06, + "loss": 0.0017, + "step": 481 + }, + { + "epoch": 0.1742588575560376, + "grad_norm": 1.754607966944283, + "learning_rate": 9.962862190024449e-06, + "loss": 0.0815, + "step": 482 + }, + { + "epoch": 0.17462039045553146, + "grad_norm": 0.08092630707585863, + "learning_rate": 9.962497658402396e-06, + "loss": 0.0051, + "step": 483 + }, + { + "epoch": 0.1749819233550253, + "grad_norm": 0.10778721264970846, + "learning_rate": 9.96213135318473e-06, + "loss": 0.0046, + "step": 484 + }, + { + "epoch": 0.17534345625451916, + "grad_norm": 1.5863844478448366, + "learning_rate": 9.961763274502364e-06, + "loss": 0.2344, + "step": 485 + }, + { + "epoch": 0.175704989154013, + "grad_norm": 0.8827977542627127, + "learning_rate": 9.961393422486851e-06, + "loss": 0.0259, + "step": 486 + }, + { + "epoch": 0.17606652205350687, + "grad_norm": 0.25601068882246614, + "learning_rate": 9.961021797270376e-06, + "loss": 0.0184, + "step": 487 + }, + { + "epoch": 0.17642805495300073, + "grad_norm": 0.42438476269008873, + "learning_rate": 9.960648398985758e-06, + "loss": 0.0132, + "step": 488 + }, + { + "epoch": 0.17678958785249457, + "grad_norm": 0.913705141786148, + "learning_rate": 9.960273227766448e-06, + "loss": 0.0148, + "step": 489 + }, + { + "epoch": 0.17715112075198844, + "grad_norm": 0.15528922178349647, + "learning_rate": 9.959896283746535e-06, + "loss": 0.0082, + "step": 490 + }, + { + "epoch": 0.17751265365148228, + "grad_norm": 0.143317607814118, + "learning_rate": 9.959517567060736e-06, + "loss": 0.0037, + "step": 491 + }, + { + "epoch": 0.17787418655097614, + "grad_norm": 1.1690563628596953, + "learning_rate": 9.959137077844405e-06, + "loss": 0.0187, + "step": 492 + }, + { + "epoch": 0.17823571945046998, + "grad_norm": 0.15340858164598112, + "learning_rate": 9.95875481623353e-06, + "loss": 0.0073, + "step": 493 + }, + { + "epoch": 0.17859725234996385, + "grad_norm": 1.624210533192456, + "learning_rate": 9.958370782364728e-06, + "loss": 0.0815, + "step": 494 + }, + { + "epoch": 0.1789587852494577, + "grad_norm": 0.5888149299589495, + "learning_rate": 9.957984976375258e-06, + "loss": 0.0286, + "step": 495 + }, + { + "epoch": 0.17932031814895155, + "grad_norm": 0.44284434233761993, + "learning_rate": 9.957597398403e-06, + "loss": 0.0118, + "step": 496 + }, + { + "epoch": 0.17968185104844542, + "grad_norm": 0.6204852824887075, + "learning_rate": 9.95720804858648e-06, + "loss": 0.0187, + "step": 497 + }, + { + "epoch": 0.18004338394793926, + "grad_norm": 0.35230471216961246, + "learning_rate": 9.95681692706485e-06, + "loss": 0.0052, + "step": 498 + }, + { + "epoch": 0.18040491684743312, + "grad_norm": 1.60471283105409, + "learning_rate": 9.956424033977896e-06, + "loss": 0.1836, + "step": 499 + }, + { + "epoch": 0.18076644974692696, + "grad_norm": 0.011689460418429541, + "learning_rate": 9.956029369466038e-06, + "loss": 0.0004, + "step": 500 + }, + { + "epoch": 0.18112798264642083, + "grad_norm": 3.0943932506974425, + "learning_rate": 9.955632933670329e-06, + "loss": 0.1035, + "step": 501 + }, + { + "epoch": 0.18148951554591466, + "grad_norm": 1.4746334288225875, + "learning_rate": 9.955234726732455e-06, + "loss": 0.053, + "step": 502 + }, + { + "epoch": 0.18185104844540853, + "grad_norm": 0.8452643425334224, + "learning_rate": 9.954834748794737e-06, + "loss": 0.0432, + "step": 503 + }, + { + "epoch": 0.1822125813449024, + "grad_norm": 1.6893469142648814, + "learning_rate": 9.954433000000123e-06, + "loss": 0.1191, + "step": 504 + }, + { + "epoch": 0.18257411424439624, + "grad_norm": 0.8603592410218831, + "learning_rate": 9.954029480492202e-06, + "loss": 0.0289, + "step": 505 + }, + { + "epoch": 0.1829356471438901, + "grad_norm": 0.3480223943449447, + "learning_rate": 9.95362419041519e-06, + "loss": 0.0166, + "step": 506 + }, + { + "epoch": 0.18329718004338394, + "grad_norm": 0.4686749542871695, + "learning_rate": 9.953217129913939e-06, + "loss": 0.0075, + "step": 507 + }, + { + "epoch": 0.1836587129428778, + "grad_norm": 10.368182583535466, + "learning_rate": 9.95280829913393e-06, + "loss": 2.0781, + "step": 508 + }, + { + "epoch": 0.18402024584237164, + "grad_norm": 0.02137145236416931, + "learning_rate": 9.95239769822128e-06, + "loss": 0.0006, + "step": 509 + }, + { + "epoch": 0.1843817787418655, + "grad_norm": 1.3015093563504205, + "learning_rate": 9.951985327322738e-06, + "loss": 0.063, + "step": 510 + }, + { + "epoch": 0.18474331164135938, + "grad_norm": 1.1929239969413556, + "learning_rate": 9.951571186585685e-06, + "loss": 0.063, + "step": 511 + }, + { + "epoch": 0.18510484454085321, + "grad_norm": 1.6173579800133464, + "learning_rate": 9.951155276158134e-06, + "loss": 0.0476, + "step": 512 + }, + { + "epoch": 0.18546637744034708, + "grad_norm": 0.002498246704149135, + "learning_rate": 9.950737596188733e-06, + "loss": 0.0001, + "step": 513 + }, + { + "epoch": 0.18582791033984092, + "grad_norm": 2.2533497830663647, + "learning_rate": 9.950318146826759e-06, + "loss": 0.0957, + "step": 514 + }, + { + "epoch": 0.18618944323933478, + "grad_norm": 1.6091458986585032, + "learning_rate": 9.949896928222126e-06, + "loss": 0.0525, + "step": 515 + }, + { + "epoch": 0.18655097613882862, + "grad_norm": 4.767370492133749, + "learning_rate": 9.949473940525374e-06, + "loss": 0.3105, + "step": 516 + }, + { + "epoch": 0.1869125090383225, + "grad_norm": 0.0032756618835387877, + "learning_rate": 9.94904918388768e-06, + "loss": 0.0001, + "step": 517 + }, + { + "epoch": 0.18727404193781635, + "grad_norm": 0.11766039342461293, + "learning_rate": 9.948622658460853e-06, + "loss": 0.0029, + "step": 518 + }, + { + "epoch": 0.1876355748373102, + "grad_norm": 0.005763040184500082, + "learning_rate": 9.948194364397332e-06, + "loss": 0.0002, + "step": 519 + }, + { + "epoch": 0.18799710773680406, + "grad_norm": 0.5734942811398016, + "learning_rate": 9.94776430185019e-06, + "loss": 0.0354, + "step": 520 + }, + { + "epoch": 0.1883586406362979, + "grad_norm": 4.257119111639193, + "learning_rate": 9.94733247097313e-06, + "loss": 0.2344, + "step": 521 + }, + { + "epoch": 0.18872017353579176, + "grad_norm": 0.03382462572425305, + "learning_rate": 9.946898871920489e-06, + "loss": 0.0006, + "step": 522 + }, + { + "epoch": 0.1890817064352856, + "grad_norm": 1.883031543279253, + "learning_rate": 9.946463504847235e-06, + "loss": 0.1553, + "step": 523 + }, + { + "epoch": 0.18944323933477947, + "grad_norm": 0.004454253020411943, + "learning_rate": 9.946026369908968e-06, + "loss": 0.0002, + "step": 524 + }, + { + "epoch": 0.1898047722342733, + "grad_norm": 0.052762704570903526, + "learning_rate": 9.945587467261922e-06, + "loss": 0.0008, + "step": 525 + }, + { + "epoch": 0.19016630513376717, + "grad_norm": 0.5048786967894324, + "learning_rate": 9.945146797062957e-06, + "loss": 0.032, + "step": 526 + }, + { + "epoch": 0.19052783803326104, + "grad_norm": 4.181260564457284, + "learning_rate": 9.94470435946957e-06, + "loss": 0.4141, + "step": 527 + }, + { + "epoch": 0.19088937093275488, + "grad_norm": 1.8677348261182711, + "learning_rate": 9.944260154639891e-06, + "loss": 0.2129, + "step": 528 + }, + { + "epoch": 0.19125090383224874, + "grad_norm": 2.827589853773033, + "learning_rate": 9.943814182732674e-06, + "loss": 0.1367, + "step": 529 + }, + { + "epoch": 0.19161243673174258, + "grad_norm": 0.18906911310035102, + "learning_rate": 9.943366443907312e-06, + "loss": 0.0117, + "step": 530 + }, + { + "epoch": 0.19197396963123645, + "grad_norm": 2.747205431941601, + "learning_rate": 9.942916938323825e-06, + "loss": 0.1641, + "step": 531 + }, + { + "epoch": 0.19233550253073028, + "grad_norm": 1.3101081573349094, + "learning_rate": 9.94246566614287e-06, + "loss": 0.0957, + "step": 532 + }, + { + "epoch": 0.19269703543022415, + "grad_norm": 0.16340755139636273, + "learning_rate": 9.942012627525728e-06, + "loss": 0.0092, + "step": 533 + }, + { + "epoch": 0.19305856832971802, + "grad_norm": 4.103915734264287, + "learning_rate": 9.941557822634316e-06, + "loss": 0.2988, + "step": 534 + }, + { + "epoch": 0.19342010122921185, + "grad_norm": 1.501682519030153, + "learning_rate": 9.94110125163118e-06, + "loss": 0.1641, + "step": 535 + }, + { + "epoch": 0.19378163412870572, + "grad_norm": 1.2079143185806338, + "learning_rate": 9.9406429146795e-06, + "loss": 0.1191, + "step": 536 + }, + { + "epoch": 0.19414316702819956, + "grad_norm": 0.09557083431865748, + "learning_rate": 9.940182811943084e-06, + "loss": 0.0025, + "step": 537 + }, + { + "epoch": 0.19450469992769343, + "grad_norm": 0.3554780145481188, + "learning_rate": 9.939720943586376e-06, + "loss": 0.0255, + "step": 538 + }, + { + "epoch": 0.19486623282718726, + "grad_norm": 1.0102656187989685, + "learning_rate": 9.939257309774442e-06, + "loss": 0.0476, + "step": 539 + }, + { + "epoch": 0.19522776572668113, + "grad_norm": 0.4115251000893408, + "learning_rate": 9.93879191067299e-06, + "loss": 0.032, + "step": 540 + }, + { + "epoch": 0.195589298626175, + "grad_norm": 0.050937908305169755, + "learning_rate": 9.93832474644835e-06, + "loss": 0.0013, + "step": 541 + }, + { + "epoch": 0.19595083152566883, + "grad_norm": 0.4335999311352835, + "learning_rate": 9.93785581726749e-06, + "loss": 0.032, + "step": 542 + }, + { + "epoch": 0.1963123644251627, + "grad_norm": 0.22683060237870797, + "learning_rate": 9.937385123298002e-06, + "loss": 0.0083, + "step": 543 + }, + { + "epoch": 0.19667389732465654, + "grad_norm": 0.1005868920452253, + "learning_rate": 9.936912664708112e-06, + "loss": 0.0032, + "step": 544 + }, + { + "epoch": 0.1970354302241504, + "grad_norm": 1.4021724550678973, + "learning_rate": 9.936438441666678e-06, + "loss": 0.0576, + "step": 545 + }, + { + "epoch": 0.19739696312364424, + "grad_norm": 1.5908090031942206, + "learning_rate": 9.935962454343188e-06, + "loss": 0.0957, + "step": 546 + }, + { + "epoch": 0.1977584960231381, + "grad_norm": 0.872981793808145, + "learning_rate": 9.935484702907757e-06, + "loss": 0.063, + "step": 547 + }, + { + "epoch": 0.19812002892263195, + "grad_norm": 1.0264625066025168, + "learning_rate": 9.935005187531135e-06, + "loss": 0.063, + "step": 548 + }, + { + "epoch": 0.1984815618221258, + "grad_norm": 0.1639847333141914, + "learning_rate": 9.934523908384701e-06, + "loss": 0.0052, + "step": 549 + }, + { + "epoch": 0.19884309472161968, + "grad_norm": 0.9354561620224328, + "learning_rate": 9.934040865640463e-06, + "loss": 0.0576, + "step": 550 + }, + { + "epoch": 0.19920462762111352, + "grad_norm": 0.2220466520589381, + "learning_rate": 9.933556059471061e-06, + "loss": 0.0117, + "step": 551 + }, + { + "epoch": 0.19956616052060738, + "grad_norm": 0.31528117545073303, + "learning_rate": 9.933069490049765e-06, + "loss": 0.0233, + "step": 552 + }, + { + "epoch": 0.19992769342010122, + "grad_norm": 0.4489593366338688, + "learning_rate": 9.932581157550475e-06, + "loss": 0.0286, + "step": 553 + }, + { + "epoch": 0.2002892263195951, + "grad_norm": 0.22566067252497987, + "learning_rate": 9.932091062147717e-06, + "loss": 0.0117, + "step": 554 + }, + { + "epoch": 0.20065075921908893, + "grad_norm": 0.95111513772618, + "learning_rate": 9.931599204016658e-06, + "loss": 0.0393, + "step": 555 + }, + { + "epoch": 0.2010122921185828, + "grad_norm": 0.22700767163063446, + "learning_rate": 9.931105583333082e-06, + "loss": 0.0165, + "step": 556 + }, + { + "epoch": 0.20137382501807666, + "grad_norm": 0.165372302954391, + "learning_rate": 9.930610200273412e-06, + "loss": 0.0104, + "step": 557 + }, + { + "epoch": 0.2017353579175705, + "grad_norm": 0.24208235651885432, + "learning_rate": 9.930113055014696e-06, + "loss": 0.0085, + "step": 558 + }, + { + "epoch": 0.20209689081706436, + "grad_norm": 0.6343989919664211, + "learning_rate": 9.929614147734617e-06, + "loss": 0.0286, + "step": 559 + }, + { + "epoch": 0.2024584237165582, + "grad_norm": 0.08222630101595499, + "learning_rate": 9.92911347861148e-06, + "loss": 0.0051, + "step": 560 + }, + { + "epoch": 0.20281995661605207, + "grad_norm": 1.1321066208594828, + "learning_rate": 9.928611047824226e-06, + "loss": 0.063, + "step": 561 + }, + { + "epoch": 0.2031814895155459, + "grad_norm": 0.9466149364477172, + "learning_rate": 9.928106855552424e-06, + "loss": 0.0354, + "step": 562 + }, + { + "epoch": 0.20354302241503977, + "grad_norm": 0.10724601860934718, + "learning_rate": 9.927600901976273e-06, + "loss": 0.0065, + "step": 563 + }, + { + "epoch": 0.2039045553145336, + "grad_norm": 1.2109172408408204, + "learning_rate": 9.927093187276597e-06, + "loss": 0.2031, + "step": 564 + }, + { + "epoch": 0.20426608821402747, + "grad_norm": 0.25467819608180847, + "learning_rate": 9.926583711634857e-06, + "loss": 0.0093, + "step": 565 + }, + { + "epoch": 0.20462762111352134, + "grad_norm": 0.08671126282900724, + "learning_rate": 9.926072475233139e-06, + "loss": 0.0026, + "step": 566 + }, + { + "epoch": 0.20498915401301518, + "grad_norm": 1.406557291222627, + "learning_rate": 9.925559478254157e-06, + "loss": 0.1367, + "step": 567 + }, + { + "epoch": 0.20535068691250905, + "grad_norm": 1.7417036533500172, + "learning_rate": 9.925044720881257e-06, + "loss": 0.0393, + "step": 568 + }, + { + "epoch": 0.20571221981200288, + "grad_norm": 1.922426904035556, + "learning_rate": 9.924528203298413e-06, + "loss": 0.1553, + "step": 569 + }, + { + "epoch": 0.20607375271149675, + "grad_norm": 0.08187134120217222, + "learning_rate": 9.924009925690229e-06, + "loss": 0.0026, + "step": 570 + }, + { + "epoch": 0.2064352856109906, + "grad_norm": 4.103471324913989, + "learning_rate": 9.923489888241936e-06, + "loss": 0.2344, + "step": 571 + }, + { + "epoch": 0.20679681851048445, + "grad_norm": 0.3346692799995266, + "learning_rate": 9.922968091139397e-06, + "loss": 0.0117, + "step": 572 + }, + { + "epoch": 0.20715835140997832, + "grad_norm": 0.153306742181067, + "learning_rate": 9.9224445345691e-06, + "loss": 0.0047, + "step": 573 + }, + { + "epoch": 0.20751988430947216, + "grad_norm": 0.05223729806859658, + "learning_rate": 9.921919218718165e-06, + "loss": 0.0015, + "step": 574 + }, + { + "epoch": 0.20788141720896602, + "grad_norm": 1.595756152497511, + "learning_rate": 9.921392143774342e-06, + "loss": 0.1035, + "step": 575 + }, + { + "epoch": 0.20824295010845986, + "grad_norm": 0.21521898498562209, + "learning_rate": 9.920863309926003e-06, + "loss": 0.0103, + "step": 576 + }, + { + "epoch": 0.20860448300795373, + "grad_norm": 0.10584034127587726, + "learning_rate": 9.920332717362157e-06, + "loss": 0.0023, + "step": 577 + }, + { + "epoch": 0.20896601590744757, + "grad_norm": 0.18211375191959153, + "learning_rate": 9.919800366272436e-06, + "loss": 0.0104, + "step": 578 + }, + { + "epoch": 0.20932754880694143, + "grad_norm": 0.37118634599360706, + "learning_rate": 9.919266256847102e-06, + "loss": 0.0047, + "step": 579 + }, + { + "epoch": 0.2096890817064353, + "grad_norm": 0.24747143622234347, + "learning_rate": 9.918730389277046e-06, + "loss": 0.0147, + "step": 580 + }, + { + "epoch": 0.21005061460592914, + "grad_norm": 0.06186630147701197, + "learning_rate": 9.918192763753788e-06, + "loss": 0.0014, + "step": 581 + }, + { + "epoch": 0.210412147505423, + "grad_norm": 0.2174719393574005, + "learning_rate": 9.917653380469475e-06, + "loss": 0.0117, + "step": 582 + }, + { + "epoch": 0.21077368040491684, + "grad_norm": 1.4410835729980636, + "learning_rate": 9.91711223961688e-06, + "loss": 0.0815, + "step": 583 + }, + { + "epoch": 0.2111352133044107, + "grad_norm": 0.07169744467544663, + "learning_rate": 9.916569341389405e-06, + "loss": 0.0023, + "step": 584 + }, + { + "epoch": 0.21149674620390455, + "grad_norm": 0.1139623594264718, + "learning_rate": 9.91602468598109e-06, + "loss": 0.0026, + "step": 585 + }, + { + "epoch": 0.2118582791033984, + "grad_norm": 1.888046580940155, + "learning_rate": 9.915478273586587e-06, + "loss": 0.1279, + "step": 586 + }, + { + "epoch": 0.21221981200289225, + "grad_norm": 0.1552354439915934, + "learning_rate": 9.914930104401187e-06, + "loss": 0.0059, + "step": 587 + }, + { + "epoch": 0.21258134490238612, + "grad_norm": 0.13323682117087052, + "learning_rate": 9.914380178620807e-06, + "loss": 0.0082, + "step": 588 + }, + { + "epoch": 0.21294287780187998, + "grad_norm": 0.20205978742034858, + "learning_rate": 9.913828496441985e-06, + "loss": 0.0059, + "step": 589 + }, + { + "epoch": 0.21330441070137382, + "grad_norm": 0.3725976249745482, + "learning_rate": 9.913275058061898e-06, + "loss": 0.0131, + "step": 590 + }, + { + "epoch": 0.21366594360086769, + "grad_norm": 0.16746603734139637, + "learning_rate": 9.91271986367834e-06, + "loss": 0.004, + "step": 591 + }, + { + "epoch": 0.21402747650036152, + "grad_norm": 0.2922168311719542, + "learning_rate": 9.91216291348974e-06, + "loss": 0.0165, + "step": 592 + }, + { + "epoch": 0.2143890093998554, + "grad_norm": 1.6129352080859716, + "learning_rate": 9.911604207695153e-06, + "loss": 0.2344, + "step": 593 + }, + { + "epoch": 0.21475054229934923, + "grad_norm": 0.42061277145260456, + "learning_rate": 9.911043746494258e-06, + "loss": 0.0186, + "step": 594 + }, + { + "epoch": 0.2151120751988431, + "grad_norm": 0.0994358759492297, + "learning_rate": 9.910481530087363e-06, + "loss": 0.0051, + "step": 595 + }, + { + "epoch": 0.21547360809833696, + "grad_norm": 0.8514352765124475, + "learning_rate": 9.909917558675406e-06, + "loss": 0.0259, + "step": 596 + }, + { + "epoch": 0.2158351409978308, + "grad_norm": 0.08760733598218132, + "learning_rate": 9.90935183245995e-06, + "loss": 0.0021, + "step": 597 + }, + { + "epoch": 0.21619667389732466, + "grad_norm": 3.0465585633500867, + "learning_rate": 9.908784351643186e-06, + "loss": 0.2129, + "step": 598 + }, + { + "epoch": 0.2165582067968185, + "grad_norm": 1.8388863463706167, + "learning_rate": 9.90821511642793e-06, + "loss": 0.1367, + "step": 599 + }, + { + "epoch": 0.21691973969631237, + "grad_norm": 0.17579437924084954, + "learning_rate": 9.907644127017627e-06, + "loss": 0.0104, + "step": 600 + }, + { + "epoch": 0.2172812725958062, + "grad_norm": 1.1957691096608483, + "learning_rate": 9.907071383616349e-06, + "loss": 0.063, + "step": 601 + }, + { + "epoch": 0.21764280549530007, + "grad_norm": 0.08726971167653289, + "learning_rate": 9.906496886428793e-06, + "loss": 0.0028, + "step": 602 + }, + { + "epoch": 0.21800433839479394, + "grad_norm": 1.3895661995724533, + "learning_rate": 9.905920635660286e-06, + "loss": 0.1113, + "step": 603 + }, + { + "epoch": 0.21836587129428778, + "grad_norm": 0.22987893068335688, + "learning_rate": 9.90534263151678e-06, + "loss": 0.0131, + "step": 604 + }, + { + "epoch": 0.21872740419378164, + "grad_norm": 0.2562736042561247, + "learning_rate": 9.904762874204853e-06, + "loss": 0.0093, + "step": 605 + }, + { + "epoch": 0.21908893709327548, + "grad_norm": 0.13248662523505206, + "learning_rate": 9.90418136393171e-06, + "loss": 0.0052, + "step": 606 + }, + { + "epoch": 0.21945046999276935, + "grad_norm": 0.8924230647293501, + "learning_rate": 9.90359810090518e-06, + "loss": 0.0354, + "step": 607 + }, + { + "epoch": 0.2198120028922632, + "grad_norm": 0.29578167721408105, + "learning_rate": 9.903013085333727e-06, + "loss": 0.0229, + "step": 608 + }, + { + "epoch": 0.22017353579175705, + "grad_norm": 3.5010247280301625, + "learning_rate": 9.902426317426428e-06, + "loss": 0.3105, + "step": 609 + }, + { + "epoch": 0.2205350686912509, + "grad_norm": 0.22209198220357895, + "learning_rate": 9.901837797393e-06, + "loss": 0.0052, + "step": 610 + }, + { + "epoch": 0.22089660159074476, + "grad_norm": 0.07207728144705607, + "learning_rate": 9.901247525443778e-06, + "loss": 0.0021, + "step": 611 + }, + { + "epoch": 0.22125813449023862, + "grad_norm": 0.7493629146056063, + "learning_rate": 9.900655501789725e-06, + "loss": 0.0354, + "step": 612 + }, + { + "epoch": 0.22161966738973246, + "grad_norm": 1.6265583309335732, + "learning_rate": 9.900061726642428e-06, + "loss": 0.2031, + "step": 613 + }, + { + "epoch": 0.22198120028922633, + "grad_norm": 0.031578157750191044, + "learning_rate": 9.899466200214105e-06, + "loss": 0.001, + "step": 614 + }, + { + "epoch": 0.22234273318872017, + "grad_norm": 0.03834931322030123, + "learning_rate": 9.898868922717598e-06, + "loss": 0.0009, + "step": 615 + }, + { + "epoch": 0.22270426608821403, + "grad_norm": 0.3706332605143682, + "learning_rate": 9.89826989436637e-06, + "loss": 0.0105, + "step": 616 + }, + { + "epoch": 0.22306579898770787, + "grad_norm": 0.11625189583355125, + "learning_rate": 9.897669115374516e-06, + "loss": 0.0018, + "step": 617 + }, + { + "epoch": 0.22342733188720174, + "grad_norm": 0.783681042287793, + "learning_rate": 9.897066585956752e-06, + "loss": 0.0391, + "step": 618 + }, + { + "epoch": 0.2237888647866956, + "grad_norm": 0.0901001492801636, + "learning_rate": 9.896462306328425e-06, + "loss": 0.0013, + "step": 619 + }, + { + "epoch": 0.22415039768618944, + "grad_norm": 0.4466486268363722, + "learning_rate": 9.895856276705504e-06, + "loss": 0.0255, + "step": 620 + }, + { + "epoch": 0.2245119305856833, + "grad_norm": 0.8232728848300563, + "learning_rate": 9.895248497304581e-06, + "loss": 0.0437, + "step": 621 + }, + { + "epoch": 0.22487346348517714, + "grad_norm": 2.6764280421317395, + "learning_rate": 9.89463896834288e-06, + "loss": 0.1279, + "step": 622 + }, + { + "epoch": 0.225234996384671, + "grad_norm": 2.507838153202748, + "learning_rate": 9.894027690038244e-06, + "loss": 0.1035, + "step": 623 + }, + { + "epoch": 0.22559652928416485, + "grad_norm": 0.49253967910110313, + "learning_rate": 9.893414662609144e-06, + "loss": 0.0186, + "step": 624 + }, + { + "epoch": 0.22595806218365871, + "grad_norm": 0.04747025240055902, + "learning_rate": 9.892799886274676e-06, + "loss": 0.0005, + "step": 625 + }, + { + "epoch": 0.22631959508315258, + "grad_norm": 0.8466175845430384, + "learning_rate": 9.892183361254561e-06, + "loss": 0.0317, + "step": 626 + }, + { + "epoch": 0.22668112798264642, + "grad_norm": 2.2289457221211557, + "learning_rate": 9.891565087769145e-06, + "loss": 0.063, + "step": 627 + }, + { + "epoch": 0.22704266088214028, + "grad_norm": 3.1225212739290633, + "learning_rate": 9.890945066039402e-06, + "loss": 0.5117, + "step": 628 + }, + { + "epoch": 0.22740419378163412, + "grad_norm": 1.132990622036322, + "learning_rate": 9.890323296286923e-06, + "loss": 0.1035, + "step": 629 + }, + { + "epoch": 0.227765726681128, + "grad_norm": 0.2794596400490444, + "learning_rate": 9.889699778733928e-06, + "loss": 0.0184, + "step": 630 + }, + { + "epoch": 0.22812725958062183, + "grad_norm": 1.5204475585072572, + "learning_rate": 9.889074513603265e-06, + "loss": 0.1738, + "step": 631 + }, + { + "epoch": 0.2284887924801157, + "grad_norm": 0.20118697309187497, + "learning_rate": 9.888447501118404e-06, + "loss": 0.0009, + "step": 632 + }, + { + "epoch": 0.22885032537960953, + "grad_norm": 0.46129711551427205, + "learning_rate": 9.887818741503436e-06, + "loss": 0.0255, + "step": 633 + }, + { + "epoch": 0.2292118582791034, + "grad_norm": 0.16158507335613964, + "learning_rate": 9.887188234983082e-06, + "loss": 0.0058, + "step": 634 + }, + { + "epoch": 0.22957339117859726, + "grad_norm": 0.38088811010394963, + "learning_rate": 9.886555981782685e-06, + "loss": 0.0131, + "step": 635 + }, + { + "epoch": 0.2299349240780911, + "grad_norm": 0.2485317860060448, + "learning_rate": 9.885921982128211e-06, + "loss": 0.0165, + "step": 636 + }, + { + "epoch": 0.23029645697758497, + "grad_norm": 2.8129956623662697, + "learning_rate": 9.88528623624625e-06, + "loss": 0.2246, + "step": 637 + }, + { + "epoch": 0.2306579898770788, + "grad_norm": 0.2896513632835734, + "learning_rate": 9.884648744364021e-06, + "loss": 0.0184, + "step": 638 + }, + { + "epoch": 0.23101952277657267, + "grad_norm": 0.6698546758271161, + "learning_rate": 9.884009506709361e-06, + "loss": 0.0391, + "step": 639 + }, + { + "epoch": 0.2313810556760665, + "grad_norm": 0.9659842413650708, + "learning_rate": 9.883368523510734e-06, + "loss": 0.0815, + "step": 640 + }, + { + "epoch": 0.23174258857556038, + "grad_norm": 0.1876553537943572, + "learning_rate": 9.882725794997228e-06, + "loss": 0.0059, + "step": 641 + }, + { + "epoch": 0.23210412147505424, + "grad_norm": 0.7044873121901761, + "learning_rate": 9.882081321398554e-06, + "loss": 0.0317, + "step": 642 + }, + { + "epoch": 0.23246565437454808, + "grad_norm": 0.6527171912527191, + "learning_rate": 9.881435102945043e-06, + "loss": 0.0391, + "step": 643 + }, + { + "epoch": 0.23282718727404195, + "grad_norm": 1.0110528750393264, + "learning_rate": 9.880787139867659e-06, + "loss": 0.0476, + "step": 644 + }, + { + "epoch": 0.23318872017353579, + "grad_norm": 2.6522043760944536, + "learning_rate": 9.88013743239798e-06, + "loss": 0.3203, + "step": 645 + }, + { + "epoch": 0.23355025307302965, + "grad_norm": 1.3647827282293217, + "learning_rate": 9.879485980768213e-06, + "loss": 0.1553, + "step": 646 + }, + { + "epoch": 0.2339117859725235, + "grad_norm": 0.5950053008094998, + "learning_rate": 9.878832785211187e-06, + "loss": 0.0391, + "step": 647 + }, + { + "epoch": 0.23427331887201736, + "grad_norm": 0.9355937192853171, + "learning_rate": 9.878177845960351e-06, + "loss": 0.032, + "step": 648 + }, + { + "epoch": 0.2346348517715112, + "grad_norm": 0.3270017854006271, + "learning_rate": 9.877521163249785e-06, + "loss": 0.0255, + "step": 649 + }, + { + "epoch": 0.23499638467100506, + "grad_norm": 0.33395564943689376, + "learning_rate": 9.876862737314184e-06, + "loss": 0.0229, + "step": 650 + }, + { + "epoch": 0.23535791757049893, + "grad_norm": 0.1059748664118033, + "learning_rate": 9.876202568388868e-06, + "loss": 0.0051, + "step": 651 + }, + { + "epoch": 0.23571945046999276, + "grad_norm": 1.8635895912286489, + "learning_rate": 9.875540656709784e-06, + "loss": 0.1455, + "step": 652 + }, + { + "epoch": 0.23608098336948663, + "grad_norm": 1.5020687804107842, + "learning_rate": 9.874877002513499e-06, + "loss": 0.0476, + "step": 653 + }, + { + "epoch": 0.23644251626898047, + "grad_norm": 0.15198047168913853, + "learning_rate": 9.874211606037201e-06, + "loss": 0.0046, + "step": 654 + }, + { + "epoch": 0.23680404916847433, + "grad_norm": 0.43635236855178094, + "learning_rate": 9.873544467518705e-06, + "loss": 0.0184, + "step": 655 + }, + { + "epoch": 0.23716558206796817, + "grad_norm": 0.37050174454329415, + "learning_rate": 9.872875587196444e-06, + "loss": 0.0317, + "step": 656 + }, + { + "epoch": 0.23752711496746204, + "grad_norm": 1.6494752512825477, + "learning_rate": 9.872204965309478e-06, + "loss": 0.0525, + "step": 657 + }, + { + "epoch": 0.2378886478669559, + "grad_norm": 0.2228774810038387, + "learning_rate": 9.871532602097483e-06, + "loss": 0.0147, + "step": 658 + }, + { + "epoch": 0.23825018076644974, + "grad_norm": 0.5592785815367379, + "learning_rate": 9.870858497800766e-06, + "loss": 0.0317, + "step": 659 + }, + { + "epoch": 0.2386117136659436, + "grad_norm": 0.24307516136611407, + "learning_rate": 9.87018265266025e-06, + "loss": 0.0093, + "step": 660 + }, + { + "epoch": 0.23897324656543745, + "grad_norm": 0.4573674107224317, + "learning_rate": 9.86950506691748e-06, + "loss": 0.0165, + "step": 661 + }, + { + "epoch": 0.2393347794649313, + "grad_norm": 2.650112147397205, + "learning_rate": 9.868825740814627e-06, + "loss": 0.0889, + "step": 662 + }, + { + "epoch": 0.23969631236442515, + "grad_norm": 2.0532491802195265, + "learning_rate": 9.868144674594483e-06, + "loss": 0.2246, + "step": 663 + }, + { + "epoch": 0.24005784526391902, + "grad_norm": 0.40751944195247636, + "learning_rate": 9.867461868500459e-06, + "loss": 0.0104, + "step": 664 + }, + { + "epoch": 0.24041937816341288, + "grad_norm": 0.07523575359556796, + "learning_rate": 9.866777322776591e-06, + "loss": 0.0023, + "step": 665 + }, + { + "epoch": 0.24078091106290672, + "grad_norm": 2.117128201477639, + "learning_rate": 9.866091037667534e-06, + "loss": 0.0889, + "step": 666 + }, + { + "epoch": 0.2411424439624006, + "grad_norm": 0.15059458362953415, + "learning_rate": 9.86540301341857e-06, + "loss": 0.0092, + "step": 667 + }, + { + "epoch": 0.24150397686189443, + "grad_norm": 1.5127750119025936, + "learning_rate": 9.86471325027559e-06, + "loss": 0.1738, + "step": 668 + }, + { + "epoch": 0.2418655097613883, + "grad_norm": 0.11554652137376271, + "learning_rate": 9.864021748485126e-06, + "loss": 0.0064, + "step": 669 + }, + { + "epoch": 0.24222704266088213, + "grad_norm": 2.7682179467113652, + "learning_rate": 9.863328508294313e-06, + "loss": 0.1553, + "step": 670 + }, + { + "epoch": 0.242588575560376, + "grad_norm": 1.1477778423002811, + "learning_rate": 9.862633529950918e-06, + "loss": 0.1279, + "step": 671 + }, + { + "epoch": 0.24295010845986983, + "grad_norm": 0.19425140867988797, + "learning_rate": 9.861936813703327e-06, + "loss": 0.0094, + "step": 672 + }, + { + "epoch": 0.2433116413593637, + "grad_norm": 1.6026654018522715, + "learning_rate": 9.861238359800543e-06, + "loss": 0.1738, + "step": 673 + }, + { + "epoch": 0.24367317425885757, + "grad_norm": 0.6406584677223165, + "learning_rate": 9.860538168492198e-06, + "loss": 0.0354, + "step": 674 + }, + { + "epoch": 0.2440347071583514, + "grad_norm": 0.29131610527704227, + "learning_rate": 9.859836240028534e-06, + "loss": 0.0105, + "step": 675 + }, + { + "epoch": 0.24439624005784527, + "grad_norm": 0.6765359756612508, + "learning_rate": 9.859132574660426e-06, + "loss": 0.0476, + "step": 676 + }, + { + "epoch": 0.2447577729573391, + "grad_norm": 0.4563205014857123, + "learning_rate": 9.85842717263936e-06, + "loss": 0.0317, + "step": 677 + }, + { + "epoch": 0.24511930585683298, + "grad_norm": 0.3482489973013043, + "learning_rate": 9.857720034217446e-06, + "loss": 0.0107, + "step": 678 + }, + { + "epoch": 0.2454808387563268, + "grad_norm": 0.09552710944517014, + "learning_rate": 9.857011159647419e-06, + "loss": 0.0036, + "step": 679 + }, + { + "epoch": 0.24584237165582068, + "grad_norm": 0.5965070235742688, + "learning_rate": 9.85630054918263e-06, + "loss": 0.0258, + "step": 680 + }, + { + "epoch": 0.24620390455531455, + "grad_norm": 0.2623807868352116, + "learning_rate": 9.855588203077047e-06, + "loss": 0.0184, + "step": 681 + }, + { + "epoch": 0.24656543745480838, + "grad_norm": 0.03035832910314884, + "learning_rate": 9.854874121585266e-06, + "loss": 0.0005, + "step": 682 + }, + { + "epoch": 0.24692697035430225, + "grad_norm": 0.34783548741941284, + "learning_rate": 9.854158304962498e-06, + "loss": 0.0131, + "step": 683 + }, + { + "epoch": 0.2472885032537961, + "grad_norm": 1.099060879115793, + "learning_rate": 9.853440753464578e-06, + "loss": 0.1279, + "step": 684 + }, + { + "epoch": 0.24765003615328995, + "grad_norm": 2.2500355246189514, + "learning_rate": 9.852721467347954e-06, + "loss": 0.1553, + "step": 685 + }, + { + "epoch": 0.2480115690527838, + "grad_norm": 0.9142829107831082, + "learning_rate": 9.852000446869704e-06, + "loss": 0.063, + "step": 686 + }, + { + "epoch": 0.24837310195227766, + "grad_norm": 0.8906254508410729, + "learning_rate": 9.851277692287518e-06, + "loss": 0.0286, + "step": 687 + }, + { + "epoch": 0.24873463485177152, + "grad_norm": 1.0206414453355306, + "learning_rate": 9.850553203859707e-06, + "loss": 0.0752, + "step": 688 + }, + { + "epoch": 0.24909616775126536, + "grad_norm": 1.2564745185223447, + "learning_rate": 9.849826981845206e-06, + "loss": 0.0889, + "step": 689 + }, + { + "epoch": 0.24945770065075923, + "grad_norm": 0.300583942346695, + "learning_rate": 9.849099026503565e-06, + "loss": 0.0165, + "step": 690 + }, + { + "epoch": 0.24981923355025307, + "grad_norm": 0.8733891973677405, + "learning_rate": 9.848369338094955e-06, + "loss": 0.0815, + "step": 691 + }, + { + "epoch": 0.2501807664497469, + "grad_norm": 0.4660666861121612, + "learning_rate": 9.847637916880167e-06, + "loss": 0.0391, + "step": 692 + }, + { + "epoch": 0.25054229934924077, + "grad_norm": 0.8205901538410111, + "learning_rate": 9.84690476312061e-06, + "loss": 0.1113, + "step": 693 + }, + { + "epoch": 0.25090383224873464, + "grad_norm": 1.8413940051601243, + "learning_rate": 9.846169877078315e-06, + "loss": 0.063, + "step": 694 + }, + { + "epoch": 0.2512653651482285, + "grad_norm": 0.705620708088606, + "learning_rate": 9.845433259015929e-06, + "loss": 0.0752, + "step": 695 + }, + { + "epoch": 0.25162689804772237, + "grad_norm": 0.451680513280138, + "learning_rate": 9.844694909196717e-06, + "loss": 0.0046, + "step": 696 + }, + { + "epoch": 0.2519884309472162, + "grad_norm": 1.043124708558501, + "learning_rate": 9.843954827884568e-06, + "loss": 0.1191, + "step": 697 + }, + { + "epoch": 0.25234996384671005, + "grad_norm": 0.02854444810069108, + "learning_rate": 9.843213015343985e-06, + "loss": 0.001, + "step": 698 + }, + { + "epoch": 0.2527114967462039, + "grad_norm": 0.07391762360932815, + "learning_rate": 9.84246947184009e-06, + "loss": 0.002, + "step": 699 + }, + { + "epoch": 0.2530730296456978, + "grad_norm": 0.11215741365698025, + "learning_rate": 9.841724197638631e-06, + "loss": 0.0036, + "step": 700 + }, + { + "epoch": 0.2534345625451916, + "grad_norm": 0.5960838619152484, + "learning_rate": 9.840977193005966e-06, + "loss": 0.0432, + "step": 701 + }, + { + "epoch": 0.25379609544468545, + "grad_norm": 0.3687134925432022, + "learning_rate": 9.840228458209074e-06, + "loss": 0.0286, + "step": 702 + }, + { + "epoch": 0.2541576283441793, + "grad_norm": 0.9427808062412701, + "learning_rate": 9.839477993515549e-06, + "loss": 0.1113, + "step": 703 + }, + { + "epoch": 0.2545191612436732, + "grad_norm": 0.006861511205735992, + "learning_rate": 9.838725799193614e-06, + "loss": 0.0003, + "step": 704 + }, + { + "epoch": 0.25488069414316705, + "grad_norm": 0.8417358983799558, + "learning_rate": 9.837971875512098e-06, + "loss": 0.0889, + "step": 705 + }, + { + "epoch": 0.25524222704266086, + "grad_norm": 1.5729729844640397, + "learning_rate": 9.837216222740456e-06, + "loss": 0.082, + "step": 706 + }, + { + "epoch": 0.25560375994215473, + "grad_norm": 0.06951495910043547, + "learning_rate": 9.836458841148755e-06, + "loss": 0.002, + "step": 707 + }, + { + "epoch": 0.2559652928416486, + "grad_norm": 0.04434739903470972, + "learning_rate": 9.835699731007686e-06, + "loss": 0.001, + "step": 708 + }, + { + "epoch": 0.25632682574114246, + "grad_norm": 0.46712885768245865, + "learning_rate": 9.834938892588553e-06, + "loss": 0.0352, + "step": 709 + }, + { + "epoch": 0.25668835864063627, + "grad_norm": 1.4580328388826824, + "learning_rate": 9.834176326163281e-06, + "loss": 0.1191, + "step": 710 + }, + { + "epoch": 0.25704989154013014, + "grad_norm": 2.9279485192187322, + "learning_rate": 9.833412032004407e-06, + "loss": 0.2461, + "step": 711 + }, + { + "epoch": 0.257411424439624, + "grad_norm": 1.003899145753313, + "learning_rate": 9.832646010385097e-06, + "loss": 0.1113, + "step": 712 + }, + { + "epoch": 0.25777295733911787, + "grad_norm": 1.415586970359512, + "learning_rate": 9.831878261579122e-06, + "loss": 0.1113, + "step": 713 + }, + { + "epoch": 0.25813449023861174, + "grad_norm": 0.04079396434235705, + "learning_rate": 9.831108785860875e-06, + "loss": 0.0012, + "step": 714 + }, + { + "epoch": 0.25849602313810555, + "grad_norm": 0.8692539779030164, + "learning_rate": 9.830337583505367e-06, + "loss": 0.0688, + "step": 715 + }, + { + "epoch": 0.2588575560375994, + "grad_norm": 0.9324182925124262, + "learning_rate": 9.829564654788227e-06, + "loss": 0.0258, + "step": 716 + }, + { + "epoch": 0.2592190889370933, + "grad_norm": 0.3738025714261764, + "learning_rate": 9.8287899999857e-06, + "loss": 0.0317, + "step": 717 + }, + { + "epoch": 0.25958062183658714, + "grad_norm": 0.5412452066236726, + "learning_rate": 9.828013619374644e-06, + "loss": 0.0476, + "step": 718 + }, + { + "epoch": 0.259942154736081, + "grad_norm": 0.47840299796030966, + "learning_rate": 9.827235513232539e-06, + "loss": 0.0354, + "step": 719 + }, + { + "epoch": 0.2603036876355748, + "grad_norm": 0.41367929986459756, + "learning_rate": 9.82645568183748e-06, + "loss": 0.0391, + "step": 720 + }, + { + "epoch": 0.2606652205350687, + "grad_norm": 0.3222418239110859, + "learning_rate": 9.82567412546818e-06, + "loss": 0.0317, + "step": 721 + }, + { + "epoch": 0.26102675343456255, + "grad_norm": 0.17863996145340943, + "learning_rate": 9.824890844403968e-06, + "loss": 0.0074, + "step": 722 + }, + { + "epoch": 0.2613882863340564, + "grad_norm": 0.9502551645794642, + "learning_rate": 9.824105838924784e-06, + "loss": 0.1279, + "step": 723 + }, + { + "epoch": 0.26174981923355023, + "grad_norm": 0.36228783578704044, + "learning_rate": 9.82331910931119e-06, + "loss": 0.0286, + "step": 724 + }, + { + "epoch": 0.2621113521330441, + "grad_norm": 0.21675533353326612, + "learning_rate": 9.822530655844367e-06, + "loss": 0.0165, + "step": 725 + }, + { + "epoch": 0.26247288503253796, + "grad_norm": 0.8617259843747371, + "learning_rate": 9.821740478806104e-06, + "loss": 0.1191, + "step": 726 + }, + { + "epoch": 0.2628344179320318, + "grad_norm": 0.15726830608141537, + "learning_rate": 9.820948578478813e-06, + "loss": 0.0103, + "step": 727 + }, + { + "epoch": 0.2631959508315257, + "grad_norm": 0.7027516911410226, + "learning_rate": 9.820154955145516e-06, + "loss": 0.0354, + "step": 728 + }, + { + "epoch": 0.2635574837310195, + "grad_norm": 0.40043352426748646, + "learning_rate": 9.819359609089855e-06, + "loss": 0.0354, + "step": 729 + }, + { + "epoch": 0.26391901663051337, + "grad_norm": 0.9147421380210623, + "learning_rate": 9.818562540596087e-06, + "loss": 0.0525, + "step": 730 + }, + { + "epoch": 0.26428054953000724, + "grad_norm": 0.24383636795165536, + "learning_rate": 9.817763749949083e-06, + "loss": 0.0184, + "step": 731 + }, + { + "epoch": 0.2646420824295011, + "grad_norm": 1.1436101414845248, + "learning_rate": 9.816963237434334e-06, + "loss": 0.0815, + "step": 732 + }, + { + "epoch": 0.2650036153289949, + "grad_norm": 1.1877383762251978, + "learning_rate": 9.816161003337938e-06, + "loss": 0.1641, + "step": 733 + }, + { + "epoch": 0.2653651482284888, + "grad_norm": 5.794608846320638, + "learning_rate": 9.815357047946618e-06, + "loss": 1.7969, + "step": 734 + }, + { + "epoch": 0.26572668112798264, + "grad_norm": 0.30822732563822075, + "learning_rate": 9.814551371547704e-06, + "loss": 0.0229, + "step": 735 + }, + { + "epoch": 0.2660882140274765, + "grad_norm": 0.5797942752187166, + "learning_rate": 9.813743974429147e-06, + "loss": 0.0476, + "step": 736 + }, + { + "epoch": 0.2664497469269704, + "grad_norm": 0.09655019600294898, + "learning_rate": 9.812934856879507e-06, + "loss": 0.0041, + "step": 737 + }, + { + "epoch": 0.2668112798264642, + "grad_norm": 0.7316102490140385, + "learning_rate": 9.812124019187967e-06, + "loss": 0.0889, + "step": 738 + }, + { + "epoch": 0.26717281272595805, + "grad_norm": 0.17153327757695402, + "learning_rate": 9.811311461644317e-06, + "loss": 0.0036, + "step": 739 + }, + { + "epoch": 0.2675343456254519, + "grad_norm": 0.5667024413717481, + "learning_rate": 9.810497184538967e-06, + "loss": 0.0286, + "step": 740 + }, + { + "epoch": 0.2678958785249458, + "grad_norm": 0.37095937360452585, + "learning_rate": 9.809681188162938e-06, + "loss": 0.0354, + "step": 741 + }, + { + "epoch": 0.26825741142443965, + "grad_norm": 0.4756381814776486, + "learning_rate": 9.808863472807868e-06, + "loss": 0.0354, + "step": 742 + }, + { + "epoch": 0.26861894432393346, + "grad_norm": 1.263376282044656, + "learning_rate": 9.808044038766006e-06, + "loss": 0.1279, + "step": 743 + }, + { + "epoch": 0.26898047722342733, + "grad_norm": 1.1246832068917703, + "learning_rate": 9.807222886330221e-06, + "loss": 0.1035, + "step": 744 + }, + { + "epoch": 0.2693420101229212, + "grad_norm": 0.49099485506984375, + "learning_rate": 9.806400015793991e-06, + "loss": 0.0258, + "step": 745 + }, + { + "epoch": 0.26970354302241506, + "grad_norm": 0.184169178166989, + "learning_rate": 9.805575427451409e-06, + "loss": 0.0058, + "step": 746 + }, + { + "epoch": 0.27006507592190887, + "grad_norm": 0.0713070345209403, + "learning_rate": 9.804749121597182e-06, + "loss": 0.0032, + "step": 747 + }, + { + "epoch": 0.27042660882140274, + "grad_norm": 0.3473233802444413, + "learning_rate": 9.803921098526634e-06, + "loss": 0.0255, + "step": 748 + }, + { + "epoch": 0.2707881417208966, + "grad_norm": 1.579792214656993, + "learning_rate": 9.8030913585357e-06, + "loss": 0.2031, + "step": 749 + }, + { + "epoch": 0.27114967462039047, + "grad_norm": 0.2255667832320835, + "learning_rate": 9.802259901920927e-06, + "loss": 0.0117, + "step": 750 + }, + { + "epoch": 0.27151120751988433, + "grad_norm": 0.22855066248785624, + "learning_rate": 9.80142672897948e-06, + "loss": 0.0082, + "step": 751 + }, + { + "epoch": 0.27187274041937814, + "grad_norm": 2.5956822853383703, + "learning_rate": 9.800591840009133e-06, + "loss": 0.0889, + "step": 752 + }, + { + "epoch": 0.272234273318872, + "grad_norm": 0.06094349570694409, + "learning_rate": 9.799755235308274e-06, + "loss": 0.0015, + "step": 753 + }, + { + "epoch": 0.2725958062183659, + "grad_norm": 0.3363734378602122, + "learning_rate": 9.798916915175908e-06, + "loss": 0.0258, + "step": 754 + }, + { + "epoch": 0.27295733911785974, + "grad_norm": 0.9196186087789151, + "learning_rate": 9.798076879911649e-06, + "loss": 0.0815, + "step": 755 + }, + { + "epoch": 0.27331887201735355, + "grad_norm": 0.04542434472983415, + "learning_rate": 9.797235129815725e-06, + "loss": 0.0016, + "step": 756 + }, + { + "epoch": 0.2736804049168474, + "grad_norm": 0.38001116451106937, + "learning_rate": 9.796391665188979e-06, + "loss": 0.0317, + "step": 757 + }, + { + "epoch": 0.2740419378163413, + "grad_norm": 2.616681353682035, + "learning_rate": 9.795546486332864e-06, + "loss": 0.3203, + "step": 758 + }, + { + "epoch": 0.27440347071583515, + "grad_norm": 0.4189062642995545, + "learning_rate": 9.794699593549446e-06, + "loss": 0.0432, + "step": 759 + }, + { + "epoch": 0.274765003615329, + "grad_norm": 0.5102811142565945, + "learning_rate": 9.793850987141407e-06, + "loss": 0.0317, + "step": 760 + }, + { + "epoch": 0.27512653651482283, + "grad_norm": 0.48041376659299767, + "learning_rate": 9.793000667412034e-06, + "loss": 0.0317, + "step": 761 + }, + { + "epoch": 0.2754880694143167, + "grad_norm": 0.15495285564313585, + "learning_rate": 9.792148634665237e-06, + "loss": 0.0058, + "step": 762 + }, + { + "epoch": 0.27584960231381056, + "grad_norm": 0.0302729602155214, + "learning_rate": 9.791294889205528e-06, + "loss": 0.0012, + "step": 763 + }, + { + "epoch": 0.2762111352133044, + "grad_norm": 0.8874528004583132, + "learning_rate": 9.790439431338037e-06, + "loss": 0.0889, + "step": 764 + }, + { + "epoch": 0.2765726681127983, + "grad_norm": 1.583944119816404, + "learning_rate": 9.789582261368504e-06, + "loss": 0.1836, + "step": 765 + }, + { + "epoch": 0.2769342010122921, + "grad_norm": 1.2281467657582927, + "learning_rate": 9.78872337960328e-06, + "loss": 0.1035, + "step": 766 + }, + { + "epoch": 0.27729573391178597, + "grad_norm": 0.8745948395467474, + "learning_rate": 9.787862786349334e-06, + "loss": 0.0525, + "step": 767 + }, + { + "epoch": 0.27765726681127983, + "grad_norm": 0.6142732509486734, + "learning_rate": 9.787000481914235e-06, + "loss": 0.0576, + "step": 768 + }, + { + "epoch": 0.2780187997107737, + "grad_norm": 0.9732747457863964, + "learning_rate": 9.786136466606176e-06, + "loss": 0.1279, + "step": 769 + }, + { + "epoch": 0.2783803326102675, + "grad_norm": 0.8435475929022719, + "learning_rate": 9.785270740733954e-06, + "loss": 0.1367, + "step": 770 + }, + { + "epoch": 0.2787418655097614, + "grad_norm": 0.18009889731924625, + "learning_rate": 9.78440330460698e-06, + "loss": 0.0065, + "step": 771 + }, + { + "epoch": 0.27910339840925524, + "grad_norm": 0.20207176411613287, + "learning_rate": 9.783534158535272e-06, + "loss": 0.0093, + "step": 772 + }, + { + "epoch": 0.2794649313087491, + "grad_norm": 0.1774867956983302, + "learning_rate": 9.782663302829467e-06, + "loss": 0.0083, + "step": 773 + }, + { + "epoch": 0.279826464208243, + "grad_norm": 0.24816675151707565, + "learning_rate": 9.781790737800808e-06, + "loss": 0.0184, + "step": 774 + }, + { + "epoch": 0.2801879971077368, + "grad_norm": 0.3854236677410395, + "learning_rate": 9.780916463761145e-06, + "loss": 0.0258, + "step": 775 + }, + { + "epoch": 0.28054953000723065, + "grad_norm": 0.8042472292938392, + "learning_rate": 9.78004048102295e-06, + "loss": 0.0476, + "step": 776 + }, + { + "epoch": 0.2809110629067245, + "grad_norm": 1.4262927119271636, + "learning_rate": 9.779162789899295e-06, + "loss": 0.0957, + "step": 777 + }, + { + "epoch": 0.2812725958062184, + "grad_norm": 0.48532440558185896, + "learning_rate": 9.778283390703867e-06, + "loss": 0.0393, + "step": 778 + }, + { + "epoch": 0.2816341287057122, + "grad_norm": 0.3565155620004285, + "learning_rate": 9.777402283750965e-06, + "loss": 0.0154, + "step": 779 + }, + { + "epoch": 0.28199566160520606, + "grad_norm": 0.17696801534826564, + "learning_rate": 9.776519469355492e-06, + "loss": 0.0146, + "step": 780 + }, + { + "epoch": 0.2823571945046999, + "grad_norm": 0.8313495425775386, + "learning_rate": 9.775634947832971e-06, + "loss": 0.1553, + "step": 781 + }, + { + "epoch": 0.2827187274041938, + "grad_norm": 0.16977427561027414, + "learning_rate": 9.774748719499528e-06, + "loss": 0.0117, + "step": 782 + }, + { + "epoch": 0.28308026030368766, + "grad_norm": 0.2551417300346592, + "learning_rate": 9.773860784671898e-06, + "loss": 0.0206, + "step": 783 + }, + { + "epoch": 0.28344179320318147, + "grad_norm": 0.9379267122270303, + "learning_rate": 9.772971143667433e-06, + "loss": 0.1553, + "step": 784 + }, + { + "epoch": 0.28380332610267534, + "grad_norm": 0.11260782514842821, + "learning_rate": 9.772079796804088e-06, + "loss": 0.0046, + "step": 785 + }, + { + "epoch": 0.2841648590021692, + "grad_norm": 0.2928072915608268, + "learning_rate": 9.77118674440043e-06, + "loss": 0.0317, + "step": 786 + }, + { + "epoch": 0.28452639190166307, + "grad_norm": 0.1910094754203036, + "learning_rate": 9.770291986775637e-06, + "loss": 0.0184, + "step": 787 + }, + { + "epoch": 0.2848879248011569, + "grad_norm": 0.2856526987579013, + "learning_rate": 9.769395524249496e-06, + "loss": 0.0148, + "step": 788 + }, + { + "epoch": 0.28524945770065074, + "grad_norm": 0.6123047702811285, + "learning_rate": 9.768497357142399e-06, + "loss": 0.0889, + "step": 789 + }, + { + "epoch": 0.2856109906001446, + "grad_norm": 0.28910426337367556, + "learning_rate": 9.767597485775355e-06, + "loss": 0.0286, + "step": 790 + }, + { + "epoch": 0.2859725234996385, + "grad_norm": 0.7289392615772391, + "learning_rate": 9.766695910469974e-06, + "loss": 0.1367, + "step": 791 + }, + { + "epoch": 0.28633405639913234, + "grad_norm": 0.25962766632253165, + "learning_rate": 9.76579263154848e-06, + "loss": 0.0146, + "step": 792 + }, + { + "epoch": 0.28669558929862615, + "grad_norm": 0.9959848199871459, + "learning_rate": 9.764887649333707e-06, + "loss": 0.0752, + "step": 793 + }, + { + "epoch": 0.28705712219812, + "grad_norm": 0.47379792813121707, + "learning_rate": 9.763980964149093e-06, + "loss": 0.0286, + "step": 794 + }, + { + "epoch": 0.2874186550976139, + "grad_norm": 0.3915347186845854, + "learning_rate": 9.763072576318688e-06, + "loss": 0.0286, + "step": 795 + }, + { + "epoch": 0.28778018799710775, + "grad_norm": 0.7129668333152407, + "learning_rate": 9.76216248616715e-06, + "loss": 0.1035, + "step": 796 + }, + { + "epoch": 0.2881417208966016, + "grad_norm": 0.3214886580979836, + "learning_rate": 9.761250694019743e-06, + "loss": 0.0317, + "step": 797 + }, + { + "epoch": 0.2885032537960954, + "grad_norm": 0.2469200947706717, + "learning_rate": 9.760337200202344e-06, + "loss": 0.0286, + "step": 798 + }, + { + "epoch": 0.2888647866955893, + "grad_norm": 0.6414088804933954, + "learning_rate": 9.759422005041432e-06, + "loss": 0.063, + "step": 799 + }, + { + "epoch": 0.28922631959508316, + "grad_norm": 1.8799011432710064, + "learning_rate": 9.758505108864103e-06, + "loss": 0.0393, + "step": 800 + }, + { + "epoch": 0.289587852494577, + "grad_norm": 0.25462973941398315, + "learning_rate": 9.75758651199805e-06, + "loss": 0.0166, + "step": 801 + }, + { + "epoch": 0.28994938539407084, + "grad_norm": 0.1457552906337936, + "learning_rate": 9.756666214771583e-06, + "loss": 0.0092, + "step": 802 + }, + { + "epoch": 0.2903109182935647, + "grad_norm": 0.2955053282131958, + "learning_rate": 9.755744217513615e-06, + "loss": 0.0258, + "step": 803 + }, + { + "epoch": 0.29067245119305857, + "grad_norm": 0.7754459550701978, + "learning_rate": 9.754820520553666e-06, + "loss": 0.0752, + "step": 804 + }, + { + "epoch": 0.29103398409255243, + "grad_norm": 0.4104340482828379, + "learning_rate": 9.753895124221865e-06, + "loss": 0.0354, + "step": 805 + }, + { + "epoch": 0.2913955169920463, + "grad_norm": 0.4102001430261855, + "learning_rate": 9.752968028848953e-06, + "loss": 0.0432, + "step": 806 + }, + { + "epoch": 0.2917570498915401, + "grad_norm": 0.43467283386944905, + "learning_rate": 9.752039234766272e-06, + "loss": 0.0476, + "step": 807 + }, + { + "epoch": 0.292118582791034, + "grad_norm": 0.31353517915951373, + "learning_rate": 9.751108742305766e-06, + "loss": 0.0317, + "step": 808 + }, + { + "epoch": 0.29248011569052784, + "grad_norm": 1.0324185292852397, + "learning_rate": 9.750176551800001e-06, + "loss": 0.1035, + "step": 809 + }, + { + "epoch": 0.2928416485900217, + "grad_norm": 0.3727776127611994, + "learning_rate": 9.74924266358214e-06, + "loss": 0.0391, + "step": 810 + }, + { + "epoch": 0.2932031814895155, + "grad_norm": 0.3956124044476249, + "learning_rate": 9.748307077985951e-06, + "loss": 0.0258, + "step": 811 + }, + { + "epoch": 0.2935647143890094, + "grad_norm": 0.0812234309413224, + "learning_rate": 9.747369795345815e-06, + "loss": 0.0026, + "step": 812 + }, + { + "epoch": 0.29392624728850325, + "grad_norm": 0.8300086393605252, + "learning_rate": 9.746430815996717e-06, + "loss": 0.0525, + "step": 813 + }, + { + "epoch": 0.2942877801879971, + "grad_norm": 0.06265059912304978, + "learning_rate": 9.745490140274248e-06, + "loss": 0.0008, + "step": 814 + }, + { + "epoch": 0.294649313087491, + "grad_norm": 1.374774906985255, + "learning_rate": 9.744547768514602e-06, + "loss": 0.1455, + "step": 815 + }, + { + "epoch": 0.2950108459869848, + "grad_norm": 0.6528401907677465, + "learning_rate": 9.743603701054585e-06, + "loss": 0.0688, + "step": 816 + }, + { + "epoch": 0.29537237888647866, + "grad_norm": 1.7903975032775947, + "learning_rate": 9.742657938231607e-06, + "loss": 0.1191, + "step": 817 + }, + { + "epoch": 0.2957339117859725, + "grad_norm": 0.8322475922016437, + "learning_rate": 9.741710480383683e-06, + "loss": 0.1191, + "step": 818 + }, + { + "epoch": 0.2960954446854664, + "grad_norm": 0.1999780090004919, + "learning_rate": 9.740761327849435e-06, + "loss": 0.0082, + "step": 819 + }, + { + "epoch": 0.29645697758496026, + "grad_norm": 0.3297763939796925, + "learning_rate": 9.739810480968088e-06, + "loss": 0.032, + "step": 820 + }, + { + "epoch": 0.29681851048445407, + "grad_norm": 0.23389789525646756, + "learning_rate": 9.738857940079474e-06, + "loss": 0.0082, + "step": 821 + }, + { + "epoch": 0.29718004338394793, + "grad_norm": 0.37363968786691637, + "learning_rate": 9.737903705524034e-06, + "loss": 0.0354, + "step": 822 + }, + { + "epoch": 0.2975415762834418, + "grad_norm": 0.8780985784532591, + "learning_rate": 9.736947777642809e-06, + "loss": 0.0688, + "step": 823 + }, + { + "epoch": 0.29790310918293567, + "grad_norm": 0.3531695397256166, + "learning_rate": 9.735990156777447e-06, + "loss": 0.0391, + "step": 824 + }, + { + "epoch": 0.2982646420824295, + "grad_norm": 0.13255197957733186, + "learning_rate": 9.735030843270203e-06, + "loss": 0.0051, + "step": 825 + }, + { + "epoch": 0.29862617498192334, + "grad_norm": 0.38498370615695393, + "learning_rate": 9.734069837463935e-06, + "loss": 0.0432, + "step": 826 + }, + { + "epoch": 0.2989877078814172, + "grad_norm": 0.8484219939367277, + "learning_rate": 9.733107139702107e-06, + "loss": 0.0354, + "step": 827 + }, + { + "epoch": 0.2993492407809111, + "grad_norm": 0.025260915712018012, + "learning_rate": 9.732142750328786e-06, + "loss": 0.0009, + "step": 828 + }, + { + "epoch": 0.29971077368040494, + "grad_norm": 0.028000837346606987, + "learning_rate": 9.731176669688645e-06, + "loss": 0.001, + "step": 829 + }, + { + "epoch": 0.30007230657989875, + "grad_norm": 0.1733513331957167, + "learning_rate": 9.73020889812696e-06, + "loss": 0.0065, + "step": 830 + }, + { + "epoch": 0.3004338394793926, + "grad_norm": 0.10819590852814548, + "learning_rate": 9.729239435989613e-06, + "loss": 0.004, + "step": 831 + }, + { + "epoch": 0.3007953723788865, + "grad_norm": 0.6334607964345339, + "learning_rate": 9.72826828362309e-06, + "loss": 0.0525, + "step": 832 + }, + { + "epoch": 0.30115690527838035, + "grad_norm": 0.029056583174867787, + "learning_rate": 9.72729544137448e-06, + "loss": 0.001, + "step": 833 + }, + { + "epoch": 0.30151843817787416, + "grad_norm": 0.049770725075439205, + "learning_rate": 9.726320909591475e-06, + "loss": 0.0015, + "step": 834 + }, + { + "epoch": 0.301879971077368, + "grad_norm": 1.033121853210772, + "learning_rate": 9.725344688622377e-06, + "loss": 0.1455, + "step": 835 + }, + { + "epoch": 0.3022415039768619, + "grad_norm": 0.8774626013739147, + "learning_rate": 9.724366778816083e-06, + "loss": 0.1035, + "step": 836 + }, + { + "epoch": 0.30260303687635576, + "grad_norm": 1.070469763880205, + "learning_rate": 9.723387180522101e-06, + "loss": 0.0525, + "step": 837 + }, + { + "epoch": 0.3029645697758496, + "grad_norm": 6.824841286974049, + "learning_rate": 9.722405894090536e-06, + "loss": 1.75, + "step": 838 + }, + { + "epoch": 0.30332610267534343, + "grad_norm": 0.3719692303058278, + "learning_rate": 9.721422919872102e-06, + "loss": 0.0186, + "step": 839 + }, + { + "epoch": 0.3036876355748373, + "grad_norm": 0.3617233751543645, + "learning_rate": 9.720438258218112e-06, + "loss": 0.0317, + "step": 840 + }, + { + "epoch": 0.30404916847433117, + "grad_norm": 2.717109913844845, + "learning_rate": 9.719451909480487e-06, + "loss": 0.3203, + "step": 841 + }, + { + "epoch": 0.30441070137382503, + "grad_norm": 0.026384654315307166, + "learning_rate": 9.718463874011742e-06, + "loss": 0.0008, + "step": 842 + }, + { + "epoch": 0.3047722342733189, + "grad_norm": 0.0071150611749866075, + "learning_rate": 9.717474152165007e-06, + "loss": 0.0002, + "step": 843 + }, + { + "epoch": 0.3051337671728127, + "grad_norm": 23.800309167466214, + "learning_rate": 9.716482744294004e-06, + "loss": 1.875, + "step": 844 + }, + { + "epoch": 0.3054953000723066, + "grad_norm": 0.5219843691446643, + "learning_rate": 9.715489650753064e-06, + "loss": 0.0525, + "step": 845 + }, + { + "epoch": 0.30585683297180044, + "grad_norm": 0.3122824575227391, + "learning_rate": 9.714494871897118e-06, + "loss": 0.0317, + "step": 846 + }, + { + "epoch": 0.3062183658712943, + "grad_norm": 0.895235416816564, + "learning_rate": 9.7134984080817e-06, + "loss": 0.032, + "step": 847 + }, + { + "epoch": 0.3065798987707881, + "grad_norm": 0.09639758964163164, + "learning_rate": 9.712500259662945e-06, + "loss": 0.0045, + "step": 848 + }, + { + "epoch": 0.306941431670282, + "grad_norm": 0.2853242037322062, + "learning_rate": 9.711500426997593e-06, + "loss": 0.0229, + "step": 849 + }, + { + "epoch": 0.30730296456977585, + "grad_norm": 0.7270745089756556, + "learning_rate": 9.71049891044298e-06, + "loss": 0.063, + "step": 850 + }, + { + "epoch": 0.3076644974692697, + "grad_norm": 0.08972676916699429, + "learning_rate": 9.709495710357053e-06, + "loss": 0.0025, + "step": 851 + }, + { + "epoch": 0.3080260303687636, + "grad_norm": 0.4619746922935456, + "learning_rate": 9.708490827098352e-06, + "loss": 0.0432, + "step": 852 + }, + { + "epoch": 0.3083875632682574, + "grad_norm": 0.12996184543235464, + "learning_rate": 9.707484261026023e-06, + "loss": 0.0022, + "step": 853 + }, + { + "epoch": 0.30874909616775126, + "grad_norm": 0.3246317391430631, + "learning_rate": 9.706476012499815e-06, + "loss": 0.0317, + "step": 854 + }, + { + "epoch": 0.3091106290672451, + "grad_norm": 0.40476614590676524, + "learning_rate": 9.70546608188007e-06, + "loss": 0.0206, + "step": 855 + }, + { + "epoch": 0.309472161966739, + "grad_norm": 0.046687315738813816, + "learning_rate": 9.704454469527741e-06, + "loss": 0.0014, + "step": 856 + }, + { + "epoch": 0.3098336948662328, + "grad_norm": 1.3071964358871053, + "learning_rate": 9.70344117580438e-06, + "loss": 0.1641, + "step": 857 + }, + { + "epoch": 0.31019522776572667, + "grad_norm": 0.6391652678036779, + "learning_rate": 9.702426201072133e-06, + "loss": 0.063, + "step": 858 + }, + { + "epoch": 0.31055676066522053, + "grad_norm": 2.6564160201511617, + "learning_rate": 9.701409545693754e-06, + "loss": 0.1553, + "step": 859 + }, + { + "epoch": 0.3109182935647144, + "grad_norm": 0.2675270108060841, + "learning_rate": 9.700391210032597e-06, + "loss": 0.0117, + "step": 860 + }, + { + "epoch": 0.31127982646420826, + "grad_norm": 0.04194116206332464, + "learning_rate": 9.699371194452613e-06, + "loss": 0.0014, + "step": 861 + }, + { + "epoch": 0.3116413593637021, + "grad_norm": 0.37411588238140986, + "learning_rate": 9.698349499318356e-06, + "loss": 0.0317, + "step": 862 + }, + { + "epoch": 0.31200289226319594, + "grad_norm": 0.11484456601092813, + "learning_rate": 9.697326124994979e-06, + "loss": 0.0032, + "step": 863 + }, + { + "epoch": 0.3123644251626898, + "grad_norm": 0.3750897511392771, + "learning_rate": 9.696301071848235e-06, + "loss": 0.0354, + "step": 864 + }, + { + "epoch": 0.3127259580621837, + "grad_norm": 0.9521742288049069, + "learning_rate": 9.69527434024448e-06, + "loss": 0.1367, + "step": 865 + }, + { + "epoch": 0.31308749096167754, + "grad_norm": 0.7252907398816235, + "learning_rate": 9.694245930550668e-06, + "loss": 0.0815, + "step": 866 + }, + { + "epoch": 0.31344902386117135, + "grad_norm": 0.03682043530324307, + "learning_rate": 9.693215843134351e-06, + "loss": 0.0007, + "step": 867 + }, + { + "epoch": 0.3138105567606652, + "grad_norm": 0.6541295740510106, + "learning_rate": 9.69218407836368e-06, + "loss": 0.0354, + "step": 868 + }, + { + "epoch": 0.3141720896601591, + "grad_norm": 0.009210540369230975, + "learning_rate": 9.691150636607411e-06, + "loss": 0.0003, + "step": 869 + }, + { + "epoch": 0.31453362255965295, + "grad_norm": 0.04534975148891092, + "learning_rate": 9.690115518234894e-06, + "loss": 0.0018, + "step": 870 + }, + { + "epoch": 0.31489515545914676, + "grad_norm": 0.011503084008568504, + "learning_rate": 9.689078723616081e-06, + "loss": 0.0004, + "step": 871 + }, + { + "epoch": 0.3152566883586406, + "grad_norm": 2.1341371784271796, + "learning_rate": 9.688040253121523e-06, + "loss": 0.1553, + "step": 872 + }, + { + "epoch": 0.3156182212581345, + "grad_norm": 0.5991047460189359, + "learning_rate": 9.687000107122367e-06, + "loss": 0.0576, + "step": 873 + }, + { + "epoch": 0.31597975415762836, + "grad_norm": 0.014578400288586497, + "learning_rate": 9.68595828599036e-06, + "loss": 0.0003, + "step": 874 + }, + { + "epoch": 0.3163412870571222, + "grad_norm": 0.12903040713635755, + "learning_rate": 9.684914790097852e-06, + "loss": 0.0046, + "step": 875 + }, + { + "epoch": 0.31670281995661603, + "grad_norm": 0.3153443572313845, + "learning_rate": 9.683869619817788e-06, + "loss": 0.0317, + "step": 876 + }, + { + "epoch": 0.3170643528561099, + "grad_norm": 1.0942353366999766, + "learning_rate": 9.682822775523709e-06, + "loss": 0.1641, + "step": 877 + }, + { + "epoch": 0.31742588575560376, + "grad_norm": 0.1498323938714585, + "learning_rate": 9.681774257589758e-06, + "loss": 0.0045, + "step": 878 + }, + { + "epoch": 0.31778741865509763, + "grad_norm": 3.076398795065705, + "learning_rate": 9.680724066390675e-06, + "loss": 0.2461, + "step": 879 + }, + { + "epoch": 0.31814895155459144, + "grad_norm": 0.052030978942200844, + "learning_rate": 9.6796722023018e-06, + "loss": 0.0014, + "step": 880 + }, + { + "epoch": 0.3185104844540853, + "grad_norm": 0.29311629713984577, + "learning_rate": 9.678618665699067e-06, + "loss": 0.0258, + "step": 881 + }, + { + "epoch": 0.3188720173535792, + "grad_norm": 0.30084421048328286, + "learning_rate": 9.677563456959009e-06, + "loss": 0.0229, + "step": 882 + }, + { + "epoch": 0.31923355025307304, + "grad_norm": 0.04394189543426199, + "learning_rate": 9.67650657645876e-06, + "loss": 0.0007, + "step": 883 + }, + { + "epoch": 0.3195950831525669, + "grad_norm": 3.3659716900953787, + "learning_rate": 9.675448024576048e-06, + "loss": 0.1836, + "step": 884 + }, + { + "epoch": 0.3199566160520607, + "grad_norm": 1.2674396471595428, + "learning_rate": 9.674387801689198e-06, + "loss": 0.1191, + "step": 885 + }, + { + "epoch": 0.3203181489515546, + "grad_norm": 0.2496971251586867, + "learning_rate": 9.673325908177133e-06, + "loss": 0.0165, + "step": 886 + }, + { + "epoch": 0.32067968185104845, + "grad_norm": 0.031869042708268965, + "learning_rate": 9.672262344419377e-06, + "loss": 0.001, + "step": 887 + }, + { + "epoch": 0.3210412147505423, + "grad_norm": 0.17505498037117287, + "learning_rate": 9.671197110796043e-06, + "loss": 0.0051, + "step": 888 + }, + { + "epoch": 0.3214027476500362, + "grad_norm": 0.12116134434998392, + "learning_rate": 9.670130207687848e-06, + "loss": 0.0046, + "step": 889 + }, + { + "epoch": 0.32176428054953, + "grad_norm": 0.8923398273290141, + "learning_rate": 9.669061635476103e-06, + "loss": 0.0393, + "step": 890 + }, + { + "epoch": 0.32212581344902386, + "grad_norm": 0.3296754276739184, + "learning_rate": 9.667991394542712e-06, + "loss": 0.0148, + "step": 891 + }, + { + "epoch": 0.3224873463485177, + "grad_norm": 0.20956794041045662, + "learning_rate": 9.666919485270186e-06, + "loss": 0.0082, + "step": 892 + }, + { + "epoch": 0.3228488792480116, + "grad_norm": 1.2922487982334088, + "learning_rate": 9.66584590804162e-06, + "loss": 0.1191, + "step": 893 + }, + { + "epoch": 0.3232104121475054, + "grad_norm": 0.04099801440855057, + "learning_rate": 9.664770663240708e-06, + "loss": 0.0011, + "step": 894 + }, + { + "epoch": 0.32357194504699927, + "grad_norm": 0.01842045951695523, + "learning_rate": 9.663693751251749e-06, + "loss": 0.0006, + "step": 895 + }, + { + "epoch": 0.32393347794649313, + "grad_norm": 0.5886803762386537, + "learning_rate": 9.662615172459626e-06, + "loss": 0.0231, + "step": 896 + }, + { + "epoch": 0.324295010845987, + "grad_norm": 0.2589524403227859, + "learning_rate": 9.661534927249824e-06, + "loss": 0.0286, + "step": 897 + }, + { + "epoch": 0.32465654374548086, + "grad_norm": 0.4052526644280482, + "learning_rate": 9.660453016008423e-06, + "loss": 0.0391, + "step": 898 + }, + { + "epoch": 0.3250180766449747, + "grad_norm": 1.4187693216869026, + "learning_rate": 9.659369439122096e-06, + "loss": 0.1367, + "step": 899 + }, + { + "epoch": 0.32537960954446854, + "grad_norm": 0.7007078187012522, + "learning_rate": 9.658284196978118e-06, + "loss": 0.0815, + "step": 900 + }, + { + "epoch": 0.3257411424439624, + "grad_norm": 0.3546513355531661, + "learning_rate": 9.657197289964352e-06, + "loss": 0.0317, + "step": 901 + }, + { + "epoch": 0.32610267534345627, + "grad_norm": 0.661910444664002, + "learning_rate": 9.656108718469252e-06, + "loss": 0.0957, + "step": 902 + }, + { + "epoch": 0.3264642082429501, + "grad_norm": 0.3060321172984597, + "learning_rate": 9.655018482881883e-06, + "loss": 0.0354, + "step": 903 + }, + { + "epoch": 0.32682574114244395, + "grad_norm": 1.235651343321759, + "learning_rate": 9.65392658359189e-06, + "loss": 0.0432, + "step": 904 + }, + { + "epoch": 0.3271872740419378, + "grad_norm": 0.8153079114653755, + "learning_rate": 9.652833020989516e-06, + "loss": 0.1367, + "step": 905 + }, + { + "epoch": 0.3275488069414317, + "grad_norm": 0.6692410745006053, + "learning_rate": 9.651737795465604e-06, + "loss": 0.1113, + "step": 906 + }, + { + "epoch": 0.32791033984092555, + "grad_norm": 1.0387057258389196, + "learning_rate": 9.650640907411587e-06, + "loss": 0.0688, + "step": 907 + }, + { + "epoch": 0.32827187274041936, + "grad_norm": 0.521508605118284, + "learning_rate": 9.649542357219487e-06, + "loss": 0.0815, + "step": 908 + }, + { + "epoch": 0.3286334056399132, + "grad_norm": 2.5673773164390767, + "learning_rate": 9.648442145281933e-06, + "loss": 0.2246, + "step": 909 + }, + { + "epoch": 0.3289949385394071, + "grad_norm": 0.15645103231777438, + "learning_rate": 9.647340271992136e-06, + "loss": 0.0073, + "step": 910 + }, + { + "epoch": 0.32935647143890096, + "grad_norm": 0.007419876694257726, + "learning_rate": 9.646236737743907e-06, + "loss": 0.0003, + "step": 911 + }, + { + "epoch": 0.3297180043383948, + "grad_norm": 0.6260666454483683, + "learning_rate": 9.64513154293165e-06, + "loss": 0.0432, + "step": 912 + }, + { + "epoch": 0.33007953723788863, + "grad_norm": 0.530495033684286, + "learning_rate": 9.644024687950358e-06, + "loss": 0.0815, + "step": 913 + }, + { + "epoch": 0.3304410701373825, + "grad_norm": 0.4506838594840356, + "learning_rate": 9.642916173195623e-06, + "loss": 0.063, + "step": 914 + }, + { + "epoch": 0.33080260303687636, + "grad_norm": 0.6571868544318928, + "learning_rate": 9.641805999063627e-06, + "loss": 0.0576, + "step": 915 + }, + { + "epoch": 0.33116413593637023, + "grad_norm": 0.5835245951760996, + "learning_rate": 9.640694165951148e-06, + "loss": 0.0688, + "step": 916 + }, + { + "epoch": 0.33152566883586404, + "grad_norm": 0.08469799730672202, + "learning_rate": 9.639580674255553e-06, + "loss": 0.0036, + "step": 917 + }, + { + "epoch": 0.3318872017353579, + "grad_norm": 0.4485435802575873, + "learning_rate": 9.638465524374803e-06, + "loss": 0.063, + "step": 918 + }, + { + "epoch": 0.3322487346348518, + "grad_norm": 0.3481362121053799, + "learning_rate": 9.637348716707455e-06, + "loss": 0.0286, + "step": 919 + }, + { + "epoch": 0.33261026753434564, + "grad_norm": 0.009500048509825781, + "learning_rate": 9.636230251652653e-06, + "loss": 0.0003, + "step": 920 + }, + { + "epoch": 0.3329718004338395, + "grad_norm": 1.171207533491409, + "learning_rate": 9.635110129610138e-06, + "loss": 0.0957, + "step": 921 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.39808662498422664, + "learning_rate": 9.633988350980241e-06, + "loss": 0.0231, + "step": 922 + }, + { + "epoch": 0.3336948662328272, + "grad_norm": 0.45076269378834716, + "learning_rate": 9.632864916163886e-06, + "loss": 0.0576, + "step": 923 + }, + { + "epoch": 0.33405639913232105, + "grad_norm": 0.5182543002599737, + "learning_rate": 9.631739825562586e-06, + "loss": 0.0815, + "step": 924 + }, + { + "epoch": 0.3344179320318149, + "grad_norm": 0.5336378260814236, + "learning_rate": 9.63061307957845e-06, + "loss": 0.0815, + "step": 925 + }, + { + "epoch": 0.3347794649313087, + "grad_norm": 0.39643505908337534, + "learning_rate": 9.629484678614179e-06, + "loss": 0.063, + "step": 926 + }, + { + "epoch": 0.3351409978308026, + "grad_norm": 0.10417243859226152, + "learning_rate": 9.628354623073059e-06, + "loss": 0.0029, + "step": 927 + }, + { + "epoch": 0.33550253073029646, + "grad_norm": 0.4101152174664137, + "learning_rate": 9.627222913358973e-06, + "loss": 0.0576, + "step": 928 + }, + { + "epoch": 0.3358640636297903, + "grad_norm": 0.24940880843884317, + "learning_rate": 9.626089549876395e-06, + "loss": 0.0093, + "step": 929 + }, + { + "epoch": 0.3362255965292842, + "grad_norm": 0.3777481699546548, + "learning_rate": 9.624954533030388e-06, + "loss": 0.0317, + "step": 930 + }, + { + "epoch": 0.336587129428778, + "grad_norm": 0.5656460169874631, + "learning_rate": 9.623817863226607e-06, + "loss": 0.0815, + "step": 931 + }, + { + "epoch": 0.33694866232827186, + "grad_norm": 0.7489580076624428, + "learning_rate": 9.622679540871299e-06, + "loss": 0.0752, + "step": 932 + }, + { + "epoch": 0.33731019522776573, + "grad_norm": 0.19945176678487214, + "learning_rate": 9.621539566371297e-06, + "loss": 0.0074, + "step": 933 + }, + { + "epoch": 0.3376717281272596, + "grad_norm": 2.204032658209504, + "learning_rate": 9.620397940134029e-06, + "loss": 0.2246, + "step": 934 + }, + { + "epoch": 0.33803326102675346, + "grad_norm": 1.9045666432143888, + "learning_rate": 9.619254662567512e-06, + "loss": 0.1191, + "step": 935 + }, + { + "epoch": 0.3383947939262473, + "grad_norm": 0.16688027080925316, + "learning_rate": 9.618109734080355e-06, + "loss": 0.0065, + "step": 936 + }, + { + "epoch": 0.33875632682574114, + "grad_norm": 0.497450412315066, + "learning_rate": 9.616963155081753e-06, + "loss": 0.0432, + "step": 937 + }, + { + "epoch": 0.339117859725235, + "grad_norm": 0.3660537129891257, + "learning_rate": 9.615814925981492e-06, + "loss": 0.0525, + "step": 938 + }, + { + "epoch": 0.33947939262472887, + "grad_norm": 0.025407165570136534, + "learning_rate": 9.614665047189953e-06, + "loss": 0.0009, + "step": 939 + }, + { + "epoch": 0.3398409255242227, + "grad_norm": 2.4783934986009233, + "learning_rate": 9.6135135191181e-06, + "loss": 0.1934, + "step": 940 + }, + { + "epoch": 0.34020245842371655, + "grad_norm": 0.41982327400102226, + "learning_rate": 9.612360342177487e-06, + "loss": 0.0576, + "step": 941 + }, + { + "epoch": 0.3405639913232104, + "grad_norm": 0.13349682759986367, + "learning_rate": 9.611205516780262e-06, + "loss": 0.0046, + "step": 942 + }, + { + "epoch": 0.3409255242227043, + "grad_norm": 0.023000704913063906, + "learning_rate": 9.61004904333916e-06, + "loss": 0.0007, + "step": 943 + }, + { + "epoch": 0.34128705712219815, + "grad_norm": 0.691138504988036, + "learning_rate": 9.608890922267501e-06, + "loss": 0.0957, + "step": 944 + }, + { + "epoch": 0.34164859002169196, + "grad_norm": 0.06579330256090535, + "learning_rate": 9.607731153979198e-06, + "loss": 0.0025, + "step": 945 + }, + { + "epoch": 0.3420101229211858, + "grad_norm": 0.32735117209886994, + "learning_rate": 9.606569738888755e-06, + "loss": 0.0041, + "step": 946 + }, + { + "epoch": 0.3423716558206797, + "grad_norm": 0.45288618467553476, + "learning_rate": 9.60540667741126e-06, + "loss": 0.0187, + "step": 947 + }, + { + "epoch": 0.34273318872017355, + "grad_norm": 0.15320168785968805, + "learning_rate": 9.604241969962389e-06, + "loss": 0.0074, + "step": 948 + }, + { + "epoch": 0.34309472161966736, + "grad_norm": 0.2143076278513828, + "learning_rate": 9.60307561695841e-06, + "loss": 0.0165, + "step": 949 + }, + { + "epoch": 0.34345625451916123, + "grad_norm": 0.33040621328135894, + "learning_rate": 9.601907618816177e-06, + "loss": 0.0476, + "step": 950 + }, + { + "epoch": 0.3438177874186551, + "grad_norm": 0.27788287702034903, + "learning_rate": 9.600737975953131e-06, + "loss": 0.0391, + "step": 951 + }, + { + "epoch": 0.34417932031814896, + "grad_norm": 0.10388802229546952, + "learning_rate": 9.599566688787305e-06, + "loss": 0.004, + "step": 952 + }, + { + "epoch": 0.34454085321764283, + "grad_norm": 0.4673791177424619, + "learning_rate": 9.598393757737315e-06, + "loss": 0.0393, + "step": 953 + }, + { + "epoch": 0.34490238611713664, + "grad_norm": 0.07566728619032727, + "learning_rate": 9.597219183222366e-06, + "loss": 0.0025, + "step": 954 + }, + { + "epoch": 0.3452639190166305, + "grad_norm": 0.2599010556076738, + "learning_rate": 9.596042965662252e-06, + "loss": 0.0286, + "step": 955 + }, + { + "epoch": 0.34562545191612437, + "grad_norm": 0.2851221000536249, + "learning_rate": 9.594865105477352e-06, + "loss": 0.0148, + "step": 956 + }, + { + "epoch": 0.34598698481561824, + "grad_norm": 8.094498724178786, + "learning_rate": 9.59368560308863e-06, + "loss": 0.2559, + "step": 957 + }, + { + "epoch": 0.34634851771511205, + "grad_norm": 1.4973986769980567, + "learning_rate": 9.592504458917646e-06, + "loss": 0.1367, + "step": 958 + }, + { + "epoch": 0.3467100506146059, + "grad_norm": 0.4324518090008808, + "learning_rate": 9.591321673386536e-06, + "loss": 0.0432, + "step": 959 + }, + { + "epoch": 0.3470715835140998, + "grad_norm": 0.25284077395341914, + "learning_rate": 9.59013724691803e-06, + "loss": 0.0093, + "step": 960 + }, + { + "epoch": 0.34743311641359365, + "grad_norm": 3.7848315742405916, + "learning_rate": 9.58895117993544e-06, + "loss": 0.1191, + "step": 961 + }, + { + "epoch": 0.3477946493130875, + "grad_norm": 0.26520077742212866, + "learning_rate": 9.587763472862667e-06, + "loss": 0.0317, + "step": 962 + }, + { + "epoch": 0.3481561822125813, + "grad_norm": 0.22455240325718212, + "learning_rate": 9.586574126124198e-06, + "loss": 0.0317, + "step": 963 + }, + { + "epoch": 0.3485177151120752, + "grad_norm": 0.4234691067111972, + "learning_rate": 9.585383140145101e-06, + "loss": 0.0208, + "step": 964 + }, + { + "epoch": 0.34887924801156905, + "grad_norm": 0.23013590902311365, + "learning_rate": 9.58419051535104e-06, + "loss": 0.0184, + "step": 965 + }, + { + "epoch": 0.3492407809110629, + "grad_norm": 0.014206995874406577, + "learning_rate": 9.582996252168256e-06, + "loss": 0.0005, + "step": 966 + }, + { + "epoch": 0.3496023138105568, + "grad_norm": 0.42213940334085476, + "learning_rate": 9.58180035102358e-06, + "loss": 0.0393, + "step": 967 + }, + { + "epoch": 0.3499638467100506, + "grad_norm": 2.8609201003651745, + "learning_rate": 9.580602812344422e-06, + "loss": 0.1836, + "step": 968 + }, + { + "epoch": 0.35032537960954446, + "grad_norm": 2.091476095098092, + "learning_rate": 9.579403636558789e-06, + "loss": 0.1191, + "step": 969 + }, + { + "epoch": 0.35068691250903833, + "grad_norm": 1.3262474093283267, + "learning_rate": 9.57820282409526e-06, + "loss": 0.1279, + "step": 970 + }, + { + "epoch": 0.3510484454085322, + "grad_norm": 0.25564719160918176, + "learning_rate": 9.57700037538301e-06, + "loss": 0.0286, + "step": 971 + }, + { + "epoch": 0.351409978308026, + "grad_norm": 0.13699306887962812, + "learning_rate": 9.575796290851791e-06, + "loss": 0.0041, + "step": 972 + }, + { + "epoch": 0.35177151120751987, + "grad_norm": 1.3562666572464417, + "learning_rate": 9.574590570931944e-06, + "loss": 0.1836, + "step": 973 + }, + { + "epoch": 0.35213304410701374, + "grad_norm": 0.5368990053861918, + "learning_rate": 9.57338321605439e-06, + "loss": 0.0525, + "step": 974 + }, + { + "epoch": 0.3524945770065076, + "grad_norm": 0.20903444202058202, + "learning_rate": 9.572174226650641e-06, + "loss": 0.0255, + "step": 975 + }, + { + "epoch": 0.35285610990600147, + "grad_norm": 0.4726799053034383, + "learning_rate": 9.57096360315279e-06, + "loss": 0.0317, + "step": 976 + }, + { + "epoch": 0.3532176428054953, + "grad_norm": 1.6725660093346557, + "learning_rate": 9.56975134599351e-06, + "loss": 0.1279, + "step": 977 + }, + { + "epoch": 0.35357917570498915, + "grad_norm": 0.26981089256778945, + "learning_rate": 9.568537455606064e-06, + "loss": 0.0317, + "step": 978 + }, + { + "epoch": 0.353940708604483, + "grad_norm": 1.6718154536590568, + "learning_rate": 9.567321932424297e-06, + "loss": 0.0208, + "step": 979 + }, + { + "epoch": 0.3543022415039769, + "grad_norm": 0.5963580119484778, + "learning_rate": 9.566104776882631e-06, + "loss": 0.0317, + "step": 980 + }, + { + "epoch": 0.3546637744034707, + "grad_norm": 0.30886542027262465, + "learning_rate": 9.564885989416085e-06, + "loss": 0.0317, + "step": 981 + }, + { + "epoch": 0.35502530730296455, + "grad_norm": 0.6228506695888125, + "learning_rate": 9.56366557046025e-06, + "loss": 0.1279, + "step": 982 + }, + { + "epoch": 0.3553868402024584, + "grad_norm": 1.0555212736497954, + "learning_rate": 9.562443520451301e-06, + "loss": 0.1367, + "step": 983 + }, + { + "epoch": 0.3557483731019523, + "grad_norm": 0.08106397326247941, + "learning_rate": 9.561219839826e-06, + "loss": 0.0025, + "step": 984 + }, + { + "epoch": 0.35610990600144615, + "grad_norm": 1.0292669247669515, + "learning_rate": 9.559994529021695e-06, + "loss": 0.1035, + "step": 985 + }, + { + "epoch": 0.35647143890093996, + "grad_norm": 0.09430737342735282, + "learning_rate": 9.558767588476305e-06, + "loss": 0.0028, + "step": 986 + }, + { + "epoch": 0.35683297180043383, + "grad_norm": 0.3547848932350056, + "learning_rate": 9.55753901862834e-06, + "loss": 0.0258, + "step": 987 + }, + { + "epoch": 0.3571945046999277, + "grad_norm": 0.22568120465738573, + "learning_rate": 9.556308819916892e-06, + "loss": 0.0231, + "step": 988 + }, + { + "epoch": 0.35755603759942156, + "grad_norm": 0.9001762836255857, + "learning_rate": 9.555076992781636e-06, + "loss": 0.0479, + "step": 989 + }, + { + "epoch": 0.3579175704989154, + "grad_norm": 0.24614583405243512, + "learning_rate": 9.55384353766282e-06, + "loss": 0.0117, + "step": 990 + }, + { + "epoch": 0.35827910339840924, + "grad_norm": 0.2543384031452251, + "learning_rate": 9.552608455001287e-06, + "loss": 0.0286, + "step": 991 + }, + { + "epoch": 0.3586406362979031, + "grad_norm": 0.18816058361600463, + "learning_rate": 9.551371745238454e-06, + "loss": 0.0165, + "step": 992 + }, + { + "epoch": 0.35900216919739697, + "grad_norm": 0.9499633257469098, + "learning_rate": 9.550133408816317e-06, + "loss": 0.1279, + "step": 993 + }, + { + "epoch": 0.35936370209689084, + "grad_norm": 0.5510125205362694, + "learning_rate": 9.548893446177463e-06, + "loss": 0.1035, + "step": 994 + }, + { + "epoch": 0.35972523499638465, + "grad_norm": 0.8976524444921611, + "learning_rate": 9.547651857765049e-06, + "loss": 0.063, + "step": 995 + }, + { + "epoch": 0.3600867678958785, + "grad_norm": 0.4911438066672238, + "learning_rate": 9.546408644022822e-06, + "loss": 0.0231, + "step": 996 + }, + { + "epoch": 0.3604483007953724, + "grad_norm": 0.587042804024525, + "learning_rate": 9.545163805395103e-06, + "loss": 0.0317, + "step": 997 + }, + { + "epoch": 0.36080983369486624, + "grad_norm": 0.2621947502977773, + "learning_rate": 9.543917342326802e-06, + "loss": 0.0208, + "step": 998 + }, + { + "epoch": 0.3611713665943601, + "grad_norm": 0.22816287298246962, + "learning_rate": 9.5426692552634e-06, + "loss": 0.0258, + "step": 999 + }, + { + "epoch": 0.3615328994938539, + "grad_norm": 0.2792358550169854, + "learning_rate": 9.541419544650966e-06, + "loss": 0.0286, + "step": 1000 + }, + { + "epoch": 0.3618944323933478, + "grad_norm": 1.4796957274733773, + "learning_rate": 9.540168210936145e-06, + "loss": 0.1367, + "step": 1001 + }, + { + "epoch": 0.36225596529284165, + "grad_norm": 0.40943054199712675, + "learning_rate": 9.538915254566163e-06, + "loss": 0.0432, + "step": 1002 + }, + { + "epoch": 0.3626174981923355, + "grad_norm": 0.1909548222557548, + "learning_rate": 9.537660675988827e-06, + "loss": 0.0229, + "step": 1003 + }, + { + "epoch": 0.36297903109182933, + "grad_norm": 0.29364871824065186, + "learning_rate": 9.536404475652524e-06, + "loss": 0.0317, + "step": 1004 + }, + { + "epoch": 0.3633405639913232, + "grad_norm": 0.269183335789309, + "learning_rate": 9.535146654006216e-06, + "loss": 0.0258, + "step": 1005 + }, + { + "epoch": 0.36370209689081706, + "grad_norm": 0.011915774509862392, + "learning_rate": 9.533887211499453e-06, + "loss": 0.0004, + "step": 1006 + }, + { + "epoch": 0.3640636297903109, + "grad_norm": 0.2008319655327075, + "learning_rate": 9.532626148582358e-06, + "loss": 0.0206, + "step": 1007 + }, + { + "epoch": 0.3644251626898048, + "grad_norm": 0.23180497616116713, + "learning_rate": 9.531363465705633e-06, + "loss": 0.0231, + "step": 1008 + }, + { + "epoch": 0.3647866955892986, + "grad_norm": 0.02037915451008082, + "learning_rate": 9.530099163320562e-06, + "loss": 0.0007, + "step": 1009 + }, + { + "epoch": 0.36514822848879247, + "grad_norm": 1.456072324555624, + "learning_rate": 9.528833241879006e-06, + "loss": 0.063, + "step": 1010 + }, + { + "epoch": 0.36550976138828634, + "grad_norm": 1.0934714913210877, + "learning_rate": 9.527565701833405e-06, + "loss": 0.0957, + "step": 1011 + }, + { + "epoch": 0.3658712942877802, + "grad_norm": 0.16772454761279262, + "learning_rate": 9.526296543636777e-06, + "loss": 0.0165, + "step": 1012 + }, + { + "epoch": 0.36623282718727407, + "grad_norm": 1.201148701907276, + "learning_rate": 9.525025767742721e-06, + "loss": 0.1035, + "step": 1013 + }, + { + "epoch": 0.3665943600867679, + "grad_norm": 0.2802267742329838, + "learning_rate": 9.52375337460541e-06, + "loss": 0.0148, + "step": 1014 + }, + { + "epoch": 0.36695589298626174, + "grad_norm": 1.0495372441885165, + "learning_rate": 9.522479364679598e-06, + "loss": 0.1191, + "step": 1015 + }, + { + "epoch": 0.3673174258857556, + "grad_norm": 0.19031756752450726, + "learning_rate": 9.521203738420614e-06, + "loss": 0.0166, + "step": 1016 + }, + { + "epoch": 0.3676789587852495, + "grad_norm": 0.06798852167745646, + "learning_rate": 9.519926496284369e-06, + "loss": 0.0025, + "step": 1017 + }, + { + "epoch": 0.3680404916847433, + "grad_norm": 0.031236439631694396, + "learning_rate": 9.518647638727348e-06, + "loss": 0.0011, + "step": 1018 + }, + { + "epoch": 0.36840202458423715, + "grad_norm": 0.28536522294044103, + "learning_rate": 9.517367166206615e-06, + "loss": 0.0286, + "step": 1019 + }, + { + "epoch": 0.368763557483731, + "grad_norm": 0.23286679964013385, + "learning_rate": 9.516085079179809e-06, + "loss": 0.0231, + "step": 1020 + }, + { + "epoch": 0.3691250903832249, + "grad_norm": 0.0654259017729062, + "learning_rate": 9.51480137810515e-06, + "loss": 0.0025, + "step": 1021 + }, + { + "epoch": 0.36948662328271875, + "grad_norm": 0.04467170833675855, + "learning_rate": 9.513516063441431e-06, + "loss": 0.0011, + "step": 1022 + }, + { + "epoch": 0.36984815618221256, + "grad_norm": 0.003756096546505878, + "learning_rate": 9.512229135648023e-06, + "loss": 0.0002, + "step": 1023 + }, + { + "epoch": 0.37020968908170643, + "grad_norm": 0.22694193373452276, + "learning_rate": 9.510940595184875e-06, + "loss": 0.0258, + "step": 1024 + }, + { + "epoch": 0.3705712219812003, + "grad_norm": 2.639061414298632, + "learning_rate": 9.50965044251251e-06, + "loss": 0.1367, + "step": 1025 + }, + { + "epoch": 0.37093275488069416, + "grad_norm": 0.2851961559545363, + "learning_rate": 9.508358678092028e-06, + "loss": 0.0118, + "step": 1026 + }, + { + "epoch": 0.37129428778018797, + "grad_norm": 0.052033486961419344, + "learning_rate": 9.507065302385107e-06, + "loss": 0.0016, + "step": 1027 + }, + { + "epoch": 0.37165582067968184, + "grad_norm": 0.42364731484894563, + "learning_rate": 9.505770315853998e-06, + "loss": 0.0288, + "step": 1028 + }, + { + "epoch": 0.3720173535791757, + "grad_norm": 0.01577921937529022, + "learning_rate": 9.504473718961526e-06, + "loss": 0.0003, + "step": 1029 + }, + { + "epoch": 0.37237888647866957, + "grad_norm": 0.008317310179433214, + "learning_rate": 9.503175512171102e-06, + "loss": 0.0003, + "step": 1030 + }, + { + "epoch": 0.37274041937816343, + "grad_norm": 0.2999648180563223, + "learning_rate": 9.501875695946697e-06, + "loss": 0.0286, + "step": 1031 + }, + { + "epoch": 0.37310195227765725, + "grad_norm": 0.14730976759072623, + "learning_rate": 9.50057427075287e-06, + "loss": 0.0131, + "step": 1032 + }, + { + "epoch": 0.3734634851771511, + "grad_norm": 1.2094485118284592, + "learning_rate": 9.499271237054748e-06, + "loss": 0.0815, + "step": 1033 + }, + { + "epoch": 0.373825018076645, + "grad_norm": 0.06260980679507942, + "learning_rate": 9.497966595318035e-06, + "loss": 0.002, + "step": 1034 + }, + { + "epoch": 0.37418655097613884, + "grad_norm": 0.004796284398753123, + "learning_rate": 9.49666034600901e-06, + "loss": 0.0002, + "step": 1035 + }, + { + "epoch": 0.3745480838756327, + "grad_norm": 0.3065890033977543, + "learning_rate": 9.495352489594528e-06, + "loss": 0.0184, + "step": 1036 + }, + { + "epoch": 0.3749096167751265, + "grad_norm": 0.015356586167729594, + "learning_rate": 9.494043026542013e-06, + "loss": 0.0003, + "step": 1037 + }, + { + "epoch": 0.3752711496746204, + "grad_norm": 0.45624209263035786, + "learning_rate": 9.492731957319467e-06, + "loss": 0.0258, + "step": 1038 + }, + { + "epoch": 0.37563268257411425, + "grad_norm": 0.17732846513422368, + "learning_rate": 9.491419282395471e-06, + "loss": 0.0146, + "step": 1039 + }, + { + "epoch": 0.3759942154736081, + "grad_norm": 1.6757655971834067, + "learning_rate": 9.49010500223917e-06, + "loss": 0.1934, + "step": 1040 + }, + { + "epoch": 0.37635574837310193, + "grad_norm": 0.2381090671452057, + "learning_rate": 9.48878911732029e-06, + "loss": 0.0286, + "step": 1041 + }, + { + "epoch": 0.3767172812725958, + "grad_norm": 0.17420895249870375, + "learning_rate": 9.487471628109124e-06, + "loss": 0.0146, + "step": 1042 + }, + { + "epoch": 0.37707881417208966, + "grad_norm": 0.27953151322648107, + "learning_rate": 9.486152535076548e-06, + "loss": 0.0231, + "step": 1043 + }, + { + "epoch": 0.3774403470715835, + "grad_norm": 1.3495069466975531, + "learning_rate": 9.484831838694002e-06, + "loss": 0.082, + "step": 1044 + }, + { + "epoch": 0.3778018799710774, + "grad_norm": 0.13865537983271448, + "learning_rate": 9.483509539433501e-06, + "loss": 0.0131, + "step": 1045 + }, + { + "epoch": 0.3781634128705712, + "grad_norm": 1.3237134350841877, + "learning_rate": 9.48218563776764e-06, + "loss": 0.1553, + "step": 1046 + }, + { + "epoch": 0.37852494577006507, + "grad_norm": 0.1759877794445702, + "learning_rate": 9.480860134169577e-06, + "loss": 0.0131, + "step": 1047 + }, + { + "epoch": 0.37888647866955893, + "grad_norm": 0.29783921450592216, + "learning_rate": 9.479533029113047e-06, + "loss": 0.0104, + "step": 1048 + }, + { + "epoch": 0.3792480115690528, + "grad_norm": 0.16222053303006115, + "learning_rate": 9.47820432307236e-06, + "loss": 0.0146, + "step": 1049 + }, + { + "epoch": 0.3796095444685466, + "grad_norm": 0.07695245836665823, + "learning_rate": 9.47687401652239e-06, + "loss": 0.005, + "step": 1050 + }, + { + "epoch": 0.3799710773680405, + "grad_norm": 1.2385372433976716, + "learning_rate": 9.475542109938592e-06, + "loss": 0.0957, + "step": 1051 + }, + { + "epoch": 0.38033261026753434, + "grad_norm": 1.2224325528285473, + "learning_rate": 9.47420860379699e-06, + "loss": 0.0525, + "step": 1052 + }, + { + "epoch": 0.3806941431670282, + "grad_norm": 0.6312755477359355, + "learning_rate": 9.472873498574175e-06, + "loss": 0.0432, + "step": 1053 + }, + { + "epoch": 0.3810556760665221, + "grad_norm": 0.24228512904425872, + "learning_rate": 9.471536794747319e-06, + "loss": 0.0093, + "step": 1054 + }, + { + "epoch": 0.3814172089660159, + "grad_norm": 1.075840741876844, + "learning_rate": 9.470198492794152e-06, + "loss": 0.0889, + "step": 1055 + }, + { + "epoch": 0.38177874186550975, + "grad_norm": 2.1086241279928086, + "learning_rate": 9.468858593192989e-06, + "loss": 0.0525, + "step": 1056 + }, + { + "epoch": 0.3821402747650036, + "grad_norm": 0.020293583674238565, + "learning_rate": 9.467517096422709e-06, + "loss": 0.0003, + "step": 1057 + }, + { + "epoch": 0.3825018076644975, + "grad_norm": 0.5400505935551531, + "learning_rate": 9.46617400296276e-06, + "loss": 0.0167, + "step": 1058 + }, + { + "epoch": 0.38286334056399135, + "grad_norm": 0.936512900198395, + "learning_rate": 9.464829313293164e-06, + "loss": 0.1934, + "step": 1059 + }, + { + "epoch": 0.38322487346348516, + "grad_norm": 0.7466104075364791, + "learning_rate": 9.463483027894516e-06, + "loss": 0.0815, + "step": 1060 + }, + { + "epoch": 0.383586406362979, + "grad_norm": 0.4984801657887544, + "learning_rate": 9.462135147247975e-06, + "loss": 0.0166, + "step": 1061 + }, + { + "epoch": 0.3839479392624729, + "grad_norm": 0.3085497366788731, + "learning_rate": 9.460785671835275e-06, + "loss": 0.0165, + "step": 1062 + }, + { + "epoch": 0.38430947216196676, + "grad_norm": 0.0796270058832809, + "learning_rate": 9.459434602138715e-06, + "loss": 0.0028, + "step": 1063 + }, + { + "epoch": 0.38467100506146057, + "grad_norm": 0.14660980903460777, + "learning_rate": 9.45808193864117e-06, + "loss": 0.0103, + "step": 1064 + }, + { + "epoch": 0.38503253796095444, + "grad_norm": 0.2482079709835047, + "learning_rate": 9.456727681826082e-06, + "loss": 0.0146, + "step": 1065 + }, + { + "epoch": 0.3853940708604483, + "grad_norm": 0.07436904790353784, + "learning_rate": 9.45537183217746e-06, + "loss": 0.004, + "step": 1066 + }, + { + "epoch": 0.38575560375994217, + "grad_norm": 0.3257601181128312, + "learning_rate": 9.454014390179886e-06, + "loss": 0.0131, + "step": 1067 + }, + { + "epoch": 0.38611713665943603, + "grad_norm": 0.9999026148386884, + "learning_rate": 9.45265535631851e-06, + "loss": 0.0525, + "step": 1068 + }, + { + "epoch": 0.38647866955892984, + "grad_norm": 0.07816755667213485, + "learning_rate": 9.451294731079049e-06, + "loss": 0.002, + "step": 1069 + }, + { + "epoch": 0.3868402024584237, + "grad_norm": 1.252811195940043, + "learning_rate": 9.449932514947788e-06, + "loss": 0.1201, + "step": 1070 + }, + { + "epoch": 0.3872017353579176, + "grad_norm": 0.013103313702875045, + "learning_rate": 9.448568708411588e-06, + "loss": 0.0003, + "step": 1071 + }, + { + "epoch": 0.38756326825741144, + "grad_norm": 2.0942341314741744, + "learning_rate": 9.447203311957869e-06, + "loss": 0.2246, + "step": 1072 + }, + { + "epoch": 0.38792480115690525, + "grad_norm": 0.005982771487324485, + "learning_rate": 9.445836326074625e-06, + "loss": 0.0002, + "step": 1073 + }, + { + "epoch": 0.3882863340563991, + "grad_norm": 0.6264146624476477, + "learning_rate": 9.444467751250416e-06, + "loss": 0.0184, + "step": 1074 + }, + { + "epoch": 0.388647866955893, + "grad_norm": 0.2006204068129661, + "learning_rate": 9.443097587974373e-06, + "loss": 0.0104, + "step": 1075 + }, + { + "epoch": 0.38900939985538685, + "grad_norm": 2.7763988587502757, + "learning_rate": 9.441725836736186e-06, + "loss": 0.1748, + "step": 1076 + }, + { + "epoch": 0.3893709327548807, + "grad_norm": 1.993934134761054, + "learning_rate": 9.440352498026126e-06, + "loss": 0.1641, + "step": 1077 + }, + { + "epoch": 0.3897324656543745, + "grad_norm": 0.021063893148171642, + "learning_rate": 9.438977572335017e-06, + "loss": 0.0007, + "step": 1078 + }, + { + "epoch": 0.3900939985538684, + "grad_norm": 0.16644739202134204, + "learning_rate": 9.437601060154263e-06, + "loss": 0.0146, + "step": 1079 + }, + { + "epoch": 0.39045553145336226, + "grad_norm": 0.1573429274016727, + "learning_rate": 9.436222961975826e-06, + "loss": 0.0131, + "step": 1080 + }, + { + "epoch": 0.3908170643528561, + "grad_norm": 0.1676669478854443, + "learning_rate": 9.434843278292239e-06, + "loss": 0.0165, + "step": 1081 + }, + { + "epoch": 0.39117859725235, + "grad_norm": 0.15369173946019793, + "learning_rate": 9.433462009596598e-06, + "loss": 0.0103, + "step": 1082 + }, + { + "epoch": 0.3915401301518438, + "grad_norm": 0.017310615918788422, + "learning_rate": 9.432079156382572e-06, + "loss": 0.0006, + "step": 1083 + }, + { + "epoch": 0.39190166305133767, + "grad_norm": 0.24932562185631774, + "learning_rate": 9.43069471914439e-06, + "loss": 0.0165, + "step": 1084 + }, + { + "epoch": 0.39226319595083153, + "grad_norm": 0.2194564033171015, + "learning_rate": 9.429308698376854e-06, + "loss": 0.0165, + "step": 1085 + }, + { + "epoch": 0.3926247288503254, + "grad_norm": 0.7787346108614959, + "learning_rate": 9.42792109457532e-06, + "loss": 0.0752, + "step": 1086 + }, + { + "epoch": 0.3929862617498192, + "grad_norm": 0.9752296047018193, + "learning_rate": 9.426531908235721e-06, + "loss": 0.0579, + "step": 1087 + }, + { + "epoch": 0.3933477946493131, + "grad_norm": 0.4723095741502487, + "learning_rate": 9.425141139854555e-06, + "loss": 0.0258, + "step": 1088 + }, + { + "epoch": 0.39370932754880694, + "grad_norm": 0.494033895978888, + "learning_rate": 9.423748789928878e-06, + "loss": 0.0187, + "step": 1089 + }, + { + "epoch": 0.3940708604483008, + "grad_norm": 0.1216466023051404, + "learning_rate": 9.422354858956317e-06, + "loss": 0.0103, + "step": 1090 + }, + { + "epoch": 0.3944323933477947, + "grad_norm": 0.11451229698203315, + "learning_rate": 9.420959347435064e-06, + "loss": 0.0082, + "step": 1091 + }, + { + "epoch": 0.3947939262472885, + "grad_norm": 0.01734470675003864, + "learning_rate": 9.41956225586387e-06, + "loss": 0.0004, + "step": 1092 + }, + { + "epoch": 0.39515545914678235, + "grad_norm": 0.11600804188725891, + "learning_rate": 9.418163584742061e-06, + "loss": 0.0092, + "step": 1093 + }, + { + "epoch": 0.3955169920462762, + "grad_norm": 0.2829564823986706, + "learning_rate": 9.41676333456952e-06, + "loss": 0.0184, + "step": 1094 + }, + { + "epoch": 0.3958785249457701, + "grad_norm": 0.1741302386248618, + "learning_rate": 9.415361505846693e-06, + "loss": 0.0103, + "step": 1095 + }, + { + "epoch": 0.3962400578452639, + "grad_norm": 0.29337957509713075, + "learning_rate": 9.413958099074598e-06, + "loss": 0.0131, + "step": 1096 + }, + { + "epoch": 0.39660159074475776, + "grad_norm": 0.09867968606778085, + "learning_rate": 9.412553114754807e-06, + "loss": 0.0032, + "step": 1097 + }, + { + "epoch": 0.3969631236442516, + "grad_norm": 2.211453313844027, + "learning_rate": 9.411146553389467e-06, + "loss": 0.1836, + "step": 1098 + }, + { + "epoch": 0.3973246565437455, + "grad_norm": 0.538350709526569, + "learning_rate": 9.409738415481278e-06, + "loss": 0.0147, + "step": 1099 + }, + { + "epoch": 0.39768618944323936, + "grad_norm": 0.1416869734979054, + "learning_rate": 9.40832870153351e-06, + "loss": 0.0103, + "step": 1100 + }, + { + "epoch": 0.39804772234273317, + "grad_norm": 0.12111717841248403, + "learning_rate": 9.406917412049995e-06, + "loss": 0.0064, + "step": 1101 + }, + { + "epoch": 0.39840925524222703, + "grad_norm": 0.09167553339842499, + "learning_rate": 9.405504547535127e-06, + "loss": 0.0057, + "step": 1102 + }, + { + "epoch": 0.3987707881417209, + "grad_norm": 1.7592068494962259, + "learning_rate": 9.404090108493863e-06, + "loss": 0.0752, + "step": 1103 + }, + { + "epoch": 0.39913232104121477, + "grad_norm": 1.4531788835558297, + "learning_rate": 9.402674095431724e-06, + "loss": 0.2773, + "step": 1104 + }, + { + "epoch": 0.39949385394070863, + "grad_norm": 0.058786188271589115, + "learning_rate": 9.401256508854793e-06, + "loss": 0.0045, + "step": 1105 + }, + { + "epoch": 0.39985538684020244, + "grad_norm": 0.2534479783507594, + "learning_rate": 9.399837349269713e-06, + "loss": 0.0147, + "step": 1106 + }, + { + "epoch": 0.4002169197396963, + "grad_norm": 0.17167363142228226, + "learning_rate": 9.398416617183694e-06, + "loss": 0.0067, + "step": 1107 + }, + { + "epoch": 0.4005784526391902, + "grad_norm": 1.1762705705142953, + "learning_rate": 9.396994313104504e-06, + "loss": 0.063, + "step": 1108 + }, + { + "epoch": 0.40093998553868404, + "grad_norm": 0.3914113440586638, + "learning_rate": 9.395570437540474e-06, + "loss": 0.0354, + "step": 1109 + }, + { + "epoch": 0.40130151843817785, + "grad_norm": 0.3186820185387412, + "learning_rate": 9.394144991000497e-06, + "loss": 0.0092, + "step": 1110 + }, + { + "epoch": 0.4016630513376717, + "grad_norm": 1.9575996640901516, + "learning_rate": 9.392717973994028e-06, + "loss": 0.2773, + "step": 1111 + }, + { + "epoch": 0.4020245842371656, + "grad_norm": 0.8286568470319644, + "learning_rate": 9.391289387031084e-06, + "loss": 0.0525, + "step": 1112 + }, + { + "epoch": 0.40238611713665945, + "grad_norm": 0.10726559508428564, + "learning_rate": 9.389859230622237e-06, + "loss": 0.0082, + "step": 1113 + }, + { + "epoch": 0.4027476500361533, + "grad_norm": 0.05994873787875815, + "learning_rate": 9.38842750527863e-06, + "loss": 0.0045, + "step": 1114 + }, + { + "epoch": 0.4031091829356471, + "grad_norm": 0.08225907359090456, + "learning_rate": 9.386994211511957e-06, + "loss": 0.0064, + "step": 1115 + }, + { + "epoch": 0.403470715835141, + "grad_norm": 1.587536446013399, + "learning_rate": 9.385559349834478e-06, + "loss": 0.0957, + "step": 1116 + }, + { + "epoch": 0.40383224873463486, + "grad_norm": 0.018143406136166766, + "learning_rate": 9.384122920759014e-06, + "loss": 0.0005, + "step": 1117 + }, + { + "epoch": 0.4041937816341287, + "grad_norm": 0.08212926508566187, + "learning_rate": 9.382684924798944e-06, + "loss": 0.0073, + "step": 1118 + }, + { + "epoch": 0.40455531453362253, + "grad_norm": 0.09246246012405138, + "learning_rate": 9.381245362468206e-06, + "loss": 0.0072, + "step": 1119 + }, + { + "epoch": 0.4049168474331164, + "grad_norm": 0.08087525665156131, + "learning_rate": 9.3798042342813e-06, + "loss": 0.0064, + "step": 1120 + }, + { + "epoch": 0.40527838033261027, + "grad_norm": 2.2152342337267674, + "learning_rate": 9.378361540753284e-06, + "loss": 0.1641, + "step": 1121 + }, + { + "epoch": 0.40563991323210413, + "grad_norm": 0.0859001710394085, + "learning_rate": 9.37691728239978e-06, + "loss": 0.0064, + "step": 1122 + }, + { + "epoch": 0.406001446131598, + "grad_norm": 1.3300950307096202, + "learning_rate": 9.37547145973696e-06, + "loss": 0.2129, + "step": 1123 + }, + { + "epoch": 0.4063629790310918, + "grad_norm": 0.2497130207683132, + "learning_rate": 9.374024073281566e-06, + "loss": 0.0184, + "step": 1124 + }, + { + "epoch": 0.4067245119305857, + "grad_norm": 1.2005256567448723, + "learning_rate": 9.37257512355089e-06, + "loss": 0.0752, + "step": 1125 + }, + { + "epoch": 0.40708604483007954, + "grad_norm": 0.0925127925580318, + "learning_rate": 9.371124611062788e-06, + "loss": 0.0082, + "step": 1126 + }, + { + "epoch": 0.4074475777295734, + "grad_norm": 0.1036805400637577, + "learning_rate": 9.369672536335673e-06, + "loss": 0.0029, + "step": 1127 + }, + { + "epoch": 0.4078091106290672, + "grad_norm": 0.057603315928424596, + "learning_rate": 9.368218899888515e-06, + "loss": 0.0039, + "step": 1128 + }, + { + "epoch": 0.4081706435285611, + "grad_norm": 0.2781477355545481, + "learning_rate": 9.366763702240844e-06, + "loss": 0.0206, + "step": 1129 + }, + { + "epoch": 0.40853217642805495, + "grad_norm": 1.0434294811558473, + "learning_rate": 9.365306943912747e-06, + "loss": 0.0476, + "step": 1130 + }, + { + "epoch": 0.4088937093275488, + "grad_norm": 0.6147083049223371, + "learning_rate": 9.36384862542487e-06, + "loss": 0.0286, + "step": 1131 + }, + { + "epoch": 0.4092552422270427, + "grad_norm": 0.9425504550542481, + "learning_rate": 9.362388747298417e-06, + "loss": 0.0957, + "step": 1132 + }, + { + "epoch": 0.4096167751265365, + "grad_norm": 0.21102668415557, + "learning_rate": 9.360927310055144e-06, + "loss": 0.0206, + "step": 1133 + }, + { + "epoch": 0.40997830802603036, + "grad_norm": 0.09507011729608754, + "learning_rate": 9.35946431421737e-06, + "loss": 0.0057, + "step": 1134 + }, + { + "epoch": 0.4103398409255242, + "grad_norm": 1.4703476443315495, + "learning_rate": 9.357999760307973e-06, + "loss": 0.1035, + "step": 1135 + }, + { + "epoch": 0.4107013738250181, + "grad_norm": 0.4855163949729886, + "learning_rate": 9.356533648850378e-06, + "loss": 0.0206, + "step": 1136 + }, + { + "epoch": 0.41106290672451196, + "grad_norm": 0.11924580319809996, + "learning_rate": 9.35506598036858e-06, + "loss": 0.0073, + "step": 1137 + }, + { + "epoch": 0.41142443962400577, + "grad_norm": 0.07533688761878315, + "learning_rate": 9.353596755387117e-06, + "loss": 0.0064, + "step": 1138 + }, + { + "epoch": 0.41178597252349963, + "grad_norm": 1.5682969116774583, + "learning_rate": 9.352125974431095e-06, + "loss": 0.0525, + "step": 1139 + }, + { + "epoch": 0.4121475054229935, + "grad_norm": 0.06683799602098554, + "learning_rate": 9.350653638026165e-06, + "loss": 0.0024, + "step": 1140 + }, + { + "epoch": 0.41250903832248736, + "grad_norm": 1.0197684461197327, + "learning_rate": 9.349179746698545e-06, + "loss": 0.1191, + "step": 1141 + }, + { + "epoch": 0.4128705712219812, + "grad_norm": 0.13944507037089907, + "learning_rate": 9.347704300975e-06, + "loss": 0.0051, + "step": 1142 + }, + { + "epoch": 0.41323210412147504, + "grad_norm": 0.6498911428813499, + "learning_rate": 9.346227301382857e-06, + "loss": 0.0391, + "step": 1143 + }, + { + "epoch": 0.4135936370209689, + "grad_norm": 1.1818813196428035, + "learning_rate": 9.344748748449993e-06, + "loss": 0.1553, + "step": 1144 + }, + { + "epoch": 0.4139551699204628, + "grad_norm": 0.006942046488374129, + "learning_rate": 9.343268642704843e-06, + "loss": 0.0002, + "step": 1145 + }, + { + "epoch": 0.41431670281995664, + "grad_norm": 0.25892774553843356, + "learning_rate": 9.341786984676397e-06, + "loss": 0.0186, + "step": 1146 + }, + { + "epoch": 0.41467823571945045, + "grad_norm": 0.08168369252998034, + "learning_rate": 9.340303774894198e-06, + "loss": 0.0064, + "step": 1147 + }, + { + "epoch": 0.4150397686189443, + "grad_norm": 0.39769164504364213, + "learning_rate": 9.338819013888347e-06, + "loss": 0.0231, + "step": 1148 + }, + { + "epoch": 0.4154013015184382, + "grad_norm": 1.4139952523626949, + "learning_rate": 9.337332702189494e-06, + "loss": 0.1367, + "step": 1149 + }, + { + "epoch": 0.41576283441793205, + "grad_norm": 1.1640536054742625, + "learning_rate": 9.335844840328851e-06, + "loss": 0.2246, + "step": 1150 + }, + { + "epoch": 0.41612436731742586, + "grad_norm": 0.14207754999564937, + "learning_rate": 9.334355428838179e-06, + "loss": 0.0064, + "step": 1151 + }, + { + "epoch": 0.4164859002169197, + "grad_norm": 0.08577186002327768, + "learning_rate": 9.332864468249788e-06, + "loss": 0.0064, + "step": 1152 + }, + { + "epoch": 0.4168474331164136, + "grad_norm": 0.32130283186290104, + "learning_rate": 9.33137195909655e-06, + "loss": 0.0231, + "step": 1153 + }, + { + "epoch": 0.41720896601590746, + "grad_norm": 3.2264071944121104, + "learning_rate": 9.329877901911889e-06, + "loss": 0.2246, + "step": 1154 + }, + { + "epoch": 0.4175704989154013, + "grad_norm": 1.180588764537464, + "learning_rate": 9.32838229722978e-06, + "loss": 0.1641, + "step": 1155 + }, + { + "epoch": 0.41793203181489513, + "grad_norm": 0.009816816354921697, + "learning_rate": 9.326885145584753e-06, + "loss": 0.0002, + "step": 1156 + }, + { + "epoch": 0.418293564714389, + "grad_norm": 0.4284785292129483, + "learning_rate": 9.325386447511884e-06, + "loss": 0.0255, + "step": 1157 + }, + { + "epoch": 0.41865509761388287, + "grad_norm": 3.017148324864925, + "learning_rate": 9.323886203546815e-06, + "loss": 0.2559, + "step": 1158 + }, + { + "epoch": 0.41901663051337673, + "grad_norm": 0.1018771884928545, + "learning_rate": 9.322384414225727e-06, + "loss": 0.0045, + "step": 1159 + }, + { + "epoch": 0.4193781634128706, + "grad_norm": 1.2167193919148422, + "learning_rate": 9.320881080085363e-06, + "loss": 0.1279, + "step": 1160 + }, + { + "epoch": 0.4197396963123644, + "grad_norm": 1.3491201031620592, + "learning_rate": 9.319376201663012e-06, + "loss": 0.1367, + "step": 1161 + }, + { + "epoch": 0.4201012292118583, + "grad_norm": 0.34425401021977675, + "learning_rate": 9.31786977949652e-06, + "loss": 0.0117, + "step": 1162 + }, + { + "epoch": 0.42046276211135214, + "grad_norm": 0.7424499969071638, + "learning_rate": 9.316361814124278e-06, + "loss": 0.0815, + "step": 1163 + }, + { + "epoch": 0.420824295010846, + "grad_norm": 0.23669084125577614, + "learning_rate": 9.314852306085235e-06, + "loss": 0.0206, + "step": 1164 + }, + { + "epoch": 0.4211858279103398, + "grad_norm": 0.3567294566723959, + "learning_rate": 9.313341255918889e-06, + "loss": 0.0117, + "step": 1165 + }, + { + "epoch": 0.4215473608098337, + "grad_norm": 0.4899956588604835, + "learning_rate": 9.31182866416529e-06, + "loss": 0.0354, + "step": 1166 + }, + { + "epoch": 0.42190889370932755, + "grad_norm": 0.21504704459159868, + "learning_rate": 9.310314531365033e-06, + "loss": 0.0165, + "step": 1167 + }, + { + "epoch": 0.4222704266088214, + "grad_norm": 0.28175254734532357, + "learning_rate": 9.308798858059274e-06, + "loss": 0.0165, + "step": 1168 + }, + { + "epoch": 0.4226319595083153, + "grad_norm": 0.17709138482396072, + "learning_rate": 9.307281644789712e-06, + "loss": 0.0165, + "step": 1169 + }, + { + "epoch": 0.4229934924078091, + "grad_norm": 0.9737171647774402, + "learning_rate": 9.3057628920986e-06, + "loss": 0.0889, + "step": 1170 + }, + { + "epoch": 0.42335502530730296, + "grad_norm": 0.7215162855950229, + "learning_rate": 9.304242600528738e-06, + "loss": 0.1553, + "step": 1171 + }, + { + "epoch": 0.4237165582067968, + "grad_norm": 0.083739603079089, + "learning_rate": 9.302720770623479e-06, + "loss": 0.0028, + "step": 1172 + }, + { + "epoch": 0.4240780911062907, + "grad_norm": 0.3948965385623532, + "learning_rate": 9.301197402926726e-06, + "loss": 0.0317, + "step": 1173 + }, + { + "epoch": 0.4244396240057845, + "grad_norm": 1.1094043522170145, + "learning_rate": 9.299672497982926e-06, + "loss": 0.1455, + "step": 1174 + }, + { + "epoch": 0.42480115690527837, + "grad_norm": 0.31471271469315676, + "learning_rate": 9.298146056337085e-06, + "loss": 0.0231, + "step": 1175 + }, + { + "epoch": 0.42516268980477223, + "grad_norm": 0.059253533599419, + "learning_rate": 9.29661807853475e-06, + "loss": 0.0016, + "step": 1176 + }, + { + "epoch": 0.4255242227042661, + "grad_norm": 0.21728276699741064, + "learning_rate": 9.29508856512202e-06, + "loss": 0.0231, + "step": 1177 + }, + { + "epoch": 0.42588575560375996, + "grad_norm": 0.6025952200798772, + "learning_rate": 9.293557516645543e-06, + "loss": 0.0206, + "step": 1178 + }, + { + "epoch": 0.4262472885032538, + "grad_norm": 0.8284383646254898, + "learning_rate": 9.292024933652518e-06, + "loss": 0.082, + "step": 1179 + }, + { + "epoch": 0.42660882140274764, + "grad_norm": 0.24059131556851346, + "learning_rate": 9.290490816690685e-06, + "loss": 0.0231, + "step": 1180 + }, + { + "epoch": 0.4269703543022415, + "grad_norm": 0.2991343902986977, + "learning_rate": 9.28895516630834e-06, + "loss": 0.0206, + "step": 1181 + }, + { + "epoch": 0.42733188720173537, + "grad_norm": 0.1391444607335759, + "learning_rate": 9.287417983054326e-06, + "loss": 0.0131, + "step": 1182 + }, + { + "epoch": 0.42769342010122924, + "grad_norm": 0.17912561303473346, + "learning_rate": 9.285879267478027e-06, + "loss": 0.0206, + "step": 1183 + }, + { + "epoch": 0.42805495300072305, + "grad_norm": 0.5678511428960745, + "learning_rate": 9.284339020129382e-06, + "loss": 0.0432, + "step": 1184 + }, + { + "epoch": 0.4284164859002169, + "grad_norm": 0.1265417625218881, + "learning_rate": 9.282797241558876e-06, + "loss": 0.0131, + "step": 1185 + }, + { + "epoch": 0.4287780187997108, + "grad_norm": 0.5211928281594582, + "learning_rate": 9.281253932317542e-06, + "loss": 0.0317, + "step": 1186 + }, + { + "epoch": 0.42913955169920465, + "grad_norm": 0.8547104311749817, + "learning_rate": 9.279709092956955e-06, + "loss": 0.1738, + "step": 1187 + }, + { + "epoch": 0.42950108459869846, + "grad_norm": 0.13168574740710717, + "learning_rate": 9.27816272402924e-06, + "loss": 0.0131, + "step": 1188 + }, + { + "epoch": 0.4298626174981923, + "grad_norm": 0.7261856541766722, + "learning_rate": 9.27661482608707e-06, + "loss": 0.0579, + "step": 1189 + }, + { + "epoch": 0.4302241503976862, + "grad_norm": 0.10961977292072446, + "learning_rate": 9.275065399683665e-06, + "loss": 0.0045, + "step": 1190 + }, + { + "epoch": 0.43058568329718006, + "grad_norm": 0.5553779519890406, + "learning_rate": 9.273514445372787e-06, + "loss": 0.0957, + "step": 1191 + }, + { + "epoch": 0.4309472161966739, + "grad_norm": 0.21738774112406928, + "learning_rate": 9.27196196370875e-06, + "loss": 0.0146, + "step": 1192 + }, + { + "epoch": 0.43130874909616773, + "grad_norm": 0.23154164556419493, + "learning_rate": 9.270407955246408e-06, + "loss": 0.0231, + "step": 1193 + }, + { + "epoch": 0.4316702819956616, + "grad_norm": 0.3160999168922757, + "learning_rate": 9.268852420541163e-06, + "loss": 0.0231, + "step": 1194 + }, + { + "epoch": 0.43203181489515546, + "grad_norm": 0.6564806779854103, + "learning_rate": 9.267295360148965e-06, + "loss": 0.0579, + "step": 1195 + }, + { + "epoch": 0.43239334779464933, + "grad_norm": 1.238621576761136, + "learning_rate": 9.265736774626306e-06, + "loss": 0.1191, + "step": 1196 + }, + { + "epoch": 0.43275488069414314, + "grad_norm": 0.39143805471761906, + "learning_rate": 9.264176664530223e-06, + "loss": 0.0393, + "step": 1197 + }, + { + "epoch": 0.433116413593637, + "grad_norm": 0.12649635204824258, + "learning_rate": 9.262615030418301e-06, + "loss": 0.0131, + "step": 1198 + }, + { + "epoch": 0.4334779464931309, + "grad_norm": 0.08228413978367108, + "learning_rate": 9.261051872848666e-06, + "loss": 0.0019, + "step": 1199 + }, + { + "epoch": 0.43383947939262474, + "grad_norm": 0.8383735204810343, + "learning_rate": 9.259487192379991e-06, + "loss": 0.1553, + "step": 1200 + }, + { + "epoch": 0.4342010122921186, + "grad_norm": 0.609987175919984, + "learning_rate": 9.257920989571492e-06, + "loss": 0.0354, + "step": 1201 + }, + { + "epoch": 0.4345625451916124, + "grad_norm": 0.570026341897741, + "learning_rate": 9.25635326498293e-06, + "loss": 0.0688, + "step": 1202 + }, + { + "epoch": 0.4349240780911063, + "grad_norm": 0.12500996535984785, + "learning_rate": 9.254784019174611e-06, + "loss": 0.0131, + "step": 1203 + }, + { + "epoch": 0.43528561099060015, + "grad_norm": 0.9543436090179716, + "learning_rate": 9.253213252707381e-06, + "loss": 0.1738, + "step": 1204 + }, + { + "epoch": 0.435647143890094, + "grad_norm": 0.16144410761128974, + "learning_rate": 9.25164096614263e-06, + "loss": 0.0184, + "step": 1205 + }, + { + "epoch": 0.4360086767895879, + "grad_norm": 0.16840436751985455, + "learning_rate": 9.250067160042296e-06, + "loss": 0.0165, + "step": 1206 + }, + { + "epoch": 0.4363702096890817, + "grad_norm": 0.7273483428444154, + "learning_rate": 9.248491834968857e-06, + "loss": 0.1455, + "step": 1207 + }, + { + "epoch": 0.43673174258857556, + "grad_norm": 0.6089729541374654, + "learning_rate": 9.246914991485332e-06, + "loss": 0.1367, + "step": 1208 + }, + { + "epoch": 0.4370932754880694, + "grad_norm": 0.6006941999042075, + "learning_rate": 9.245336630155285e-06, + "loss": 0.1455, + "step": 1209 + }, + { + "epoch": 0.4374548083875633, + "grad_norm": 0.3375503730757157, + "learning_rate": 9.243756751542823e-06, + "loss": 0.0258, + "step": 1210 + }, + { + "epoch": 0.4378163412870571, + "grad_norm": 0.20144046382770192, + "learning_rate": 9.242175356212592e-06, + "loss": 0.0165, + "step": 1211 + }, + { + "epoch": 0.43817787418655096, + "grad_norm": 0.5609605549304093, + "learning_rate": 9.240592444729786e-06, + "loss": 0.0688, + "step": 1212 + }, + { + "epoch": 0.43853940708604483, + "grad_norm": 0.519361376975237, + "learning_rate": 9.239008017660133e-06, + "loss": 0.0288, + "step": 1213 + }, + { + "epoch": 0.4389009399855387, + "grad_norm": 0.1757152994550518, + "learning_rate": 9.237422075569912e-06, + "loss": 0.0117, + "step": 1214 + }, + { + "epoch": 0.43926247288503256, + "grad_norm": 0.2424069057980919, + "learning_rate": 9.235834619025934e-06, + "loss": 0.0317, + "step": 1215 + }, + { + "epoch": 0.4396240057845264, + "grad_norm": 0.7205043830682674, + "learning_rate": 9.234245648595557e-06, + "loss": 0.0815, + "step": 1216 + }, + { + "epoch": 0.43998553868402024, + "grad_norm": 0.364846491223204, + "learning_rate": 9.232655164846678e-06, + "loss": 0.0525, + "step": 1217 + }, + { + "epoch": 0.4403470715835141, + "grad_norm": 0.6569152898696073, + "learning_rate": 9.231063168347736e-06, + "loss": 0.063, + "step": 1218 + }, + { + "epoch": 0.44070860448300797, + "grad_norm": 0.24761801392722915, + "learning_rate": 9.229469659667713e-06, + "loss": 0.0317, + "step": 1219 + }, + { + "epoch": 0.4410701373825018, + "grad_norm": 0.3498679989538099, + "learning_rate": 9.227874639376124e-06, + "loss": 0.0231, + "step": 1220 + }, + { + "epoch": 0.44143167028199565, + "grad_norm": 0.3540284124142474, + "learning_rate": 9.226278108043032e-06, + "loss": 0.0432, + "step": 1221 + }, + { + "epoch": 0.4417932031814895, + "grad_norm": 0.07446113771352528, + "learning_rate": 9.224680066239037e-06, + "loss": 0.0029, + "step": 1222 + }, + { + "epoch": 0.4421547360809834, + "grad_norm": 0.5236782036399603, + "learning_rate": 9.223080514535277e-06, + "loss": 0.063, + "step": 1223 + }, + { + "epoch": 0.44251626898047725, + "grad_norm": 0.39613075860950236, + "learning_rate": 9.221479453503433e-06, + "loss": 0.0354, + "step": 1224 + }, + { + "epoch": 0.44287780187997106, + "grad_norm": 0.48548714932106857, + "learning_rate": 9.219876883715722e-06, + "loss": 0.0576, + "step": 1225 + }, + { + "epoch": 0.4432393347794649, + "grad_norm": 0.7643901335268081, + "learning_rate": 9.218272805744903e-06, + "loss": 0.0815, + "step": 1226 + }, + { + "epoch": 0.4436008676789588, + "grad_norm": 0.11277968608216814, + "learning_rate": 9.216667220164276e-06, + "loss": 0.0032, + "step": 1227 + }, + { + "epoch": 0.44396240057845265, + "grad_norm": 0.2771172300281208, + "learning_rate": 9.215060127547671e-06, + "loss": 0.0258, + "step": 1228 + }, + { + "epoch": 0.4443239334779465, + "grad_norm": 0.8735111401909389, + "learning_rate": 9.213451528469468e-06, + "loss": 0.0957, + "step": 1229 + }, + { + "epoch": 0.44468546637744033, + "grad_norm": 0.30023543025025184, + "learning_rate": 9.211841423504577e-06, + "loss": 0.0184, + "step": 1230 + }, + { + "epoch": 0.4450469992769342, + "grad_norm": 0.589879493253699, + "learning_rate": 9.21022981322845e-06, + "loss": 0.0476, + "step": 1231 + }, + { + "epoch": 0.44540853217642806, + "grad_norm": 0.5197212429520317, + "learning_rate": 9.208616698217079e-06, + "loss": 0.1035, + "step": 1232 + }, + { + "epoch": 0.44577006507592193, + "grad_norm": 0.38590001676704533, + "learning_rate": 9.207002079046985e-06, + "loss": 0.0104, + "step": 1233 + }, + { + "epoch": 0.44613159797541574, + "grad_norm": 0.21950391212045592, + "learning_rate": 9.205385956295238e-06, + "loss": 0.0317, + "step": 1234 + }, + { + "epoch": 0.4464931308749096, + "grad_norm": 0.18775136185666133, + "learning_rate": 9.203768330539436e-06, + "loss": 0.0092, + "step": 1235 + }, + { + "epoch": 0.44685466377440347, + "grad_norm": 0.27419383734373814, + "learning_rate": 9.202149202357721e-06, + "loss": 0.0391, + "step": 1236 + }, + { + "epoch": 0.44721619667389734, + "grad_norm": 0.3389404405982564, + "learning_rate": 9.200528572328768e-06, + "loss": 0.0354, + "step": 1237 + }, + { + "epoch": 0.4475777295733912, + "grad_norm": 3.8344516138292937, + "learning_rate": 9.19890644103179e-06, + "loss": 1.1875, + "step": 1238 + }, + { + "epoch": 0.447939262472885, + "grad_norm": 0.585573548932553, + "learning_rate": 9.197282809046533e-06, + "loss": 0.063, + "step": 1239 + }, + { + "epoch": 0.4483007953723789, + "grad_norm": 0.1609806057707306, + "learning_rate": 9.195657676953288e-06, + "loss": 0.0103, + "step": 1240 + }, + { + "epoch": 0.44866232827187275, + "grad_norm": 0.5481986001093979, + "learning_rate": 9.194031045332877e-06, + "loss": 0.0476, + "step": 1241 + }, + { + "epoch": 0.4490238611713666, + "grad_norm": 0.6840787525876617, + "learning_rate": 9.19240291476665e-06, + "loss": 0.0752, + "step": 1242 + }, + { + "epoch": 0.4493853940708604, + "grad_norm": 0.5639282300102497, + "learning_rate": 9.190773285836513e-06, + "loss": 0.063, + "step": 1243 + }, + { + "epoch": 0.4497469269703543, + "grad_norm": 0.5095113611708466, + "learning_rate": 9.189142159124883e-06, + "loss": 0.063, + "step": 1244 + }, + { + "epoch": 0.45010845986984815, + "grad_norm": 0.32885234615355863, + "learning_rate": 9.187509535214731e-06, + "loss": 0.0432, + "step": 1245 + }, + { + "epoch": 0.450469992769342, + "grad_norm": 0.23778164132619944, + "learning_rate": 9.185875414689553e-06, + "loss": 0.0255, + "step": 1246 + }, + { + "epoch": 0.4508315256688359, + "grad_norm": 0.3011498590848506, + "learning_rate": 9.184239798133387e-06, + "loss": 0.0131, + "step": 1247 + }, + { + "epoch": 0.4511930585683297, + "grad_norm": 0.24549766316405178, + "learning_rate": 9.182602686130802e-06, + "loss": 0.0286, + "step": 1248 + }, + { + "epoch": 0.45155459146782356, + "grad_norm": 0.3199316957600134, + "learning_rate": 9.180964079266897e-06, + "loss": 0.0391, + "step": 1249 + }, + { + "epoch": 0.45191612436731743, + "grad_norm": 0.15207511710905883, + "learning_rate": 9.179323978127313e-06, + "loss": 0.0073, + "step": 1250 + }, + { + "epoch": 0.4522776572668113, + "grad_norm": 0.18885533711967034, + "learning_rate": 9.17768238329822e-06, + "loss": 0.0045, + "step": 1251 + }, + { + "epoch": 0.45263919016630516, + "grad_norm": 0.3812363089696088, + "learning_rate": 9.176039295366328e-06, + "loss": 0.0231, + "step": 1252 + }, + { + "epoch": 0.45300072306579897, + "grad_norm": 0.22305706471754086, + "learning_rate": 9.17439471491887e-06, + "loss": 0.0073, + "step": 1253 + }, + { + "epoch": 0.45336225596529284, + "grad_norm": 0.01880934672226833, + "learning_rate": 9.172748642543624e-06, + "loss": 0.0005, + "step": 1254 + }, + { + "epoch": 0.4537237888647867, + "grad_norm": 0.00997960506667746, + "learning_rate": 9.171101078828893e-06, + "loss": 0.0003, + "step": 1255 + }, + { + "epoch": 0.45408532176428057, + "grad_norm": 0.08120421331384185, + "learning_rate": 9.169452024363517e-06, + "loss": 0.002, + "step": 1256 + }, + { + "epoch": 0.4544468546637744, + "grad_norm": 0.017400379404158547, + "learning_rate": 9.167801479736866e-06, + "loss": 0.0005, + "step": 1257 + }, + { + "epoch": 0.45480838756326825, + "grad_norm": 0.235317345968551, + "learning_rate": 9.166149445538848e-06, + "loss": 0.0104, + "step": 1258 + }, + { + "epoch": 0.4551699204627621, + "grad_norm": 0.19539868545619668, + "learning_rate": 9.164495922359895e-06, + "loss": 0.0146, + "step": 1259 + }, + { + "epoch": 0.455531453362256, + "grad_norm": 0.12889219622984485, + "learning_rate": 9.16284091079098e-06, + "loss": 0.0115, + "step": 1260 + }, + { + "epoch": 0.45589298626174984, + "grad_norm": 1.5973866912829473, + "learning_rate": 9.161184411423602e-06, + "loss": 0.0957, + "step": 1261 + }, + { + "epoch": 0.45625451916124365, + "grad_norm": 0.2204040393866392, + "learning_rate": 9.159526424849792e-06, + "loss": 0.0229, + "step": 1262 + }, + { + "epoch": 0.4566160520607375, + "grad_norm": 0.3494939566130848, + "learning_rate": 9.157866951662117e-06, + "loss": 0.0231, + "step": 1263 + }, + { + "epoch": 0.4569775849602314, + "grad_norm": 0.8598646699240999, + "learning_rate": 9.15620599245367e-06, + "loss": 0.1367, + "step": 1264 + }, + { + "epoch": 0.45733911785972525, + "grad_norm": 0.010311069305442391, + "learning_rate": 9.15454354781808e-06, + "loss": 0.0003, + "step": 1265 + }, + { + "epoch": 0.45770065075921906, + "grad_norm": 0.6636049557118738, + "learning_rate": 9.152879618349502e-06, + "loss": 0.0286, + "step": 1266 + }, + { + "epoch": 0.45806218365871293, + "grad_norm": 0.8816313073958003, + "learning_rate": 9.151214204642623e-06, + "loss": 0.0354, + "step": 1267 + }, + { + "epoch": 0.4584237165582068, + "grad_norm": 0.14726477488625997, + "learning_rate": 9.149547307292665e-06, + "loss": 0.0036, + "step": 1268 + }, + { + "epoch": 0.45878524945770066, + "grad_norm": 0.45243763961589994, + "learning_rate": 9.147878926895375e-06, + "loss": 0.0391, + "step": 1269 + }, + { + "epoch": 0.4591467823571945, + "grad_norm": 0.8560202782231517, + "learning_rate": 9.146209064047031e-06, + "loss": 0.1836, + "step": 1270 + }, + { + "epoch": 0.45950831525668834, + "grad_norm": 0.22601356271195683, + "learning_rate": 9.144537719344445e-06, + "loss": 0.0072, + "step": 1271 + }, + { + "epoch": 0.4598698481561822, + "grad_norm": 8.418948278595058, + "learning_rate": 9.14286489338495e-06, + "loss": 1.7578, + "step": 1272 + }, + { + "epoch": 0.46023138105567607, + "grad_norm": 0.013106215159109405, + "learning_rate": 9.141190586766418e-06, + "loss": 0.0005, + "step": 1273 + }, + { + "epoch": 0.46059291395516994, + "grad_norm": 1.339622032805676, + "learning_rate": 9.139514800087243e-06, + "loss": 0.1279, + "step": 1274 + }, + { + "epoch": 0.4609544468546638, + "grad_norm": 0.21270311844939735, + "learning_rate": 9.137837533946355e-06, + "loss": 0.0165, + "step": 1275 + }, + { + "epoch": 0.4613159797541576, + "grad_norm": 1.9751558300103287, + "learning_rate": 9.136158788943203e-06, + "loss": 0.1641, + "step": 1276 + }, + { + "epoch": 0.4616775126536515, + "grad_norm": 10.134744063098134, + "learning_rate": 9.134478565677772e-06, + "loss": 1.6797, + "step": 1277 + }, + { + "epoch": 0.46203904555314534, + "grad_norm": 0.14254655441139796, + "learning_rate": 9.132796864750575e-06, + "loss": 0.0146, + "step": 1278 + }, + { + "epoch": 0.4624005784526392, + "grad_norm": 0.33688908189145855, + "learning_rate": 9.131113686762652e-06, + "loss": 0.0184, + "step": 1279 + }, + { + "epoch": 0.462762111352133, + "grad_norm": 1.2369954517265032, + "learning_rate": 9.129429032315568e-06, + "loss": 0.0815, + "step": 1280 + }, + { + "epoch": 0.4631236442516269, + "grad_norm": 0.1462130590156436, + "learning_rate": 9.127742902011419e-06, + "loss": 0.0131, + "step": 1281 + }, + { + "epoch": 0.46348517715112075, + "grad_norm": 0.14103756285703187, + "learning_rate": 9.12605529645283e-06, + "loss": 0.0131, + "step": 1282 + }, + { + "epoch": 0.4638467100506146, + "grad_norm": 0.03509048774056504, + "learning_rate": 9.124366216242947e-06, + "loss": 0.0006, + "step": 1283 + }, + { + "epoch": 0.4642082429501085, + "grad_norm": 0.011155428974450217, + "learning_rate": 9.122675661985449e-06, + "loss": 0.0003, + "step": 1284 + }, + { + "epoch": 0.4645697758496023, + "grad_norm": 0.248642250589403, + "learning_rate": 9.120983634284539e-06, + "loss": 0.0117, + "step": 1285 + }, + { + "epoch": 0.46493130874909616, + "grad_norm": 0.11319570267491223, + "learning_rate": 9.119290133744947e-06, + "loss": 0.0103, + "step": 1286 + }, + { + "epoch": 0.46529284164859, + "grad_norm": 0.13428549703940107, + "learning_rate": 9.117595160971932e-06, + "loss": 0.0131, + "step": 1287 + }, + { + "epoch": 0.4656543745480839, + "grad_norm": 0.26290221153458804, + "learning_rate": 9.115898716571276e-06, + "loss": 0.0229, + "step": 1288 + }, + { + "epoch": 0.4660159074475777, + "grad_norm": 1.4295740588356434, + "learning_rate": 9.114200801149286e-06, + "loss": 0.0957, + "step": 1289 + }, + { + "epoch": 0.46637744034707157, + "grad_norm": 0.1640279322025806, + "learning_rate": 9.1125014153128e-06, + "loss": 0.0165, + "step": 1290 + }, + { + "epoch": 0.46673897324656544, + "grad_norm": 0.870404566199469, + "learning_rate": 9.110800559669172e-06, + "loss": 0.0525, + "step": 1291 + }, + { + "epoch": 0.4671005061460593, + "grad_norm": 1.1253362747120352, + "learning_rate": 9.109098234826295e-06, + "loss": 0.1455, + "step": 1292 + }, + { + "epoch": 0.46746203904555317, + "grad_norm": 0.014242404003511455, + "learning_rate": 9.107394441392576e-06, + "loss": 0.0004, + "step": 1293 + }, + { + "epoch": 0.467823571945047, + "grad_norm": 0.1413233454850026, + "learning_rate": 9.10568917997695e-06, + "loss": 0.0131, + "step": 1294 + }, + { + "epoch": 0.46818510484454084, + "grad_norm": 0.17475754191871784, + "learning_rate": 9.103982451188876e-06, + "loss": 0.0165, + "step": 1295 + }, + { + "epoch": 0.4685466377440347, + "grad_norm": 0.23792999068787243, + "learning_rate": 9.102274255638341e-06, + "loss": 0.0165, + "step": 1296 + }, + { + "epoch": 0.4689081706435286, + "grad_norm": 1.4488984717176268, + "learning_rate": 9.100564593935854e-06, + "loss": 0.2344, + "step": 1297 + }, + { + "epoch": 0.4692697035430224, + "grad_norm": 0.3364276090810589, + "learning_rate": 9.098853466692445e-06, + "loss": 0.0184, + "step": 1298 + }, + { + "epoch": 0.46963123644251625, + "grad_norm": 1.4457171053500444, + "learning_rate": 9.097140874519672e-06, + "loss": 0.1113, + "step": 1299 + }, + { + "epoch": 0.4699927693420101, + "grad_norm": 0.5816529362788357, + "learning_rate": 9.095426818029616e-06, + "loss": 0.0317, + "step": 1300 + }, + { + "epoch": 0.470354302241504, + "grad_norm": 0.0035180631175086197, + "learning_rate": 9.09371129783488e-06, + "loss": 0.0001, + "step": 1301 + }, + { + "epoch": 0.47071583514099785, + "grad_norm": 0.09450867455849873, + "learning_rate": 9.09199431454859e-06, + "loss": 0.0036, + "step": 1302 + }, + { + "epoch": 0.47107736804049166, + "grad_norm": 0.08391724676996345, + "learning_rate": 9.090275868784392e-06, + "loss": 0.0007, + "step": 1303 + }, + { + "epoch": 0.47143890093998553, + "grad_norm": 0.20284820490953137, + "learning_rate": 9.088555961156466e-06, + "loss": 0.0057, + "step": 1304 + }, + { + "epoch": 0.4718004338394794, + "grad_norm": 0.4398274456272215, + "learning_rate": 9.086834592279501e-06, + "loss": 0.0286, + "step": 1305 + }, + { + "epoch": 0.47216196673897326, + "grad_norm": 2.6966427727721136, + "learning_rate": 9.085111762768714e-06, + "loss": 0.2129, + "step": 1306 + }, + { + "epoch": 0.4725234996384671, + "grad_norm": 0.05647116830414856, + "learning_rate": 9.083387473239847e-06, + "loss": 0.0012, + "step": 1307 + }, + { + "epoch": 0.47288503253796094, + "grad_norm": 0.15444343524698012, + "learning_rate": 9.081661724309157e-06, + "loss": 0.004, + "step": 1308 + }, + { + "epoch": 0.4732465654374548, + "grad_norm": 0.1424169226563073, + "learning_rate": 9.07993451659343e-06, + "loss": 0.0146, + "step": 1309 + }, + { + "epoch": 0.47360809833694867, + "grad_norm": 1.1846329335884314, + "learning_rate": 9.078205850709967e-06, + "loss": 0.0889, + "step": 1310 + }, + { + "epoch": 0.47396963123644253, + "grad_norm": 0.01089804072144288, + "learning_rate": 9.076475727276592e-06, + "loss": 0.0003, + "step": 1311 + }, + { + "epoch": 0.47433116413593635, + "grad_norm": 0.2114479180657507, + "learning_rate": 9.074744146911654e-06, + "loss": 0.0104, + "step": 1312 + }, + { + "epoch": 0.4746926970354302, + "grad_norm": 3.9750177907439888, + "learning_rate": 9.073011110234017e-06, + "loss": 0.0889, + "step": 1313 + }, + { + "epoch": 0.4750542299349241, + "grad_norm": 1.301820092754917, + "learning_rate": 9.07127661786307e-06, + "loss": 0.0957, + "step": 1314 + }, + { + "epoch": 0.47541576283441794, + "grad_norm": 0.7279099797496328, + "learning_rate": 9.06954067041872e-06, + "loss": 0.1113, + "step": 1315 + }, + { + "epoch": 0.4757772957339118, + "grad_norm": 0.05050447706466068, + "learning_rate": 9.06780326852139e-06, + "loss": 0.0006, + "step": 1316 + }, + { + "epoch": 0.4761388286334056, + "grad_norm": 0.323779846928689, + "learning_rate": 9.066064412792033e-06, + "loss": 0.0104, + "step": 1317 + }, + { + "epoch": 0.4765003615328995, + "grad_norm": 0.18069976061570164, + "learning_rate": 9.06432410385211e-06, + "loss": 0.0165, + "step": 1318 + }, + { + "epoch": 0.47686189443239335, + "grad_norm": 0.37101227568530754, + "learning_rate": 9.062582342323613e-06, + "loss": 0.0258, + "step": 1319 + }, + { + "epoch": 0.4772234273318872, + "grad_norm": 1.0097622796570669, + "learning_rate": 9.060839128829044e-06, + "loss": 0.1279, + "step": 1320 + }, + { + "epoch": 0.47758496023138103, + "grad_norm": 1.0206442060564624, + "learning_rate": 9.059094463991426e-06, + "loss": 0.1553, + "step": 1321 + }, + { + "epoch": 0.4779464931308749, + "grad_norm": 0.11639425090764796, + "learning_rate": 9.057348348434304e-06, + "loss": 0.0092, + "step": 1322 + }, + { + "epoch": 0.47830802603036876, + "grad_norm": 1.8452572481396046, + "learning_rate": 9.055600782781738e-06, + "loss": 0.1934, + "step": 1323 + }, + { + "epoch": 0.4786695589298626, + "grad_norm": 1.3087862803759691, + "learning_rate": 9.053851767658309e-06, + "loss": 0.1934, + "step": 1324 + }, + { + "epoch": 0.4790310918293565, + "grad_norm": 0.010248129178286807, + "learning_rate": 9.052101303689113e-06, + "loss": 0.0001, + "step": 1325 + }, + { + "epoch": 0.4793926247288503, + "grad_norm": 0.31583261902524784, + "learning_rate": 9.050349391499766e-06, + "loss": 0.0131, + "step": 1326 + }, + { + "epoch": 0.47975415762834417, + "grad_norm": 0.7569173780529477, + "learning_rate": 9.0485960317164e-06, + "loss": 0.1641, + "step": 1327 + }, + { + "epoch": 0.48011569052783803, + "grad_norm": 0.14620801787002016, + "learning_rate": 9.046841224965671e-06, + "loss": 0.0117, + "step": 1328 + }, + { + "epoch": 0.4804772234273319, + "grad_norm": 0.23269549102596876, + "learning_rate": 9.045084971874738e-06, + "loss": 0.0255, + "step": 1329 + }, + { + "epoch": 0.48083875632682577, + "grad_norm": 1.0404078419083114, + "learning_rate": 9.043327273071292e-06, + "loss": 0.063, + "step": 1330 + }, + { + "epoch": 0.4812002892263196, + "grad_norm": 0.5840902897626555, + "learning_rate": 9.04156812918353e-06, + "loss": 0.1367, + "step": 1331 + }, + { + "epoch": 0.48156182212581344, + "grad_norm": 0.6181863702900559, + "learning_rate": 9.039807540840173e-06, + "loss": 0.1191, + "step": 1332 + }, + { + "epoch": 0.4819233550253073, + "grad_norm": 1.0972304033956888, + "learning_rate": 9.038045508670453e-06, + "loss": 0.0815, + "step": 1333 + }, + { + "epoch": 0.4822848879248012, + "grad_norm": 0.21148082776839264, + "learning_rate": 9.03628203330412e-06, + "loss": 0.0093, + "step": 1334 + }, + { + "epoch": 0.482646420824295, + "grad_norm": 0.2004188944124644, + "learning_rate": 9.034517115371442e-06, + "loss": 0.0286, + "step": 1335 + }, + { + "epoch": 0.48300795372378885, + "grad_norm": 0.215514634177018, + "learning_rate": 9.032750755503196e-06, + "loss": 0.0317, + "step": 1336 + }, + { + "epoch": 0.4833694866232827, + "grad_norm": 0.6779849101656431, + "learning_rate": 9.030982954330683e-06, + "loss": 0.0432, + "step": 1337 + }, + { + "epoch": 0.4837310195227766, + "grad_norm": 0.010007928410905415, + "learning_rate": 9.029213712485711e-06, + "loss": 0.0003, + "step": 1338 + }, + { + "epoch": 0.48409255242227045, + "grad_norm": 1.2108315793188862, + "learning_rate": 9.02744303060061e-06, + "loss": 0.0957, + "step": 1339 + }, + { + "epoch": 0.48445408532176426, + "grad_norm": 0.6306650157252662, + "learning_rate": 9.02567090930822e-06, + "loss": 0.0576, + "step": 1340 + }, + { + "epoch": 0.4848156182212581, + "grad_norm": 0.2685968103845754, + "learning_rate": 9.023897349241895e-06, + "loss": 0.0231, + "step": 1341 + }, + { + "epoch": 0.485177151120752, + "grad_norm": 0.45157023489486187, + "learning_rate": 9.022122351035507e-06, + "loss": 0.0576, + "step": 1342 + }, + { + "epoch": 0.48553868402024586, + "grad_norm": 0.5130010715495086, + "learning_rate": 9.020345915323441e-06, + "loss": 0.063, + "step": 1343 + }, + { + "epoch": 0.48590021691973967, + "grad_norm": 0.01082765109108481, + "learning_rate": 9.018568042740593e-06, + "loss": 0.0003, + "step": 1344 + }, + { + "epoch": 0.48626174981923354, + "grad_norm": 0.3293754325383362, + "learning_rate": 9.016788733922372e-06, + "loss": 0.0354, + "step": 1345 + }, + { + "epoch": 0.4866232827187274, + "grad_norm": 0.20050563340133065, + "learning_rate": 9.015007989504707e-06, + "loss": 0.0317, + "step": 1346 + }, + { + "epoch": 0.48698481561822127, + "grad_norm": 5.069741831441081, + "learning_rate": 9.013225810124032e-06, + "loss": 1.0938, + "step": 1347 + }, + { + "epoch": 0.48734634851771513, + "grad_norm": 0.19709542055059437, + "learning_rate": 9.011442196417304e-06, + "loss": 0.0317, + "step": 1348 + }, + { + "epoch": 0.48770788141720894, + "grad_norm": 0.15435111973886265, + "learning_rate": 9.00965714902198e-06, + "loss": 0.0072, + "step": 1349 + }, + { + "epoch": 0.4880694143167028, + "grad_norm": 0.48025044505526565, + "learning_rate": 9.007870668576035e-06, + "loss": 0.0525, + "step": 1350 + }, + { + "epoch": 0.4884309472161967, + "grad_norm": 0.27934379133432863, + "learning_rate": 9.00608275571796e-06, + "loss": 0.0258, + "step": 1351 + }, + { + "epoch": 0.48879248011569054, + "grad_norm": 0.311286011552607, + "learning_rate": 9.004293411086753e-06, + "loss": 0.0476, + "step": 1352 + }, + { + "epoch": 0.4891540130151844, + "grad_norm": 0.224710394858545, + "learning_rate": 9.002502635321925e-06, + "loss": 0.0229, + "step": 1353 + }, + { + "epoch": 0.4895155459146782, + "grad_norm": 0.5865633840296417, + "learning_rate": 9.000710429063503e-06, + "loss": 0.0432, + "step": 1354 + }, + { + "epoch": 0.4898770788141721, + "grad_norm": 0.23633725974805356, + "learning_rate": 8.998916792952016e-06, + "loss": 0.0354, + "step": 1355 + }, + { + "epoch": 0.49023861171366595, + "grad_norm": 0.5016676781072419, + "learning_rate": 8.99712172762851e-06, + "loss": 0.1113, + "step": 1356 + }, + { + "epoch": 0.4906001446131598, + "grad_norm": 0.21484649741546955, + "learning_rate": 8.995325233734544e-06, + "loss": 0.0286, + "step": 1357 + }, + { + "epoch": 0.4909616775126536, + "grad_norm": 0.25124016865632903, + "learning_rate": 8.993527311912182e-06, + "loss": 0.0184, + "step": 1358 + }, + { + "epoch": 0.4913232104121475, + "grad_norm": 0.2878077718369638, + "learning_rate": 8.991727962804002e-06, + "loss": 0.0206, + "step": 1359 + }, + { + "epoch": 0.49168474331164136, + "grad_norm": 0.18322502525587334, + "learning_rate": 8.98992718705309e-06, + "loss": 0.0082, + "step": 1360 + }, + { + "epoch": 0.4920462762111352, + "grad_norm": 0.3932533869084161, + "learning_rate": 8.988124985303045e-06, + "loss": 0.0391, + "step": 1361 + }, + { + "epoch": 0.4924078091106291, + "grad_norm": 0.17089760649488375, + "learning_rate": 8.98632135819797e-06, + "loss": 0.0255, + "step": 1362 + }, + { + "epoch": 0.4927693420101229, + "grad_norm": 0.9423875919875252, + "learning_rate": 8.984516306382482e-06, + "loss": 0.0688, + "step": 1363 + }, + { + "epoch": 0.49313087490961677, + "grad_norm": 0.9843489924034626, + "learning_rate": 8.982709830501708e-06, + "loss": 0.1641, + "step": 1364 + }, + { + "epoch": 0.49349240780911063, + "grad_norm": 1.1770646999584342, + "learning_rate": 8.980901931201283e-06, + "loss": 0.1455, + "step": 1365 + }, + { + "epoch": 0.4938539407086045, + "grad_norm": 0.1872657864276523, + "learning_rate": 8.979092609127344e-06, + "loss": 0.0184, + "step": 1366 + }, + { + "epoch": 0.4942154736080983, + "grad_norm": 1.4937517349794247, + "learning_rate": 8.97728186492655e-06, + "loss": 0.0752, + "step": 1367 + }, + { + "epoch": 0.4945770065075922, + "grad_norm": 0.9794383237122604, + "learning_rate": 8.975469699246055e-06, + "loss": 0.1934, + "step": 1368 + }, + { + "epoch": 0.49493853940708604, + "grad_norm": 0.5173145580187526, + "learning_rate": 8.973656112733529e-06, + "loss": 0.1113, + "step": 1369 + }, + { + "epoch": 0.4953000723065799, + "grad_norm": 0.6508898675426484, + "learning_rate": 8.971841106037148e-06, + "loss": 0.1191, + "step": 1370 + }, + { + "epoch": 0.4956616052060738, + "grad_norm": 0.29041622889557456, + "learning_rate": 8.970024679805592e-06, + "loss": 0.0103, + "step": 1371 + }, + { + "epoch": 0.4960231381055676, + "grad_norm": 0.7271197343875183, + "learning_rate": 8.968206834688052e-06, + "loss": 0.1367, + "step": 1372 + }, + { + "epoch": 0.49638467100506145, + "grad_norm": 0.21345119983333308, + "learning_rate": 8.96638757133423e-06, + "loss": 0.0206, + "step": 1373 + }, + { + "epoch": 0.4967462039045553, + "grad_norm": 0.4223705835202118, + "learning_rate": 8.964566890394325e-06, + "loss": 0.0752, + "step": 1374 + }, + { + "epoch": 0.4971077368040492, + "grad_norm": 0.36610152219108527, + "learning_rate": 8.962744792519052e-06, + "loss": 0.0391, + "step": 1375 + }, + { + "epoch": 0.49746926970354305, + "grad_norm": 0.22435518711518826, + "learning_rate": 8.960921278359624e-06, + "loss": 0.0391, + "step": 1376 + }, + { + "epoch": 0.49783080260303686, + "grad_norm": 0.2809304491185506, + "learning_rate": 8.959096348567769e-06, + "loss": 0.0476, + "step": 1377 + }, + { + "epoch": 0.4981923355025307, + "grad_norm": 0.47490693501458553, + "learning_rate": 8.957270003795713e-06, + "loss": 0.1035, + "step": 1378 + }, + { + "epoch": 0.4985538684020246, + "grad_norm": 0.4569744136406275, + "learning_rate": 8.955442244696193e-06, + "loss": 0.0206, + "step": 1379 + }, + { + "epoch": 0.49891540130151846, + "grad_norm": 0.27206984359548386, + "learning_rate": 8.953613071922451e-06, + "loss": 0.0317, + "step": 1380 + }, + { + "epoch": 0.49927693420101227, + "grad_norm": 0.6768959407885451, + "learning_rate": 8.95178248612823e-06, + "loss": 0.0688, + "step": 1381 + }, + { + "epoch": 0.49963846710050613, + "grad_norm": 0.05341547784409402, + "learning_rate": 8.949950487967782e-06, + "loss": 0.0013, + "step": 1382 + }, + { + "epoch": 0.5, + "grad_norm": 0.10729244076191469, + "learning_rate": 8.948117078095865e-06, + "loss": 0.004, + "step": 1383 + }, + { + "epoch": 0.5003615328994938, + "grad_norm": 0.3111144414154994, + "learning_rate": 8.946282257167736e-06, + "loss": 0.0317, + "step": 1384 + }, + { + "epoch": 0.5007230657989877, + "grad_norm": 0.5522304213143823, + "learning_rate": 8.944446025839161e-06, + "loss": 0.0391, + "step": 1385 + }, + { + "epoch": 0.5010845986984815, + "grad_norm": 1.076886976161399, + "learning_rate": 8.942608384766412e-06, + "loss": 0.0476, + "step": 1386 + }, + { + "epoch": 0.5014461315979755, + "grad_norm": 0.28925274679917423, + "learning_rate": 8.940769334606254e-06, + "loss": 0.0131, + "step": 1387 + }, + { + "epoch": 0.5018076644974693, + "grad_norm": 0.28144969910531226, + "learning_rate": 8.93892887601597e-06, + "loss": 0.0432, + "step": 1388 + }, + { + "epoch": 0.5021691973969631, + "grad_norm": 0.3080656649860646, + "learning_rate": 8.937087009653335e-06, + "loss": 0.0476, + "step": 1389 + }, + { + "epoch": 0.502530730296457, + "grad_norm": 1.1702407850622378, + "learning_rate": 8.935243736176636e-06, + "loss": 0.0354, + "step": 1390 + }, + { + "epoch": 0.5028922631959508, + "grad_norm": 0.39430162721777884, + "learning_rate": 8.933399056244655e-06, + "loss": 0.0258, + "step": 1391 + }, + { + "epoch": 0.5032537960954447, + "grad_norm": 0.31066226381080797, + "learning_rate": 8.931552970516681e-06, + "loss": 0.0354, + "step": 1392 + }, + { + "epoch": 0.5036153289949385, + "grad_norm": 4.194948083847059, + "learning_rate": 8.929705479652507e-06, + "loss": 0.0391, + "step": 1393 + }, + { + "epoch": 0.5039768618944324, + "grad_norm": 0.789994892942114, + "learning_rate": 8.927856584312422e-06, + "loss": 0.0288, + "step": 1394 + }, + { + "epoch": 0.5043383947939263, + "grad_norm": 0.2476754958237187, + "learning_rate": 8.926006285157223e-06, + "loss": 0.0432, + "step": 1395 + }, + { + "epoch": 0.5046999276934201, + "grad_norm": 0.5156524733857035, + "learning_rate": 8.924154582848205e-06, + "loss": 0.0815, + "step": 1396 + }, + { + "epoch": 0.5050614605929139, + "grad_norm": 0.31667759661764133, + "learning_rate": 8.922301478047171e-06, + "loss": 0.0354, + "step": 1397 + }, + { + "epoch": 0.5054229934924078, + "grad_norm": 0.9847733498140083, + "learning_rate": 8.920446971416413e-06, + "loss": 0.0889, + "step": 1398 + }, + { + "epoch": 0.5057845263919016, + "grad_norm": 0.44051646311669135, + "learning_rate": 8.918591063618735e-06, + "loss": 0.0525, + "step": 1399 + }, + { + "epoch": 0.5061460592913956, + "grad_norm": 0.27015264992722543, + "learning_rate": 8.916733755317439e-06, + "loss": 0.0432, + "step": 1400 + }, + { + "epoch": 0.5065075921908894, + "grad_norm": 0.33534011906684646, + "learning_rate": 8.914875047176325e-06, + "loss": 0.0432, + "step": 1401 + }, + { + "epoch": 0.5068691250903832, + "grad_norm": 0.019637314561378818, + "learning_rate": 8.913014939859697e-06, + "loss": 0.0006, + "step": 1402 + }, + { + "epoch": 0.5072306579898771, + "grad_norm": 0.07100045138112834, + "learning_rate": 8.911153434032354e-06, + "loss": 0.0014, + "step": 1403 + }, + { + "epoch": 0.5075921908893709, + "grad_norm": 0.1668174473276178, + "learning_rate": 8.909290530359597e-06, + "loss": 0.0065, + "step": 1404 + }, + { + "epoch": 0.5079537237888648, + "grad_norm": 0.336180307732027, + "learning_rate": 8.907426229507233e-06, + "loss": 0.0525, + "step": 1405 + }, + { + "epoch": 0.5083152566883586, + "grad_norm": 0.7776228568283378, + "learning_rate": 8.905560532141556e-06, + "loss": 0.0752, + "step": 1406 + }, + { + "epoch": 0.5086767895878525, + "grad_norm": 0.3510668100131032, + "learning_rate": 8.903693438929371e-06, + "loss": 0.0476, + "step": 1407 + }, + { + "epoch": 0.5090383224873464, + "grad_norm": 1.0187641013236655, + "learning_rate": 8.901824950537975e-06, + "loss": 0.1367, + "step": 1408 + }, + { + "epoch": 0.5093998553868402, + "grad_norm": 0.30245718971877955, + "learning_rate": 8.899955067635164e-06, + "loss": 0.0432, + "step": 1409 + }, + { + "epoch": 0.5097613882863341, + "grad_norm": 0.29652106088775015, + "learning_rate": 8.898083790889236e-06, + "loss": 0.0286, + "step": 1410 + }, + { + "epoch": 0.5101229211858279, + "grad_norm": 0.3463902777424068, + "learning_rate": 8.896211120968983e-06, + "loss": 0.0391, + "step": 1411 + }, + { + "epoch": 0.5104844540853217, + "grad_norm": 0.5682247916258633, + "learning_rate": 8.894337058543699e-06, + "loss": 0.0476, + "step": 1412 + }, + { + "epoch": 0.5108459869848156, + "grad_norm": 0.5652969419946142, + "learning_rate": 8.892461604283169e-06, + "loss": 0.0889, + "step": 1413 + }, + { + "epoch": 0.5112075198843095, + "grad_norm": 0.31110714373255777, + "learning_rate": 8.890584758857685e-06, + "loss": 0.0432, + "step": 1414 + }, + { + "epoch": 0.5115690527838034, + "grad_norm": 0.02862548143174699, + "learning_rate": 8.88870652293803e-06, + "loss": 0.0006, + "step": 1415 + }, + { + "epoch": 0.5119305856832972, + "grad_norm": 0.27072377471941594, + "learning_rate": 8.886826897195484e-06, + "loss": 0.0354, + "step": 1416 + }, + { + "epoch": 0.512292118582791, + "grad_norm": 0.014059408887437444, + "learning_rate": 8.884945882301825e-06, + "loss": 0.0004, + "step": 1417 + }, + { + "epoch": 0.5126536514822849, + "grad_norm": 0.05953954102513866, + "learning_rate": 8.883063478929327e-06, + "loss": 0.002, + "step": 1418 + }, + { + "epoch": 0.5130151843817787, + "grad_norm": 0.4674033238839596, + "learning_rate": 8.881179687750761e-06, + "loss": 0.0286, + "step": 1419 + }, + { + "epoch": 0.5133767172812725, + "grad_norm": 0.31032130212631687, + "learning_rate": 8.879294509439394e-06, + "loss": 0.0206, + "step": 1420 + }, + { + "epoch": 0.5137382501807665, + "grad_norm": 0.08294816053151526, + "learning_rate": 8.877407944668988e-06, + "loss": 0.0019, + "step": 1421 + }, + { + "epoch": 0.5140997830802603, + "grad_norm": 0.9277879074351228, + "learning_rate": 8.875519994113802e-06, + "loss": 0.063, + "step": 1422 + }, + { + "epoch": 0.5144613159797542, + "grad_norm": 0.16220331602042012, + "learning_rate": 8.873630658448586e-06, + "loss": 0.0146, + "step": 1423 + }, + { + "epoch": 0.514822848879248, + "grad_norm": 0.3314502938500384, + "learning_rate": 8.871739938348591e-06, + "loss": 0.0286, + "step": 1424 + }, + { + "epoch": 0.5151843817787418, + "grad_norm": 0.9780045947372844, + "learning_rate": 8.86984783448956e-06, + "loss": 0.0889, + "step": 1425 + }, + { + "epoch": 0.5155459146782357, + "grad_norm": 0.021152053951261716, + "learning_rate": 8.86795434754773e-06, + "loss": 0.0002, + "step": 1426 + }, + { + "epoch": 0.5159074475777295, + "grad_norm": 1.0338913333562356, + "learning_rate": 8.86605947819983e-06, + "loss": 0.0957, + "step": 1427 + }, + { + "epoch": 0.5162689804772235, + "grad_norm": 0.02338481475708954, + "learning_rate": 8.86416322712309e-06, + "loss": 0.0008, + "step": 1428 + }, + { + "epoch": 0.5166305133767173, + "grad_norm": 0.014330783069416781, + "learning_rate": 8.862265594995227e-06, + "loss": 0.0004, + "step": 1429 + }, + { + "epoch": 0.5169920462762111, + "grad_norm": 0.31556731940837063, + "learning_rate": 8.860366582494457e-06, + "loss": 0.0092, + "step": 1430 + }, + { + "epoch": 0.517353579175705, + "grad_norm": 0.9230229167490274, + "learning_rate": 8.858466190299486e-06, + "loss": 0.1113, + "step": 1431 + }, + { + "epoch": 0.5177151120751988, + "grad_norm": 0.8246623367886754, + "learning_rate": 8.856564419089511e-06, + "loss": 0.1113, + "step": 1432 + }, + { + "epoch": 0.5180766449746927, + "grad_norm": 2.1394374189083085, + "learning_rate": 8.854661269544227e-06, + "loss": 0.1367, + "step": 1433 + }, + { + "epoch": 0.5184381778741866, + "grad_norm": 0.6543591273984968, + "learning_rate": 8.852756742343818e-06, + "loss": 0.063, + "step": 1434 + }, + { + "epoch": 0.5187997107736804, + "grad_norm": 0.23936684988181312, + "learning_rate": 8.850850838168963e-06, + "loss": 0.0117, + "step": 1435 + }, + { + "epoch": 0.5191612436731743, + "grad_norm": 0.18448366018784543, + "learning_rate": 8.848943557700831e-06, + "loss": 0.0065, + "step": 1436 + }, + { + "epoch": 0.5195227765726681, + "grad_norm": 0.9640069035129124, + "learning_rate": 8.847034901621083e-06, + "loss": 0.0693, + "step": 1437 + }, + { + "epoch": 0.519884309472162, + "grad_norm": 0.3583756754937636, + "learning_rate": 8.845124870611875e-06, + "loss": 0.0391, + "step": 1438 + }, + { + "epoch": 0.5202458423716558, + "grad_norm": 0.21039777653296557, + "learning_rate": 8.843213465355848e-06, + "loss": 0.0286, + "step": 1439 + }, + { + "epoch": 0.5206073752711496, + "grad_norm": 0.07171860128520706, + "learning_rate": 8.841300686536141e-06, + "loss": 0.0015, + "step": 1440 + }, + { + "epoch": 0.5209689081706436, + "grad_norm": 0.2621688523278279, + "learning_rate": 8.839386534836378e-06, + "loss": 0.0105, + "step": 1441 + }, + { + "epoch": 0.5213304410701374, + "grad_norm": 1.0078016471591418, + "learning_rate": 8.837471010940678e-06, + "loss": 0.0688, + "step": 1442 + }, + { + "epoch": 0.5216919739696312, + "grad_norm": 0.18759117690330987, + "learning_rate": 8.835554115533649e-06, + "loss": 0.0205, + "step": 1443 + }, + { + "epoch": 0.5220535068691251, + "grad_norm": 0.016752244219866378, + "learning_rate": 8.833635849300389e-06, + "loss": 0.0005, + "step": 1444 + }, + { + "epoch": 0.5224150397686189, + "grad_norm": 0.4523686505286018, + "learning_rate": 8.831716212926484e-06, + "loss": 0.0317, + "step": 1445 + }, + { + "epoch": 0.5227765726681128, + "grad_norm": 0.6204957414413947, + "learning_rate": 8.829795207098013e-06, + "loss": 0.0476, + "step": 1446 + }, + { + "epoch": 0.5231381055676066, + "grad_norm": 0.0008346320477224026, + "learning_rate": 8.827872832501545e-06, + "loss": 0.0, + "step": 1447 + }, + { + "epoch": 0.5234996384671005, + "grad_norm": 8.212917595906717, + "learning_rate": 8.825949089824133e-06, + "loss": 1.375, + "step": 1448 + }, + { + "epoch": 0.5238611713665944, + "grad_norm": 0.29454264557453924, + "learning_rate": 8.824023979753325e-06, + "loss": 0.0317, + "step": 1449 + }, + { + "epoch": 0.5242227042660882, + "grad_norm": 0.21127499564977473, + "learning_rate": 8.822097502977153e-06, + "loss": 0.0229, + "step": 1450 + }, + { + "epoch": 0.5245842371655821, + "grad_norm": 1.3746929162482553, + "learning_rate": 8.820169660184141e-06, + "loss": 0.063, + "step": 1451 + }, + { + "epoch": 0.5249457700650759, + "grad_norm": 0.015818527218787674, + "learning_rate": 8.818240452063297e-06, + "loss": 0.0003, + "step": 1452 + }, + { + "epoch": 0.5253073029645697, + "grad_norm": 0.6300016262558501, + "learning_rate": 8.816309879304122e-06, + "loss": 0.0393, + "step": 1453 + }, + { + "epoch": 0.5256688358640637, + "grad_norm": 0.2725379319224233, + "learning_rate": 8.814377942596602e-06, + "loss": 0.0184, + "step": 1454 + }, + { + "epoch": 0.5260303687635575, + "grad_norm": 0.3026518703462417, + "learning_rate": 8.812444642631208e-06, + "loss": 0.0231, + "step": 1455 + }, + { + "epoch": 0.5263919016630514, + "grad_norm": 0.4545493593663851, + "learning_rate": 8.810509980098907e-06, + "loss": 0.0391, + "step": 1456 + }, + { + "epoch": 0.5267534345625452, + "grad_norm": 0.16201199474311942, + "learning_rate": 8.80857395569114e-06, + "loss": 0.0146, + "step": 1457 + }, + { + "epoch": 0.527114967462039, + "grad_norm": 0.17091276424255475, + "learning_rate": 8.806636570099847e-06, + "loss": 0.0165, + "step": 1458 + }, + { + "epoch": 0.5274765003615329, + "grad_norm": 0.1116749629034277, + "learning_rate": 8.804697824017447e-06, + "loss": 0.0082, + "step": 1459 + }, + { + "epoch": 0.5278380332610267, + "grad_norm": 0.07268944368345294, + "learning_rate": 8.802757718136846e-06, + "loss": 0.0015, + "step": 1460 + }, + { + "epoch": 0.5281995661605207, + "grad_norm": 0.21824720640643658, + "learning_rate": 8.80081625315144e-06, + "loss": 0.0184, + "step": 1461 + }, + { + "epoch": 0.5285610990600145, + "grad_norm": 0.2081614975027287, + "learning_rate": 8.798873429755108e-06, + "loss": 0.0184, + "step": 1462 + }, + { + "epoch": 0.5289226319595083, + "grad_norm": 0.25326711435450877, + "learning_rate": 8.796929248642213e-06, + "loss": 0.0131, + "step": 1463 + }, + { + "epoch": 0.5292841648590022, + "grad_norm": 0.08704782408860468, + "learning_rate": 8.794983710507607e-06, + "loss": 0.0064, + "step": 1464 + }, + { + "epoch": 0.529645697758496, + "grad_norm": 0.2232621689548058, + "learning_rate": 8.793036816046622e-06, + "loss": 0.0065, + "step": 1465 + }, + { + "epoch": 0.5300072306579898, + "grad_norm": 0.09069305944040651, + "learning_rate": 8.79108856595508e-06, + "loss": 0.0072, + "step": 1466 + }, + { + "epoch": 0.5303687635574837, + "grad_norm": 1.508210289296832, + "learning_rate": 8.789138960929286e-06, + "loss": 0.0688, + "step": 1467 + }, + { + "epoch": 0.5307302964569776, + "grad_norm": 0.5621729378549136, + "learning_rate": 8.787188001666027e-06, + "loss": 0.0317, + "step": 1468 + }, + { + "epoch": 0.5310918293564715, + "grad_norm": 1.4018522747000763, + "learning_rate": 8.785235688862574e-06, + "loss": 0.2129, + "step": 1469 + }, + { + "epoch": 0.5314533622559653, + "grad_norm": 0.05893289716956671, + "learning_rate": 8.783282023216685e-06, + "loss": 0.0045, + "step": 1470 + }, + { + "epoch": 0.5318148951554591, + "grad_norm": 0.01196283154819781, + "learning_rate": 8.7813270054266e-06, + "loss": 0.0003, + "step": 1471 + }, + { + "epoch": 0.532176428054953, + "grad_norm": 0.13627385130667388, + "learning_rate": 8.77937063619104e-06, + "loss": 0.0092, + "step": 1472 + }, + { + "epoch": 0.5325379609544468, + "grad_norm": 0.4649486738764149, + "learning_rate": 8.777412916209214e-06, + "loss": 0.0165, + "step": 1473 + }, + { + "epoch": 0.5328994938539408, + "grad_norm": 0.12686438660118893, + "learning_rate": 8.775453846180807e-06, + "loss": 0.0082, + "step": 1474 + }, + { + "epoch": 0.5332610267534346, + "grad_norm": 0.003918996310863843, + "learning_rate": 8.773493426805993e-06, + "loss": 0.0002, + "step": 1475 + }, + { + "epoch": 0.5336225596529284, + "grad_norm": 0.007480889156315746, + "learning_rate": 8.771531658785425e-06, + "loss": 0.0002, + "step": 1476 + }, + { + "epoch": 0.5339840925524223, + "grad_norm": 1.5140279931281977, + "learning_rate": 8.769568542820238e-06, + "loss": 0.063, + "step": 1477 + }, + { + "epoch": 0.5343456254519161, + "grad_norm": 0.24960114706100653, + "learning_rate": 8.767604079612049e-06, + "loss": 0.0131, + "step": 1478 + }, + { + "epoch": 0.53470715835141, + "grad_norm": 0.20225411123937498, + "learning_rate": 8.765638269862957e-06, + "loss": 0.0117, + "step": 1479 + }, + { + "epoch": 0.5350686912509038, + "grad_norm": 0.07839444608231055, + "learning_rate": 8.763671114275542e-06, + "loss": 0.0057, + "step": 1480 + }, + { + "epoch": 0.5354302241503976, + "grad_norm": 0.12167010809656519, + "learning_rate": 8.761702613552866e-06, + "loss": 0.0081, + "step": 1481 + }, + { + "epoch": 0.5357917570498916, + "grad_norm": 0.24010724853498264, + "learning_rate": 8.759732768398468e-06, + "loss": 0.0131, + "step": 1482 + }, + { + "epoch": 0.5361532899493854, + "grad_norm": 0.0009150990114448551, + "learning_rate": 8.757761579516372e-06, + "loss": 0.0, + "step": 1483 + }, + { + "epoch": 0.5365148228488793, + "grad_norm": 0.12344957238472135, + "learning_rate": 8.755789047611083e-06, + "loss": 0.0064, + "step": 1484 + }, + { + "epoch": 0.5368763557483731, + "grad_norm": 0.3962821796627426, + "learning_rate": 8.75381517338758e-06, + "loss": 0.0051, + "step": 1485 + }, + { + "epoch": 0.5372378886478669, + "grad_norm": 0.39049004858528646, + "learning_rate": 8.751839957551326e-06, + "loss": 0.0184, + "step": 1486 + }, + { + "epoch": 0.5375994215473608, + "grad_norm": 0.07289313331148914, + "learning_rate": 8.749863400808263e-06, + "loss": 0.0016, + "step": 1487 + }, + { + "epoch": 0.5379609544468547, + "grad_norm": 0.19041256400967735, + "learning_rate": 8.74788550386481e-06, + "loss": 0.0117, + "step": 1488 + }, + { + "epoch": 0.5383224873463485, + "grad_norm": 0.33646251966618473, + "learning_rate": 8.745906267427871e-06, + "loss": 0.0229, + "step": 1489 + }, + { + "epoch": 0.5386840202458424, + "grad_norm": 0.006913078072523653, + "learning_rate": 8.743925692204823e-06, + "loss": 0.0002, + "step": 1490 + }, + { + "epoch": 0.5390455531453362, + "grad_norm": 0.18541039410618607, + "learning_rate": 8.741943778903522e-06, + "loss": 0.0081, + "step": 1491 + }, + { + "epoch": 0.5394070860448301, + "grad_norm": 2.7931632923355325, + "learning_rate": 8.739960528232302e-06, + "loss": 0.2773, + "step": 1492 + }, + { + "epoch": 0.5397686189443239, + "grad_norm": 0.043977753409105536, + "learning_rate": 8.737975940899981e-06, + "loss": 0.0024, + "step": 1493 + }, + { + "epoch": 0.5401301518438177, + "grad_norm": 0.05147907738532659, + "learning_rate": 8.735990017615848e-06, + "loss": 0.0031, + "step": 1494 + }, + { + "epoch": 0.5404916847433117, + "grad_norm": 0.11079153423583268, + "learning_rate": 8.73400275908967e-06, + "loss": 0.0072, + "step": 1495 + }, + { + "epoch": 0.5408532176428055, + "grad_norm": 0.003410328281438532, + "learning_rate": 8.732014166031694e-06, + "loss": 0.0001, + "step": 1496 + }, + { + "epoch": 0.5412147505422994, + "grad_norm": 0.0004869058319844442, + "learning_rate": 8.730024239152643e-06, + "loss": 0.0, + "step": 1497 + }, + { + "epoch": 0.5415762834417932, + "grad_norm": 0.0008728321705151489, + "learning_rate": 8.728032979163717e-06, + "loss": 0.0, + "step": 1498 + }, + { + "epoch": 0.541937816341287, + "grad_norm": 0.04233831577529851, + "learning_rate": 8.72604038677659e-06, + "loss": 0.0027, + "step": 1499 + }, + { + "epoch": 0.5422993492407809, + "grad_norm": 1.1566844503314735, + "learning_rate": 8.724046462703413e-06, + "loss": 0.0354, + "step": 1500 + }, + { + "epoch": 0.5426608821402747, + "grad_norm": 0.0007756807989353834, + "learning_rate": 8.722051207656819e-06, + "loss": 0.0, + "step": 1501 + }, + { + "epoch": 0.5430224150397687, + "grad_norm": 0.03539682538698138, + "learning_rate": 8.720054622349906e-06, + "loss": 0.0015, + "step": 1502 + }, + { + "epoch": 0.5433839479392625, + "grad_norm": 0.03307572514608805, + "learning_rate": 8.71805670749626e-06, + "loss": 0.0017, + "step": 1503 + }, + { + "epoch": 0.5437454808387563, + "grad_norm": 0.0027581695046507313, + "learning_rate": 8.716057463809928e-06, + "loss": 0.0001, + "step": 1504 + }, + { + "epoch": 0.5441070137382502, + "grad_norm": 0.0016909953426555747, + "learning_rate": 8.714056892005445e-06, + "loss": 0.0001, + "step": 1505 + }, + { + "epoch": 0.544468546637744, + "grad_norm": 3.5622490590467315, + "learning_rate": 8.712054992797812e-06, + "loss": 0.1035, + "step": 1506 + }, + { + "epoch": 0.5448300795372379, + "grad_norm": 1.7720943555784545, + "learning_rate": 8.710051766902508e-06, + "loss": 0.377, + "step": 1507 + }, + { + "epoch": 0.5451916124367318, + "grad_norm": 0.016479686490750524, + "learning_rate": 8.708047215035484e-06, + "loss": 0.0003, + "step": 1508 + }, + { + "epoch": 0.5455531453362256, + "grad_norm": 1.5325344432655144, + "learning_rate": 8.706041337913169e-06, + "loss": 0.0354, + "step": 1509 + }, + { + "epoch": 0.5459146782357195, + "grad_norm": 0.02869488108706025, + "learning_rate": 8.704034136252463e-06, + "loss": 0.0017, + "step": 1510 + }, + { + "epoch": 0.5462762111352133, + "grad_norm": 0.003107387351716314, + "learning_rate": 8.702025610770738e-06, + "loss": 0.0001, + "step": 1511 + }, + { + "epoch": 0.5466377440347071, + "grad_norm": 1.6162636310494087, + "learning_rate": 8.700015762185839e-06, + "loss": 0.3535, + "step": 1512 + }, + { + "epoch": 0.546999276934201, + "grad_norm": 0.07029645635236773, + "learning_rate": 8.69800459121609e-06, + "loss": 0.0045, + "step": 1513 + }, + { + "epoch": 0.5473608098336948, + "grad_norm": 0.1252521295186626, + "learning_rate": 8.695992098580279e-06, + "loss": 0.0064, + "step": 1514 + }, + { + "epoch": 0.5477223427331888, + "grad_norm": 0.24861728953643394, + "learning_rate": 8.693978284997672e-06, + "loss": 0.0103, + "step": 1515 + }, + { + "epoch": 0.5480838756326826, + "grad_norm": 1.1723075872102515, + "learning_rate": 8.691963151188005e-06, + "loss": 0.2461, + "step": 1516 + }, + { + "epoch": 0.5484454085321764, + "grad_norm": 0.0055761349962802, + "learning_rate": 8.68994669787149e-06, + "loss": 0.0002, + "step": 1517 + }, + { + "epoch": 0.5488069414316703, + "grad_norm": 0.7280674602604561, + "learning_rate": 8.6879289257688e-06, + "loss": 0.0317, + "step": 1518 + }, + { + "epoch": 0.5491684743311641, + "grad_norm": 0.47601121438385907, + "learning_rate": 8.685909835601091e-06, + "loss": 0.0258, + "step": 1519 + }, + { + "epoch": 0.549530007230658, + "grad_norm": 1.1338016749545967, + "learning_rate": 8.683889428089987e-06, + "loss": 0.1191, + "step": 1520 + }, + { + "epoch": 0.5498915401301518, + "grad_norm": 0.0008827919013345546, + "learning_rate": 8.681867703957577e-06, + "loss": 0.0, + "step": 1521 + }, + { + "epoch": 0.5502530730296457, + "grad_norm": 1.2118078479191279, + "learning_rate": 8.679844663926426e-06, + "loss": 0.1934, + "step": 1522 + }, + { + "epoch": 0.5506146059291396, + "grad_norm": 1.2740508855718975, + "learning_rate": 8.677820308719572e-06, + "loss": 0.1934, + "step": 1523 + }, + { + "epoch": 0.5509761388286334, + "grad_norm": 0.0032338395937144306, + "learning_rate": 8.675794639060513e-06, + "loss": 0.0001, + "step": 1524 + }, + { + "epoch": 0.5513376717281273, + "grad_norm": 0.2529741483793497, + "learning_rate": 8.673767655673227e-06, + "loss": 0.0255, + "step": 1525 + }, + { + "epoch": 0.5516992046276211, + "grad_norm": 0.7227027332826184, + "learning_rate": 8.671739359282154e-06, + "loss": 0.1455, + "step": 1526 + }, + { + "epoch": 0.5520607375271149, + "grad_norm": 0.7451894471400534, + "learning_rate": 8.66970975061221e-06, + "loss": 0.1191, + "step": 1527 + }, + { + "epoch": 0.5524222704266089, + "grad_norm": 0.22788986178239448, + "learning_rate": 8.667678830388777e-06, + "loss": 0.0206, + "step": 1528 + }, + { + "epoch": 0.5527838033261027, + "grad_norm": 0.23527613611428053, + "learning_rate": 8.665646599337703e-06, + "loss": 0.0255, + "step": 1529 + }, + { + "epoch": 0.5531453362255966, + "grad_norm": 0.6363904701563388, + "learning_rate": 8.663613058185307e-06, + "loss": 0.1279, + "step": 1530 + }, + { + "epoch": 0.5535068691250904, + "grad_norm": 0.23268024489072966, + "learning_rate": 8.661578207658379e-06, + "loss": 0.0286, + "step": 1531 + }, + { + "epoch": 0.5538684020245842, + "grad_norm": 0.383936870130971, + "learning_rate": 8.659542048484172e-06, + "loss": 0.0354, + "step": 1532 + }, + { + "epoch": 0.5542299349240781, + "grad_norm": 0.004125517153023074, + "learning_rate": 8.657504581390409e-06, + "loss": 0.0001, + "step": 1533 + }, + { + "epoch": 0.5545914678235719, + "grad_norm": 0.2965870887019476, + "learning_rate": 8.655465807105279e-06, + "loss": 0.0391, + "step": 1534 + }, + { + "epoch": 0.5549530007230657, + "grad_norm": 1.6899726522852374, + "learning_rate": 8.65342572635744e-06, + "loss": 0.0889, + "step": 1535 + }, + { + "epoch": 0.5553145336225597, + "grad_norm": 3.5793352485054024, + "learning_rate": 8.651384339876022e-06, + "loss": 0.2148, + "step": 1536 + }, + { + "epoch": 0.5556760665220535, + "grad_norm": 0.0006572617324386877, + "learning_rate": 8.649341648390608e-06, + "loss": 0.0, + "step": 1537 + }, + { + "epoch": 0.5560375994215474, + "grad_norm": 0.2581258165414333, + "learning_rate": 8.64729765263126e-06, + "loss": 0.0317, + "step": 1538 + }, + { + "epoch": 0.5563991323210412, + "grad_norm": 0.2576966062114656, + "learning_rate": 8.645252353328502e-06, + "loss": 0.0206, + "step": 1539 + }, + { + "epoch": 0.556760665220535, + "grad_norm": 3.923107438188765, + "learning_rate": 8.64320575121332e-06, + "loss": 0.8906, + "step": 1540 + }, + { + "epoch": 0.5571221981200289, + "grad_norm": 0.8007354823712649, + "learning_rate": 8.641157847017172e-06, + "loss": 0.0432, + "step": 1541 + }, + { + "epoch": 0.5574837310195228, + "grad_norm": 24.09893043328695, + "learning_rate": 8.639108641471978e-06, + "loss": 0.4941, + "step": 1542 + }, + { + "epoch": 0.5578452639190167, + "grad_norm": 0.3998691443112529, + "learning_rate": 8.637058135310124e-06, + "loss": 0.0117, + "step": 1543 + }, + { + "epoch": 0.5582067968185105, + "grad_norm": 1.4280938885111878, + "learning_rate": 8.635006329264459e-06, + "loss": 0.0752, + "step": 1544 + }, + { + "epoch": 0.5585683297180043, + "grad_norm": 0.17061662126699817, + "learning_rate": 8.6329532240683e-06, + "loss": 0.0146, + "step": 1545 + }, + { + "epoch": 0.5589298626174982, + "grad_norm": 0.3854062714939839, + "learning_rate": 8.630898820455425e-06, + "loss": 0.0391, + "step": 1546 + }, + { + "epoch": 0.559291395516992, + "grad_norm": 2.5459536481885534, + "learning_rate": 8.628843119160079e-06, + "loss": 0.1191, + "step": 1547 + }, + { + "epoch": 0.559652928416486, + "grad_norm": 0.8958760916700867, + "learning_rate": 8.626786120916967e-06, + "loss": 0.0889, + "step": 1548 + }, + { + "epoch": 0.5600144613159798, + "grad_norm": 0.33449803006893253, + "learning_rate": 8.62472782646126e-06, + "loss": 0.0391, + "step": 1549 + }, + { + "epoch": 0.5603759942154736, + "grad_norm": 4.030516381475724, + "learning_rate": 8.622668236528594e-06, + "loss": 0.2344, + "step": 1550 + }, + { + "epoch": 0.5607375271149675, + "grad_norm": 0.84463358168154, + "learning_rate": 8.620607351855065e-06, + "loss": 0.1455, + "step": 1551 + }, + { + "epoch": 0.5610990600144613, + "grad_norm": 0.6154599511192748, + "learning_rate": 8.618545173177231e-06, + "loss": 0.0231, + "step": 1552 + }, + { + "epoch": 0.5614605929139552, + "grad_norm": 0.9808287460989791, + "learning_rate": 8.616481701232118e-06, + "loss": 0.0693, + "step": 1553 + }, + { + "epoch": 0.561822125813449, + "grad_norm": 1.972953702804021, + "learning_rate": 8.614416936757206e-06, + "loss": 0.0752, + "step": 1554 + }, + { + "epoch": 0.5621836587129428, + "grad_norm": 1.0443764982184132, + "learning_rate": 8.612350880490446e-06, + "loss": 0.063, + "step": 1555 + }, + { + "epoch": 0.5625451916124368, + "grad_norm": 0.550354352662479, + "learning_rate": 8.61028353317024e-06, + "loss": 0.1035, + "step": 1556 + }, + { + "epoch": 0.5629067245119306, + "grad_norm": 1.2779816984078571, + "learning_rate": 8.608214895535465e-06, + "loss": 0.1553, + "step": 1557 + }, + { + "epoch": 0.5632682574114244, + "grad_norm": 0.28207156135189815, + "learning_rate": 8.606144968325445e-06, + "loss": 0.0432, + "step": 1558 + }, + { + "epoch": 0.5636297903109183, + "grad_norm": 0.27851406020407127, + "learning_rate": 8.604073752279975e-06, + "loss": 0.0391, + "step": 1559 + }, + { + "epoch": 0.5639913232104121, + "grad_norm": 0.002099161245643818, + "learning_rate": 8.602001248139308e-06, + "loss": 0.0001, + "step": 1560 + }, + { + "epoch": 0.564352856109906, + "grad_norm": 0.4450521696522388, + "learning_rate": 8.599927456644155e-06, + "loss": 0.0231, + "step": 1561 + }, + { + "epoch": 0.5647143890093999, + "grad_norm": 0.2604311046457349, + "learning_rate": 8.597852378535687e-06, + "loss": 0.0354, + "step": 1562 + }, + { + "epoch": 0.5650759219088937, + "grad_norm": 0.002421823818056419, + "learning_rate": 8.595776014555539e-06, + "loss": 0.0001, + "step": 1563 + }, + { + "epoch": 0.5654374548083876, + "grad_norm": 0.2895447219821541, + "learning_rate": 8.593698365445802e-06, + "loss": 0.0432, + "step": 1564 + }, + { + "epoch": 0.5657989877078814, + "grad_norm": 2.0396674754484216, + "learning_rate": 8.591619431949028e-06, + "loss": 0.1367, + "step": 1565 + }, + { + "epoch": 0.5661605206073753, + "grad_norm": 0.035951342297065786, + "learning_rate": 8.589539214808228e-06, + "loss": 0.0009, + "step": 1566 + }, + { + "epoch": 0.5665220535068691, + "grad_norm": 0.27232649598357644, + "learning_rate": 8.587457714766868e-06, + "loss": 0.0206, + "step": 1567 + }, + { + "epoch": 0.5668835864063629, + "grad_norm": 0.44575280631512937, + "learning_rate": 8.58537493256888e-06, + "loss": 0.0957, + "step": 1568 + }, + { + "epoch": 0.5672451193058569, + "grad_norm": 0.00925396758608355, + "learning_rate": 8.583290868958646e-06, + "loss": 0.0002, + "step": 1569 + }, + { + "epoch": 0.5676066522053507, + "grad_norm": 0.4535025121973245, + "learning_rate": 8.581205524681012e-06, + "loss": 0.0131, + "step": 1570 + }, + { + "epoch": 0.5679681851048446, + "grad_norm": 0.5028773857569707, + "learning_rate": 8.57911890048128e-06, + "loss": 0.0479, + "step": 1571 + }, + { + "epoch": 0.5683297180043384, + "grad_norm": 0.49097006397243, + "learning_rate": 8.577030997105206e-06, + "loss": 0.0957, + "step": 1572 + }, + { + "epoch": 0.5686912509038322, + "grad_norm": 0.38018068961598844, + "learning_rate": 8.574941815299012e-06, + "loss": 0.0391, + "step": 1573 + }, + { + "epoch": 0.5690527838033261, + "grad_norm": 0.010322642733800564, + "learning_rate": 8.572851355809366e-06, + "loss": 0.0002, + "step": 1574 + }, + { + "epoch": 0.56941431670282, + "grad_norm": 0.4829480980593988, + "learning_rate": 8.5707596193834e-06, + "loss": 0.0206, + "step": 1575 + }, + { + "epoch": 0.5697758496023138, + "grad_norm": 0.26544218843842965, + "learning_rate": 8.5686666067687e-06, + "loss": 0.0317, + "step": 1576 + }, + { + "epoch": 0.5701373825018077, + "grad_norm": 0.8785139267352867, + "learning_rate": 8.566572318713306e-06, + "loss": 0.1279, + "step": 1577 + }, + { + "epoch": 0.5704989154013015, + "grad_norm": 0.1708270914656964, + "learning_rate": 8.564476755965718e-06, + "loss": 0.0002, + "step": 1578 + }, + { + "epoch": 0.5708604483007954, + "grad_norm": 0.22593816172246384, + "learning_rate": 8.562379919274892e-06, + "loss": 0.0391, + "step": 1579 + }, + { + "epoch": 0.5712219812002892, + "grad_norm": 0.20698514984355337, + "learning_rate": 8.560281809390232e-06, + "loss": 0.0354, + "step": 1580 + }, + { + "epoch": 0.571583514099783, + "grad_norm": 0.7421391972953817, + "learning_rate": 8.558182427061606e-06, + "loss": 0.0258, + "step": 1581 + }, + { + "epoch": 0.571945046999277, + "grad_norm": 0.014457491611290254, + "learning_rate": 8.556081773039333e-06, + "loss": 0.0003, + "step": 1582 + }, + { + "epoch": 0.5723065798987708, + "grad_norm": 0.27581889148569894, + "learning_rate": 8.553979848074182e-06, + "loss": 0.0082, + "step": 1583 + }, + { + "epoch": 0.5726681127982647, + "grad_norm": 0.44800918863302197, + "learning_rate": 8.551876652917385e-06, + "loss": 0.0432, + "step": 1584 + }, + { + "epoch": 0.5730296456977585, + "grad_norm": 0.17728601355714457, + "learning_rate": 8.54977218832062e-06, + "loss": 0.0146, + "step": 1585 + }, + { + "epoch": 0.5733911785972523, + "grad_norm": 0.2715768053447013, + "learning_rate": 8.547666455036026e-06, + "loss": 0.0317, + "step": 1586 + }, + { + "epoch": 0.5737527114967462, + "grad_norm": 1.100725332003606, + "learning_rate": 8.54555945381619e-06, + "loss": 0.063, + "step": 1587 + }, + { + "epoch": 0.57411424439624, + "grad_norm": 1.0801161854946735, + "learning_rate": 8.543451185414152e-06, + "loss": 0.063, + "step": 1588 + }, + { + "epoch": 0.574475777295734, + "grad_norm": 0.0014593902130948524, + "learning_rate": 8.541341650583407e-06, + "loss": 0.0001, + "step": 1589 + }, + { + "epoch": 0.5748373101952278, + "grad_norm": 0.6653005456944524, + "learning_rate": 8.539230850077907e-06, + "loss": 0.0815, + "step": 1590 + }, + { + "epoch": 0.5751988430947216, + "grad_norm": 0.06664197441877107, + "learning_rate": 8.537118784652045e-06, + "loss": 0.0017, + "step": 1591 + }, + { + "epoch": 0.5755603759942155, + "grad_norm": 1.323509827610026, + "learning_rate": 8.535005455060678e-06, + "loss": 0.0579, + "step": 1592 + }, + { + "epoch": 0.5759219088937093, + "grad_norm": 0.24579730475012376, + "learning_rate": 8.532890862059107e-06, + "loss": 0.0391, + "step": 1593 + }, + { + "epoch": 0.5762834417932032, + "grad_norm": 0.52860288324456, + "learning_rate": 8.530775006403088e-06, + "loss": 0.0957, + "step": 1594 + }, + { + "epoch": 0.576644974692697, + "grad_norm": 0.2893013843639603, + "learning_rate": 8.528657888848823e-06, + "loss": 0.0432, + "step": 1595 + }, + { + "epoch": 0.5770065075921909, + "grad_norm": 0.22663757645658644, + "learning_rate": 8.526539510152978e-06, + "loss": 0.0317, + "step": 1596 + }, + { + "epoch": 0.5773680404916848, + "grad_norm": 3.5076433136731273, + "learning_rate": 8.524419871072653e-06, + "loss": 0.2891, + "step": 1597 + }, + { + "epoch": 0.5777295733911786, + "grad_norm": 0.0831887678915732, + "learning_rate": 8.522298972365411e-06, + "loss": 0.0022, + "step": 1598 + }, + { + "epoch": 0.5780911062906724, + "grad_norm": 0.3508191987297041, + "learning_rate": 8.52017681478926e-06, + "loss": 0.0258, + "step": 1599 + }, + { + "epoch": 0.5784526391901663, + "grad_norm": 0.1527084333012361, + "learning_rate": 8.518053399102659e-06, + "loss": 0.004, + "step": 1600 + }, + { + "epoch": 0.5788141720896601, + "grad_norm": 0.7345185926356824, + "learning_rate": 8.515928726064514e-06, + "loss": 0.063, + "step": 1601 + }, + { + "epoch": 0.579175704989154, + "grad_norm": 0.18596102336311024, + "learning_rate": 8.513802796434186e-06, + "loss": 0.0082, + "step": 1602 + }, + { + "epoch": 0.5795372378886479, + "grad_norm": 2.4576079981437, + "learning_rate": 8.511675610971479e-06, + "loss": 0.1035, + "step": 1603 + }, + { + "epoch": 0.5798987707881417, + "grad_norm": 0.8135364585622245, + "learning_rate": 8.50954717043665e-06, + "loss": 0.0432, + "step": 1604 + }, + { + "epoch": 0.5802603036876356, + "grad_norm": 0.40853964458886993, + "learning_rate": 8.507417475590403e-06, + "loss": 0.0576, + "step": 1605 + }, + { + "epoch": 0.5806218365871294, + "grad_norm": 1.0445931333326552, + "learning_rate": 8.50528652719389e-06, + "loss": 0.0752, + "step": 1606 + }, + { + "epoch": 0.5809833694866233, + "grad_norm": 0.9218696081758622, + "learning_rate": 8.503154326008712e-06, + "loss": 0.0286, + "step": 1607 + }, + { + "epoch": 0.5813449023861171, + "grad_norm": 1.081217770782864, + "learning_rate": 8.501020872796916e-06, + "loss": 0.0579, + "step": 1608 + }, + { + "epoch": 0.581706435285611, + "grad_norm": 0.0006556492387091502, + "learning_rate": 8.498886168321e-06, + "loss": 0.0, + "step": 1609 + }, + { + "epoch": 0.5820679681851049, + "grad_norm": 0.2188825407725189, + "learning_rate": 8.496750213343907e-06, + "loss": 0.0206, + "step": 1610 + }, + { + "epoch": 0.5824295010845987, + "grad_norm": 0.40401590967320355, + "learning_rate": 8.494613008629025e-06, + "loss": 0.0148, + "step": 1611 + }, + { + "epoch": 0.5827910339840926, + "grad_norm": 0.010467342557445632, + "learning_rate": 8.49247455494019e-06, + "loss": 0.0003, + "step": 1612 + }, + { + "epoch": 0.5831525668835864, + "grad_norm": 0.24346431138372954, + "learning_rate": 8.49033485304169e-06, + "loss": 0.0391, + "step": 1613 + }, + { + "epoch": 0.5835140997830802, + "grad_norm": 0.13658637653787561, + "learning_rate": 8.488193903698246e-06, + "loss": 0.0036, + "step": 1614 + }, + { + "epoch": 0.5838756326825741, + "grad_norm": 0.6881844236910616, + "learning_rate": 8.486051707675039e-06, + "loss": 0.0525, + "step": 1615 + }, + { + "epoch": 0.584237165582068, + "grad_norm": 0.5457455763885339, + "learning_rate": 8.483908265737685e-06, + "loss": 0.1035, + "step": 1616 + }, + { + "epoch": 0.5845986984815619, + "grad_norm": 0.2601803421717942, + "learning_rate": 8.481763578652253e-06, + "loss": 0.0258, + "step": 1617 + }, + { + "epoch": 0.5849602313810557, + "grad_norm": 7.743713401956938, + "learning_rate": 8.479617647185255e-06, + "loss": 0.1455, + "step": 1618 + }, + { + "epoch": 0.5853217642805495, + "grad_norm": 0.4895608197224362, + "learning_rate": 8.477470472103641e-06, + "loss": 0.0354, + "step": 1619 + }, + { + "epoch": 0.5856832971800434, + "grad_norm": 0.5179334024306659, + "learning_rate": 8.475322054174816e-06, + "loss": 0.1113, + "step": 1620 + }, + { + "epoch": 0.5860448300795372, + "grad_norm": 0.5616753539454463, + "learning_rate": 8.473172394166622e-06, + "loss": 0.0184, + "step": 1621 + }, + { + "epoch": 0.586406362979031, + "grad_norm": 0.43034713858885526, + "learning_rate": 8.471021492847348e-06, + "loss": 0.0288, + "step": 1622 + }, + { + "epoch": 0.586767895878525, + "grad_norm": 0.603875545987895, + "learning_rate": 8.468869350985725e-06, + "loss": 0.0391, + "step": 1623 + }, + { + "epoch": 0.5871294287780188, + "grad_norm": 0.6909968715078236, + "learning_rate": 8.466715969350928e-06, + "loss": 0.0258, + "step": 1624 + }, + { + "epoch": 0.5874909616775127, + "grad_norm": 0.8218078263601191, + "learning_rate": 8.464561348712575e-06, + "loss": 0.1279, + "step": 1625 + }, + { + "epoch": 0.5878524945770065, + "grad_norm": 0.36446843830759246, + "learning_rate": 8.462405489840726e-06, + "loss": 0.0286, + "step": 1626 + }, + { + "epoch": 0.5882140274765003, + "grad_norm": 0.21473212888908594, + "learning_rate": 8.460248393505885e-06, + "loss": 0.0286, + "step": 1627 + }, + { + "epoch": 0.5885755603759942, + "grad_norm": 0.0012389668246943018, + "learning_rate": 8.458090060479e-06, + "loss": 0.0001, + "step": 1628 + }, + { + "epoch": 0.588937093275488, + "grad_norm": 0.2940711057867082, + "learning_rate": 8.455930491531456e-06, + "loss": 0.0286, + "step": 1629 + }, + { + "epoch": 0.589298626174982, + "grad_norm": 1.0830031913919305, + "learning_rate": 8.453769687435086e-06, + "loss": 0.0957, + "step": 1630 + }, + { + "epoch": 0.5896601590744758, + "grad_norm": 0.025834181550117595, + "learning_rate": 8.451607648962156e-06, + "loss": 0.0006, + "step": 1631 + }, + { + "epoch": 0.5900216919739696, + "grad_norm": 0.29836782831549874, + "learning_rate": 8.449444376885383e-06, + "loss": 0.0391, + "step": 1632 + }, + { + "epoch": 0.5903832248734635, + "grad_norm": 0.0016145256056816814, + "learning_rate": 8.447279871977916e-06, + "loss": 0.0001, + "step": 1633 + }, + { + "epoch": 0.5907447577729573, + "grad_norm": 0.0008179273146525753, + "learning_rate": 8.44511413501335e-06, + "loss": 0.0, + "step": 1634 + }, + { + "epoch": 0.5911062906724512, + "grad_norm": 0.0010891765751186485, + "learning_rate": 8.44294716676572e-06, + "loss": 0.0, + "step": 1635 + }, + { + "epoch": 0.591467823571945, + "grad_norm": 0.001409475572500824, + "learning_rate": 8.4407789680095e-06, + "loss": 0.0001, + "step": 1636 + }, + { + "epoch": 0.5918293564714389, + "grad_norm": 0.6890104327016767, + "learning_rate": 8.438609539519601e-06, + "loss": 0.0354, + "step": 1637 + }, + { + "epoch": 0.5921908893709328, + "grad_norm": 0.22066654668058383, + "learning_rate": 8.43643888207138e-06, + "loss": 0.0131, + "step": 1638 + }, + { + "epoch": 0.5925524222704266, + "grad_norm": 0.40699024962688307, + "learning_rate": 8.434266996440628e-06, + "loss": 0.0476, + "step": 1639 + }, + { + "epoch": 0.5929139551699205, + "grad_norm": 0.19819642790097539, + "learning_rate": 8.432093883403576e-06, + "loss": 0.0165, + "step": 1640 + }, + { + "epoch": 0.5932754880694143, + "grad_norm": 0.780782850405221, + "learning_rate": 8.429919543736892e-06, + "loss": 0.1279, + "step": 1641 + }, + { + "epoch": 0.5936370209689081, + "grad_norm": 2.0028296994752792, + "learning_rate": 8.42774397821769e-06, + "loss": 0.0432, + "step": 1642 + }, + { + "epoch": 0.5939985538684021, + "grad_norm": 0.052240908538445985, + "learning_rate": 8.425567187623513e-06, + "loss": 0.0008, + "step": 1643 + }, + { + "epoch": 0.5943600867678959, + "grad_norm": 0.327398782487878, + "learning_rate": 8.423389172732345e-06, + "loss": 0.0317, + "step": 1644 + }, + { + "epoch": 0.5947216196673897, + "grad_norm": 0.01828668239016433, + "learning_rate": 8.42120993432261e-06, + "loss": 0.0005, + "step": 1645 + }, + { + "epoch": 0.5950831525668836, + "grad_norm": 0.36711402351392275, + "learning_rate": 8.419029473173166e-06, + "loss": 0.0317, + "step": 1646 + }, + { + "epoch": 0.5954446854663774, + "grad_norm": 0.04777199162704035, + "learning_rate": 8.41684779006331e-06, + "loss": 0.0009, + "step": 1647 + }, + { + "epoch": 0.5958062183658713, + "grad_norm": 0.005734191670593327, + "learning_rate": 8.414664885772774e-06, + "loss": 0.0002, + "step": 1648 + }, + { + "epoch": 0.5961677512653651, + "grad_norm": 0.00723568498678424, + "learning_rate": 8.412480761081729e-06, + "loss": 0.0002, + "step": 1649 + }, + { + "epoch": 0.596529284164859, + "grad_norm": 0.5918657281396364, + "learning_rate": 8.410295416770782e-06, + "loss": 0.1191, + "step": 1650 + }, + { + "epoch": 0.5968908170643529, + "grad_norm": 0.01158807374058785, + "learning_rate": 8.40810885362097e-06, + "loss": 0.0003, + "step": 1651 + }, + { + "epoch": 0.5972523499638467, + "grad_norm": 0.00145178645015802, + "learning_rate": 8.405921072413774e-06, + "loss": 0.0, + "step": 1652 + }, + { + "epoch": 0.5976138828633406, + "grad_norm": 0.3689379043126106, + "learning_rate": 8.403732073931106e-06, + "loss": 0.0432, + "step": 1653 + }, + { + "epoch": 0.5979754157628344, + "grad_norm": 0.22356275922079713, + "learning_rate": 8.401541858955312e-06, + "loss": 0.0286, + "step": 1654 + }, + { + "epoch": 0.5983369486623282, + "grad_norm": 0.8520559272953709, + "learning_rate": 8.399350428269177e-06, + "loss": 0.0432, + "step": 1655 + }, + { + "epoch": 0.5986984815618221, + "grad_norm": 0.8364877733400605, + "learning_rate": 8.397157782655915e-06, + "loss": 0.0258, + "step": 1656 + }, + { + "epoch": 0.599060014461316, + "grad_norm": 0.003995715695948819, + "learning_rate": 8.394963922899178e-06, + "loss": 0.0001, + "step": 1657 + }, + { + "epoch": 0.5994215473608099, + "grad_norm": 0.23648478787228097, + "learning_rate": 8.392768849783053e-06, + "loss": 0.0229, + "step": 1658 + }, + { + "epoch": 0.5997830802603037, + "grad_norm": 0.02703308502262687, + "learning_rate": 8.390572564092056e-06, + "loss": 0.0004, + "step": 1659 + }, + { + "epoch": 0.6001446131597975, + "grad_norm": 0.23848726116545663, + "learning_rate": 8.388375066611141e-06, + "loss": 0.0065, + "step": 1660 + }, + { + "epoch": 0.6005061460592914, + "grad_norm": 0.025937810314621546, + "learning_rate": 8.386176358125693e-06, + "loss": 0.0006, + "step": 1661 + }, + { + "epoch": 0.6008676789587852, + "grad_norm": 0.790855686978578, + "learning_rate": 8.383976439421525e-06, + "loss": 0.0476, + "step": 1662 + }, + { + "epoch": 0.6012292118582792, + "grad_norm": 0.6415422934430508, + "learning_rate": 8.381775311284896e-06, + "loss": 0.0354, + "step": 1663 + }, + { + "epoch": 0.601590744757773, + "grad_norm": 1.288546122837091, + "learning_rate": 8.379572974502483e-06, + "loss": 0.2129, + "step": 1664 + }, + { + "epoch": 0.6019522776572668, + "grad_norm": 0.3528968173797602, + "learning_rate": 8.377369429861403e-06, + "loss": 0.0229, + "step": 1665 + }, + { + "epoch": 0.6023138105567607, + "grad_norm": 0.35301836572698747, + "learning_rate": 8.375164678149201e-06, + "loss": 0.0391, + "step": 1666 + }, + { + "epoch": 0.6026753434562545, + "grad_norm": 0.22066460683029981, + "learning_rate": 8.372958720153855e-06, + "loss": 0.0036, + "step": 1667 + }, + { + "epoch": 0.6030368763557483, + "grad_norm": 0.16844760552664506, + "learning_rate": 8.370751556663774e-06, + "loss": 0.0184, + "step": 1668 + }, + { + "epoch": 0.6033984092552422, + "grad_norm": 0.15179635589434415, + "learning_rate": 8.3685431884678e-06, + "loss": 0.0117, + "step": 1669 + }, + { + "epoch": 0.603759942154736, + "grad_norm": 0.0031744424855634197, + "learning_rate": 8.366333616355198e-06, + "loss": 0.0001, + "step": 1670 + }, + { + "epoch": 0.60412147505423, + "grad_norm": 1.184710635283289, + "learning_rate": 8.364122841115675e-06, + "loss": 0.0579, + "step": 1671 + }, + { + "epoch": 0.6044830079537238, + "grad_norm": 0.1792666244198397, + "learning_rate": 8.361910863539357e-06, + "loss": 0.0146, + "step": 1672 + }, + { + "epoch": 0.6048445408532176, + "grad_norm": 0.6227907212316229, + "learning_rate": 8.359697684416805e-06, + "loss": 0.1113, + "step": 1673 + }, + { + "epoch": 0.6052060737527115, + "grad_norm": 1.3474534996119527, + "learning_rate": 8.357483304539012e-06, + "loss": 0.0258, + "step": 1674 + }, + { + "epoch": 0.6055676066522053, + "grad_norm": 0.2941231579828677, + "learning_rate": 8.355267724697394e-06, + "loss": 0.0286, + "step": 1675 + }, + { + "epoch": 0.6059291395516992, + "grad_norm": 0.28687074375109534, + "learning_rate": 8.353050945683798e-06, + "loss": 0.0206, + "step": 1676 + }, + { + "epoch": 0.6062906724511931, + "grad_norm": 0.001046912751808503, + "learning_rate": 8.350832968290502e-06, + "loss": 0.0, + "step": 1677 + }, + { + "epoch": 0.6066522053506869, + "grad_norm": 1.189156910080112, + "learning_rate": 8.34861379331021e-06, + "loss": 0.0432, + "step": 1678 + }, + { + "epoch": 0.6070137382501808, + "grad_norm": 0.031971117272388716, + "learning_rate": 8.346393421536056e-06, + "loss": 0.0005, + "step": 1679 + }, + { + "epoch": 0.6073752711496746, + "grad_norm": 0.5978871359762733, + "learning_rate": 8.344171853761599e-06, + "loss": 0.1191, + "step": 1680 + }, + { + "epoch": 0.6077368040491685, + "grad_norm": 1.6659662418669159, + "learning_rate": 8.341949090780827e-06, + "loss": 0.0432, + "step": 1681 + }, + { + "epoch": 0.6080983369486623, + "grad_norm": 0.23333401924515815, + "learning_rate": 8.339725133388154e-06, + "loss": 0.0258, + "step": 1682 + }, + { + "epoch": 0.6084598698481561, + "grad_norm": 0.8306102762375244, + "learning_rate": 8.337499982378426e-06, + "loss": 0.0354, + "step": 1683 + }, + { + "epoch": 0.6088214027476501, + "grad_norm": 0.000951084971088901, + "learning_rate": 8.335273638546906e-06, + "loss": 0.0, + "step": 1684 + }, + { + "epoch": 0.6091829356471439, + "grad_norm": 0.2101011044282741, + "learning_rate": 8.333046102689293e-06, + "loss": 0.0258, + "step": 1685 + }, + { + "epoch": 0.6095444685466378, + "grad_norm": 0.0017997989481211222, + "learning_rate": 8.330817375601705e-06, + "loss": 0.0001, + "step": 1686 + }, + { + "epoch": 0.6099060014461316, + "grad_norm": 0.16281220829490162, + "learning_rate": 8.328587458080691e-06, + "loss": 0.0229, + "step": 1687 + }, + { + "epoch": 0.6102675343456254, + "grad_norm": 0.0013549445104152377, + "learning_rate": 8.326356350923221e-06, + "loss": 0.0, + "step": 1688 + }, + { + "epoch": 0.6106290672451193, + "grad_norm": 0.001009787713429626, + "learning_rate": 8.324124054926695e-06, + "loss": 0.0, + "step": 1689 + }, + { + "epoch": 0.6109906001446131, + "grad_norm": 0.269570541011585, + "learning_rate": 8.32189057088893e-06, + "loss": 0.0165, + "step": 1690 + }, + { + "epoch": 0.611352133044107, + "grad_norm": 0.0013450454812281736, + "learning_rate": 8.319655899608182e-06, + "loss": 0.0001, + "step": 1691 + }, + { + "epoch": 0.6117136659436009, + "grad_norm": 0.6643780707411182, + "learning_rate": 8.31742004188311e-06, + "loss": 0.0286, + "step": 1692 + }, + { + "epoch": 0.6120751988430947, + "grad_norm": 0.9764732096031349, + "learning_rate": 8.315182998512817e-06, + "loss": 0.0147, + "step": 1693 + }, + { + "epoch": 0.6124367317425886, + "grad_norm": 0.0007038369137723837, + "learning_rate": 8.31294477029682e-06, + "loss": 0.0, + "step": 1694 + }, + { + "epoch": 0.6127982646420824, + "grad_norm": 0.3701506987747662, + "learning_rate": 8.310705358035062e-06, + "loss": 0.0391, + "step": 1695 + }, + { + "epoch": 0.6131597975415762, + "grad_norm": 6.228676705480163, + "learning_rate": 8.308464762527907e-06, + "loss": 0.3438, + "step": 1696 + }, + { + "epoch": 0.6135213304410702, + "grad_norm": 0.0006020137810904987, + "learning_rate": 8.306222984576145e-06, + "loss": 0.0, + "step": 1697 + }, + { + "epoch": 0.613882863340564, + "grad_norm": 0.10035448085350432, + "learning_rate": 8.303980024980986e-06, + "loss": 0.0015, + "step": 1698 + }, + { + "epoch": 0.6142443962400579, + "grad_norm": 7.671394352654925, + "learning_rate": 8.301735884544062e-06, + "loss": 0.3672, + "step": 1699 + }, + { + "epoch": 0.6146059291395517, + "grad_norm": 0.25400100427569894, + "learning_rate": 8.29949056406743e-06, + "loss": 0.0231, + "step": 1700 + }, + { + "epoch": 0.6149674620390455, + "grad_norm": 0.7731695736041488, + "learning_rate": 8.297244064353566e-06, + "loss": 0.0147, + "step": 1701 + }, + { + "epoch": 0.6153289949385394, + "grad_norm": 0.40279658122882167, + "learning_rate": 8.294996386205372e-06, + "loss": 0.0131, + "step": 1702 + }, + { + "epoch": 0.6156905278380332, + "grad_norm": 0.7451714820669065, + "learning_rate": 8.292747530426165e-06, + "loss": 0.1367, + "step": 1703 + }, + { + "epoch": 0.6160520607375272, + "grad_norm": 0.47868184380117773, + "learning_rate": 8.290497497819682e-06, + "loss": 0.0231, + "step": 1704 + }, + { + "epoch": 0.616413593637021, + "grad_norm": 0.19347818100154066, + "learning_rate": 8.28824628919009e-06, + "loss": 0.0205, + "step": 1705 + }, + { + "epoch": 0.6167751265365148, + "grad_norm": 0.14780414727577956, + "learning_rate": 8.285993905341968e-06, + "loss": 0.0064, + "step": 1706 + }, + { + "epoch": 0.6171366594360087, + "grad_norm": 2.2555152395467113, + "learning_rate": 8.283740347080318e-06, + "loss": 0.0957, + "step": 1707 + }, + { + "epoch": 0.6174981923355025, + "grad_norm": 0.14160970553663102, + "learning_rate": 8.281485615210559e-06, + "loss": 0.0117, + "step": 1708 + }, + { + "epoch": 0.6178597252349964, + "grad_norm": 0.6336690772765863, + "learning_rate": 8.279229710538536e-06, + "loss": 0.0231, + "step": 1709 + }, + { + "epoch": 0.6182212581344902, + "grad_norm": 0.7923277657399652, + "learning_rate": 8.276972633870507e-06, + "loss": 0.1367, + "step": 1710 + }, + { + "epoch": 0.6185827910339841, + "grad_norm": 2.4398528469904157, + "learning_rate": 8.274714386013147e-06, + "loss": 0.0752, + "step": 1711 + }, + { + "epoch": 0.618944323933478, + "grad_norm": 0.19886305803042467, + "learning_rate": 8.272454967773559e-06, + "loss": 0.0231, + "step": 1712 + }, + { + "epoch": 0.6193058568329718, + "grad_norm": 0.22468672507668813, + "learning_rate": 8.270194379959256e-06, + "loss": 0.0117, + "step": 1713 + }, + { + "epoch": 0.6196673897324656, + "grad_norm": 2.6171421010947022, + "learning_rate": 8.26793262337817e-06, + "loss": 0.1836, + "step": 1714 + }, + { + "epoch": 0.6200289226319595, + "grad_norm": 2.209093577412075, + "learning_rate": 8.265669698838656e-06, + "loss": 0.1738, + "step": 1715 + }, + { + "epoch": 0.6203904555314533, + "grad_norm": 0.7179986094943931, + "learning_rate": 8.26340560714948e-06, + "loss": 0.1279, + "step": 1716 + }, + { + "epoch": 0.6207519884309473, + "grad_norm": 0.036059978979289455, + "learning_rate": 8.261140349119829e-06, + "loss": 0.0006, + "step": 1717 + }, + { + "epoch": 0.6211135213304411, + "grad_norm": 0.0043303273896438555, + "learning_rate": 8.258873925559304e-06, + "loss": 0.0001, + "step": 1718 + }, + { + "epoch": 0.6214750542299349, + "grad_norm": 0.19799344462467428, + "learning_rate": 8.256606337277926e-06, + "loss": 0.0229, + "step": 1719 + }, + { + "epoch": 0.6218365871294288, + "grad_norm": 1.8650915424434855, + "learning_rate": 8.254337585086132e-06, + "loss": 0.1191, + "step": 1720 + }, + { + "epoch": 0.6221981200289226, + "grad_norm": 0.010378740797292622, + "learning_rate": 8.252067669794772e-06, + "loss": 0.0002, + "step": 1721 + }, + { + "epoch": 0.6225596529284165, + "grad_norm": 7.986471527286296, + "learning_rate": 8.249796592215112e-06, + "loss": 0.543, + "step": 1722 + }, + { + "epoch": 0.6229211858279103, + "grad_norm": 0.474982264986452, + "learning_rate": 8.247524353158836e-06, + "loss": 0.0131, + "step": 1723 + }, + { + "epoch": 0.6232827187274042, + "grad_norm": 0.1332352252445329, + "learning_rate": 8.245250953438041e-06, + "loss": 0.0092, + "step": 1724 + }, + { + "epoch": 0.6236442516268981, + "grad_norm": 1.9855184950917542, + "learning_rate": 8.242976393865242e-06, + "loss": 0.1934, + "step": 1725 + }, + { + "epoch": 0.6240057845263919, + "grad_norm": 0.20946639300341063, + "learning_rate": 8.240700675253362e-06, + "loss": 0.0286, + "step": 1726 + }, + { + "epoch": 0.6243673174258858, + "grad_norm": 0.0008469591081544387, + "learning_rate": 8.238423798415747e-06, + "loss": 0.0, + "step": 1727 + }, + { + "epoch": 0.6247288503253796, + "grad_norm": 0.001626680500000909, + "learning_rate": 8.236145764166147e-06, + "loss": 0.0001, + "step": 1728 + }, + { + "epoch": 0.6250903832248734, + "grad_norm": 0.0005487744648322094, + "learning_rate": 8.233866573318736e-06, + "loss": 0.0, + "step": 1729 + }, + { + "epoch": 0.6254519161243673, + "grad_norm": 0.2603834047006339, + "learning_rate": 8.231586226688093e-06, + "loss": 0.0258, + "step": 1730 + }, + { + "epoch": 0.6258134490238612, + "grad_norm": 0.25808543965429726, + "learning_rate": 8.229304725089216e-06, + "loss": 0.0231, + "step": 1731 + }, + { + "epoch": 0.6261749819233551, + "grad_norm": 0.19486784229509505, + "learning_rate": 8.22702206933751e-06, + "loss": 0.0255, + "step": 1732 + }, + { + "epoch": 0.6265365148228489, + "grad_norm": 0.5835444489576055, + "learning_rate": 8.2247382602488e-06, + "loss": 0.0131, + "step": 1733 + }, + { + "epoch": 0.6268980477223427, + "grad_norm": 0.5312304111217081, + "learning_rate": 8.222453298639314e-06, + "loss": 0.0317, + "step": 1734 + }, + { + "epoch": 0.6272595806218366, + "grad_norm": 0.002035343965726863, + "learning_rate": 8.220167185325699e-06, + "loss": 0.0001, + "step": 1735 + }, + { + "epoch": 0.6276211135213304, + "grad_norm": 0.8438961419894655, + "learning_rate": 8.217879921125012e-06, + "loss": 0.0957, + "step": 1736 + }, + { + "epoch": 0.6279826464208242, + "grad_norm": 0.3677908417572898, + "learning_rate": 8.21559150685472e-06, + "loss": 0.0391, + "step": 1737 + }, + { + "epoch": 0.6283441793203182, + "grad_norm": 0.2689072113693672, + "learning_rate": 8.213301943332703e-06, + "loss": 0.0255, + "step": 1738 + }, + { + "epoch": 0.628705712219812, + "grad_norm": 0.3735886121272941, + "learning_rate": 8.211011231377251e-06, + "loss": 0.0432, + "step": 1739 + }, + { + "epoch": 0.6290672451193059, + "grad_norm": 0.001982800397119815, + "learning_rate": 8.208719371807059e-06, + "loss": 0.0001, + "step": 1740 + }, + { + "epoch": 0.6294287780187997, + "grad_norm": 0.29894506758583134, + "learning_rate": 8.206426365441243e-06, + "loss": 0.0354, + "step": 1741 + }, + { + "epoch": 0.6297903109182935, + "grad_norm": 0.35407637880357073, + "learning_rate": 8.204132213099321e-06, + "loss": 0.0354, + "step": 1742 + }, + { + "epoch": 0.6301518438177874, + "grad_norm": 4.785623588089842, + "learning_rate": 8.201836915601222e-06, + "loss": 0.3867, + "step": 1743 + }, + { + "epoch": 0.6305133767172812, + "grad_norm": 1.0804669304575334, + "learning_rate": 8.199540473767284e-06, + "loss": 0.0688, + "step": 1744 + }, + { + "epoch": 0.6308749096167752, + "grad_norm": 0.9167342218108383, + "learning_rate": 8.197242888418255e-06, + "loss": 0.0576, + "step": 1745 + }, + { + "epoch": 0.631236442516269, + "grad_norm": 0.4213321073261121, + "learning_rate": 8.194944160375294e-06, + "loss": 0.0432, + "step": 1746 + }, + { + "epoch": 0.6315979754157628, + "grad_norm": 3.8075660560253897, + "learning_rate": 8.192644290459963e-06, + "loss": 0.3438, + "step": 1747 + }, + { + "epoch": 0.6319595083152567, + "grad_norm": 0.0024712709780444846, + "learning_rate": 8.19034327949424e-06, + "loss": 0.0001, + "step": 1748 + }, + { + "epoch": 0.6323210412147505, + "grad_norm": 3.7129376410844794, + "learning_rate": 8.1880411283005e-06, + "loss": 0.1367, + "step": 1749 + }, + { + "epoch": 0.6326825741142444, + "grad_norm": 0.1431547736163527, + "learning_rate": 8.185737837701532e-06, + "loss": 0.0184, + "step": 1750 + }, + { + "epoch": 0.6330441070137383, + "grad_norm": 0.884170976095499, + "learning_rate": 8.183433408520533e-06, + "loss": 0.0576, + "step": 1751 + }, + { + "epoch": 0.6334056399132321, + "grad_norm": 0.2608424983556434, + "learning_rate": 8.181127841581109e-06, + "loss": 0.0231, + "step": 1752 + }, + { + "epoch": 0.633767172812726, + "grad_norm": 0.008994093420295297, + "learning_rate": 8.178821137707263e-06, + "loss": 0.0003, + "step": 1753 + }, + { + "epoch": 0.6341287057122198, + "grad_norm": 0.0012736760311449957, + "learning_rate": 8.176513297723413e-06, + "loss": 0.0, + "step": 1754 + }, + { + "epoch": 0.6344902386117137, + "grad_norm": 0.002460185024899333, + "learning_rate": 8.174204322454382e-06, + "loss": 0.0001, + "step": 1755 + }, + { + "epoch": 0.6348517715112075, + "grad_norm": 1.477204064368333, + "learning_rate": 8.171894212725397e-06, + "loss": 0.0525, + "step": 1756 + }, + { + "epoch": 0.6352133044107013, + "grad_norm": 0.09336940323884191, + "learning_rate": 8.169582969362089e-06, + "loss": 0.0022, + "step": 1757 + }, + { + "epoch": 0.6355748373101953, + "grad_norm": 0.1262351319127486, + "learning_rate": 8.167270593190495e-06, + "loss": 0.0131, + "step": 1758 + }, + { + "epoch": 0.6359363702096891, + "grad_norm": 0.0005822055199149458, + "learning_rate": 8.164957085037063e-06, + "loss": 0.0, + "step": 1759 + }, + { + "epoch": 0.6362979031091829, + "grad_norm": 0.3795109448275529, + "learning_rate": 8.162642445728632e-06, + "loss": 0.0184, + "step": 1760 + }, + { + "epoch": 0.6366594360086768, + "grad_norm": 0.23851221648112256, + "learning_rate": 8.16032667609246e-06, + "loss": 0.0165, + "step": 1761 + }, + { + "epoch": 0.6370209689081706, + "grad_norm": 0.1746542876820734, + "learning_rate": 8.158009776956202e-06, + "loss": 0.0117, + "step": 1762 + }, + { + "epoch": 0.6373825018076645, + "grad_norm": 0.0017463142827894259, + "learning_rate": 8.155691749147917e-06, + "loss": 0.0001, + "step": 1763 + }, + { + "epoch": 0.6377440347071583, + "grad_norm": 0.002573393767246462, + "learning_rate": 8.153372593496065e-06, + "loss": 0.0001, + "step": 1764 + }, + { + "epoch": 0.6381055676066522, + "grad_norm": 0.0019108003301401672, + "learning_rate": 8.151052310829515e-06, + "loss": 0.0001, + "step": 1765 + }, + { + "epoch": 0.6384671005061461, + "grad_norm": 0.11656625585987918, + "learning_rate": 8.148730901977533e-06, + "loss": 0.0131, + "step": 1766 + }, + { + "epoch": 0.6388286334056399, + "grad_norm": 0.13969638166985507, + "learning_rate": 8.146408367769792e-06, + "loss": 0.0165, + "step": 1767 + }, + { + "epoch": 0.6391901663051338, + "grad_norm": 0.24563287658826222, + "learning_rate": 8.144084709036362e-06, + "loss": 0.0184, + "step": 1768 + }, + { + "epoch": 0.6395516992046276, + "grad_norm": 0.2949536370791295, + "learning_rate": 8.141759926607724e-06, + "loss": 0.0082, + "step": 1769 + }, + { + "epoch": 0.6399132321041214, + "grad_norm": 0.6599177556152134, + "learning_rate": 8.139434021314749e-06, + "loss": 0.0432, + "step": 1770 + }, + { + "epoch": 0.6402747650036154, + "grad_norm": 1.9506664610732611, + "learning_rate": 8.137106993988717e-06, + "loss": 0.2246, + "step": 1771 + }, + { + "epoch": 0.6406362979031092, + "grad_norm": 0.16535101661008536, + "learning_rate": 8.134778845461308e-06, + "loss": 0.0146, + "step": 1772 + }, + { + "epoch": 0.6409978308026031, + "grad_norm": 0.1589959792177536, + "learning_rate": 8.132449576564603e-06, + "loss": 0.0082, + "step": 1773 + }, + { + "epoch": 0.6413593637020969, + "grad_norm": 0.31203023797178525, + "learning_rate": 8.130119188131078e-06, + "loss": 0.0082, + "step": 1774 + }, + { + "epoch": 0.6417208966015907, + "grad_norm": 0.90499813319538, + "learning_rate": 8.127787680993617e-06, + "loss": 0.1934, + "step": 1775 + }, + { + "epoch": 0.6420824295010846, + "grad_norm": 1.0696175324567014, + "learning_rate": 8.125455055985499e-06, + "loss": 0.0525, + "step": 1776 + }, + { + "epoch": 0.6424439624005784, + "grad_norm": 0.866327959838758, + "learning_rate": 8.123121313940403e-06, + "loss": 0.1836, + "step": 1777 + }, + { + "epoch": 0.6428054953000724, + "grad_norm": 0.11501681054501478, + "learning_rate": 8.12078645569241e-06, + "loss": 0.0065, + "step": 1778 + }, + { + "epoch": 0.6431670281995662, + "grad_norm": 1.4300833058496862, + "learning_rate": 8.118450482075995e-06, + "loss": 0.2344, + "step": 1779 + }, + { + "epoch": 0.64352856109906, + "grad_norm": 4.781434720558354, + "learning_rate": 8.116113393926036e-06, + "loss": 0.1367, + "step": 1780 + }, + { + "epoch": 0.6438900939985539, + "grad_norm": 0.99772519634421, + "learning_rate": 8.113775192077806e-06, + "loss": 0.0432, + "step": 1781 + }, + { + "epoch": 0.6442516268980477, + "grad_norm": 0.19169615598524273, + "learning_rate": 8.111435877366982e-06, + "loss": 0.0165, + "step": 1782 + }, + { + "epoch": 0.6446131597975415, + "grad_norm": 0.02658207348238367, + "learning_rate": 8.109095450629629e-06, + "loss": 0.0004, + "step": 1783 + }, + { + "epoch": 0.6449746926970354, + "grad_norm": 0.11066086144522758, + "learning_rate": 8.10675391270222e-06, + "loss": 0.0103, + "step": 1784 + }, + { + "epoch": 0.6453362255965293, + "grad_norm": 0.11073545443955717, + "learning_rate": 8.10441126442162e-06, + "loss": 0.0117, + "step": 1785 + }, + { + "epoch": 0.6456977584960232, + "grad_norm": 0.37054738108163754, + "learning_rate": 8.102067506625086e-06, + "loss": 0.0184, + "step": 1786 + }, + { + "epoch": 0.646059291395517, + "grad_norm": 0.8257102715250263, + "learning_rate": 8.099722640150283e-06, + "loss": 0.0258, + "step": 1787 + }, + { + "epoch": 0.6464208242950108, + "grad_norm": 0.011345200596696345, + "learning_rate": 8.097376665835258e-06, + "loss": 0.0003, + "step": 1788 + }, + { + "epoch": 0.6467823571945047, + "grad_norm": 0.11938178015083062, + "learning_rate": 8.095029584518472e-06, + "loss": 0.0146, + "step": 1789 + }, + { + "epoch": 0.6471438900939985, + "grad_norm": 0.1264844465019853, + "learning_rate": 8.092681397038762e-06, + "loss": 0.0092, + "step": 1790 + }, + { + "epoch": 0.6475054229934925, + "grad_norm": 0.6607717961185173, + "learning_rate": 8.090332104235375e-06, + "loss": 0.0147, + "step": 1791 + }, + { + "epoch": 0.6478669558929863, + "grad_norm": 0.0034594351753504176, + "learning_rate": 8.087981706947946e-06, + "loss": 0.0001, + "step": 1792 + }, + { + "epoch": 0.6482284887924801, + "grad_norm": 0.29968880810747595, + "learning_rate": 8.085630206016505e-06, + "loss": 0.0231, + "step": 1793 + }, + { + "epoch": 0.648590021691974, + "grad_norm": 0.16949038417879303, + "learning_rate": 8.083277602281481e-06, + "loss": 0.0184, + "step": 1794 + }, + { + "epoch": 0.6489515545914678, + "grad_norm": 0.2242976905926988, + "learning_rate": 8.080923896583692e-06, + "loss": 0.0165, + "step": 1795 + }, + { + "epoch": 0.6493130874909617, + "grad_norm": 0.012451501071144623, + "learning_rate": 8.078569089764352e-06, + "loss": 0.0002, + "step": 1796 + }, + { + "epoch": 0.6496746203904555, + "grad_norm": 1.2523899943880243, + "learning_rate": 8.076213182665072e-06, + "loss": 0.0889, + "step": 1797 + }, + { + "epoch": 0.6500361532899493, + "grad_norm": 0.14451692500795627, + "learning_rate": 8.073856176127845e-06, + "loss": 0.0165, + "step": 1798 + }, + { + "epoch": 0.6503976861894433, + "grad_norm": 0.004149312380696711, + "learning_rate": 8.071498070995075e-06, + "loss": 0.0001, + "step": 1799 + }, + { + "epoch": 0.6507592190889371, + "grad_norm": 1.657402820097042, + "learning_rate": 8.069138868109539e-06, + "loss": 0.0889, + "step": 1800 + }, + { + "epoch": 0.651120751988431, + "grad_norm": 0.7555184323306506, + "learning_rate": 8.066778568314418e-06, + "loss": 0.0286, + "step": 1801 + }, + { + "epoch": 0.6514822848879248, + "grad_norm": 0.010653302273625345, + "learning_rate": 8.064417172453286e-06, + "loss": 0.0003, + "step": 1802 + }, + { + "epoch": 0.6518438177874186, + "grad_norm": 0.03630859339493411, + "learning_rate": 8.062054681370102e-06, + "loss": 0.0005, + "step": 1803 + }, + { + "epoch": 0.6522053506869125, + "grad_norm": 7.104965351966947, + "learning_rate": 8.059691095909223e-06, + "loss": 0.2148, + "step": 1804 + }, + { + "epoch": 0.6525668835864064, + "grad_norm": 0.14909723450658632, + "learning_rate": 8.057326416915393e-06, + "loss": 0.0165, + "step": 1805 + }, + { + "epoch": 0.6529284164859002, + "grad_norm": 0.21378209795297082, + "learning_rate": 8.054960645233743e-06, + "loss": 0.0184, + "step": 1806 + }, + { + "epoch": 0.6532899493853941, + "grad_norm": 0.674630090097052, + "learning_rate": 8.052593781709806e-06, + "loss": 0.0286, + "step": 1807 + }, + { + "epoch": 0.6536514822848879, + "grad_norm": 0.1624567434879652, + "learning_rate": 8.050225827189492e-06, + "loss": 0.0146, + "step": 1808 + }, + { + "epoch": 0.6540130151843818, + "grad_norm": 4.284423999203059, + "learning_rate": 8.047856782519114e-06, + "loss": 0.3867, + "step": 1809 + }, + { + "epoch": 0.6543745480838756, + "grad_norm": 0.023616345652032515, + "learning_rate": 8.045486648545367e-06, + "loss": 0.0007, + "step": 1810 + }, + { + "epoch": 0.6547360809833694, + "grad_norm": 0.16475629236219264, + "learning_rate": 8.04311542611533e-06, + "loss": 0.0184, + "step": 1811 + }, + { + "epoch": 0.6550976138828634, + "grad_norm": 0.2929871286875267, + "learning_rate": 8.040743116076485e-06, + "loss": 0.0206, + "step": 1812 + }, + { + "epoch": 0.6554591467823572, + "grad_norm": 0.006869254415157166, + "learning_rate": 8.038369719276692e-06, + "loss": 0.0002, + "step": 1813 + }, + { + "epoch": 0.6558206796818511, + "grad_norm": 0.09336792057027182, + "learning_rate": 8.035995236564202e-06, + "loss": 0.0072, + "step": 1814 + }, + { + "epoch": 0.6561822125813449, + "grad_norm": 0.6159786157629887, + "learning_rate": 8.033619668787656e-06, + "loss": 0.0286, + "step": 1815 + }, + { + "epoch": 0.6565437454808387, + "grad_norm": 2.943042805172859, + "learning_rate": 8.031243016796078e-06, + "loss": 0.1836, + "step": 1816 + }, + { + "epoch": 0.6569052783803326, + "grad_norm": 0.8480793696268277, + "learning_rate": 8.028865281438888e-06, + "loss": 0.1553, + "step": 1817 + }, + { + "epoch": 0.6572668112798264, + "grad_norm": 0.5183970402320969, + "learning_rate": 8.026486463565884e-06, + "loss": 0.0165, + "step": 1818 + }, + { + "epoch": 0.6576283441793204, + "grad_norm": 1.0580732706965321, + "learning_rate": 8.024106564027257e-06, + "loss": 0.1191, + "step": 1819 + }, + { + "epoch": 0.6579898770788142, + "grad_norm": 0.29482537280411353, + "learning_rate": 8.021725583673583e-06, + "loss": 0.0206, + "step": 1820 + }, + { + "epoch": 0.658351409978308, + "grad_norm": 1.6908787257259628, + "learning_rate": 8.019343523355824e-06, + "loss": 0.0957, + "step": 1821 + }, + { + "epoch": 0.6587129428778019, + "grad_norm": 0.5920037384958703, + "learning_rate": 8.016960383925326e-06, + "loss": 0.0354, + "step": 1822 + }, + { + "epoch": 0.6590744757772957, + "grad_norm": 0.173403689190574, + "learning_rate": 8.014576166233823e-06, + "loss": 0.0146, + "step": 1823 + }, + { + "epoch": 0.6594360086767896, + "grad_norm": 0.010607935978506584, + "learning_rate": 8.012190871133434e-06, + "loss": 0.0004, + "step": 1824 + }, + { + "epoch": 0.6597975415762835, + "grad_norm": 0.10248060992200261, + "learning_rate": 8.009804499476664e-06, + "loss": 0.0072, + "step": 1825 + }, + { + "epoch": 0.6601590744757773, + "grad_norm": 3.9922732434714554, + "learning_rate": 8.007417052116401e-06, + "loss": 0.1279, + "step": 1826 + }, + { + "epoch": 0.6605206073752712, + "grad_norm": 0.9037511312057589, + "learning_rate": 8.005028529905918e-06, + "loss": 0.0479, + "step": 1827 + }, + { + "epoch": 0.660882140274765, + "grad_norm": 0.006732005875010612, + "learning_rate": 8.002638933698872e-06, + "loss": 0.0002, + "step": 1828 + }, + { + "epoch": 0.6612436731742588, + "grad_norm": 0.1794151955804069, + "learning_rate": 8.000248264349306e-06, + "loss": 0.0131, + "step": 1829 + }, + { + "epoch": 0.6616052060737527, + "grad_norm": 0.019404889231059565, + "learning_rate": 7.997856522711645e-06, + "loss": 0.0006, + "step": 1830 + }, + { + "epoch": 0.6619667389732465, + "grad_norm": 0.16232846869156237, + "learning_rate": 7.995463709640692e-06, + "loss": 0.0184, + "step": 1831 + }, + { + "epoch": 0.6623282718727405, + "grad_norm": 0.06679451682957574, + "learning_rate": 7.993069825991643e-06, + "loss": 0.001, + "step": 1832 + }, + { + "epoch": 0.6626898047722343, + "grad_norm": 0.10460533277299018, + "learning_rate": 7.99067487262007e-06, + "loss": 0.0092, + "step": 1833 + }, + { + "epoch": 0.6630513376717281, + "grad_norm": 0.7681661368178658, + "learning_rate": 7.988278850381927e-06, + "loss": 0.1455, + "step": 1834 + }, + { + "epoch": 0.663412870571222, + "grad_norm": 0.2130951173778498, + "learning_rate": 7.985881760133556e-06, + "loss": 0.0014, + "step": 1835 + }, + { + "epoch": 0.6637744034707158, + "grad_norm": 0.8929013034496586, + "learning_rate": 7.983483602731673e-06, + "loss": 0.1836, + "step": 1836 + }, + { + "epoch": 0.6641359363702097, + "grad_norm": 0.01637770286237839, + "learning_rate": 7.98108437903338e-06, + "loss": 0.0001, + "step": 1837 + }, + { + "epoch": 0.6644974692697035, + "grad_norm": 1.0712477618181233, + "learning_rate": 7.978684089896159e-06, + "loss": 0.1191, + "step": 1838 + }, + { + "epoch": 0.6648590021691974, + "grad_norm": 1.9287867069463254, + "learning_rate": 7.976282736177872e-06, + "loss": 0.0688, + "step": 1839 + }, + { + "epoch": 0.6652205350686913, + "grad_norm": 1.1616807842394161, + "learning_rate": 7.973880318736764e-06, + "loss": 0.063, + "step": 1840 + }, + { + "epoch": 0.6655820679681851, + "grad_norm": 0.38518466438309384, + "learning_rate": 7.971476838431459e-06, + "loss": 0.0231, + "step": 1841 + }, + { + "epoch": 0.665943600867679, + "grad_norm": 0.19985250387275008, + "learning_rate": 7.969072296120958e-06, + "loss": 0.0165, + "step": 1842 + }, + { + "epoch": 0.6663051337671728, + "grad_norm": 0.8700480296776133, + "learning_rate": 7.966666692664645e-06, + "loss": 0.0391, + "step": 1843 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.08792703452829521, + "learning_rate": 7.964260028922282e-06, + "loss": 0.0028, + "step": 1844 + }, + { + "epoch": 0.6670281995661606, + "grad_norm": 0.6690395411659614, + "learning_rate": 7.96185230575401e-06, + "loss": 0.0432, + "step": 1845 + }, + { + "epoch": 0.6673897324656544, + "grad_norm": 0.10055007960385585, + "learning_rate": 7.95944352402035e-06, + "loss": 0.0072, + "step": 1846 + }, + { + "epoch": 0.6677512653651483, + "grad_norm": 0.17156408300899625, + "learning_rate": 7.957033684582198e-06, + "loss": 0.0206, + "step": 1847 + }, + { + "epoch": 0.6681127982646421, + "grad_norm": 0.0032741418217263783, + "learning_rate": 7.954622788300831e-06, + "loss": 0.0001, + "step": 1848 + }, + { + "epoch": 0.6684743311641359, + "grad_norm": 0.03609585108758954, + "learning_rate": 7.952210836037903e-06, + "loss": 0.001, + "step": 1849 + }, + { + "epoch": 0.6688358640636298, + "grad_norm": 0.9052964284697455, + "learning_rate": 7.949797828655446e-06, + "loss": 0.0206, + "step": 1850 + }, + { + "epoch": 0.6691973969631236, + "grad_norm": 0.01145486534220573, + "learning_rate": 7.947383767015867e-06, + "loss": 0.0003, + "step": 1851 + }, + { + "epoch": 0.6695589298626174, + "grad_norm": 2.4874515091783618, + "learning_rate": 7.944968651981953e-06, + "loss": 0.1367, + "step": 1852 + }, + { + "epoch": 0.6699204627621114, + "grad_norm": 1.039542994652333, + "learning_rate": 7.942552484416863e-06, + "loss": 0.1836, + "step": 1853 + }, + { + "epoch": 0.6702819956616052, + "grad_norm": 1.0198924809737608, + "learning_rate": 7.940135265184135e-06, + "loss": 0.1367, + "step": 1854 + }, + { + "epoch": 0.6706435285610991, + "grad_norm": 0.22933240659183404, + "learning_rate": 7.937716995147685e-06, + "loss": 0.0229, + "step": 1855 + }, + { + "epoch": 0.6710050614605929, + "grad_norm": 0.08652891474551064, + "learning_rate": 7.935297675171802e-06, + "loss": 0.0072, + "step": 1856 + }, + { + "epoch": 0.6713665943600867, + "grad_norm": 0.8990172757974001, + "learning_rate": 7.932877306121148e-06, + "loss": 0.1455, + "step": 1857 + }, + { + "epoch": 0.6717281272595806, + "grad_norm": 0.018376573949756246, + "learning_rate": 7.930455888860764e-06, + "loss": 0.0004, + "step": 1858 + }, + { + "epoch": 0.6720896601590745, + "grad_norm": 1.1404801889648204, + "learning_rate": 7.928033424256063e-06, + "loss": 0.0186, + "step": 1859 + }, + { + "epoch": 0.6724511930585684, + "grad_norm": 1.2254708756909385, + "learning_rate": 7.925609913172834e-06, + "loss": 0.2031, + "step": 1860 + }, + { + "epoch": 0.6728127259580622, + "grad_norm": 0.0913413511597702, + "learning_rate": 7.923185356477241e-06, + "loss": 0.0025, + "step": 1861 + }, + { + "epoch": 0.673174258857556, + "grad_norm": 0.5858598275337166, + "learning_rate": 7.920759755035818e-06, + "loss": 0.0752, + "step": 1862 + }, + { + "epoch": 0.6735357917570499, + "grad_norm": 0.0016499165235037395, + "learning_rate": 7.918333109715475e-06, + "loss": 0.0001, + "step": 1863 + }, + { + "epoch": 0.6738973246565437, + "grad_norm": 4.853373235185475, + "learning_rate": 7.915905421383494e-06, + "loss": 0.3672, + "step": 1864 + }, + { + "epoch": 0.6742588575560376, + "grad_norm": 0.024894234628182594, + "learning_rate": 7.913476690907532e-06, + "loss": 0.0003, + "step": 1865 + }, + { + "epoch": 0.6746203904555315, + "grad_norm": 0.04160324552069383, + "learning_rate": 7.911046919155614e-06, + "loss": 0.0007, + "step": 1866 + }, + { + "epoch": 0.6749819233550253, + "grad_norm": 0.9699829909238281, + "learning_rate": 7.908616106996143e-06, + "loss": 0.0286, + "step": 1867 + }, + { + "epoch": 0.6753434562545192, + "grad_norm": 0.20051918043396363, + "learning_rate": 7.906184255297887e-06, + "loss": 0.0058, + "step": 1868 + }, + { + "epoch": 0.675704989154013, + "grad_norm": 0.19003799076659894, + "learning_rate": 7.903751364929993e-06, + "loss": 0.0206, + "step": 1869 + }, + { + "epoch": 0.6760665220535069, + "grad_norm": 1.2893947306307334, + "learning_rate": 7.901317436761973e-06, + "loss": 0.0525, + "step": 1870 + }, + { + "epoch": 0.6764280549530007, + "grad_norm": 0.002516782341466223, + "learning_rate": 7.898882471663714e-06, + "loss": 0.0001, + "step": 1871 + }, + { + "epoch": 0.6767895878524945, + "grad_norm": 7.53552804998591, + "learning_rate": 7.896446470505473e-06, + "loss": 0.4434, + "step": 1872 + }, + { + "epoch": 0.6771511207519885, + "grad_norm": 5.3206824147260585, + "learning_rate": 7.894009434157873e-06, + "loss": 0.2041, + "step": 1873 + }, + { + "epoch": 0.6775126536514823, + "grad_norm": 1.0038906901279152, + "learning_rate": 7.89157136349191e-06, + "loss": 0.0432, + "step": 1874 + }, + { + "epoch": 0.6778741865509761, + "grad_norm": 0.3585396212926145, + "learning_rate": 7.889132259378954e-06, + "loss": 0.0476, + "step": 1875 + }, + { + "epoch": 0.67823571945047, + "grad_norm": 0.3179744065382294, + "learning_rate": 7.886692122690737e-06, + "loss": 0.0432, + "step": 1876 + }, + { + "epoch": 0.6785972523499638, + "grad_norm": 1.0607700905601072, + "learning_rate": 7.884250954299368e-06, + "loss": 0.0752, + "step": 1877 + }, + { + "epoch": 0.6789587852494577, + "grad_norm": 0.75714790002446, + "learning_rate": 7.881808755077314e-06, + "loss": 0.0258, + "step": 1878 + }, + { + "epoch": 0.6793203181489516, + "grad_norm": 1.2684033595242925, + "learning_rate": 7.87936552589742e-06, + "loss": 0.0957, + "step": 1879 + }, + { + "epoch": 0.6796818510484454, + "grad_norm": 0.015190251065350779, + "learning_rate": 7.876921267632894e-06, + "loss": 0.0004, + "step": 1880 + }, + { + "epoch": 0.6800433839479393, + "grad_norm": 0.19130092890180755, + "learning_rate": 7.874475981157315e-06, + "loss": 0.0286, + "step": 1881 + }, + { + "epoch": 0.6804049168474331, + "grad_norm": 0.8741379285640591, + "learning_rate": 7.872029667344626e-06, + "loss": 0.082, + "step": 1882 + }, + { + "epoch": 0.680766449746927, + "grad_norm": 0.32986145855616644, + "learning_rate": 7.86958232706914e-06, + "loss": 0.0391, + "step": 1883 + }, + { + "epoch": 0.6811279826464208, + "grad_norm": 0.16771454527525956, + "learning_rate": 7.867133961205536e-06, + "loss": 0.0041, + "step": 1884 + }, + { + "epoch": 0.6814895155459146, + "grad_norm": 0.20197130697530585, + "learning_rate": 7.86468457062886e-06, + "loss": 0.0317, + "step": 1885 + }, + { + "epoch": 0.6818510484454086, + "grad_norm": 0.006258383407966247, + "learning_rate": 7.862234156214523e-06, + "loss": 0.0002, + "step": 1886 + }, + { + "epoch": 0.6822125813449024, + "grad_norm": 0.6211462025463076, + "learning_rate": 7.859782718838302e-06, + "loss": 0.0525, + "step": 1887 + }, + { + "epoch": 0.6825741142443963, + "grad_norm": 0.2685473090136057, + "learning_rate": 7.857330259376341e-06, + "loss": 0.0317, + "step": 1888 + }, + { + "epoch": 0.6829356471438901, + "grad_norm": 0.3760943805589341, + "learning_rate": 7.854876778705147e-06, + "loss": 0.0082, + "step": 1889 + }, + { + "epoch": 0.6832971800433839, + "grad_norm": 0.0025370268428791774, + "learning_rate": 7.852422277701596e-06, + "loss": 0.0001, + "step": 1890 + }, + { + "epoch": 0.6836587129428778, + "grad_norm": 2.0375128438891243, + "learning_rate": 7.849966757242926e-06, + "loss": 0.0957, + "step": 1891 + }, + { + "epoch": 0.6840202458423716, + "grad_norm": 0.0986580499769701, + "learning_rate": 7.847510218206737e-06, + "loss": 0.0008, + "step": 1892 + }, + { + "epoch": 0.6843817787418656, + "grad_norm": 0.18642594565857523, + "learning_rate": 7.845052661470998e-06, + "loss": 0.0229, + "step": 1893 + }, + { + "epoch": 0.6847433116413594, + "grad_norm": 0.1946459836659337, + "learning_rate": 7.842594087914038e-06, + "loss": 0.0057, + "step": 1894 + }, + { + "epoch": 0.6851048445408532, + "grad_norm": 0.3310340689339737, + "learning_rate": 7.840134498414548e-06, + "loss": 0.0391, + "step": 1895 + }, + { + "epoch": 0.6854663774403471, + "grad_norm": 3.085469533082013, + "learning_rate": 7.837673893851591e-06, + "loss": 0.1035, + "step": 1896 + }, + { + "epoch": 0.6858279103398409, + "grad_norm": 0.0025548915254453948, + "learning_rate": 7.835212275104584e-06, + "loss": 0.0001, + "step": 1897 + }, + { + "epoch": 0.6861894432393347, + "grad_norm": 0.16756473072796524, + "learning_rate": 7.832749643053305e-06, + "loss": 0.0029, + "step": 1898 + }, + { + "epoch": 0.6865509761388287, + "grad_norm": 0.001468037193513863, + "learning_rate": 7.830285998577905e-06, + "loss": 0.0001, + "step": 1899 + }, + { + "epoch": 0.6869125090383225, + "grad_norm": 0.7120533405346874, + "learning_rate": 7.827821342558883e-06, + "loss": 0.0752, + "step": 1900 + }, + { + "epoch": 0.6872740419378164, + "grad_norm": 2.455491536341534, + "learning_rate": 7.825355675877111e-06, + "loss": 0.1035, + "step": 1901 + }, + { + "epoch": 0.6876355748373102, + "grad_norm": 0.0033415563193925244, + "learning_rate": 7.822888999413818e-06, + "loss": 0.0001, + "step": 1902 + }, + { + "epoch": 0.687997107736804, + "grad_norm": 0.01433459144761999, + "learning_rate": 7.82042131405059e-06, + "loss": 0.0005, + "step": 1903 + }, + { + "epoch": 0.6883586406362979, + "grad_norm": 0.02266926244711413, + "learning_rate": 7.817952620669383e-06, + "loss": 0.0002, + "step": 1904 + }, + { + "epoch": 0.6887201735357917, + "grad_norm": 0.16637335582882495, + "learning_rate": 7.8154829201525e-06, + "loss": 0.0229, + "step": 1905 + }, + { + "epoch": 0.6890817064352857, + "grad_norm": 0.16983501925550445, + "learning_rate": 7.813012213382618e-06, + "loss": 0.0206, + "step": 1906 + }, + { + "epoch": 0.6894432393347795, + "grad_norm": 0.18057009333826468, + "learning_rate": 7.810540501242765e-06, + "loss": 0.0231, + "step": 1907 + }, + { + "epoch": 0.6898047722342733, + "grad_norm": 0.9091595636129016, + "learning_rate": 7.808067784616328e-06, + "loss": 0.0889, + "step": 1908 + }, + { + "epoch": 0.6901663051337672, + "grad_norm": 0.023215286074940458, + "learning_rate": 7.80559406438706e-06, + "loss": 0.0005, + "step": 1909 + }, + { + "epoch": 0.690527838033261, + "grad_norm": 0.13725388917673434, + "learning_rate": 7.803119341439063e-06, + "loss": 0.0165, + "step": 1910 + }, + { + "epoch": 0.6908893709327549, + "grad_norm": 0.009305903915410614, + "learning_rate": 7.800643616656805e-06, + "loss": 0.0002, + "step": 1911 + }, + { + "epoch": 0.6912509038322487, + "grad_norm": 0.0008379829271826966, + "learning_rate": 7.79816689092511e-06, + "loss": 0.0, + "step": 1912 + }, + { + "epoch": 0.6916124367317426, + "grad_norm": 1.2867752637783645, + "learning_rate": 7.79568916512916e-06, + "loss": 0.0525, + "step": 1913 + }, + { + "epoch": 0.6919739696312365, + "grad_norm": 0.03359633431195336, + "learning_rate": 7.793210440154492e-06, + "loss": 0.0008, + "step": 1914 + }, + { + "epoch": 0.6923355025307303, + "grad_norm": 0.017695521294989365, + "learning_rate": 7.790730716887001e-06, + "loss": 0.0005, + "step": 1915 + }, + { + "epoch": 0.6926970354302241, + "grad_norm": 0.003428303454569928, + "learning_rate": 7.78824999621294e-06, + "loss": 0.0001, + "step": 1916 + }, + { + "epoch": 0.693058568329718, + "grad_norm": 0.0018448646312639713, + "learning_rate": 7.785768279018921e-06, + "loss": 0.0001, + "step": 1917 + }, + { + "epoch": 0.6934201012292118, + "grad_norm": 0.004342436419750446, + "learning_rate": 7.783285566191907e-06, + "loss": 0.0001, + "step": 1918 + }, + { + "epoch": 0.6937816341287057, + "grad_norm": 0.15274356108643364, + "learning_rate": 7.780801858619217e-06, + "loss": 0.0184, + "step": 1919 + }, + { + "epoch": 0.6941431670281996, + "grad_norm": 0.013318099830168375, + "learning_rate": 7.77831715718853e-06, + "loss": 0.0002, + "step": 1920 + }, + { + "epoch": 0.6945046999276934, + "grad_norm": 0.15433658361625516, + "learning_rate": 7.77583146278788e-06, + "loss": 0.0165, + "step": 1921 + }, + { + "epoch": 0.6948662328271873, + "grad_norm": 0.0038129007741385234, + "learning_rate": 7.773344776305648e-06, + "loss": 0.0001, + "step": 1922 + }, + { + "epoch": 0.6952277657266811, + "grad_norm": 0.8562330190857973, + "learning_rate": 7.77085709863058e-06, + "loss": 0.1367, + "step": 1923 + }, + { + "epoch": 0.695589298626175, + "grad_norm": 0.14300599561976587, + "learning_rate": 7.76836843065177e-06, + "loss": 0.0165, + "step": 1924 + }, + { + "epoch": 0.6959508315256688, + "grad_norm": 0.19344894710003632, + "learning_rate": 7.765878773258666e-06, + "loss": 0.0206, + "step": 1925 + }, + { + "epoch": 0.6963123644251626, + "grad_norm": 0.29376217274974387, + "learning_rate": 7.763388127341071e-06, + "loss": 0.0231, + "step": 1926 + }, + { + "epoch": 0.6966738973246566, + "grad_norm": 0.3511233451418111, + "learning_rate": 7.760896493789144e-06, + "loss": 0.0117, + "step": 1927 + }, + { + "epoch": 0.6970354302241504, + "grad_norm": 2.2709582486772075, + "learning_rate": 7.758403873493393e-06, + "loss": 0.0957, + "step": 1928 + }, + { + "epoch": 0.6973969631236443, + "grad_norm": 0.005148064711149245, + "learning_rate": 7.75591026734468e-06, + "loss": 0.0001, + "step": 1929 + }, + { + "epoch": 0.6977584960231381, + "grad_norm": 1.3685846346493968, + "learning_rate": 7.753415676234217e-06, + "loss": 0.0752, + "step": 1930 + }, + { + "epoch": 0.6981200289226319, + "grad_norm": 0.08509420458225012, + "learning_rate": 7.750920101053574e-06, + "loss": 0.0025, + "step": 1931 + }, + { + "epoch": 0.6984815618221258, + "grad_norm": 1.9945554483135381, + "learning_rate": 7.748423542694668e-06, + "loss": 0.1279, + "step": 1932 + }, + { + "epoch": 0.6988430947216197, + "grad_norm": 0.005517029874478681, + "learning_rate": 7.745926002049766e-06, + "loss": 0.0001, + "step": 1933 + }, + { + "epoch": 0.6992046276211136, + "grad_norm": 0.8082290919085043, + "learning_rate": 7.743427480011491e-06, + "loss": 0.0576, + "step": 1934 + }, + { + "epoch": 0.6995661605206074, + "grad_norm": 0.2646648840757841, + "learning_rate": 7.740927977472814e-06, + "loss": 0.0004, + "step": 1935 + }, + { + "epoch": 0.6999276934201012, + "grad_norm": 0.884288794850231, + "learning_rate": 7.738427495327057e-06, + "loss": 0.0889, + "step": 1936 + }, + { + "epoch": 0.7002892263195951, + "grad_norm": 0.1271523761588851, + "learning_rate": 7.73592603446789e-06, + "loss": 0.0028, + "step": 1937 + }, + { + "epoch": 0.7006507592190889, + "grad_norm": 0.17486743954793843, + "learning_rate": 7.733423595789337e-06, + "loss": 0.0184, + "step": 1938 + }, + { + "epoch": 0.7010122921185827, + "grad_norm": 0.02878862494461276, + "learning_rate": 7.730920180185765e-06, + "loss": 0.001, + "step": 1939 + }, + { + "epoch": 0.7013738250180767, + "grad_norm": 1.232230713918852, + "learning_rate": 7.7284157885519e-06, + "loss": 0.0889, + "step": 1940 + }, + { + "epoch": 0.7017353579175705, + "grad_norm": 1.413579708537534, + "learning_rate": 7.725910421782808e-06, + "loss": 0.0752, + "step": 1941 + }, + { + "epoch": 0.7020968908170644, + "grad_norm": 1.1103126083889103, + "learning_rate": 7.723404080773904e-06, + "loss": 0.0231, + "step": 1942 + }, + { + "epoch": 0.7024584237165582, + "grad_norm": 0.005030709934846639, + "learning_rate": 7.720896766420957e-06, + "loss": 0.0001, + "step": 1943 + }, + { + "epoch": 0.702819956616052, + "grad_norm": 0.010756894056394245, + "learning_rate": 7.71838847962008e-06, + "loss": 0.0003, + "step": 1944 + }, + { + "epoch": 0.7031814895155459, + "grad_norm": 0.021180830395068934, + "learning_rate": 7.715879221267736e-06, + "loss": 0.0005, + "step": 1945 + }, + { + "epoch": 0.7035430224150397, + "grad_norm": 0.0018870410923548268, + "learning_rate": 7.713368992260731e-06, + "loss": 0.0001, + "step": 1946 + }, + { + "epoch": 0.7039045553145337, + "grad_norm": 0.7922259041241697, + "learning_rate": 7.710857793496218e-06, + "loss": 0.063, + "step": 1947 + }, + { + "epoch": 0.7042660882140275, + "grad_norm": 0.006934756070320845, + "learning_rate": 7.708345625871703e-06, + "loss": 0.0002, + "step": 1948 + }, + { + "epoch": 0.7046276211135213, + "grad_norm": 1.1860797173617172, + "learning_rate": 7.705832490285034e-06, + "loss": 0.0391, + "step": 1949 + }, + { + "epoch": 0.7049891540130152, + "grad_norm": 0.2813429449516875, + "learning_rate": 7.703318387634403e-06, + "loss": 0.0231, + "step": 1950 + }, + { + "epoch": 0.705350686912509, + "grad_norm": 0.0010503254119625211, + "learning_rate": 7.700803318818352e-06, + "loss": 0.0, + "step": 1951 + }, + { + "epoch": 0.7057122198120029, + "grad_norm": 0.3155289450504141, + "learning_rate": 7.698287284735763e-06, + "loss": 0.0092, + "step": 1952 + }, + { + "epoch": 0.7060737527114967, + "grad_norm": 0.028132179903105937, + "learning_rate": 7.695770286285869e-06, + "loss": 0.0007, + "step": 1953 + }, + { + "epoch": 0.7064352856109906, + "grad_norm": 0.2524118360198448, + "learning_rate": 7.693252324368245e-06, + "loss": 0.0286, + "step": 1954 + }, + { + "epoch": 0.7067968185104845, + "grad_norm": 3.6367386013304666, + "learning_rate": 7.690733399882805e-06, + "loss": 0.1738, + "step": 1955 + }, + { + "epoch": 0.7071583514099783, + "grad_norm": 2.9594252538761365, + "learning_rate": 7.688213513729819e-06, + "loss": 0.4434, + "step": 1956 + }, + { + "epoch": 0.7075198843094722, + "grad_norm": 0.01441585002260335, + "learning_rate": 7.685692666809889e-06, + "loss": 0.0002, + "step": 1957 + }, + { + "epoch": 0.707881417208966, + "grad_norm": 0.4591712005546323, + "learning_rate": 7.683170860023967e-06, + "loss": 0.0354, + "step": 1958 + }, + { + "epoch": 0.7082429501084598, + "grad_norm": 0.19591522528983235, + "learning_rate": 7.680648094273346e-06, + "loss": 0.0206, + "step": 1959 + }, + { + "epoch": 0.7086044830079538, + "grad_norm": 0.018761711025618077, + "learning_rate": 7.67812437045966e-06, + "loss": 0.0005, + "step": 1960 + }, + { + "epoch": 0.7089660159074476, + "grad_norm": 0.0010917270831452975, + "learning_rate": 7.675599689484892e-06, + "loss": 0.0, + "step": 1961 + }, + { + "epoch": 0.7093275488069414, + "grad_norm": 0.003434568218030735, + "learning_rate": 7.673074052251358e-06, + "loss": 0.0001, + "step": 1962 + }, + { + "epoch": 0.7096890817064353, + "grad_norm": 0.5969714920848971, + "learning_rate": 7.670547459661723e-06, + "loss": 0.0258, + "step": 1963 + }, + { + "epoch": 0.7100506146059291, + "grad_norm": 0.0024400453352642574, + "learning_rate": 7.66801991261899e-06, + "loss": 0.0001, + "step": 1964 + }, + { + "epoch": 0.710412147505423, + "grad_norm": 1.0874175721149835, + "learning_rate": 7.6654914120265e-06, + "loss": 0.1367, + "step": 1965 + }, + { + "epoch": 0.7107736804049168, + "grad_norm": 0.3297415336427687, + "learning_rate": 7.662961958787946e-06, + "loss": 0.0231, + "step": 1966 + }, + { + "epoch": 0.7111352133044107, + "grad_norm": 0.8575288478502708, + "learning_rate": 7.660431553807348e-06, + "loss": 0.0688, + "step": 1967 + }, + { + "epoch": 0.7114967462039046, + "grad_norm": 0.5731515171776669, + "learning_rate": 7.657900197989072e-06, + "loss": 0.0258, + "step": 1968 + }, + { + "epoch": 0.7118582791033984, + "grad_norm": 0.3402222263922816, + "learning_rate": 7.655367892237831e-06, + "loss": 0.0286, + "step": 1969 + }, + { + "epoch": 0.7122198120028923, + "grad_norm": 0.4851255567883175, + "learning_rate": 7.652834637458662e-06, + "loss": 0.0231, + "step": 1970 + }, + { + "epoch": 0.7125813449023861, + "grad_norm": 0.13143541876944856, + "learning_rate": 7.650300434556954e-06, + "loss": 0.0036, + "step": 1971 + }, + { + "epoch": 0.7129428778018799, + "grad_norm": 0.19433500705100654, + "learning_rate": 7.647765284438432e-06, + "loss": 0.0206, + "step": 1972 + }, + { + "epoch": 0.7133044107013738, + "grad_norm": 0.7026907402976055, + "learning_rate": 7.645229188009153e-06, + "loss": 0.1279, + "step": 1973 + }, + { + "epoch": 0.7136659436008677, + "grad_norm": 1.0036392573270787, + "learning_rate": 7.642692146175524e-06, + "loss": 0.0354, + "step": 1974 + }, + { + "epoch": 0.7140274765003616, + "grad_norm": 0.2026724182619791, + "learning_rate": 7.640154159844275e-06, + "loss": 0.0184, + "step": 1975 + }, + { + "epoch": 0.7143890093998554, + "grad_norm": 1.3487479744614386, + "learning_rate": 7.63761522992249e-06, + "loss": 0.0752, + "step": 1976 + }, + { + "epoch": 0.7147505422993492, + "grad_norm": 0.5186002644771338, + "learning_rate": 7.635075357317577e-06, + "loss": 0.0317, + "step": 1977 + }, + { + "epoch": 0.7151120751988431, + "grad_norm": 0.12777142555003612, + "learning_rate": 7.632534542937287e-06, + "loss": 0.0131, + "step": 1978 + }, + { + "epoch": 0.7154736080983369, + "grad_norm": 0.6200587016217616, + "learning_rate": 7.629992787689708e-06, + "loss": 0.0165, + "step": 1979 + }, + { + "epoch": 0.7158351409978309, + "grad_norm": 0.02098178395301597, + "learning_rate": 7.627450092483263e-06, + "loss": 0.0007, + "step": 1980 + }, + { + "epoch": 0.7161966738973247, + "grad_norm": 1.3428017939317742, + "learning_rate": 7.624906458226708e-06, + "loss": 0.1191, + "step": 1981 + }, + { + "epoch": 0.7165582067968185, + "grad_norm": 0.9745759392059615, + "learning_rate": 7.6223618858291374e-06, + "loss": 0.0432, + "step": 1982 + }, + { + "epoch": 0.7169197396963124, + "grad_norm": 0.11611602461618466, + "learning_rate": 7.619816376199984e-06, + "loss": 0.0131, + "step": 1983 + }, + { + "epoch": 0.7172812725958062, + "grad_norm": 1.2324710324467227, + "learning_rate": 7.617269930249011e-06, + "loss": 0.0579, + "step": 1984 + }, + { + "epoch": 0.7176428054953, + "grad_norm": 0.3090047541829667, + "learning_rate": 7.614722548886316e-06, + "loss": 0.0231, + "step": 1985 + }, + { + "epoch": 0.7180043383947939, + "grad_norm": 0.1378903881287269, + "learning_rate": 7.612174233022336e-06, + "loss": 0.0165, + "step": 1986 + }, + { + "epoch": 0.7183658712942878, + "grad_norm": 0.5783412949167592, + "learning_rate": 7.609624983567834e-06, + "loss": 0.0165, + "step": 1987 + }, + { + "epoch": 0.7187274041937817, + "grad_norm": 0.0029589957704427915, + "learning_rate": 7.607074801433914e-06, + "loss": 0.0001, + "step": 1988 + }, + { + "epoch": 0.7190889370932755, + "grad_norm": 0.09183359106286683, + "learning_rate": 7.60452368753201e-06, + "loss": 0.0103, + "step": 1989 + }, + { + "epoch": 0.7194504699927693, + "grad_norm": 0.7744878150278995, + "learning_rate": 7.601971642773891e-06, + "loss": 0.0525, + "step": 1990 + }, + { + "epoch": 0.7198120028922632, + "grad_norm": 0.0015214632234208949, + "learning_rate": 7.599418668071656e-06, + "loss": 0.0, + "step": 1991 + }, + { + "epoch": 0.720173535791757, + "grad_norm": 0.1443477809610979, + "learning_rate": 7.596864764337735e-06, + "loss": 0.0064, + "step": 1992 + }, + { + "epoch": 0.720535068691251, + "grad_norm": 0.06872840500734788, + "learning_rate": 7.594309932484898e-06, + "loss": 0.0012, + "step": 1993 + }, + { + "epoch": 0.7208966015907448, + "grad_norm": 0.08491636803497614, + "learning_rate": 7.5917541734262376e-06, + "loss": 0.0092, + "step": 1994 + }, + { + "epoch": 0.7212581344902386, + "grad_norm": 1.329237548070853, + "learning_rate": 7.589197488075183e-06, + "loss": 0.0391, + "step": 1995 + }, + { + "epoch": 0.7216196673897325, + "grad_norm": 0.12756020052125083, + "learning_rate": 7.586639877345492e-06, + "loss": 0.0146, + "step": 1996 + }, + { + "epoch": 0.7219812002892263, + "grad_norm": 0.26035636494738956, + "learning_rate": 7.584081342151255e-06, + "loss": 0.0073, + "step": 1997 + }, + { + "epoch": 0.7223427331887202, + "grad_norm": 0.023949230573506775, + "learning_rate": 7.581521883406893e-06, + "loss": 0.0005, + "step": 1998 + }, + { + "epoch": 0.722704266088214, + "grad_norm": 0.12029328963817947, + "learning_rate": 7.578961502027153e-06, + "loss": 0.0031, + "step": 1999 + }, + { + "epoch": 0.7230657989877078, + "grad_norm": 1.0233209228795304, + "learning_rate": 7.576400198927117e-06, + "loss": 0.2031, + "step": 2000 + }, + { + "epoch": 0.7234273318872018, + "grad_norm": 1.0676304817449915, + "learning_rate": 7.5738379750221936e-06, + "loss": 0.063, + "step": 2001 + }, + { + "epoch": 0.7237888647866956, + "grad_norm": 0.05926956921375649, + "learning_rate": 7.571274831228119e-06, + "loss": 0.0006, + "step": 2002 + }, + { + "epoch": 0.7241503976861895, + "grad_norm": 3.9491755153301993, + "learning_rate": 7.568710768460965e-06, + "loss": 0.2461, + "step": 2003 + }, + { + "epoch": 0.7245119305856833, + "grad_norm": 0.1319406443471826, + "learning_rate": 7.566145787637125e-06, + "loss": 0.0115, + "step": 2004 + }, + { + "epoch": 0.7248734634851771, + "grad_norm": 0.17058089151276304, + "learning_rate": 7.56357988967332e-06, + "loss": 0.0103, + "step": 2005 + }, + { + "epoch": 0.725234996384671, + "grad_norm": 0.012453945274317115, + "learning_rate": 7.561013075486605e-06, + "loss": 0.0003, + "step": 2006 + }, + { + "epoch": 0.7255965292841648, + "grad_norm": 1.4921171714497563, + "learning_rate": 7.5584453459943565e-06, + "loss": 0.063, + "step": 2007 + }, + { + "epoch": 0.7259580621836587, + "grad_norm": 0.010265156240195557, + "learning_rate": 7.5558767021142795e-06, + "loss": 0.0003, + "step": 2008 + }, + { + "epoch": 0.7263195950831526, + "grad_norm": 0.0012682287870268067, + "learning_rate": 7.553307144764412e-06, + "loss": 0.0, + "step": 2009 + }, + { + "epoch": 0.7266811279826464, + "grad_norm": 0.12389226583013782, + "learning_rate": 7.550736674863108e-06, + "loss": 0.0131, + "step": 2010 + }, + { + "epoch": 0.7270426608821403, + "grad_norm": 0.36425610612450504, + "learning_rate": 7.548165293329055e-06, + "loss": 0.0286, + "step": 2011 + }, + { + "epoch": 0.7274041937816341, + "grad_norm": 0.13910002327196677, + "learning_rate": 7.545593001081264e-06, + "loss": 0.0082, + "step": 2012 + }, + { + "epoch": 0.7277657266811279, + "grad_norm": 0.9428010156519978, + "learning_rate": 7.543019799039071e-06, + "loss": 0.0525, + "step": 2013 + }, + { + "epoch": 0.7281272595806219, + "grad_norm": 0.29522232541968985, + "learning_rate": 7.54044568812214e-06, + "loss": 0.0184, + "step": 2014 + }, + { + "epoch": 0.7284887924801157, + "grad_norm": 0.24446631584904452, + "learning_rate": 7.537870669250453e-06, + "loss": 0.0206, + "step": 2015 + }, + { + "epoch": 0.7288503253796096, + "grad_norm": 0.3978543172129962, + "learning_rate": 7.535294743344328e-06, + "loss": 0.0286, + "step": 2016 + }, + { + "epoch": 0.7292118582791034, + "grad_norm": 0.11102343806888686, + "learning_rate": 7.532717911324394e-06, + "loss": 0.0082, + "step": 2017 + }, + { + "epoch": 0.7295733911785972, + "grad_norm": 2.1306028158521135, + "learning_rate": 7.530140174111616e-06, + "loss": 0.1641, + "step": 2018 + }, + { + "epoch": 0.7299349240780911, + "grad_norm": 0.39530668947979497, + "learning_rate": 7.527561532627272e-06, + "loss": 0.0286, + "step": 2019 + }, + { + "epoch": 0.7302964569775849, + "grad_norm": 0.20591299179938377, + "learning_rate": 7.5249819877929685e-06, + "loss": 0.0092, + "step": 2020 + }, + { + "epoch": 0.7306579898770789, + "grad_norm": 0.11983057112840388, + "learning_rate": 7.5224015405306384e-06, + "loss": 0.0008, + "step": 2021 + }, + { + "epoch": 0.7310195227765727, + "grad_norm": 0.04908101795846612, + "learning_rate": 7.519820191762528e-06, + "loss": 0.0007, + "step": 2022 + }, + { + "epoch": 0.7313810556760665, + "grad_norm": 0.8432901192155383, + "learning_rate": 7.517237942411213e-06, + "loss": 0.0206, + "step": 2023 + }, + { + "epoch": 0.7317425885755604, + "grad_norm": 1.118485652006064, + "learning_rate": 7.514654793399589e-06, + "loss": 0.0815, + "step": 2024 + }, + { + "epoch": 0.7321041214750542, + "grad_norm": 0.4822358332796937, + "learning_rate": 7.512070745650872e-06, + "loss": 0.0117, + "step": 2025 + }, + { + "epoch": 0.7324656543745481, + "grad_norm": 1.0901438390776452, + "learning_rate": 7.5094858000886005e-06, + "loss": 0.063, + "step": 2026 + }, + { + "epoch": 0.732827187274042, + "grad_norm": 0.865552450752716, + "learning_rate": 7.506899957636634e-06, + "loss": 0.0688, + "step": 2027 + }, + { + "epoch": 0.7331887201735358, + "grad_norm": 0.1969660210560107, + "learning_rate": 7.504313219219153e-06, + "loss": 0.0184, + "step": 2028 + }, + { + "epoch": 0.7335502530730297, + "grad_norm": 0.9892716334134939, + "learning_rate": 7.501725585760654e-06, + "loss": 0.1836, + "step": 2029 + }, + { + "epoch": 0.7339117859725235, + "grad_norm": 0.28407522933565454, + "learning_rate": 7.499137058185959e-06, + "loss": 0.0206, + "step": 2030 + }, + { + "epoch": 0.7342733188720173, + "grad_norm": 0.09271493971992761, + "learning_rate": 7.496547637420208e-06, + "loss": 0.0081, + "step": 2031 + }, + { + "epoch": 0.7346348517715112, + "grad_norm": 0.11257326899314352, + "learning_rate": 7.493957324388856e-06, + "loss": 0.0032, + "step": 2032 + }, + { + "epoch": 0.734996384671005, + "grad_norm": 0.013385935939067728, + "learning_rate": 7.491366120017684e-06, + "loss": 0.0004, + "step": 2033 + }, + { + "epoch": 0.735357917570499, + "grad_norm": 0.04703219773660298, + "learning_rate": 7.488774025232788e-06, + "loss": 0.0014, + "step": 2034 + }, + { + "epoch": 0.7357194504699928, + "grad_norm": 0.5896794867259714, + "learning_rate": 7.486181040960579e-06, + "loss": 0.0131, + "step": 2035 + }, + { + "epoch": 0.7360809833694866, + "grad_norm": 0.11233023241262621, + "learning_rate": 7.483587168127791e-06, + "loss": 0.0103, + "step": 2036 + }, + { + "epoch": 0.7364425162689805, + "grad_norm": 0.34743665188466255, + "learning_rate": 7.480992407661473e-06, + "loss": 0.0255, + "step": 2037 + }, + { + "epoch": 0.7368040491684743, + "grad_norm": 0.08718092815923963, + "learning_rate": 7.478396760488992e-06, + "loss": 0.0072, + "step": 2038 + }, + { + "epoch": 0.7371655820679682, + "grad_norm": 1.0117028604683875, + "learning_rate": 7.475800227538032e-06, + "loss": 0.0354, + "step": 2039 + }, + { + "epoch": 0.737527114967462, + "grad_norm": 41.77287314882658, + "learning_rate": 7.473202809736593e-06, + "loss": 0.2988, + "step": 2040 + }, + { + "epoch": 0.7378886478669558, + "grad_norm": 0.010105941822420727, + "learning_rate": 7.470604508012992e-06, + "loss": 0.0002, + "step": 2041 + }, + { + "epoch": 0.7382501807664498, + "grad_norm": 0.904856586015947, + "learning_rate": 7.46800532329586e-06, + "loss": 0.1934, + "step": 2042 + }, + { + "epoch": 0.7386117136659436, + "grad_norm": 0.2837767139306754, + "learning_rate": 7.465405256514149e-06, + "loss": 0.0255, + "step": 2043 + }, + { + "epoch": 0.7389732465654375, + "grad_norm": 0.3649075886886255, + "learning_rate": 7.462804308597118e-06, + "loss": 0.0184, + "step": 2044 + }, + { + "epoch": 0.7393347794649313, + "grad_norm": 0.0009178593080307801, + "learning_rate": 7.460202480474346e-06, + "loss": 0.0, + "step": 2045 + }, + { + "epoch": 0.7396963123644251, + "grad_norm": 1.0905695397999302, + "learning_rate": 7.457599773075728e-06, + "loss": 0.0525, + "step": 2046 + }, + { + "epoch": 0.740057845263919, + "grad_norm": 0.30467028722416667, + "learning_rate": 7.454996187331469e-06, + "loss": 0.0258, + "step": 2047 + }, + { + "epoch": 0.7404193781634129, + "grad_norm": 0.009898207313600684, + "learning_rate": 7.452391724172091e-06, + "loss": 0.0003, + "step": 2048 + }, + { + "epoch": 0.7407809110629068, + "grad_norm": 0.0009188786942729724, + "learning_rate": 7.449786384528428e-06, + "loss": 0.0, + "step": 2049 + }, + { + "epoch": 0.7411424439624006, + "grad_norm": 0.6790764838032568, + "learning_rate": 7.447180169331628e-06, + "loss": 0.0525, + "step": 2050 + }, + { + "epoch": 0.7415039768618944, + "grad_norm": 0.1148314632664774, + "learning_rate": 7.444573079513153e-06, + "loss": 0.0115, + "step": 2051 + }, + { + "epoch": 0.7418655097613883, + "grad_norm": 0.14340586192771776, + "learning_rate": 7.441965116004775e-06, + "loss": 0.0146, + "step": 2052 + }, + { + "epoch": 0.7422270426608821, + "grad_norm": 0.11747250838368868, + "learning_rate": 7.4393562797385795e-06, + "loss": 0.0117, + "step": 2053 + }, + { + "epoch": 0.7425885755603759, + "grad_norm": 0.10799776709994027, + "learning_rate": 7.436746571646965e-06, + "loss": 0.0131, + "step": 2054 + }, + { + "epoch": 0.7429501084598699, + "grad_norm": 0.09378212138635446, + "learning_rate": 7.434135992662639e-06, + "loss": 0.0092, + "step": 2055 + }, + { + "epoch": 0.7433116413593637, + "grad_norm": 0.00184359092415366, + "learning_rate": 7.431524543718624e-06, + "loss": 0.0001, + "step": 2056 + }, + { + "epoch": 0.7436731742588576, + "grad_norm": 0.0816118535741427, + "learning_rate": 7.42891222574825e-06, + "loss": 0.0035, + "step": 2057 + }, + { + "epoch": 0.7440347071583514, + "grad_norm": 0.9533416740826345, + "learning_rate": 7.426299039685159e-06, + "loss": 0.1553, + "step": 2058 + }, + { + "epoch": 0.7443962400578452, + "grad_norm": 0.17826138694873644, + "learning_rate": 7.4236849864633034e-06, + "loss": 0.0092, + "step": 2059 + }, + { + "epoch": 0.7447577729573391, + "grad_norm": 0.7678506514387755, + "learning_rate": 7.421070067016945e-06, + "loss": 0.1934, + "step": 2060 + }, + { + "epoch": 0.745119305856833, + "grad_norm": 0.17448509982293542, + "learning_rate": 7.418454282280655e-06, + "loss": 0.0103, + "step": 2061 + }, + { + "epoch": 0.7454808387563269, + "grad_norm": 0.05927184421515355, + "learning_rate": 7.415837633189316e-06, + "loss": 0.0015, + "step": 2062 + }, + { + "epoch": 0.7458423716558207, + "grad_norm": 0.3493500322267116, + "learning_rate": 7.413220120678115e-06, + "loss": 0.0184, + "step": 2063 + }, + { + "epoch": 0.7462039045553145, + "grad_norm": 0.8545416899192378, + "learning_rate": 7.410601745682554e-06, + "loss": 0.1836, + "step": 2064 + }, + { + "epoch": 0.7465654374548084, + "grad_norm": 0.02218243924478661, + "learning_rate": 7.407982509138436e-06, + "loss": 0.0003, + "step": 2065 + }, + { + "epoch": 0.7469269703543022, + "grad_norm": 0.07401798634384298, + "learning_rate": 7.405362411981879e-06, + "loss": 0.0028, + "step": 2066 + }, + { + "epoch": 0.7472885032537961, + "grad_norm": 0.1384219792935683, + "learning_rate": 7.402741455149303e-06, + "loss": 0.0165, + "step": 2067 + }, + { + "epoch": 0.74765003615329, + "grad_norm": 0.09310383573585125, + "learning_rate": 7.400119639577439e-06, + "loss": 0.0103, + "step": 2068 + }, + { + "epoch": 0.7480115690527838, + "grad_norm": 1.0479494550836177, + "learning_rate": 7.397496966203321e-06, + "loss": 0.1738, + "step": 2069 + }, + { + "epoch": 0.7483731019522777, + "grad_norm": 3.7233984403485474, + "learning_rate": 7.394873435964294e-06, + "loss": 0.1738, + "step": 2070 + }, + { + "epoch": 0.7487346348517715, + "grad_norm": 0.25223082543967484, + "learning_rate": 7.392249049798006e-06, + "loss": 0.0231, + "step": 2071 + }, + { + "epoch": 0.7490961677512654, + "grad_norm": 0.018910665446650352, + "learning_rate": 7.389623808642412e-06, + "loss": 0.0005, + "step": 2072 + }, + { + "epoch": 0.7494577006507592, + "grad_norm": 0.029190311476981107, + "learning_rate": 7.386997713435774e-06, + "loss": 0.0007, + "step": 2073 + }, + { + "epoch": 0.749819233550253, + "grad_norm": 0.21121616208363106, + "learning_rate": 7.384370765116657e-06, + "loss": 0.0057, + "step": 2074 + }, + { + "epoch": 0.750180766449747, + "grad_norm": 0.06258537633957925, + "learning_rate": 7.38174296462393e-06, + "loss": 0.0007, + "step": 2075 + }, + { + "epoch": 0.7505422993492408, + "grad_norm": 0.09602380109668353, + "learning_rate": 7.379114312896772e-06, + "loss": 0.0131, + "step": 2076 + }, + { + "epoch": 0.7509038322487346, + "grad_norm": 65.67409809165956, + "learning_rate": 7.37648481087466e-06, + "loss": 2.4375, + "step": 2077 + }, + { + "epoch": 0.7512653651482285, + "grad_norm": 0.602242209428168, + "learning_rate": 7.3738544594973785e-06, + "loss": 0.1641, + "step": 2078 + }, + { + "epoch": 0.7516268980477223, + "grad_norm": 0.4070543468918074, + "learning_rate": 7.371223259705014e-06, + "loss": 0.0286, + "step": 2079 + }, + { + "epoch": 0.7519884309472162, + "grad_norm": 0.16774627905591377, + "learning_rate": 7.368591212437957e-06, + "loss": 0.0051, + "step": 2080 + }, + { + "epoch": 0.75234996384671, + "grad_norm": 0.12804081033065182, + "learning_rate": 7.3659583186369e-06, + "loss": 0.0146, + "step": 2081 + }, + { + "epoch": 0.7527114967462039, + "grad_norm": 1.3523374895464397, + "learning_rate": 7.36332457924284e-06, + "loss": 0.0432, + "step": 2082 + }, + { + "epoch": 0.7530730296456978, + "grad_norm": 0.17950389465361544, + "learning_rate": 7.360689995197073e-06, + "loss": 0.0051, + "step": 2083 + }, + { + "epoch": 0.7534345625451916, + "grad_norm": 0.0022953385242821395, + "learning_rate": 7.3580545674412e-06, + "loss": 0.0001, + "step": 2084 + }, + { + "epoch": 0.7537960954446855, + "grad_norm": 0.19835387091148185, + "learning_rate": 7.355418296917123e-06, + "loss": 0.0146, + "step": 2085 + }, + { + "epoch": 0.7541576283441793, + "grad_norm": 0.0018157492617375459, + "learning_rate": 7.352781184567044e-06, + "loss": 0.0001, + "step": 2086 + }, + { + "epoch": 0.7545191612436731, + "grad_norm": 0.1860956056317618, + "learning_rate": 7.350143231333465e-06, + "loss": 0.0255, + "step": 2087 + }, + { + "epoch": 0.754880694143167, + "grad_norm": 0.22364434306275058, + "learning_rate": 7.347504438159193e-06, + "loss": 0.0255, + "step": 2088 + }, + { + "epoch": 0.7552422270426609, + "grad_norm": 0.15225738678785108, + "learning_rate": 7.344864805987329e-06, + "loss": 0.0165, + "step": 2089 + }, + { + "epoch": 0.7556037599421548, + "grad_norm": 0.31190474074847324, + "learning_rate": 7.342224335761277e-06, + "loss": 0.0258, + "step": 2090 + }, + { + "epoch": 0.7559652928416486, + "grad_norm": 0.061937119653506394, + "learning_rate": 7.339583028424744e-06, + "loss": 0.0012, + "step": 2091 + }, + { + "epoch": 0.7563268257411424, + "grad_norm": 6.669013987999024, + "learning_rate": 7.336940884921728e-06, + "loss": 0.4199, + "step": 2092 + }, + { + "epoch": 0.7566883586406363, + "grad_norm": 0.13049373164035455, + "learning_rate": 7.334297906196535e-06, + "loss": 0.0081, + "step": 2093 + }, + { + "epoch": 0.7570498915401301, + "grad_norm": 0.5826121913564084, + "learning_rate": 7.331654093193763e-06, + "loss": 0.1738, + "step": 2094 + }, + { + "epoch": 0.7574114244396241, + "grad_norm": 0.43508497057895146, + "learning_rate": 7.329009446858308e-06, + "loss": 0.0117, + "step": 2095 + }, + { + "epoch": 0.7577729573391179, + "grad_norm": 0.7370805048630696, + "learning_rate": 7.326363968135371e-06, + "loss": 0.0206, + "step": 2096 + }, + { + "epoch": 0.7581344902386117, + "grad_norm": 0.23379551073317736, + "learning_rate": 7.32371765797044e-06, + "loss": 0.0205, + "step": 2097 + }, + { + "epoch": 0.7584960231381056, + "grad_norm": 0.1469908561941192, + "learning_rate": 7.321070517309311e-06, + "loss": 0.004, + "step": 2098 + }, + { + "epoch": 0.7588575560375994, + "grad_norm": 0.0014435647774253704, + "learning_rate": 7.318422547098068e-06, + "loss": 0.0, + "step": 2099 + }, + { + "epoch": 0.7592190889370932, + "grad_norm": 0.3364662844684319, + "learning_rate": 7.315773748283095e-06, + "loss": 0.0184, + "step": 2100 + }, + { + "epoch": 0.7595806218365871, + "grad_norm": 0.26423346506353634, + "learning_rate": 7.313124121811074e-06, + "loss": 0.0206, + "step": 2101 + }, + { + "epoch": 0.759942154736081, + "grad_norm": 0.25249243473943783, + "learning_rate": 7.310473668628979e-06, + "loss": 0.0286, + "step": 2102 + }, + { + "epoch": 0.7603036876355749, + "grad_norm": 1.2360922880328802, + "learning_rate": 7.307822389684085e-06, + "loss": 0.0432, + "step": 2103 + }, + { + "epoch": 0.7606652205350687, + "grad_norm": 0.1554850789192601, + "learning_rate": 7.3051702859239525e-06, + "loss": 0.0165, + "step": 2104 + }, + { + "epoch": 0.7610267534345625, + "grad_norm": 0.22580232172537773, + "learning_rate": 7.3025173582964484e-06, + "loss": 0.0229, + "step": 2105 + }, + { + "epoch": 0.7613882863340564, + "grad_norm": 2.9003113858200447, + "learning_rate": 7.299863607749727e-06, + "loss": 0.1934, + "step": 2106 + }, + { + "epoch": 0.7617498192335502, + "grad_norm": 0.21962684021543571, + "learning_rate": 7.297209035232235e-06, + "loss": 0.0229, + "step": 2107 + }, + { + "epoch": 0.7621113521330442, + "grad_norm": 0.15967527755660937, + "learning_rate": 7.294553641692721e-06, + "loss": 0.0117, + "step": 2108 + }, + { + "epoch": 0.762472885032538, + "grad_norm": 1.1371884578952918, + "learning_rate": 7.291897428080218e-06, + "loss": 0.1738, + "step": 2109 + }, + { + "epoch": 0.7628344179320318, + "grad_norm": 0.31801831089858223, + "learning_rate": 7.289240395344059e-06, + "loss": 0.0146, + "step": 2110 + }, + { + "epoch": 0.7631959508315257, + "grad_norm": 0.07230128726757634, + "learning_rate": 7.2865825444338656e-06, + "loss": 0.0015, + "step": 2111 + }, + { + "epoch": 0.7635574837310195, + "grad_norm": 0.4178670994746391, + "learning_rate": 7.283923876299552e-06, + "loss": 0.0082, + "step": 2112 + }, + { + "epoch": 0.7639190166305134, + "grad_norm": 0.11678867954495922, + "learning_rate": 7.28126439189133e-06, + "loss": 0.0092, + "step": 2113 + }, + { + "epoch": 0.7642805495300072, + "grad_norm": 0.11022111982585235, + "learning_rate": 7.278604092159694e-06, + "loss": 0.0165, + "step": 2114 + }, + { + "epoch": 0.764642082429501, + "grad_norm": 0.0548617665067717, + "learning_rate": 7.275942978055436e-06, + "loss": 0.0007, + "step": 2115 + }, + { + "epoch": 0.765003615328995, + "grad_norm": 0.7770551783345011, + "learning_rate": 7.2732810505296395e-06, + "loss": 0.0146, + "step": 2116 + }, + { + "epoch": 0.7653651482284888, + "grad_norm": 0.4584618786383039, + "learning_rate": 7.270618310533672e-06, + "loss": 0.0258, + "step": 2117 + }, + { + "epoch": 0.7657266811279827, + "grad_norm": 0.8431318318793204, + "learning_rate": 7.267954759019203e-06, + "loss": 0.0476, + "step": 2118 + }, + { + "epoch": 0.7660882140274765, + "grad_norm": 0.10041237380249718, + "learning_rate": 7.2652903969381805e-06, + "loss": 0.0131, + "step": 2119 + }, + { + "epoch": 0.7664497469269703, + "grad_norm": 0.14871976236431933, + "learning_rate": 7.262625225242848e-06, + "loss": 0.0031, + "step": 2120 + }, + { + "epoch": 0.7668112798264642, + "grad_norm": 4.504856647486805, + "learning_rate": 7.259959244885738e-06, + "loss": 0.1455, + "step": 2121 + }, + { + "epoch": 0.767172812725958, + "grad_norm": 0.16281907467179244, + "learning_rate": 7.257292456819669e-06, + "loss": 0.0092, + "step": 2122 + }, + { + "epoch": 0.7675343456254519, + "grad_norm": 0.6808997032563496, + "learning_rate": 7.254624861997754e-06, + "loss": 0.0258, + "step": 2123 + }, + { + "epoch": 0.7678958785249458, + "grad_norm": 0.0015841296767392574, + "learning_rate": 7.25195646137339e-06, + "loss": 0.0, + "step": 2124 + }, + { + "epoch": 0.7682574114244396, + "grad_norm": 0.44202246105434834, + "learning_rate": 7.24928725590026e-06, + "loss": 0.0354, + "step": 2125 + }, + { + "epoch": 0.7686189443239335, + "grad_norm": 0.18247470827551568, + "learning_rate": 7.246617246532341e-06, + "loss": 0.0165, + "step": 2126 + }, + { + "epoch": 0.7689804772234273, + "grad_norm": 0.18004869092911077, + "learning_rate": 7.243946434223891e-06, + "loss": 0.0165, + "step": 2127 + }, + { + "epoch": 0.7693420101229211, + "grad_norm": 0.10212119520212036, + "learning_rate": 7.241274819929459e-06, + "loss": 0.0115, + "step": 2128 + }, + { + "epoch": 0.7697035430224151, + "grad_norm": 0.0012412480768968126, + "learning_rate": 7.2386024046038806e-06, + "loss": 0.0, + "step": 2129 + }, + { + "epoch": 0.7700650759219089, + "grad_norm": 1.623197954400981, + "learning_rate": 7.235929189202274e-06, + "loss": 0.063, + "step": 2130 + }, + { + "epoch": 0.7704266088214028, + "grad_norm": 0.009550575116673777, + "learning_rate": 7.233255174680048e-06, + "loss": 0.0003, + "step": 2131 + }, + { + "epoch": 0.7707881417208966, + "grad_norm": 0.16428359834974432, + "learning_rate": 7.230580361992893e-06, + "loss": 0.0117, + "step": 2132 + }, + { + "epoch": 0.7711496746203904, + "grad_norm": 1.7071257959721193, + "learning_rate": 7.227904752096788e-06, + "loss": 0.1191, + "step": 2133 + }, + { + "epoch": 0.7715112075198843, + "grad_norm": 0.16785118805030882, + "learning_rate": 7.225228345947995e-06, + "loss": 0.0165, + "step": 2134 + }, + { + "epoch": 0.7718727404193781, + "grad_norm": 0.12538431902477437, + "learning_rate": 7.22255114450306e-06, + "loss": 0.0072, + "step": 2135 + }, + { + "epoch": 0.7722342733188721, + "grad_norm": 0.8667286303949407, + "learning_rate": 7.2198731487188156e-06, + "loss": 0.0476, + "step": 2136 + }, + { + "epoch": 0.7725958062183659, + "grad_norm": 0.05285869042523489, + "learning_rate": 7.217194359552375e-06, + "loss": 0.0039, + "step": 2137 + }, + { + "epoch": 0.7729573391178597, + "grad_norm": 1.173989422363106, + "learning_rate": 7.214514777961139e-06, + "loss": 0.1738, + "step": 2138 + }, + { + "epoch": 0.7733188720173536, + "grad_norm": 0.07112381699518734, + "learning_rate": 7.211834404902789e-06, + "loss": 0.0072, + "step": 2139 + }, + { + "epoch": 0.7736804049168474, + "grad_norm": 0.08955312320754039, + "learning_rate": 7.209153241335289e-06, + "loss": 0.0092, + "step": 2140 + }, + { + "epoch": 0.7740419378163413, + "grad_norm": 0.9198642487236343, + "learning_rate": 7.206471288216888e-06, + "loss": 0.1836, + "step": 2141 + }, + { + "epoch": 0.7744034707158352, + "grad_norm": 0.1680273336874389, + "learning_rate": 7.203788546506113e-06, + "loss": 0.0165, + "step": 2142 + }, + { + "epoch": 0.774765003615329, + "grad_norm": 0.031854071364638074, + "learning_rate": 7.201105017161777e-06, + "loss": 0.0005, + "step": 2143 + }, + { + "epoch": 0.7751265365148229, + "grad_norm": 0.1982162564849145, + "learning_rate": 7.198420701142973e-06, + "loss": 0.0229, + "step": 2144 + }, + { + "epoch": 0.7754880694143167, + "grad_norm": 0.003256070609591163, + "learning_rate": 7.195735599409074e-06, + "loss": 0.0001, + "step": 2145 + }, + { + "epoch": 0.7758496023138105, + "grad_norm": 0.010204264232083947, + "learning_rate": 7.193049712919735e-06, + "loss": 0.0002, + "step": 2146 + }, + { + "epoch": 0.7762111352133044, + "grad_norm": 2.1917134477486475, + "learning_rate": 7.19036304263489e-06, + "loss": 0.0576, + "step": 2147 + }, + { + "epoch": 0.7765726681127982, + "grad_norm": 0.1061670822700315, + "learning_rate": 7.187675589514757e-06, + "loss": 0.0131, + "step": 2148 + }, + { + "epoch": 0.7769342010122922, + "grad_norm": 0.49568325344316005, + "learning_rate": 7.184987354519831e-06, + "loss": 0.0391, + "step": 2149 + }, + { + "epoch": 0.777295733911786, + "grad_norm": 0.01024546947552152, + "learning_rate": 7.182298338610885e-06, + "loss": 0.0003, + "step": 2150 + }, + { + "epoch": 0.7776572668112798, + "grad_norm": 0.8647345521892804, + "learning_rate": 7.1796085427489725e-06, + "loss": 0.1367, + "step": 2151 + }, + { + "epoch": 0.7780187997107737, + "grad_norm": 0.0040602531515012905, + "learning_rate": 7.176917967895427e-06, + "loss": 0.0001, + "step": 2152 + }, + { + "epoch": 0.7783803326102675, + "grad_norm": 0.004714768195188839, + "learning_rate": 7.17422661501186e-06, + "loss": 0.0001, + "step": 2153 + }, + { + "epoch": 0.7787418655097614, + "grad_norm": 0.16228197629279953, + "learning_rate": 7.171534485060158e-06, + "loss": 0.0104, + "step": 2154 + }, + { + "epoch": 0.7791033984092552, + "grad_norm": 0.8075922706528319, + "learning_rate": 7.168841579002492e-06, + "loss": 0.1836, + "step": 2155 + }, + { + "epoch": 0.779464931308749, + "grad_norm": 0.09494308563856486, + "learning_rate": 7.166147897801302e-06, + "loss": 0.0081, + "step": 2156 + }, + { + "epoch": 0.779826464208243, + "grad_norm": 0.18893353589964193, + "learning_rate": 7.1634534424193105e-06, + "loss": 0.0146, + "step": 2157 + }, + { + "epoch": 0.7801879971077368, + "grad_norm": 0.001527478824018795, + "learning_rate": 7.160758213819515e-06, + "loss": 0.0001, + "step": 2158 + }, + { + "epoch": 0.7805495300072307, + "grad_norm": 2.5533218430531317, + "learning_rate": 7.158062212965189e-06, + "loss": 0.1279, + "step": 2159 + }, + { + "epoch": 0.7809110629067245, + "grad_norm": 0.10074540833132772, + "learning_rate": 7.155365440819886e-06, + "loss": 0.0115, + "step": 2160 + }, + { + "epoch": 0.7812725958062183, + "grad_norm": 0.14587149232185387, + "learning_rate": 7.152667898347427e-06, + "loss": 0.0165, + "step": 2161 + }, + { + "epoch": 0.7816341287057122, + "grad_norm": 0.04481432589154362, + "learning_rate": 7.149969586511916e-06, + "loss": 0.0007, + "step": 2162 + }, + { + "epoch": 0.7819956616052061, + "grad_norm": 0.11572757128837682, + "learning_rate": 7.147270506277729e-06, + "loss": 0.0015, + "step": 2163 + }, + { + "epoch": 0.7823571945047, + "grad_norm": 0.374585688376914, + "learning_rate": 7.144570658609515e-06, + "loss": 0.0184, + "step": 2164 + }, + { + "epoch": 0.7827187274041938, + "grad_norm": 1.9936952185711392, + "learning_rate": 7.141870044472203e-06, + "loss": 0.0476, + "step": 2165 + }, + { + "epoch": 0.7830802603036876, + "grad_norm": 0.0029281377475290347, + "learning_rate": 7.139168664830987e-06, + "loss": 0.0001, + "step": 2166 + }, + { + "epoch": 0.7834417932031815, + "grad_norm": 0.31004959923578534, + "learning_rate": 7.136466520651342e-06, + "loss": 0.0258, + "step": 2167 + }, + { + "epoch": 0.7838033261026753, + "grad_norm": 5.226821734603357, + "learning_rate": 7.133763612899013e-06, + "loss": 0.0752, + "step": 2168 + }, + { + "epoch": 0.7841648590021691, + "grad_norm": 0.0006976054541743981, + "learning_rate": 7.131059942540018e-06, + "loss": 0.0, + "step": 2169 + }, + { + "epoch": 0.7845263919016631, + "grad_norm": 0.006161430396907739, + "learning_rate": 7.128355510540652e-06, + "loss": 0.0001, + "step": 2170 + }, + { + "epoch": 0.7848879248011569, + "grad_norm": 0.08891566510243183, + "learning_rate": 7.1256503178674756e-06, + "loss": 0.0022, + "step": 2171 + }, + { + "epoch": 0.7852494577006508, + "grad_norm": 13.906237443354158, + "learning_rate": 7.122944365487321e-06, + "loss": 0.1836, + "step": 2172 + }, + { + "epoch": 0.7856109906001446, + "grad_norm": 0.6583220416931237, + "learning_rate": 7.120237654367301e-06, + "loss": 0.1641, + "step": 2173 + }, + { + "epoch": 0.7859725234996384, + "grad_norm": 1.0833423217338662, + "learning_rate": 7.117530185474789e-06, + "loss": 0.0752, + "step": 2174 + }, + { + "epoch": 0.7863340563991323, + "grad_norm": 0.002395312251577525, + "learning_rate": 7.114821959777438e-06, + "loss": 0.0, + "step": 2175 + }, + { + "epoch": 0.7866955892986262, + "grad_norm": 0.48393541534337636, + "learning_rate": 7.112112978243162e-06, + "loss": 0.0064, + "step": 2176 + }, + { + "epoch": 0.7870571221981201, + "grad_norm": 3.2379169557753618, + "learning_rate": 7.109403241840156e-06, + "loss": 0.3105, + "step": 2177 + }, + { + "epoch": 0.7874186550976139, + "grad_norm": 2.116907502418323, + "learning_rate": 7.106692751536875e-06, + "loss": 0.063, + "step": 2178 + }, + { + "epoch": 0.7877801879971077, + "grad_norm": 5.377414607844673, + "learning_rate": 7.103981508302049e-06, + "loss": 0.6133, + "step": 2179 + }, + { + "epoch": 0.7881417208966016, + "grad_norm": 0.28006697098098216, + "learning_rate": 7.101269513104677e-06, + "loss": 0.0286, + "step": 2180 + }, + { + "epoch": 0.7885032537960954, + "grad_norm": 0.003329086850660327, + "learning_rate": 7.098556766914023e-06, + "loss": 0.0001, + "step": 2181 + }, + { + "epoch": 0.7888647866955893, + "grad_norm": 0.14795321799768502, + "learning_rate": 7.095843270699625e-06, + "loss": 0.0081, + "step": 2182 + }, + { + "epoch": 0.7892263195950832, + "grad_norm": 0.003983557079601843, + "learning_rate": 7.093129025431283e-06, + "loss": 0.0001, + "step": 2183 + }, + { + "epoch": 0.789587852494577, + "grad_norm": 0.0013830804646207478, + "learning_rate": 7.09041403207907e-06, + "loss": 0.0001, + "step": 2184 + }, + { + "epoch": 0.7899493853940709, + "grad_norm": 0.9609753310514657, + "learning_rate": 7.087698291613323e-06, + "loss": 0.0131, + "step": 2185 + }, + { + "epoch": 0.7903109182935647, + "grad_norm": 0.019298241919043196, + "learning_rate": 7.084981805004647e-06, + "loss": 0.0007, + "step": 2186 + }, + { + "epoch": 0.7906724511930586, + "grad_norm": 0.5181651510523677, + "learning_rate": 7.082264573223914e-06, + "loss": 0.0229, + "step": 2187 + }, + { + "epoch": 0.7910339840925524, + "grad_norm": 0.19629221942269323, + "learning_rate": 7.079546597242262e-06, + "loss": 0.0229, + "step": 2188 + }, + { + "epoch": 0.7913955169920462, + "grad_norm": 0.1618212164047671, + "learning_rate": 7.076827878031094e-06, + "loss": 0.0165, + "step": 2189 + }, + { + "epoch": 0.7917570498915402, + "grad_norm": 0.2342962225733182, + "learning_rate": 7.074108416562081e-06, + "loss": 0.0131, + "step": 2190 + }, + { + "epoch": 0.792118582791034, + "grad_norm": 0.7880369711366614, + "learning_rate": 7.071388213807159e-06, + "loss": 0.0525, + "step": 2191 + }, + { + "epoch": 0.7924801156905278, + "grad_norm": 0.0010334692080370433, + "learning_rate": 7.068667270738525e-06, + "loss": 0.0, + "step": 2192 + }, + { + "epoch": 0.7928416485900217, + "grad_norm": 0.2867350563710156, + "learning_rate": 7.065945588328646e-06, + "loss": 0.0206, + "step": 2193 + }, + { + "epoch": 0.7932031814895155, + "grad_norm": 0.008334101180836604, + "learning_rate": 7.06322316755025e-06, + "loss": 0.0003, + "step": 2194 + }, + { + "epoch": 0.7935647143890094, + "grad_norm": 0.07159701948252879, + "learning_rate": 7.060500009376327e-06, + "loss": 0.0022, + "step": 2195 + }, + { + "epoch": 0.7939262472885033, + "grad_norm": 0.9125772744609845, + "learning_rate": 7.0577761147801385e-06, + "loss": 0.1191, + "step": 2196 + }, + { + "epoch": 0.7942877801879971, + "grad_norm": 0.10307289993806447, + "learning_rate": 7.0550514847352e-06, + "loss": 0.0092, + "step": 2197 + }, + { + "epoch": 0.794649313087491, + "grad_norm": 0.1765670326835994, + "learning_rate": 7.052326120215294e-06, + "loss": 0.0229, + "step": 2198 + }, + { + "epoch": 0.7950108459869848, + "grad_norm": 0.14963205246970768, + "learning_rate": 7.049600022194465e-06, + "loss": 0.0082, + "step": 2199 + }, + { + "epoch": 0.7953723788864787, + "grad_norm": 0.10264694005524412, + "learning_rate": 7.046873191647022e-06, + "loss": 0.0103, + "step": 2200 + }, + { + "epoch": 0.7957339117859725, + "grad_norm": 0.12739915802843652, + "learning_rate": 7.044145629547532e-06, + "loss": 0.0146, + "step": 2201 + }, + { + "epoch": 0.7960954446854663, + "grad_norm": 3.0490885144039273, + "learning_rate": 7.041417336870826e-06, + "loss": 0.1641, + "step": 2202 + }, + { + "epoch": 0.7964569775849603, + "grad_norm": 0.003506195119681887, + "learning_rate": 7.038688314591994e-06, + "loss": 0.0001, + "step": 2203 + }, + { + "epoch": 0.7968185104844541, + "grad_norm": 0.17068484077810409, + "learning_rate": 7.035958563686387e-06, + "loss": 0.0165, + "step": 2204 + }, + { + "epoch": 0.797180043383948, + "grad_norm": 0.8657300169164964, + "learning_rate": 7.033228085129621e-06, + "loss": 0.0317, + "step": 2205 + }, + { + "epoch": 0.7975415762834418, + "grad_norm": 2.7976759425639157, + "learning_rate": 7.030496879897566e-06, + "loss": 0.4668, + "step": 2206 + }, + { + "epoch": 0.7979031091829356, + "grad_norm": 0.051589508679282296, + "learning_rate": 7.027764948966355e-06, + "loss": 0.0011, + "step": 2207 + }, + { + "epoch": 0.7982646420824295, + "grad_norm": 0.7877989011721813, + "learning_rate": 7.02503229331238e-06, + "loss": 0.0525, + "step": 2208 + }, + { + "epoch": 0.7986261749819233, + "grad_norm": 2.3228918727260788, + "learning_rate": 7.022298913912288e-06, + "loss": 0.332, + "step": 2209 + }, + { + "epoch": 0.7989877078814173, + "grad_norm": 3.6302646701725556, + "learning_rate": 7.019564811742992e-06, + "loss": 0.1836, + "step": 2210 + }, + { + "epoch": 0.7993492407809111, + "grad_norm": 0.06041039673301539, + "learning_rate": 7.016829987781659e-06, + "loss": 0.0017, + "step": 2211 + }, + { + "epoch": 0.7997107736804049, + "grad_norm": 2.259080853304419, + "learning_rate": 7.014094443005715e-06, + "loss": 0.2031, + "step": 2212 + }, + { + "epoch": 0.8000723065798988, + "grad_norm": 0.14413121854503497, + "learning_rate": 7.011358178392841e-06, + "loss": 0.0131, + "step": 2213 + }, + { + "epoch": 0.8004338394793926, + "grad_norm": 0.8939246066370646, + "learning_rate": 7.008621194920977e-06, + "loss": 0.0391, + "step": 2214 + }, + { + "epoch": 0.8007953723788864, + "grad_norm": 0.1113453470067427, + "learning_rate": 7.005883493568324e-06, + "loss": 0.0146, + "step": 2215 + }, + { + "epoch": 0.8011569052783803, + "grad_norm": 0.08516832000980798, + "learning_rate": 7.003145075313334e-06, + "loss": 0.0064, + "step": 2216 + }, + { + "epoch": 0.8015184381778742, + "grad_norm": 0.15706485004997808, + "learning_rate": 7.000405941134716e-06, + "loss": 0.0146, + "step": 2217 + }, + { + "epoch": 0.8018799710773681, + "grad_norm": 0.2543247205286929, + "learning_rate": 6.997666092011436e-06, + "loss": 0.0206, + "step": 2218 + }, + { + "epoch": 0.8022415039768619, + "grad_norm": 0.15022831325687358, + "learning_rate": 6.994925528922716e-06, + "loss": 0.0072, + "step": 2219 + }, + { + "epoch": 0.8026030368763557, + "grad_norm": 0.4935954829887983, + "learning_rate": 6.992184252848035e-06, + "loss": 0.0206, + "step": 2220 + }, + { + "epoch": 0.8029645697758496, + "grad_norm": 0.10189073658948253, + "learning_rate": 6.989442264767121e-06, + "loss": 0.0117, + "step": 2221 + }, + { + "epoch": 0.8033261026753434, + "grad_norm": 1.462185548370486, + "learning_rate": 6.986699565659963e-06, + "loss": 0.1191, + "step": 2222 + }, + { + "epoch": 0.8036876355748374, + "grad_norm": 0.31844282744959906, + "learning_rate": 6.983956156506798e-06, + "loss": 0.0104, + "step": 2223 + }, + { + "epoch": 0.8040491684743312, + "grad_norm": 0.8916371154519921, + "learning_rate": 6.981212038288121e-06, + "loss": 0.1279, + "step": 2224 + }, + { + "epoch": 0.804410701373825, + "grad_norm": 0.08520801157202593, + "learning_rate": 6.978467211984681e-06, + "loss": 0.0064, + "step": 2225 + }, + { + "epoch": 0.8047722342733189, + "grad_norm": 0.13491191591538057, + "learning_rate": 6.975721678577476e-06, + "loss": 0.0131, + "step": 2226 + }, + { + "epoch": 0.8051337671728127, + "grad_norm": 0.19298056432862248, + "learning_rate": 6.97297543904776e-06, + "loss": 0.0092, + "step": 2227 + }, + { + "epoch": 0.8054953000723066, + "grad_norm": 1.1902855060316717, + "learning_rate": 6.970228494377039e-06, + "loss": 0.0317, + "step": 2228 + }, + { + "epoch": 0.8058568329718004, + "grad_norm": 0.13494834847523943, + "learning_rate": 6.96748084554707e-06, + "loss": 0.0072, + "step": 2229 + }, + { + "epoch": 0.8062183658712943, + "grad_norm": 0.06969669124980961, + "learning_rate": 6.964732493539861e-06, + "loss": 0.0057, + "step": 2230 + }, + { + "epoch": 0.8065798987707882, + "grad_norm": 0.17040955579511866, + "learning_rate": 6.961983439337675e-06, + "loss": 0.0131, + "step": 2231 + }, + { + "epoch": 0.806941431670282, + "grad_norm": 0.1497457726976133, + "learning_rate": 6.959233683923022e-06, + "loss": 0.0064, + "step": 2232 + }, + { + "epoch": 0.8073029645697759, + "grad_norm": 0.11447186910566172, + "learning_rate": 6.956483228278662e-06, + "loss": 0.005, + "step": 2233 + }, + { + "epoch": 0.8076644974692697, + "grad_norm": 1.6749579119156495, + "learning_rate": 6.953732073387609e-06, + "loss": 0.1279, + "step": 2234 + }, + { + "epoch": 0.8080260303687635, + "grad_norm": 0.15092535907562166, + "learning_rate": 6.950980220233127e-06, + "loss": 0.0146, + "step": 2235 + }, + { + "epoch": 0.8083875632682574, + "grad_norm": 0.12175580389349483, + "learning_rate": 6.948227669798725e-06, + "loss": 0.0082, + "step": 2236 + }, + { + "epoch": 0.8087490961677513, + "grad_norm": 0.986380314425087, + "learning_rate": 6.945474423068166e-06, + "loss": 0.2344, + "step": 2237 + }, + { + "epoch": 0.8091106290672451, + "grad_norm": 0.15510689497022723, + "learning_rate": 6.942720481025458e-06, + "loss": 0.0146, + "step": 2238 + }, + { + "epoch": 0.809472161966739, + "grad_norm": 0.3969710566269427, + "learning_rate": 6.939965844654859e-06, + "loss": 0.0131, + "step": 2239 + }, + { + "epoch": 0.8098336948662328, + "grad_norm": 0.8875258666208911, + "learning_rate": 6.9372105149408775e-06, + "loss": 0.0688, + "step": 2240 + }, + { + "epoch": 0.8101952277657267, + "grad_norm": 1.3049768972894764, + "learning_rate": 6.934454492868268e-06, + "loss": 0.1553, + "step": 2241 + }, + { + "epoch": 0.8105567606652205, + "grad_norm": 0.07480740972176006, + "learning_rate": 6.93169777942203e-06, + "loss": 0.0072, + "step": 2242 + }, + { + "epoch": 0.8109182935647143, + "grad_norm": 0.2653703539798729, + "learning_rate": 6.9289403755874126e-06, + "loss": 0.0131, + "step": 2243 + }, + { + "epoch": 0.8112798264642083, + "grad_norm": 0.7358035741390797, + "learning_rate": 6.9261822823499124e-06, + "loss": 0.0286, + "step": 2244 + }, + { + "epoch": 0.8116413593637021, + "grad_norm": 0.0849421779410505, + "learning_rate": 6.923423500695272e-06, + "loss": 0.0072, + "step": 2245 + }, + { + "epoch": 0.812002892263196, + "grad_norm": 1.2247818642352026, + "learning_rate": 6.920664031609478e-06, + "loss": 0.1934, + "step": 2246 + }, + { + "epoch": 0.8123644251626898, + "grad_norm": 0.11586351677454855, + "learning_rate": 6.917903876078764e-06, + "loss": 0.0117, + "step": 2247 + }, + { + "epoch": 0.8127259580621836, + "grad_norm": 0.9809311627077557, + "learning_rate": 6.91514303508961e-06, + "loss": 0.1836, + "step": 2248 + }, + { + "epoch": 0.8130874909616775, + "grad_norm": 0.15074365028369857, + "learning_rate": 6.912381509628737e-06, + "loss": 0.0057, + "step": 2249 + }, + { + "epoch": 0.8134490238611713, + "grad_norm": 0.08213698772080243, + "learning_rate": 6.909619300683119e-06, + "loss": 0.0039, + "step": 2250 + }, + { + "epoch": 0.8138105567606653, + "grad_norm": 0.11421752825376703, + "learning_rate": 6.906856409239964e-06, + "loss": 0.0117, + "step": 2251 + }, + { + "epoch": 0.8141720896601591, + "grad_norm": 0.11271193761485405, + "learning_rate": 6.904092836286733e-06, + "loss": 0.0057, + "step": 2252 + }, + { + "epoch": 0.8145336225596529, + "grad_norm": 1.8904353910256613, + "learning_rate": 6.901328582811123e-06, + "loss": 0.2246, + "step": 2253 + }, + { + "epoch": 0.8148951554591468, + "grad_norm": 1.0105534829791536, + "learning_rate": 6.898563649801078e-06, + "loss": 0.0432, + "step": 2254 + }, + { + "epoch": 0.8152566883586406, + "grad_norm": 1.012701253791934, + "learning_rate": 6.895798038244786e-06, + "loss": 0.1641, + "step": 2255 + }, + { + "epoch": 0.8156182212581344, + "grad_norm": 0.24834734675655654, + "learning_rate": 6.893031749130674e-06, + "loss": 0.0231, + "step": 2256 + }, + { + "epoch": 0.8159797541576284, + "grad_norm": 0.12396785076639308, + "learning_rate": 6.890264783447417e-06, + "loss": 0.0146, + "step": 2257 + }, + { + "epoch": 0.8163412870571222, + "grad_norm": 0.5185865044012821, + "learning_rate": 6.887497142183924e-06, + "loss": 0.0286, + "step": 2258 + }, + { + "epoch": 0.8167028199566161, + "grad_norm": 0.6878466568657026, + "learning_rate": 6.884728826329349e-06, + "loss": 0.0393, + "step": 2259 + }, + { + "epoch": 0.8170643528561099, + "grad_norm": 0.7355902462145939, + "learning_rate": 6.881959836873091e-06, + "loss": 0.1455, + "step": 2260 + }, + { + "epoch": 0.8174258857556037, + "grad_norm": 0.2517195027969322, + "learning_rate": 6.879190174804783e-06, + "loss": 0.0131, + "step": 2261 + }, + { + "epoch": 0.8177874186550976, + "grad_norm": 0.1356479451013002, + "learning_rate": 6.876419841114305e-06, + "loss": 0.0184, + "step": 2262 + }, + { + "epoch": 0.8181489515545914, + "grad_norm": 0.13894535255561466, + "learning_rate": 6.873648836791772e-06, + "loss": 0.0184, + "step": 2263 + }, + { + "epoch": 0.8185104844540854, + "grad_norm": 0.041158249515315104, + "learning_rate": 6.870877162827538e-06, + "loss": 0.0013, + "step": 2264 + }, + { + "epoch": 0.8188720173535792, + "grad_norm": 1.4109362683909923, + "learning_rate": 6.8681048202122026e-06, + "loss": 0.1934, + "step": 2265 + }, + { + "epoch": 0.819233550253073, + "grad_norm": 0.18312200479402418, + "learning_rate": 6.865331809936597e-06, + "loss": 0.0131, + "step": 2266 + }, + { + "epoch": 0.8195950831525669, + "grad_norm": 0.17248946007518332, + "learning_rate": 6.862558132991798e-06, + "loss": 0.0229, + "step": 2267 + }, + { + "epoch": 0.8199566160520607, + "grad_norm": 0.6529285668328494, + "learning_rate": 6.859783790369116e-06, + "loss": 0.1279, + "step": 2268 + }, + { + "epoch": 0.8203181489515546, + "grad_norm": 0.567933892858737, + "learning_rate": 6.857008783060097e-06, + "loss": 0.0286, + "step": 2269 + }, + { + "epoch": 0.8206796818510484, + "grad_norm": 1.201986963701278, + "learning_rate": 6.854233112056533e-06, + "loss": 0.1836, + "step": 2270 + }, + { + "epoch": 0.8210412147505423, + "grad_norm": 0.12999242190825067, + "learning_rate": 6.851456778350445e-06, + "loss": 0.0092, + "step": 2271 + }, + { + "epoch": 0.8214027476500362, + "grad_norm": 2.0702905225517623, + "learning_rate": 6.848679782934094e-06, + "loss": 0.1279, + "step": 2272 + }, + { + "epoch": 0.82176428054953, + "grad_norm": 0.01286107243062211, + "learning_rate": 6.845902126799981e-06, + "loss": 0.0004, + "step": 2273 + }, + { + "epoch": 0.8221258134490239, + "grad_norm": 0.5660298507123742, + "learning_rate": 6.843123810940837e-06, + "loss": 0.1035, + "step": 2274 + }, + { + "epoch": 0.8224873463485177, + "grad_norm": 0.3822051318079844, + "learning_rate": 6.8403448363496315e-06, + "loss": 0.0317, + "step": 2275 + }, + { + "epoch": 0.8228488792480115, + "grad_norm": 0.5966138429970724, + "learning_rate": 6.83756520401957e-06, + "loss": 0.0354, + "step": 2276 + }, + { + "epoch": 0.8232104121475055, + "grad_norm": 7.372956850265034, + "learning_rate": 6.834784914944092e-06, + "loss": 0.0476, + "step": 2277 + }, + { + "epoch": 0.8235719450469993, + "grad_norm": 0.026797332996241523, + "learning_rate": 6.832003970116874e-06, + "loss": 0.0008, + "step": 2278 + }, + { + "epoch": 0.8239334779464931, + "grad_norm": 0.12091932612685038, + "learning_rate": 6.829222370531823e-06, + "loss": 0.0045, + "step": 2279 + }, + { + "epoch": 0.824295010845987, + "grad_norm": 0.7155957089445352, + "learning_rate": 6.826440117183082e-06, + "loss": 0.0354, + "step": 2280 + }, + { + "epoch": 0.8246565437454808, + "grad_norm": 0.8565005812171944, + "learning_rate": 6.823657211065028e-06, + "loss": 0.1553, + "step": 2281 + }, + { + "epoch": 0.8250180766449747, + "grad_norm": 0.5463254959291338, + "learning_rate": 6.820873653172273e-06, + "loss": 0.0258, + "step": 2282 + }, + { + "epoch": 0.8253796095444685, + "grad_norm": 0.08900369854288741, + "learning_rate": 6.818089444499658e-06, + "loss": 0.0007, + "step": 2283 + }, + { + "epoch": 0.8257411424439624, + "grad_norm": 0.11617629639294753, + "learning_rate": 6.81530458604226e-06, + "loss": 0.0058, + "step": 2284 + }, + { + "epoch": 0.8261026753434563, + "grad_norm": 0.21143328219555166, + "learning_rate": 6.812519078795386e-06, + "loss": 0.0258, + "step": 2285 + }, + { + "epoch": 0.8264642082429501, + "grad_norm": 0.20651983617181957, + "learning_rate": 6.809732923754575e-06, + "loss": 0.0286, + "step": 2286 + }, + { + "epoch": 0.826825741142444, + "grad_norm": 0.05671481376243257, + "learning_rate": 6.8069461219155985e-06, + "loss": 0.0025, + "step": 2287 + }, + { + "epoch": 0.8271872740419378, + "grad_norm": 0.6261465415451789, + "learning_rate": 6.804158674274461e-06, + "loss": 0.0752, + "step": 2288 + }, + { + "epoch": 0.8275488069414316, + "grad_norm": 0.0368545671014465, + "learning_rate": 6.801370581827393e-06, + "loss": 0.0008, + "step": 2289 + }, + { + "epoch": 0.8279103398409255, + "grad_norm": 0.19269721871108692, + "learning_rate": 6.798581845570859e-06, + "loss": 0.0104, + "step": 2290 + }, + { + "epoch": 0.8282718727404194, + "grad_norm": 0.2710200953468905, + "learning_rate": 6.795792466501554e-06, + "loss": 0.0391, + "step": 2291 + }, + { + "epoch": 0.8286334056399133, + "grad_norm": 0.06314862078107081, + "learning_rate": 6.7930024456164e-06, + "loss": 0.0022, + "step": 2292 + }, + { + "epoch": 0.8289949385394071, + "grad_norm": 0.2144591272697425, + "learning_rate": 6.79021178391255e-06, + "loss": 0.0092, + "step": 2293 + }, + { + "epoch": 0.8293564714389009, + "grad_norm": 0.1641509413843848, + "learning_rate": 6.787420482387387e-06, + "loss": 0.004, + "step": 2294 + }, + { + "epoch": 0.8297180043383948, + "grad_norm": 0.3707972240850493, + "learning_rate": 6.78462854203852e-06, + "loss": 0.0476, + "step": 2295 + }, + { + "epoch": 0.8300795372378886, + "grad_norm": 0.07726006727651825, + "learning_rate": 6.781835963863789e-06, + "loss": 0.0032, + "step": 2296 + }, + { + "epoch": 0.8304410701373826, + "grad_norm": 0.6150554479076961, + "learning_rate": 6.77904274886126e-06, + "loss": 0.0815, + "step": 2297 + }, + { + "epoch": 0.8308026030368764, + "grad_norm": 0.7529194668373377, + "learning_rate": 6.77624889802923e-06, + "loss": 0.0231, + "step": 2298 + }, + { + "epoch": 0.8311641359363702, + "grad_norm": 0.24938198084336413, + "learning_rate": 6.773454412366216e-06, + "loss": 0.0317, + "step": 2299 + }, + { + "epoch": 0.8315256688358641, + "grad_norm": 0.24943789758946275, + "learning_rate": 6.770659292870971e-06, + "loss": 0.0258, + "step": 2300 + }, + { + "epoch": 0.8318872017353579, + "grad_norm": 0.5593834342752472, + "learning_rate": 6.767863540542467e-06, + "loss": 0.0579, + "step": 2301 + }, + { + "epoch": 0.8322487346348517, + "grad_norm": 0.5736210142687929, + "learning_rate": 6.7650671563799075e-06, + "loss": 0.0286, + "step": 2302 + }, + { + "epoch": 0.8326102675343456, + "grad_norm": 0.6610238261996988, + "learning_rate": 6.76227014138272e-06, + "loss": 0.1113, + "step": 2303 + }, + { + "epoch": 0.8329718004338394, + "grad_norm": 1.1121566709142714, + "learning_rate": 6.759472496550554e-06, + "loss": 0.0576, + "step": 2304 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.4250636245913248, + "learning_rate": 6.75667422288329e-06, + "loss": 0.0354, + "step": 2305 + }, + { + "epoch": 0.8336948662328272, + "grad_norm": 0.2765916347737762, + "learning_rate": 6.75387532138103e-06, + "loss": 0.0354, + "step": 2306 + }, + { + "epoch": 0.834056399132321, + "grad_norm": 1.7583189746810093, + "learning_rate": 6.751075793044099e-06, + "loss": 0.1367, + "step": 2307 + }, + { + "epoch": 0.8344179320318149, + "grad_norm": 0.22825209106413938, + "learning_rate": 6.74827563887305e-06, + "loss": 0.0255, + "step": 2308 + }, + { + "epoch": 0.8347794649313087, + "grad_norm": 0.5681567026146713, + "learning_rate": 6.745474859868657e-06, + "loss": 0.0354, + "step": 2309 + }, + { + "epoch": 0.8351409978308026, + "grad_norm": 0.3468268558261614, + "learning_rate": 6.742673457031917e-06, + "loss": 0.0432, + "step": 2310 + }, + { + "epoch": 0.8355025307302965, + "grad_norm": 0.061574156262265374, + "learning_rate": 6.73987143136405e-06, + "loss": 0.0019, + "step": 2311 + }, + { + "epoch": 0.8358640636297903, + "grad_norm": 0.1929967656117038, + "learning_rate": 6.737068783866501e-06, + "loss": 0.0286, + "step": 2312 + }, + { + "epoch": 0.8362255965292842, + "grad_norm": 0.2163579723755383, + "learning_rate": 6.734265515540937e-06, + "loss": 0.0206, + "step": 2313 + }, + { + "epoch": 0.836587129428778, + "grad_norm": 0.047183690449269754, + "learning_rate": 6.731461627389242e-06, + "loss": 0.0015, + "step": 2314 + }, + { + "epoch": 0.8369486623282719, + "grad_norm": 0.04253331217166733, + "learning_rate": 6.728657120413529e-06, + "loss": 0.0022, + "step": 2315 + }, + { + "epoch": 0.8373101952277657, + "grad_norm": 0.0027913288841220955, + "learning_rate": 6.725851995616123e-06, + "loss": 0.0001, + "step": 2316 + }, + { + "epoch": 0.8376717281272595, + "grad_norm": 0.9002053369979401, + "learning_rate": 6.723046253999579e-06, + "loss": 0.1455, + "step": 2317 + }, + { + "epoch": 0.8380332610267535, + "grad_norm": 0.041972605491868126, + "learning_rate": 6.720239896566668e-06, + "loss": 0.001, + "step": 2318 + }, + { + "epoch": 0.8383947939262473, + "grad_norm": 0.1308423196411481, + "learning_rate": 6.717432924320382e-06, + "loss": 0.0165, + "step": 2319 + }, + { + "epoch": 0.8387563268257412, + "grad_norm": 0.34398505441236016, + "learning_rate": 6.714625338263929e-06, + "loss": 0.0165, + "step": 2320 + }, + { + "epoch": 0.839117859725235, + "grad_norm": 1.0315078664989847, + "learning_rate": 6.711817139400743e-06, + "loss": 0.1113, + "step": 2321 + }, + { + "epoch": 0.8394793926247288, + "grad_norm": 4.245789245028749, + "learning_rate": 6.709008328734472e-06, + "loss": 0.3438, + "step": 2322 + }, + { + "epoch": 0.8398409255242227, + "grad_norm": 0.9141973942570781, + "learning_rate": 6.706198907268986e-06, + "loss": 0.0476, + "step": 2323 + }, + { + "epoch": 0.8402024584237165, + "grad_norm": 0.2953992853664031, + "learning_rate": 6.703388876008371e-06, + "loss": 0.0184, + "step": 2324 + }, + { + "epoch": 0.8405639913232104, + "grad_norm": 2.000134775580248, + "learning_rate": 6.70057823595693e-06, + "loss": 0.0957, + "step": 2325 + }, + { + "epoch": 0.8409255242227043, + "grad_norm": 0.10191029864400881, + "learning_rate": 6.697766988119187e-06, + "loss": 0.0146, + "step": 2326 + }, + { + "epoch": 0.8412870571221981, + "grad_norm": 0.3262282486685745, + "learning_rate": 6.694955133499881e-06, + "loss": 0.0317, + "step": 2327 + }, + { + "epoch": 0.841648590021692, + "grad_norm": 1.4278980265330756, + "learning_rate": 6.692142673103967e-06, + "loss": 0.0688, + "step": 2328 + }, + { + "epoch": 0.8420101229211858, + "grad_norm": 0.13556867139724507, + "learning_rate": 6.6893296079366185e-06, + "loss": 0.0146, + "step": 2329 + }, + { + "epoch": 0.8423716558206796, + "grad_norm": 0.007401901999640642, + "learning_rate": 6.686515939003226e-06, + "loss": 0.0002, + "step": 2330 + }, + { + "epoch": 0.8427331887201736, + "grad_norm": 0.5624371615499048, + "learning_rate": 6.683701667309393e-06, + "loss": 0.1553, + "step": 2331 + }, + { + "epoch": 0.8430947216196674, + "grad_norm": 3.087681309022308, + "learning_rate": 6.680886793860939e-06, + "loss": 0.2246, + "step": 2332 + }, + { + "epoch": 0.8434562545191613, + "grad_norm": 0.4521426295165475, + "learning_rate": 6.678071319663899e-06, + "loss": 0.0131, + "step": 2333 + }, + { + "epoch": 0.8438177874186551, + "grad_norm": 0.17569462082517048, + "learning_rate": 6.675255245724524e-06, + "loss": 0.0057, + "step": 2334 + }, + { + "epoch": 0.8441793203181489, + "grad_norm": 0.12589822657223512, + "learning_rate": 6.672438573049278e-06, + "loss": 0.0184, + "step": 2335 + }, + { + "epoch": 0.8445408532176428, + "grad_norm": 0.09519928649911268, + "learning_rate": 6.669621302644838e-06, + "loss": 0.0005, + "step": 2336 + }, + { + "epoch": 0.8449023861171366, + "grad_norm": 0.10914798551124925, + "learning_rate": 6.666803435518096e-06, + "loss": 0.0092, + "step": 2337 + }, + { + "epoch": 0.8452639190166306, + "grad_norm": 0.5735132126121716, + "learning_rate": 6.66398497267616e-06, + "loss": 0.0317, + "step": 2338 + }, + { + "epoch": 0.8456254519161244, + "grad_norm": 1.14624795771663, + "learning_rate": 6.661165915126344e-06, + "loss": 0.0889, + "step": 2339 + }, + { + "epoch": 0.8459869848156182, + "grad_norm": 0.10369547865495557, + "learning_rate": 6.658346263876183e-06, + "loss": 0.0146, + "step": 2340 + }, + { + "epoch": 0.8463485177151121, + "grad_norm": 0.43305677406906135, + "learning_rate": 6.655526019933416e-06, + "loss": 0.0317, + "step": 2341 + }, + { + "epoch": 0.8467100506146059, + "grad_norm": 0.14127663254290349, + "learning_rate": 6.652705184305998e-06, + "loss": 0.0165, + "step": 2342 + }, + { + "epoch": 0.8470715835140998, + "grad_norm": 0.15725215029470738, + "learning_rate": 6.649883758002097e-06, + "loss": 0.0117, + "step": 2343 + }, + { + "epoch": 0.8474331164135936, + "grad_norm": 0.0748942415957896, + "learning_rate": 6.647061742030087e-06, + "loss": 0.0035, + "step": 2344 + }, + { + "epoch": 0.8477946493130875, + "grad_norm": 0.16427795567478273, + "learning_rate": 6.644239137398563e-06, + "loss": 0.0229, + "step": 2345 + }, + { + "epoch": 0.8481561822125814, + "grad_norm": 0.9779787556944866, + "learning_rate": 6.641415945116313e-06, + "loss": 0.0889, + "step": 2346 + }, + { + "epoch": 0.8485177151120752, + "grad_norm": 1.13655442125098, + "learning_rate": 6.638592166192353e-06, + "loss": 0.1455, + "step": 2347 + }, + { + "epoch": 0.848879248011569, + "grad_norm": 0.10893274396572447, + "learning_rate": 6.635767801635897e-06, + "loss": 0.0165, + "step": 2348 + }, + { + "epoch": 0.8492407809110629, + "grad_norm": 0.13098990032146537, + "learning_rate": 6.632942852456375e-06, + "loss": 0.0057, + "step": 2349 + }, + { + "epoch": 0.8496023138105567, + "grad_norm": 0.22460237488464121, + "learning_rate": 6.630117319663425e-06, + "loss": 0.0286, + "step": 2350 + }, + { + "epoch": 0.8499638467100507, + "grad_norm": 1.6653604914255165, + "learning_rate": 6.627291204266885e-06, + "loss": 0.2246, + "step": 2351 + }, + { + "epoch": 0.8503253796095445, + "grad_norm": 0.9648811548305163, + "learning_rate": 6.624464507276813e-06, + "loss": 0.1553, + "step": 2352 + }, + { + "epoch": 0.8506869125090383, + "grad_norm": 1.106686845796178, + "learning_rate": 6.621637229703468e-06, + "loss": 0.1113, + "step": 2353 + }, + { + "epoch": 0.8510484454085322, + "grad_norm": 0.6730920331597338, + "learning_rate": 6.618809372557322e-06, + "loss": 0.1191, + "step": 2354 + }, + { + "epoch": 0.851409978308026, + "grad_norm": 0.022456097768841485, + "learning_rate": 6.6159809368490465e-06, + "loss": 0.0005, + "step": 2355 + }, + { + "epoch": 0.8517715112075199, + "grad_norm": 1.2898190698631258, + "learning_rate": 6.613151923589525e-06, + "loss": 0.1279, + "step": 2356 + }, + { + "epoch": 0.8521330441070137, + "grad_norm": 0.6938332636855978, + "learning_rate": 6.610322333789847e-06, + "loss": 0.0815, + "step": 2357 + }, + { + "epoch": 0.8524945770065075, + "grad_norm": 0.5059695558260352, + "learning_rate": 6.6074921684613045e-06, + "loss": 0.0231, + "step": 2358 + }, + { + "epoch": 0.8528561099060015, + "grad_norm": 0.21470286975978287, + "learning_rate": 6.604661428615403e-06, + "loss": 0.0255, + "step": 2359 + }, + { + "epoch": 0.8532176428054953, + "grad_norm": 0.4173929941756619, + "learning_rate": 6.601830115263845e-06, + "loss": 0.0432, + "step": 2360 + }, + { + "epoch": 0.8535791757049892, + "grad_norm": 0.15914683963310558, + "learning_rate": 6.598998229418542e-06, + "loss": 0.0229, + "step": 2361 + }, + { + "epoch": 0.853940708604483, + "grad_norm": 0.5679206389455606, + "learning_rate": 6.596165772091609e-06, + "loss": 0.0476, + "step": 2362 + }, + { + "epoch": 0.8543022415039768, + "grad_norm": 0.14891493740534043, + "learning_rate": 6.593332744295364e-06, + "loss": 0.0229, + "step": 2363 + }, + { + "epoch": 0.8546637744034707, + "grad_norm": 0.8517981429657162, + "learning_rate": 6.590499147042335e-06, + "loss": 0.1455, + "step": 2364 + }, + { + "epoch": 0.8550253073029646, + "grad_norm": 0.94446488002712, + "learning_rate": 6.587664981345245e-06, + "loss": 0.0752, + "step": 2365 + }, + { + "epoch": 0.8553868402024585, + "grad_norm": 0.2083511467041562, + "learning_rate": 6.5848302482170264e-06, + "loss": 0.0103, + "step": 2366 + }, + { + "epoch": 0.8557483731019523, + "grad_norm": 0.7236261499124378, + "learning_rate": 6.5819949486708125e-06, + "loss": 0.1113, + "step": 2367 + }, + { + "epoch": 0.8561099060014461, + "grad_norm": 0.07569683646605582, + "learning_rate": 6.579159083719936e-06, + "loss": 0.0035, + "step": 2368 + }, + { + "epoch": 0.85647143890094, + "grad_norm": 0.028676339588565247, + "learning_rate": 6.576322654377937e-06, + "loss": 0.0009, + "step": 2369 + }, + { + "epoch": 0.8568329718004338, + "grad_norm": 0.5122494208001626, + "learning_rate": 6.573485661658554e-06, + "loss": 0.0286, + "step": 2370 + }, + { + "epoch": 0.8571945046999276, + "grad_norm": 2.4110424083308106, + "learning_rate": 6.5706481065757275e-06, + "loss": 0.1738, + "step": 2371 + }, + { + "epoch": 0.8575560375994216, + "grad_norm": 0.031244653836289892, + "learning_rate": 6.5678099901436e-06, + "loss": 0.0011, + "step": 2372 + }, + { + "epoch": 0.8579175704989154, + "grad_norm": 0.6220267235555653, + "learning_rate": 6.5649713133765115e-06, + "loss": 0.1191, + "step": 2373 + }, + { + "epoch": 0.8582791033984093, + "grad_norm": 0.2819093318769048, + "learning_rate": 6.562132077289006e-06, + "loss": 0.0354, + "step": 2374 + }, + { + "epoch": 0.8586406362979031, + "grad_norm": 0.6175833327687156, + "learning_rate": 6.559292282895827e-06, + "loss": 0.0391, + "step": 2375 + }, + { + "epoch": 0.8590021691973969, + "grad_norm": 0.17614118218581443, + "learning_rate": 6.556451931211915e-06, + "loss": 0.0255, + "step": 2376 + }, + { + "epoch": 0.8593637020968908, + "grad_norm": 0.4008092943580671, + "learning_rate": 6.553611023252411e-06, + "loss": 0.0525, + "step": 2377 + }, + { + "epoch": 0.8597252349963846, + "grad_norm": 0.6408186412159544, + "learning_rate": 6.550769560032654e-06, + "loss": 0.0688, + "step": 2378 + }, + { + "epoch": 0.8600867678958786, + "grad_norm": 0.06762108348450599, + "learning_rate": 6.547927542568184e-06, + "loss": 0.0027, + "step": 2379 + }, + { + "epoch": 0.8604483007953724, + "grad_norm": 0.4293730953108034, + "learning_rate": 6.545084971874738e-06, + "loss": 0.1035, + "step": 2380 + }, + { + "epoch": 0.8608098336948662, + "grad_norm": 0.48493512398301925, + "learning_rate": 6.5422418489682484e-06, + "loss": 0.0476, + "step": 2381 + }, + { + "epoch": 0.8611713665943601, + "grad_norm": 0.007747972793440675, + "learning_rate": 6.5393981748648486e-06, + "loss": 0.0003, + "step": 2382 + }, + { + "epoch": 0.8615328994938539, + "grad_norm": 0.26609409090801434, + "learning_rate": 6.536553950580864e-06, + "loss": 0.0317, + "step": 2383 + }, + { + "epoch": 0.8618944323933478, + "grad_norm": 0.43279165890660093, + "learning_rate": 6.533709177132822e-06, + "loss": 0.0286, + "step": 2384 + }, + { + "epoch": 0.8622559652928417, + "grad_norm": 0.16085206619789108, + "learning_rate": 6.530863855537445e-06, + "loss": 0.0015, + "step": 2385 + }, + { + "epoch": 0.8626174981923355, + "grad_norm": 0.38801711902143465, + "learning_rate": 6.528017986811649e-06, + "loss": 0.0317, + "step": 2386 + }, + { + "epoch": 0.8629790310918294, + "grad_norm": 0.04566810565316808, + "learning_rate": 6.525171571972546e-06, + "loss": 0.0022, + "step": 2387 + }, + { + "epoch": 0.8633405639913232, + "grad_norm": 0.07355602258931704, + "learning_rate": 6.522324612037445e-06, + "loss": 0.0032, + "step": 2388 + }, + { + "epoch": 0.8637020968908171, + "grad_norm": 0.21551585170849993, + "learning_rate": 6.5194771080238495e-06, + "loss": 0.0354, + "step": 2389 + }, + { + "epoch": 0.8640636297903109, + "grad_norm": 0.2678434996768197, + "learning_rate": 6.5166290609494566e-06, + "loss": 0.0354, + "step": 2390 + }, + { + "epoch": 0.8644251626898047, + "grad_norm": 0.2559961998006537, + "learning_rate": 6.5137804718321576e-06, + "loss": 0.0354, + "step": 2391 + }, + { + "epoch": 0.8647866955892987, + "grad_norm": 0.36167098823589017, + "learning_rate": 6.510931341690037e-06, + "loss": 0.0525, + "step": 2392 + }, + { + "epoch": 0.8651482284887925, + "grad_norm": 0.19523743025793164, + "learning_rate": 6.508081671541373e-06, + "loss": 0.0317, + "step": 2393 + }, + { + "epoch": 0.8655097613882863, + "grad_norm": 0.4809023823457338, + "learning_rate": 6.505231462404639e-06, + "loss": 0.0184, + "step": 2394 + }, + { + "epoch": 0.8658712942877802, + "grad_norm": 0.12659577120248053, + "learning_rate": 6.502380715298497e-06, + "loss": 0.004, + "step": 2395 + }, + { + "epoch": 0.866232827187274, + "grad_norm": 0.5825599472924412, + "learning_rate": 6.499529431241804e-06, + "loss": 0.0576, + "step": 2396 + }, + { + "epoch": 0.8665943600867679, + "grad_norm": 0.6994283003075928, + "learning_rate": 6.496677611253611e-06, + "loss": 0.0432, + "step": 2397 + }, + { + "epoch": 0.8669558929862617, + "grad_norm": 0.358715649820864, + "learning_rate": 6.493825256353153e-06, + "loss": 0.0231, + "step": 2398 + }, + { + "epoch": 0.8673174258857556, + "grad_norm": 0.004696571730201093, + "learning_rate": 6.4909723675598655e-06, + "loss": 0.0002, + "step": 2399 + }, + { + "epoch": 0.8676789587852495, + "grad_norm": 1.8372348027873322, + "learning_rate": 6.488118945893368e-06, + "loss": 0.2773, + "step": 2400 + }, + { + "epoch": 0.8680404916847433, + "grad_norm": 0.6892125204707977, + "learning_rate": 6.4852649923734725e-06, + "loss": 0.0391, + "step": 2401 + }, + { + "epoch": 0.8684020245842372, + "grad_norm": 0.1545266541361343, + "learning_rate": 6.4824105080201835e-06, + "loss": 0.0229, + "step": 2402 + }, + { + "epoch": 0.868763557483731, + "grad_norm": 0.5084532665666025, + "learning_rate": 6.479555493853691e-06, + "loss": 0.1279, + "step": 2403 + }, + { + "epoch": 0.8691250903832248, + "grad_norm": 0.19992061559960922, + "learning_rate": 6.476699950894377e-06, + "loss": 0.0317, + "step": 2404 + }, + { + "epoch": 0.8694866232827188, + "grad_norm": 1.2680838368232938, + "learning_rate": 6.473843880162812e-06, + "loss": 0.1191, + "step": 2405 + }, + { + "epoch": 0.8698481561822126, + "grad_norm": 0.5534573549568618, + "learning_rate": 6.470987282679756e-06, + "loss": 0.0576, + "step": 2406 + }, + { + "epoch": 0.8702096890817065, + "grad_norm": 0.5425078257029001, + "learning_rate": 6.468130159466156e-06, + "loss": 0.0889, + "step": 2407 + }, + { + "epoch": 0.8705712219812003, + "grad_norm": 0.004698004895950194, + "learning_rate": 6.465272511543146e-06, + "loss": 0.0001, + "step": 2408 + }, + { + "epoch": 0.8709327548806941, + "grad_norm": 0.301404865737575, + "learning_rate": 6.462414339932049e-06, + "loss": 0.0432, + "step": 2409 + }, + { + "epoch": 0.871294287780188, + "grad_norm": 0.040109966782499396, + "learning_rate": 6.459555645654378e-06, + "loss": 0.0012, + "step": 2410 + }, + { + "epoch": 0.8716558206796818, + "grad_norm": 0.9000164622110319, + "learning_rate": 6.456696429731824e-06, + "loss": 0.0688, + "step": 2411 + }, + { + "epoch": 0.8720173535791758, + "grad_norm": 0.23871296482686416, + "learning_rate": 6.453836693186276e-06, + "loss": 0.0317, + "step": 2412 + }, + { + "epoch": 0.8723788864786696, + "grad_norm": 0.5765943517239405, + "learning_rate": 6.4509764370398e-06, + "loss": 0.0752, + "step": 2413 + }, + { + "epoch": 0.8727404193781634, + "grad_norm": 0.4806958990603219, + "learning_rate": 6.448115662314651e-06, + "loss": 0.1279, + "step": 2414 + }, + { + "epoch": 0.8731019522776573, + "grad_norm": 1.3167400479971327, + "learning_rate": 6.44525437003327e-06, + "loss": 0.0354, + "step": 2415 + }, + { + "epoch": 0.8734634851771511, + "grad_norm": 0.2321924164491177, + "learning_rate": 6.442392561218283e-06, + "loss": 0.0317, + "step": 2416 + }, + { + "epoch": 0.8738250180766449, + "grad_norm": 0.06640312519994175, + "learning_rate": 6.439530236892498e-06, + "loss": 0.0022, + "step": 2417 + }, + { + "epoch": 0.8741865509761388, + "grad_norm": 0.5267477558756174, + "learning_rate": 6.436667398078911e-06, + "loss": 0.0889, + "step": 2418 + }, + { + "epoch": 0.8745480838756327, + "grad_norm": 0.5060030804424841, + "learning_rate": 6.433804045800698e-06, + "loss": 0.063, + "step": 2419 + }, + { + "epoch": 0.8749096167751266, + "grad_norm": 0.18253478297981957, + "learning_rate": 6.4309401810812225e-06, + "loss": 0.0317, + "step": 2420 + }, + { + "epoch": 0.8752711496746204, + "grad_norm": 0.06126315160965821, + "learning_rate": 6.428075804944027e-06, + "loss": 0.0028, + "step": 2421 + }, + { + "epoch": 0.8756326825741142, + "grad_norm": 0.17390562662725914, + "learning_rate": 6.425210918412843e-06, + "loss": 0.0317, + "step": 2422 + }, + { + "epoch": 0.8759942154736081, + "grad_norm": 0.289821630584194, + "learning_rate": 6.422345522511575e-06, + "loss": 0.0432, + "step": 2423 + }, + { + "epoch": 0.8763557483731019, + "grad_norm": 0.008381529853559569, + "learning_rate": 6.419479618264318e-06, + "loss": 0.0002, + "step": 2424 + }, + { + "epoch": 0.8767172812725958, + "grad_norm": 0.04436397814353989, + "learning_rate": 6.416613206695346e-06, + "loss": 0.0017, + "step": 2425 + }, + { + "epoch": 0.8770788141720897, + "grad_norm": 0.22743968458399308, + "learning_rate": 6.413746288829112e-06, + "loss": 0.0117, + "step": 2426 + }, + { + "epoch": 0.8774403470715835, + "grad_norm": 0.3066584701823851, + "learning_rate": 6.410878865690253e-06, + "loss": 0.0476, + "step": 2427 + }, + { + "epoch": 0.8778018799710774, + "grad_norm": 0.2854486491103703, + "learning_rate": 6.408010938303584e-06, + "loss": 0.0146, + "step": 2428 + }, + { + "epoch": 0.8781634128705712, + "grad_norm": 0.2887629516413645, + "learning_rate": 6.4051425076941046e-06, + "loss": 0.0184, + "step": 2429 + }, + { + "epoch": 0.8785249457700651, + "grad_norm": 0.2800154322841707, + "learning_rate": 6.402273574886989e-06, + "loss": 0.0391, + "step": 2430 + }, + { + "epoch": 0.8788864786695589, + "grad_norm": 0.09165926982287964, + "learning_rate": 6.399404140907593e-06, + "loss": 0.0031, + "step": 2431 + }, + { + "epoch": 0.8792480115690527, + "grad_norm": 0.5513376096108037, + "learning_rate": 6.3965342067814526e-06, + "loss": 0.0476, + "step": 2432 + }, + { + "epoch": 0.8796095444685467, + "grad_norm": 0.3962945235312506, + "learning_rate": 6.393663773534281e-06, + "loss": 0.0165, + "step": 2433 + }, + { + "epoch": 0.8799710773680405, + "grad_norm": 0.002863199083598219, + "learning_rate": 6.3907928421919715e-06, + "loss": 0.0001, + "step": 2434 + }, + { + "epoch": 0.8803326102675344, + "grad_norm": 0.13105353208223086, + "learning_rate": 6.387921413780594e-06, + "loss": 0.0017, + "step": 2435 + }, + { + "epoch": 0.8806941431670282, + "grad_norm": 0.2655944645002184, + "learning_rate": 6.385049489326395e-06, + "loss": 0.0103, + "step": 2436 + }, + { + "epoch": 0.881055676066522, + "grad_norm": 0.17405143807177278, + "learning_rate": 6.382177069855802e-06, + "loss": 0.0286, + "step": 2437 + }, + { + "epoch": 0.8814172089660159, + "grad_norm": 0.42079731906890067, + "learning_rate": 6.3793041563954165e-06, + "loss": 0.0146, + "step": 2438 + }, + { + "epoch": 0.8817787418655098, + "grad_norm": 0.5679519159239813, + "learning_rate": 6.3764307499720145e-06, + "loss": 0.0476, + "step": 2439 + }, + { + "epoch": 0.8821402747650036, + "grad_norm": 0.7018581680629676, + "learning_rate": 6.3735568516125545e-06, + "loss": 0.0476, + "step": 2440 + }, + { + "epoch": 0.8825018076644975, + "grad_norm": 0.6249986062386436, + "learning_rate": 6.370682462344165e-06, + "loss": 0.0815, + "step": 2441 + }, + { + "epoch": 0.8828633405639913, + "grad_norm": 0.23864506849310674, + "learning_rate": 6.367807583194152e-06, + "loss": 0.0286, + "step": 2442 + }, + { + "epoch": 0.8832248734634852, + "grad_norm": 0.3292631781782488, + "learning_rate": 6.364932215189998e-06, + "loss": 0.0354, + "step": 2443 + }, + { + "epoch": 0.883586406362979, + "grad_norm": 2.779763647232424, + "learning_rate": 6.36205635935936e-06, + "loss": 0.1187, + "step": 2444 + }, + { + "epoch": 0.8839479392624728, + "grad_norm": 0.5368090938282752, + "learning_rate": 6.359180016730064e-06, + "loss": 0.0525, + "step": 2445 + }, + { + "epoch": 0.8843094721619668, + "grad_norm": 0.7861109207849015, + "learning_rate": 6.356303188330118e-06, + "loss": 0.1641, + "step": 2446 + }, + { + "epoch": 0.8846710050614606, + "grad_norm": 0.5646795866415033, + "learning_rate": 6.3534258751877e-06, + "loss": 0.0476, + "step": 2447 + }, + { + "epoch": 0.8850325379609545, + "grad_norm": 1.427793741472884, + "learning_rate": 6.350548078331158e-06, + "loss": 0.063, + "step": 2448 + }, + { + "epoch": 0.8853940708604483, + "grad_norm": 0.04898455722139985, + "learning_rate": 6.347669798789019e-06, + "loss": 0.0012, + "step": 2449 + }, + { + "epoch": 0.8857556037599421, + "grad_norm": 0.6369747102749306, + "learning_rate": 6.3447910375899764e-06, + "loss": 0.1279, + "step": 2450 + }, + { + "epoch": 0.886117136659436, + "grad_norm": 0.15804301506725021, + "learning_rate": 6.341911795762903e-06, + "loss": 0.0255, + "step": 2451 + }, + { + "epoch": 0.8864786695589298, + "grad_norm": 1.0608303130917285, + "learning_rate": 6.339032074336836e-06, + "loss": 0.1035, + "step": 2452 + }, + { + "epoch": 0.8868402024584238, + "grad_norm": 0.3772465570815861, + "learning_rate": 6.3361518743409885e-06, + "loss": 0.0525, + "step": 2453 + }, + { + "epoch": 0.8872017353579176, + "grad_norm": 1.173731492416296, + "learning_rate": 6.333271196804743e-06, + "loss": 0.0432, + "step": 2454 + }, + { + "epoch": 0.8875632682574114, + "grad_norm": 0.020726433115822878, + "learning_rate": 6.330390042757653e-06, + "loss": 0.0002, + "step": 2455 + }, + { + "epoch": 0.8879248011569053, + "grad_norm": 0.027613588600986876, + "learning_rate": 6.3275084132294425e-06, + "loss": 0.0009, + "step": 2456 + }, + { + "epoch": 0.8882863340563991, + "grad_norm": 0.7423160281791397, + "learning_rate": 6.324626309250006e-06, + "loss": 0.0752, + "step": 2457 + }, + { + "epoch": 0.888647866955893, + "grad_norm": 0.404231483273273, + "learning_rate": 6.321743731849406e-06, + "loss": 0.0354, + "step": 2458 + }, + { + "epoch": 0.8890093998553869, + "grad_norm": 0.13356229634544145, + "learning_rate": 6.3188606820578744e-06, + "loss": 0.0206, + "step": 2459 + }, + { + "epoch": 0.8893709327548807, + "grad_norm": 0.005630978657407525, + "learning_rate": 6.315977160905813e-06, + "loss": 0.0002, + "step": 2460 + }, + { + "epoch": 0.8897324656543746, + "grad_norm": 0.39047305852861497, + "learning_rate": 6.313093169423793e-06, + "loss": 0.0104, + "step": 2461 + }, + { + "epoch": 0.8900939985538684, + "grad_norm": 0.16394803773702235, + "learning_rate": 6.3102087086425516e-06, + "loss": 0.0255, + "step": 2462 + }, + { + "epoch": 0.8904555314533622, + "grad_norm": 0.6389543096808127, + "learning_rate": 6.307323779592993e-06, + "loss": 0.0476, + "step": 2463 + }, + { + "epoch": 0.8908170643528561, + "grad_norm": 0.4761379245601871, + "learning_rate": 6.304438383306193e-06, + "loss": 0.1113, + "step": 2464 + }, + { + "epoch": 0.8911785972523499, + "grad_norm": 0.1496692408541743, + "learning_rate": 6.301552520813388e-06, + "loss": 0.0206, + "step": 2465 + }, + { + "epoch": 0.8915401301518439, + "grad_norm": 0.03970722349529476, + "learning_rate": 6.298666193145988e-06, + "loss": 0.0013, + "step": 2466 + }, + { + "epoch": 0.8919016630513377, + "grad_norm": 0.5566678413891573, + "learning_rate": 6.295779401335564e-06, + "loss": 0.0391, + "step": 2467 + }, + { + "epoch": 0.8922631959508315, + "grad_norm": 0.9148301612950618, + "learning_rate": 6.292892146413856e-06, + "loss": 0.0391, + "step": 2468 + }, + { + "epoch": 0.8926247288503254, + "grad_norm": 0.0019046690858030423, + "learning_rate": 6.290004429412768e-06, + "loss": 0.0001, + "step": 2469 + }, + { + "epoch": 0.8929862617498192, + "grad_norm": 0.9411410761535084, + "learning_rate": 6.287116251364369e-06, + "loss": 0.0815, + "step": 2470 + }, + { + "epoch": 0.8933477946493131, + "grad_norm": 0.5344369245147377, + "learning_rate": 6.284227613300893e-06, + "loss": 0.1279, + "step": 2471 + }, + { + "epoch": 0.8937093275488069, + "grad_norm": 1.497418884849518, + "learning_rate": 6.28133851625474e-06, + "loss": 0.1113, + "step": 2472 + }, + { + "epoch": 0.8940708604483008, + "grad_norm": 0.12164062209084307, + "learning_rate": 6.2784489612584695e-06, + "loss": 0.0206, + "step": 2473 + }, + { + "epoch": 0.8944323933477947, + "grad_norm": 0.2499009711295901, + "learning_rate": 6.275558949344813e-06, + "loss": 0.0354, + "step": 2474 + }, + { + "epoch": 0.8947939262472885, + "grad_norm": 0.4983316465669491, + "learning_rate": 6.272668481546655e-06, + "loss": 0.0525, + "step": 2475 + }, + { + "epoch": 0.8951554591467824, + "grad_norm": 3.1262893319490566, + "learning_rate": 6.2697775588970526e-06, + "loss": 0.1836, + "step": 2476 + }, + { + "epoch": 0.8955169920462762, + "grad_norm": 0.8044193686412383, + "learning_rate": 6.266886182429216e-06, + "loss": 0.1113, + "step": 2477 + }, + { + "epoch": 0.89587852494577, + "grad_norm": 0.6986890067782227, + "learning_rate": 6.263994353176526e-06, + "loss": 0.063, + "step": 2478 + }, + { + "epoch": 0.896240057845264, + "grad_norm": 0.4630532999993751, + "learning_rate": 6.261102072172523e-06, + "loss": 0.0432, + "step": 2479 + }, + { + "epoch": 0.8966015907447578, + "grad_norm": 0.009367869370840118, + "learning_rate": 6.258209340450903e-06, + "loss": 0.0003, + "step": 2480 + }, + { + "epoch": 0.8969631236442517, + "grad_norm": 0.509531366225729, + "learning_rate": 6.2553161590455305e-06, + "loss": 0.0957, + "step": 2481 + }, + { + "epoch": 0.8973246565437455, + "grad_norm": 0.18131095768552818, + "learning_rate": 6.252422528990427e-06, + "loss": 0.0286, + "step": 2482 + }, + { + "epoch": 0.8976861894432393, + "grad_norm": 1.4692727995894495, + "learning_rate": 6.249528451319777e-06, + "loss": 0.0889, + "step": 2483 + }, + { + "epoch": 0.8980477223427332, + "grad_norm": 0.23629926542194643, + "learning_rate": 6.246633927067923e-06, + "loss": 0.0064, + "step": 2484 + }, + { + "epoch": 0.898409255242227, + "grad_norm": 0.1262448120770378, + "learning_rate": 6.243738957269366e-06, + "loss": 0.0206, + "step": 2485 + }, + { + "epoch": 0.8987707881417208, + "grad_norm": 0.06474220064983108, + "learning_rate": 6.240843542958768e-06, + "loss": 0.0022, + "step": 2486 + }, + { + "epoch": 0.8991323210412148, + "grad_norm": 0.9929756656251513, + "learning_rate": 6.23794768517095e-06, + "loss": 0.0688, + "step": 2487 + }, + { + "epoch": 0.8994938539407086, + "grad_norm": 0.081026323118129, + "learning_rate": 6.235051384940889e-06, + "loss": 0.0022, + "step": 2488 + }, + { + "epoch": 0.8998553868402025, + "grad_norm": 0.9463223451411042, + "learning_rate": 6.232154643303726e-06, + "loss": 0.0579, + "step": 2489 + }, + { + "epoch": 0.9002169197396963, + "grad_norm": 0.29066272810819604, + "learning_rate": 6.229257461294752e-06, + "loss": 0.0317, + "step": 2490 + }, + { + "epoch": 0.9005784526391901, + "grad_norm": 0.001595738335027853, + "learning_rate": 6.2263598399494205e-06, + "loss": 0.0001, + "step": 2491 + }, + { + "epoch": 0.900939985538684, + "grad_norm": 0.2713525852441914, + "learning_rate": 6.2234617803033425e-06, + "loss": 0.0354, + "step": 2492 + }, + { + "epoch": 0.9013015184381779, + "grad_norm": 0.6031124049363402, + "learning_rate": 6.2205632833922805e-06, + "loss": 0.1035, + "step": 2493 + }, + { + "epoch": 0.9016630513376718, + "grad_norm": 0.04799963934373952, + "learning_rate": 6.217664350252162e-06, + "loss": 0.0005, + "step": 2494 + }, + { + "epoch": 0.9020245842371656, + "grad_norm": 0.519071613876734, + "learning_rate": 6.214764981919057e-06, + "loss": 0.1113, + "step": 2495 + }, + { + "epoch": 0.9023861171366594, + "grad_norm": 0.7824543882186696, + "learning_rate": 6.2118651794292075e-06, + "loss": 0.1035, + "step": 2496 + }, + { + "epoch": 0.9027476500361533, + "grad_norm": 0.3338665559808172, + "learning_rate": 6.208964943818997e-06, + "loss": 0.0432, + "step": 2497 + }, + { + "epoch": 0.9031091829356471, + "grad_norm": 0.23875086475152046, + "learning_rate": 6.20606427612497e-06, + "loss": 0.0286, + "step": 2498 + }, + { + "epoch": 0.903470715835141, + "grad_norm": 0.021886718693747045, + "learning_rate": 6.203163177383828e-06, + "loss": 0.0007, + "step": 2499 + }, + { + "epoch": 0.9038322487346349, + "grad_norm": 0.35460860494787166, + "learning_rate": 6.200261648632417e-06, + "loss": 0.0103, + "step": 2500 + }, + { + "epoch": 0.9041937816341287, + "grad_norm": 0.10805013842672663, + "learning_rate": 6.1973596909077485e-06, + "loss": 0.0184, + "step": 2501 + }, + { + "epoch": 0.9045553145336226, + "grad_norm": 0.08983734144792825, + "learning_rate": 6.194457305246978e-06, + "loss": 0.0031, + "step": 2502 + }, + { + "epoch": 0.9049168474331164, + "grad_norm": 0.3033391978605701, + "learning_rate": 6.191554492687418e-06, + "loss": 0.0354, + "step": 2503 + }, + { + "epoch": 0.9052783803326103, + "grad_norm": 0.2450869716326271, + "learning_rate": 6.188651254266536e-06, + "loss": 0.0317, + "step": 2504 + }, + { + "epoch": 0.9056399132321041, + "grad_norm": 0.4502201974845171, + "learning_rate": 6.185747591021944e-06, + "loss": 0.0432, + "step": 2505 + }, + { + "epoch": 0.9060014461315979, + "grad_norm": 1.7058631949374627, + "learning_rate": 6.182843503991416e-06, + "loss": 0.0476, + "step": 2506 + }, + { + "epoch": 0.9063629790310919, + "grad_norm": 0.22729413671135154, + "learning_rate": 6.179938994212868e-06, + "loss": 0.0286, + "step": 2507 + }, + { + "epoch": 0.9067245119305857, + "grad_norm": 0.8386623994533893, + "learning_rate": 6.177034062724372e-06, + "loss": 0.0432, + "step": 2508 + }, + { + "epoch": 0.9070860448300795, + "grad_norm": 0.029792727924923195, + "learning_rate": 6.17412871056415e-06, + "loss": 0.0011, + "step": 2509 + }, + { + "epoch": 0.9074475777295734, + "grad_norm": 1.3157558403686311, + "learning_rate": 6.171222938770576e-06, + "loss": 0.0231, + "step": 2510 + }, + { + "epoch": 0.9078091106290672, + "grad_norm": 0.2924795994254827, + "learning_rate": 6.16831674838217e-06, + "loss": 0.0205, + "step": 2511 + }, + { + "epoch": 0.9081706435285611, + "grad_norm": 0.13504336842148534, + "learning_rate": 6.165410140437605e-06, + "loss": 0.0184, + "step": 2512 + }, + { + "epoch": 0.908532176428055, + "grad_norm": 0.05439053072265098, + "learning_rate": 6.162503115975701e-06, + "loss": 0.0019, + "step": 2513 + }, + { + "epoch": 0.9088937093275488, + "grad_norm": 0.02269396855185247, + "learning_rate": 6.15959567603543e-06, + "loss": 0.0008, + "step": 2514 + }, + { + "epoch": 0.9092552422270427, + "grad_norm": 1.2964969650767653, + "learning_rate": 6.156687821655909e-06, + "loss": 0.1035, + "step": 2515 + }, + { + "epoch": 0.9096167751265365, + "grad_norm": 0.3711917836075629, + "learning_rate": 6.153779553876403e-06, + "loss": 0.0317, + "step": 2516 + }, + { + "epoch": 0.9099783080260304, + "grad_norm": 1.4592200726539608, + "learning_rate": 6.1508708737363295e-06, + "loss": 0.0752, + "step": 2517 + }, + { + "epoch": 0.9103398409255242, + "grad_norm": 0.0017413380991936195, + "learning_rate": 6.147961782275248e-06, + "loss": 0.0001, + "step": 2518 + }, + { + "epoch": 0.910701373825018, + "grad_norm": 3.1789260927909755, + "learning_rate": 6.145052280532868e-06, + "loss": 0.1738, + "step": 2519 + }, + { + "epoch": 0.911062906724512, + "grad_norm": 0.3926951185133265, + "learning_rate": 6.142142369549045e-06, + "loss": 0.0286, + "step": 2520 + }, + { + "epoch": 0.9114244396240058, + "grad_norm": 0.18653120844747254, + "learning_rate": 6.139232050363779e-06, + "loss": 0.0206, + "step": 2521 + }, + { + "epoch": 0.9117859725234997, + "grad_norm": 1.200332656249662, + "learning_rate": 6.13632132401722e-06, + "loss": 0.1113, + "step": 2522 + }, + { + "epoch": 0.9121475054229935, + "grad_norm": 2.9717868360675728, + "learning_rate": 6.133410191549658e-06, + "loss": 0.2129, + "step": 2523 + }, + { + "epoch": 0.9125090383224873, + "grad_norm": 1.5422218502137133, + "learning_rate": 6.130498654001534e-06, + "loss": 0.063, + "step": 2524 + }, + { + "epoch": 0.9128705712219812, + "grad_norm": 0.15570983200119065, + "learning_rate": 6.127586712413429e-06, + "loss": 0.0206, + "step": 2525 + }, + { + "epoch": 0.913232104121475, + "grad_norm": 0.027866523039067802, + "learning_rate": 6.124674367826072e-06, + "loss": 0.0006, + "step": 2526 + }, + { + "epoch": 0.913593637020969, + "grad_norm": 0.046264905440884664, + "learning_rate": 6.121761621280333e-06, + "loss": 0.0012, + "step": 2527 + }, + { + "epoch": 0.9139551699204628, + "grad_norm": 0.13345667468669367, + "learning_rate": 6.1188484738172264e-06, + "loss": 0.0184, + "step": 2528 + }, + { + "epoch": 0.9143167028199566, + "grad_norm": 0.045824962031401364, + "learning_rate": 6.115934926477911e-06, + "loss": 0.0017, + "step": 2529 + }, + { + "epoch": 0.9146782357194505, + "grad_norm": 0.8153247533497548, + "learning_rate": 6.11302098030369e-06, + "loss": 0.0476, + "step": 2530 + }, + { + "epoch": 0.9150397686189443, + "grad_norm": 0.2509927885568228, + "learning_rate": 6.110106636336004e-06, + "loss": 0.0165, + "step": 2531 + }, + { + "epoch": 0.9154013015184381, + "grad_norm": 0.06103677032287624, + "learning_rate": 6.107191895616442e-06, + "loss": 0.0028, + "step": 2532 + }, + { + "epoch": 0.915762834417932, + "grad_norm": 0.16961071342212394, + "learning_rate": 6.104276759186728e-06, + "loss": 0.0229, + "step": 2533 + }, + { + "epoch": 0.9161243673174259, + "grad_norm": 0.18149866560186595, + "learning_rate": 6.1013612280887344e-06, + "loss": 0.0206, + "step": 2534 + }, + { + "epoch": 0.9164859002169198, + "grad_norm": 1.0592586862176054, + "learning_rate": 6.098445303364472e-06, + "loss": 0.1934, + "step": 2535 + }, + { + "epoch": 0.9168474331164136, + "grad_norm": 0.6423583572508385, + "learning_rate": 6.095528986056088e-06, + "loss": 0.1455, + "step": 2536 + }, + { + "epoch": 0.9172089660159074, + "grad_norm": 0.07513827504169969, + "learning_rate": 6.092612277205876e-06, + "loss": 0.0022, + "step": 2537 + }, + { + "epoch": 0.9175704989154013, + "grad_norm": 0.8582768270829084, + "learning_rate": 6.0896951778562665e-06, + "loss": 0.0889, + "step": 2538 + }, + { + "epoch": 0.9179320318148951, + "grad_norm": 2.6074619530610823, + "learning_rate": 6.086777689049831e-06, + "loss": 0.0476, + "step": 2539 + }, + { + "epoch": 0.918293564714389, + "grad_norm": 1.52922146104831, + "learning_rate": 6.083859811829278e-06, + "loss": 0.0957, + "step": 2540 + }, + { + "epoch": 0.9186550976138829, + "grad_norm": 0.062395856020264276, + "learning_rate": 6.080941547237458e-06, + "loss": 0.0022, + "step": 2541 + }, + { + "epoch": 0.9190166305133767, + "grad_norm": 0.7382377156448532, + "learning_rate": 6.078022896317356e-06, + "loss": 0.1455, + "step": 2542 + }, + { + "epoch": 0.9193781634128706, + "grad_norm": 0.00500817679028902, + "learning_rate": 6.075103860112099e-06, + "loss": 0.0002, + "step": 2543 + }, + { + "epoch": 0.9197396963123644, + "grad_norm": 0.667025121366805, + "learning_rate": 6.07218443966495e-06, + "loss": 0.1191, + "step": 2544 + }, + { + "epoch": 0.9201012292118583, + "grad_norm": 0.028924605847842698, + "learning_rate": 6.069264636019306e-06, + "loss": 0.0008, + "step": 2545 + }, + { + "epoch": 0.9204627621113521, + "grad_norm": 0.21564929340887465, + "learning_rate": 6.066344450218711e-06, + "loss": 0.0231, + "step": 2546 + }, + { + "epoch": 0.920824295010846, + "grad_norm": 0.15840201139145313, + "learning_rate": 6.0634238833068315e-06, + "loss": 0.0184, + "step": 2547 + }, + { + "epoch": 0.9211858279103399, + "grad_norm": 0.8949672219296471, + "learning_rate": 6.060502936327481e-06, + "loss": 0.1553, + "step": 2548 + }, + { + "epoch": 0.9215473608098337, + "grad_norm": 0.040127401808680914, + "learning_rate": 6.057581610324605e-06, + "loss": 0.0011, + "step": 2549 + }, + { + "epoch": 0.9219088937093276, + "grad_norm": 0.1875225575221273, + "learning_rate": 6.054659906342284e-06, + "loss": 0.0229, + "step": 2550 + }, + { + "epoch": 0.9222704266088214, + "grad_norm": 0.11429714831494793, + "learning_rate": 6.051737825424737e-06, + "loss": 0.0206, + "step": 2551 + }, + { + "epoch": 0.9226319595083152, + "grad_norm": 0.06147321618838176, + "learning_rate": 6.048815368616311e-06, + "loss": 0.0027, + "step": 2552 + }, + { + "epoch": 0.9229934924078091, + "grad_norm": 0.9927067900463582, + "learning_rate": 6.045892536961494e-06, + "loss": 0.0525, + "step": 2553 + }, + { + "epoch": 0.923355025307303, + "grad_norm": 0.48379862829264775, + "learning_rate": 6.042969331504906e-06, + "loss": 0.1279, + "step": 2554 + }, + { + "epoch": 0.9237165582067968, + "grad_norm": 1.1182529732125415, + "learning_rate": 6.040045753291298e-06, + "loss": 0.063, + "step": 2555 + }, + { + "epoch": 0.9240780911062907, + "grad_norm": 0.05931324791818435, + "learning_rate": 6.037121803365559e-06, + "loss": 0.0017, + "step": 2556 + }, + { + "epoch": 0.9244396240057845, + "grad_norm": 0.0037057316417779024, + "learning_rate": 6.034197482772705e-06, + "loss": 0.0001, + "step": 2557 + }, + { + "epoch": 0.9248011569052784, + "grad_norm": 0.045200488573736576, + "learning_rate": 6.031272792557889e-06, + "loss": 0.0015, + "step": 2558 + }, + { + "epoch": 0.9251626898047722, + "grad_norm": 0.1753721626905568, + "learning_rate": 6.028347733766394e-06, + "loss": 0.0206, + "step": 2559 + }, + { + "epoch": 0.925524222704266, + "grad_norm": 0.5779717786939896, + "learning_rate": 6.025422307443636e-06, + "loss": 0.1191, + "step": 2560 + }, + { + "epoch": 0.92588575560376, + "grad_norm": 0.17041920842839656, + "learning_rate": 6.022496514635163e-06, + "loss": 0.0317, + "step": 2561 + }, + { + "epoch": 0.9262472885032538, + "grad_norm": 0.09283615818212684, + "learning_rate": 6.019570356386651e-06, + "loss": 0.001, + "step": 2562 + }, + { + "epoch": 0.9266088214027477, + "grad_norm": 0.03245523468937079, + "learning_rate": 6.016643833743908e-06, + "loss": 0.001, + "step": 2563 + }, + { + "epoch": 0.9269703543022415, + "grad_norm": 0.8333492868045457, + "learning_rate": 6.0137169477528745e-06, + "loss": 0.0354, + "step": 2564 + }, + { + "epoch": 0.9273318872017353, + "grad_norm": 0.8768522918131865, + "learning_rate": 6.010789699459616e-06, + "loss": 0.063, + "step": 2565 + }, + { + "epoch": 0.9276934201012292, + "grad_norm": 0.061177766277911945, + "learning_rate": 6.007862089910335e-06, + "loss": 0.002, + "step": 2566 + }, + { + "epoch": 0.928054953000723, + "grad_norm": 0.28178999245566655, + "learning_rate": 6.004934120151354e-06, + "loss": 0.0354, + "step": 2567 + }, + { + "epoch": 0.928416485900217, + "grad_norm": 0.011780564853130048, + "learning_rate": 6.002005791229131e-06, + "loss": 0.0002, + "step": 2568 + }, + { + "epoch": 0.9287780187997108, + "grad_norm": 0.07074590358127789, + "learning_rate": 5.999077104190249e-06, + "loss": 0.0027, + "step": 2569 + }, + { + "epoch": 0.9291395516992046, + "grad_norm": 0.20390804553682979, + "learning_rate": 5.9961480600814205e-06, + "loss": 0.0286, + "step": 2570 + }, + { + "epoch": 0.9295010845986985, + "grad_norm": 0.043678265058633606, + "learning_rate": 5.993218659949488e-06, + "loss": 0.0009, + "step": 2571 + }, + { + "epoch": 0.9298626174981923, + "grad_norm": 0.1726873875202264, + "learning_rate": 5.9902889048414125e-06, + "loss": 0.0286, + "step": 2572 + }, + { + "epoch": 0.9302241503976862, + "grad_norm": 0.952148916368132, + "learning_rate": 5.987358795804294e-06, + "loss": 0.0476, + "step": 2573 + }, + { + "epoch": 0.93058568329718, + "grad_norm": 0.010458769855223257, + "learning_rate": 5.984428333885349e-06, + "loss": 0.0002, + "step": 2574 + }, + { + "epoch": 0.9309472161966739, + "grad_norm": 0.002397756643828138, + "learning_rate": 5.981497520131926e-06, + "loss": 0.0001, + "step": 2575 + }, + { + "epoch": 0.9313087490961678, + "grad_norm": 3.4535320047118585, + "learning_rate": 5.9785663555914965e-06, + "loss": 0.1865, + "step": 2576 + }, + { + "epoch": 0.9316702819956616, + "grad_norm": 0.30800045433481377, + "learning_rate": 5.975634841311657e-06, + "loss": 0.0354, + "step": 2577 + }, + { + "epoch": 0.9320318148951554, + "grad_norm": 2.6948194544846573, + "learning_rate": 5.972702978340133e-06, + "loss": 0.3984, + "step": 2578 + }, + { + "epoch": 0.9323933477946493, + "grad_norm": 0.1443231177746419, + "learning_rate": 5.969770767724768e-06, + "loss": 0.0045, + "step": 2579 + }, + { + "epoch": 0.9327548806941431, + "grad_norm": 0.1319132370251666, + "learning_rate": 5.966838210513535e-06, + "loss": 0.0206, + "step": 2580 + }, + { + "epoch": 0.9331164135936371, + "grad_norm": 0.1602091579079953, + "learning_rate": 5.963905307754531e-06, + "loss": 0.0255, + "step": 2581 + }, + { + "epoch": 0.9334779464931309, + "grad_norm": 0.12843270233104845, + "learning_rate": 5.960972060495973e-06, + "loss": 0.0206, + "step": 2582 + }, + { + "epoch": 0.9338394793926247, + "grad_norm": 0.21672187162628068, + "learning_rate": 5.958038469786203e-06, + "loss": 0.0051, + "step": 2583 + }, + { + "epoch": 0.9342010122921186, + "grad_norm": 0.6003722864012999, + "learning_rate": 5.955104536673687e-06, + "loss": 0.1191, + "step": 2584 + }, + { + "epoch": 0.9345625451916124, + "grad_norm": 0.021240970522358106, + "learning_rate": 5.95217026220701e-06, + "loss": 0.0006, + "step": 2585 + }, + { + "epoch": 0.9349240780911063, + "grad_norm": 0.03708665635265594, + "learning_rate": 5.949235647434884e-06, + "loss": 0.0013, + "step": 2586 + }, + { + "epoch": 0.9352856109906001, + "grad_norm": 0.28886503003193337, + "learning_rate": 5.946300693406136e-06, + "loss": 0.0354, + "step": 2587 + }, + { + "epoch": 0.935647143890094, + "grad_norm": 0.9821371412107366, + "learning_rate": 5.943365401169721e-06, + "loss": 0.0576, + "step": 2588 + }, + { + "epoch": 0.9360086767895879, + "grad_norm": 0.24886000334003336, + "learning_rate": 5.94042977177471e-06, + "loss": 0.0391, + "step": 2589 + }, + { + "epoch": 0.9363702096890817, + "grad_norm": 2.0624849596988897, + "learning_rate": 5.937493806270297e-06, + "loss": 0.0957, + "step": 2590 + }, + { + "epoch": 0.9367317425885756, + "grad_norm": 0.22888303550853176, + "learning_rate": 5.9345575057057955e-06, + "loss": 0.0206, + "step": 2591 + }, + { + "epoch": 0.9370932754880694, + "grad_norm": 0.22279513881708257, + "learning_rate": 5.931620871130639e-06, + "loss": 0.0057, + "step": 2592 + }, + { + "epoch": 0.9374548083875632, + "grad_norm": 0.04339913319578736, + "learning_rate": 5.928683903594381e-06, + "loss": 0.0015, + "step": 2593 + }, + { + "epoch": 0.9378163412870572, + "grad_norm": 0.13315612229197613, + "learning_rate": 5.925746604146691e-06, + "loss": 0.0229, + "step": 2594 + }, + { + "epoch": 0.938177874186551, + "grad_norm": 0.16172464348627827, + "learning_rate": 5.922808973837359e-06, + "loss": 0.0258, + "step": 2595 + }, + { + "epoch": 0.9385394070860448, + "grad_norm": 0.12966767758471828, + "learning_rate": 5.919871013716294e-06, + "loss": 0.0229, + "step": 2596 + }, + { + "epoch": 0.9389009399855387, + "grad_norm": 0.0020073819990000467, + "learning_rate": 5.916932724833525e-06, + "loss": 0.0001, + "step": 2597 + }, + { + "epoch": 0.9392624728850325, + "grad_norm": 0.16316796953269969, + "learning_rate": 5.913994108239193e-06, + "loss": 0.0051, + "step": 2598 + }, + { + "epoch": 0.9396240057845264, + "grad_norm": 0.0017849508961832853, + "learning_rate": 5.911055164983559e-06, + "loss": 0.0001, + "step": 2599 + }, + { + "epoch": 0.9399855386840202, + "grad_norm": 0.11585549054680494, + "learning_rate": 5.908115896117e-06, + "loss": 0.0184, + "step": 2600 + }, + { + "epoch": 0.940347071583514, + "grad_norm": 0.4641028479209899, + "learning_rate": 5.905176302690015e-06, + "loss": 0.0117, + "step": 2601 + }, + { + "epoch": 0.940708604483008, + "grad_norm": 0.007666281778307229, + "learning_rate": 5.902236385753207e-06, + "loss": 0.0002, + "step": 2602 + }, + { + "epoch": 0.9410701373825018, + "grad_norm": 0.3719975472549993, + "learning_rate": 5.899296146357307e-06, + "loss": 0.0391, + "step": 2603 + }, + { + "epoch": 0.9414316702819957, + "grad_norm": 0.2169019890010398, + "learning_rate": 5.896355585553154e-06, + "loss": 0.0286, + "step": 2604 + }, + { + "epoch": 0.9417932031814895, + "grad_norm": 0.20712473766969164, + "learning_rate": 5.893414704391702e-06, + "loss": 0.0092, + "step": 2605 + }, + { + "epoch": 0.9421547360809833, + "grad_norm": 0.2845735762687585, + "learning_rate": 5.890473503924026e-06, + "loss": 0.0092, + "step": 2606 + }, + { + "epoch": 0.9425162689804772, + "grad_norm": 0.12286800342625223, + "learning_rate": 5.887531985201307e-06, + "loss": 0.0045, + "step": 2607 + }, + { + "epoch": 0.9428778018799711, + "grad_norm": 0.5829170744967145, + "learning_rate": 5.884590149274843e-06, + "loss": 0.1191, + "step": 2608 + }, + { + "epoch": 0.943239334779465, + "grad_norm": 0.36548213379912986, + "learning_rate": 5.881647997196046e-06, + "loss": 0.0255, + "step": 2609 + }, + { + "epoch": 0.9436008676789588, + "grad_norm": 0.007704251256414531, + "learning_rate": 5.8787055300164406e-06, + "loss": 0.0002, + "step": 2610 + }, + { + "epoch": 0.9439624005784526, + "grad_norm": 0.8241160547527555, + "learning_rate": 5.875762748787666e-06, + "loss": 0.0815, + "step": 2611 + }, + { + "epoch": 0.9443239334779465, + "grad_norm": 0.1454905961025096, + "learning_rate": 5.872819654561468e-06, + "loss": 0.0206, + "step": 2612 + }, + { + "epoch": 0.9446854663774403, + "grad_norm": 0.2780982105568724, + "learning_rate": 5.869876248389711e-06, + "loss": 0.0286, + "step": 2613 + }, + { + "epoch": 0.9450469992769343, + "grad_norm": 0.2504261049128973, + "learning_rate": 5.866932531324366e-06, + "loss": 0.0229, + "step": 2614 + }, + { + "epoch": 0.9454085321764281, + "grad_norm": 0.8781228908764114, + "learning_rate": 5.863988504417516e-06, + "loss": 0.0432, + "step": 2615 + }, + { + "epoch": 0.9457700650759219, + "grad_norm": 1.9170170485874412, + "learning_rate": 5.861044168721358e-06, + "loss": 0.1367, + "step": 2616 + }, + { + "epoch": 0.9461315979754158, + "grad_norm": 0.3586546187201101, + "learning_rate": 5.858099525288194e-06, + "loss": 0.0146, + "step": 2617 + }, + { + "epoch": 0.9464931308749096, + "grad_norm": 0.10888479555630833, + "learning_rate": 5.855154575170445e-06, + "loss": 0.0146, + "step": 2618 + }, + { + "epoch": 0.9468546637744034, + "grad_norm": 1.6663316146240281, + "learning_rate": 5.852209319420629e-06, + "loss": 0.0815, + "step": 2619 + }, + { + "epoch": 0.9472161966738973, + "grad_norm": 0.5471101553624205, + "learning_rate": 5.849263759091382e-06, + "loss": 0.0146, + "step": 2620 + }, + { + "epoch": 0.9475777295733911, + "grad_norm": 0.005753322645179661, + "learning_rate": 5.846317895235446e-06, + "loss": 0.0002, + "step": 2621 + }, + { + "epoch": 0.9479392624728851, + "grad_norm": 0.1155882558811455, + "learning_rate": 5.843371728905673e-06, + "loss": 0.0146, + "step": 2622 + }, + { + "epoch": 0.9483007953723789, + "grad_norm": 0.033862140794749064, + "learning_rate": 5.840425261155022e-06, + "loss": 0.0013, + "step": 2623 + }, + { + "epoch": 0.9486623282718727, + "grad_norm": 0.14739387947052557, + "learning_rate": 5.8374784930365616e-06, + "loss": 0.0165, + "step": 2624 + }, + { + "epoch": 0.9490238611713666, + "grad_norm": 2.0731902693856563, + "learning_rate": 5.8345314256034624e-06, + "loss": 0.0231, + "step": 2625 + }, + { + "epoch": 0.9493853940708604, + "grad_norm": 0.7988030161845844, + "learning_rate": 5.8315840599090104e-06, + "loss": 0.1113, + "step": 2626 + }, + { + "epoch": 0.9497469269703543, + "grad_norm": 0.023092357419508828, + "learning_rate": 5.8286363970065876e-06, + "loss": 0.0006, + "step": 2627 + }, + { + "epoch": 0.9501084598698482, + "grad_norm": 0.16310902108555886, + "learning_rate": 5.8256884379496945e-06, + "loss": 0.0012, + "step": 2628 + }, + { + "epoch": 0.950469992769342, + "grad_norm": 0.14459826620140215, + "learning_rate": 5.8227401837919275e-06, + "loss": 0.0184, + "step": 2629 + }, + { + "epoch": 0.9508315256688359, + "grad_norm": 0.03928504528718341, + "learning_rate": 5.81979163558699e-06, + "loss": 0.0008, + "step": 2630 + }, + { + "epoch": 0.9511930585683297, + "grad_norm": 1.2800248955246722, + "learning_rate": 5.816842794388697e-06, + "loss": 0.0815, + "step": 2631 + }, + { + "epoch": 0.9515545914678236, + "grad_norm": 0.0017442877002064375, + "learning_rate": 5.81389366125096e-06, + "loss": 0.0, + "step": 2632 + }, + { + "epoch": 0.9519161243673174, + "grad_norm": 0.3806952802030742, + "learning_rate": 5.810944237227803e-06, + "loss": 0.0231, + "step": 2633 + }, + { + "epoch": 0.9522776572668112, + "grad_norm": 0.15415380912108315, + "learning_rate": 5.807994523373345e-06, + "loss": 0.0184, + "step": 2634 + }, + { + "epoch": 0.9526391901663052, + "grad_norm": 0.20040872354104305, + "learning_rate": 5.805044520741814e-06, + "loss": 0.0146, + "step": 2635 + }, + { + "epoch": 0.953000723065799, + "grad_norm": 0.06640314986666371, + "learning_rate": 5.8020942303875425e-06, + "loss": 0.0021, + "step": 2636 + }, + { + "epoch": 0.9533622559652929, + "grad_norm": 0.041947973042427276, + "learning_rate": 5.799143653364961e-06, + "loss": 0.0011, + "step": 2637 + }, + { + "epoch": 0.9537237888647867, + "grad_norm": 0.5151667461231261, + "learning_rate": 5.796192790728608e-06, + "loss": 0.0391, + "step": 2638 + }, + { + "epoch": 0.9540853217642805, + "grad_norm": 0.5763664144661935, + "learning_rate": 5.793241643533119e-06, + "loss": 0.0354, + "step": 2639 + }, + { + "epoch": 0.9544468546637744, + "grad_norm": 0.7158692699709946, + "learning_rate": 5.790290212833235e-06, + "loss": 0.2031, + "step": 2640 + }, + { + "epoch": 0.9548083875632682, + "grad_norm": 0.18009118000932645, + "learning_rate": 5.787338499683794e-06, + "loss": 0.0206, + "step": 2641 + }, + { + "epoch": 0.9551699204627621, + "grad_norm": 1.2996693596662425, + "learning_rate": 5.78438650513974e-06, + "loss": 0.0688, + "step": 2642 + }, + { + "epoch": 0.955531453362256, + "grad_norm": 0.009686295243281516, + "learning_rate": 5.781434230256114e-06, + "loss": 0.0001, + "step": 2643 + }, + { + "epoch": 0.9558929862617498, + "grad_norm": 1.1194506482960493, + "learning_rate": 5.778481676088062e-06, + "loss": 0.0688, + "step": 2644 + }, + { + "epoch": 0.9562545191612437, + "grad_norm": 0.6094656458667184, + "learning_rate": 5.7755288436908195e-06, + "loss": 0.0476, + "step": 2645 + }, + { + "epoch": 0.9566160520607375, + "grad_norm": 0.2838354616835633, + "learning_rate": 5.772575734119734e-06, + "loss": 0.0286, + "step": 2646 + }, + { + "epoch": 0.9569775849602313, + "grad_norm": 0.09328667396570521, + "learning_rate": 5.769622348430243e-06, + "loss": 0.0024, + "step": 2647 + }, + { + "epoch": 0.9573391178597253, + "grad_norm": 0.16271569473136316, + "learning_rate": 5.766668687677888e-06, + "loss": 0.0206, + "step": 2648 + }, + { + "epoch": 0.9577006507592191, + "grad_norm": 0.3640795642462634, + "learning_rate": 5.763714752918305e-06, + "loss": 0.0008, + "step": 2649 + }, + { + "epoch": 0.958062183658713, + "grad_norm": 0.1136926748825311, + "learning_rate": 5.760760545207232e-06, + "loss": 0.0146, + "step": 2650 + }, + { + "epoch": 0.9584237165582068, + "grad_norm": 0.15970123194896751, + "learning_rate": 5.757806065600499e-06, + "loss": 0.004, + "step": 2651 + }, + { + "epoch": 0.9587852494577006, + "grad_norm": 1.5268855237827303, + "learning_rate": 5.754851315154038e-06, + "loss": 0.063, + "step": 2652 + }, + { + "epoch": 0.9591467823571945, + "grad_norm": 0.20602953793824066, + "learning_rate": 5.7518962949238786e-06, + "loss": 0.0146, + "step": 2653 + }, + { + "epoch": 0.9595083152566883, + "grad_norm": 0.18422608862096315, + "learning_rate": 5.748941005966141e-06, + "loss": 0.0165, + "step": 2654 + }, + { + "epoch": 0.9598698481561823, + "grad_norm": 0.117435618347497, + "learning_rate": 5.745985449337045e-06, + "loss": 0.0103, + "step": 2655 + }, + { + "epoch": 0.9602313810556761, + "grad_norm": 0.015491462697873106, + "learning_rate": 5.743029626092907e-06, + "loss": 0.0005, + "step": 2656 + }, + { + "epoch": 0.9605929139551699, + "grad_norm": 0.24724263877658492, + "learning_rate": 5.740073537290137e-06, + "loss": 0.0131, + "step": 2657 + }, + { + "epoch": 0.9609544468546638, + "grad_norm": 0.19018099495360868, + "learning_rate": 5.737117183985242e-06, + "loss": 0.0206, + "step": 2658 + }, + { + "epoch": 0.9613159797541576, + "grad_norm": 0.0873669128749022, + "learning_rate": 5.734160567234821e-06, + "loss": 0.0031, + "step": 2659 + }, + { + "epoch": 0.9616775126536515, + "grad_norm": 0.0098795877157565, + "learning_rate": 5.731203688095569e-06, + "loss": 0.0002, + "step": 2660 + }, + { + "epoch": 0.9620390455531453, + "grad_norm": 0.07783213116578044, + "learning_rate": 5.728246547624272e-06, + "loss": 0.0081, + "step": 2661 + }, + { + "epoch": 0.9624005784526392, + "grad_norm": 0.11767325322923074, + "learning_rate": 5.725289146877812e-06, + "loss": 0.0146, + "step": 2662 + }, + { + "epoch": 0.9627621113521331, + "grad_norm": 0.04198462191260759, + "learning_rate": 5.722331486913165e-06, + "loss": 0.0007, + "step": 2663 + }, + { + "epoch": 0.9631236442516269, + "grad_norm": 0.04415436103729363, + "learning_rate": 5.719373568787396e-06, + "loss": 0.0013, + "step": 2664 + }, + { + "epoch": 0.9634851771511207, + "grad_norm": 0.07803286465047395, + "learning_rate": 5.716415393557667e-06, + "loss": 0.0103, + "step": 2665 + }, + { + "epoch": 0.9638467100506146, + "grad_norm": 0.0036672426606927295, + "learning_rate": 5.713456962281227e-06, + "loss": 0.0001, + "step": 2666 + }, + { + "epoch": 0.9642082429501084, + "grad_norm": 0.22405508431621732, + "learning_rate": 5.7104982760154184e-06, + "loss": 0.0146, + "step": 2667 + }, + { + "epoch": 0.9645697758496024, + "grad_norm": 0.09513513198737242, + "learning_rate": 5.707539335817676e-06, + "loss": 0.0092, + "step": 2668 + }, + { + "epoch": 0.9649313087490962, + "grad_norm": 0.9521585583651768, + "learning_rate": 5.704580142745525e-06, + "loss": 0.2344, + "step": 2669 + }, + { + "epoch": 0.96529284164859, + "grad_norm": 0.23829671671899008, + "learning_rate": 5.701620697856579e-06, + "loss": 0.0206, + "step": 2670 + }, + { + "epoch": 0.9656543745480839, + "grad_norm": 0.10062857645628917, + "learning_rate": 5.6986610022085445e-06, + "loss": 0.0103, + "step": 2671 + }, + { + "epoch": 0.9660159074475777, + "grad_norm": 0.12272005153021327, + "learning_rate": 5.695701056859213e-06, + "loss": 0.0117, + "step": 2672 + }, + { + "epoch": 0.9663774403470716, + "grad_norm": 1.1897385690299354, + "learning_rate": 5.692740862866472e-06, + "loss": 0.2031, + "step": 2673 + }, + { + "epoch": 0.9667389732465654, + "grad_norm": 0.9937844914419128, + "learning_rate": 5.689780421288295e-06, + "loss": 0.0286, + "step": 2674 + }, + { + "epoch": 0.9671005061460592, + "grad_norm": 0.7467905559660643, + "learning_rate": 5.686819733182739e-06, + "loss": 0.1191, + "step": 2675 + }, + { + "epoch": 0.9674620390455532, + "grad_norm": 0.007496366139446289, + "learning_rate": 5.683858799607955e-06, + "loss": 0.0002, + "step": 2676 + }, + { + "epoch": 0.967823571945047, + "grad_norm": 0.680535640012393, + "learning_rate": 5.68089762162218e-06, + "loss": 0.2031, + "step": 2677 + }, + { + "epoch": 0.9681851048445409, + "grad_norm": 0.01598345768476426, + "learning_rate": 5.67793620028374e-06, + "loss": 0.0005, + "step": 2678 + }, + { + "epoch": 0.9685466377440347, + "grad_norm": 0.005855199120157678, + "learning_rate": 5.674974536651045e-06, + "loss": 0.0001, + "step": 2679 + }, + { + "epoch": 0.9689081706435285, + "grad_norm": 2.0760306188810005, + "learning_rate": 5.672012631782593e-06, + "loss": 0.1367, + "step": 2680 + }, + { + "epoch": 0.9692697035430224, + "grad_norm": 0.3295736180974514, + "learning_rate": 5.669050486736968e-06, + "loss": 0.0184, + "step": 2681 + }, + { + "epoch": 0.9696312364425163, + "grad_norm": 0.017481160999546896, + "learning_rate": 5.666088102572838e-06, + "loss": 0.0006, + "step": 2682 + }, + { + "epoch": 0.9699927693420102, + "grad_norm": 0.019164042462135468, + "learning_rate": 5.663125480348963e-06, + "loss": 0.0006, + "step": 2683 + }, + { + "epoch": 0.970354302241504, + "grad_norm": 0.16120353312505728, + "learning_rate": 5.660162621124182e-06, + "loss": 0.0184, + "step": 2684 + }, + { + "epoch": 0.9707158351409978, + "grad_norm": 0.12103249235842241, + "learning_rate": 5.657199525957419e-06, + "loss": 0.0146, + "step": 2685 + }, + { + "epoch": 0.9710773680404917, + "grad_norm": 0.22096325944887643, + "learning_rate": 5.654236195907683e-06, + "loss": 0.0231, + "step": 2686 + }, + { + "epoch": 0.9714389009399855, + "grad_norm": 0.13541619042431535, + "learning_rate": 5.65127263203407e-06, + "loss": 0.0131, + "step": 2687 + }, + { + "epoch": 0.9718004338394793, + "grad_norm": 0.9294675144548845, + "learning_rate": 5.648308835395755e-06, + "loss": 0.1553, + "step": 2688 + }, + { + "epoch": 0.9721619667389733, + "grad_norm": 1.052479215363782, + "learning_rate": 5.645344807051999e-06, + "loss": 0.0476, + "step": 2689 + }, + { + "epoch": 0.9725234996384671, + "grad_norm": 0.13717384438577362, + "learning_rate": 5.642380548062145e-06, + "loss": 0.0165, + "step": 2690 + }, + { + "epoch": 0.972885032537961, + "grad_norm": 0.25077249619665204, + "learning_rate": 5.63941605948562e-06, + "loss": 0.0255, + "step": 2691 + }, + { + "epoch": 0.9732465654374548, + "grad_norm": 0.12769901107294443, + "learning_rate": 5.636451342381928e-06, + "loss": 0.0184, + "step": 2692 + }, + { + "epoch": 0.9736080983369486, + "grad_norm": 0.7352224550925245, + "learning_rate": 5.633486397810661e-06, + "loss": 0.1191, + "step": 2693 + }, + { + "epoch": 0.9739696312364425, + "grad_norm": 2.4806804656951984, + "learning_rate": 5.630521226831491e-06, + "loss": 0.1553, + "step": 2694 + }, + { + "epoch": 0.9743311641359363, + "grad_norm": 0.0118235591170205, + "learning_rate": 5.627555830504167e-06, + "loss": 0.0003, + "step": 2695 + }, + { + "epoch": 0.9746926970354303, + "grad_norm": 0.636488294065816, + "learning_rate": 5.6245902098885205e-06, + "loss": 0.0051, + "step": 2696 + }, + { + "epoch": 0.9750542299349241, + "grad_norm": 3.0726900318601973, + "learning_rate": 5.621624366044464e-06, + "loss": 0.005, + "step": 2697 + }, + { + "epoch": 0.9754157628344179, + "grad_norm": 0.028301081147141978, + "learning_rate": 5.6186583000319925e-06, + "loss": 0.0008, + "step": 2698 + }, + { + "epoch": 0.9757772957339118, + "grad_norm": 0.7030319372298294, + "learning_rate": 5.615692012911175e-06, + "loss": 0.1367, + "step": 2699 + }, + { + "epoch": 0.9761388286334056, + "grad_norm": 0.19240898518074984, + "learning_rate": 5.612725505742161e-06, + "loss": 0.0231, + "step": 2700 + }, + { + "epoch": 0.9765003615328995, + "grad_norm": 0.5354250174320735, + "learning_rate": 5.609758779585182e-06, + "loss": 0.0286, + "step": 2701 + }, + { + "epoch": 0.9768618944323934, + "grad_norm": 2.3808040853559485, + "learning_rate": 5.606791835500543e-06, + "loss": 0.1738, + "step": 2702 + }, + { + "epoch": 0.9772234273318872, + "grad_norm": 0.20398467571288004, + "learning_rate": 5.603824674548629e-06, + "loss": 0.0286, + "step": 2703 + }, + { + "epoch": 0.9775849602313811, + "grad_norm": 1.8982708818641028, + "learning_rate": 5.600857297789904e-06, + "loss": 0.0957, + "step": 2704 + }, + { + "epoch": 0.9779464931308749, + "grad_norm": 0.004789809615362788, + "learning_rate": 5.597889706284909e-06, + "loss": 0.0002, + "step": 2705 + }, + { + "epoch": 0.9783080260303688, + "grad_norm": 0.16768117518869394, + "learning_rate": 5.594921901094259e-06, + "loss": 0.0206, + "step": 2706 + }, + { + "epoch": 0.9786695589298626, + "grad_norm": 0.005654033607004196, + "learning_rate": 5.591953883278645e-06, + "loss": 0.0001, + "step": 2707 + }, + { + "epoch": 0.9790310918293564, + "grad_norm": 0.04539726366922001, + "learning_rate": 5.58898565389884e-06, + "loss": 0.0002, + "step": 2708 + }, + { + "epoch": 0.9793926247288504, + "grad_norm": 2.2238900409847053, + "learning_rate": 5.5860172140156866e-06, + "loss": 0.0815, + "step": 2709 + }, + { + "epoch": 0.9797541576283442, + "grad_norm": 1.2285472654680405, + "learning_rate": 5.583048564690103e-06, + "loss": 0.0688, + "step": 2710 + }, + { + "epoch": 0.980115690527838, + "grad_norm": 0.06160298929134034, + "learning_rate": 5.580079706983087e-06, + "loss": 0.0022, + "step": 2711 + }, + { + "epoch": 0.9804772234273319, + "grad_norm": 0.2921789470411181, + "learning_rate": 5.577110641955705e-06, + "loss": 0.0286, + "step": 2712 + }, + { + "epoch": 0.9808387563268257, + "grad_norm": 0.08011458519080997, + "learning_rate": 5.5741413706691015e-06, + "loss": 0.0103, + "step": 2713 + }, + { + "epoch": 0.9812002892263196, + "grad_norm": 0.2233505291329589, + "learning_rate": 5.571171894184494e-06, + "loss": 0.0286, + "step": 2714 + }, + { + "epoch": 0.9815618221258134, + "grad_norm": 0.10770136008649188, + "learning_rate": 5.568202213563172e-06, + "loss": 0.0146, + "step": 2715 + }, + { + "epoch": 0.9819233550253073, + "grad_norm": 0.7823354224508241, + "learning_rate": 5.565232329866499e-06, + "loss": 0.0476, + "step": 2716 + }, + { + "epoch": 0.9822848879248012, + "grad_norm": 0.27263168125480747, + "learning_rate": 5.562262244155909e-06, + "loss": 0.0082, + "step": 2717 + }, + { + "epoch": 0.982646420824295, + "grad_norm": 0.08957690848040395, + "learning_rate": 5.559291957492914e-06, + "loss": 0.0028, + "step": 2718 + }, + { + "epoch": 0.9830079537237889, + "grad_norm": 0.4647148026326685, + "learning_rate": 5.556321470939089e-06, + "loss": 0.0082, + "step": 2719 + }, + { + "epoch": 0.9833694866232827, + "grad_norm": 0.11329157655794693, + "learning_rate": 5.553350785556089e-06, + "loss": 0.0165, + "step": 2720 + }, + { + "epoch": 0.9837310195227765, + "grad_norm": 0.10351469908454596, + "learning_rate": 5.550379902405636e-06, + "loss": 0.004, + "step": 2721 + }, + { + "epoch": 0.9840925524222705, + "grad_norm": 0.733160260139405, + "learning_rate": 5.547408822549521e-06, + "loss": 0.1641, + "step": 2722 + }, + { + "epoch": 0.9844540853217643, + "grad_norm": 0.014790702908466123, + "learning_rate": 5.544437547049608e-06, + "loss": 0.0005, + "step": 2723 + }, + { + "epoch": 0.9848156182212582, + "grad_norm": 1.241335754579528, + "learning_rate": 5.5414660769678296e-06, + "loss": 0.0391, + "step": 2724 + }, + { + "epoch": 0.985177151120752, + "grad_norm": 1.499949714688639, + "learning_rate": 5.538494413366191e-06, + "loss": 0.0957, + "step": 2725 + }, + { + "epoch": 0.9855386840202458, + "grad_norm": 0.6441959545987987, + "learning_rate": 5.535522557306764e-06, + "loss": 0.1279, + "step": 2726 + }, + { + "epoch": 0.9859002169197397, + "grad_norm": 0.14690039253193365, + "learning_rate": 5.532550509851687e-06, + "loss": 0.0131, + "step": 2727 + }, + { + "epoch": 0.9862617498192335, + "grad_norm": 0.9224358734056971, + "learning_rate": 5.529578272063169e-06, + "loss": 0.1367, + "step": 2728 + }, + { + "epoch": 0.9866232827187275, + "grad_norm": 0.16856353929401222, + "learning_rate": 5.526605845003488e-06, + "loss": 0.0184, + "step": 2729 + }, + { + "epoch": 0.9869848156182213, + "grad_norm": 0.1603797985992894, + "learning_rate": 5.52363322973499e-06, + "loss": 0.0165, + "step": 2730 + }, + { + "epoch": 0.9873463485177151, + "grad_norm": 0.8649968937041845, + "learning_rate": 5.520660427320088e-06, + "loss": 0.1934, + "step": 2731 + }, + { + "epoch": 0.987707881417209, + "grad_norm": 0.2099913036895439, + "learning_rate": 5.517687438821256e-06, + "loss": 0.0165, + "step": 2732 + }, + { + "epoch": 0.9880694143167028, + "grad_norm": 0.04074122584714941, + "learning_rate": 5.514714265301045e-06, + "loss": 0.0013, + "step": 2733 + }, + { + "epoch": 0.9884309472161966, + "grad_norm": 0.1317007928225009, + "learning_rate": 5.511740907822063e-06, + "loss": 0.0206, + "step": 2734 + }, + { + "epoch": 0.9887924801156905, + "grad_norm": 0.49220805951486224, + "learning_rate": 5.508767367446989e-06, + "loss": 0.0889, + "step": 2735 + }, + { + "epoch": 0.9891540130151844, + "grad_norm": 0.09684010707708676, + "learning_rate": 5.5057936452385656e-06, + "loss": 0.0165, + "step": 2736 + }, + { + "epoch": 0.9895155459146783, + "grad_norm": 0.13632433099491348, + "learning_rate": 5.502819742259599e-06, + "loss": 0.0184, + "step": 2737 + }, + { + "epoch": 0.9898770788141721, + "grad_norm": 0.0965749779390738, + "learning_rate": 5.499845659572964e-06, + "loss": 0.0031, + "step": 2738 + }, + { + "epoch": 0.9902386117136659, + "grad_norm": 0.03970971452693321, + "learning_rate": 5.496871398241595e-06, + "loss": 0.0011, + "step": 2739 + }, + { + "epoch": 0.9906001446131598, + "grad_norm": 1.8490156428611808, + "learning_rate": 5.493896959328493e-06, + "loss": 0.1113, + "step": 2740 + }, + { + "epoch": 0.9909616775126536, + "grad_norm": 0.479332131800778, + "learning_rate": 5.490922343896722e-06, + "loss": 0.1455, + "step": 2741 + }, + { + "epoch": 0.9913232104121475, + "grad_norm": 2.560269198557247, + "learning_rate": 5.487947553009409e-06, + "loss": 0.1191, + "step": 2742 + }, + { + "epoch": 0.9916847433116414, + "grad_norm": 0.09856827779512756, + "learning_rate": 5.484972587729744e-06, + "loss": 0.0031, + "step": 2743 + }, + { + "epoch": 0.9920462762111352, + "grad_norm": 0.495566461236058, + "learning_rate": 5.481997449120977e-06, + "loss": 0.1553, + "step": 2744 + }, + { + "epoch": 0.9924078091106291, + "grad_norm": 0.15637681838127968, + "learning_rate": 5.479022138246425e-06, + "loss": 0.0184, + "step": 2745 + }, + { + "epoch": 0.9927693420101229, + "grad_norm": 0.7665091765129673, + "learning_rate": 5.476046656169461e-06, + "loss": 0.0688, + "step": 2746 + }, + { + "epoch": 0.9931308749096168, + "grad_norm": 0.18749328241455215, + "learning_rate": 5.473071003953524e-06, + "loss": 0.0231, + "step": 2747 + }, + { + "epoch": 0.9934924078091106, + "grad_norm": 0.6286013688335791, + "learning_rate": 5.47009518266211e-06, + "loss": 0.0957, + "step": 2748 + }, + { + "epoch": 0.9938539407086044, + "grad_norm": 0.18715895036936842, + "learning_rate": 5.4671191933587746e-06, + "loss": 0.0258, + "step": 2749 + }, + { + "epoch": 0.9942154736080984, + "grad_norm": 0.3835704414454147, + "learning_rate": 5.464143037107139e-06, + "loss": 0.0432, + "step": 2750 + }, + { + "epoch": 0.9945770065075922, + "grad_norm": 0.018708226991649785, + "learning_rate": 5.46116671497088e-06, + "loss": 0.0003, + "step": 2751 + }, + { + "epoch": 0.9949385394070861, + "grad_norm": 0.019474189600746247, + "learning_rate": 5.458190228013736e-06, + "loss": 0.0005, + "step": 2752 + }, + { + "epoch": 0.9953000723065799, + "grad_norm": 0.5041830804760344, + "learning_rate": 5.455213577299499e-06, + "loss": 0.1553, + "step": 2753 + }, + { + "epoch": 0.9956616052060737, + "grad_norm": 1.0386712139469618, + "learning_rate": 5.452236763892026e-06, + "loss": 0.0579, + "step": 2754 + }, + { + "epoch": 0.9960231381055676, + "grad_norm": 0.4997816293420263, + "learning_rate": 5.4492597888552304e-06, + "loss": 0.1113, + "step": 2755 + }, + { + "epoch": 0.9963846710050615, + "grad_norm": 0.008616834534710282, + "learning_rate": 5.44628265325308e-06, + "loss": 0.0002, + "step": 2756 + }, + { + "epoch": 0.9967462039045553, + "grad_norm": 0.28445784945697794, + "learning_rate": 5.443305358149603e-06, + "loss": 0.0476, + "step": 2757 + }, + { + "epoch": 0.9971077368040492, + "grad_norm": 0.11351213515682869, + "learning_rate": 5.440327904608886e-06, + "loss": 0.004, + "step": 2758 + }, + { + "epoch": 0.997469269703543, + "grad_norm": 0.18288485417747904, + "learning_rate": 5.4373502936950674e-06, + "loss": 0.0255, + "step": 2759 + }, + { + "epoch": 0.9978308026030369, + "grad_norm": 0.19712900657947557, + "learning_rate": 5.434372526472347e-06, + "loss": 0.0317, + "step": 2760 + }, + { + "epoch": 0.9981923355025307, + "grad_norm": 0.20306505450431084, + "learning_rate": 5.431394604004977e-06, + "loss": 0.0354, + "step": 2761 + }, + { + "epoch": 0.9985538684020245, + "grad_norm": 0.18141972423733874, + "learning_rate": 5.4284165273572665e-06, + "loss": 0.0354, + "step": 2762 + }, + { + "epoch": 0.9989154013015185, + "grad_norm": 0.5487308311433079, + "learning_rate": 5.42543829759358e-06, + "loss": 0.0576, + "step": 2763 + }, + { + "epoch": 0.9992769342010123, + "grad_norm": 0.2711903165381143, + "learning_rate": 5.422459915778334e-06, + "loss": 0.0391, + "step": 2764 + }, + { + "epoch": 0.9996384671005062, + "grad_norm": 0.6746684086049157, + "learning_rate": 5.4194813829760055e-06, + "loss": 0.1191, + "step": 2765 + }, + { + "epoch": 1.0, + "grad_norm": 0.044645725794420416, + "learning_rate": 5.416502700251118e-06, + "loss": 0.0013, + "step": 2766 + }, + { + "epoch": 1.000361532899494, + "grad_norm": 0.19668114113957672, + "learning_rate": 5.4135238686682545e-06, + "loss": 0.0258, + "step": 2767 + }, + { + "epoch": 1.0007230657989876, + "grad_norm": 0.18477124814579932, + "learning_rate": 5.410544889292047e-06, + "loss": 0.0231, + "step": 2768 + }, + { + "epoch": 1.0010845986984815, + "grad_norm": 0.0025729533695968846, + "learning_rate": 5.407565763187182e-06, + "loss": 0.0001, + "step": 2769 + }, + { + "epoch": 1.0014461315979755, + "grad_norm": 0.30127073073387306, + "learning_rate": 5.404586491418399e-06, + "loss": 0.0391, + "step": 2770 + }, + { + "epoch": 1.0018076644974692, + "grad_norm": 0.5336701995767614, + "learning_rate": 5.40160707505049e-06, + "loss": 0.1113, + "step": 2771 + }, + { + "epoch": 1.002169197396963, + "grad_norm": 0.19111784287349337, + "learning_rate": 5.398627515148298e-06, + "loss": 0.0317, + "step": 2772 + }, + { + "epoch": 1.002530730296457, + "grad_norm": 1.3213835004794692, + "learning_rate": 5.3956478127767155e-06, + "loss": 0.0391, + "step": 2773 + }, + { + "epoch": 1.002892263195951, + "grad_norm": 0.9187000139620038, + "learning_rate": 5.392667969000688e-06, + "loss": 0.0752, + "step": 2774 + }, + { + "epoch": 1.0032537960954446, + "grad_norm": 0.17172331087388298, + "learning_rate": 5.389687984885211e-06, + "loss": 0.0286, + "step": 2775 + }, + { + "epoch": 1.0036153289949385, + "grad_norm": 0.14845589942742632, + "learning_rate": 5.3867078614953305e-06, + "loss": 0.0286, + "step": 2776 + }, + { + "epoch": 1.0039768618944325, + "grad_norm": 0.18685887400350892, + "learning_rate": 5.383727599896143e-06, + "loss": 0.0354, + "step": 2777 + }, + { + "epoch": 1.0043383947939262, + "grad_norm": 0.10996357530219043, + "learning_rate": 5.380747201152792e-06, + "loss": 0.0205, + "step": 2778 + }, + { + "epoch": 1.00469992769342, + "grad_norm": 0.0030794373565173837, + "learning_rate": 5.37776666633047e-06, + "loss": 0.0001, + "step": 2779 + }, + { + "epoch": 1.005061460592914, + "grad_norm": 0.3680552407828665, + "learning_rate": 5.374785996494423e-06, + "loss": 0.0082, + "step": 2780 + }, + { + "epoch": 1.0054229934924077, + "grad_norm": 0.16988446166371532, + "learning_rate": 5.371805192709939e-06, + "loss": 0.0286, + "step": 2781 + }, + { + "epoch": 1.0057845263919016, + "grad_norm": 0.5178789655597076, + "learning_rate": 5.3688242560423585e-06, + "loss": 0.1279, + "step": 2782 + }, + { + "epoch": 1.0061460592913956, + "grad_norm": 0.7698789477739757, + "learning_rate": 5.365843187557066e-06, + "loss": 0.1035, + "step": 2783 + }, + { + "epoch": 1.0065075921908895, + "grad_norm": 0.5658771512827715, + "learning_rate": 5.362861988319495e-06, + "loss": 0.0206, + "step": 2784 + }, + { + "epoch": 1.0068691250903832, + "grad_norm": 0.14252732745968136, + "learning_rate": 5.359880659395127e-06, + "loss": 0.0255, + "step": 2785 + }, + { + "epoch": 1.007230657989877, + "grad_norm": 0.9215924860427326, + "learning_rate": 5.356899201849487e-06, + "loss": 0.063, + "step": 2786 + }, + { + "epoch": 1.007592190889371, + "grad_norm": 0.24623098201197824, + "learning_rate": 5.353917616748147e-06, + "loss": 0.0255, + "step": 2787 + }, + { + "epoch": 1.0079537237888647, + "grad_norm": 0.1472234059154758, + "learning_rate": 5.3509359051567265e-06, + "loss": 0.0165, + "step": 2788 + }, + { + "epoch": 1.0083152566883586, + "grad_norm": 0.5367711229433482, + "learning_rate": 5.347954068140886e-06, + "loss": 0.1035, + "step": 2789 + }, + { + "epoch": 1.0086767895878526, + "grad_norm": 0.45558773136200986, + "learning_rate": 5.344972106766336e-06, + "loss": 0.1113, + "step": 2790 + }, + { + "epoch": 1.0090383224873463, + "grad_norm": 0.04236792058230652, + "learning_rate": 5.341990022098829e-06, + "loss": 0.0006, + "step": 2791 + }, + { + "epoch": 1.0093998553868402, + "grad_norm": 0.0105555201763254, + "learning_rate": 5.339007815204157e-06, + "loss": 0.0004, + "step": 2792 + }, + { + "epoch": 1.009761388286334, + "grad_norm": 0.7457259634260792, + "learning_rate": 5.336025487148167e-06, + "loss": 0.0231, + "step": 2793 + }, + { + "epoch": 1.0101229211858278, + "grad_norm": 0.5777737957143646, + "learning_rate": 5.333043038996737e-06, + "loss": 0.0889, + "step": 2794 + }, + { + "epoch": 1.0104844540853217, + "grad_norm": 0.3645031926940933, + "learning_rate": 5.3300604718157955e-06, + "loss": 0.0231, + "step": 2795 + }, + { + "epoch": 1.0108459869848156, + "grad_norm": 0.01605812054156756, + "learning_rate": 5.327077786671311e-06, + "loss": 0.0004, + "step": 2796 + }, + { + "epoch": 1.0112075198843096, + "grad_norm": 0.0018686766837167346, + "learning_rate": 5.324094984629293e-06, + "loss": 0.0, + "step": 2797 + }, + { + "epoch": 1.0115690527838033, + "grad_norm": 0.2403415778088686, + "learning_rate": 5.321112066755799e-06, + "loss": 0.0286, + "step": 2798 + }, + { + "epoch": 1.0119305856832972, + "grad_norm": 0.9892017724825471, + "learning_rate": 5.318129034116918e-06, + "loss": 0.0525, + "step": 2799 + }, + { + "epoch": 1.0122921185827911, + "grad_norm": 0.5401033208676057, + "learning_rate": 5.315145887778788e-06, + "loss": 0.0184, + "step": 2800 + }, + { + "epoch": 1.0126536514822848, + "grad_norm": 1.0355201621149543, + "learning_rate": 5.312162628807584e-06, + "loss": 0.063, + "step": 2801 + }, + { + "epoch": 1.0130151843817787, + "grad_norm": 0.010151516445755949, + "learning_rate": 5.3091792582695215e-06, + "loss": 0.0003, + "step": 2802 + }, + { + "epoch": 1.0133767172812727, + "grad_norm": 84.080440454738, + "learning_rate": 5.306195777230859e-06, + "loss": 1.6797, + "step": 2803 + }, + { + "epoch": 1.0137382501807664, + "grad_norm": 0.23093725641378088, + "learning_rate": 5.303212186757889e-06, + "loss": 0.0206, + "step": 2804 + }, + { + "epoch": 1.0140997830802603, + "grad_norm": 0.013376433463948224, + "learning_rate": 5.300228487916949e-06, + "loss": 0.0001, + "step": 2805 + }, + { + "epoch": 1.0144613159797542, + "grad_norm": 0.1836437893795422, + "learning_rate": 5.29724468177441e-06, + "loss": 0.0317, + "step": 2806 + }, + { + "epoch": 1.0148228488792481, + "grad_norm": 0.23638145535815444, + "learning_rate": 5.294260769396683e-06, + "loss": 0.0258, + "step": 2807 + }, + { + "epoch": 1.0151843817787418, + "grad_norm": 0.16086986123618469, + "learning_rate": 5.291276751850222e-06, + "loss": 0.0255, + "step": 2808 + }, + { + "epoch": 1.0155459146782357, + "grad_norm": 0.26621710803220117, + "learning_rate": 5.288292630201508e-06, + "loss": 0.0354, + "step": 2809 + }, + { + "epoch": 1.0159074475777297, + "grad_norm": 0.013342692318118497, + "learning_rate": 5.285308405517071e-06, + "loss": 0.0003, + "step": 2810 + }, + { + "epoch": 1.0162689804772234, + "grad_norm": 0.4760116614035775, + "learning_rate": 5.2823240788634685e-06, + "loss": 0.0576, + "step": 2811 + }, + { + "epoch": 1.0166305133767173, + "grad_norm": 0.20064598571031117, + "learning_rate": 5.279339651307301e-06, + "loss": 0.0286, + "step": 2812 + }, + { + "epoch": 1.0169920462762112, + "grad_norm": 0.15671739186316386, + "learning_rate": 5.276355123915203e-06, + "loss": 0.0206, + "step": 2813 + }, + { + "epoch": 1.017353579175705, + "grad_norm": 0.1876067370452774, + "learning_rate": 5.273370497753839e-06, + "loss": 0.0258, + "step": 2814 + }, + { + "epoch": 1.0177151120751988, + "grad_norm": 6.504393838715643, + "learning_rate": 5.270385773889918e-06, + "loss": 0.0752, + "step": 2815 + }, + { + "epoch": 1.0180766449746927, + "grad_norm": 0.09234883167291193, + "learning_rate": 5.267400953390177e-06, + "loss": 0.0025, + "step": 2816 + }, + { + "epoch": 1.0184381778741864, + "grad_norm": 0.11628936635490911, + "learning_rate": 5.2644160373213935e-06, + "loss": 0.0028, + "step": 2817 + }, + { + "epoch": 1.0187997107736804, + "grad_norm": 1.7475138710610938, + "learning_rate": 5.2614310267503745e-06, + "loss": 0.0889, + "step": 2818 + }, + { + "epoch": 1.0191612436731743, + "grad_norm": 0.49772962250567193, + "learning_rate": 5.25844592274396e-06, + "loss": 0.1191, + "step": 2819 + }, + { + "epoch": 1.0195227765726682, + "grad_norm": 0.9189828384813065, + "learning_rate": 5.2554607263690285e-06, + "loss": 0.0576, + "step": 2820 + }, + { + "epoch": 1.019884309472162, + "grad_norm": 0.3157364729128106, + "learning_rate": 5.252475438692486e-06, + "loss": 0.0255, + "step": 2821 + }, + { + "epoch": 1.0202458423716558, + "grad_norm": 0.19840020538363623, + "learning_rate": 5.249490060781276e-06, + "loss": 0.0286, + "step": 2822 + }, + { + "epoch": 1.0206073752711498, + "grad_norm": 0.13739937984627118, + "learning_rate": 5.2465045937023704e-06, + "loss": 0.0031, + "step": 2823 + }, + { + "epoch": 1.0209689081706435, + "grad_norm": 0.7866214332096955, + "learning_rate": 5.2435190385227765e-06, + "loss": 0.0028, + "step": 2824 + }, + { + "epoch": 1.0213304410701374, + "grad_norm": 0.2225722636595426, + "learning_rate": 5.240533396309528e-06, + "loss": 0.0286, + "step": 2825 + }, + { + "epoch": 1.0216919739696313, + "grad_norm": 1.2170975541109463, + "learning_rate": 5.237547668129694e-06, + "loss": 0.0889, + "step": 2826 + }, + { + "epoch": 1.022053506869125, + "grad_norm": 0.5125110269835735, + "learning_rate": 5.234561855050375e-06, + "loss": 0.0476, + "step": 2827 + }, + { + "epoch": 1.022415039768619, + "grad_norm": 13.515889778642743, + "learning_rate": 5.2315759581386985e-06, + "loss": 0.4375, + "step": 2828 + }, + { + "epoch": 1.0227765726681128, + "grad_norm": 0.1847958324647992, + "learning_rate": 5.228589978461824e-06, + "loss": 0.0317, + "step": 2829 + }, + { + "epoch": 1.0231381055676068, + "grad_norm": 0.2630587212039082, + "learning_rate": 5.225603917086938e-06, + "loss": 0.0255, + "step": 2830 + }, + { + "epoch": 1.0234996384671005, + "grad_norm": 3.73170004884557, + "learning_rate": 5.222617775081259e-06, + "loss": 0.332, + "step": 2831 + }, + { + "epoch": 1.0238611713665944, + "grad_norm": 0.6540320765029713, + "learning_rate": 5.219631553512034e-06, + "loss": 0.0576, + "step": 2832 + }, + { + "epoch": 1.0242227042660883, + "grad_norm": 0.17380049451763738, + "learning_rate": 5.21664525344654e-06, + "loss": 0.0229, + "step": 2833 + }, + { + "epoch": 1.024584237165582, + "grad_norm": 0.0030899998829715994, + "learning_rate": 5.2136588759520775e-06, + "loss": 0.0001, + "step": 2834 + }, + { + "epoch": 1.024945770065076, + "grad_norm": 0.4486818444789284, + "learning_rate": 5.210672422095978e-06, + "loss": 0.0476, + "step": 2835 + }, + { + "epoch": 1.0253073029645698, + "grad_norm": 0.12207545393569076, + "learning_rate": 5.207685892945599e-06, + "loss": 0.001, + "step": 2836 + }, + { + "epoch": 1.0256688358640635, + "grad_norm": 0.027749369516537944, + "learning_rate": 5.204699289568326e-06, + "loss": 0.0009, + "step": 2837 + }, + { + "epoch": 1.0260303687635575, + "grad_norm": 0.1530198439207097, + "learning_rate": 5.20171261303157e-06, + "loss": 0.0206, + "step": 2838 + }, + { + "epoch": 1.0263919016630514, + "grad_norm": 0.17366657614417244, + "learning_rate": 5.198725864402768e-06, + "loss": 0.0206, + "step": 2839 + }, + { + "epoch": 1.026753434562545, + "grad_norm": 0.11082520151666397, + "learning_rate": 5.195739044749385e-06, + "loss": 0.0165, + "step": 2840 + }, + { + "epoch": 1.027114967462039, + "grad_norm": 1.315393495573098, + "learning_rate": 5.192752155138907e-06, + "loss": 0.1191, + "step": 2841 + }, + { + "epoch": 1.027476500361533, + "grad_norm": 0.7317399855323564, + "learning_rate": 5.189765196638852e-06, + "loss": 0.1738, + "step": 2842 + }, + { + "epoch": 1.0278380332610269, + "grad_norm": 0.10888832415127912, + "learning_rate": 5.186778170316754e-06, + "loss": 0.0131, + "step": 2843 + }, + { + "epoch": 1.0281995661605206, + "grad_norm": 0.017134701153294334, + "learning_rate": 5.183791077240178e-06, + "loss": 0.0004, + "step": 2844 + }, + { + "epoch": 1.0285610990600145, + "grad_norm": 1.7328247759074646, + "learning_rate": 5.18080391847671e-06, + "loss": 0.0957, + "step": 2845 + }, + { + "epoch": 1.0289226319595084, + "grad_norm": 0.14637310568166625, + "learning_rate": 5.177816695093958e-06, + "loss": 0.0165, + "step": 2846 + }, + { + "epoch": 1.029284164859002, + "grad_norm": 0.13467272586415527, + "learning_rate": 5.174829408159558e-06, + "loss": 0.0117, + "step": 2847 + }, + { + "epoch": 1.029645697758496, + "grad_norm": 0.012704214998965393, + "learning_rate": 5.171842058741166e-06, + "loss": 0.0004, + "step": 2848 + }, + { + "epoch": 1.03000723065799, + "grad_norm": 3.251538233071545, + "learning_rate": 5.168854647906456e-06, + "loss": 0.0688, + "step": 2849 + }, + { + "epoch": 1.0303687635574836, + "grad_norm": 1.5181101421622887, + "learning_rate": 5.165867176723132e-06, + "loss": 0.0752, + "step": 2850 + }, + { + "epoch": 1.0307302964569776, + "grad_norm": 0.2252518844107366, + "learning_rate": 5.162879646258913e-06, + "loss": 0.0165, + "step": 2851 + }, + { + "epoch": 1.0310918293564715, + "grad_norm": 0.019142931664302653, + "learning_rate": 5.159892057581542e-06, + "loss": 0.0005, + "step": 2852 + }, + { + "epoch": 1.0314533622559654, + "grad_norm": 1.1991742161216545, + "learning_rate": 5.156904411758785e-06, + "loss": 0.0889, + "step": 2853 + }, + { + "epoch": 1.031814895155459, + "grad_norm": 0.17476924605030514, + "learning_rate": 5.153916709858423e-06, + "loss": 0.0165, + "step": 2854 + }, + { + "epoch": 1.032176428054953, + "grad_norm": 0.26730755506029774, + "learning_rate": 5.1509289529482645e-06, + "loss": 0.0255, + "step": 2855 + }, + { + "epoch": 1.032537960954447, + "grad_norm": 0.021931161592802464, + "learning_rate": 5.147941142096127e-06, + "loss": 0.0005, + "step": 2856 + }, + { + "epoch": 1.0328994938539406, + "grad_norm": 1.1101138530586299, + "learning_rate": 5.144953278369858e-06, + "loss": 0.1641, + "step": 2857 + }, + { + "epoch": 1.0332610267534346, + "grad_norm": 0.11597929404749102, + "learning_rate": 5.141965362837317e-06, + "loss": 0.0184, + "step": 2858 + }, + { + "epoch": 1.0336225596529285, + "grad_norm": 1.0052234743132538, + "learning_rate": 5.138977396566384e-06, + "loss": 0.1553, + "step": 2859 + }, + { + "epoch": 1.0339840925524222, + "grad_norm": 0.5490932916470588, + "learning_rate": 5.135989380624962e-06, + "loss": 0.1455, + "step": 2860 + }, + { + "epoch": 1.034345625451916, + "grad_norm": 0.0047465401283862865, + "learning_rate": 5.133001316080961e-06, + "loss": 0.0001, + "step": 2861 + }, + { + "epoch": 1.03470715835141, + "grad_norm": 0.10429421850167911, + "learning_rate": 5.13001320400232e-06, + "loss": 0.0165, + "step": 2862 + }, + { + "epoch": 1.0350686912509037, + "grad_norm": 0.13405733874815942, + "learning_rate": 5.127025045456986e-06, + "loss": 0.0165, + "step": 2863 + }, + { + "epoch": 1.0354302241503976, + "grad_norm": 0.0179929483537909, + "learning_rate": 5.124036841512927e-06, + "loss": 0.0005, + "step": 2864 + }, + { + "epoch": 1.0357917570498916, + "grad_norm": 0.0650598603069466, + "learning_rate": 5.121048593238129e-06, + "loss": 0.0019, + "step": 2865 + }, + { + "epoch": 1.0361532899493855, + "grad_norm": 0.10588920482556607, + "learning_rate": 5.118060301700588e-06, + "loss": 0.0165, + "step": 2866 + }, + { + "epoch": 1.0365148228488792, + "grad_norm": 0.8875066474056126, + "learning_rate": 5.1150719679683205e-06, + "loss": 0.0286, + "step": 2867 + }, + { + "epoch": 1.0368763557483731, + "grad_norm": 0.24987513135835748, + "learning_rate": 5.112083593109356e-06, + "loss": 0.0317, + "step": 2868 + }, + { + "epoch": 1.037237888647867, + "grad_norm": 0.5550041414634269, + "learning_rate": 5.109095178191739e-06, + "loss": 0.1191, + "step": 2869 + }, + { + "epoch": 1.0375994215473607, + "grad_norm": 0.12534225683326078, + "learning_rate": 5.106106724283529e-06, + "loss": 0.0206, + "step": 2870 + }, + { + "epoch": 1.0379609544468547, + "grad_norm": 0.11135254445578989, + "learning_rate": 5.103118232452796e-06, + "loss": 0.0184, + "step": 2871 + }, + { + "epoch": 1.0383224873463486, + "grad_norm": 0.019693634978222487, + "learning_rate": 5.10012970376763e-06, + "loss": 0.0006, + "step": 2872 + }, + { + "epoch": 1.0386840202458423, + "grad_norm": 0.1581129829509804, + "learning_rate": 5.097141139296129e-06, + "loss": 0.0184, + "step": 2873 + }, + { + "epoch": 1.0390455531453362, + "grad_norm": 0.43673144199267216, + "learning_rate": 5.094152540106404e-06, + "loss": 0.0131, + "step": 2874 + }, + { + "epoch": 1.0394070860448301, + "grad_norm": 0.22681614411450962, + "learning_rate": 5.091163907266584e-06, + "loss": 0.0031, + "step": 2875 + }, + { + "epoch": 1.0397686189443238, + "grad_norm": 0.3570014959195119, + "learning_rate": 5.0881752418448e-06, + "loss": 0.0206, + "step": 2876 + }, + { + "epoch": 1.0401301518438177, + "grad_norm": 0.19024213036381366, + "learning_rate": 5.085186544909204e-06, + "loss": 0.0184, + "step": 2877 + }, + { + "epoch": 1.0404916847433117, + "grad_norm": 0.019765634698784004, + "learning_rate": 5.082197817527955e-06, + "loss": 0.0005, + "step": 2878 + }, + { + "epoch": 1.0408532176428056, + "grad_norm": 1.6340917804325972, + "learning_rate": 5.0792090607692235e-06, + "loss": 0.1279, + "step": 2879 + }, + { + "epoch": 1.0412147505422993, + "grad_norm": 0.12609379608720364, + "learning_rate": 5.076220275701191e-06, + "loss": 0.0031, + "step": 2880 + }, + { + "epoch": 1.0415762834417932, + "grad_norm": 0.8962038284387989, + "learning_rate": 5.073231463392047e-06, + "loss": 0.0815, + "step": 2881 + }, + { + "epoch": 1.0419378163412871, + "grad_norm": 0.4566021818701243, + "learning_rate": 5.0702426249099935e-06, + "loss": 0.1455, + "step": 2882 + }, + { + "epoch": 1.0422993492407808, + "grad_norm": 1.1105844210945879, + "learning_rate": 5.0672537613232405e-06, + "loss": 0.1113, + "step": 2883 + }, + { + "epoch": 1.0426608821402747, + "grad_norm": 0.934613126861038, + "learning_rate": 5.0642648737000066e-06, + "loss": 0.1113, + "step": 2884 + }, + { + "epoch": 1.0430224150397687, + "grad_norm": 1.0628427079816103, + "learning_rate": 5.061275963108524e-06, + "loss": 0.1738, + "step": 2885 + }, + { + "epoch": 1.0433839479392624, + "grad_norm": 0.3629827811125593, + "learning_rate": 5.058287030617022e-06, + "loss": 0.0117, + "step": 2886 + }, + { + "epoch": 1.0437454808387563, + "grad_norm": 0.5987945596447356, + "learning_rate": 5.055298077293748e-06, + "loss": 0.0286, + "step": 2887 + }, + { + "epoch": 1.0441070137382502, + "grad_norm": 0.5765603756010104, + "learning_rate": 5.052309104206953e-06, + "loss": 0.1113, + "step": 2888 + }, + { + "epoch": 1.0444685466377441, + "grad_norm": 0.1756688045572934, + "learning_rate": 5.049320112424895e-06, + "loss": 0.0057, + "step": 2889 + }, + { + "epoch": 1.0448300795372378, + "grad_norm": 0.3490555487992977, + "learning_rate": 5.046331103015839e-06, + "loss": 0.0391, + "step": 2890 + }, + { + "epoch": 1.0451916124367318, + "grad_norm": 0.4103948977276094, + "learning_rate": 5.043342077048058e-06, + "loss": 0.0117, + "step": 2891 + }, + { + "epoch": 1.0455531453362257, + "grad_norm": 0.08424716133621439, + "learning_rate": 5.040353035589826e-06, + "loss": 0.0022, + "step": 2892 + }, + { + "epoch": 1.0459146782357194, + "grad_norm": 0.5322668264215623, + "learning_rate": 5.037363979709428e-06, + "loss": 0.1367, + "step": 2893 + }, + { + "epoch": 1.0462762111352133, + "grad_norm": 0.22168017595794057, + "learning_rate": 5.034374910475153e-06, + "loss": 0.0317, + "step": 2894 + }, + { + "epoch": 1.0466377440347072, + "grad_norm": 0.25817441901518884, + "learning_rate": 5.031385828955291e-06, + "loss": 0.0354, + "step": 2895 + }, + { + "epoch": 1.046999276934201, + "grad_norm": 0.13832916250736635, + "learning_rate": 5.028396736218141e-06, + "loss": 0.0255, + "step": 2896 + }, + { + "epoch": 1.0473608098336948, + "grad_norm": 0.15052455662753367, + "learning_rate": 5.025407633332003e-06, + "loss": 0.0255, + "step": 2897 + }, + { + "epoch": 1.0477223427331888, + "grad_norm": 0.21690877971379932, + "learning_rate": 5.022418521365182e-06, + "loss": 0.0286, + "step": 2898 + }, + { + "epoch": 1.0480838756326825, + "grad_norm": 0.7117131942127197, + "learning_rate": 5.019429401385985e-06, + "loss": 0.0258, + "step": 2899 + }, + { + "epoch": 1.0484454085321764, + "grad_norm": 0.24754648094279497, + "learning_rate": 5.0164402744627275e-06, + "loss": 0.0286, + "step": 2900 + }, + { + "epoch": 1.0488069414316703, + "grad_norm": 0.7727410479615802, + "learning_rate": 5.0134511416637164e-06, + "loss": 0.0432, + "step": 2901 + }, + { + "epoch": 1.0491684743311642, + "grad_norm": 0.001578432940779665, + "learning_rate": 5.010462004057272e-06, + "loss": 0.0, + "step": 2902 + }, + { + "epoch": 1.049530007230658, + "grad_norm": 1.1005831478891568, + "learning_rate": 5.007472862711708e-06, + "loss": 0.0815, + "step": 2903 + }, + { + "epoch": 1.0498915401301518, + "grad_norm": 0.6827104158490767, + "learning_rate": 5.004483718695345e-06, + "loss": 0.0476, + "step": 2904 + }, + { + "epoch": 1.0502530730296458, + "grad_norm": 0.008516973809855005, + "learning_rate": 5.0014945730765015e-06, + "loss": 0.0002, + "step": 2905 + }, + { + "epoch": 1.0506146059291395, + "grad_norm": 0.05183890247503163, + "learning_rate": 4.9985054269234985e-06, + "loss": 0.0009, + "step": 2906 + }, + { + "epoch": 1.0509761388286334, + "grad_norm": 0.15570721699448495, + "learning_rate": 4.9955162813046565e-06, + "loss": 0.0229, + "step": 2907 + }, + { + "epoch": 1.0513376717281273, + "grad_norm": 0.8341879780413903, + "learning_rate": 4.9925271372882925e-06, + "loss": 0.063, + "step": 2908 + }, + { + "epoch": 1.051699204627621, + "grad_norm": 0.0978022360935427, + "learning_rate": 4.98953799594273e-06, + "loss": 0.0027, + "step": 2909 + }, + { + "epoch": 1.052060737527115, + "grad_norm": 0.005227030878014112, + "learning_rate": 4.986548858336286e-06, + "loss": 0.0001, + "step": 2910 + }, + { + "epoch": 1.0524222704266089, + "grad_norm": 0.009266321886045543, + "learning_rate": 4.983559725537273e-06, + "loss": 0.0003, + "step": 2911 + }, + { + "epoch": 1.0527838033261028, + "grad_norm": 0.3194390257597236, + "learning_rate": 4.9805705986140155e-06, + "loss": 0.0354, + "step": 2912 + }, + { + "epoch": 1.0531453362255965, + "grad_norm": 0.041332446662086186, + "learning_rate": 4.977581478634819e-06, + "loss": 0.0011, + "step": 2913 + }, + { + "epoch": 1.0535068691250904, + "grad_norm": 3.059919536344809, + "learning_rate": 4.974592366667998e-06, + "loss": 0.4434, + "step": 2914 + }, + { + "epoch": 1.0538684020245843, + "grad_norm": 2.1216944942766482, + "learning_rate": 4.971603263781862e-06, + "loss": 0.1279, + "step": 2915 + }, + { + "epoch": 1.054229934924078, + "grad_norm": 0.5836956860126554, + "learning_rate": 4.96861417104471e-06, + "loss": 0.0576, + "step": 2916 + }, + { + "epoch": 1.054591467823572, + "grad_norm": 0.9523938266683422, + "learning_rate": 4.965625089524849e-06, + "loss": 0.063, + "step": 2917 + }, + { + "epoch": 1.0549530007230659, + "grad_norm": 0.18703986484374405, + "learning_rate": 4.9626360202905725e-06, + "loss": 0.0231, + "step": 2918 + }, + { + "epoch": 1.0553145336225596, + "grad_norm": 0.6413485605850456, + "learning_rate": 4.959646964410175e-06, + "loss": 0.0889, + "step": 2919 + }, + { + "epoch": 1.0556760665220535, + "grad_norm": 0.013096612428250068, + "learning_rate": 4.9566579229519455e-06, + "loss": 0.0003, + "step": 2920 + }, + { + "epoch": 1.0560375994215474, + "grad_norm": 0.027348805341908387, + "learning_rate": 4.953668896984161e-06, + "loss": 0.0009, + "step": 2921 + }, + { + "epoch": 1.056399132321041, + "grad_norm": 0.9702660835401604, + "learning_rate": 4.950679887575107e-06, + "loss": 0.1035, + "step": 2922 + }, + { + "epoch": 1.056760665220535, + "grad_norm": 0.13630666137433184, + "learning_rate": 4.947690895793049e-06, + "loss": 0.0255, + "step": 2923 + }, + { + "epoch": 1.057122198120029, + "grad_norm": 0.05662117732182156, + "learning_rate": 4.944701922706254e-06, + "loss": 0.002, + "step": 2924 + }, + { + "epoch": 1.0574837310195229, + "grad_norm": 0.23136832212183017, + "learning_rate": 4.941712969382981e-06, + "loss": 0.0286, + "step": 2925 + }, + { + "epoch": 1.0578452639190166, + "grad_norm": 0.012219284209355506, + "learning_rate": 4.938724036891478e-06, + "loss": 0.0002, + "step": 2926 + }, + { + "epoch": 1.0582067968185105, + "grad_norm": 0.13436060210916176, + "learning_rate": 4.935735126299994e-06, + "loss": 0.0027, + "step": 2927 + }, + { + "epoch": 1.0585683297180044, + "grad_norm": 0.48460148328326963, + "learning_rate": 4.93274623867676e-06, + "loss": 0.0957, + "step": 2928 + }, + { + "epoch": 1.058929862617498, + "grad_norm": 3.4183143397507028, + "learning_rate": 4.929757375090008e-06, + "loss": 0.1641, + "step": 2929 + }, + { + "epoch": 1.059291395516992, + "grad_norm": 0.20620021331140517, + "learning_rate": 4.9267685366079556e-06, + "loss": 0.0317, + "step": 2930 + }, + { + "epoch": 1.059652928416486, + "grad_norm": 0.2318122015801382, + "learning_rate": 4.92377972429881e-06, + "loss": 0.0286, + "step": 2931 + }, + { + "epoch": 1.0600144613159797, + "grad_norm": 0.022884336971995094, + "learning_rate": 4.920790939230778e-06, + "loss": 0.0006, + "step": 2932 + }, + { + "epoch": 1.0603759942154736, + "grad_norm": 0.2037521107310005, + "learning_rate": 4.917802182472046e-06, + "loss": 0.0317, + "step": 2933 + }, + { + "epoch": 1.0607375271149675, + "grad_norm": 0.5399116167088887, + "learning_rate": 4.914813455090797e-06, + "loss": 0.0231, + "step": 2934 + }, + { + "epoch": 1.0610990600144614, + "grad_norm": 0.0016196611705238464, + "learning_rate": 4.911824758155201e-06, + "loss": 0.0, + "step": 2935 + }, + { + "epoch": 1.0614605929139551, + "grad_norm": 0.2596900662403395, + "learning_rate": 4.908836092733417e-06, + "loss": 0.0317, + "step": 2936 + }, + { + "epoch": 1.061822125813449, + "grad_norm": 0.3633095383055188, + "learning_rate": 4.905847459893597e-06, + "loss": 0.0432, + "step": 2937 + }, + { + "epoch": 1.062183658712943, + "grad_norm": 1.0154515361970466, + "learning_rate": 4.902858860703872e-06, + "loss": 0.0476, + "step": 2938 + }, + { + "epoch": 1.0625451916124367, + "grad_norm": 0.592383017750386, + "learning_rate": 4.899870296232371e-06, + "loss": 0.0957, + "step": 2939 + }, + { + "epoch": 1.0629067245119306, + "grad_norm": 0.5626186609245257, + "learning_rate": 4.896881767547205e-06, + "loss": 0.1191, + "step": 2940 + }, + { + "epoch": 1.0632682574114245, + "grad_norm": 0.18425034499505125, + "learning_rate": 4.893893275716472e-06, + "loss": 0.0286, + "step": 2941 + }, + { + "epoch": 1.0636297903109182, + "grad_norm": 0.9700216012773438, + "learning_rate": 4.890904821808263e-06, + "loss": 0.0815, + "step": 2942 + }, + { + "epoch": 1.0639913232104121, + "grad_norm": 0.3644308749775059, + "learning_rate": 4.887916406890645e-06, + "loss": 0.0432, + "step": 2943 + }, + { + "epoch": 1.064352856109906, + "grad_norm": 0.0015376655007828864, + "learning_rate": 4.88492803203168e-06, + "loss": 0.0, + "step": 2944 + }, + { + "epoch": 1.0647143890093997, + "grad_norm": 0.2777255055419606, + "learning_rate": 4.881939698299413e-06, + "loss": 0.0391, + "step": 2945 + }, + { + "epoch": 1.0650759219088937, + "grad_norm": 0.4040793574460753, + "learning_rate": 4.878951406761872e-06, + "loss": 0.0432, + "step": 2946 + }, + { + "epoch": 1.0654374548083876, + "grad_norm": 0.14136337160648976, + "learning_rate": 4.875963158487074e-06, + "loss": 0.0255, + "step": 2947 + }, + { + "epoch": 1.0657989877078815, + "grad_norm": 2.672775524879189, + "learning_rate": 4.872974954543015e-06, + "loss": 0.2988, + "step": 2948 + }, + { + "epoch": 1.0661605206073752, + "grad_norm": 0.007959784791552125, + "learning_rate": 4.8699867959976824e-06, + "loss": 0.0001, + "step": 2949 + }, + { + "epoch": 1.0665220535068691, + "grad_norm": 0.04794092326280291, + "learning_rate": 4.866998683919041e-06, + "loss": 0.0015, + "step": 2950 + }, + { + "epoch": 1.066883586406363, + "grad_norm": 0.002111254151084743, + "learning_rate": 4.86401061937504e-06, + "loss": 0.0001, + "step": 2951 + }, + { + "epoch": 1.0672451193058567, + "grad_norm": 2.285841972637919, + "learning_rate": 4.861022603433617e-06, + "loss": 0.2773, + "step": 2952 + }, + { + "epoch": 1.0676066522053507, + "grad_norm": 2.944294646218988, + "learning_rate": 4.8580346371626855e-06, + "loss": 0.3867, + "step": 2953 + }, + { + "epoch": 1.0679681851048446, + "grad_norm": 0.0507928545364839, + "learning_rate": 4.855046721630145e-06, + "loss": 0.0009, + "step": 2954 + }, + { + "epoch": 1.0683297180043383, + "grad_norm": 0.9625836084031454, + "learning_rate": 4.8520588579038755e-06, + "loss": 0.0476, + "step": 2955 + }, + { + "epoch": 1.0686912509038322, + "grad_norm": 1.1439232703684403, + "learning_rate": 4.849071047051738e-06, + "loss": 0.0815, + "step": 2956 + }, + { + "epoch": 1.0690527838033261, + "grad_norm": 0.2432133098845861, + "learning_rate": 4.846083290141578e-06, + "loss": 0.0354, + "step": 2957 + }, + { + "epoch": 1.06941431670282, + "grad_norm": 0.17181609658713515, + "learning_rate": 4.843095588241216e-06, + "loss": 0.0286, + "step": 2958 + }, + { + "epoch": 1.0697758496023138, + "grad_norm": 2.0614723829912487, + "learning_rate": 4.840107942418459e-06, + "loss": 0.3203, + "step": 2959 + }, + { + "epoch": 1.0701373825018077, + "grad_norm": 0.01711041820225617, + "learning_rate": 4.83712035374109e-06, + "loss": 0.0004, + "step": 2960 + }, + { + "epoch": 1.0704989154013016, + "grad_norm": 0.5184500715274774, + "learning_rate": 4.83413282327687e-06, + "loss": 0.0525, + "step": 2961 + }, + { + "epoch": 1.0708604483007953, + "grad_norm": 0.18901514410374123, + "learning_rate": 4.831145352093547e-06, + "loss": 0.0255, + "step": 2962 + }, + { + "epoch": 1.0712219812002892, + "grad_norm": 0.463768814785309, + "learning_rate": 4.828157941258837e-06, + "loss": 0.0889, + "step": 2963 + }, + { + "epoch": 1.0715835140997831, + "grad_norm": 0.06443996041978421, + "learning_rate": 4.825170591840443e-06, + "loss": 0.0021, + "step": 2964 + }, + { + "epoch": 1.0719450469992768, + "grad_norm": 0.004787377980224397, + "learning_rate": 4.822183304906043e-06, + "loss": 0.0002, + "step": 2965 + }, + { + "epoch": 1.0723065798987708, + "grad_norm": 0.22454735093685654, + "learning_rate": 4.819196081523291e-06, + "loss": 0.0231, + "step": 2966 + }, + { + "epoch": 1.0726681127982647, + "grad_norm": 1.9888399824001506, + "learning_rate": 4.816208922759824e-06, + "loss": 0.1113, + "step": 2967 + }, + { + "epoch": 1.0730296456977584, + "grad_norm": 0.819669180137066, + "learning_rate": 4.8132218296832475e-06, + "loss": 0.063, + "step": 2968 + }, + { + "epoch": 1.0733911785972523, + "grad_norm": 0.1269172678315109, + "learning_rate": 4.81023480336115e-06, + "loss": 0.0206, + "step": 2969 + }, + { + "epoch": 1.0737527114967462, + "grad_norm": 0.5392371162102386, + "learning_rate": 4.8072478448610935e-06, + "loss": 0.1455, + "step": 2970 + }, + { + "epoch": 1.0741142443962401, + "grad_norm": 0.008007602930154154, + "learning_rate": 4.804260955250616e-06, + "loss": 0.0002, + "step": 2971 + }, + { + "epoch": 1.0744757772957338, + "grad_norm": 0.4190237948928705, + "learning_rate": 4.8012741355972344e-06, + "loss": 0.0476, + "step": 2972 + }, + { + "epoch": 1.0748373101952278, + "grad_norm": 0.15568638574606658, + "learning_rate": 4.7982873869684315e-06, + "loss": 0.0229, + "step": 2973 + }, + { + "epoch": 1.0751988430947217, + "grad_norm": 0.3575101044668928, + "learning_rate": 4.795300710431676e-06, + "loss": 0.0165, + "step": 2974 + }, + { + "epoch": 1.0755603759942154, + "grad_norm": 0.14972422120857562, + "learning_rate": 4.792314107054403e-06, + "loss": 0.0082, + "step": 2975 + }, + { + "epoch": 1.0759219088937093, + "grad_norm": 0.15567217066960753, + "learning_rate": 4.789327577904023e-06, + "loss": 0.0057, + "step": 2976 + }, + { + "epoch": 1.0762834417932032, + "grad_norm": 0.01104738795774507, + "learning_rate": 4.786341124047925e-06, + "loss": 0.0002, + "step": 2977 + }, + { + "epoch": 1.076644974692697, + "grad_norm": 0.17628230911459786, + "learning_rate": 4.78335474655346e-06, + "loss": 0.0286, + "step": 2978 + }, + { + "epoch": 1.0770065075921909, + "grad_norm": 1.3803956087878229, + "learning_rate": 4.7803684464879665e-06, + "loss": 0.0815, + "step": 2979 + }, + { + "epoch": 1.0773680404916848, + "grad_norm": 0.5736533292764692, + "learning_rate": 4.777382224918742e-06, + "loss": 0.1035, + "step": 2980 + }, + { + "epoch": 1.0777295733911787, + "grad_norm": 0.20840070668016764, + "learning_rate": 4.774396082913064e-06, + "loss": 0.0258, + "step": 2981 + }, + { + "epoch": 1.0780911062906724, + "grad_norm": 0.3124752687556627, + "learning_rate": 4.77141002153818e-06, + "loss": 0.0035, + "step": 2982 + }, + { + "epoch": 1.0784526391901663, + "grad_norm": 0.15677560181291963, + "learning_rate": 4.768424041861302e-06, + "loss": 0.0229, + "step": 2983 + }, + { + "epoch": 1.0788141720896602, + "grad_norm": 0.12967485355391592, + "learning_rate": 4.765438144949626e-06, + "loss": 0.0184, + "step": 2984 + }, + { + "epoch": 1.079175704989154, + "grad_norm": 0.025201052134370443, + "learning_rate": 4.762452331870306e-06, + "loss": 0.0008, + "step": 2985 + }, + { + "epoch": 1.0795372378886479, + "grad_norm": 0.6220665206856478, + "learning_rate": 4.759466603690473e-06, + "loss": 0.0525, + "step": 2986 + }, + { + "epoch": 1.0798987707881418, + "grad_norm": 0.1178193836915828, + "learning_rate": 4.756480961477226e-06, + "loss": 0.0184, + "step": 2987 + }, + { + "epoch": 1.0802603036876355, + "grad_norm": 0.5099932272222077, + "learning_rate": 4.753495406297629e-06, + "loss": 0.0525, + "step": 2988 + }, + { + "epoch": 1.0806218365871294, + "grad_norm": 0.10243996090531576, + "learning_rate": 4.750509939218725e-06, + "loss": 0.0165, + "step": 2989 + }, + { + "epoch": 1.0809833694866233, + "grad_norm": 0.14029558366640718, + "learning_rate": 4.747524561307515e-06, + "loss": 0.0184, + "step": 2990 + }, + { + "epoch": 1.081344902386117, + "grad_norm": 0.3235233351270535, + "learning_rate": 4.744539273630973e-06, + "loss": 0.0184, + "step": 2991 + }, + { + "epoch": 1.081706435285611, + "grad_norm": 0.21951925582809576, + "learning_rate": 4.741554077256042e-06, + "loss": 0.0184, + "step": 2992 + }, + { + "epoch": 1.0820679681851049, + "grad_norm": 0.21527215889839388, + "learning_rate": 4.738568973249626e-06, + "loss": 0.0184, + "step": 2993 + }, + { + "epoch": 1.0824295010845988, + "grad_norm": 0.2312116369460282, + "learning_rate": 4.735583962678607e-06, + "loss": 0.0286, + "step": 2994 + }, + { + "epoch": 1.0827910339840925, + "grad_norm": 0.6376756435164965, + "learning_rate": 4.7325990466098234e-06, + "loss": 0.1035, + "step": 2995 + }, + { + "epoch": 1.0831525668835864, + "grad_norm": 0.5523780481130037, + "learning_rate": 4.729614226110084e-06, + "loss": 0.0391, + "step": 2996 + }, + { + "epoch": 1.0835140997830803, + "grad_norm": 0.3521521667776589, + "learning_rate": 4.726629502246163e-06, + "loss": 0.0146, + "step": 2997 + }, + { + "epoch": 1.083875632682574, + "grad_norm": 0.14330001487832497, + "learning_rate": 4.723644876084799e-06, + "loss": 0.0206, + "step": 2998 + }, + { + "epoch": 1.084237165582068, + "grad_norm": 0.22124658776943762, + "learning_rate": 4.7206603486927e-06, + "loss": 0.0286, + "step": 2999 + }, + { + "epoch": 1.0845986984815619, + "grad_norm": 0.09830348017726943, + "learning_rate": 4.7176759211365315e-06, + "loss": 0.0146, + "step": 3000 + }, + { + "epoch": 1.0849602313810556, + "grad_norm": 0.5494302902993798, + "learning_rate": 4.7146915944829305e-06, + "loss": 0.1279, + "step": 3001 + }, + { + "epoch": 1.0853217642805495, + "grad_norm": 0.18681682974996427, + "learning_rate": 4.711707369798495e-06, + "loss": 0.0146, + "step": 3002 + }, + { + "epoch": 1.0856832971800434, + "grad_norm": 0.11265349018043938, + "learning_rate": 4.70872324814978e-06, + "loss": 0.0165, + "step": 3003 + }, + { + "epoch": 1.0860448300795373, + "grad_norm": 0.3111715959451547, + "learning_rate": 4.705739230603319e-06, + "loss": 0.0286, + "step": 3004 + }, + { + "epoch": 1.086406362979031, + "grad_norm": 0.3092226480652991, + "learning_rate": 4.702755318225592e-06, + "loss": 0.0258, + "step": 3005 + }, + { + "epoch": 1.086767895878525, + "grad_norm": 0.18012087355379683, + "learning_rate": 4.699771512083053e-06, + "loss": 0.0072, + "step": 3006 + }, + { + "epoch": 1.0871294287780189, + "grad_norm": 0.8258104885414617, + "learning_rate": 4.696787813242114e-06, + "loss": 0.1455, + "step": 3007 + }, + { + "epoch": 1.0874909616775126, + "grad_norm": 0.02503870388813457, + "learning_rate": 4.6938042227691425e-06, + "loss": 0.0008, + "step": 3008 + }, + { + "epoch": 1.0878524945770065, + "grad_norm": 0.34886142950019433, + "learning_rate": 4.69082074173048e-06, + "loss": 0.0286, + "step": 3009 + }, + { + "epoch": 1.0882140274765004, + "grad_norm": 0.10210006154999035, + "learning_rate": 4.6878373711924175e-06, + "loss": 0.0146, + "step": 3010 + }, + { + "epoch": 1.0885755603759941, + "grad_norm": 1.5381003880012476, + "learning_rate": 4.684854112221214e-06, + "loss": 0.1113, + "step": 3011 + }, + { + "epoch": 1.088937093275488, + "grad_norm": 0.18830290925353388, + "learning_rate": 4.681870965883085e-06, + "loss": 0.0258, + "step": 3012 + }, + { + "epoch": 1.089298626174982, + "grad_norm": 0.6880114144801022, + "learning_rate": 4.6788879332442025e-06, + "loss": 0.1279, + "step": 3013 + }, + { + "epoch": 1.0896601590744757, + "grad_norm": 0.1726687157643773, + "learning_rate": 4.675905015370708e-06, + "loss": 0.0206, + "step": 3014 + }, + { + "epoch": 1.0900216919739696, + "grad_norm": 0.12144107802202653, + "learning_rate": 4.672922213328691e-06, + "loss": 0.0184, + "step": 3015 + }, + { + "epoch": 1.0903832248734635, + "grad_norm": 0.28754610644929024, + "learning_rate": 4.669939528184206e-06, + "loss": 0.0103, + "step": 3016 + }, + { + "epoch": 1.0907447577729574, + "grad_norm": 1.0402359203456895, + "learning_rate": 4.666956961003266e-06, + "loss": 0.0525, + "step": 3017 + }, + { + "epoch": 1.0911062906724511, + "grad_norm": 0.8055991847934729, + "learning_rate": 4.663974512851834e-06, + "loss": 0.0957, + "step": 3018 + }, + { + "epoch": 1.091467823571945, + "grad_norm": 0.679708148757563, + "learning_rate": 4.660992184795844e-06, + "loss": 0.1035, + "step": 3019 + }, + { + "epoch": 1.091829356471439, + "grad_norm": 0.13396603155572823, + "learning_rate": 4.658009977901173e-06, + "loss": 0.0165, + "step": 3020 + }, + { + "epoch": 1.0921908893709327, + "grad_norm": 0.04106819612568712, + "learning_rate": 4.655027893233665e-06, + "loss": 0.0012, + "step": 3021 + }, + { + "epoch": 1.0925524222704266, + "grad_norm": 0.8064192628287706, + "learning_rate": 4.652045931859116e-06, + "loss": 0.0525, + "step": 3022 + }, + { + "epoch": 1.0929139551699205, + "grad_norm": 0.11440395593153754, + "learning_rate": 4.649064094843274e-06, + "loss": 0.0146, + "step": 3023 + }, + { + "epoch": 1.0932754880694142, + "grad_norm": 0.8793509337922305, + "learning_rate": 4.6460823832518555e-06, + "loss": 0.0576, + "step": 3024 + }, + { + "epoch": 1.0936370209689081, + "grad_norm": 1.433825827944719, + "learning_rate": 4.6431007981505146e-06, + "loss": 0.1113, + "step": 3025 + }, + { + "epoch": 1.093998553868402, + "grad_norm": 0.782825434709505, + "learning_rate": 4.640119340604875e-06, + "loss": 0.0576, + "step": 3026 + }, + { + "epoch": 1.094360086767896, + "grad_norm": 0.06013428034402097, + "learning_rate": 4.637138011680508e-06, + "loss": 0.0006, + "step": 3027 + }, + { + "epoch": 1.0947216196673897, + "grad_norm": 1.0899017950924563, + "learning_rate": 4.634156812442936e-06, + "loss": 0.1367, + "step": 3028 + }, + { + "epoch": 1.0950831525668836, + "grad_norm": 0.15346109617027692, + "learning_rate": 4.631175743957644e-06, + "loss": 0.0206, + "step": 3029 + }, + { + "epoch": 1.0954446854663775, + "grad_norm": 0.24162788168779883, + "learning_rate": 4.6281948072900625e-06, + "loss": 0.0039, + "step": 3030 + }, + { + "epoch": 1.0958062183658712, + "grad_norm": 0.29417371269437304, + "learning_rate": 4.625214003505579e-06, + "loss": 0.0354, + "step": 3031 + }, + { + "epoch": 1.0961677512653651, + "grad_norm": 0.2753141469041446, + "learning_rate": 4.622233333669531e-06, + "loss": 0.0117, + "step": 3032 + }, + { + "epoch": 1.096529284164859, + "grad_norm": 0.013704992428618896, + "learning_rate": 4.61925279884721e-06, + "loss": 0.0004, + "step": 3033 + }, + { + "epoch": 1.0968908170643528, + "grad_norm": 1.1724697106075137, + "learning_rate": 4.61627240010386e-06, + "loss": 0.1113, + "step": 3034 + }, + { + "epoch": 1.0972523499638467, + "grad_norm": 0.8033418538802815, + "learning_rate": 4.61329213850467e-06, + "loss": 0.1191, + "step": 3035 + }, + { + "epoch": 1.0976138828633406, + "grad_norm": 0.494252057632048, + "learning_rate": 4.6103120151147905e-06, + "loss": 0.0354, + "step": 3036 + }, + { + "epoch": 1.0979754157628343, + "grad_norm": 0.19871974622508115, + "learning_rate": 4.6073320309993145e-06, + "loss": 0.0255, + "step": 3037 + }, + { + "epoch": 1.0983369486623282, + "grad_norm": 0.2342926748745729, + "learning_rate": 4.604352187223286e-06, + "loss": 0.0206, + "step": 3038 + }, + { + "epoch": 1.0986984815618221, + "grad_norm": 0.19079119707440567, + "learning_rate": 4.601372484851705e-06, + "loss": 0.0206, + "step": 3039 + }, + { + "epoch": 1.099060014461316, + "grad_norm": 0.007368180395702981, + "learning_rate": 4.5983929249495104e-06, + "loss": 0.0001, + "step": 3040 + }, + { + "epoch": 1.0994215473608098, + "grad_norm": 0.24891084528156457, + "learning_rate": 4.595413508581602e-06, + "loss": 0.0039, + "step": 3041 + }, + { + "epoch": 1.0997830802603037, + "grad_norm": 1.6626211065472163, + "learning_rate": 4.59243423681282e-06, + "loss": 0.1641, + "step": 3042 + }, + { + "epoch": 1.1001446131597976, + "grad_norm": 0.10145737180325487, + "learning_rate": 4.589455110707955e-06, + "loss": 0.004, + "step": 3043 + }, + { + "epoch": 1.1005061460592913, + "grad_norm": 0.035433153049225806, + "learning_rate": 4.586476131331749e-06, + "loss": 0.0012, + "step": 3044 + }, + { + "epoch": 1.1008676789587852, + "grad_norm": 0.7004041256628266, + "learning_rate": 4.583497299748883e-06, + "loss": 0.1191, + "step": 3045 + }, + { + "epoch": 1.1012292118582792, + "grad_norm": 0.05956882777065356, + "learning_rate": 4.580518617023996e-06, + "loss": 0.0013, + "step": 3046 + }, + { + "epoch": 1.1015907447577729, + "grad_norm": 0.3624549264255399, + "learning_rate": 4.577540084221666e-06, + "loss": 0.0117, + "step": 3047 + }, + { + "epoch": 1.1019522776572668, + "grad_norm": 2.336020769841728, + "learning_rate": 4.574561702406421e-06, + "loss": 0.3867, + "step": 3048 + }, + { + "epoch": 1.1023138105567607, + "grad_norm": 0.45496135774682467, + "learning_rate": 4.571583472642736e-06, + "loss": 0.0354, + "step": 3049 + }, + { + "epoch": 1.1026753434562546, + "grad_norm": 0.0788483814980606, + "learning_rate": 4.568605395995025e-06, + "loss": 0.0024, + "step": 3050 + }, + { + "epoch": 1.1030368763557483, + "grad_norm": 1.1304234346749336, + "learning_rate": 4.565627473527655e-06, + "loss": 0.0752, + "step": 3051 + }, + { + "epoch": 1.1033984092552422, + "grad_norm": 2.476576715177942, + "learning_rate": 4.562649706304933e-06, + "loss": 0.1934, + "step": 3052 + }, + { + "epoch": 1.1037599421547362, + "grad_norm": 0.35469961840014064, + "learning_rate": 4.559672095391116e-06, + "loss": 0.0391, + "step": 3053 + }, + { + "epoch": 1.1041214750542299, + "grad_norm": 0.4211382633677529, + "learning_rate": 4.5566946418503985e-06, + "loss": 0.0286, + "step": 3054 + }, + { + "epoch": 1.1044830079537238, + "grad_norm": 0.20221020043001842, + "learning_rate": 4.553717346746922e-06, + "loss": 0.0205, + "step": 3055 + }, + { + "epoch": 1.1048445408532177, + "grad_norm": 0.041692104527745355, + "learning_rate": 4.550740211144772e-06, + "loss": 0.0019, + "step": 3056 + }, + { + "epoch": 1.1052060737527114, + "grad_norm": 0.21531182954411585, + "learning_rate": 4.547763236107975e-06, + "loss": 0.0231, + "step": 3057 + }, + { + "epoch": 1.1055676066522053, + "grad_norm": 0.2510425466721191, + "learning_rate": 4.5447864227005015e-06, + "loss": 0.0317, + "step": 3058 + }, + { + "epoch": 1.1059291395516992, + "grad_norm": 0.35372594911475064, + "learning_rate": 4.541809771986267e-06, + "loss": 0.0258, + "step": 3059 + }, + { + "epoch": 1.106290672451193, + "grad_norm": 1.1113862937018997, + "learning_rate": 4.538833285029121e-06, + "loss": 0.1279, + "step": 3060 + }, + { + "epoch": 1.1066522053506869, + "grad_norm": 0.7055114553773549, + "learning_rate": 4.535856962892862e-06, + "loss": 0.063, + "step": 3061 + }, + { + "epoch": 1.1070137382501808, + "grad_norm": 0.5634722305841074, + "learning_rate": 4.532880806641226e-06, + "loss": 0.1035, + "step": 3062 + }, + { + "epoch": 1.1073752711496747, + "grad_norm": 0.12517743041875418, + "learning_rate": 4.529904817337892e-06, + "loss": 0.0117, + "step": 3063 + }, + { + "epoch": 1.1077368040491684, + "grad_norm": 0.5904064039648738, + "learning_rate": 4.526928996046479e-06, + "loss": 0.0391, + "step": 3064 + }, + { + "epoch": 1.1080983369486623, + "grad_norm": 0.6346540352305918, + "learning_rate": 4.52395334383054e-06, + "loss": 0.1113, + "step": 3065 + }, + { + "epoch": 1.1084598698481563, + "grad_norm": 1.145817355810022, + "learning_rate": 4.520977861753576e-06, + "loss": 0.1367, + "step": 3066 + }, + { + "epoch": 1.10882140274765, + "grad_norm": 0.12203082692941639, + "learning_rate": 4.518002550879023e-06, + "loss": 0.0146, + "step": 3067 + }, + { + "epoch": 1.1091829356471439, + "grad_norm": 1.5520784237660754, + "learning_rate": 4.515027412270257e-06, + "loss": 0.2246, + "step": 3068 + }, + { + "epoch": 1.1095444685466378, + "grad_norm": 0.14808197167778356, + "learning_rate": 4.512052446990593e-06, + "loss": 0.0184, + "step": 3069 + }, + { + "epoch": 1.1099060014461315, + "grad_norm": 0.5736204185520157, + "learning_rate": 4.509077656103279e-06, + "loss": 0.1367, + "step": 3070 + }, + { + "epoch": 1.1102675343456254, + "grad_norm": 0.5483808926513187, + "learning_rate": 4.506103040671508e-06, + "loss": 0.1641, + "step": 3071 + }, + { + "epoch": 1.1106290672451193, + "grad_norm": 1.7967792166344163, + "learning_rate": 4.503128601758406e-06, + "loss": 0.1367, + "step": 3072 + }, + { + "epoch": 1.1109906001446133, + "grad_norm": 0.2536037388730382, + "learning_rate": 4.500154340427037e-06, + "loss": 0.0165, + "step": 3073 + }, + { + "epoch": 1.111352133044107, + "grad_norm": 0.11976285656413133, + "learning_rate": 4.497180257740403e-06, + "loss": 0.0057, + "step": 3074 + }, + { + "epoch": 1.1117136659436009, + "grad_norm": 0.2801427880311958, + "learning_rate": 4.494206354761436e-06, + "loss": 0.0317, + "step": 3075 + }, + { + "epoch": 1.1120751988430948, + "grad_norm": 0.17857154116821677, + "learning_rate": 4.491232632553013e-06, + "loss": 0.0092, + "step": 3076 + }, + { + "epoch": 1.1124367317425885, + "grad_norm": 0.1499613359547655, + "learning_rate": 4.488259092177937e-06, + "loss": 0.0064, + "step": 3077 + }, + { + "epoch": 1.1127982646420824, + "grad_norm": 0.10996069945354651, + "learning_rate": 4.4852857346989565e-06, + "loss": 0.0184, + "step": 3078 + }, + { + "epoch": 1.1131597975415763, + "grad_norm": 0.3269006273043139, + "learning_rate": 4.4823125611787455e-06, + "loss": 0.0258, + "step": 3079 + }, + { + "epoch": 1.11352133044107, + "grad_norm": 0.19646535905981402, + "learning_rate": 4.479339572679913e-06, + "loss": 0.0258, + "step": 3080 + }, + { + "epoch": 1.113882863340564, + "grad_norm": 0.2798795028709484, + "learning_rate": 4.476366770265011e-06, + "loss": 0.0258, + "step": 3081 + }, + { + "epoch": 1.1142443962400579, + "grad_norm": 0.9159322966511623, + "learning_rate": 4.473394154996512e-06, + "loss": 0.0957, + "step": 3082 + }, + { + "epoch": 1.1146059291395516, + "grad_norm": 0.3891741790298775, + "learning_rate": 4.470421727936832e-06, + "loss": 0.0317, + "step": 3083 + }, + { + "epoch": 1.1149674620390455, + "grad_norm": 1.041582162713566, + "learning_rate": 4.467449490148317e-06, + "loss": 0.1191, + "step": 3084 + }, + { + "epoch": 1.1153289949385394, + "grad_norm": 0.12711601312172874, + "learning_rate": 4.464477442693237e-06, + "loss": 0.0206, + "step": 3085 + }, + { + "epoch": 1.1156905278380334, + "grad_norm": 0.8911393137845661, + "learning_rate": 4.46150558663381e-06, + "loss": 0.0815, + "step": 3086 + }, + { + "epoch": 1.116052060737527, + "grad_norm": 0.1365193613600487, + "learning_rate": 4.45853392303217e-06, + "loss": 0.0206, + "step": 3087 + }, + { + "epoch": 1.116413593637021, + "grad_norm": 0.1805304240899517, + "learning_rate": 4.455562452950394e-06, + "loss": 0.0092, + "step": 3088 + }, + { + "epoch": 1.116775126536515, + "grad_norm": 0.9965444439544499, + "learning_rate": 4.452591177450482e-06, + "loss": 0.1279, + "step": 3089 + }, + { + "epoch": 1.1171366594360086, + "grad_norm": 0.47187727851177513, + "learning_rate": 4.449620097594365e-06, + "loss": 0.0576, + "step": 3090 + }, + { + "epoch": 1.1174981923355025, + "grad_norm": 0.02980585017424303, + "learning_rate": 4.446649214443912e-06, + "loss": 0.0006, + "step": 3091 + }, + { + "epoch": 1.1178597252349964, + "grad_norm": 0.17466242959741263, + "learning_rate": 4.443678529060912e-06, + "loss": 0.0206, + "step": 3092 + }, + { + "epoch": 1.1182212581344901, + "grad_norm": 0.1934685548173993, + "learning_rate": 4.440708042507087e-06, + "loss": 0.0103, + "step": 3093 + }, + { + "epoch": 1.118582791033984, + "grad_norm": 0.6106918382400566, + "learning_rate": 4.437737755844093e-06, + "loss": 0.0206, + "step": 3094 + }, + { + "epoch": 1.118944323933478, + "grad_norm": 0.9579533243524414, + "learning_rate": 4.434767670133502e-06, + "loss": 0.0752, + "step": 3095 + }, + { + "epoch": 1.119305856832972, + "grad_norm": 0.21574158702014512, + "learning_rate": 4.43179778643683e-06, + "loss": 0.0286, + "step": 3096 + }, + { + "epoch": 1.1196673897324656, + "grad_norm": 0.37365959890037226, + "learning_rate": 4.428828105815507e-06, + "loss": 0.0231, + "step": 3097 + }, + { + "epoch": 1.1200289226319595, + "grad_norm": 0.2996582940602468, + "learning_rate": 4.425858629330899e-06, + "loss": 0.0286, + "step": 3098 + }, + { + "epoch": 1.1203904555314534, + "grad_norm": 0.1343308450312456, + "learning_rate": 4.4228893580442975e-06, + "loss": 0.0146, + "step": 3099 + }, + { + "epoch": 1.1207519884309471, + "grad_norm": 0.36731495462262176, + "learning_rate": 4.419920293016914e-06, + "loss": 0.0391, + "step": 3100 + }, + { + "epoch": 1.121113521330441, + "grad_norm": 0.46184113777819474, + "learning_rate": 4.416951435309899e-06, + "loss": 0.0317, + "step": 3101 + }, + { + "epoch": 1.121475054229935, + "grad_norm": 0.5173431838089483, + "learning_rate": 4.413982785984315e-06, + "loss": 0.0391, + "step": 3102 + }, + { + "epoch": 1.1218365871294287, + "grad_norm": 0.21385928098735113, + "learning_rate": 4.411014346101162e-06, + "loss": 0.0131, + "step": 3103 + }, + { + "epoch": 1.1221981200289226, + "grad_norm": 0.2761579311758171, + "learning_rate": 4.408046116721357e-06, + "loss": 0.0317, + "step": 3104 + }, + { + "epoch": 1.1225596529284165, + "grad_norm": 0.17154665830551685, + "learning_rate": 4.405078098905743e-06, + "loss": 0.0229, + "step": 3105 + }, + { + "epoch": 1.1229211858279102, + "grad_norm": 0.27522087597916167, + "learning_rate": 4.402110293715094e-06, + "loss": 0.0354, + "step": 3106 + }, + { + "epoch": 1.1232827187274042, + "grad_norm": 0.11065820083280548, + "learning_rate": 4.399142702210097e-06, + "loss": 0.0165, + "step": 3107 + }, + { + "epoch": 1.123644251626898, + "grad_norm": 0.2544872480783611, + "learning_rate": 4.3961753254513725e-06, + "loss": 0.0258, + "step": 3108 + }, + { + "epoch": 1.124005784526392, + "grad_norm": 0.2811860279080879, + "learning_rate": 4.393208164499461e-06, + "loss": 0.0317, + "step": 3109 + }, + { + "epoch": 1.1243673174258857, + "grad_norm": 0.26141522124674893, + "learning_rate": 4.390241220414819e-06, + "loss": 0.0117, + "step": 3110 + }, + { + "epoch": 1.1247288503253796, + "grad_norm": 0.1972820359665157, + "learning_rate": 4.3872744942578406e-06, + "loss": 0.0184, + "step": 3111 + }, + { + "epoch": 1.1250903832248735, + "grad_norm": 0.10302707162390118, + "learning_rate": 4.384307987088826e-06, + "loss": 0.0146, + "step": 3112 + }, + { + "epoch": 1.1254519161243672, + "grad_norm": 0.3382872223445003, + "learning_rate": 4.381341699968008e-06, + "loss": 0.0286, + "step": 3113 + }, + { + "epoch": 1.1258134490238612, + "grad_norm": 0.12187190076863298, + "learning_rate": 4.378375633955537e-06, + "loss": 0.0117, + "step": 3114 + }, + { + "epoch": 1.126174981923355, + "grad_norm": 1.6170374663130584, + "learning_rate": 4.37540979011148e-06, + "loss": 0.0688, + "step": 3115 + }, + { + "epoch": 1.126536514822849, + "grad_norm": 0.19749752113557903, + "learning_rate": 4.372444169495836e-06, + "loss": 0.0165, + "step": 3116 + }, + { + "epoch": 1.1268980477223427, + "grad_norm": 0.19959006581252425, + "learning_rate": 4.369478773168511e-06, + "loss": 0.0103, + "step": 3117 + }, + { + "epoch": 1.1272595806218366, + "grad_norm": 0.08007203068790338, + "learning_rate": 4.3665136021893394e-06, + "loss": 0.0072, + "step": 3118 + }, + { + "epoch": 1.1276211135213305, + "grad_norm": 2.062545907846154, + "learning_rate": 4.363548657618073e-06, + "loss": 0.1113, + "step": 3119 + }, + { + "epoch": 1.1279826464208242, + "grad_norm": 0.21989108980660751, + "learning_rate": 4.360583940514382e-06, + "loss": 0.0165, + "step": 3120 + }, + { + "epoch": 1.1283441793203182, + "grad_norm": 0.6313789462265232, + "learning_rate": 4.357619451937858e-06, + "loss": 0.1191, + "step": 3121 + }, + { + "epoch": 1.128705712219812, + "grad_norm": 0.5738477043224248, + "learning_rate": 4.354655192948003e-06, + "loss": 0.0317, + "step": 3122 + }, + { + "epoch": 1.1290672451193058, + "grad_norm": 0.5481915291342324, + "learning_rate": 4.351691164604247e-06, + "loss": 0.0165, + "step": 3123 + }, + { + "epoch": 1.1294287780187997, + "grad_norm": 0.15510097269434842, + "learning_rate": 4.348727367965931e-06, + "loss": 0.0131, + "step": 3124 + }, + { + "epoch": 1.1297903109182936, + "grad_norm": 0.11068128474571355, + "learning_rate": 4.345763804092318e-06, + "loss": 0.0131, + "step": 3125 + }, + { + "epoch": 1.1301518438177873, + "grad_norm": 0.06521013210330871, + "learning_rate": 4.342800474042584e-06, + "loss": 0.0005, + "step": 3126 + }, + { + "epoch": 1.1305133767172812, + "grad_norm": 0.0264791123837413, + "learning_rate": 4.3398373788758196e-06, + "loss": 0.0007, + "step": 3127 + }, + { + "epoch": 1.1308749096167752, + "grad_norm": 0.10991748733432032, + "learning_rate": 4.3368745196510375e-06, + "loss": 0.0117, + "step": 3128 + }, + { + "epoch": 1.1312364425162689, + "grad_norm": 0.0988564466126072, + "learning_rate": 4.333911897427162e-06, + "loss": 0.0115, + "step": 3129 + }, + { + "epoch": 1.1315979754157628, + "grad_norm": 0.3073960006203922, + "learning_rate": 4.330949513263034e-06, + "loss": 0.0231, + "step": 3130 + }, + { + "epoch": 1.1319595083152567, + "grad_norm": 13.647098015945744, + "learning_rate": 4.32798736821741e-06, + "loss": 0.4238, + "step": 3131 + }, + { + "epoch": 1.1323210412147506, + "grad_norm": 0.19515215288951115, + "learning_rate": 4.325025463348957e-06, + "loss": 0.0206, + "step": 3132 + }, + { + "epoch": 1.1326825741142443, + "grad_norm": 0.74437492397769, + "learning_rate": 4.322063799716261e-06, + "loss": 0.0752, + "step": 3133 + }, + { + "epoch": 1.1330441070137383, + "grad_norm": 0.2538369019779822, + "learning_rate": 4.3191023783778205e-06, + "loss": 0.0103, + "step": 3134 + }, + { + "epoch": 1.1334056399132322, + "grad_norm": 1.1300768984623777, + "learning_rate": 4.316141200392046e-06, + "loss": 0.0476, + "step": 3135 + }, + { + "epoch": 1.1337671728127259, + "grad_norm": 0.18271082861685503, + "learning_rate": 4.313180266817264e-06, + "loss": 0.0229, + "step": 3136 + }, + { + "epoch": 1.1341287057122198, + "grad_norm": 0.16696445377019692, + "learning_rate": 4.310219578711707e-06, + "loss": 0.0165, + "step": 3137 + }, + { + "epoch": 1.1344902386117137, + "grad_norm": 0.1067140007620213, + "learning_rate": 4.3072591371335285e-06, + "loss": 0.0115, + "step": 3138 + }, + { + "epoch": 1.1348517715112076, + "grad_norm": 0.07948727940904182, + "learning_rate": 4.304298943140787e-06, + "loss": 0.0081, + "step": 3139 + }, + { + "epoch": 1.1352133044107013, + "grad_norm": 0.11366144992357047, + "learning_rate": 4.301338997791457e-06, + "loss": 0.0004, + "step": 3140 + }, + { + "epoch": 1.1355748373101953, + "grad_norm": 0.14938925067322478, + "learning_rate": 4.298379302143424e-06, + "loss": 0.0057, + "step": 3141 + }, + { + "epoch": 1.1359363702096892, + "grad_norm": 0.31243340357635646, + "learning_rate": 4.2954198572544766e-06, + "loss": 0.0146, + "step": 3142 + }, + { + "epoch": 1.1362979031091829, + "grad_norm": 0.15977609195942627, + "learning_rate": 4.292460664182326e-06, + "loss": 0.0064, + "step": 3143 + }, + { + "epoch": 1.1366594360086768, + "grad_norm": 1.3175970303200766, + "learning_rate": 4.289501723984582e-06, + "loss": 0.0889, + "step": 3144 + }, + { + "epoch": 1.1370209689081707, + "grad_norm": 0.06835171750906004, + "learning_rate": 4.286543037718774e-06, + "loss": 0.0031, + "step": 3145 + }, + { + "epoch": 1.1373825018076644, + "grad_norm": 0.7943064061099472, + "learning_rate": 4.283584606442336e-06, + "loss": 0.0688, + "step": 3146 + }, + { + "epoch": 1.1377440347071583, + "grad_norm": 0.8448691209283182, + "learning_rate": 4.280626431212604e-06, + "loss": 0.1553, + "step": 3147 + }, + { + "epoch": 1.1381055676066523, + "grad_norm": 0.10199708064721547, + "learning_rate": 4.277668513086837e-06, + "loss": 0.0081, + "step": 3148 + }, + { + "epoch": 1.138467100506146, + "grad_norm": 0.08557185925409007, + "learning_rate": 4.274710853122188e-06, + "loss": 0.0081, + "step": 3149 + }, + { + "epoch": 1.13882863340564, + "grad_norm": 0.23261700881695357, + "learning_rate": 4.271753452375729e-06, + "loss": 0.0117, + "step": 3150 + }, + { + "epoch": 1.1391901663051338, + "grad_norm": 0.8472910752804267, + "learning_rate": 4.268796311904434e-06, + "loss": 0.1191, + "step": 3151 + }, + { + "epoch": 1.1395516992046275, + "grad_norm": 0.3152819224040991, + "learning_rate": 4.26583943276518e-06, + "loss": 0.0317, + "step": 3152 + }, + { + "epoch": 1.1399132321041214, + "grad_norm": 0.15641453074459846, + "learning_rate": 4.2628828160147585e-06, + "loss": 0.0057, + "step": 3153 + }, + { + "epoch": 1.1402747650036154, + "grad_norm": 0.17623162371306672, + "learning_rate": 4.259926462709862e-06, + "loss": 0.0184, + "step": 3154 + }, + { + "epoch": 1.1406362979031093, + "grad_norm": 0.1255831354237995, + "learning_rate": 4.256970373907094e-06, + "loss": 0.0092, + "step": 3155 + }, + { + "epoch": 1.140997830802603, + "grad_norm": 1.0007834719910083, + "learning_rate": 4.254014550662957e-06, + "loss": 0.0391, + "step": 3156 + }, + { + "epoch": 1.141359363702097, + "grad_norm": 0.14558845052407438, + "learning_rate": 4.251058994033861e-06, + "loss": 0.0146, + "step": 3157 + }, + { + "epoch": 1.1417208966015908, + "grad_norm": 0.3438880978816085, + "learning_rate": 4.248103705076123e-06, + "loss": 0.0147, + "step": 3158 + }, + { + "epoch": 1.1420824295010845, + "grad_norm": 0.15068453524326683, + "learning_rate": 4.245148684845961e-06, + "loss": 0.0117, + "step": 3159 + }, + { + "epoch": 1.1424439624005784, + "grad_norm": 0.07713917698804425, + "learning_rate": 4.2421939343995014e-06, + "loss": 0.0022, + "step": 3160 + }, + { + "epoch": 1.1428054953000724, + "grad_norm": 0.24938474195856283, + "learning_rate": 4.2392394547927705e-06, + "loss": 0.0165, + "step": 3161 + }, + { + "epoch": 1.1431670281995663, + "grad_norm": 0.07248056398099077, + "learning_rate": 4.2362852470816954e-06, + "loss": 0.0024, + "step": 3162 + }, + { + "epoch": 1.14352856109906, + "grad_norm": 0.20803129059539077, + "learning_rate": 4.233331312322113e-06, + "loss": 0.0184, + "step": 3163 + }, + { + "epoch": 1.143890093998554, + "grad_norm": 0.15271668678102343, + "learning_rate": 4.230377651569757e-06, + "loss": 0.0165, + "step": 3164 + }, + { + "epoch": 1.1442516268980478, + "grad_norm": 0.07215112110812359, + "learning_rate": 4.227424265880267e-06, + "loss": 0.0064, + "step": 3165 + }, + { + "epoch": 1.1446131597975415, + "grad_norm": 0.14752267682922493, + "learning_rate": 4.224471156309182e-06, + "loss": 0.0103, + "step": 3166 + }, + { + "epoch": 1.1449746926970354, + "grad_norm": 0.08788422731844842, + "learning_rate": 4.221518323911941e-06, + "loss": 0.0092, + "step": 3167 + }, + { + "epoch": 1.1453362255965294, + "grad_norm": 0.68552068677539, + "learning_rate": 4.218565769743887e-06, + "loss": 0.0258, + "step": 3168 + }, + { + "epoch": 1.145697758496023, + "grad_norm": 0.7589385643042409, + "learning_rate": 4.215613494860261e-06, + "loss": 0.1113, + "step": 3169 + }, + { + "epoch": 1.146059291395517, + "grad_norm": 0.2334019341989036, + "learning_rate": 4.212661500316207e-06, + "loss": 0.0146, + "step": 3170 + }, + { + "epoch": 1.146420824295011, + "grad_norm": 0.8895066397115015, + "learning_rate": 4.209709787166768e-06, + "loss": 0.2129, + "step": 3171 + }, + { + "epoch": 1.1467823571945046, + "grad_norm": 0.5596197175076489, + "learning_rate": 4.206758356466882e-06, + "loss": 0.0391, + "step": 3172 + }, + { + "epoch": 1.1471438900939985, + "grad_norm": 0.7793273452733201, + "learning_rate": 4.203807209271393e-06, + "loss": 0.1641, + "step": 3173 + }, + { + "epoch": 1.1475054229934925, + "grad_norm": 0.9676778260022709, + "learning_rate": 4.20085634663504e-06, + "loss": 0.1553, + "step": 3174 + }, + { + "epoch": 1.1478669558929862, + "grad_norm": 0.1874612433169156, + "learning_rate": 4.197905769612458e-06, + "loss": 0.0184, + "step": 3175 + }, + { + "epoch": 1.14822848879248, + "grad_norm": 0.04873553795665832, + "learning_rate": 4.194955479258188e-06, + "loss": 0.0021, + "step": 3176 + }, + { + "epoch": 1.148590021691974, + "grad_norm": 1.1346784939826653, + "learning_rate": 4.192005476626656e-06, + "loss": 0.1035, + "step": 3177 + }, + { + "epoch": 1.148951554591468, + "grad_norm": 0.02801637653952016, + "learning_rate": 4.1890557627722e-06, + "loss": 0.0008, + "step": 3178 + }, + { + "epoch": 1.1493130874909616, + "grad_norm": 0.9806547902305611, + "learning_rate": 4.18610633874904e-06, + "loss": 0.1279, + "step": 3179 + }, + { + "epoch": 1.1496746203904555, + "grad_norm": 0.10102272464360458, + "learning_rate": 4.183157205611304e-06, + "loss": 0.0019, + "step": 3180 + }, + { + "epoch": 1.1500361532899495, + "grad_norm": 0.8106125963505253, + "learning_rate": 4.180208364413013e-06, + "loss": 0.1934, + "step": 3181 + }, + { + "epoch": 1.1503976861894432, + "grad_norm": 0.12039182679056845, + "learning_rate": 4.177259816208075e-06, + "loss": 0.0117, + "step": 3182 + }, + { + "epoch": 1.150759219088937, + "grad_norm": 0.7587902820159653, + "learning_rate": 4.174311562050308e-06, + "loss": 0.1455, + "step": 3183 + }, + { + "epoch": 1.151120751988431, + "grad_norm": 0.32093331611741976, + "learning_rate": 4.171363602993412e-06, + "loss": 0.0131, + "step": 3184 + }, + { + "epoch": 1.151482284887925, + "grad_norm": 0.3039926266315322, + "learning_rate": 4.168415940090992e-06, + "loss": 0.0255, + "step": 3185 + }, + { + "epoch": 1.1518438177874186, + "grad_norm": 0.7470881865084156, + "learning_rate": 4.1654685743965375e-06, + "loss": 0.1738, + "step": 3186 + }, + { + "epoch": 1.1522053506869125, + "grad_norm": 0.45629717652796764, + "learning_rate": 4.162521506963439e-06, + "loss": 0.0391, + "step": 3187 + }, + { + "epoch": 1.1525668835864065, + "grad_norm": 0.6747374864332183, + "learning_rate": 4.159574738844979e-06, + "loss": 0.0576, + "step": 3188 + }, + { + "epoch": 1.1529284164859002, + "grad_norm": 0.7002158027479072, + "learning_rate": 4.156628271094328e-06, + "loss": 0.0815, + "step": 3189 + }, + { + "epoch": 1.153289949385394, + "grad_norm": 0.27389378663680125, + "learning_rate": 4.153682104764556e-06, + "loss": 0.0286, + "step": 3190 + }, + { + "epoch": 1.153651482284888, + "grad_norm": 0.05490389850333845, + "learning_rate": 4.150736240908619e-06, + "loss": 0.0015, + "step": 3191 + }, + { + "epoch": 1.1540130151843817, + "grad_norm": 0.40903922018105077, + "learning_rate": 4.147790680579373e-06, + "loss": 0.0317, + "step": 3192 + }, + { + "epoch": 1.1543745480838756, + "grad_norm": 0.3884626251982179, + "learning_rate": 4.144845424829558e-06, + "loss": 0.0146, + "step": 3193 + }, + { + "epoch": 1.1547360809833696, + "grad_norm": 0.1940552779048049, + "learning_rate": 4.141900474711805e-06, + "loss": 0.0255, + "step": 3194 + }, + { + "epoch": 1.1550976138828633, + "grad_norm": 0.7573001062555333, + "learning_rate": 4.138955831278643e-06, + "loss": 0.1455, + "step": 3195 + }, + { + "epoch": 1.1554591467823572, + "grad_norm": 1.1730416861110928, + "learning_rate": 4.136011495582484e-06, + "loss": 0.0815, + "step": 3196 + }, + { + "epoch": 1.155820679681851, + "grad_norm": 0.5520558875429902, + "learning_rate": 4.1330674686756354e-06, + "loss": 0.1113, + "step": 3197 + }, + { + "epoch": 1.1561822125813448, + "grad_norm": 0.24361308014616465, + "learning_rate": 4.130123751610291e-06, + "loss": 0.0206, + "step": 3198 + }, + { + "epoch": 1.1565437454808387, + "grad_norm": 3.022873269925805, + "learning_rate": 4.127180345438533e-06, + "loss": 0.2559, + "step": 3199 + }, + { + "epoch": 1.1569052783803326, + "grad_norm": 0.3659721259133444, + "learning_rate": 4.124237251212337e-06, + "loss": 0.0255, + "step": 3200 + }, + { + "epoch": 1.1572668112798266, + "grad_norm": 0.18429880043829683, + "learning_rate": 4.121294469983559e-06, + "loss": 0.0205, + "step": 3201 + }, + { + "epoch": 1.1576283441793203, + "grad_norm": 0.26390530446586696, + "learning_rate": 4.118352002803955e-06, + "loss": 0.0286, + "step": 3202 + }, + { + "epoch": 1.1579898770788142, + "grad_norm": 0.2181741185770056, + "learning_rate": 4.11540985072516e-06, + "loss": 0.0286, + "step": 3203 + }, + { + "epoch": 1.158351409978308, + "grad_norm": 0.38953943237255145, + "learning_rate": 4.112468014798695e-06, + "loss": 0.0286, + "step": 3204 + }, + { + "epoch": 1.1587129428778018, + "grad_norm": 0.1310881038333508, + "learning_rate": 4.109526496075975e-06, + "loss": 0.0051, + "step": 3205 + }, + { + "epoch": 1.1590744757772957, + "grad_norm": 0.02716675010601008, + "learning_rate": 4.106585295608297e-06, + "loss": 0.0008, + "step": 3206 + }, + { + "epoch": 1.1594360086767896, + "grad_norm": 0.2897431272239283, + "learning_rate": 4.1036444144468475e-06, + "loss": 0.0231, + "step": 3207 + }, + { + "epoch": 1.1597975415762836, + "grad_norm": 0.30245888554128164, + "learning_rate": 4.1007038536426954e-06, + "loss": 0.0354, + "step": 3208 + }, + { + "epoch": 1.1601590744757773, + "grad_norm": 0.1385126409049744, + "learning_rate": 4.0977636142467935e-06, + "loss": 0.0184, + "step": 3209 + }, + { + "epoch": 1.1605206073752712, + "grad_norm": 4.040909871326346, + "learning_rate": 4.094823697309988e-06, + "loss": 0.0967, + "step": 3210 + }, + { + "epoch": 1.160882140274765, + "grad_norm": 0.03870419270032079, + "learning_rate": 4.091884103882999e-06, + "loss": 0.0013, + "step": 3211 + }, + { + "epoch": 1.1612436731742588, + "grad_norm": 0.11075572644768035, + "learning_rate": 4.088944835016443e-06, + "loss": 0.0045, + "step": 3212 + }, + { + "epoch": 1.1616052060737527, + "grad_norm": 0.7248721792351326, + "learning_rate": 4.08600589176081e-06, + "loss": 0.063, + "step": 3213 + }, + { + "epoch": 1.1619667389732466, + "grad_norm": 0.0019027297921811956, + "learning_rate": 4.083067275166477e-06, + "loss": 0.0, + "step": 3214 + }, + { + "epoch": 1.1623282718727403, + "grad_norm": 0.267847625103884, + "learning_rate": 4.080128986283707e-06, + "loss": 0.0206, + "step": 3215 + }, + { + "epoch": 1.1626898047722343, + "grad_norm": 0.44081018980920134, + "learning_rate": 4.077191026162642e-06, + "loss": 0.0391, + "step": 3216 + }, + { + "epoch": 1.1630513376717282, + "grad_norm": 0.5901834228959123, + "learning_rate": 4.074253395853311e-06, + "loss": 0.0476, + "step": 3217 + }, + { + "epoch": 1.163412870571222, + "grad_norm": 0.391049487186569, + "learning_rate": 4.071316096405622e-06, + "loss": 0.0432, + "step": 3218 + }, + { + "epoch": 1.1637744034707158, + "grad_norm": 0.00571040192590138, + "learning_rate": 4.068379128869362e-06, + "loss": 0.0002, + "step": 3219 + }, + { + "epoch": 1.1641359363702097, + "grad_norm": 0.23611312544190655, + "learning_rate": 4.065442494294205e-06, + "loss": 0.0206, + "step": 3220 + }, + { + "epoch": 1.1644974692697034, + "grad_norm": 2.338943366633161, + "learning_rate": 4.062506193729704e-06, + "loss": 0.2344, + "step": 3221 + }, + { + "epoch": 1.1648590021691974, + "grad_norm": 0.35829947683674984, + "learning_rate": 4.059570228225291e-06, + "loss": 0.0258, + "step": 3222 + }, + { + "epoch": 1.1652205350686913, + "grad_norm": 0.032784442247252674, + "learning_rate": 4.056634598830282e-06, + "loss": 0.0013, + "step": 3223 + }, + { + "epoch": 1.1655820679681852, + "grad_norm": 0.31898986239939936, + "learning_rate": 4.0536993065938655e-06, + "loss": 0.0103, + "step": 3224 + }, + { + "epoch": 1.165943600867679, + "grad_norm": 0.409659574570131, + "learning_rate": 4.050764352565119e-06, + "loss": 0.0391, + "step": 3225 + }, + { + "epoch": 1.1663051337671728, + "grad_norm": 0.24766110639319414, + "learning_rate": 4.047829737792991e-06, + "loss": 0.0317, + "step": 3226 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 1.7938802965496345, + "learning_rate": 4.0448954633263145e-06, + "loss": 0.1191, + "step": 3227 + }, + { + "epoch": 1.1670281995661604, + "grad_norm": 0.6355503894063134, + "learning_rate": 4.041961530213799e-06, + "loss": 0.0957, + "step": 3228 + }, + { + "epoch": 1.1673897324656544, + "grad_norm": 0.40245520590347783, + "learning_rate": 4.039027939504028e-06, + "loss": 0.0258, + "step": 3229 + }, + { + "epoch": 1.1677512653651483, + "grad_norm": 1.0395751289784252, + "learning_rate": 4.03609469224547e-06, + "loss": 0.0815, + "step": 3230 + }, + { + "epoch": 1.1681127982646422, + "grad_norm": 0.5772904793396831, + "learning_rate": 4.033161789486465e-06, + "loss": 0.0476, + "step": 3231 + }, + { + "epoch": 1.168474331164136, + "grad_norm": 0.174416458375091, + "learning_rate": 4.030229232275233e-06, + "loss": 0.0229, + "step": 3232 + }, + { + "epoch": 1.1688358640636298, + "grad_norm": 1.1933994453137755, + "learning_rate": 4.02729702165987e-06, + "loss": 0.1455, + "step": 3233 + }, + { + "epoch": 1.1691973969631237, + "grad_norm": 0.15114590961356816, + "learning_rate": 4.024365158688344e-06, + "loss": 0.0146, + "step": 3234 + }, + { + "epoch": 1.1695589298626174, + "grad_norm": 0.15082516790067718, + "learning_rate": 4.021433644408506e-06, + "loss": 0.0131, + "step": 3235 + }, + { + "epoch": 1.1699204627621114, + "grad_norm": 0.15263859882038727, + "learning_rate": 4.018502479868075e-06, + "loss": 0.0057, + "step": 3236 + }, + { + "epoch": 1.1702819956616053, + "grad_norm": 1.0851272774450607, + "learning_rate": 4.0155716661146515e-06, + "loss": 0.0117, + "step": 3237 + }, + { + "epoch": 1.170643528561099, + "grad_norm": 0.13420528658801795, + "learning_rate": 4.012641204195709e-06, + "loss": 0.0206, + "step": 3238 + }, + { + "epoch": 1.171005061460593, + "grad_norm": 0.7442922511412088, + "learning_rate": 4.009711095158588e-06, + "loss": 0.1279, + "step": 3239 + }, + { + "epoch": 1.1713665943600868, + "grad_norm": 1.2280724291029108, + "learning_rate": 4.006781340050515e-06, + "loss": 0.0476, + "step": 3240 + }, + { + "epoch": 1.1717281272595805, + "grad_norm": 0.5023744800675524, + "learning_rate": 4.00385193991858e-06, + "loss": 0.0391, + "step": 3241 + }, + { + "epoch": 1.1720896601590745, + "grad_norm": 0.15531848138644774, + "learning_rate": 4.000922895809752e-06, + "loss": 0.0205, + "step": 3242 + }, + { + "epoch": 1.1724511930585684, + "grad_norm": 0.21408054745049968, + "learning_rate": 3.997994208770873e-06, + "loss": 0.0184, + "step": 3243 + }, + { + "epoch": 1.172812725958062, + "grad_norm": 0.05827084325406384, + "learning_rate": 3.995065879848648e-06, + "loss": 0.0024, + "step": 3244 + }, + { + "epoch": 1.173174258857556, + "grad_norm": 0.15603646153878506, + "learning_rate": 3.992137910089668e-06, + "loss": 0.0229, + "step": 3245 + }, + { + "epoch": 1.17353579175705, + "grad_norm": 0.4186250250133814, + "learning_rate": 3.9892103005403845e-06, + "loss": 0.0146, + "step": 3246 + }, + { + "epoch": 1.1738973246565438, + "grad_norm": 0.11041284772043865, + "learning_rate": 3.986283052247127e-06, + "loss": 0.0117, + "step": 3247 + }, + { + "epoch": 1.1742588575560375, + "grad_norm": 0.5402127550300185, + "learning_rate": 3.983356166256094e-06, + "loss": 0.0146, + "step": 3248 + }, + { + "epoch": 1.1746203904555315, + "grad_norm": 0.10133826596778688, + "learning_rate": 3.980429643613351e-06, + "loss": 0.0131, + "step": 3249 + }, + { + "epoch": 1.1749819233550254, + "grad_norm": 0.15590449494046213, + "learning_rate": 3.9775034853648386e-06, + "loss": 0.0117, + "step": 3250 + }, + { + "epoch": 1.175343456254519, + "grad_norm": 0.8166561260027388, + "learning_rate": 3.974577692556364e-06, + "loss": 0.1367, + "step": 3251 + }, + { + "epoch": 1.175704989154013, + "grad_norm": 0.5655340170801814, + "learning_rate": 3.971652266233607e-06, + "loss": 0.0286, + "step": 3252 + }, + { + "epoch": 1.176066522053507, + "grad_norm": 0.2152841797214319, + "learning_rate": 3.968727207442114e-06, + "loss": 0.0286, + "step": 3253 + }, + { + "epoch": 1.1764280549530008, + "grad_norm": 0.2152316929049899, + "learning_rate": 3.965802517227297e-06, + "loss": 0.0146, + "step": 3254 + }, + { + "epoch": 1.1767895878524945, + "grad_norm": 0.21192714731748377, + "learning_rate": 3.962878196634444e-06, + "loss": 0.0146, + "step": 3255 + }, + { + "epoch": 1.1771511207519885, + "grad_norm": 0.314625652867792, + "learning_rate": 3.959954246708703e-06, + "loss": 0.0231, + "step": 3256 + }, + { + "epoch": 1.1775126536514824, + "grad_norm": 0.6358053782626316, + "learning_rate": 3.957030668495095e-06, + "loss": 0.0206, + "step": 3257 + }, + { + "epoch": 1.177874186550976, + "grad_norm": 0.9831689037698522, + "learning_rate": 3.954107463038506e-06, + "loss": 0.0576, + "step": 3258 + }, + { + "epoch": 1.17823571945047, + "grad_norm": 0.13706996155458165, + "learning_rate": 3.95118463138369e-06, + "loss": 0.0165, + "step": 3259 + }, + { + "epoch": 1.178597252349964, + "grad_norm": 0.9851945678182122, + "learning_rate": 3.948262174575266e-06, + "loss": 0.1279, + "step": 3260 + }, + { + "epoch": 1.1789587852494576, + "grad_norm": 0.2515427522133576, + "learning_rate": 3.945340093657717e-06, + "loss": 0.0229, + "step": 3261 + }, + { + "epoch": 1.1793203181489516, + "grad_norm": 0.5758093390358348, + "learning_rate": 3.942418389675396e-06, + "loss": 0.0231, + "step": 3262 + }, + { + "epoch": 1.1796818510484455, + "grad_norm": 1.6666726536211163, + "learning_rate": 3.93949706367252e-06, + "loss": 0.1035, + "step": 3263 + }, + { + "epoch": 1.1800433839479392, + "grad_norm": 0.9750555210577815, + "learning_rate": 3.93657611669317e-06, + "loss": 0.0317, + "step": 3264 + }, + { + "epoch": 1.180404916847433, + "grad_norm": 0.6518121699973077, + "learning_rate": 3.933655549781292e-06, + "loss": 0.1113, + "step": 3265 + }, + { + "epoch": 1.180766449746927, + "grad_norm": 0.010805977678779787, + "learning_rate": 3.930735363980693e-06, + "loss": 0.0003, + "step": 3266 + }, + { + "epoch": 1.1811279826464207, + "grad_norm": 0.17369758758956608, + "learning_rate": 3.927815560335051e-06, + "loss": 0.0131, + "step": 3267 + }, + { + "epoch": 1.1814895155459146, + "grad_norm": 1.3825284971025655, + "learning_rate": 3.9248961398879006e-06, + "loss": 0.1113, + "step": 3268 + }, + { + "epoch": 1.1818510484454086, + "grad_norm": 0.32488943699225853, + "learning_rate": 3.921977103682645e-06, + "loss": 0.0317, + "step": 3269 + }, + { + "epoch": 1.1822125813449025, + "grad_norm": 0.7925653475339064, + "learning_rate": 3.919058452762544e-06, + "loss": 0.0889, + "step": 3270 + }, + { + "epoch": 1.1825741142443962, + "grad_norm": 0.23832465504412212, + "learning_rate": 3.916140188170723e-06, + "loss": 0.0165, + "step": 3271 + }, + { + "epoch": 1.18293564714389, + "grad_norm": 0.19446040528012218, + "learning_rate": 3.91322231095017e-06, + "loss": 0.0146, + "step": 3272 + }, + { + "epoch": 1.183297180043384, + "grad_norm": 1.3858680504686023, + "learning_rate": 3.910304822143734e-06, + "loss": 0.063, + "step": 3273 + }, + { + "epoch": 1.1836587129428777, + "grad_norm": 0.0412413603794393, + "learning_rate": 3.907387722794125e-06, + "loss": 0.0009, + "step": 3274 + }, + { + "epoch": 1.1840202458423716, + "grad_norm": 0.013176210565869878, + "learning_rate": 3.904471013943914e-06, + "loss": 0.0002, + "step": 3275 + }, + { + "epoch": 1.1843817787418656, + "grad_norm": 0.342855263498064, + "learning_rate": 3.90155469663553e-06, + "loss": 0.0354, + "step": 3276 + }, + { + "epoch": 1.1847433116413595, + "grad_norm": 0.6737068248349456, + "learning_rate": 3.898638771911266e-06, + "loss": 0.0752, + "step": 3277 + }, + { + "epoch": 1.1851048445408532, + "grad_norm": 0.39245433440785915, + "learning_rate": 3.895723240813272e-06, + "loss": 0.0354, + "step": 3278 + }, + { + "epoch": 1.185466377440347, + "grad_norm": 0.0008772630851158605, + "learning_rate": 3.892808104383559e-06, + "loss": 0.0, + "step": 3279 + }, + { + "epoch": 1.185827910339841, + "grad_norm": 1.0196687216875604, + "learning_rate": 3.889893363663998e-06, + "loss": 0.0815, + "step": 3280 + }, + { + "epoch": 1.1861894432393347, + "grad_norm": 0.16158472802045717, + "learning_rate": 3.886979019696312e-06, + "loss": 0.0103, + "step": 3281 + }, + { + "epoch": 1.1865509761388287, + "grad_norm": 2.265087513509196, + "learning_rate": 3.88406507352209e-06, + "loss": 0.1035, + "step": 3282 + }, + { + "epoch": 1.1869125090383226, + "grad_norm": 0.0006761326660103409, + "learning_rate": 3.881151526182774e-06, + "loss": 0.0, + "step": 3283 + }, + { + "epoch": 1.1872740419378163, + "grad_norm": 0.8319833325061897, + "learning_rate": 3.8782383787196685e-06, + "loss": 0.0286, + "step": 3284 + }, + { + "epoch": 1.1876355748373102, + "grad_norm": 0.027272118351089922, + "learning_rate": 3.87532563217393e-06, + "loss": 0.0008, + "step": 3285 + }, + { + "epoch": 1.1879971077368041, + "grad_norm": 0.17414341847364903, + "learning_rate": 3.872413287586572e-06, + "loss": 0.0206, + "step": 3286 + }, + { + "epoch": 1.1883586406362978, + "grad_norm": 1.1312949972251987, + "learning_rate": 3.869501345998467e-06, + "loss": 0.0688, + "step": 3287 + }, + { + "epoch": 1.1887201735357917, + "grad_norm": 0.01054560213747547, + "learning_rate": 3.866589808450342e-06, + "loss": 0.0001, + "step": 3288 + }, + { + "epoch": 1.1890817064352857, + "grad_norm": 0.7115484722471581, + "learning_rate": 3.863678675982782e-06, + "loss": 0.1455, + "step": 3289 + }, + { + "epoch": 1.1894432393347794, + "grad_norm": 0.03604763097014493, + "learning_rate": 3.860767949636223e-06, + "loss": 0.0013, + "step": 3290 + }, + { + "epoch": 1.1898047722342733, + "grad_norm": 0.03341669712712224, + "learning_rate": 3.857857630450957e-06, + "loss": 0.0007, + "step": 3291 + }, + { + "epoch": 1.1901663051337672, + "grad_norm": 0.1586288258268024, + "learning_rate": 3.854947719467134e-06, + "loss": 0.0206, + "step": 3292 + }, + { + "epoch": 1.1905278380332611, + "grad_norm": 1.7017707253407202, + "learning_rate": 3.8520382177247525e-06, + "loss": 0.1035, + "step": 3293 + }, + { + "epoch": 1.1908893709327548, + "grad_norm": 1.3066481168595698, + "learning_rate": 3.849129126263671e-06, + "loss": 0.1191, + "step": 3294 + }, + { + "epoch": 1.1912509038322487, + "grad_norm": 0.3955047329345974, + "learning_rate": 3.846220446123599e-06, + "loss": 0.0255, + "step": 3295 + }, + { + "epoch": 1.1916124367317427, + "grad_norm": 0.15577719926919145, + "learning_rate": 3.843312178344093e-06, + "loss": 0.0206, + "step": 3296 + }, + { + "epoch": 1.1919739696312364, + "grad_norm": 0.6450168061427249, + "learning_rate": 3.840404323964572e-06, + "loss": 0.1455, + "step": 3297 + }, + { + "epoch": 1.1923355025307303, + "grad_norm": 0.6783170008566503, + "learning_rate": 3.837496884024299e-06, + "loss": 0.0432, + "step": 3298 + }, + { + "epoch": 1.1926970354302242, + "grad_norm": 0.13968620183687014, + "learning_rate": 3.834589859562396e-06, + "loss": 0.0165, + "step": 3299 + }, + { + "epoch": 1.1930585683297181, + "grad_norm": 3.5368900415761564, + "learning_rate": 3.831683251617832e-06, + "loss": 0.2988, + "step": 3300 + }, + { + "epoch": 1.1934201012292118, + "grad_norm": 0.6860895784179416, + "learning_rate": 3.828777061229426e-06, + "loss": 0.1367, + "step": 3301 + }, + { + "epoch": 1.1937816341287057, + "grad_norm": 0.8234189513572108, + "learning_rate": 3.825871289435851e-06, + "loss": 0.0957, + "step": 3302 + }, + { + "epoch": 1.1941431670281997, + "grad_norm": 0.08263885041729875, + "learning_rate": 3.822965937275629e-06, + "loss": 0.0022, + "step": 3303 + }, + { + "epoch": 1.1945046999276934, + "grad_norm": 0.18173912262635222, + "learning_rate": 3.820061005787133e-06, + "loss": 0.0229, + "step": 3304 + }, + { + "epoch": 1.1948662328271873, + "grad_norm": 1.3319472373570977, + "learning_rate": 3.817156496008587e-06, + "loss": 0.0391, + "step": 3305 + }, + { + "epoch": 1.1952277657266812, + "grad_norm": 0.24805513466579313, + "learning_rate": 3.8142524089780564e-06, + "loss": 0.0255, + "step": 3306 + }, + { + "epoch": 1.195589298626175, + "grad_norm": 0.0033076410500417664, + "learning_rate": 3.8113487457334657e-06, + "loss": 0.0001, + "step": 3307 + }, + { + "epoch": 1.1959508315256688, + "grad_norm": 0.1792849424049055, + "learning_rate": 3.808445507312582e-06, + "loss": 0.0229, + "step": 3308 + }, + { + "epoch": 1.1963123644251628, + "grad_norm": 0.031008281525545238, + "learning_rate": 3.805542694753023e-06, + "loss": 0.0007, + "step": 3309 + }, + { + "epoch": 1.1966738973246565, + "grad_norm": 0.1423147119997822, + "learning_rate": 3.8026403090922544e-06, + "loss": 0.0045, + "step": 3310 + }, + { + "epoch": 1.1970354302241504, + "grad_norm": 0.25855153973997436, + "learning_rate": 3.799738351367584e-06, + "loss": 0.0146, + "step": 3311 + }, + { + "epoch": 1.1973969631236443, + "grad_norm": 0.6983932103718338, + "learning_rate": 3.7968368226161743e-06, + "loss": 0.0525, + "step": 3312 + }, + { + "epoch": 1.197758496023138, + "grad_norm": 0.6130348750218367, + "learning_rate": 3.7939357238750302e-06, + "loss": 0.0432, + "step": 3313 + }, + { + "epoch": 1.198120028922632, + "grad_norm": 0.5252475258653932, + "learning_rate": 3.7910350561810045e-06, + "loss": 0.0391, + "step": 3314 + }, + { + "epoch": 1.1984815618221258, + "grad_norm": 0.10989656152248042, + "learning_rate": 3.788134820570796e-06, + "loss": 0.0027, + "step": 3315 + }, + { + "epoch": 1.1988430947216198, + "grad_norm": 0.5338687811445197, + "learning_rate": 3.7852350180809437e-06, + "loss": 0.0391, + "step": 3316 + }, + { + "epoch": 1.1992046276211135, + "grad_norm": 0.16619054777261222, + "learning_rate": 3.7823356497478414e-06, + "loss": 0.0131, + "step": 3317 + }, + { + "epoch": 1.1995661605206074, + "grad_norm": 0.4646978691058429, + "learning_rate": 3.7794367166077194e-06, + "loss": 0.0432, + "step": 3318 + }, + { + "epoch": 1.1999276934201013, + "grad_norm": 0.4627728024277318, + "learning_rate": 3.7765382196966588e-06, + "loss": 0.0432, + "step": 3319 + }, + { + "epoch": 1.200289226319595, + "grad_norm": 0.2839977633912662, + "learning_rate": 3.773640160050581e-06, + "loss": 0.0146, + "step": 3320 + }, + { + "epoch": 1.200650759219089, + "grad_norm": 0.3866333354554876, + "learning_rate": 3.77074253870525e-06, + "loss": 0.0255, + "step": 3321 + }, + { + "epoch": 1.2010122921185828, + "grad_norm": 0.2526911327786038, + "learning_rate": 3.7678453566962763e-06, + "loss": 0.0286, + "step": 3322 + }, + { + "epoch": 1.2013738250180768, + "grad_norm": 0.230640993179104, + "learning_rate": 3.7649486150591115e-06, + "loss": 0.0255, + "step": 3323 + }, + { + "epoch": 1.2017353579175705, + "grad_norm": 0.03144790777483809, + "learning_rate": 3.7620523148290517e-06, + "loss": 0.0008, + "step": 3324 + }, + { + "epoch": 1.2020968908170644, + "grad_norm": 0.6101828616030824, + "learning_rate": 3.7591564570412343e-06, + "loss": 0.0432, + "step": 3325 + }, + { + "epoch": 1.2024584237165583, + "grad_norm": 0.10082711227454848, + "learning_rate": 3.7562610427306357e-06, + "loss": 0.0131, + "step": 3326 + }, + { + "epoch": 1.202819956616052, + "grad_norm": 0.6372570676531326, + "learning_rate": 3.7533660729320785e-06, + "loss": 0.0354, + "step": 3327 + }, + { + "epoch": 1.203181489515546, + "grad_norm": 0.2182216249444688, + "learning_rate": 3.7504715486802234e-06, + "loss": 0.0057, + "step": 3328 + }, + { + "epoch": 1.2035430224150399, + "grad_norm": 0.29819667861989413, + "learning_rate": 3.7475774710095736e-06, + "loss": 0.0317, + "step": 3329 + }, + { + "epoch": 1.2039045553145336, + "grad_norm": 0.5996335139199439, + "learning_rate": 3.7446838409544708e-06, + "loss": 0.1035, + "step": 3330 + }, + { + "epoch": 1.2042660882140275, + "grad_norm": 0.008876487752425027, + "learning_rate": 3.7417906595490993e-06, + "loss": 0.0002, + "step": 3331 + }, + { + "epoch": 1.2046276211135214, + "grad_norm": 0.057478107090419776, + "learning_rate": 3.7388979278274806e-06, + "loss": 0.001, + "step": 3332 + }, + { + "epoch": 1.204989154013015, + "grad_norm": 0.3728682596899638, + "learning_rate": 3.736005646823475e-06, + "loss": 0.0165, + "step": 3333 + }, + { + "epoch": 1.205350686912509, + "grad_norm": 1.7953493416644117, + "learning_rate": 3.733113817570785e-06, + "loss": 0.0576, + "step": 3334 + }, + { + "epoch": 1.205712219812003, + "grad_norm": 0.9428945279751938, + "learning_rate": 3.7302224411029487e-06, + "loss": 0.0476, + "step": 3335 + }, + { + "epoch": 1.2060737527114966, + "grad_norm": 0.003974732482027209, + "learning_rate": 3.7273315184533465e-06, + "loss": 0.0001, + "step": 3336 + }, + { + "epoch": 1.2064352856109906, + "grad_norm": 4.202441565730771, + "learning_rate": 3.724441050655189e-06, + "loss": 0.5117, + "step": 3337 + }, + { + "epoch": 1.2067968185104845, + "grad_norm": 0.2716121883960844, + "learning_rate": 3.7215510387415305e-06, + "loss": 0.0131, + "step": 3338 + }, + { + "epoch": 1.2071583514099784, + "grad_norm": 0.01660883400098483, + "learning_rate": 3.7186614837452617e-06, + "loss": 0.0004, + "step": 3339 + }, + { + "epoch": 1.207519884309472, + "grad_norm": 0.023383508714539404, + "learning_rate": 3.7157723866991067e-06, + "loss": 0.0002, + "step": 3340 + }, + { + "epoch": 1.207881417208966, + "grad_norm": 1.1627750427130346, + "learning_rate": 3.712883748635633e-06, + "loss": 0.0525, + "step": 3341 + }, + { + "epoch": 1.20824295010846, + "grad_norm": 0.45385891498766223, + "learning_rate": 3.709995570587234e-06, + "loss": 0.0317, + "step": 3342 + }, + { + "epoch": 1.2086044830079536, + "grad_norm": 0.00117924262389793, + "learning_rate": 3.7071078535861447e-06, + "loss": 0.0, + "step": 3343 + }, + { + "epoch": 1.2089660159074476, + "grad_norm": 0.14971181812377718, + "learning_rate": 3.704220598664437e-06, + "loss": 0.0146, + "step": 3344 + }, + { + "epoch": 1.2093275488069415, + "grad_norm": 0.0063067963301950474, + "learning_rate": 3.701333806854013e-06, + "loss": 0.0001, + "step": 3345 + }, + { + "epoch": 1.2096890817064354, + "grad_norm": 0.2125827174339066, + "learning_rate": 3.6984474791866136e-06, + "loss": 0.0206, + "step": 3346 + }, + { + "epoch": 1.210050614605929, + "grad_norm": 0.0035276673537544584, + "learning_rate": 3.69556161669381e-06, + "loss": 0.0001, + "step": 3347 + }, + { + "epoch": 1.210412147505423, + "grad_norm": 0.514829250103369, + "learning_rate": 3.6926762204070086e-06, + "loss": 0.0354, + "step": 3348 + }, + { + "epoch": 1.210773680404917, + "grad_norm": 1.3755657459059347, + "learning_rate": 3.6897912913574505e-06, + "loss": 0.0688, + "step": 3349 + }, + { + "epoch": 1.2111352133044107, + "grad_norm": 0.005580177888173035, + "learning_rate": 3.686906830576208e-06, + "loss": 0.0001, + "step": 3350 + }, + { + "epoch": 1.2114967462039046, + "grad_norm": 0.005432730835552846, + "learning_rate": 3.684022839094189e-06, + "loss": 0.0001, + "step": 3351 + }, + { + "epoch": 1.2118582791033985, + "grad_norm": 1.0590309373002538, + "learning_rate": 3.6811393179421285e-06, + "loss": 0.0432, + "step": 3352 + }, + { + "epoch": 1.2122198120028922, + "grad_norm": 0.5001276648090125, + "learning_rate": 3.6782562681505963e-06, + "loss": 0.0051, + "step": 3353 + }, + { + "epoch": 1.2125813449023861, + "grad_norm": 0.12426119498147019, + "learning_rate": 3.675373690749996e-06, + "loss": 0.0146, + "step": 3354 + }, + { + "epoch": 1.21294287780188, + "grad_norm": 0.13573905394651056, + "learning_rate": 3.672491586770558e-06, + "loss": 0.0022, + "step": 3355 + }, + { + "epoch": 1.2133044107013737, + "grad_norm": 0.25398560818440524, + "learning_rate": 3.6696099572423484e-06, + "loss": 0.0131, + "step": 3356 + }, + { + "epoch": 1.2136659436008677, + "grad_norm": 0.1842097407068306, + "learning_rate": 3.6667288031952584e-06, + "loss": 0.0072, + "step": 3357 + }, + { + "epoch": 1.2140274765003616, + "grad_norm": 0.36632231667381676, + "learning_rate": 3.6638481256590123e-06, + "loss": 0.0317, + "step": 3358 + }, + { + "epoch": 1.2143890093998553, + "grad_norm": 0.6672410582214008, + "learning_rate": 3.6609679256631647e-06, + "loss": 0.1455, + "step": 3359 + }, + { + "epoch": 1.2147505422993492, + "grad_norm": 0.417422493708114, + "learning_rate": 3.6580882042370974e-06, + "loss": 0.0206, + "step": 3360 + }, + { + "epoch": 1.2151120751988431, + "grad_norm": 0.10865289428951255, + "learning_rate": 3.6552089624100244e-06, + "loss": 0.0115, + "step": 3361 + }, + { + "epoch": 1.215473608098337, + "grad_norm": 0.9447747548510905, + "learning_rate": 3.6523302012109835e-06, + "loss": 0.0286, + "step": 3362 + }, + { + "epoch": 1.2158351409978307, + "grad_norm": 0.00710623529135999, + "learning_rate": 3.649451921668843e-06, + "loss": 0.0002, + "step": 3363 + }, + { + "epoch": 1.2161966738973247, + "grad_norm": 2.1029007069980308, + "learning_rate": 3.646574124812302e-06, + "loss": 0.2344, + "step": 3364 + }, + { + "epoch": 1.2165582067968186, + "grad_norm": 1.6342927215871919, + "learning_rate": 3.643696811669882e-06, + "loss": 0.0815, + "step": 3365 + }, + { + "epoch": 1.2169197396963123, + "grad_norm": 0.12104878653807008, + "learning_rate": 3.6408199832699377e-06, + "loss": 0.0115, + "step": 3366 + }, + { + "epoch": 1.2172812725958062, + "grad_norm": 0.8364988738433728, + "learning_rate": 3.6379436406406426e-06, + "loss": 0.0391, + "step": 3367 + }, + { + "epoch": 1.2176428054953001, + "grad_norm": 0.1011455146198225, + "learning_rate": 3.6350677848100025e-06, + "loss": 0.0015, + "step": 3368 + }, + { + "epoch": 1.218004338394794, + "grad_norm": 1.0053413687434434, + "learning_rate": 3.6321924168058487e-06, + "loss": 0.063, + "step": 3369 + }, + { + "epoch": 1.2183658712942878, + "grad_norm": 0.12654136982910294, + "learning_rate": 3.629317537655836e-06, + "loss": 0.0146, + "step": 3370 + }, + { + "epoch": 1.2187274041937817, + "grad_norm": 0.15734934783905993, + "learning_rate": 3.626443148387447e-06, + "loss": 0.0092, + "step": 3371 + }, + { + "epoch": 1.2190889370932756, + "grad_norm": 0.09263741321956558, + "learning_rate": 3.623569250027987e-06, + "loss": 0.0057, + "step": 3372 + }, + { + "epoch": 1.2194504699927693, + "grad_norm": 0.26099233578326164, + "learning_rate": 3.6206958436045856e-06, + "loss": 0.0081, + "step": 3373 + }, + { + "epoch": 1.2198120028922632, + "grad_norm": 0.17616498038305067, + "learning_rate": 3.617822930144199e-06, + "loss": 0.0229, + "step": 3374 + }, + { + "epoch": 1.2201735357917571, + "grad_norm": 3.313139201077565, + "learning_rate": 3.614950510673605e-06, + "loss": 0.2773, + "step": 3375 + }, + { + "epoch": 1.2205350686912508, + "grad_norm": 0.034128367110015455, + "learning_rate": 3.6120785862194075e-06, + "loss": 0.0008, + "step": 3376 + }, + { + "epoch": 1.2208966015907448, + "grad_norm": 0.07127394983582508, + "learning_rate": 3.6092071578080306e-06, + "loss": 0.0027, + "step": 3377 + }, + { + "epoch": 1.2212581344902387, + "grad_norm": 0.23733249260842243, + "learning_rate": 3.60633622646572e-06, + "loss": 0.0229, + "step": 3378 + }, + { + "epoch": 1.2216196673897324, + "grad_norm": 3.6289079097607244, + "learning_rate": 3.603465793218549e-06, + "loss": 0.1367, + "step": 3379 + }, + { + "epoch": 1.2219812002892263, + "grad_norm": 0.005096343302747712, + "learning_rate": 3.6005958590924085e-06, + "loss": 0.0001, + "step": 3380 + }, + { + "epoch": 1.2223427331887202, + "grad_norm": 0.01221153414245686, + "learning_rate": 3.5977264251130127e-06, + "loss": 0.0002, + "step": 3381 + }, + { + "epoch": 1.222704266088214, + "grad_norm": 1.1066672189804179, + "learning_rate": 3.5948574923058975e-06, + "loss": 0.0525, + "step": 3382 + }, + { + "epoch": 1.2230657989877078, + "grad_norm": 0.02823361583897005, + "learning_rate": 3.591989061696417e-06, + "loss": 0.0006, + "step": 3383 + }, + { + "epoch": 1.2234273318872018, + "grad_norm": 0.39048455667885923, + "learning_rate": 3.5891211343097492e-06, + "loss": 0.0255, + "step": 3384 + }, + { + "epoch": 1.2237888647866957, + "grad_norm": 0.1195108891516006, + "learning_rate": 3.5862537111708895e-06, + "loss": 0.0022, + "step": 3385 + }, + { + "epoch": 1.2241503976861894, + "grad_norm": 0.11890020458402036, + "learning_rate": 3.583386793304655e-06, + "loss": 0.0131, + "step": 3386 + }, + { + "epoch": 1.2245119305856833, + "grad_norm": 0.43736110235422226, + "learning_rate": 3.5805203817356837e-06, + "loss": 0.0286, + "step": 3387 + }, + { + "epoch": 1.2248734634851772, + "grad_norm": 0.7679234699012171, + "learning_rate": 3.5776544774884263e-06, + "loss": 0.0258, + "step": 3388 + }, + { + "epoch": 1.225234996384671, + "grad_norm": 0.3506377676483006, + "learning_rate": 3.5747890815871596e-06, + "loss": 0.0206, + "step": 3389 + }, + { + "epoch": 1.2255965292841648, + "grad_norm": 0.599531287874885, + "learning_rate": 3.5719241950559726e-06, + "loss": 0.0231, + "step": 3390 + }, + { + "epoch": 1.2259580621836588, + "grad_norm": 0.012624403984825686, + "learning_rate": 3.5690598189187787e-06, + "loss": 0.0002, + "step": 3391 + }, + { + "epoch": 1.2263195950831527, + "grad_norm": 0.29834676523101095, + "learning_rate": 3.566195954199304e-06, + "loss": 0.0206, + "step": 3392 + }, + { + "epoch": 1.2266811279826464, + "grad_norm": 0.4422637908158175, + "learning_rate": 3.5633326019210914e-06, + "loss": 0.0206, + "step": 3393 + }, + { + "epoch": 1.2270426608821403, + "grad_norm": 2.214342337124005, + "learning_rate": 3.5604697631075035e-06, + "loss": 0.3105, + "step": 3394 + }, + { + "epoch": 1.2274041937816342, + "grad_norm": 0.9176373284858413, + "learning_rate": 3.5576074387817184e-06, + "loss": 0.1367, + "step": 3395 + }, + { + "epoch": 1.227765726681128, + "grad_norm": 0.13917996231887556, + "learning_rate": 3.554745629966731e-06, + "loss": 0.0131, + "step": 3396 + }, + { + "epoch": 1.2281272595806219, + "grad_norm": 1.034032198503509, + "learning_rate": 3.5518843376853497e-06, + "loss": 0.1113, + "step": 3397 + }, + { + "epoch": 1.2284887924801158, + "grad_norm": 0.014549227710260617, + "learning_rate": 3.549023562960202e-06, + "loss": 0.0003, + "step": 3398 + }, + { + "epoch": 1.2288503253796095, + "grad_norm": 0.3869933605923945, + "learning_rate": 3.5461633068137256e-06, + "loss": 0.0255, + "step": 3399 + }, + { + "epoch": 1.2292118582791034, + "grad_norm": 0.06746615021505298, + "learning_rate": 3.543303570268176e-06, + "loss": 0.0027, + "step": 3400 + }, + { + "epoch": 1.2295733911785973, + "grad_norm": 0.3026769410595853, + "learning_rate": 3.540444354345624e-06, + "loss": 0.0146, + "step": 3401 + }, + { + "epoch": 1.229934924078091, + "grad_norm": 0.1297650220491593, + "learning_rate": 3.53758566006795e-06, + "loss": 0.0115, + "step": 3402 + }, + { + "epoch": 1.230296456977585, + "grad_norm": 2.565572057602732, + "learning_rate": 3.534727488456856e-06, + "loss": 0.2676, + "step": 3403 + }, + { + "epoch": 1.2306579898770789, + "grad_norm": 0.14329332073579598, + "learning_rate": 3.5318698405338458e-06, + "loss": 0.0131, + "step": 3404 + }, + { + "epoch": 1.2310195227765726, + "grad_norm": 0.5006277767461323, + "learning_rate": 3.529012717320245e-06, + "loss": 0.0391, + "step": 3405 + }, + { + "epoch": 1.2313810556760665, + "grad_norm": 0.6083427598341375, + "learning_rate": 3.5261561198371887e-06, + "loss": 0.0286, + "step": 3406 + }, + { + "epoch": 1.2317425885755604, + "grad_norm": 0.026841916260829837, + "learning_rate": 3.5233000491056236e-06, + "loss": 0.0007, + "step": 3407 + }, + { + "epoch": 1.2321041214750543, + "grad_norm": 1.284542353188182, + "learning_rate": 3.520444506146311e-06, + "loss": 0.0576, + "step": 3408 + }, + { + "epoch": 1.232465654374548, + "grad_norm": 0.5585866037973901, + "learning_rate": 3.5175894919798186e-06, + "loss": 0.0354, + "step": 3409 + }, + { + "epoch": 1.232827187274042, + "grad_norm": 0.515948796148921, + "learning_rate": 3.5147350076265287e-06, + "loss": 0.0286, + "step": 3410 + }, + { + "epoch": 1.2331887201735359, + "grad_norm": 2.3040507127004384, + "learning_rate": 3.511881054106634e-06, + "loss": 0.2988, + "step": 3411 + }, + { + "epoch": 1.2335502530730296, + "grad_norm": 0.9381790093003757, + "learning_rate": 3.5090276324401353e-06, + "loss": 0.1113, + "step": 3412 + }, + { + "epoch": 1.2339117859725235, + "grad_norm": 0.7734850788922631, + "learning_rate": 3.5061747436468485e-06, + "loss": 0.0525, + "step": 3413 + }, + { + "epoch": 1.2342733188720174, + "grad_norm": 1.3101244637775724, + "learning_rate": 3.5033223887463918e-06, + "loss": 0.0476, + "step": 3414 + }, + { + "epoch": 1.234634851771511, + "grad_norm": 0.19291128104960403, + "learning_rate": 3.5004705687581963e-06, + "loss": 0.0184, + "step": 3415 + }, + { + "epoch": 1.234996384671005, + "grad_norm": 0.5947750229146954, + "learning_rate": 3.4976192847015045e-06, + "loss": 0.0391, + "step": 3416 + }, + { + "epoch": 1.235357917570499, + "grad_norm": 0.162888249489684, + "learning_rate": 3.494768537595362e-06, + "loss": 0.0064, + "step": 3417 + }, + { + "epoch": 1.2357194504699929, + "grad_norm": 1.3678609521480884, + "learning_rate": 3.491918328458629e-06, + "loss": 0.063, + "step": 3418 + }, + { + "epoch": 1.2360809833694866, + "grad_norm": 0.10365411492161089, + "learning_rate": 3.489068658309965e-06, + "loss": 0.0022, + "step": 3419 + }, + { + "epoch": 1.2364425162689805, + "grad_norm": 0.01337833542258149, + "learning_rate": 3.486219528167844e-06, + "loss": 0.0002, + "step": 3420 + }, + { + "epoch": 1.2368040491684744, + "grad_norm": 0.5181911800895478, + "learning_rate": 3.4833709390505443e-06, + "loss": 0.0255, + "step": 3421 + }, + { + "epoch": 1.2371655820679681, + "grad_norm": 0.957845511004752, + "learning_rate": 3.48052289197615e-06, + "loss": 0.063, + "step": 3422 + }, + { + "epoch": 1.237527114967462, + "grad_norm": 0.874588998947053, + "learning_rate": 3.4776753879625563e-06, + "loss": 0.0317, + "step": 3423 + }, + { + "epoch": 1.237888647866956, + "grad_norm": 0.2712071417438692, + "learning_rate": 3.4748284280274557e-06, + "loss": 0.0165, + "step": 3424 + }, + { + "epoch": 1.2382501807664497, + "grad_norm": 1.0991526381441454, + "learning_rate": 3.471982013188353e-06, + "loss": 0.063, + "step": 3425 + }, + { + "epoch": 1.2386117136659436, + "grad_norm": 0.02026631937626935, + "learning_rate": 3.4691361444625564e-06, + "loss": 0.0006, + "step": 3426 + }, + { + "epoch": 1.2389732465654375, + "grad_norm": 0.6983849794104716, + "learning_rate": 3.4662908228671776e-06, + "loss": 0.0286, + "step": 3427 + }, + { + "epoch": 1.2393347794649312, + "grad_norm": 1.691634930552244, + "learning_rate": 3.463446049419138e-06, + "loss": 0.0752, + "step": 3428 + }, + { + "epoch": 1.2396963123644251, + "grad_norm": 3.456499195763383, + "learning_rate": 3.460601825135155e-06, + "loss": 0.1279, + "step": 3429 + }, + { + "epoch": 1.240057845263919, + "grad_norm": 0.1250796346488491, + "learning_rate": 3.457758151031753e-06, + "loss": 0.0103, + "step": 3430 + }, + { + "epoch": 1.240419378163413, + "grad_norm": 0.032228994824531156, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.0001, + "step": 3431 + }, + { + "epoch": 1.2407809110629067, + "grad_norm": 0.3726209981840686, + "learning_rate": 3.452072457431816e-06, + "loss": 0.0146, + "step": 3432 + }, + { + "epoch": 1.2411424439624006, + "grad_norm": 0.19882510874529694, + "learning_rate": 3.4492304399673476e-06, + "loss": 0.0146, + "step": 3433 + }, + { + "epoch": 1.2415039768618945, + "grad_norm": 1.3326799734445574, + "learning_rate": 3.4463889767475917e-06, + "loss": 0.1641, + "step": 3434 + }, + { + "epoch": 1.2418655097613882, + "grad_norm": 0.0826947653271377, + "learning_rate": 3.4435480687880867e-06, + "loss": 0.0064, + "step": 3435 + }, + { + "epoch": 1.2422270426608821, + "grad_norm": 0.6921841645222848, + "learning_rate": 3.4407077171041748e-06, + "loss": 0.0317, + "step": 3436 + }, + { + "epoch": 1.242588575560376, + "grad_norm": 0.60229334143831, + "learning_rate": 3.4378679227109936e-06, + "loss": 0.0352, + "step": 3437 + }, + { + "epoch": 1.2429501084598698, + "grad_norm": 0.08339083006861739, + "learning_rate": 3.43502868662349e-06, + "loss": 0.0035, + "step": 3438 + }, + { + "epoch": 1.2433116413593637, + "grad_norm": 1.05453744797806, + "learning_rate": 3.4321900098564024e-06, + "loss": 0.1836, + "step": 3439 + }, + { + "epoch": 1.2436731742588576, + "grad_norm": 0.2502528992444709, + "learning_rate": 3.429351893424273e-06, + "loss": 0.0146, + "step": 3440 + }, + { + "epoch": 1.2440347071583515, + "grad_norm": 0.19000537374674858, + "learning_rate": 3.4265143383414473e-06, + "loss": 0.0072, + "step": 3441 + }, + { + "epoch": 1.2443962400578452, + "grad_norm": 0.5391963220244498, + "learning_rate": 3.4236773456220633e-06, + "loss": 0.0286, + "step": 3442 + }, + { + "epoch": 1.2447577729573391, + "grad_norm": 0.26339762923815346, + "learning_rate": 3.420840916280066e-06, + "loss": 0.0205, + "step": 3443 + }, + { + "epoch": 1.245119305856833, + "grad_norm": 0.18078909736884952, + "learning_rate": 3.41800505132919e-06, + "loss": 0.0057, + "step": 3444 + }, + { + "epoch": 1.2454808387563268, + "grad_norm": 0.08371659494078491, + "learning_rate": 3.415169751782974e-06, + "loss": 0.0035, + "step": 3445 + }, + { + "epoch": 1.2458423716558207, + "grad_norm": 0.2842801473587209, + "learning_rate": 3.412335018654756e-06, + "loss": 0.0206, + "step": 3446 + }, + { + "epoch": 1.2462039045553146, + "grad_norm": 0.2392403452515833, + "learning_rate": 3.4095008529576655e-06, + "loss": 0.0206, + "step": 3447 + }, + { + "epoch": 1.2465654374548083, + "grad_norm": 0.0036420128335128707, + "learning_rate": 3.4066672557046372e-06, + "loss": 0.0001, + "step": 3448 + }, + { + "epoch": 1.2469269703543022, + "grad_norm": 0.31736120196561196, + "learning_rate": 3.4038342279083934e-06, + "loss": 0.0117, + "step": 3449 + }, + { + "epoch": 1.2472885032537961, + "grad_norm": 0.871225103070157, + "learning_rate": 3.40100177058146e-06, + "loss": 0.1455, + "step": 3450 + }, + { + "epoch": 1.2476500361532898, + "grad_norm": 1.9710142069991405, + "learning_rate": 3.3981698847361567e-06, + "loss": 0.1738, + "step": 3451 + }, + { + "epoch": 1.2480115690527838, + "grad_norm": 0.8174949174449775, + "learning_rate": 3.3953385713845976e-06, + "loss": 0.0476, + "step": 3452 + }, + { + "epoch": 1.2483731019522777, + "grad_norm": 1.8600801931061308, + "learning_rate": 3.3925078315386963e-06, + "loss": 0.0957, + "step": 3453 + }, + { + "epoch": 1.2487346348517716, + "grad_norm": 0.6513929909643129, + "learning_rate": 3.389677666210156e-06, + "loss": 0.0432, + "step": 3454 + }, + { + "epoch": 1.2490961677512653, + "grad_norm": 0.6257414055273273, + "learning_rate": 3.3868480764104762e-06, + "loss": 0.0391, + "step": 3455 + }, + { + "epoch": 1.2494577006507592, + "grad_norm": 0.16629648984188158, + "learning_rate": 3.384019063150955e-06, + "loss": 0.0131, + "step": 3456 + }, + { + "epoch": 1.2498192335502532, + "grad_norm": 1.157183525674392, + "learning_rate": 3.381190627442679e-06, + "loss": 0.0576, + "step": 3457 + }, + { + "epoch": 1.2501807664497468, + "grad_norm": 0.38322822849106614, + "learning_rate": 3.3783627702965326e-06, + "loss": 0.0286, + "step": 3458 + }, + { + "epoch": 1.2505422993492408, + "grad_norm": 0.7178422781529427, + "learning_rate": 3.3755354927231892e-06, + "loss": 0.0889, + "step": 3459 + }, + { + "epoch": 1.2509038322487347, + "grad_norm": 1.4707132629704491, + "learning_rate": 3.372708795733116e-06, + "loss": 0.0576, + "step": 3460 + }, + { + "epoch": 1.2512653651482286, + "grad_norm": 0.735025833528673, + "learning_rate": 3.3698826803365783e-06, + "loss": 0.063, + "step": 3461 + }, + { + "epoch": 1.2516268980477223, + "grad_norm": 0.06838926449435065, + "learning_rate": 3.367057147543624e-06, + "loss": 0.0027, + "step": 3462 + }, + { + "epoch": 1.2519884309472162, + "grad_norm": 0.9037820870014964, + "learning_rate": 3.3642321983641035e-06, + "loss": 0.0815, + "step": 3463 + }, + { + "epoch": 1.2523499638467102, + "grad_norm": 0.039011970724716204, + "learning_rate": 3.3614078338076494e-06, + "loss": 0.001, + "step": 3464 + }, + { + "epoch": 1.2527114967462039, + "grad_norm": 0.02377359648483476, + "learning_rate": 3.3585840548836878e-06, + "loss": 0.0006, + "step": 3465 + }, + { + "epoch": 1.2530730296456978, + "grad_norm": 1.2682731493830641, + "learning_rate": 3.3557608626014402e-06, + "loss": 0.0432, + "step": 3466 + }, + { + "epoch": 1.2534345625451917, + "grad_norm": 0.1951039537905298, + "learning_rate": 3.3529382579699123e-06, + "loss": 0.0184, + "step": 3467 + }, + { + "epoch": 1.2537960954446854, + "grad_norm": 0.2107566160585955, + "learning_rate": 3.3501162419979038e-06, + "loss": 0.0184, + "step": 3468 + }, + { + "epoch": 1.2541576283441793, + "grad_norm": 0.816853387793121, + "learning_rate": 3.347294815694002e-06, + "loss": 0.1641, + "step": 3469 + }, + { + "epoch": 1.2545191612436732, + "grad_norm": 0.0007741083920848124, + "learning_rate": 3.344473980066586e-06, + "loss": 0.0, + "step": 3470 + }, + { + "epoch": 1.254880694143167, + "grad_norm": 1.1197755150333533, + "learning_rate": 3.341653736123819e-06, + "loss": 0.1738, + "step": 3471 + }, + { + "epoch": 1.2552422270426609, + "grad_norm": 0.13717319894096922, + "learning_rate": 3.3388340848736557e-06, + "loss": 0.0064, + "step": 3472 + }, + { + "epoch": 1.2556037599421548, + "grad_norm": 0.055720043587702626, + "learning_rate": 3.3360150273238413e-06, + "loss": 0.0019, + "step": 3473 + }, + { + "epoch": 1.2559652928416485, + "grad_norm": 0.0034938679300012595, + "learning_rate": 3.3331965644819037e-06, + "loss": 0.0001, + "step": 3474 + }, + { + "epoch": 1.2563268257411424, + "grad_norm": 0.1506737469219196, + "learning_rate": 3.3303786973551643e-06, + "loss": 0.0129, + "step": 3475 + }, + { + "epoch": 1.2566883586406363, + "grad_norm": 0.9511995146178492, + "learning_rate": 3.327561426950725e-06, + "loss": 0.0432, + "step": 3476 + }, + { + "epoch": 1.25704989154013, + "grad_norm": 2.5814017173837605, + "learning_rate": 3.324744754275477e-06, + "loss": 0.2344, + "step": 3477 + }, + { + "epoch": 1.257411424439624, + "grad_norm": 0.5374987975640227, + "learning_rate": 3.321928680336103e-06, + "loss": 0.0354, + "step": 3478 + }, + { + "epoch": 1.2577729573391179, + "grad_norm": 0.4316967365938019, + "learning_rate": 3.319113206139062e-06, + "loss": 0.0206, + "step": 3479 + }, + { + "epoch": 1.2581344902386118, + "grad_norm": 0.002217796794303697, + "learning_rate": 3.316298332690609e-06, + "loss": 0.0001, + "step": 3480 + }, + { + "epoch": 1.2584960231381055, + "grad_norm": 1.095689587303309, + "learning_rate": 3.3134840609967756e-06, + "loss": 0.063, + "step": 3481 + }, + { + "epoch": 1.2588575560375994, + "grad_norm": 0.21800662346938054, + "learning_rate": 3.3106703920633814e-06, + "loss": 0.0072, + "step": 3482 + }, + { + "epoch": 1.2592190889370933, + "grad_norm": 0.26791828890846314, + "learning_rate": 3.307857326896034e-06, + "loss": 0.0206, + "step": 3483 + }, + { + "epoch": 1.2595806218365873, + "grad_norm": 0.5165029843318752, + "learning_rate": 3.30504486650012e-06, + "loss": 0.0317, + "step": 3484 + }, + { + "epoch": 1.259942154736081, + "grad_norm": 0.5193821168646988, + "learning_rate": 3.3022330118808142e-06, + "loss": 0.0255, + "step": 3485 + }, + { + "epoch": 1.2603036876355749, + "grad_norm": 0.47522508132662367, + "learning_rate": 3.2994217640430715e-06, + "loss": 0.0317, + "step": 3486 + }, + { + "epoch": 1.2606652205350688, + "grad_norm": 0.2644260990197968, + "learning_rate": 3.2966111239916305e-06, + "loss": 0.0205, + "step": 3487 + }, + { + "epoch": 1.2610267534345625, + "grad_norm": 0.01198807699566875, + "learning_rate": 3.2938010927310147e-06, + "loss": 0.0003, + "step": 3488 + }, + { + "epoch": 1.2613882863340564, + "grad_norm": 0.8043512744333957, + "learning_rate": 3.2909916712655278e-06, + "loss": 0.1113, + "step": 3489 + }, + { + "epoch": 1.2617498192335503, + "grad_norm": 1.8771325781379478, + "learning_rate": 3.2881828605992587e-06, + "loss": 0.0688, + "step": 3490 + }, + { + "epoch": 1.262111352133044, + "grad_norm": 0.17769554335833185, + "learning_rate": 3.2853746617360727e-06, + "loss": 0.0131, + "step": 3491 + }, + { + "epoch": 1.262472885032538, + "grad_norm": 0.6470025351881808, + "learning_rate": 3.28256707567962e-06, + "loss": 0.0391, + "step": 3492 + }, + { + "epoch": 1.2628344179320319, + "grad_norm": 0.19868557089862868, + "learning_rate": 3.2797601034333333e-06, + "loss": 0.0146, + "step": 3493 + }, + { + "epoch": 1.2631959508315256, + "grad_norm": 0.12929791769645463, + "learning_rate": 3.276953746000421e-06, + "loss": 0.0027, + "step": 3494 + }, + { + "epoch": 1.2635574837310195, + "grad_norm": 0.9752881639010685, + "learning_rate": 3.2741480043838793e-06, + "loss": 0.063, + "step": 3495 + }, + { + "epoch": 1.2639190166305134, + "grad_norm": 0.07186742033157567, + "learning_rate": 3.2713428795864743e-06, + "loss": 0.0024, + "step": 3496 + }, + { + "epoch": 1.2642805495300071, + "grad_norm": 0.29316599150398137, + "learning_rate": 3.268538372610759e-06, + "loss": 0.0255, + "step": 3497 + }, + { + "epoch": 1.264642082429501, + "grad_norm": 1.0314845413378013, + "learning_rate": 3.2657344844590644e-06, + "loss": 0.0229, + "step": 3498 + }, + { + "epoch": 1.265003615328995, + "grad_norm": 1.436958287736568, + "learning_rate": 3.262931216133499e-06, + "loss": 0.1279, + "step": 3499 + }, + { + "epoch": 1.2653651482284887, + "grad_norm": 0.279835678186422, + "learning_rate": 3.2601285686359517e-06, + "loss": 0.0042, + "step": 3500 + }, + { + "epoch": 1.2657266811279826, + "grad_norm": 0.4195620885948637, + "learning_rate": 3.2573265429680855e-06, + "loss": 0.0286, + "step": 3501 + }, + { + "epoch": 1.2660882140274765, + "grad_norm": 0.8221536143451448, + "learning_rate": 3.254525140131345e-06, + "loss": 0.0525, + "step": 3502 + }, + { + "epoch": 1.2664497469269704, + "grad_norm": 0.037123131087314465, + "learning_rate": 3.251724361126951e-06, + "loss": 0.0008, + "step": 3503 + }, + { + "epoch": 1.2668112798264641, + "grad_norm": 0.9664248457780763, + "learning_rate": 3.248924206955901e-06, + "loss": 0.063, + "step": 3504 + }, + { + "epoch": 1.267172812725958, + "grad_norm": 0.053490161718602866, + "learning_rate": 3.246124678618972e-06, + "loss": 0.0007, + "step": 3505 + }, + { + "epoch": 1.267534345625452, + "grad_norm": 0.3927043071804653, + "learning_rate": 3.2433257771167116e-06, + "loss": 0.0255, + "step": 3506 + }, + { + "epoch": 1.267895878524946, + "grad_norm": 0.5084920712611378, + "learning_rate": 3.2405275034494468e-06, + "loss": 0.0286, + "step": 3507 + }, + { + "epoch": 1.2682574114244396, + "grad_norm": 1.1802196034912675, + "learning_rate": 3.2377298586172816e-06, + "loss": 0.0576, + "step": 3508 + }, + { + "epoch": 1.2686189443239335, + "grad_norm": 1.5645647967346845, + "learning_rate": 3.234932843620092e-06, + "loss": 0.1279, + "step": 3509 + }, + { + "epoch": 1.2689804772234274, + "grad_norm": 1.586454883241974, + "learning_rate": 3.2321364594575343e-06, + "loss": 0.1191, + "step": 3510 + }, + { + "epoch": 1.2693420101229211, + "grad_norm": 0.34224155302836756, + "learning_rate": 3.229340707129031e-06, + "loss": 0.0229, + "step": 3511 + }, + { + "epoch": 1.269703543022415, + "grad_norm": 0.019146984319274564, + "learning_rate": 3.2265455876337846e-06, + "loss": 0.0002, + "step": 3512 + }, + { + "epoch": 1.270065075921909, + "grad_norm": 0.04703856746241733, + "learning_rate": 3.2237511019707725e-06, + "loss": 0.0017, + "step": 3513 + }, + { + "epoch": 1.2704266088214027, + "grad_norm": 0.17743930554687654, + "learning_rate": 3.22095725113874e-06, + "loss": 0.0146, + "step": 3514 + }, + { + "epoch": 1.2707881417208966, + "grad_norm": 1.0470278756703972, + "learning_rate": 3.218164036136213e-06, + "loss": 0.1641, + "step": 3515 + }, + { + "epoch": 1.2711496746203905, + "grad_norm": 0.10827970818454784, + "learning_rate": 3.2153714579614818e-06, + "loss": 0.0064, + "step": 3516 + }, + { + "epoch": 1.2715112075198842, + "grad_norm": 0.01983409196975966, + "learning_rate": 3.212579517612614e-06, + "loss": 0.0004, + "step": 3517 + }, + { + "epoch": 1.2718727404193781, + "grad_norm": 0.653925158421703, + "learning_rate": 3.2097882160874513e-06, + "loss": 0.0255, + "step": 3518 + }, + { + "epoch": 1.272234273318872, + "grad_norm": 0.006241071331396885, + "learning_rate": 3.2069975543836007e-06, + "loss": 0.0001, + "step": 3519 + }, + { + "epoch": 1.2725958062183658, + "grad_norm": 0.14361575693678058, + "learning_rate": 3.204207533498448e-06, + "loss": 0.0131, + "step": 3520 + }, + { + "epoch": 1.2729573391178597, + "grad_norm": 0.9476181487619428, + "learning_rate": 3.2014181544291424e-06, + "loss": 0.0525, + "step": 3521 + }, + { + "epoch": 1.2733188720173536, + "grad_norm": 0.02033304816922753, + "learning_rate": 3.1986294181726075e-06, + "loss": 0.0004, + "step": 3522 + }, + { + "epoch": 1.2736804049168473, + "grad_norm": 0.178298916585529, + "learning_rate": 3.1958413257255403e-06, + "loss": 0.0131, + "step": 3523 + }, + { + "epoch": 1.2740419378163412, + "grad_norm": 6.988825386827615, + "learning_rate": 3.193053878084401e-06, + "loss": 0.2031, + "step": 3524 + }, + { + "epoch": 1.2744034707158352, + "grad_norm": 0.46861681563473523, + "learning_rate": 3.1902670762454267e-06, + "loss": 0.0476, + "step": 3525 + }, + { + "epoch": 1.274765003615329, + "grad_norm": 0.4862064614699399, + "learning_rate": 3.1874809212046166e-06, + "loss": 0.0354, + "step": 3526 + }, + { + "epoch": 1.2751265365148228, + "grad_norm": 0.8558472998166591, + "learning_rate": 3.1846954139577414e-06, + "loss": 0.0258, + "step": 3527 + }, + { + "epoch": 1.2754880694143167, + "grad_norm": 0.38673514300592315, + "learning_rate": 3.1819105555003426e-06, + "loss": 0.0057, + "step": 3528 + }, + { + "epoch": 1.2758496023138106, + "grad_norm": 0.024319372623277904, + "learning_rate": 3.179126346827727e-06, + "loss": 0.0004, + "step": 3529 + }, + { + "epoch": 1.2762111352133045, + "grad_norm": 0.09178661294541769, + "learning_rate": 3.176342788934973e-06, + "loss": 0.0031, + "step": 3530 + }, + { + "epoch": 1.2765726681127982, + "grad_norm": 1.8894955052678477, + "learning_rate": 3.17355988281692e-06, + "loss": 0.0957, + "step": 3531 + }, + { + "epoch": 1.2769342010122922, + "grad_norm": 2.2815897020515856, + "learning_rate": 3.170777629468179e-06, + "loss": 0.1035, + "step": 3532 + }, + { + "epoch": 1.277295733911786, + "grad_norm": 0.3928470324130623, + "learning_rate": 3.167996029883128e-06, + "loss": 0.0146, + "step": 3533 + }, + { + "epoch": 1.2776572668112798, + "grad_norm": 1.3021122235128206, + "learning_rate": 3.165215085055908e-06, + "loss": 0.0752, + "step": 3534 + }, + { + "epoch": 1.2780187997107737, + "grad_norm": 0.8407762940933392, + "learning_rate": 3.1624347959804314e-06, + "loss": 0.063, + "step": 3535 + }, + { + "epoch": 1.2783803326102676, + "grad_norm": 1.3058976440350396, + "learning_rate": 3.15965516365037e-06, + "loss": 0.0889, + "step": 3536 + }, + { + "epoch": 1.2787418655097613, + "grad_norm": 0.01095937823208976, + "learning_rate": 3.156876189059164e-06, + "loss": 0.0002, + "step": 3537 + }, + { + "epoch": 1.2791033984092552, + "grad_norm": 0.032860655283816514, + "learning_rate": 3.15409787320002e-06, + "loss": 0.0007, + "step": 3538 + }, + { + "epoch": 1.2794649313087492, + "grad_norm": 0.007047403982880362, + "learning_rate": 3.1513202170659053e-06, + "loss": 0.0002, + "step": 3539 + }, + { + "epoch": 1.2798264642082429, + "grad_norm": 0.27191728687929145, + "learning_rate": 3.148543221649557e-06, + "loss": 0.0206, + "step": 3540 + }, + { + "epoch": 1.2801879971077368, + "grad_norm": 0.17416627256080097, + "learning_rate": 3.145766887943468e-06, + "loss": 0.0117, + "step": 3541 + }, + { + "epoch": 1.2805495300072307, + "grad_norm": 0.5304741006464286, + "learning_rate": 3.1429912169399047e-06, + "loss": 0.0146, + "step": 3542 + }, + { + "epoch": 1.2809110629067244, + "grad_norm": 1.092990020387214, + "learning_rate": 3.140216209630887e-06, + "loss": 0.0391, + "step": 3543 + }, + { + "epoch": 1.2812725958062183, + "grad_norm": 0.6209192955257061, + "learning_rate": 3.137441867008203e-06, + "loss": 0.0432, + "step": 3544 + }, + { + "epoch": 1.2816341287057122, + "grad_norm": 0.00565919626474557, + "learning_rate": 3.1346681900634045e-06, + "loss": 0.0002, + "step": 3545 + }, + { + "epoch": 1.281995661605206, + "grad_norm": 0.15527587636286064, + "learning_rate": 3.131895179787798e-06, + "loss": 0.0092, + "step": 3546 + }, + { + "epoch": 1.2823571945046999, + "grad_norm": 0.7751069011502175, + "learning_rate": 3.129122837172463e-06, + "loss": 0.1641, + "step": 3547 + }, + { + "epoch": 1.2827187274041938, + "grad_norm": 0.29931872054717124, + "learning_rate": 3.1263511632082306e-06, + "loss": 0.0184, + "step": 3548 + }, + { + "epoch": 1.2830802603036877, + "grad_norm": 0.42052278812836463, + "learning_rate": 3.1235801588856956e-06, + "loss": 0.0206, + "step": 3549 + }, + { + "epoch": 1.2834417932031814, + "grad_norm": 1.242353556846884, + "learning_rate": 3.120809825195218e-06, + "loss": 0.1191, + "step": 3550 + }, + { + "epoch": 1.2838033261026753, + "grad_norm": 0.009645480866062234, + "learning_rate": 3.11804016312691e-06, + "loss": 0.0002, + "step": 3551 + }, + { + "epoch": 1.2841648590021693, + "grad_norm": 0.37692746419067186, + "learning_rate": 3.115271173670652e-06, + "loss": 0.0317, + "step": 3552 + }, + { + "epoch": 1.2845263919016632, + "grad_norm": 0.1515916978564681, + "learning_rate": 3.112502857816079e-06, + "loss": 0.0103, + "step": 3553 + }, + { + "epoch": 1.2848879248011569, + "grad_norm": 0.03175251887323331, + "learning_rate": 3.109735216552585e-06, + "loss": 0.0006, + "step": 3554 + }, + { + "epoch": 1.2852494577006508, + "grad_norm": 0.6106566825620304, + "learning_rate": 3.1069682508693276e-06, + "loss": 0.0286, + "step": 3555 + }, + { + "epoch": 1.2856109906001447, + "grad_norm": 0.6880019163169049, + "learning_rate": 3.1042019617552142e-06, + "loss": 0.0432, + "step": 3556 + }, + { + "epoch": 1.2859725234996384, + "grad_norm": 0.8906269661137394, + "learning_rate": 3.101436350198924e-06, + "loss": 0.1455, + "step": 3557 + }, + { + "epoch": 1.2863340563991323, + "grad_norm": 0.45540876239643135, + "learning_rate": 3.09867141718888e-06, + "loss": 0.0103, + "step": 3558 + }, + { + "epoch": 1.2866955892986263, + "grad_norm": 0.27757396656204514, + "learning_rate": 3.0959071637132688e-06, + "loss": 0.0072, + "step": 3559 + }, + { + "epoch": 1.28705712219812, + "grad_norm": 0.07950584649938663, + "learning_rate": 3.093143590760037e-06, + "loss": 0.0035, + "step": 3560 + }, + { + "epoch": 1.2874186550976139, + "grad_norm": 0.48539065486598787, + "learning_rate": 3.090380699316882e-06, + "loss": 0.0231, + "step": 3561 + }, + { + "epoch": 1.2877801879971078, + "grad_norm": 1.5277743442559406, + "learning_rate": 3.0876184903712637e-06, + "loss": 0.1113, + "step": 3562 + }, + { + "epoch": 1.2881417208966015, + "grad_norm": 0.11496358917475775, + "learning_rate": 3.084856964910393e-06, + "loss": 0.0115, + "step": 3563 + }, + { + "epoch": 1.2885032537960954, + "grad_norm": 0.24036460849347507, + "learning_rate": 3.082096123921238e-06, + "loss": 0.0184, + "step": 3564 + }, + { + "epoch": 1.2888647866955893, + "grad_norm": 0.8354058605158693, + "learning_rate": 3.079335968390524e-06, + "loss": 0.0317, + "step": 3565 + }, + { + "epoch": 1.289226319595083, + "grad_norm": 1.4496894279927772, + "learning_rate": 3.076576499304729e-06, + "loss": 0.0317, + "step": 3566 + }, + { + "epoch": 1.289587852494577, + "grad_norm": 0.18183979660657623, + "learning_rate": 3.073817717650089e-06, + "loss": 0.0081, + "step": 3567 + }, + { + "epoch": 1.289949385394071, + "grad_norm": 0.07250118334430168, + "learning_rate": 3.071059624412589e-06, + "loss": 0.0039, + "step": 3568 + }, + { + "epoch": 1.2903109182935646, + "grad_norm": 0.17075088091071758, + "learning_rate": 3.068302220577971e-06, + "loss": 0.0117, + "step": 3569 + }, + { + "epoch": 1.2906724511930585, + "grad_norm": 0.8069932199696198, + "learning_rate": 3.0655455071317337e-06, + "loss": 0.0286, + "step": 3570 + }, + { + "epoch": 1.2910339840925524, + "grad_norm": 0.3323237662473371, + "learning_rate": 3.062789485059122e-06, + "loss": 0.0255, + "step": 3571 + }, + { + "epoch": 1.2913955169920464, + "grad_norm": 0.18473700019079756, + "learning_rate": 3.0600341553451416e-06, + "loss": 0.0165, + "step": 3572 + }, + { + "epoch": 1.29175704989154, + "grad_norm": 0.7406250086801212, + "learning_rate": 3.057279518974544e-06, + "loss": 0.0432, + "step": 3573 + }, + { + "epoch": 1.292118582791034, + "grad_norm": 0.20295118541415821, + "learning_rate": 3.0545255769318355e-06, + "loss": 0.0131, + "step": 3574 + }, + { + "epoch": 1.292480115690528, + "grad_norm": 2.561903038481076, + "learning_rate": 3.0517723302012757e-06, + "loss": 0.0752, + "step": 3575 + }, + { + "epoch": 1.2928416485900218, + "grad_norm": 0.1581715408148294, + "learning_rate": 3.0490197797668738e-06, + "loss": 0.0146, + "step": 3576 + }, + { + "epoch": 1.2932031814895155, + "grad_norm": 0.4176765057553801, + "learning_rate": 3.046267926612392e-06, + "loss": 0.0231, + "step": 3577 + }, + { + "epoch": 1.2935647143890094, + "grad_norm": 0.03381646628549193, + "learning_rate": 3.0435167717213397e-06, + "loss": 0.0007, + "step": 3578 + }, + { + "epoch": 1.2939262472885034, + "grad_norm": 0.7167825440467239, + "learning_rate": 3.040766316076981e-06, + "loss": 0.0206, + "step": 3579 + }, + { + "epoch": 1.294287780187997, + "grad_norm": 0.0018781276181944492, + "learning_rate": 3.0380165606623267e-06, + "loss": 0.0001, + "step": 3580 + }, + { + "epoch": 1.294649313087491, + "grad_norm": 1.413897507316326, + "learning_rate": 3.035267506460139e-06, + "loss": 0.1553, + "step": 3581 + }, + { + "epoch": 1.295010845986985, + "grad_norm": 0.4486497050661668, + "learning_rate": 3.032519154452932e-06, + "loss": 0.0206, + "step": 3582 + }, + { + "epoch": 1.2953723788864786, + "grad_norm": 2.0074685144466664, + "learning_rate": 3.0297715056229627e-06, + "loss": 0.0688, + "step": 3583 + }, + { + "epoch": 1.2957339117859725, + "grad_norm": 1.288773882291294, + "learning_rate": 3.027024560952241e-06, + "loss": 0.1738, + "step": 3584 + }, + { + "epoch": 1.2960954446854664, + "grad_norm": 0.09435027228235962, + "learning_rate": 3.024278321422526e-06, + "loss": 0.0035, + "step": 3585 + }, + { + "epoch": 1.2964569775849601, + "grad_norm": 0.1476391583421634, + "learning_rate": 3.02153278801532e-06, + "loss": 0.0115, + "step": 3586 + }, + { + "epoch": 1.296818510484454, + "grad_norm": 0.1396510234333488, + "learning_rate": 3.018787961711881e-06, + "loss": 0.0039, + "step": 3587 + }, + { + "epoch": 1.297180043383948, + "grad_norm": 0.1663486846618139, + "learning_rate": 3.0160438434932048e-06, + "loss": 0.0146, + "step": 3588 + }, + { + "epoch": 1.2975415762834417, + "grad_norm": 0.6557358568905363, + "learning_rate": 3.013300434340039e-06, + "loss": 0.0317, + "step": 3589 + }, + { + "epoch": 1.2979031091829356, + "grad_norm": 0.24735918360218564, + "learning_rate": 3.0105577352328804e-06, + "loss": 0.0205, + "step": 3590 + }, + { + "epoch": 1.2982646420824295, + "grad_norm": 0.012287449314975931, + "learning_rate": 3.007815747151966e-06, + "loss": 0.0003, + "step": 3591 + }, + { + "epoch": 1.2986261749819232, + "grad_norm": 0.18919572985606584, + "learning_rate": 3.005074471077285e-06, + "loss": 0.0165, + "step": 3592 + }, + { + "epoch": 1.2989877078814172, + "grad_norm": 0.21168170619042526, + "learning_rate": 3.002333907988566e-06, + "loss": 0.0103, + "step": 3593 + }, + { + "epoch": 1.299349240780911, + "grad_norm": 0.0029323073296107765, + "learning_rate": 2.999594058865286e-06, + "loss": 0.0, + "step": 3594 + }, + { + "epoch": 1.299710773680405, + "grad_norm": 0.006225737273396153, + "learning_rate": 2.9968549246866685e-06, + "loss": 0.0001, + "step": 3595 + }, + { + "epoch": 1.3000723065798987, + "grad_norm": 0.11006179196168699, + "learning_rate": 2.994116506431677e-06, + "loss": 0.0021, + "step": 3596 + }, + { + "epoch": 1.3004338394793926, + "grad_norm": 0.03087000112998807, + "learning_rate": 2.9913788050790243e-06, + "loss": 0.001, + "step": 3597 + }, + { + "epoch": 1.3007953723788865, + "grad_norm": 0.4108345285512667, + "learning_rate": 2.988641821607162e-06, + "loss": 0.0165, + "step": 3598 + }, + { + "epoch": 1.3011569052783805, + "grad_norm": 0.010679534238619778, + "learning_rate": 2.9859055569942874e-06, + "loss": 0.0002, + "step": 3599 + }, + { + "epoch": 1.3015184381778742, + "grad_norm": 0.019854633523092535, + "learning_rate": 2.983170012218343e-06, + "loss": 0.0005, + "step": 3600 + }, + { + "epoch": 1.301879971077368, + "grad_norm": 0.9916746935353665, + "learning_rate": 2.980435188257008e-06, + "loss": 0.1553, + "step": 3601 + }, + { + "epoch": 1.302241503976862, + "grad_norm": 1.4513491415062034, + "learning_rate": 2.9777010860877143e-06, + "loss": 0.0688, + "step": 3602 + }, + { + "epoch": 1.3026030368763557, + "grad_norm": 0.7327969767972533, + "learning_rate": 2.9749677066876237e-06, + "loss": 0.0317, + "step": 3603 + }, + { + "epoch": 1.3029645697758496, + "grad_norm": 0.05121683730257118, + "learning_rate": 2.972235051033646e-06, + "loss": 0.001, + "step": 3604 + }, + { + "epoch": 1.3033261026753435, + "grad_norm": 0.41968848634645534, + "learning_rate": 2.9695031201024355e-06, + "loss": 0.0131, + "step": 3605 + }, + { + "epoch": 1.3036876355748372, + "grad_norm": 0.18889236854982977, + "learning_rate": 2.9667719148703794e-06, + "loss": 0.0146, + "step": 3606 + }, + { + "epoch": 1.3040491684743312, + "grad_norm": 3.491738349673741, + "learning_rate": 2.964041436313614e-06, + "loss": 0.4434, + "step": 3607 + }, + { + "epoch": 1.304410701373825, + "grad_norm": 0.010602997865516215, + "learning_rate": 2.9613116854080076e-06, + "loss": 0.0002, + "step": 3608 + }, + { + "epoch": 1.3047722342733188, + "grad_norm": 0.00031185685086630493, + "learning_rate": 2.9585826631291757e-06, + "loss": 0.0, + "step": 3609 + }, + { + "epoch": 1.3051337671728127, + "grad_norm": 0.04217257264562888, + "learning_rate": 2.955854370452469e-06, + "loss": 0.0003, + "step": 3610 + }, + { + "epoch": 1.3054953000723066, + "grad_norm": 0.3436753650356677, + "learning_rate": 2.9531268083529785e-06, + "loss": 0.0255, + "step": 3611 + }, + { + "epoch": 1.3058568329718003, + "grad_norm": 0.14440350023961518, + "learning_rate": 2.950399977805536e-06, + "loss": 0.0131, + "step": 3612 + }, + { + "epoch": 1.3062183658712943, + "grad_norm": 0.15127529387318234, + "learning_rate": 2.947673879784706e-06, + "loss": 0.0045, + "step": 3613 + }, + { + "epoch": 1.3065798987707882, + "grad_norm": 0.02625525501109045, + "learning_rate": 2.9449485152648014e-06, + "loss": 0.0009, + "step": 3614 + }, + { + "epoch": 1.3069414316702819, + "grad_norm": 0.09566977159575223, + "learning_rate": 2.9422238852198628e-06, + "loss": 0.0072, + "step": 3615 + }, + { + "epoch": 1.3073029645697758, + "grad_norm": 0.3602150619901472, + "learning_rate": 2.939499990623672e-06, + "loss": 0.0115, + "step": 3616 + }, + { + "epoch": 1.3076644974692697, + "grad_norm": 0.019353941611857894, + "learning_rate": 2.9367768324497527e-06, + "loss": 0.0004, + "step": 3617 + }, + { + "epoch": 1.3080260303687636, + "grad_norm": 0.6560256593590915, + "learning_rate": 2.9340544116713536e-06, + "loss": 0.0317, + "step": 3618 + }, + { + "epoch": 1.3083875632682573, + "grad_norm": 0.020616621937142473, + "learning_rate": 2.931332729261476e-06, + "loss": 0.0006, + "step": 3619 + }, + { + "epoch": 1.3087490961677513, + "grad_norm": 0.13362756192071726, + "learning_rate": 2.9286117861928427e-06, + "loss": 0.0115, + "step": 3620 + }, + { + "epoch": 1.3091106290672452, + "grad_norm": 0.15787885921342593, + "learning_rate": 2.925891583437919e-06, + "loss": 0.0072, + "step": 3621 + }, + { + "epoch": 1.309472161966739, + "grad_norm": 0.029498382428870432, + "learning_rate": 2.923172121968908e-06, + "loss": 0.0004, + "step": 3622 + }, + { + "epoch": 1.3098336948662328, + "grad_norm": 1.5057764186266833, + "learning_rate": 2.9204534027577387e-06, + "loss": 0.1553, + "step": 3623 + }, + { + "epoch": 1.3101952277657267, + "grad_norm": 0.5782154254377051, + "learning_rate": 2.9177354267760876e-06, + "loss": 0.0391, + "step": 3624 + }, + { + "epoch": 1.3105567606652206, + "grad_norm": 3.7463250364273852, + "learning_rate": 2.915018194995355e-06, + "loss": 0.0889, + "step": 3625 + }, + { + "epoch": 1.3109182935647143, + "grad_norm": 0.065099629216664, + "learning_rate": 2.912301708386679e-06, + "loss": 0.0021, + "step": 3626 + }, + { + "epoch": 1.3112798264642083, + "grad_norm": 0.03864041473859173, + "learning_rate": 2.909585967920932e-06, + "loss": 0.0005, + "step": 3627 + }, + { + "epoch": 1.3116413593637022, + "grad_norm": 0.23306104482410528, + "learning_rate": 2.906870974568717e-06, + "loss": 0.0165, + "step": 3628 + }, + { + "epoch": 1.3120028922631959, + "grad_norm": 0.06987546736479366, + "learning_rate": 2.904156729300376e-06, + "loss": 0.0017, + "step": 3629 + }, + { + "epoch": 1.3123644251626898, + "grad_norm": 0.2458319229996178, + "learning_rate": 2.9014432330859792e-06, + "loss": 0.0165, + "step": 3630 + }, + { + "epoch": 1.3127259580621837, + "grad_norm": 1.2782486966961093, + "learning_rate": 2.898730486895324e-06, + "loss": 0.1113, + "step": 3631 + }, + { + "epoch": 1.3130874909616774, + "grad_norm": 1.8930918051532188, + "learning_rate": 2.8960184916979515e-06, + "loss": 0.0688, + "step": 3632 + }, + { + "epoch": 1.3134490238611713, + "grad_norm": 0.010574801579589411, + "learning_rate": 2.893307248463126e-06, + "loss": 0.0003, + "step": 3633 + }, + { + "epoch": 1.3138105567606653, + "grad_norm": 0.552397015003762, + "learning_rate": 2.890596758159846e-06, + "loss": 0.0184, + "step": 3634 + }, + { + "epoch": 1.314172089660159, + "grad_norm": 0.006031379299221498, + "learning_rate": 2.887887021756839e-06, + "loss": 0.0001, + "step": 3635 + }, + { + "epoch": 1.314533622559653, + "grad_norm": 0.04362180235522781, + "learning_rate": 2.885178040222565e-06, + "loss": 0.0015, + "step": 3636 + }, + { + "epoch": 1.3148951554591468, + "grad_norm": 0.0018229308528501281, + "learning_rate": 2.882469814525213e-06, + "loss": 0.0, + "step": 3637 + }, + { + "epoch": 1.3152566883586405, + "grad_norm": 1.1171136114303934, + "learning_rate": 2.8797623456326993e-06, + "loss": 0.0391, + "step": 3638 + }, + { + "epoch": 1.3156182212581344, + "grad_norm": 0.5637706221999814, + "learning_rate": 2.8770556345126787e-06, + "loss": 0.0317, + "step": 3639 + }, + { + "epoch": 1.3159797541576284, + "grad_norm": 0.0013303420706335604, + "learning_rate": 2.874349682132529e-06, + "loss": 0.0, + "step": 3640 + }, + { + "epoch": 1.3163412870571223, + "grad_norm": 0.03828814617827932, + "learning_rate": 2.8716444894593494e-06, + "loss": 0.0012, + "step": 3641 + }, + { + "epoch": 1.316702819956616, + "grad_norm": 0.17503293917012097, + "learning_rate": 2.868940057459982e-06, + "loss": 0.0115, + "step": 3642 + }, + { + "epoch": 1.31706435285611, + "grad_norm": 0.9915805681230551, + "learning_rate": 2.8662363871009885e-06, + "loss": 0.2031, + "step": 3643 + }, + { + "epoch": 1.3174258857556038, + "grad_norm": 0.10910352375633742, + "learning_rate": 2.86353347934866e-06, + "loss": 0.0044, + "step": 3644 + }, + { + "epoch": 1.3177874186550977, + "grad_norm": 3.1059809249355133, + "learning_rate": 2.8608313351690153e-06, + "loss": 0.4199, + "step": 3645 + }, + { + "epoch": 1.3181489515545914, + "grad_norm": 0.024167654982892178, + "learning_rate": 2.8581299555277995e-06, + "loss": 0.0001, + "step": 3646 + }, + { + "epoch": 1.3185104844540854, + "grad_norm": 0.18525573522004798, + "learning_rate": 2.8554293413904867e-06, + "loss": 0.0115, + "step": 3647 + }, + { + "epoch": 1.3188720173535793, + "grad_norm": 0.2533303814200102, + "learning_rate": 2.8527294937222717e-06, + "loss": 0.0081, + "step": 3648 + }, + { + "epoch": 1.319233550253073, + "grad_norm": 0.0013576205617939928, + "learning_rate": 2.850030413488084e-06, + "loss": 0.0, + "step": 3649 + }, + { + "epoch": 1.319595083152567, + "grad_norm": 2.5008096734596954, + "learning_rate": 2.8473321016525755e-06, + "loss": 0.2246, + "step": 3650 + }, + { + "epoch": 1.3199566160520608, + "grad_norm": 2.217630694920513, + "learning_rate": 2.844634559180116e-06, + "loss": 0.0957, + "step": 3651 + }, + { + "epoch": 1.3203181489515545, + "grad_norm": 0.24146945850663878, + "learning_rate": 2.841937787034812e-06, + "loss": 0.0092, + "step": 3652 + }, + { + "epoch": 1.3206796818510484, + "grad_norm": 0.0018192514981451352, + "learning_rate": 2.8392417861804867e-06, + "loss": 0.0, + "step": 3653 + }, + { + "epoch": 1.3210412147505424, + "grad_norm": 0.014284323847762872, + "learning_rate": 2.8365465575806916e-06, + "loss": 0.0004, + "step": 3654 + }, + { + "epoch": 1.321402747650036, + "grad_norm": 0.01565213000538824, + "learning_rate": 2.8338521021987e-06, + "loss": 0.0004, + "step": 3655 + }, + { + "epoch": 1.32176428054953, + "grad_norm": 0.05314482535441919, + "learning_rate": 2.8311584209975105e-06, + "loss": 0.0008, + "step": 3656 + }, + { + "epoch": 1.322125813449024, + "grad_norm": 0.04994111031977619, + "learning_rate": 2.8284655149398434e-06, + "loss": 0.0015, + "step": 3657 + }, + { + "epoch": 1.3224873463485176, + "grad_norm": 0.033362423218825946, + "learning_rate": 2.8257733849881407e-06, + "loss": 0.0012, + "step": 3658 + }, + { + "epoch": 1.3228488792480115, + "grad_norm": 2.9969029495937134, + "learning_rate": 2.823082032104573e-06, + "loss": 0.1279, + "step": 3659 + }, + { + "epoch": 1.3232104121475055, + "grad_norm": 0.0042398234289971175, + "learning_rate": 2.8203914572510305e-06, + "loss": 0.0, + "step": 3660 + }, + { + "epoch": 1.3235719450469992, + "grad_norm": 0.0016619717812040758, + "learning_rate": 2.817701661389116e-06, + "loss": 0.0, + "step": 3661 + }, + { + "epoch": 1.323933477946493, + "grad_norm": 0.05515734306836507, + "learning_rate": 2.815012645480171e-06, + "loss": 0.0012, + "step": 3662 + }, + { + "epoch": 1.324295010845987, + "grad_norm": 0.21424930575378157, + "learning_rate": 2.8123244104852436e-06, + "loss": 0.0184, + "step": 3663 + }, + { + "epoch": 1.324656543745481, + "grad_norm": 0.40324556800870764, + "learning_rate": 2.8096369573651113e-06, + "loss": 0.0317, + "step": 3664 + }, + { + "epoch": 1.3250180766449746, + "grad_norm": 2.161300364012764, + "learning_rate": 2.806950287080268e-06, + "loss": 0.1934, + "step": 3665 + }, + { + "epoch": 1.3253796095444685, + "grad_norm": 0.6708226493090647, + "learning_rate": 2.8042644005909287e-06, + "loss": 0.0354, + "step": 3666 + }, + { + "epoch": 1.3257411424439625, + "grad_norm": 0.2766068743929356, + "learning_rate": 2.8015792988570297e-06, + "loss": 0.0184, + "step": 3667 + }, + { + "epoch": 1.3261026753434564, + "grad_norm": 1.2264946618975974, + "learning_rate": 2.7988949828382228e-06, + "loss": 0.0525, + "step": 3668 + }, + { + "epoch": 1.32646420824295, + "grad_norm": 0.26140220630363675, + "learning_rate": 2.796211453493887e-06, + "loss": 0.0184, + "step": 3669 + }, + { + "epoch": 1.326825741142444, + "grad_norm": 0.8530408294333807, + "learning_rate": 2.793528711783115e-06, + "loss": 0.0165, + "step": 3670 + }, + { + "epoch": 1.327187274041938, + "grad_norm": 1.264725013797512, + "learning_rate": 2.7908467586647112e-06, + "loss": 0.1836, + "step": 3671 + }, + { + "epoch": 1.3275488069414316, + "grad_norm": 0.9721976426698187, + "learning_rate": 2.788165595097212e-06, + "loss": 0.1553, + "step": 3672 + }, + { + "epoch": 1.3279103398409255, + "grad_norm": 1.0562143744696175, + "learning_rate": 2.7854852220388617e-06, + "loss": 0.0354, + "step": 3673 + }, + { + "epoch": 1.3282718727404195, + "grad_norm": 1.0572721275113968, + "learning_rate": 2.782805640447627e-06, + "loss": 0.1455, + "step": 3674 + }, + { + "epoch": 1.3286334056399132, + "grad_norm": 3.66479995916972, + "learning_rate": 2.7801268512811874e-06, + "loss": 0.2559, + "step": 3675 + }, + { + "epoch": 1.328994938539407, + "grad_norm": 0.1446302841085483, + "learning_rate": 2.7774488554969425e-06, + "loss": 0.0057, + "step": 3676 + }, + { + "epoch": 1.329356471438901, + "grad_norm": 0.0008152292846871596, + "learning_rate": 2.7747716540520082e-06, + "loss": 0.0, + "step": 3677 + }, + { + "epoch": 1.3297180043383947, + "grad_norm": 0.2602753666953517, + "learning_rate": 2.7720952479032127e-06, + "loss": 0.0103, + "step": 3678 + }, + { + "epoch": 1.3300795372378886, + "grad_norm": 0.9038767182709299, + "learning_rate": 2.7694196380071074e-06, + "loss": 0.1641, + "step": 3679 + }, + { + "epoch": 1.3304410701373826, + "grad_norm": 0.5028884075856274, + "learning_rate": 2.7667448253199536e-06, + "loss": 0.0354, + "step": 3680 + }, + { + "epoch": 1.3308026030368763, + "grad_norm": 0.23240012545011246, + "learning_rate": 2.7640708107977264e-06, + "loss": 0.0117, + "step": 3681 + }, + { + "epoch": 1.3311641359363702, + "grad_norm": 0.31661011987234283, + "learning_rate": 2.7613975953961207e-06, + "loss": 0.0229, + "step": 3682 + }, + { + "epoch": 1.331525668835864, + "grad_norm": 0.04171180507371879, + "learning_rate": 2.7587251800705416e-06, + "loss": 0.0013, + "step": 3683 + }, + { + "epoch": 1.3318872017353578, + "grad_norm": 0.40060769506094224, + "learning_rate": 2.7560535657761106e-06, + "loss": 0.0317, + "step": 3684 + }, + { + "epoch": 1.3322487346348517, + "grad_norm": 0.3405426487695686, + "learning_rate": 2.7533827534676594e-06, + "loss": 0.0146, + "step": 3685 + }, + { + "epoch": 1.3326102675343456, + "grad_norm": 0.0007965945525116289, + "learning_rate": 2.7507127440997392e-06, + "loss": 0.0, + "step": 3686 + }, + { + "epoch": 1.3329718004338396, + "grad_norm": 1.2597901982458306, + "learning_rate": 2.748043538626613e-06, + "loss": 0.0286, + "step": 3687 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.4731718425249758, + "learning_rate": 2.7453751380022457e-06, + "loss": 0.0255, + "step": 3688 + }, + { + "epoch": 1.3336948662328272, + "grad_norm": 0.8787968403269206, + "learning_rate": 2.7427075431803307e-06, + "loss": 0.0476, + "step": 3689 + }, + { + "epoch": 1.334056399132321, + "grad_norm": 0.8843130384080147, + "learning_rate": 2.7400407551142636e-06, + "loss": 0.1367, + "step": 3690 + }, + { + "epoch": 1.334417932031815, + "grad_norm": 0.8990394846448557, + "learning_rate": 2.7373747747571534e-06, + "loss": 0.1455, + "step": 3691 + }, + { + "epoch": 1.3347794649313087, + "grad_norm": 0.4979506102198186, + "learning_rate": 2.7347096030618216e-06, + "loss": 0.0391, + "step": 3692 + }, + { + "epoch": 1.3351409978308026, + "grad_norm": 0.04126650875920814, + "learning_rate": 2.7320452409807986e-06, + "loss": 0.0006, + "step": 3693 + }, + { + "epoch": 1.3355025307302966, + "grad_norm": 1.6964804539537557, + "learning_rate": 2.7293816894663285e-06, + "loss": 0.0889, + "step": 3694 + }, + { + "epoch": 1.3358640636297903, + "grad_norm": 0.03511072657501642, + "learning_rate": 2.7267189494703617e-06, + "loss": 0.001, + "step": 3695 + }, + { + "epoch": 1.3362255965292842, + "grad_norm": 0.1532451450545643, + "learning_rate": 2.724057021944564e-06, + "loss": 0.0081, + "step": 3696 + }, + { + "epoch": 1.336587129428778, + "grad_norm": 1.2045150704735832, + "learning_rate": 2.7213959078403083e-06, + "loss": 0.0752, + "step": 3697 + }, + { + "epoch": 1.3369486623282718, + "grad_norm": 0.5063510504764572, + "learning_rate": 2.7187356081086713e-06, + "loss": 0.0286, + "step": 3698 + }, + { + "epoch": 1.3373101952277657, + "grad_norm": 0.055014796789949846, + "learning_rate": 2.7160761237004476e-06, + "loss": 0.0013, + "step": 3699 + }, + { + "epoch": 1.3376717281272597, + "grad_norm": 2.4752321757187192, + "learning_rate": 2.7134174555661353e-06, + "loss": 0.2031, + "step": 3700 + }, + { + "epoch": 1.3380332610267534, + "grad_norm": 1.7066724806079248, + "learning_rate": 2.7107596046559427e-06, + "loss": 0.2031, + "step": 3701 + }, + { + "epoch": 1.3383947939262473, + "grad_norm": 0.018679822207507208, + "learning_rate": 2.708102571919783e-06, + "loss": 0.0005, + "step": 3702 + }, + { + "epoch": 1.3387563268257412, + "grad_norm": 0.7141652944740488, + "learning_rate": 2.705446358307281e-06, + "loss": 0.0317, + "step": 3703 + }, + { + "epoch": 1.339117859725235, + "grad_norm": 0.39138379203391027, + "learning_rate": 2.7027909647677664e-06, + "loss": 0.0206, + "step": 3704 + }, + { + "epoch": 1.3394793926247288, + "grad_norm": 0.0009743446191831578, + "learning_rate": 2.700136392250274e-06, + "loss": 0.0, + "step": 3705 + }, + { + "epoch": 1.3398409255242227, + "grad_norm": 2.491005850997266, + "learning_rate": 2.6974826417035515e-06, + "loss": 0.0957, + "step": 3706 + }, + { + "epoch": 1.3402024584237164, + "grad_norm": 0.3836740584266246, + "learning_rate": 2.694829714076049e-06, + "loss": 0.0255, + "step": 3707 + }, + { + "epoch": 1.3405639913232104, + "grad_norm": 0.0055574490527077955, + "learning_rate": 2.692177610315917e-06, + "loss": 0.0001, + "step": 3708 + }, + { + "epoch": 1.3409255242227043, + "grad_norm": 0.003382725798414415, + "learning_rate": 2.6895263313710213e-06, + "loss": 0.0, + "step": 3709 + }, + { + "epoch": 1.3412870571221982, + "grad_norm": 1.5711007607026388, + "learning_rate": 2.686875878188927e-06, + "loss": 0.1279, + "step": 3710 + }, + { + "epoch": 1.341648590021692, + "grad_norm": 0.002372406869403224, + "learning_rate": 2.684226251716906e-06, + "loss": 0.0001, + "step": 3711 + }, + { + "epoch": 1.3420101229211858, + "grad_norm": 0.042020090361864194, + "learning_rate": 2.6815774529019345e-06, + "loss": 0.001, + "step": 3712 + }, + { + "epoch": 1.3423716558206797, + "grad_norm": 0.07993936273808008, + "learning_rate": 2.678929482690691e-06, + "loss": 0.0017, + "step": 3713 + }, + { + "epoch": 1.3427331887201737, + "grad_norm": 0.03271234535909543, + "learning_rate": 2.6762823420295612e-06, + "loss": 0.0007, + "step": 3714 + }, + { + "epoch": 1.3430947216196674, + "grad_norm": 0.0887513430810978, + "learning_rate": 2.6736360318646293e-06, + "loss": 0.0056, + "step": 3715 + }, + { + "epoch": 1.3434562545191613, + "grad_norm": 0.27300650451011105, + "learning_rate": 2.670990553141691e-06, + "loss": 0.0229, + "step": 3716 + }, + { + "epoch": 1.3438177874186552, + "grad_norm": 0.2929170725347439, + "learning_rate": 2.66834590680624e-06, + "loss": 0.0255, + "step": 3717 + }, + { + "epoch": 1.344179320318149, + "grad_norm": 0.02309429263481061, + "learning_rate": 2.6657020938034654e-06, + "loss": 0.0008, + "step": 3718 + }, + { + "epoch": 1.3445408532176428, + "grad_norm": 0.2743012668805788, + "learning_rate": 2.6630591150782718e-06, + "loss": 0.0103, + "step": 3719 + }, + { + "epoch": 1.3449023861171367, + "grad_norm": 0.028003984743486234, + "learning_rate": 2.6604169715752576e-06, + "loss": 0.0008, + "step": 3720 + }, + { + "epoch": 1.3452639190166304, + "grad_norm": 0.34839194211175295, + "learning_rate": 2.6577756642387235e-06, + "loss": 0.0131, + "step": 3721 + }, + { + "epoch": 1.3456254519161244, + "grad_norm": 0.15618366763421243, + "learning_rate": 2.6551351940126735e-06, + "loss": 0.0045, + "step": 3722 + }, + { + "epoch": 1.3459869848156183, + "grad_norm": 4.6361727061173745, + "learning_rate": 2.6524955618408093e-06, + "loss": 0.3535, + "step": 3723 + }, + { + "epoch": 1.346348517715112, + "grad_norm": 1.397927436609171, + "learning_rate": 2.6498567686665367e-06, + "loss": 0.2129, + "step": 3724 + }, + { + "epoch": 1.346710050614606, + "grad_norm": 0.8568595923067269, + "learning_rate": 2.6472188154329567e-06, + "loss": 0.0391, + "step": 3725 + }, + { + "epoch": 1.3470715835140998, + "grad_norm": 0.015259404537488357, + "learning_rate": 2.644581703082877e-06, + "loss": 0.0002, + "step": 3726 + }, + { + "epoch": 1.3474331164135935, + "grad_norm": 4.042354560513148, + "learning_rate": 2.641945432558802e-06, + "loss": 0.1455, + "step": 3727 + }, + { + "epoch": 1.3477946493130875, + "grad_norm": 0.24039147711045075, + "learning_rate": 2.639310004802928e-06, + "loss": 0.0206, + "step": 3728 + }, + { + "epoch": 1.3481561822125814, + "grad_norm": 0.19357447957864055, + "learning_rate": 2.636675420757162e-06, + "loss": 0.0205, + "step": 3729 + }, + { + "epoch": 1.348517715112075, + "grad_norm": 0.3744307963152612, + "learning_rate": 2.634041681363102e-06, + "loss": 0.0117, + "step": 3730 + }, + { + "epoch": 1.348879248011569, + "grad_norm": 0.35043878211383583, + "learning_rate": 2.6314087875620455e-06, + "loss": 0.0117, + "step": 3731 + }, + { + "epoch": 1.349240780911063, + "grad_norm": 0.0013340565053411686, + "learning_rate": 2.628776740294988e-06, + "loss": 0.0, + "step": 3732 + }, + { + "epoch": 1.3496023138105568, + "grad_norm": 0.553958775889184, + "learning_rate": 2.6261455405026236e-06, + "loss": 0.0391, + "step": 3733 + }, + { + "epoch": 1.3499638467100505, + "grad_norm": 1.1825422056505341, + "learning_rate": 2.6235151891253417e-06, + "loss": 0.0432, + "step": 3734 + }, + { + "epoch": 1.3503253796095445, + "grad_norm": 0.6937466995176652, + "learning_rate": 2.6208856871032284e-06, + "loss": 0.0286, + "step": 3735 + }, + { + "epoch": 1.3506869125090384, + "grad_norm": 1.8292509426636285, + "learning_rate": 2.61825703537607e-06, + "loss": 0.1191, + "step": 3736 + }, + { + "epoch": 1.3510484454085323, + "grad_norm": 0.3078991505928045, + "learning_rate": 2.615629234883347e-06, + "loss": 0.0255, + "step": 3737 + }, + { + "epoch": 1.351409978308026, + "grad_norm": 0.01372783444252099, + "learning_rate": 2.6130022865642275e-06, + "loss": 0.0004, + "step": 3738 + }, + { + "epoch": 1.35177151120752, + "grad_norm": 1.5275022945767678, + "learning_rate": 2.610376191357589e-06, + "loss": 0.1934, + "step": 3739 + }, + { + "epoch": 1.3521330441070138, + "grad_norm": 0.18905417022586546, + "learning_rate": 2.607750950201996e-06, + "loss": 0.0081, + "step": 3740 + }, + { + "epoch": 1.3524945770065075, + "grad_norm": 0.3110848620639763, + "learning_rate": 2.605126564035708e-06, + "loss": 0.0229, + "step": 3741 + }, + { + "epoch": 1.3528561099060015, + "grad_norm": 0.20773640454193107, + "learning_rate": 2.602503033796681e-06, + "loss": 0.0146, + "step": 3742 + }, + { + "epoch": 1.3532176428054954, + "grad_norm": 2.0935247238144292, + "learning_rate": 2.5998803604225638e-06, + "loss": 0.1738, + "step": 3743 + }, + { + "epoch": 1.353579175704989, + "grad_norm": 0.3733619878865032, + "learning_rate": 2.5972585448506994e-06, + "loss": 0.0286, + "step": 3744 + }, + { + "epoch": 1.353940708604483, + "grad_norm": 0.1340455622178819, + "learning_rate": 2.594637588018121e-06, + "loss": 0.0057, + "step": 3745 + }, + { + "epoch": 1.354302241503977, + "grad_norm": 1.0396695890290881, + "learning_rate": 2.5920174908615643e-06, + "loss": 0.0525, + "step": 3746 + }, + { + "epoch": 1.3546637744034706, + "grad_norm": 1.0721826013895075, + "learning_rate": 2.589398254317447e-06, + "loss": 0.063, + "step": 3747 + }, + { + "epoch": 1.3550253073029646, + "grad_norm": 0.9097220997245622, + "learning_rate": 2.5867798793218856e-06, + "loss": 0.1113, + "step": 3748 + }, + { + "epoch": 1.3553868402024585, + "grad_norm": 1.0988799418732174, + "learning_rate": 2.584162366810686e-06, + "loss": 0.1279, + "step": 3749 + }, + { + "epoch": 1.3557483731019522, + "grad_norm": 0.007839931619169122, + "learning_rate": 2.581545717719347e-06, + "loss": 0.0002, + "step": 3750 + }, + { + "epoch": 1.356109906001446, + "grad_norm": 0.05557698835811949, + "learning_rate": 2.5789299329830577e-06, + "loss": 0.0013, + "step": 3751 + }, + { + "epoch": 1.35647143890094, + "grad_norm": 0.015569987858251127, + "learning_rate": 2.576315013536697e-06, + "loss": 0.0003, + "step": 3752 + }, + { + "epoch": 1.3568329718004337, + "grad_norm": 0.4723274752767142, + "learning_rate": 2.573700960314843e-06, + "loss": 0.0286, + "step": 3753 + }, + { + "epoch": 1.3571945046999276, + "grad_norm": 0.1745030416520009, + "learning_rate": 2.5710877742517528e-06, + "loss": 0.0146, + "step": 3754 + }, + { + "epoch": 1.3575560375994216, + "grad_norm": 0.3860419281201075, + "learning_rate": 2.5684754562813763e-06, + "loss": 0.0051, + "step": 3755 + }, + { + "epoch": 1.3579175704989155, + "grad_norm": 0.09808398079568129, + "learning_rate": 2.565864007337361e-06, + "loss": 0.0021, + "step": 3756 + }, + { + "epoch": 1.3582791033984092, + "grad_norm": 0.27493477949532713, + "learning_rate": 2.5632534283530363e-06, + "loss": 0.0229, + "step": 3757 + }, + { + "epoch": 1.358640636297903, + "grad_norm": 0.46553986463329, + "learning_rate": 2.5606437202614213e-06, + "loss": 0.0081, + "step": 3758 + }, + { + "epoch": 1.359002169197397, + "grad_norm": 1.4798409326194293, + "learning_rate": 2.5580348839952264e-06, + "loss": 0.0889, + "step": 3759 + }, + { + "epoch": 1.359363702096891, + "grad_norm": 0.9343112921426894, + "learning_rate": 2.5554269204868482e-06, + "loss": 0.0752, + "step": 3760 + }, + { + "epoch": 1.3597252349963846, + "grad_norm": 1.4185376267201841, + "learning_rate": 2.5528198306683726e-06, + "loss": 0.0752, + "step": 3761 + }, + { + "epoch": 1.3600867678958786, + "grad_norm": 0.2158389857318438, + "learning_rate": 2.5502136154715717e-06, + "loss": 0.0117, + "step": 3762 + }, + { + "epoch": 1.3604483007953725, + "grad_norm": 0.4513533745711553, + "learning_rate": 2.547608275827911e-06, + "loss": 0.0206, + "step": 3763 + }, + { + "epoch": 1.3608098336948662, + "grad_norm": 0.13063024254059832, + "learning_rate": 2.545003812668534e-06, + "loss": 0.0092, + "step": 3764 + }, + { + "epoch": 1.36117136659436, + "grad_norm": 0.1589831005097657, + "learning_rate": 2.5424002269242732e-06, + "loss": 0.0131, + "step": 3765 + }, + { + "epoch": 1.361532899493854, + "grad_norm": 0.12910524845570434, + "learning_rate": 2.5397975195256553e-06, + "loss": 0.0115, + "step": 3766 + }, + { + "epoch": 1.3618944323933477, + "grad_norm": 1.4962516022466634, + "learning_rate": 2.5371956914028838e-06, + "loss": 0.1836, + "step": 3767 + }, + { + "epoch": 1.3622559652928417, + "grad_norm": 0.281925058231366, + "learning_rate": 2.534594743485853e-06, + "loss": 0.0229, + "step": 3768 + }, + { + "epoch": 1.3626174981923356, + "grad_norm": 0.23090520266836423, + "learning_rate": 2.531994676704141e-06, + "loss": 0.0184, + "step": 3769 + }, + { + "epoch": 1.3629790310918293, + "grad_norm": 0.18287320212428973, + "learning_rate": 2.52939549198701e-06, + "loss": 0.0184, + "step": 3770 + }, + { + "epoch": 1.3633405639913232, + "grad_norm": 0.25181454895616856, + "learning_rate": 2.5267971902634093e-06, + "loss": 0.0146, + "step": 3771 + }, + { + "epoch": 1.3637020968908171, + "grad_norm": 0.0026427097308877687, + "learning_rate": 2.5241997724619683e-06, + "loss": 0.0001, + "step": 3772 + }, + { + "epoch": 1.3640636297903108, + "grad_norm": 0.17054678750141047, + "learning_rate": 2.521603239511011e-06, + "loss": 0.0146, + "step": 3773 + }, + { + "epoch": 1.3644251626898047, + "grad_norm": 0.18563215654004012, + "learning_rate": 2.51900759233853e-06, + "loss": 0.0146, + "step": 3774 + }, + { + "epoch": 1.3647866955892987, + "grad_norm": 0.0025403518492670783, + "learning_rate": 2.5164128318722104e-06, + "loss": 0.0001, + "step": 3775 + }, + { + "epoch": 1.3651482284887924, + "grad_norm": 0.13278921096034366, + "learning_rate": 2.5138189590394224e-06, + "loss": 0.0027, + "step": 3776 + }, + { + "epoch": 1.3655097613882863, + "grad_norm": 1.8612416540254846, + "learning_rate": 2.5112259747672134e-06, + "loss": 0.1191, + "step": 3777 + }, + { + "epoch": 1.3658712942877802, + "grad_norm": 0.14591821763729262, + "learning_rate": 2.508633879982316e-06, + "loss": 0.0131, + "step": 3778 + }, + { + "epoch": 1.3662328271872741, + "grad_norm": 2.1550482635872985, + "learning_rate": 2.5060426756111446e-06, + "loss": 0.0889, + "step": 3779 + }, + { + "epoch": 1.3665943600867678, + "grad_norm": 0.20843947507814406, + "learning_rate": 2.503452362579794e-06, + "loss": 0.0082, + "step": 3780 + }, + { + "epoch": 1.3669558929862617, + "grad_norm": 1.3294024248266305, + "learning_rate": 2.500862941814043e-06, + "loss": 0.0889, + "step": 3781 + }, + { + "epoch": 1.3673174258857557, + "grad_norm": 0.11830527735343813, + "learning_rate": 2.498274414239346e-06, + "loss": 0.0092, + "step": 3782 + }, + { + "epoch": 1.3676789587852496, + "grad_norm": 0.023564589107394958, + "learning_rate": 2.4956867807808483e-06, + "loss": 0.0009, + "step": 3783 + }, + { + "epoch": 1.3680404916847433, + "grad_norm": 0.31745253894209247, + "learning_rate": 2.4931000423633685e-06, + "loss": 0.001, + "step": 3784 + }, + { + "epoch": 1.3684020245842372, + "grad_norm": 0.509373596895046, + "learning_rate": 2.4905141999114003e-06, + "loss": 0.0317, + "step": 3785 + }, + { + "epoch": 1.3687635574837311, + "grad_norm": 0.3494162553493776, + "learning_rate": 2.487929254349129e-06, + "loss": 0.0184, + "step": 3786 + }, + { + "epoch": 1.3691250903832248, + "grad_norm": 0.0405195332320681, + "learning_rate": 2.4853452066004126e-06, + "loss": 0.0015, + "step": 3787 + }, + { + "epoch": 1.3694866232827188, + "grad_norm": 0.012722494024236987, + "learning_rate": 2.4827620575887888e-06, + "loss": 0.0003, + "step": 3788 + }, + { + "epoch": 1.3698481561822127, + "grad_norm": 0.0011202931172904472, + "learning_rate": 2.4801798082374737e-06, + "loss": 0.0, + "step": 3789 + }, + { + "epoch": 1.3702096890817064, + "grad_norm": 0.27625565421565496, + "learning_rate": 2.477598459469364e-06, + "loss": 0.0255, + "step": 3790 + }, + { + "epoch": 1.3705712219812003, + "grad_norm": 1.0849993046572495, + "learning_rate": 2.4750180122070328e-06, + "loss": 0.0391, + "step": 3791 + }, + { + "epoch": 1.3709327548806942, + "grad_norm": 0.34734280352144853, + "learning_rate": 2.4724384673727285e-06, + "loss": 0.0131, + "step": 3792 + }, + { + "epoch": 1.371294287780188, + "grad_norm": 0.02799652051775824, + "learning_rate": 2.469859825888385e-06, + "loss": 0.0007, + "step": 3793 + }, + { + "epoch": 1.3716558206796818, + "grad_norm": 0.194245343551266, + "learning_rate": 2.4672820886756074e-06, + "loss": 0.0072, + "step": 3794 + }, + { + "epoch": 1.3720173535791758, + "grad_norm": 0.0035976576519694574, + "learning_rate": 2.464705256655673e-06, + "loss": 0.0001, + "step": 3795 + }, + { + "epoch": 1.3723788864786695, + "grad_norm": 0.0016201424194501218, + "learning_rate": 2.462129330749547e-06, + "loss": 0.0, + "step": 3796 + }, + { + "epoch": 1.3727404193781634, + "grad_norm": 0.2056057778018301, + "learning_rate": 2.459554311877862e-06, + "loss": 0.0205, + "step": 3797 + }, + { + "epoch": 1.3731019522776573, + "grad_norm": 0.15138969855457005, + "learning_rate": 2.4569802009609306e-06, + "loss": 0.0131, + "step": 3798 + }, + { + "epoch": 1.373463485177151, + "grad_norm": 0.8828319277931419, + "learning_rate": 2.454406998918738e-06, + "loss": 0.0432, + "step": 3799 + }, + { + "epoch": 1.373825018076645, + "grad_norm": 0.06414652117582638, + "learning_rate": 2.451834706670947e-06, + "loss": 0.0008, + "step": 3800 + }, + { + "epoch": 1.3741865509761388, + "grad_norm": 0.001577891502296687, + "learning_rate": 2.4492633251368943e-06, + "loss": 0.0, + "step": 3801 + }, + { + "epoch": 1.3745480838756328, + "grad_norm": 0.3545906043769453, + "learning_rate": 2.4466928552355885e-06, + "loss": 0.0117, + "step": 3802 + }, + { + "epoch": 1.3749096167751265, + "grad_norm": 0.007126316431217021, + "learning_rate": 2.44412329788572e-06, + "loss": 0.0001, + "step": 3803 + }, + { + "epoch": 1.3752711496746204, + "grad_norm": 0.6765476481179741, + "learning_rate": 2.441554654005647e-06, + "loss": 0.0391, + "step": 3804 + }, + { + "epoch": 1.3756326825741143, + "grad_norm": 0.2109901865647213, + "learning_rate": 2.4389869245133967e-06, + "loss": 0.0165, + "step": 3805 + }, + { + "epoch": 1.3759942154736082, + "grad_norm": 1.5410792382410414, + "learning_rate": 2.436420110326681e-06, + "loss": 0.1191, + "step": 3806 + }, + { + "epoch": 1.376355748373102, + "grad_norm": 0.40304234899227626, + "learning_rate": 2.4338542123628774e-06, + "loss": 0.0391, + "step": 3807 + }, + { + "epoch": 1.3767172812725958, + "grad_norm": 0.19041487092409645, + "learning_rate": 2.4312892315390364e-06, + "loss": 0.0165, + "step": 3808 + }, + { + "epoch": 1.3770788141720898, + "grad_norm": 0.9104176281443358, + "learning_rate": 2.4287251687718816e-06, + "loss": 0.0432, + "step": 3809 + }, + { + "epoch": 1.3774403470715835, + "grad_norm": 0.38854621421953695, + "learning_rate": 2.426162024977809e-06, + "loss": 0.0231, + "step": 3810 + }, + { + "epoch": 1.3778018799710774, + "grad_norm": 0.1883417350830366, + "learning_rate": 2.4235998010728855e-06, + "loss": 0.0164, + "step": 3811 + }, + { + "epoch": 1.3781634128705713, + "grad_norm": 1.7593305755047943, + "learning_rate": 2.421038497972848e-06, + "loss": 0.1738, + "step": 3812 + }, + { + "epoch": 1.378524945770065, + "grad_norm": 0.4161171178805602, + "learning_rate": 2.4184781165931086e-06, + "loss": 0.0229, + "step": 3813 + }, + { + "epoch": 1.378886478669559, + "grad_norm": 0.164706982791038, + "learning_rate": 2.4159186578487476e-06, + "loss": 0.0035, + "step": 3814 + }, + { + "epoch": 1.3792480115690529, + "grad_norm": 0.2806801081979543, + "learning_rate": 2.4133601226545087e-06, + "loss": 0.0229, + "step": 3815 + }, + { + "epoch": 1.3796095444685466, + "grad_norm": 0.11021448797190041, + "learning_rate": 2.410802511924818e-06, + "loss": 0.0064, + "step": 3816 + }, + { + "epoch": 1.3799710773680405, + "grad_norm": 1.9051322059973206, + "learning_rate": 2.4082458265737637e-06, + "loss": 0.0889, + "step": 3817 + }, + { + "epoch": 1.3803326102675344, + "grad_norm": 1.8412168777090219, + "learning_rate": 2.4056900675151034e-06, + "loss": 0.0231, + "step": 3818 + }, + { + "epoch": 1.380694143167028, + "grad_norm": 0.9110812703310353, + "learning_rate": 2.403135235662264e-06, + "loss": 0.0525, + "step": 3819 + }, + { + "epoch": 1.381055676066522, + "grad_norm": 0.29909017992796666, + "learning_rate": 2.4005813319283473e-06, + "loss": 0.0103, + "step": 3820 + }, + { + "epoch": 1.381417208966016, + "grad_norm": 0.5957073556238879, + "learning_rate": 2.3980283572261116e-06, + "loss": 0.0184, + "step": 3821 + }, + { + "epoch": 1.3817787418655096, + "grad_norm": 0.5157136864494952, + "learning_rate": 2.3954763124679897e-06, + "loss": 0.0184, + "step": 3822 + }, + { + "epoch": 1.3821402747650036, + "grad_norm": 0.0022093093856896742, + "learning_rate": 2.3929251985660866e-06, + "loss": 0.0, + "step": 3823 + }, + { + "epoch": 1.3825018076644975, + "grad_norm": 0.24919586625604614, + "learning_rate": 2.3903750164321672e-06, + "loss": 0.0146, + "step": 3824 + }, + { + "epoch": 1.3828633405639914, + "grad_norm": 0.8067250597993407, + "learning_rate": 2.387825766977666e-06, + "loss": 0.1191, + "step": 3825 + }, + { + "epoch": 1.383224873463485, + "grad_norm": 0.21887242698937354, + "learning_rate": 2.385277451113685e-06, + "loss": 0.0115, + "step": 3826 + }, + { + "epoch": 1.383586406362979, + "grad_norm": 0.13984364830337376, + "learning_rate": 2.38273006975099e-06, + "loss": 0.0027, + "step": 3827 + }, + { + "epoch": 1.383947939262473, + "grad_norm": 0.23737276649496636, + "learning_rate": 2.380183623800017e-06, + "loss": 0.0206, + "step": 3828 + }, + { + "epoch": 1.3843094721619669, + "grad_norm": 0.03622139332020473, + "learning_rate": 2.3776381141708617e-06, + "loss": 0.0015, + "step": 3829 + }, + { + "epoch": 1.3846710050614606, + "grad_norm": 0.21448803286371151, + "learning_rate": 2.3750935417732946e-06, + "loss": 0.0184, + "step": 3830 + }, + { + "epoch": 1.3850325379609545, + "grad_norm": 0.37120250799038856, + "learning_rate": 2.3725499075167397e-06, + "loss": 0.0131, + "step": 3831 + }, + { + "epoch": 1.3853940708604484, + "grad_norm": 0.09337680028902544, + "learning_rate": 2.370007212310292e-06, + "loss": 0.0039, + "step": 3832 + }, + { + "epoch": 1.3857556037599421, + "grad_norm": 0.09577418363145067, + "learning_rate": 2.3674654570627128e-06, + "loss": 0.0064, + "step": 3833 + }, + { + "epoch": 1.386117136659436, + "grad_norm": 0.22215363153042905, + "learning_rate": 2.364924642682424e-06, + "loss": 0.0072, + "step": 3834 + }, + { + "epoch": 1.38647866955893, + "grad_norm": 0.030655524350721254, + "learning_rate": 2.3623847700775116e-06, + "loss": 0.0009, + "step": 3835 + }, + { + "epoch": 1.3868402024584237, + "grad_norm": 1.6891997365716966, + "learning_rate": 2.3598458401557254e-06, + "loss": 0.1191, + "step": 3836 + }, + { + "epoch": 1.3872017353579176, + "grad_norm": 0.0008444004298794633, + "learning_rate": 2.3573078538244797e-06, + "loss": 0.0, + "step": 3837 + }, + { + "epoch": 1.3875632682574115, + "grad_norm": 0.8616274981015141, + "learning_rate": 2.3547708119908485e-06, + "loss": 0.2129, + "step": 3838 + }, + { + "epoch": 1.3879248011569052, + "grad_norm": 0.002580413935221409, + "learning_rate": 2.3522347155615692e-06, + "loss": 0.0001, + "step": 3839 + }, + { + "epoch": 1.3882863340563991, + "grad_norm": 1.8128383037514595, + "learning_rate": 2.3496995654430484e-06, + "loss": 0.0889, + "step": 3840 + }, + { + "epoch": 1.388647866955893, + "grad_norm": 0.09623428964076869, + "learning_rate": 2.3471653625413405e-06, + "loss": 0.0044, + "step": 3841 + }, + { + "epoch": 1.3890093998553867, + "grad_norm": 0.2819095193048747, + "learning_rate": 2.344632107762171e-06, + "loss": 0.0165, + "step": 3842 + }, + { + "epoch": 1.3893709327548807, + "grad_norm": 2.7700528031823013, + "learning_rate": 2.3420998020109275e-06, + "loss": 0.1455, + "step": 3843 + }, + { + "epoch": 1.3897324656543746, + "grad_norm": 0.014528343295904154, + "learning_rate": 2.339568446192654e-06, + "loss": 0.0002, + "step": 3844 + }, + { + "epoch": 1.3900939985538683, + "grad_norm": 0.13829432609554618, + "learning_rate": 2.3370380412120556e-06, + "loss": 0.0146, + "step": 3845 + }, + { + "epoch": 1.3904555314533622, + "grad_norm": 0.1393859349407713, + "learning_rate": 2.3345085879735002e-06, + "loss": 0.0131, + "step": 3846 + }, + { + "epoch": 1.3908170643528561, + "grad_norm": 0.177540135491973, + "learning_rate": 2.331980087381012e-06, + "loss": 0.0184, + "step": 3847 + }, + { + "epoch": 1.39117859725235, + "grad_norm": 0.09260807966856559, + "learning_rate": 2.3294525403382784e-06, + "loss": 0.0081, + "step": 3848 + }, + { + "epoch": 1.3915401301518437, + "grad_norm": 0.0032359646761756975, + "learning_rate": 2.326925947748641e-06, + "loss": 0.0001, + "step": 3849 + }, + { + "epoch": 1.3919016630513377, + "grad_norm": 0.36017025983681783, + "learning_rate": 2.3244003105151097e-06, + "loss": 0.0165, + "step": 3850 + }, + { + "epoch": 1.3922631959508316, + "grad_norm": 0.40865361138481177, + "learning_rate": 2.321875629540341e-06, + "loss": 0.0205, + "step": 3851 + }, + { + "epoch": 1.3926247288503255, + "grad_norm": 0.5335385560818376, + "learning_rate": 2.319351905726655e-06, + "loss": 0.0432, + "step": 3852 + }, + { + "epoch": 1.3929862617498192, + "grad_norm": 0.1626689227808501, + "learning_rate": 2.316829139976034e-06, + "loss": 0.0092, + "step": 3853 + }, + { + "epoch": 1.3933477946493131, + "grad_norm": 1.1095044692502425, + "learning_rate": 2.314307333190112e-06, + "loss": 0.0354, + "step": 3854 + }, + { + "epoch": 1.393709327548807, + "grad_norm": 0.8997411621485003, + "learning_rate": 2.3117864862701827e-06, + "loss": 0.0258, + "step": 3855 + }, + { + "epoch": 1.3940708604483008, + "grad_norm": 0.1454943377696036, + "learning_rate": 2.309266600117196e-06, + "loss": 0.0131, + "step": 3856 + }, + { + "epoch": 1.3944323933477947, + "grad_norm": 0.1166215177063575, + "learning_rate": 2.306747675631758e-06, + "loss": 0.0115, + "step": 3857 + }, + { + "epoch": 1.3947939262472886, + "grad_norm": 0.008940866856039838, + "learning_rate": 2.304229713714133e-06, + "loss": 0.0001, + "step": 3858 + }, + { + "epoch": 1.3951554591467823, + "grad_norm": 0.11760233527251361, + "learning_rate": 2.3017127152642366e-06, + "loss": 0.0103, + "step": 3859 + }, + { + "epoch": 1.3955169920462762, + "grad_norm": 0.35379264187524273, + "learning_rate": 2.2991966811816505e-06, + "loss": 0.0165, + "step": 3860 + }, + { + "epoch": 1.3958785249457701, + "grad_norm": 0.12459850585173578, + "learning_rate": 2.2966816123655987e-06, + "loss": 0.0092, + "step": 3861 + }, + { + "epoch": 1.3962400578452638, + "grad_norm": 0.06475177691705833, + "learning_rate": 2.2941675097149667e-06, + "loss": 0.0035, + "step": 3862 + }, + { + "epoch": 1.3966015907447578, + "grad_norm": 0.0255444348021181, + "learning_rate": 2.291654374128297e-06, + "loss": 0.0006, + "step": 3863 + }, + { + "epoch": 1.3969631236442517, + "grad_norm": 2.579007869836046, + "learning_rate": 2.289142206503783e-06, + "loss": 0.1836, + "step": 3864 + }, + { + "epoch": 1.3973246565437454, + "grad_norm": 0.19636675887343177, + "learning_rate": 2.286631007739272e-06, + "loss": 0.0035, + "step": 3865 + }, + { + "epoch": 1.3976861894432393, + "grad_norm": 0.3559282305324365, + "learning_rate": 2.284120778732266e-06, + "loss": 0.0229, + "step": 3866 + }, + { + "epoch": 1.3980477223427332, + "grad_norm": 0.1461710166409173, + "learning_rate": 2.2816115203799207e-06, + "loss": 0.0131, + "step": 3867 + }, + { + "epoch": 1.398409255242227, + "grad_norm": 0.17600406407686883, + "learning_rate": 2.279103233579044e-06, + "loss": 0.0081, + "step": 3868 + }, + { + "epoch": 1.3987707881417208, + "grad_norm": 2.5754069774853505, + "learning_rate": 2.276595919226096e-06, + "loss": 0.1191, + "step": 3869 + }, + { + "epoch": 1.3991323210412148, + "grad_norm": 1.3034794807222756, + "learning_rate": 2.274089578217196e-06, + "loss": 0.2441, + "step": 3870 + }, + { + "epoch": 1.3994938539407087, + "grad_norm": 0.13201401620460415, + "learning_rate": 2.2715842114481024e-06, + "loss": 0.0131, + "step": 3871 + }, + { + "epoch": 1.3998553868402024, + "grad_norm": 0.2323590332599623, + "learning_rate": 2.2690798198142343e-06, + "loss": 0.0072, + "step": 3872 + }, + { + "epoch": 1.4002169197396963, + "grad_norm": 0.4174662227994243, + "learning_rate": 2.2665764042106647e-06, + "loss": 0.0028, + "step": 3873 + }, + { + "epoch": 1.4005784526391902, + "grad_norm": 0.17013330463615278, + "learning_rate": 2.2640739655321107e-06, + "loss": 0.0103, + "step": 3874 + }, + { + "epoch": 1.4009399855386842, + "grad_norm": 0.558732164045614, + "learning_rate": 2.2615725046729442e-06, + "loss": 0.0432, + "step": 3875 + }, + { + "epoch": 1.4013015184381779, + "grad_norm": 0.15425826024777342, + "learning_rate": 2.2590720225271866e-06, + "loss": 0.0115, + "step": 3876 + }, + { + "epoch": 1.4016630513376718, + "grad_norm": 0.8004768311369755, + "learning_rate": 2.2565725199885104e-06, + "loss": 0.1738, + "step": 3877 + }, + { + "epoch": 1.4020245842371657, + "grad_norm": 0.8972416366660914, + "learning_rate": 2.2540739979502356e-06, + "loss": 0.0391, + "step": 3878 + }, + { + "epoch": 1.4023861171366594, + "grad_norm": 0.2504080521097687, + "learning_rate": 2.2515764573053336e-06, + "loss": 0.0184, + "step": 3879 + }, + { + "epoch": 1.4027476500361533, + "grad_norm": 0.11967038005819018, + "learning_rate": 2.2490798989464262e-06, + "loss": 0.0092, + "step": 3880 + }, + { + "epoch": 1.4031091829356472, + "grad_norm": 0.23373687641246974, + "learning_rate": 2.2465843237657853e-06, + "loss": 0.0131, + "step": 3881 + }, + { + "epoch": 1.403470715835141, + "grad_norm": 0.6252828263317004, + "learning_rate": 2.2440897326553217e-06, + "loss": 0.0286, + "step": 3882 + }, + { + "epoch": 1.4038322487346349, + "grad_norm": 0.004941715056495786, + "learning_rate": 2.2415961265066083e-06, + "loss": 0.0, + "step": 3883 + }, + { + "epoch": 1.4041937816341288, + "grad_norm": 0.13290190518631834, + "learning_rate": 2.2391035062108575e-06, + "loss": 0.0131, + "step": 3884 + }, + { + "epoch": 1.4045553145336225, + "grad_norm": 0.13908302240786638, + "learning_rate": 2.2366118726589304e-06, + "loss": 0.0115, + "step": 3885 + }, + { + "epoch": 1.4049168474331164, + "grad_norm": 0.19148743003942925, + "learning_rate": 2.2341212267413364e-06, + "loss": 0.0146, + "step": 3886 + }, + { + "epoch": 1.4052783803326103, + "grad_norm": 1.7391802630446254, + "learning_rate": 2.231631569348233e-06, + "loss": 0.0525, + "step": 3887 + }, + { + "epoch": 1.405639913232104, + "grad_norm": 0.16642120196038232, + "learning_rate": 2.229142901369422e-06, + "loss": 0.0117, + "step": 3888 + }, + { + "epoch": 1.406001446131598, + "grad_norm": 1.235378326099626, + "learning_rate": 2.2266552236943515e-06, + "loss": 0.1367, + "step": 3889 + }, + { + "epoch": 1.4063629790310919, + "grad_norm": 0.3643849348961727, + "learning_rate": 2.2241685372121215e-06, + "loss": 0.0184, + "step": 3890 + }, + { + "epoch": 1.4067245119305856, + "grad_norm": 2.8432420930573645, + "learning_rate": 2.2216828428114695e-06, + "loss": 0.1191, + "step": 3891 + }, + { + "epoch": 1.4070860448300795, + "grad_norm": 0.21912374179200297, + "learning_rate": 2.2191981413807834e-06, + "loss": 0.0184, + "step": 3892 + }, + { + "epoch": 1.4074475777295734, + "grad_norm": 0.014477373751313987, + "learning_rate": 2.216714433808095e-06, + "loss": 0.0003, + "step": 3893 + }, + { + "epoch": 1.407809110629067, + "grad_norm": 0.09783328283744536, + "learning_rate": 2.21423172098108e-06, + "loss": 0.0081, + "step": 3894 + }, + { + "epoch": 1.408170643528561, + "grad_norm": 0.44444168643343157, + "learning_rate": 2.21175000378706e-06, + "loss": 0.0206, + "step": 3895 + }, + { + "epoch": 1.408532176428055, + "grad_norm": 0.04983789480816397, + "learning_rate": 2.209269283112999e-06, + "loss": 0.0017, + "step": 3896 + }, + { + "epoch": 1.4088937093275489, + "grad_norm": 0.15261508348222658, + "learning_rate": 2.206789559845511e-06, + "loss": 0.0045, + "step": 3897 + }, + { + "epoch": 1.4092552422270428, + "grad_norm": 1.2230035972170035, + "learning_rate": 2.2043108348708425e-06, + "loss": 0.0317, + "step": 3898 + }, + { + "epoch": 1.4096167751265365, + "grad_norm": 1.228260209634071, + "learning_rate": 2.2018331090748895e-06, + "loss": 0.063, + "step": 3899 + }, + { + "epoch": 1.4099783080260304, + "grad_norm": 0.19787212136252563, + "learning_rate": 2.1993563833431952e-06, + "loss": 0.0092, + "step": 3900 + }, + { + "epoch": 1.4103398409255243, + "grad_norm": 0.649639216897263, + "learning_rate": 2.1968806585609383e-06, + "loss": 0.0146, + "step": 3901 + }, + { + "epoch": 1.410701373825018, + "grad_norm": 0.11007816324257894, + "learning_rate": 2.194405935612943e-06, + "loss": 0.0027, + "step": 3902 + }, + { + "epoch": 1.411062906724512, + "grad_norm": 0.24319290303113328, + "learning_rate": 2.191932215383673e-06, + "loss": 0.0103, + "step": 3903 + }, + { + "epoch": 1.4114244396240059, + "grad_norm": 0.26789398866892816, + "learning_rate": 2.1894594987572375e-06, + "loss": 0.0165, + "step": 3904 + }, + { + "epoch": 1.4117859725234996, + "grad_norm": 0.15592510181229743, + "learning_rate": 2.186987786617384e-06, + "loss": 0.0051, + "step": 3905 + }, + { + "epoch": 1.4121475054229935, + "grad_norm": 0.05781448345606113, + "learning_rate": 2.1845170798474995e-06, + "loss": 0.0024, + "step": 3906 + }, + { + "epoch": 1.4125090383224874, + "grad_norm": 1.4076817149114007, + "learning_rate": 2.1820473793306207e-06, + "loss": 0.0752, + "step": 3907 + }, + { + "epoch": 1.4128705712219811, + "grad_norm": 0.0035797748015566205, + "learning_rate": 2.1795786859494116e-06, + "loss": 0.0001, + "step": 3908 + }, + { + "epoch": 1.413232104121475, + "grad_norm": 0.29140615764422645, + "learning_rate": 2.1771110005861836e-06, + "loss": 0.0081, + "step": 3909 + }, + { + "epoch": 1.413593637020969, + "grad_norm": 1.6247929613007681, + "learning_rate": 2.17464432412289e-06, + "loss": 0.2031, + "step": 3910 + }, + { + "epoch": 1.4139551699204627, + "grad_norm": 0.0006267332955158097, + "learning_rate": 2.172178657441118e-06, + "loss": 0.0, + "step": 3911 + }, + { + "epoch": 1.4143167028199566, + "grad_norm": 0.16778507559043127, + "learning_rate": 2.1697140014220973e-06, + "loss": 0.0092, + "step": 3912 + }, + { + "epoch": 1.4146782357194505, + "grad_norm": 0.10138972793754132, + "learning_rate": 2.1672503569466956e-06, + "loss": 0.0081, + "step": 3913 + }, + { + "epoch": 1.4150397686189442, + "grad_norm": 0.49929333281022953, + "learning_rate": 2.1647877248954184e-06, + "loss": 0.0131, + "step": 3914 + }, + { + "epoch": 1.4154013015184381, + "grad_norm": 2.0565934797810375, + "learning_rate": 2.1623261061484096e-06, + "loss": 0.1738, + "step": 3915 + }, + { + "epoch": 1.415762834417932, + "grad_norm": 0.7774187513086971, + "learning_rate": 2.15986550158545e-06, + "loss": 0.1641, + "step": 3916 + }, + { + "epoch": 1.4161243673174257, + "grad_norm": 0.05943900317492932, + "learning_rate": 2.1574059120859647e-06, + "loss": 0.0024, + "step": 3917 + }, + { + "epoch": 1.4164859002169197, + "grad_norm": 0.0906665525116782, + "learning_rate": 2.154947338529005e-06, + "loss": 0.0072, + "step": 3918 + }, + { + "epoch": 1.4168474331164136, + "grad_norm": 0.5209793435755445, + "learning_rate": 2.152489781793263e-06, + "loss": 0.0258, + "step": 3919 + }, + { + "epoch": 1.4172089660159075, + "grad_norm": 4.697734516470376, + "learning_rate": 2.1500332427570745e-06, + "loss": 0.2129, + "step": 3920 + }, + { + "epoch": 1.4175704989154014, + "grad_norm": 0.8594772008380079, + "learning_rate": 2.147577722298404e-06, + "loss": 0.1455, + "step": 3921 + }, + { + "epoch": 1.4179320318148951, + "grad_norm": 0.0013356191612224253, + "learning_rate": 2.1451232212948537e-06, + "loss": 0.0, + "step": 3922 + }, + { + "epoch": 1.418293564714389, + "grad_norm": 0.18227872719138535, + "learning_rate": 2.142669740623661e-06, + "loss": 0.0115, + "step": 3923 + }, + { + "epoch": 1.418655097613883, + "grad_norm": 2.72467081773899, + "learning_rate": 2.1402172811617e-06, + "loss": 0.3105, + "step": 3924 + }, + { + "epoch": 1.4190166305133767, + "grad_norm": 0.06469122937440962, + "learning_rate": 2.1377658437854795e-06, + "loss": 0.0017, + "step": 3925 + }, + { + "epoch": 1.4193781634128706, + "grad_norm": 1.8393255969855569, + "learning_rate": 2.1353154293711403e-06, + "loss": 0.2031, + "step": 3926 + }, + { + "epoch": 1.4197396963123645, + "grad_norm": 2.0962576911993396, + "learning_rate": 2.1328660387944663e-06, + "loss": 0.0957, + "step": 3927 + }, + { + "epoch": 1.4201012292118582, + "grad_norm": 0.4394339766514339, + "learning_rate": 2.1304176729308622e-06, + "loss": 0.0082, + "step": 3928 + }, + { + "epoch": 1.4204627621113521, + "grad_norm": 0.9421313434550135, + "learning_rate": 2.1279703326553754e-06, + "loss": 0.0354, + "step": 3929 + }, + { + "epoch": 1.420824295010846, + "grad_norm": 0.24211522790848755, + "learning_rate": 2.1255240188426863e-06, + "loss": 0.0165, + "step": 3930 + }, + { + "epoch": 1.4211858279103398, + "grad_norm": 0.3285759183262337, + "learning_rate": 2.123078732367107e-06, + "loss": 0.0072, + "step": 3931 + }, + { + "epoch": 1.4215473608098337, + "grad_norm": 0.1635116642315342, + "learning_rate": 2.120634474102581e-06, + "loss": 0.0072, + "step": 3932 + }, + { + "epoch": 1.4219088937093276, + "grad_norm": 0.20032423721883205, + "learning_rate": 2.1181912449226873e-06, + "loss": 0.0146, + "step": 3933 + }, + { + "epoch": 1.4222704266088213, + "grad_norm": 0.11292251178606613, + "learning_rate": 2.1157490457006337e-06, + "loss": 0.0103, + "step": 3934 + }, + { + "epoch": 1.4226319595083152, + "grad_norm": 0.09393600745926795, + "learning_rate": 2.113307877309263e-06, + "loss": 0.0072, + "step": 3935 + }, + { + "epoch": 1.4229934924078091, + "grad_norm": 0.831052005105655, + "learning_rate": 2.1108677406210453e-06, + "loss": 0.0286, + "step": 3936 + }, + { + "epoch": 1.4233550253073028, + "grad_norm": 0.9003250971715743, + "learning_rate": 2.1084286365080916e-06, + "loss": 0.1641, + "step": 3937 + }, + { + "epoch": 1.4237165582067968, + "grad_norm": 0.005021453552718101, + "learning_rate": 2.105990565842131e-06, + "loss": 0.0001, + "step": 3938 + }, + { + "epoch": 1.4240780911062907, + "grad_norm": 0.8311601023706722, + "learning_rate": 2.103553529494529e-06, + "loss": 0.0206, + "step": 3939 + }, + { + "epoch": 1.4244396240057844, + "grad_norm": 1.9543169167944399, + "learning_rate": 2.1011175283362866e-06, + "loss": 0.1113, + "step": 3940 + }, + { + "epoch": 1.4248011569052783, + "grad_norm": 0.20670570215715822, + "learning_rate": 2.098682563238028e-06, + "loss": 0.0092, + "step": 3941 + }, + { + "epoch": 1.4251626898047722, + "grad_norm": 0.0029248487206445417, + "learning_rate": 2.096248635070009e-06, + "loss": 0.0001, + "step": 3942 + }, + { + "epoch": 1.4255242227042662, + "grad_norm": 0.1439165185313958, + "learning_rate": 2.0938157447021146e-06, + "loss": 0.0103, + "step": 3943 + }, + { + "epoch": 1.42588575560376, + "grad_norm": 0.19330700454288907, + "learning_rate": 2.09138389300386e-06, + "loss": 0.0035, + "step": 3944 + }, + { + "epoch": 1.4262472885032538, + "grad_norm": 1.2804756951426006, + "learning_rate": 2.088953080844388e-06, + "loss": 0.1367, + "step": 3945 + }, + { + "epoch": 1.4266088214027477, + "grad_norm": 0.4514020983982897, + "learning_rate": 2.0865233090924693e-06, + "loss": 0.0229, + "step": 3946 + }, + { + "epoch": 1.4269703543022416, + "grad_norm": 0.07850273255540316, + "learning_rate": 2.084094578616508e-06, + "loss": 0.0027, + "step": 3947 + }, + { + "epoch": 1.4273318872017353, + "grad_norm": 0.110631442982652, + "learning_rate": 2.0816668902845276e-06, + "loss": 0.0092, + "step": 3948 + }, + { + "epoch": 1.4276934201012292, + "grad_norm": 0.24591787391715972, + "learning_rate": 2.0792402449641825e-06, + "loss": 0.0184, + "step": 3949 + }, + { + "epoch": 1.4280549530007232, + "grad_norm": 0.3976930837520884, + "learning_rate": 2.07681464352276e-06, + "loss": 0.0146, + "step": 3950 + }, + { + "epoch": 1.4284164859002169, + "grad_norm": 0.09625691801287548, + "learning_rate": 2.074390086827166e-06, + "loss": 0.0092, + "step": 3951 + }, + { + "epoch": 1.4287780187997108, + "grad_norm": 0.12356311360319373, + "learning_rate": 2.0719665757439382e-06, + "loss": 0.0072, + "step": 3952 + }, + { + "epoch": 1.4291395516992047, + "grad_norm": 1.004541417224121, + "learning_rate": 2.0695441111392385e-06, + "loss": 0.1553, + "step": 3953 + }, + { + "epoch": 1.4295010845986984, + "grad_norm": 0.1258733174350411, + "learning_rate": 2.067122693878854e-06, + "loss": 0.0092, + "step": 3954 + }, + { + "epoch": 1.4298626174981923, + "grad_norm": 0.8043470912495618, + "learning_rate": 2.0647023248282007e-06, + "loss": 0.0354, + "step": 3955 + }, + { + "epoch": 1.4302241503976862, + "grad_norm": 0.008762735277944937, + "learning_rate": 2.062283004852315e-06, + "loss": 0.0002, + "step": 3956 + }, + { + "epoch": 1.43058568329718, + "grad_norm": 0.7372795651368457, + "learning_rate": 2.059864734815867e-06, + "loss": 0.0432, + "step": 3957 + }, + { + "epoch": 1.4309472161966739, + "grad_norm": 0.10897232032068552, + "learning_rate": 2.0574475155831386e-06, + "loss": 0.0103, + "step": 3958 + }, + { + "epoch": 1.4313087490961678, + "grad_norm": 0.3081368972672996, + "learning_rate": 2.055031348018049e-06, + "loss": 0.0206, + "step": 3959 + }, + { + "epoch": 1.4316702819956615, + "grad_norm": 0.1590772117541198, + "learning_rate": 2.052616232984134e-06, + "loss": 0.0081, + "step": 3960 + }, + { + "epoch": 1.4320318148951554, + "grad_norm": 1.290903811767092, + "learning_rate": 2.050202171344556e-06, + "loss": 0.0432, + "step": 3961 + }, + { + "epoch": 1.4323933477946493, + "grad_norm": 8.068202402660594, + "learning_rate": 2.0477891639620984e-06, + "loss": 0.1191, + "step": 3962 + }, + { + "epoch": 1.432754880694143, + "grad_norm": 0.3475550917599119, + "learning_rate": 2.0453772116991693e-06, + "loss": 0.0184, + "step": 3963 + }, + { + "epoch": 1.433116413593637, + "grad_norm": 0.14085570573789682, + "learning_rate": 2.0429663154178046e-06, + "loss": 0.0103, + "step": 3964 + }, + { + "epoch": 1.4334779464931309, + "grad_norm": 0.0015317020831876547, + "learning_rate": 2.040556475979653e-06, + "loss": 0.0, + "step": 3965 + }, + { + "epoch": 1.4338394793926248, + "grad_norm": 1.9267353394859883, + "learning_rate": 2.038147694245991e-06, + "loss": 0.0889, + "step": 3966 + }, + { + "epoch": 1.4342010122921187, + "grad_norm": 0.19061931675057675, + "learning_rate": 2.035739971077721e-06, + "loss": 0.0103, + "step": 3967 + }, + { + "epoch": 1.4345625451916124, + "grad_norm": 0.4128643326989047, + "learning_rate": 2.0333333073353563e-06, + "loss": 0.0184, + "step": 3968 + }, + { + "epoch": 1.4349240780911063, + "grad_norm": 0.13418835478978572, + "learning_rate": 2.0309277038790433e-06, + "loss": 0.0115, + "step": 3969 + }, + { + "epoch": 1.4352856109906003, + "grad_norm": 1.1465370504869592, + "learning_rate": 2.0285231615685423e-06, + "loss": 0.1738, + "step": 3970 + }, + { + "epoch": 1.435647143890094, + "grad_norm": 0.2807706155710619, + "learning_rate": 2.0261196812632368e-06, + "loss": 0.0229, + "step": 3971 + }, + { + "epoch": 1.4360086767895879, + "grad_norm": 0.1986328505330048, + "learning_rate": 2.0237172638221287e-06, + "loss": 0.0164, + "step": 3972 + }, + { + "epoch": 1.4363702096890818, + "grad_norm": 1.3972614480990646, + "learning_rate": 2.021315910103841e-06, + "loss": 0.1113, + "step": 3973 + }, + { + "epoch": 1.4367317425885755, + "grad_norm": 1.6686452525862894, + "learning_rate": 2.0189156209666223e-06, + "loss": 0.0957, + "step": 3974 + }, + { + "epoch": 1.4370932754880694, + "grad_norm": 0.7503503566151705, + "learning_rate": 2.016516397268329e-06, + "loss": 0.1641, + "step": 3975 + }, + { + "epoch": 1.4374548083875633, + "grad_norm": 0.5445991453932955, + "learning_rate": 2.0141182398664445e-06, + "loss": 0.0131, + "step": 3976 + }, + { + "epoch": 1.437816341287057, + "grad_norm": 0.14029844721170956, + "learning_rate": 2.011721149618073e-06, + "loss": 0.0056, + "step": 3977 + }, + { + "epoch": 1.438177874186551, + "grad_norm": 1.5721374382384297, + "learning_rate": 2.0093251273799313e-06, + "loss": 0.0432, + "step": 3978 + }, + { + "epoch": 1.4385394070860449, + "grad_norm": 0.11455914165677165, + "learning_rate": 2.006930174008358e-06, + "loss": 0.0035, + "step": 3979 + }, + { + "epoch": 1.4389009399855386, + "grad_norm": 0.05064639040897442, + "learning_rate": 2.004536290359309e-06, + "loss": 0.0017, + "step": 3980 + }, + { + "epoch": 1.4392624728850325, + "grad_norm": 0.1617678678080828, + "learning_rate": 2.002143477288358e-06, + "loss": 0.0115, + "step": 3981 + }, + { + "epoch": 1.4396240057845264, + "grad_norm": 2.2455314373907758, + "learning_rate": 1.999751735650695e-06, + "loss": 0.1641, + "step": 3982 + }, + { + "epoch": 1.4399855386840201, + "grad_norm": 0.683069900203842, + "learning_rate": 1.997361066301127e-06, + "loss": 0.0286, + "step": 3983 + }, + { + "epoch": 1.440347071583514, + "grad_norm": 0.505326142248209, + "learning_rate": 1.994971470094084e-06, + "loss": 0.0255, + "step": 3984 + }, + { + "epoch": 1.440708604483008, + "grad_norm": 0.18966271545772914, + "learning_rate": 1.9925829478836013e-06, + "loss": 0.0117, + "step": 3985 + }, + { + "epoch": 1.4410701373825017, + "grad_norm": 0.05567930742050673, + "learning_rate": 1.990195500523337e-06, + "loss": 0.0013, + "step": 3986 + }, + { + "epoch": 1.4414316702819956, + "grad_norm": 0.20989118990761269, + "learning_rate": 1.9878091288665667e-06, + "loss": 0.0131, + "step": 3987 + }, + { + "epoch": 1.4417932031814895, + "grad_norm": 0.003172546148911414, + "learning_rate": 1.9854238337661786e-06, + "loss": 0.0001, + "step": 3988 + }, + { + "epoch": 1.4421547360809834, + "grad_norm": 0.5229086910514201, + "learning_rate": 1.983039616074676e-06, + "loss": 0.0255, + "step": 3989 + }, + { + "epoch": 1.4425162689804774, + "grad_norm": 0.5583517614899376, + "learning_rate": 1.980656476644178e-06, + "loss": 0.0255, + "step": 3990 + }, + { + "epoch": 1.442877801879971, + "grad_norm": 0.3738410978267363, + "learning_rate": 1.978274416326418e-06, + "loss": 0.0165, + "step": 3991 + }, + { + "epoch": 1.443239334779465, + "grad_norm": 3.1968921501436474, + "learning_rate": 1.9758934359727432e-06, + "loss": 0.1738, + "step": 3992 + }, + { + "epoch": 1.443600867678959, + "grad_norm": 0.005579349919473901, + "learning_rate": 1.973513536434115e-06, + "loss": 0.0001, + "step": 3993 + }, + { + "epoch": 1.4439624005784526, + "grad_norm": 0.11071754537576692, + "learning_rate": 1.971134718561114e-06, + "loss": 0.0044, + "step": 3994 + }, + { + "epoch": 1.4443239334779465, + "grad_norm": 1.4012388063654915, + "learning_rate": 1.968756983203923e-06, + "loss": 0.0752, + "step": 3995 + }, + { + "epoch": 1.4446854663774404, + "grad_norm": 0.09306944518692938, + "learning_rate": 1.9663803312123455e-06, + "loss": 0.0039, + "step": 3996 + }, + { + "epoch": 1.4450469992769341, + "grad_norm": 0.5185793921621615, + "learning_rate": 1.964004763435799e-06, + "loss": 0.0165, + "step": 3997 + }, + { + "epoch": 1.445408532176428, + "grad_norm": 1.133995452516062, + "learning_rate": 1.961630280723309e-06, + "loss": 0.0752, + "step": 3998 + }, + { + "epoch": 1.445770065075922, + "grad_norm": 0.059356239621809845, + "learning_rate": 1.9592568839235154e-06, + "loss": 0.0013, + "step": 3999 + }, + { + "epoch": 1.4461315979754157, + "grad_norm": 0.1926156255355546, + "learning_rate": 1.9568845738846697e-06, + "loss": 0.0165, + "step": 4000 + } + ], + "logging_steps": 1.0, + "max_steps": 5532, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1214326505472000.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}