diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,63034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1264584434721976, + "eval_steps": 500, + "global_step": 90000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.4050938163577509e-05, + "grad_norm": 2.420112133026123, + "learning_rate": 3.793733314598848e-08, + "loss": 10.5041, + "step": 10 + }, + { + "epoch": 2.8101876327155018e-05, + "grad_norm": 2.226503849029541, + "learning_rate": 8.008992553042011e-08, + "loss": 10.4971, + "step": 20 + }, + { + "epoch": 4.215281449073253e-05, + "grad_norm": 2.459202527999878, + "learning_rate": 1.2224251791485176e-07, + "loss": 10.5017, + "step": 30 + }, + { + "epoch": 5.6203752654310035e-05, + "grad_norm": 2.5928807258605957, + "learning_rate": 1.6439511029928338e-07, + "loss": 10.5044, + "step": 40 + }, + { + "epoch": 7.025469081788755e-05, + "grad_norm": 2.4456496238708496, + "learning_rate": 2.0654770268371502e-07, + "loss": 10.4912, + "step": 50 + }, + { + "epoch": 8.430562898146506e-05, + "grad_norm": 3.4608006477355957, + "learning_rate": 2.4870029506814667e-07, + "loss": 10.5029, + "step": 60 + }, + { + "epoch": 9.835656714504256e-05, + "grad_norm": 2.387044668197632, + "learning_rate": 2.908528874525783e-07, + "loss": 10.4957, + "step": 70 + }, + { + "epoch": 0.00011240750530862007, + "grad_norm": 2.4745988845825195, + "learning_rate": 3.3300547983700995e-07, + "loss": 10.503, + "step": 80 + }, + { + "epoch": 0.00012645844347219758, + "grad_norm": 3.1475226879119873, + "learning_rate": 3.751580722214416e-07, + "loss": 10.4968, + "step": 90 + }, + { + "epoch": 0.0001405093816357751, + "grad_norm": 2.5408034324645996, + "learning_rate": 4.173106646058732e-07, + "loss": 10.4923, + "step": 100 + }, + { + "epoch": 0.0001545603197993526, + "grad_norm": 
2.2912540435791016, + "learning_rate": 4.594632569903049e-07, + "loss": 10.4942, + "step": 110 + }, + { + "epoch": 0.0001686112579629301, + "grad_norm": 2.3079867362976074, + "learning_rate": 5.016158493747365e-07, + "loss": 10.4897, + "step": 120 + }, + { + "epoch": 0.00018266219612650763, + "grad_norm": 2.3974714279174805, + "learning_rate": 5.437684417591682e-07, + "loss": 10.4806, + "step": 130 + }, + { + "epoch": 0.00019671313429008513, + "grad_norm": 2.180088758468628, + "learning_rate": 5.859210341435998e-07, + "loss": 10.4768, + "step": 140 + }, + { + "epoch": 0.00021076407245366265, + "grad_norm": 2.415879249572754, + "learning_rate": 6.280736265280315e-07, + "loss": 10.4835, + "step": 150 + }, + { + "epoch": 0.00022481501061724014, + "grad_norm": 2.506563425064087, + "learning_rate": 6.70226218912463e-07, + "loss": 10.4767, + "step": 160 + }, + { + "epoch": 0.00023886594878081766, + "grad_norm": 2.7735683917999268, + "learning_rate": 7.123788112968946e-07, + "loss": 10.4738, + "step": 170 + }, + { + "epoch": 0.00025291688694439516, + "grad_norm": 2.5280656814575195, + "learning_rate": 7.545314036813263e-07, + "loss": 10.4746, + "step": 180 + }, + { + "epoch": 0.0002669678251079727, + "grad_norm": 2.373305559158325, + "learning_rate": 7.966839960657579e-07, + "loss": 10.4709, + "step": 190 + }, + { + "epoch": 0.0002810187632715502, + "grad_norm": 2.3311121463775635, + "learning_rate": 8.388365884501895e-07, + "loss": 10.4571, + "step": 200 + }, + { + "epoch": 0.0002950697014351277, + "grad_norm": 2.61584210395813, + "learning_rate": 8.809891808346213e-07, + "loss": 10.4502, + "step": 210 + }, + { + "epoch": 0.0003091206395987052, + "grad_norm": 2.391726016998291, + "learning_rate": 9.231417732190529e-07, + "loss": 10.4474, + "step": 220 + }, + { + "epoch": 0.0003231715777622827, + "grad_norm": 2.519890785217285, + "learning_rate": 9.652943656034844e-07, + "loss": 10.4378, + "step": 230 + }, + { + "epoch": 0.0003372225159258602, + "grad_norm": 
2.557464599609375, + "learning_rate": 1.0074469579879162e-06, + "loss": 10.4381, + "step": 240 + }, + { + "epoch": 0.00035127345408943775, + "grad_norm": 2.5819175243377686, + "learning_rate": 1.0495995503723478e-06, + "loss": 10.4289, + "step": 250 + }, + { + "epoch": 0.00036532439225301527, + "grad_norm": 2.536975622177124, + "learning_rate": 1.0917521427567794e-06, + "loss": 10.4208, + "step": 260 + }, + { + "epoch": 0.00037937533041659273, + "grad_norm": 2.575305700302124, + "learning_rate": 1.1339047351412112e-06, + "loss": 10.4069, + "step": 270 + }, + { + "epoch": 0.00039342626858017025, + "grad_norm": 2.553637981414795, + "learning_rate": 1.1760573275256428e-06, + "loss": 10.3983, + "step": 280 + }, + { + "epoch": 0.0004074772067437478, + "grad_norm": 2.40779709815979, + "learning_rate": 1.2182099199100744e-06, + "loss": 10.3975, + "step": 290 + }, + { + "epoch": 0.0004215281449073253, + "grad_norm": 2.8702962398529053, + "learning_rate": 1.260362512294506e-06, + "loss": 10.3814, + "step": 300 + }, + { + "epoch": 0.0004355790830709028, + "grad_norm": 2.869860887527466, + "learning_rate": 1.3025151046789375e-06, + "loss": 10.3902, + "step": 310 + }, + { + "epoch": 0.0004496300212344803, + "grad_norm": 2.448800563812256, + "learning_rate": 1.3446676970633693e-06, + "loss": 10.3575, + "step": 320 + }, + { + "epoch": 0.0004636809593980578, + "grad_norm": 2.3154518604278564, + "learning_rate": 1.386820289447801e-06, + "loss": 10.3371, + "step": 330 + }, + { + "epoch": 0.0004777318975616353, + "grad_norm": 2.516812562942505, + "learning_rate": 1.4289728818322327e-06, + "loss": 10.3448, + "step": 340 + }, + { + "epoch": 0.0004917828357252128, + "grad_norm": 2.7383711338043213, + "learning_rate": 1.4711254742166643e-06, + "loss": 10.3181, + "step": 350 + }, + { + "epoch": 0.0005058337738887903, + "grad_norm": 2.346808910369873, + "learning_rate": 1.513278066601096e-06, + "loss": 10.2987, + "step": 360 + }, + { + "epoch": 0.0005198847120523679, + "grad_norm": 
2.4467155933380127, + "learning_rate": 1.5554306589855275e-06, + "loss": 10.2587, + "step": 370 + }, + { + "epoch": 0.0005339356502159454, + "grad_norm": 2.229548454284668, + "learning_rate": 1.597583251369959e-06, + "loss": 10.2929, + "step": 380 + }, + { + "epoch": 0.0005479865883795228, + "grad_norm": 2.3183753490448, + "learning_rate": 1.6397358437543907e-06, + "loss": 10.231, + "step": 390 + }, + { + "epoch": 0.0005620375265431004, + "grad_norm": 2.497016668319702, + "learning_rate": 1.6818884361388223e-06, + "loss": 10.2457, + "step": 400 + }, + { + "epoch": 0.0005760884647066779, + "grad_norm": 2.5405783653259277, + "learning_rate": 1.724041028523254e-06, + "loss": 10.2305, + "step": 410 + }, + { + "epoch": 0.0005901394028702554, + "grad_norm": 2.377484083175659, + "learning_rate": 1.7661936209076857e-06, + "loss": 10.176, + "step": 420 + }, + { + "epoch": 0.0006041903410338329, + "grad_norm": 1.8428674936294556, + "learning_rate": 1.8083462132921172e-06, + "loss": 10.1656, + "step": 430 + }, + { + "epoch": 0.0006182412791974104, + "grad_norm": 2.1964094638824463, + "learning_rate": 1.8504988056765488e-06, + "loss": 10.1992, + "step": 440 + }, + { + "epoch": 0.0006322922173609879, + "grad_norm": 2.0467209815979004, + "learning_rate": 1.8926513980609804e-06, + "loss": 10.1928, + "step": 450 + }, + { + "epoch": 0.0006463431555245654, + "grad_norm": 2.0101869106292725, + "learning_rate": 1.9348039904454122e-06, + "loss": 10.1389, + "step": 460 + }, + { + "epoch": 0.000660394093688143, + "grad_norm": 1.808905005455017, + "learning_rate": 1.976956582829844e-06, + "loss": 10.1005, + "step": 470 + }, + { + "epoch": 0.0006744450318517205, + "grad_norm": 2.215162515640259, + "learning_rate": 2.0191091752142754e-06, + "loss": 10.0665, + "step": 480 + }, + { + "epoch": 0.0006884959700152979, + "grad_norm": 1.7877708673477173, + "learning_rate": 2.061261767598707e-06, + "loss": 10.0854, + "step": 490 + }, + { + "epoch": 0.0007025469081788755, + "grad_norm": 
2.141770124435425, + "learning_rate": 2.1034143599831386e-06, + "loss": 10.04, + "step": 500 + }, + { + "epoch": 0.000716597846342453, + "grad_norm": 1.706669807434082, + "learning_rate": 2.14556695236757e-06, + "loss": 10.0788, + "step": 510 + }, + { + "epoch": 0.0007306487845060305, + "grad_norm": 1.7067891359329224, + "learning_rate": 2.1877195447520018e-06, + "loss": 9.9621, + "step": 520 + }, + { + "epoch": 0.000744699722669608, + "grad_norm": 1.784010410308838, + "learning_rate": 2.2298721371364338e-06, + "loss": 9.97, + "step": 530 + }, + { + "epoch": 0.0007587506608331855, + "grad_norm": 2.3449370861053467, + "learning_rate": 2.2720247295208654e-06, + "loss": 9.9688, + "step": 540 + }, + { + "epoch": 0.000772801598996763, + "grad_norm": 1.652551293373108, + "learning_rate": 2.314177321905297e-06, + "loss": 9.9469, + "step": 550 + }, + { + "epoch": 0.0007868525371603405, + "grad_norm": 1.5717960596084595, + "learning_rate": 2.356329914289729e-06, + "loss": 9.9077, + "step": 560 + }, + { + "epoch": 0.0008009034753239181, + "grad_norm": 1.4391708374023438, + "learning_rate": 2.3984825066741606e-06, + "loss": 9.9488, + "step": 570 + }, + { + "epoch": 0.0008149544134874955, + "grad_norm": 1.4368293285369873, + "learning_rate": 2.440635099058592e-06, + "loss": 9.848, + "step": 580 + }, + { + "epoch": 0.000829005351651073, + "grad_norm": 1.9761817455291748, + "learning_rate": 2.4827876914430237e-06, + "loss": 9.8789, + "step": 590 + }, + { + "epoch": 0.0008430562898146506, + "grad_norm": 1.1829458475112915, + "learning_rate": 2.5249402838274553e-06, + "loss": 9.8254, + "step": 600 + }, + { + "epoch": 0.0008571072279782281, + "grad_norm": 1.5284178256988525, + "learning_rate": 2.567092876211887e-06, + "loss": 9.8105, + "step": 610 + }, + { + "epoch": 0.0008711581661418056, + "grad_norm": 1.3892128467559814, + "learning_rate": 2.6092454685963185e-06, + "loss": 9.8078, + "step": 620 + }, + { + "epoch": 0.0008852091043053831, + "grad_norm": 1.2729390859603882, + 
"learning_rate": 2.65139806098075e-06, + "loss": 9.7646, + "step": 630 + }, + { + "epoch": 0.0008992600424689606, + "grad_norm": 1.1548229455947876, + "learning_rate": 2.6935506533651817e-06, + "loss": 9.7455, + "step": 640 + }, + { + "epoch": 0.0009133109806325381, + "grad_norm": 1.2774633169174194, + "learning_rate": 2.7357032457496133e-06, + "loss": 9.74, + "step": 650 + }, + { + "epoch": 0.0009273619187961156, + "grad_norm": 1.1594111919403076, + "learning_rate": 2.777855838134045e-06, + "loss": 9.7456, + "step": 660 + }, + { + "epoch": 0.0009414128569596932, + "grad_norm": 1.22251296043396, + "learning_rate": 2.8200084305184765e-06, + "loss": 9.7187, + "step": 670 + }, + { + "epoch": 0.0009554637951232706, + "grad_norm": 1.4419326782226562, + "learning_rate": 2.8621610229029085e-06, + "loss": 9.6953, + "step": 680 + }, + { + "epoch": 0.0009695147332868481, + "grad_norm": 1.0429575443267822, + "learning_rate": 2.90431361528734e-06, + "loss": 9.608, + "step": 690 + }, + { + "epoch": 0.0009835656714504257, + "grad_norm": 1.1225723028182983, + "learning_rate": 2.9464662076717717e-06, + "loss": 9.604, + "step": 700 + }, + { + "epoch": 0.0009976166096140032, + "grad_norm": 1.0744000673294067, + "learning_rate": 2.9886188000562033e-06, + "loss": 9.61, + "step": 710 + }, + { + "epoch": 0.0010116675477775806, + "grad_norm": 1.0817335844039917, + "learning_rate": 3.030771392440635e-06, + "loss": 9.533, + "step": 720 + }, + { + "epoch": 0.001025718485941158, + "grad_norm": 1.5660817623138428, + "learning_rate": 3.0729239848250664e-06, + "loss": 9.5771, + "step": 730 + }, + { + "epoch": 0.0010397694241047358, + "grad_norm": 1.141122579574585, + "learning_rate": 3.115076577209498e-06, + "loss": 9.5357, + "step": 740 + }, + { + "epoch": 0.0010538203622683132, + "grad_norm": 0.990804135799408, + "learning_rate": 3.1572291695939296e-06, + "loss": 9.5077, + "step": 750 + }, + { + "epoch": 0.0010678713004318907, + "grad_norm": 0.9177122116088867, + "learning_rate": 
3.199381761978361e-06, + "loss": 9.4705, + "step": 760 + }, + { + "epoch": 0.0010819222385954682, + "grad_norm": 1.214892864227295, + "learning_rate": 3.241534354362793e-06, + "loss": 9.4865, + "step": 770 + }, + { + "epoch": 0.0010959731767590456, + "grad_norm": 0.943670392036438, + "learning_rate": 3.2836869467472244e-06, + "loss": 9.4527, + "step": 780 + }, + { + "epoch": 0.0011100241149226233, + "grad_norm": 1.3754514455795288, + "learning_rate": 3.325839539131656e-06, + "loss": 9.4666, + "step": 790 + }, + { + "epoch": 0.0011240750530862008, + "grad_norm": 1.0014880895614624, + "learning_rate": 3.3679921315160876e-06, + "loss": 9.4266, + "step": 800 + }, + { + "epoch": 0.0011381259912497783, + "grad_norm": 1.1103196144104004, + "learning_rate": 3.41014472390052e-06, + "loss": 9.4199, + "step": 810 + }, + { + "epoch": 0.0011521769294133557, + "grad_norm": 0.8306254148483276, + "learning_rate": 3.4522973162849516e-06, + "loss": 9.396, + "step": 820 + }, + { + "epoch": 0.0011662278675769332, + "grad_norm": 1.3354564905166626, + "learning_rate": 3.494449908669383e-06, + "loss": 9.4007, + "step": 830 + }, + { + "epoch": 0.0011802788057405109, + "grad_norm": 1.1535601615905762, + "learning_rate": 3.5366025010538148e-06, + "loss": 9.3241, + "step": 840 + }, + { + "epoch": 0.0011943297439040883, + "grad_norm": 0.8198082447052002, + "learning_rate": 3.5787550934382464e-06, + "loss": 9.3004, + "step": 850 + }, + { + "epoch": 0.0012083806820676658, + "grad_norm": 0.9945477247238159, + "learning_rate": 3.620907685822678e-06, + "loss": 9.2414, + "step": 860 + }, + { + "epoch": 0.0012224316202312433, + "grad_norm": 0.842752993106842, + "learning_rate": 3.6630602782071095e-06, + "loss": 9.2984, + "step": 870 + }, + { + "epoch": 0.0012364825583948207, + "grad_norm": 0.8242266774177551, + "learning_rate": 3.705212870591541e-06, + "loss": 9.2572, + "step": 880 + }, + { + "epoch": 0.0012505334965583984, + "grad_norm": 0.8884344696998596, + "learning_rate": 
3.7473654629759727e-06, + "loss": 9.2445, + "step": 890 + }, + { + "epoch": 0.0012645844347219759, + "grad_norm": 0.8368833065032959, + "learning_rate": 3.7895180553604043e-06, + "loss": 9.254, + "step": 900 + }, + { + "epoch": 0.0012786353728855534, + "grad_norm": 0.9276847243309021, + "learning_rate": 3.831670647744836e-06, + "loss": 9.2425, + "step": 910 + }, + { + "epoch": 0.0012926863110491308, + "grad_norm": 0.827470064163208, + "learning_rate": 3.873823240129267e-06, + "loss": 9.2212, + "step": 920 + }, + { + "epoch": 0.0013067372492127083, + "grad_norm": 0.9183015823364258, + "learning_rate": 3.915975832513699e-06, + "loss": 9.1258, + "step": 930 + }, + { + "epoch": 0.001320788187376286, + "grad_norm": 0.8526141047477722, + "learning_rate": 3.958128424898131e-06, + "loss": 9.1636, + "step": 940 + }, + { + "epoch": 0.0013348391255398634, + "grad_norm": 0.9103283286094666, + "learning_rate": 4.000281017282562e-06, + "loss": 9.1574, + "step": 950 + }, + { + "epoch": 0.001348890063703441, + "grad_norm": 0.8416949510574341, + "learning_rate": 4.042433609666994e-06, + "loss": 9.1176, + "step": 960 + }, + { + "epoch": 0.0013629410018670184, + "grad_norm": 0.8522294759750366, + "learning_rate": 4.084586202051426e-06, + "loss": 9.0875, + "step": 970 + }, + { + "epoch": 0.0013769919400305958, + "grad_norm": 0.8377172946929932, + "learning_rate": 4.1267387944358575e-06, + "loss": 9.0616, + "step": 980 + }, + { + "epoch": 0.0013910428781941735, + "grad_norm": 0.7883292436599731, + "learning_rate": 4.1688913868202895e-06, + "loss": 9.0931, + "step": 990 + }, + { + "epoch": 0.001405093816357751, + "grad_norm": 1.2264714241027832, + "learning_rate": 4.211043979204721e-06, + "loss": 9.0459, + "step": 1000 + }, + { + "epoch": 0.0014191447545213285, + "grad_norm": 0.8207995295524597, + "learning_rate": 4.253196571589153e-06, + "loss": 9.0147, + "step": 1010 + }, + { + "epoch": 0.001433195692684906, + "grad_norm": 0.751477062702179, + "learning_rate": 4.295349163973584e-06, + 
"loss": 9.0429, + "step": 1020 + }, + { + "epoch": 0.0014472466308484834, + "grad_norm": 0.7454262971878052, + "learning_rate": 4.337501756358016e-06, + "loss": 9.0418, + "step": 1030 + }, + { + "epoch": 0.001461297569012061, + "grad_norm": 0.7283434271812439, + "learning_rate": 4.379654348742447e-06, + "loss": 9.0124, + "step": 1040 + }, + { + "epoch": 0.0014753485071756385, + "grad_norm": 0.8492335677146912, + "learning_rate": 4.421806941126879e-06, + "loss": 9.014, + "step": 1050 + }, + { + "epoch": 0.001489399445339216, + "grad_norm": 0.7394980192184448, + "learning_rate": 4.463959533511311e-06, + "loss": 9.0262, + "step": 1060 + }, + { + "epoch": 0.0015034503835027935, + "grad_norm": 0.8477874398231506, + "learning_rate": 4.506112125895742e-06, + "loss": 8.9394, + "step": 1070 + }, + { + "epoch": 0.001517501321666371, + "grad_norm": 0.7641156315803528, + "learning_rate": 4.548264718280174e-06, + "loss": 8.9173, + "step": 1080 + }, + { + "epoch": 0.0015315522598299486, + "grad_norm": 0.7645962834358215, + "learning_rate": 4.590417310664605e-06, + "loss": 8.897, + "step": 1090 + }, + { + "epoch": 0.001545603197993526, + "grad_norm": 0.7267779111862183, + "learning_rate": 4.632569903049037e-06, + "loss": 8.9216, + "step": 1100 + }, + { + "epoch": 0.0015596541361571035, + "grad_norm": 0.8410118818283081, + "learning_rate": 4.6747224954334686e-06, + "loss": 8.8682, + "step": 1110 + }, + { + "epoch": 0.001573705074320681, + "grad_norm": 0.7551319003105164, + "learning_rate": 4.7168750878179006e-06, + "loss": 8.8911, + "step": 1120 + }, + { + "epoch": 0.0015877560124842585, + "grad_norm": 1.4869343042373657, + "learning_rate": 4.759027680202332e-06, + "loss": 8.8781, + "step": 1130 + }, + { + "epoch": 0.0016018069506478362, + "grad_norm": 0.6931923627853394, + "learning_rate": 4.801180272586764e-06, + "loss": 8.8261, + "step": 1140 + }, + { + "epoch": 0.0016158578888114136, + "grad_norm": 0.7531590461730957, + "learning_rate": 4.843332864971195e-06, + "loss": 8.8656, 
+ "step": 1150 + }, + { + "epoch": 0.001629908826974991, + "grad_norm": 0.862003743648529, + "learning_rate": 4.885485457355627e-06, + "loss": 8.8532, + "step": 1160 + }, + { + "epoch": 0.0016439597651385686, + "grad_norm": 0.6877137422561646, + "learning_rate": 4.927638049740058e-06, + "loss": 8.8643, + "step": 1170 + }, + { + "epoch": 0.001658010703302146, + "grad_norm": 0.6912798881530762, + "learning_rate": 4.96979064212449e-06, + "loss": 8.8212, + "step": 1180 + }, + { + "epoch": 0.0016720616414657237, + "grad_norm": 0.7850083708763123, + "learning_rate": 5.011943234508921e-06, + "loss": 8.8194, + "step": 1190 + }, + { + "epoch": 0.0016861125796293012, + "grad_norm": 0.7854230999946594, + "learning_rate": 5.054095826893353e-06, + "loss": 8.7891, + "step": 1200 + }, + { + "epoch": 0.0017001635177928786, + "grad_norm": 0.6913483142852783, + "learning_rate": 5.0962484192777845e-06, + "loss": 8.78, + "step": 1210 + }, + { + "epoch": 0.0017142144559564561, + "grad_norm": 0.881175696849823, + "learning_rate": 5.1384010116622165e-06, + "loss": 8.7337, + "step": 1220 + }, + { + "epoch": 0.0017282653941200336, + "grad_norm": 2.36761736869812, + "learning_rate": 5.180553604046648e-06, + "loss": 8.7539, + "step": 1230 + }, + { + "epoch": 0.0017423163322836113, + "grad_norm": 0.7986196875572205, + "learning_rate": 5.2227061964310805e-06, + "loss": 8.7183, + "step": 1240 + }, + { + "epoch": 0.0017563672704471887, + "grad_norm": 0.6862300038337708, + "learning_rate": 5.2648587888155125e-06, + "loss": 8.6992, + "step": 1250 + }, + { + "epoch": 0.0017704182086107662, + "grad_norm": 0.727844774723053, + "learning_rate": 5.307011381199944e-06, + "loss": 8.6963, + "step": 1260 + }, + { + "epoch": 0.0017844691467743437, + "grad_norm": 0.6336658596992493, + "learning_rate": 5.349163973584376e-06, + "loss": 8.7178, + "step": 1270 + }, + { + "epoch": 0.0017985200849379211, + "grad_norm": 0.6945951581001282, + "learning_rate": 5.391316565968807e-06, + "loss": 8.6921, + "step": 1280 + 
}, + { + "epoch": 0.0018125710231014988, + "grad_norm": 1.3838273286819458, + "learning_rate": 5.433469158353239e-06, + "loss": 8.6852, + "step": 1290 + }, + { + "epoch": 0.0018266219612650763, + "grad_norm": 1.8115507364273071, + "learning_rate": 5.47562175073767e-06, + "loss": 8.7111, + "step": 1300 + }, + { + "epoch": 0.0018406728994286537, + "grad_norm": 0.7600476741790771, + "learning_rate": 5.517774343122102e-06, + "loss": 8.6884, + "step": 1310 + }, + { + "epoch": 0.0018547238375922312, + "grad_norm": 0.6664397716522217, + "learning_rate": 5.559926935506533e-06, + "loss": 8.636, + "step": 1320 + }, + { + "epoch": 0.0018687747757558087, + "grad_norm": 0.6572439074516296, + "learning_rate": 5.602079527890965e-06, + "loss": 8.61, + "step": 1330 + }, + { + "epoch": 0.0018828257139193864, + "grad_norm": 0.6541756987571716, + "learning_rate": 5.644232120275396e-06, + "loss": 8.613, + "step": 1340 + }, + { + "epoch": 0.0018968766520829638, + "grad_norm": 0.7731262445449829, + "learning_rate": 5.686384712659828e-06, + "loss": 8.5664, + "step": 1350 + }, + { + "epoch": 0.0019109275902465413, + "grad_norm": 0.8053380846977234, + "learning_rate": 5.72853730504426e-06, + "loss": 8.6225, + "step": 1360 + }, + { + "epoch": 0.0019249785284101188, + "grad_norm": 0.6197734475135803, + "learning_rate": 5.770689897428692e-06, + "loss": 8.564, + "step": 1370 + }, + { + "epoch": 0.0019390294665736962, + "grad_norm": 0.623519241809845, + "learning_rate": 5.812842489813123e-06, + "loss": 8.5342, + "step": 1380 + }, + { + "epoch": 0.0019530804047372737, + "grad_norm": 0.9381340742111206, + "learning_rate": 5.854995082197555e-06, + "loss": 8.5497, + "step": 1390 + }, + { + "epoch": 0.0019671313429008514, + "grad_norm": 0.677198052406311, + "learning_rate": 5.897147674581986e-06, + "loss": 8.5417, + "step": 1400 + }, + { + "epoch": 0.001981182281064429, + "grad_norm": 0.6278719305992126, + "learning_rate": 5.939300266966418e-06, + "loss": 8.5709, + "step": 1410 + }, + { + "epoch": 
0.0019952332192280063, + "grad_norm": 0.6248645782470703, + "learning_rate": 5.981452859350849e-06, + "loss": 8.5463, + "step": 1420 + }, + { + "epoch": 0.0020092841573915838, + "grad_norm": 0.6436347365379333, + "learning_rate": 6.023605451735281e-06, + "loss": 8.4893, + "step": 1430 + }, + { + "epoch": 0.0020233350955551612, + "grad_norm": 0.6340057849884033, + "learning_rate": 6.065758044119712e-06, + "loss": 8.5257, + "step": 1440 + }, + { + "epoch": 0.0020373860337187387, + "grad_norm": 0.715142548084259, + "learning_rate": 6.107910636504144e-06, + "loss": 8.4914, + "step": 1450 + }, + { + "epoch": 0.002051436971882316, + "grad_norm": 0.7200644612312317, + "learning_rate": 6.1500632288885755e-06, + "loss": 8.4823, + "step": 1460 + }, + { + "epoch": 0.002065487910045894, + "grad_norm": 0.8068726062774658, + "learning_rate": 6.1922158212730075e-06, + "loss": 8.4923, + "step": 1470 + }, + { + "epoch": 0.0020795388482094715, + "grad_norm": 0.6273617744445801, + "learning_rate": 6.2343684136574395e-06, + "loss": 8.4483, + "step": 1480 + }, + { + "epoch": 0.002093589786373049, + "grad_norm": 0.6604838967323303, + "learning_rate": 6.2765210060418715e-06, + "loss": 8.4753, + "step": 1490 + }, + { + "epoch": 0.0021076407245366265, + "grad_norm": 0.6091654300689697, + "learning_rate": 6.318673598426303e-06, + "loss": 8.4524, + "step": 1500 + }, + { + "epoch": 0.002121691662700204, + "grad_norm": 0.7857375144958496, + "learning_rate": 6.360826190810735e-06, + "loss": 8.4062, + "step": 1510 + }, + { + "epoch": 0.0021357426008637814, + "grad_norm": 0.6728093028068542, + "learning_rate": 6.402978783195166e-06, + "loss": 8.4559, + "step": 1520 + }, + { + "epoch": 0.002149793539027359, + "grad_norm": 0.6304994821548462, + "learning_rate": 6.445131375579598e-06, + "loss": 8.4665, + "step": 1530 + }, + { + "epoch": 0.0021638444771909363, + "grad_norm": 0.6075509786605835, + "learning_rate": 6.48728396796403e-06, + "loss": 8.4142, + "step": 1540 + }, + { + "epoch": 
0.002177895415354514, + "grad_norm": 0.6211352348327637, + "learning_rate": 6.529436560348461e-06, + "loss": 8.4188, + "step": 1550 + }, + { + "epoch": 0.0021919463535180913, + "grad_norm": 0.7409493327140808, + "learning_rate": 6.571589152732893e-06, + "loss": 8.3903, + "step": 1560 + }, + { + "epoch": 0.002205997291681669, + "grad_norm": 0.6649135947227478, + "learning_rate": 6.613741745117324e-06, + "loss": 8.3788, + "step": 1570 + }, + { + "epoch": 0.0022200482298452466, + "grad_norm": 0.6305505037307739, + "learning_rate": 6.655894337501756e-06, + "loss": 8.3711, + "step": 1580 + }, + { + "epoch": 0.002234099168008824, + "grad_norm": 0.5979957580566406, + "learning_rate": 6.698046929886187e-06, + "loss": 8.3786, + "step": 1590 + }, + { + "epoch": 0.0022481501061724016, + "grad_norm": 0.6189422607421875, + "learning_rate": 6.7401995222706194e-06, + "loss": 8.3388, + "step": 1600 + }, + { + "epoch": 0.002262201044335979, + "grad_norm": 0.6126115322113037, + "learning_rate": 6.782352114655051e-06, + "loss": 8.382, + "step": 1610 + }, + { + "epoch": 0.0022762519824995565, + "grad_norm": 0.6059759855270386, + "learning_rate": 6.824504707039483e-06, + "loss": 8.3826, + "step": 1620 + }, + { + "epoch": 0.002290302920663134, + "grad_norm": 0.6775907874107361, + "learning_rate": 6.866657299423914e-06, + "loss": 8.3403, + "step": 1630 + }, + { + "epoch": 0.0023043538588267114, + "grad_norm": 0.7798088788986206, + "learning_rate": 6.908809891808346e-06, + "loss": 8.3181, + "step": 1640 + }, + { + "epoch": 0.002318404796990289, + "grad_norm": 0.8462784886360168, + "learning_rate": 6.950962484192777e-06, + "loss": 8.3389, + "step": 1650 + }, + { + "epoch": 0.0023324557351538664, + "grad_norm": 0.5759812593460083, + "learning_rate": 6.993115076577209e-06, + "loss": 8.3245, + "step": 1660 + }, + { + "epoch": 0.0023465066733174443, + "grad_norm": 0.8392948508262634, + "learning_rate": 7.03526766896164e-06, + "loss": 8.3226, + "step": 1670 + }, + { + "epoch": 
0.0023605576114810217, + "grad_norm": 0.711455762386322, + "learning_rate": 7.077420261346072e-06, + "loss": 8.2694, + "step": 1680 + }, + { + "epoch": 0.002374608549644599, + "grad_norm": 0.6649680137634277, + "learning_rate": 7.119572853730503e-06, + "loss": 8.3165, + "step": 1690 + }, + { + "epoch": 0.0023886594878081767, + "grad_norm": 0.5890215635299683, + "learning_rate": 7.161725446114935e-06, + "loss": 8.3204, + "step": 1700 + }, + { + "epoch": 0.002402710425971754, + "grad_norm": 0.6956003904342651, + "learning_rate": 7.2038780384993665e-06, + "loss": 8.2913, + "step": 1710 + }, + { + "epoch": 0.0024167613641353316, + "grad_norm": 0.6579176187515259, + "learning_rate": 7.2460306308837985e-06, + "loss": 8.2964, + "step": 1720 + }, + { + "epoch": 0.002430812302298909, + "grad_norm": 0.9485933184623718, + "learning_rate": 7.2881832232682305e-06, + "loss": 8.304, + "step": 1730 + }, + { + "epoch": 0.0024448632404624865, + "grad_norm": 0.6445179581642151, + "learning_rate": 7.3303358156526625e-06, + "loss": 8.2677, + "step": 1740 + }, + { + "epoch": 0.002458914178626064, + "grad_norm": 0.6118063926696777, + "learning_rate": 7.372488408037094e-06, + "loss": 8.3056, + "step": 1750 + }, + { + "epoch": 0.0024729651167896415, + "grad_norm": 0.5871592164039612, + "learning_rate": 7.414641000421526e-06, + "loss": 8.2554, + "step": 1760 + }, + { + "epoch": 0.0024870160549532194, + "grad_norm": 0.6152786612510681, + "learning_rate": 7.456793592805957e-06, + "loss": 8.2661, + "step": 1770 + }, + { + "epoch": 0.002501066993116797, + "grad_norm": 0.6126775145530701, + "learning_rate": 7.498946185190389e-06, + "loss": 8.2189, + "step": 1780 + }, + { + "epoch": 0.0025151179312803743, + "grad_norm": 0.5303098559379578, + "learning_rate": 7.54109877757482e-06, + "loss": 8.2553, + "step": 1790 + }, + { + "epoch": 0.0025291688694439518, + "grad_norm": 0.668865978717804, + "learning_rate": 7.583251369959252e-06, + "loss": 8.225, + "step": 1800 + }, + { + "epoch": 
0.0025432198076075292, + "grad_norm": 0.6807597875595093, + "learning_rate": 7.625403962343683e-06, + "loss": 8.2616, + "step": 1810 + }, + { + "epoch": 0.0025572707457711067, + "grad_norm": 0.5899785757064819, + "learning_rate": 7.667556554728115e-06, + "loss": 8.2212, + "step": 1820 + }, + { + "epoch": 0.002571321683934684, + "grad_norm": 0.7344907522201538, + "learning_rate": 7.709709147112546e-06, + "loss": 8.2091, + "step": 1830 + }, + { + "epoch": 0.0025853726220982616, + "grad_norm": 0.6710782051086426, + "learning_rate": 7.751861739496978e-06, + "loss": 8.2347, + "step": 1840 + }, + { + "epoch": 0.002599423560261839, + "grad_norm": 0.5794618725776672, + "learning_rate": 7.79401433188141e-06, + "loss": 8.1866, + "step": 1850 + }, + { + "epoch": 0.0026134744984254166, + "grad_norm": 0.6005648970603943, + "learning_rate": 7.836166924265842e-06, + "loss": 8.2057, + "step": 1860 + }, + { + "epoch": 0.002627525436588994, + "grad_norm": 0.6351447701454163, + "learning_rate": 7.878319516650273e-06, + "loss": 8.2069, + "step": 1870 + }, + { + "epoch": 0.002641576374752572, + "grad_norm": 2.6223857402801514, + "learning_rate": 7.920472109034706e-06, + "loss": 8.2184, + "step": 1880 + }, + { + "epoch": 0.0026556273129161494, + "grad_norm": 0.6811882257461548, + "learning_rate": 7.962624701419137e-06, + "loss": 8.197, + "step": 1890 + }, + { + "epoch": 0.002669678251079727, + "grad_norm": 0.6392279863357544, + "learning_rate": 8.004777293803568e-06, + "loss": 8.212, + "step": 1900 + }, + { + "epoch": 0.0026837291892433043, + "grad_norm": 0.6159211993217468, + "learning_rate": 8.046929886187999e-06, + "loss": 8.2124, + "step": 1910 + }, + { + "epoch": 0.002697780127406882, + "grad_norm": 0.9476580619812012, + "learning_rate": 8.089082478572432e-06, + "loss": 8.1537, + "step": 1920 + }, + { + "epoch": 0.0027118310655704593, + "grad_norm": 0.645937442779541, + "learning_rate": 8.131235070956863e-06, + "loss": 8.1634, + "step": 1930 + }, + { + "epoch": 
0.0027258820037340367, + "grad_norm": 0.6065277457237244, + "learning_rate": 8.173387663341294e-06, + "loss": 8.1557, + "step": 1940 + }, + { + "epoch": 0.002739932941897614, + "grad_norm": 0.6352872848510742, + "learning_rate": 8.215540255725726e-06, + "loss": 8.1722, + "step": 1950 + }, + { + "epoch": 0.0027539838800611917, + "grad_norm": 0.5992732644081116, + "learning_rate": 8.257692848110158e-06, + "loss": 8.1705, + "step": 1960 + }, + { + "epoch": 0.002768034818224769, + "grad_norm": 0.8603838682174683, + "learning_rate": 8.29984544049459e-06, + "loss": 8.1483, + "step": 1970 + }, + { + "epoch": 0.002782085756388347, + "grad_norm": 0.7166236042976379, + "learning_rate": 8.341998032879022e-06, + "loss": 8.1939, + "step": 1980 + }, + { + "epoch": 0.0027961366945519245, + "grad_norm": 0.6886283755302429, + "learning_rate": 8.384150625263454e-06, + "loss": 8.1459, + "step": 1990 + }, + { + "epoch": 0.002810187632715502, + "grad_norm": 0.6554070115089417, + "learning_rate": 8.426303217647885e-06, + "loss": 8.1696, + "step": 2000 + }, + { + "epoch": 0.0028242385708790794, + "grad_norm": 0.6008890271186829, + "learning_rate": 8.468455810032316e-06, + "loss": 8.0969, + "step": 2010 + }, + { + "epoch": 0.002838289509042657, + "grad_norm": 0.5523585081100464, + "learning_rate": 8.510608402416749e-06, + "loss": 8.1298, + "step": 2020 + }, + { + "epoch": 0.0028523404472062344, + "grad_norm": 0.616106390953064, + "learning_rate": 8.55276099480118e-06, + "loss": 8.1633, + "step": 2030 + }, + { + "epoch": 0.002866391385369812, + "grad_norm": 0.7677391767501831, + "learning_rate": 8.594913587185611e-06, + "loss": 8.1168, + "step": 2040 + }, + { + "epoch": 0.0028804423235333893, + "grad_norm": 1.135625958442688, + "learning_rate": 8.637066179570042e-06, + "loss": 8.1114, + "step": 2050 + }, + { + "epoch": 0.0028944932616969668, + "grad_norm": 0.6018761396408081, + "learning_rate": 8.679218771954475e-06, + "loss": 8.125, + "step": 2060 + }, + { + "epoch": 
0.0029085441998605442, + "grad_norm": 0.6487704515457153, + "learning_rate": 8.721371364338906e-06, + "loss": 8.1119, + "step": 2070 + }, + { + "epoch": 0.002922595138024122, + "grad_norm": 0.6532260775566101, + "learning_rate": 8.763523956723337e-06, + "loss": 8.0926, + "step": 2080 + }, + { + "epoch": 0.0029366460761876996, + "grad_norm": 0.7966541051864624, + "learning_rate": 8.805676549107769e-06, + "loss": 8.1293, + "step": 2090 + }, + { + "epoch": 0.002950697014351277, + "grad_norm": 0.5987772345542908, + "learning_rate": 8.847829141492201e-06, + "loss": 8.0792, + "step": 2100 + }, + { + "epoch": 0.0029647479525148545, + "grad_norm": 0.5964518189430237, + "learning_rate": 8.889981733876633e-06, + "loss": 8.1097, + "step": 2110 + }, + { + "epoch": 0.002978798890678432, + "grad_norm": 0.6664417386054993, + "learning_rate": 8.932134326261064e-06, + "loss": 8.0914, + "step": 2120 + }, + { + "epoch": 0.0029928498288420095, + "grad_norm": 0.7204576730728149, + "learning_rate": 8.974286918645495e-06, + "loss": 8.0853, + "step": 2130 + }, + { + "epoch": 0.003006900767005587, + "grad_norm": 0.698284924030304, + "learning_rate": 9.016439511029928e-06, + "loss": 8.1351, + "step": 2140 + }, + { + "epoch": 0.0030209517051691644, + "grad_norm": 0.6838796138763428, + "learning_rate": 9.058592103414359e-06, + "loss": 8.0672, + "step": 2150 + }, + { + "epoch": 0.003035002643332742, + "grad_norm": 0.6164630651473999, + "learning_rate": 9.10074469579879e-06, + "loss": 8.1009, + "step": 2160 + }, + { + "epoch": 0.0030490535814963193, + "grad_norm": 0.7379197478294373, + "learning_rate": 9.142897288183221e-06, + "loss": 8.05, + "step": 2170 + }, + { + "epoch": 0.0030631045196598972, + "grad_norm": 0.6812073588371277, + "learning_rate": 9.185049880567654e-06, + "loss": 8.0716, + "step": 2180 + }, + { + "epoch": 0.0030771554578234747, + "grad_norm": 0.6673164963722229, + "learning_rate": 9.227202472952085e-06, + "loss": 8.1383, + "step": 2190 + }, + { + "epoch": 
0.003091206395987052, + "grad_norm": 0.7186004519462585, + "learning_rate": 9.269355065336517e-06, + "loss": 8.0792, + "step": 2200 + }, + { + "epoch": 0.0031052573341506296, + "grad_norm": 0.9671910405158997, + "learning_rate": 9.311507657720948e-06, + "loss": 8.0954, + "step": 2210 + }, + { + "epoch": 0.003119308272314207, + "grad_norm": 0.6637594103813171, + "learning_rate": 9.35366025010538e-06, + "loss": 8.0612, + "step": 2220 + }, + { + "epoch": 0.0031333592104777846, + "grad_norm": 0.6269683837890625, + "learning_rate": 9.395812842489812e-06, + "loss": 8.0422, + "step": 2230 + }, + { + "epoch": 0.003147410148641362, + "grad_norm": 0.6617890000343323, + "learning_rate": 9.437965434874245e-06, + "loss": 8.0337, + "step": 2240 + }, + { + "epoch": 0.0031614610868049395, + "grad_norm": 0.6194980144500732, + "learning_rate": 9.480118027258676e-06, + "loss": 8.099, + "step": 2250 + }, + { + "epoch": 0.003175512024968517, + "grad_norm": 0.6944716572761536, + "learning_rate": 9.522270619643107e-06, + "loss": 8.0398, + "step": 2260 + }, + { + "epoch": 0.0031895629631320944, + "grad_norm": 0.6326583623886108, + "learning_rate": 9.564423212027538e-06, + "loss": 8.0368, + "step": 2270 + }, + { + "epoch": 0.0032036139012956723, + "grad_norm": 0.583436906337738, + "learning_rate": 9.606575804411971e-06, + "loss": 8.0594, + "step": 2280 + }, + { + "epoch": 0.00321766483945925, + "grad_norm": 0.6804009079933167, + "learning_rate": 9.648728396796402e-06, + "loss": 8.0141, + "step": 2290 + }, + { + "epoch": 0.0032317157776228273, + "grad_norm": 0.6234678030014038, + "learning_rate": 9.690880989180833e-06, + "loss": 7.9864, + "step": 2300 + }, + { + "epoch": 0.0032457667157864047, + "grad_norm": 0.6890107989311218, + "learning_rate": 9.733033581565264e-06, + "loss": 8.0474, + "step": 2310 + }, + { + "epoch": 0.003259817653949982, + "grad_norm": 0.7134827375411987, + "learning_rate": 9.775186173949697e-06, + "loss": 8.0306, + "step": 2320 + }, + { + "epoch": 
0.0032738685921135597, + "grad_norm": 0.6207192540168762, + "learning_rate": 9.817338766334128e-06, + "loss": 7.9592, + "step": 2330 + }, + { + "epoch": 0.003287919530277137, + "grad_norm": 1.1250572204589844, + "learning_rate": 9.85949135871856e-06, + "loss": 8.0243, + "step": 2340 + }, + { + "epoch": 0.0033019704684407146, + "grad_norm": 0.7023635506629944, + "learning_rate": 9.901643951102993e-06, + "loss": 8.017, + "step": 2350 + }, + { + "epoch": 0.003316021406604292, + "grad_norm": 0.6336594820022583, + "learning_rate": 9.943796543487424e-06, + "loss": 8.0084, + "step": 2360 + }, + { + "epoch": 0.0033300723447678695, + "grad_norm": 0.7274377942085266, + "learning_rate": 9.985949135871857e-06, + "loss": 8.0152, + "step": 2370 + }, + { + "epoch": 0.0033441232829314474, + "grad_norm": 0.8217412829399109, + "learning_rate": 1.0028101728256286e-05, + "loss": 7.9932, + "step": 2380 + }, + { + "epoch": 0.003358174221095025, + "grad_norm": 0.7048346400260925, + "learning_rate": 1.0070254320640719e-05, + "loss": 7.9944, + "step": 2390 + }, + { + "epoch": 0.0033722251592586024, + "grad_norm": 0.6946651935577393, + "learning_rate": 1.011240691302515e-05, + "loss": 7.9447, + "step": 2400 + }, + { + "epoch": 0.00338627609742218, + "grad_norm": 0.7006087899208069, + "learning_rate": 1.0154559505409583e-05, + "loss": 8.0093, + "step": 2410 + }, + { + "epoch": 0.0034003270355857573, + "grad_norm": 0.8661535382270813, + "learning_rate": 1.0196712097794012e-05, + "loss": 8.0699, + "step": 2420 + }, + { + "epoch": 0.0034143779737493348, + "grad_norm": 0.8086657524108887, + "learning_rate": 1.0238864690178445e-05, + "loss": 7.9983, + "step": 2430 + }, + { + "epoch": 0.0034284289119129122, + "grad_norm": 0.6144962906837463, + "learning_rate": 1.0281017282562876e-05, + "loss": 8.0364, + "step": 2440 + }, + { + "epoch": 0.0034424798500764897, + "grad_norm": 0.7119608521461487, + "learning_rate": 1.032316987494731e-05, + "loss": 7.9961, + "step": 2450 + }, + { + "epoch": 
0.003456530788240067, + "grad_norm": 0.7629860043525696, + "learning_rate": 1.0365322467331739e-05, + "loss": 7.9789, + "step": 2460 + }, + { + "epoch": 0.0034705817264036446, + "grad_norm": 0.6489969491958618, + "learning_rate": 1.0407475059716172e-05, + "loss": 7.9433, + "step": 2470 + }, + { + "epoch": 0.0034846326645672225, + "grad_norm": 0.6559635400772095, + "learning_rate": 1.0449627652100603e-05, + "loss": 7.9848, + "step": 2480 + }, + { + "epoch": 0.0034986836027308, + "grad_norm": 0.6898662447929382, + "learning_rate": 1.0491780244485036e-05, + "loss": 7.9388, + "step": 2490 + }, + { + "epoch": 0.0035127345408943775, + "grad_norm": 0.7968945503234863, + "learning_rate": 1.0533932836869465e-05, + "loss": 7.987, + "step": 2500 + }, + { + "epoch": 0.003526785479057955, + "grad_norm": 0.8573049306869507, + "learning_rate": 1.0576085429253898e-05, + "loss": 7.937, + "step": 2510 + }, + { + "epoch": 0.0035408364172215324, + "grad_norm": 0.7501293420791626, + "learning_rate": 1.0618238021638329e-05, + "loss": 7.9119, + "step": 2520 + }, + { + "epoch": 0.00355488735538511, + "grad_norm": 0.6564654111862183, + "learning_rate": 1.0660390614022762e-05, + "loss": 7.9062, + "step": 2530 + }, + { + "epoch": 0.0035689382935486873, + "grad_norm": 0.7637408971786499, + "learning_rate": 1.0702543206407191e-05, + "loss": 7.9529, + "step": 2540 + }, + { + "epoch": 0.003582989231712265, + "grad_norm": 0.6853562593460083, + "learning_rate": 1.0744695798791624e-05, + "loss": 7.9617, + "step": 2550 + }, + { + "epoch": 0.0035970401698758423, + "grad_norm": 0.9160966873168945, + "learning_rate": 1.0786848391176055e-05, + "loss": 7.95, + "step": 2560 + }, + { + "epoch": 0.0036110911080394197, + "grad_norm": 0.7625723481178284, + "learning_rate": 1.0829000983560488e-05, + "loss": 7.9144, + "step": 2570 + }, + { + "epoch": 0.0036251420462029976, + "grad_norm": 0.6952919363975525, + "learning_rate": 1.0871153575944918e-05, + "loss": 7.9539, + "step": 2580 + }, + { + "epoch": 
0.003639192984366575, + "grad_norm": 0.8298730254173279, + "learning_rate": 1.091330616832935e-05, + "loss": 7.9589, + "step": 2590 + }, + { + "epoch": 0.0036532439225301526, + "grad_norm": 0.8768827319145203, + "learning_rate": 1.0955458760713784e-05, + "loss": 7.9266, + "step": 2600 + }, + { + "epoch": 0.00366729486069373, + "grad_norm": 0.6922865509986877, + "learning_rate": 1.0997611353098215e-05, + "loss": 7.9015, + "step": 2610 + }, + { + "epoch": 0.0036813457988573075, + "grad_norm": 0.7246437668800354, + "learning_rate": 1.1039763945482648e-05, + "loss": 7.9234, + "step": 2620 + }, + { + "epoch": 0.003695396737020885, + "grad_norm": 0.8041806817054749, + "learning_rate": 1.1081916537867077e-05, + "loss": 7.9054, + "step": 2630 + }, + { + "epoch": 0.0037094476751844624, + "grad_norm": 0.7150896191596985, + "learning_rate": 1.112406913025151e-05, + "loss": 7.9446, + "step": 2640 + }, + { + "epoch": 0.00372349861334804, + "grad_norm": 0.8792434930801392, + "learning_rate": 1.1166221722635941e-05, + "loss": 7.9111, + "step": 2650 + }, + { + "epoch": 0.0037375495515116174, + "grad_norm": 0.7889289259910583, + "learning_rate": 1.1208374315020374e-05, + "loss": 7.9393, + "step": 2660 + }, + { + "epoch": 0.003751600489675195, + "grad_norm": 0.8603889346122742, + "learning_rate": 1.1250526907404803e-05, + "loss": 7.8837, + "step": 2670 + }, + { + "epoch": 0.0037656514278387727, + "grad_norm": 0.8094549179077148, + "learning_rate": 1.1292679499789236e-05, + "loss": 7.9193, + "step": 2680 + }, + { + "epoch": 0.00377970236600235, + "grad_norm": 0.5979373455047607, + "learning_rate": 1.1334832092173667e-05, + "loss": 7.8922, + "step": 2690 + }, + { + "epoch": 0.0037937533041659277, + "grad_norm": 0.8888489007949829, + "learning_rate": 1.13769846845581e-05, + "loss": 7.8884, + "step": 2700 + }, + { + "epoch": 0.003807804242329505, + "grad_norm": 0.7355663180351257, + "learning_rate": 1.141913727694253e-05, + "loss": 7.9256, + "step": 2710 + }, + { + "epoch": 
0.0038218551804930826, + "grad_norm": 0.7639088034629822, + "learning_rate": 1.1461289869326963e-05, + "loss": 7.8711, + "step": 2720 + }, + { + "epoch": 0.00383590611865666, + "grad_norm": 1.0683914422988892, + "learning_rate": 1.1503442461711394e-05, + "loss": 7.8724, + "step": 2730 + }, + { + "epoch": 0.0038499570568202375, + "grad_norm": 0.99086594581604, + "learning_rate": 1.1545595054095827e-05, + "loss": 7.8734, + "step": 2740 + }, + { + "epoch": 0.003864007994983815, + "grad_norm": 0.7015689015388489, + "learning_rate": 1.1587747646480256e-05, + "loss": 7.893, + "step": 2750 + }, + { + "epoch": 0.0038780589331473925, + "grad_norm": 0.7968684434890747, + "learning_rate": 1.1629900238864689e-05, + "loss": 7.8584, + "step": 2760 + }, + { + "epoch": 0.00389210987131097, + "grad_norm": 0.7174790501594543, + "learning_rate": 1.167205283124912e-05, + "loss": 7.8578, + "step": 2770 + }, + { + "epoch": 0.0039061608094745474, + "grad_norm": 0.818223237991333, + "learning_rate": 1.1714205423633553e-05, + "loss": 7.8421, + "step": 2780 + }, + { + "epoch": 0.003920211747638125, + "grad_norm": 0.7619180083274841, + "learning_rate": 1.1756358016017982e-05, + "loss": 7.8614, + "step": 2790 + }, + { + "epoch": 0.003934262685801703, + "grad_norm": 1.0655688047409058, + "learning_rate": 1.1798510608402415e-05, + "loss": 7.8461, + "step": 2800 + }, + { + "epoch": 0.00394831362396528, + "grad_norm": 0.7509090304374695, + "learning_rate": 1.1840663200786846e-05, + "loss": 7.857, + "step": 2810 + }, + { + "epoch": 0.003962364562128858, + "grad_norm": 0.7447671890258789, + "learning_rate": 1.188281579317128e-05, + "loss": 7.8801, + "step": 2820 + }, + { + "epoch": 0.003976415500292435, + "grad_norm": 0.9486709237098694, + "learning_rate": 1.1924968385555709e-05, + "loss": 7.7945, + "step": 2830 + }, + { + "epoch": 0.003990466438456013, + "grad_norm": 0.6755268573760986, + "learning_rate": 1.1967120977940142e-05, + "loss": 7.8919, + "step": 2840 + }, + { + "epoch": 
0.0040045173766195905, + "grad_norm": 0.8210179805755615, + "learning_rate": 1.2009273570324575e-05, + "loss": 7.8437, + "step": 2850 + }, + { + "epoch": 0.0040185683147831676, + "grad_norm": 0.7738075256347656, + "learning_rate": 1.2051426162709006e-05, + "loss": 7.7473, + "step": 2860 + }, + { + "epoch": 0.0040326192529467455, + "grad_norm": 1.1072646379470825, + "learning_rate": 1.2093578755093439e-05, + "loss": 7.8089, + "step": 2870 + }, + { + "epoch": 0.0040466701911103225, + "grad_norm": 0.7715216279029846, + "learning_rate": 1.2135731347477868e-05, + "loss": 7.8353, + "step": 2880 + }, + { + "epoch": 0.0040607211292739, + "grad_norm": 0.8774746656417847, + "learning_rate": 1.2177883939862301e-05, + "loss": 7.7942, + "step": 2890 + }, + { + "epoch": 0.004074772067437477, + "grad_norm": 0.8812255263328552, + "learning_rate": 1.2220036532246732e-05, + "loss": 7.8712, + "step": 2900 + }, + { + "epoch": 0.004088823005601055, + "grad_norm": 0.8199585676193237, + "learning_rate": 1.2262189124631165e-05, + "loss": 7.8129, + "step": 2910 + }, + { + "epoch": 0.004102873943764632, + "grad_norm": 0.6967678666114807, + "learning_rate": 1.2304341717015594e-05, + "loss": 7.7619, + "step": 2920 + }, + { + "epoch": 0.00411692488192821, + "grad_norm": 0.744611918926239, + "learning_rate": 1.2346494309400027e-05, + "loss": 7.8447, + "step": 2930 + }, + { + "epoch": 0.004130975820091788, + "grad_norm": 0.8083895444869995, + "learning_rate": 1.2388646901784458e-05, + "loss": 7.8054, + "step": 2940 + }, + { + "epoch": 0.004145026758255365, + "grad_norm": 0.8558597564697266, + "learning_rate": 1.2430799494168891e-05, + "loss": 7.756, + "step": 2950 + }, + { + "epoch": 0.004159077696418943, + "grad_norm": 0.9740859866142273, + "learning_rate": 1.247295208655332e-05, + "loss": 7.7429, + "step": 2960 + }, + { + "epoch": 0.00417312863458252, + "grad_norm": 0.7745287418365479, + "learning_rate": 1.2515104678937754e-05, + "loss": 7.7885, + "step": 2970 + }, + { + "epoch": 
0.004187179572746098, + "grad_norm": 0.8137255907058716, + "learning_rate": 1.2557257271322185e-05, + "loss": 7.7817, + "step": 2980 + }, + { + "epoch": 0.004201230510909675, + "grad_norm": 0.8644803166389465, + "learning_rate": 1.2599409863706618e-05, + "loss": 7.8588, + "step": 2990 + }, + { + "epoch": 0.004215281449073253, + "grad_norm": 0.8625914454460144, + "learning_rate": 1.2641562456091047e-05, + "loss": 7.728, + "step": 3000 + }, + { + "epoch": 0.00422933238723683, + "grad_norm": 1.1142210960388184, + "learning_rate": 1.268371504847548e-05, + "loss": 7.7437, + "step": 3010 + }, + { + "epoch": 0.004243383325400408, + "grad_norm": 0.852684736251831, + "learning_rate": 1.2725867640859911e-05, + "loss": 7.7905, + "step": 3020 + }, + { + "epoch": 0.004257434263563985, + "grad_norm": 0.8908416628837585, + "learning_rate": 1.2768020233244344e-05, + "loss": 7.6894, + "step": 3030 + }, + { + "epoch": 0.004271485201727563, + "grad_norm": 0.8012714982032776, + "learning_rate": 1.2810172825628773e-05, + "loss": 7.8086, + "step": 3040 + }, + { + "epoch": 0.004285536139891141, + "grad_norm": 0.6924638748168945, + "learning_rate": 1.2852325418013206e-05, + "loss": 7.8487, + "step": 3050 + }, + { + "epoch": 0.004299587078054718, + "grad_norm": 0.8051576614379883, + "learning_rate": 1.2894478010397638e-05, + "loss": 7.7666, + "step": 3060 + }, + { + "epoch": 0.004313638016218296, + "grad_norm": 1.3711603879928589, + "learning_rate": 1.293663060278207e-05, + "loss": 7.7913, + "step": 3070 + }, + { + "epoch": 0.004327688954381873, + "grad_norm": 0.8338873982429504, + "learning_rate": 1.2978783195166503e-05, + "loss": 7.7112, + "step": 3080 + }, + { + "epoch": 0.004341739892545451, + "grad_norm": 0.7337360978126526, + "learning_rate": 1.3020935787550933e-05, + "loss": 7.753, + "step": 3090 + }, + { + "epoch": 0.004355790830709028, + "grad_norm": 0.888911247253418, + "learning_rate": 1.3063088379935366e-05, + "loss": 7.6947, + "step": 3100 + }, + { + "epoch": 
0.0043698417688726055, + "grad_norm": 0.852385401725769, + "learning_rate": 1.3105240972319797e-05, + "loss": 7.7105, + "step": 3110 + }, + { + "epoch": 0.0043838927070361825, + "grad_norm": 0.840452253818512, + "learning_rate": 1.314739356470423e-05, + "loss": 7.8018, + "step": 3120 + }, + { + "epoch": 0.0043979436451997604, + "grad_norm": 0.8020384311676025, + "learning_rate": 1.3189546157088659e-05, + "loss": 7.7558, + "step": 3130 + }, + { + "epoch": 0.004411994583363338, + "grad_norm": 0.8536344170570374, + "learning_rate": 1.3231698749473092e-05, + "loss": 7.6887, + "step": 3140 + }, + { + "epoch": 0.004426045521526915, + "grad_norm": 0.8429573178291321, + "learning_rate": 1.3273851341857523e-05, + "loss": 7.7193, + "step": 3150 + }, + { + "epoch": 0.004440096459690493, + "grad_norm": 0.9258587956428528, + "learning_rate": 1.3316003934241956e-05, + "loss": 7.7103, + "step": 3160 + }, + { + "epoch": 0.00445414739785407, + "grad_norm": 0.7705568671226501, + "learning_rate": 1.3358156526626385e-05, + "loss": 7.707, + "step": 3170 + }, + { + "epoch": 0.004468198336017648, + "grad_norm": 1.0604978799819946, + "learning_rate": 1.3400309119010818e-05, + "loss": 7.7287, + "step": 3180 + }, + { + "epoch": 0.004482249274181225, + "grad_norm": 0.8479204177856445, + "learning_rate": 1.344246171139525e-05, + "loss": 7.6984, + "step": 3190 + }, + { + "epoch": 0.004496300212344803, + "grad_norm": 0.8553823232650757, + "learning_rate": 1.3484614303779682e-05, + "loss": 7.6764, + "step": 3200 + }, + { + "epoch": 0.00451035115050838, + "grad_norm": 0.9012032151222229, + "learning_rate": 1.3526766896164112e-05, + "loss": 7.7013, + "step": 3210 + }, + { + "epoch": 0.004524402088671958, + "grad_norm": 0.8775386810302734, + "learning_rate": 1.3568919488548545e-05, + "loss": 7.7516, + "step": 3220 + }, + { + "epoch": 0.004538453026835535, + "grad_norm": 1.1585534811019897, + "learning_rate": 1.3611072080932976e-05, + "loss": 7.6667, + "step": 3230 + }, + { + "epoch": 
0.004552503964999113, + "grad_norm": 0.9868244528770447, + "learning_rate": 1.3653224673317409e-05, + "loss": 7.6702, + "step": 3240 + }, + { + "epoch": 0.004566554903162691, + "grad_norm": 0.9672613143920898, + "learning_rate": 1.3695377265701838e-05, + "loss": 7.6212, + "step": 3250 + }, + { + "epoch": 0.004580605841326268, + "grad_norm": 1.0549743175506592, + "learning_rate": 1.3737529858086271e-05, + "loss": 7.6721, + "step": 3260 + }, + { + "epoch": 0.004594656779489846, + "grad_norm": 0.8549468517303467, + "learning_rate": 1.3779682450470702e-05, + "loss": 7.809, + "step": 3270 + }, + { + "epoch": 0.004608707717653423, + "grad_norm": 1.5186914205551147, + "learning_rate": 1.3821835042855135e-05, + "loss": 7.6878, + "step": 3280 + }, + { + "epoch": 0.004622758655817001, + "grad_norm": 1.0272847414016724, + "learning_rate": 1.3863987635239565e-05, + "loss": 7.6362, + "step": 3290 + }, + { + "epoch": 0.004636809593980578, + "grad_norm": 0.8402895927429199, + "learning_rate": 1.3906140227623997e-05, + "loss": 7.6875, + "step": 3300 + }, + { + "epoch": 0.004650860532144156, + "grad_norm": 0.8236798644065857, + "learning_rate": 1.3948292820008429e-05, + "loss": 7.6046, + "step": 3310 + }, + { + "epoch": 0.004664911470307733, + "grad_norm": 1.09159517288208, + "learning_rate": 1.3990445412392861e-05, + "loss": 7.7144, + "step": 3320 + }, + { + "epoch": 0.004678962408471311, + "grad_norm": 0.9624311327934265, + "learning_rate": 1.4032598004777294e-05, + "loss": 7.6603, + "step": 3330 + }, + { + "epoch": 0.0046930133466348885, + "grad_norm": 0.9037488698959351, + "learning_rate": 1.4074750597161724e-05, + "loss": 7.6887, + "step": 3340 + }, + { + "epoch": 0.004707064284798466, + "grad_norm": 0.8625498414039612, + "learning_rate": 1.4116903189546157e-05, + "loss": 7.6297, + "step": 3350 + }, + { + "epoch": 0.0047211152229620435, + "grad_norm": 0.9417233467102051, + "learning_rate": 1.4159055781930588e-05, + "loss": 7.6359, + "step": 3360 + }, + { + "epoch": 
0.0047351661611256205, + "grad_norm": 0.9447644948959351, + "learning_rate": 1.420120837431502e-05, + "loss": 7.6173, + "step": 3370 + }, + { + "epoch": 0.004749217099289198, + "grad_norm": 0.9068923592567444, + "learning_rate": 1.424336096669945e-05, + "loss": 7.6694, + "step": 3380 + }, + { + "epoch": 0.0047632680374527754, + "grad_norm": 0.9849382042884827, + "learning_rate": 1.4285513559083883e-05, + "loss": 7.6411, + "step": 3390 + }, + { + "epoch": 0.004777318975616353, + "grad_norm": 0.8907585740089417, + "learning_rate": 1.4327666151468314e-05, + "loss": 7.6669, + "step": 3400 + }, + { + "epoch": 0.00479136991377993, + "grad_norm": 1.1062923669815063, + "learning_rate": 1.4369818743852747e-05, + "loss": 7.6615, + "step": 3410 + }, + { + "epoch": 0.004805420851943508, + "grad_norm": 0.9381359219551086, + "learning_rate": 1.4411971336237176e-05, + "loss": 7.6943, + "step": 3420 + }, + { + "epoch": 0.004819471790107085, + "grad_norm": 1.3278305530548096, + "learning_rate": 1.445412392862161e-05, + "loss": 7.6195, + "step": 3430 + }, + { + "epoch": 0.004833522728270663, + "grad_norm": 0.8325286507606506, + "learning_rate": 1.449627652100604e-05, + "loss": 7.6601, + "step": 3440 + }, + { + "epoch": 0.004847573666434241, + "grad_norm": 0.9755851030349731, + "learning_rate": 1.4538429113390473e-05, + "loss": 7.6727, + "step": 3450 + }, + { + "epoch": 0.004861624604597818, + "grad_norm": 0.8660493493080139, + "learning_rate": 1.4580581705774903e-05, + "loss": 7.5598, + "step": 3460 + }, + { + "epoch": 0.004875675542761396, + "grad_norm": 0.8259555101394653, + "learning_rate": 1.4622734298159336e-05, + "loss": 7.5946, + "step": 3470 + }, + { + "epoch": 0.004889726480924973, + "grad_norm": 0.9008260369300842, + "learning_rate": 1.4664886890543767e-05, + "loss": 7.5731, + "step": 3480 + }, + { + "epoch": 0.004903777419088551, + "grad_norm": 0.8646583557128906, + "learning_rate": 1.47070394829282e-05, + "loss": 7.602, + "step": 3490 + }, + { + "epoch": 
0.004917828357252128, + "grad_norm": 0.8853713870048523, + "learning_rate": 1.474919207531263e-05, + "loss": 7.5559, + "step": 3500 + }, + { + "epoch": 0.004931879295415706, + "grad_norm": 0.8749480247497559, + "learning_rate": 1.4791344667697062e-05, + "loss": 7.5826, + "step": 3510 + }, + { + "epoch": 0.004945930233579283, + "grad_norm": 0.8559566736221313, + "learning_rate": 1.4833497260081493e-05, + "loss": 7.6301, + "step": 3520 + }, + { + "epoch": 0.004959981171742861, + "grad_norm": 0.8258165121078491, + "learning_rate": 1.4875649852465926e-05, + "loss": 7.5909, + "step": 3530 + }, + { + "epoch": 0.004974032109906439, + "grad_norm": 0.9734731912612915, + "learning_rate": 1.4917802444850356e-05, + "loss": 7.5878, + "step": 3540 + }, + { + "epoch": 0.004988083048070016, + "grad_norm": 0.9722371697425842, + "learning_rate": 1.4959955037234788e-05, + "loss": 7.5181, + "step": 3550 + }, + { + "epoch": 0.005002133986233594, + "grad_norm": 1.2670505046844482, + "learning_rate": 1.500210762961922e-05, + "loss": 7.6094, + "step": 3560 + }, + { + "epoch": 0.005016184924397171, + "grad_norm": 0.8584160804748535, + "learning_rate": 1.5044260222003652e-05, + "loss": 7.5955, + "step": 3570 + }, + { + "epoch": 0.005030235862560749, + "grad_norm": 1.0112240314483643, + "learning_rate": 1.5086412814388085e-05, + "loss": 7.6168, + "step": 3580 + }, + { + "epoch": 0.005044286800724326, + "grad_norm": 4.073171138763428, + "learning_rate": 1.5128565406772515e-05, + "loss": 7.5669, + "step": 3590 + }, + { + "epoch": 0.0050583377388879035, + "grad_norm": 0.9480034112930298, + "learning_rate": 1.5170717999156948e-05, + "loss": 7.505, + "step": 3600 + }, + { + "epoch": 0.005072388677051481, + "grad_norm": 0.95200115442276, + "learning_rate": 1.5212870591541379e-05, + "loss": 7.6224, + "step": 3610 + }, + { + "epoch": 0.0050864396152150585, + "grad_norm": 1.117475986480713, + "learning_rate": 1.5255023183925812e-05, + "loss": 7.5023, + "step": 3620 + }, + { + "epoch": 
0.0051004905533786355, + "grad_norm": 0.9365076422691345, + "learning_rate": 1.529717577631024e-05, + "loss": 7.5295, + "step": 3630 + }, + { + "epoch": 0.005114541491542213, + "grad_norm": 0.9868768453598022, + "learning_rate": 1.5339328368694676e-05, + "loss": 7.5347, + "step": 3640 + }, + { + "epoch": 0.005128592429705791, + "grad_norm": 0.860265851020813, + "learning_rate": 1.5381480961079103e-05, + "loss": 7.5199, + "step": 3650 + }, + { + "epoch": 0.005142643367869368, + "grad_norm": 0.9999997615814209, + "learning_rate": 1.5423633553463538e-05, + "loss": 7.5793, + "step": 3660 + }, + { + "epoch": 0.005156694306032946, + "grad_norm": 0.8633819222450256, + "learning_rate": 1.546578614584797e-05, + "loss": 7.6475, + "step": 3670 + }, + { + "epoch": 0.005170745244196523, + "grad_norm": 0.902273416519165, + "learning_rate": 1.55079387382324e-05, + "loss": 7.5576, + "step": 3680 + }, + { + "epoch": 0.005184796182360101, + "grad_norm": 1.04349946975708, + "learning_rate": 1.555009133061683e-05, + "loss": 7.5067, + "step": 3690 + }, + { + "epoch": 0.005198847120523678, + "grad_norm": 0.8825836181640625, + "learning_rate": 1.5592243923001263e-05, + "loss": 7.5545, + "step": 3700 + }, + { + "epoch": 0.005212898058687256, + "grad_norm": 0.853202760219574, + "learning_rate": 1.5634396515385694e-05, + "loss": 7.615, + "step": 3710 + }, + { + "epoch": 0.005226948996850833, + "grad_norm": 1.0415570735931396, + "learning_rate": 1.567654910777013e-05, + "loss": 7.5865, + "step": 3720 + }, + { + "epoch": 0.005240999935014411, + "grad_norm": 0.943265974521637, + "learning_rate": 1.5718701700154556e-05, + "loss": 7.5048, + "step": 3730 + }, + { + "epoch": 0.005255050873177988, + "grad_norm": 1.0362399816513062, + "learning_rate": 1.576085429253899e-05, + "loss": 7.5796, + "step": 3740 + }, + { + "epoch": 0.005269101811341566, + "grad_norm": 1.0052247047424316, + "learning_rate": 1.5803006884923422e-05, + "loss": 7.4803, + "step": 3750 + }, + { + "epoch": 0.005283152749505144, + 
"grad_norm": 1.0194339752197266, + "learning_rate": 1.5845159477307853e-05, + "loss": 7.5259, + "step": 3760 + }, + { + "epoch": 0.005297203687668721, + "grad_norm": 1.3323463201522827, + "learning_rate": 1.5887312069692284e-05, + "loss": 7.5403, + "step": 3770 + }, + { + "epoch": 0.005311254625832299, + "grad_norm": 0.9728202223777771, + "learning_rate": 1.5929464662076715e-05, + "loss": 7.526, + "step": 3780 + }, + { + "epoch": 0.005325305563995876, + "grad_norm": 0.979168176651001, + "learning_rate": 1.5971617254461147e-05, + "loss": 7.5024, + "step": 3790 + }, + { + "epoch": 0.005339356502159454, + "grad_norm": 1.0642094612121582, + "learning_rate": 1.601376984684558e-05, + "loss": 7.548, + "step": 3800 + }, + { + "epoch": 0.005353407440323031, + "grad_norm": 0.8981004953384399, + "learning_rate": 1.605592243923001e-05, + "loss": 7.5229, + "step": 3810 + }, + { + "epoch": 0.005367458378486609, + "grad_norm": 0.9074918031692505, + "learning_rate": 1.6098075031614443e-05, + "loss": 7.5607, + "step": 3820 + }, + { + "epoch": 0.005381509316650186, + "grad_norm": 0.9015301465988159, + "learning_rate": 1.6140227623998875e-05, + "loss": 7.5874, + "step": 3830 + }, + { + "epoch": 0.005395560254813764, + "grad_norm": 0.8846139907836914, + "learning_rate": 1.6182380216383306e-05, + "loss": 7.4998, + "step": 3840 + }, + { + "epoch": 0.0054096111929773415, + "grad_norm": 1.1710816621780396, + "learning_rate": 1.622453280876774e-05, + "loss": 7.4512, + "step": 3850 + }, + { + "epoch": 0.0054236621311409185, + "grad_norm": 1.1861531734466553, + "learning_rate": 1.6266685401152168e-05, + "loss": 7.4586, + "step": 3860 + }, + { + "epoch": 0.0054377130693044964, + "grad_norm": 1.260762095451355, + "learning_rate": 1.6308837993536603e-05, + "loss": 7.4841, + "step": 3870 + }, + { + "epoch": 0.0054517640074680735, + "grad_norm": 0.8817400932312012, + "learning_rate": 1.6350990585921034e-05, + "loss": 7.4975, + "step": 3880 + }, + { + "epoch": 0.005465814945631651, + "grad_norm": 
1.0645872354507446, + "learning_rate": 1.6393143178305465e-05, + "loss": 7.5029, + "step": 3890 + }, + { + "epoch": 0.005479865883795228, + "grad_norm": 0.9742002487182617, + "learning_rate": 1.6435295770689896e-05, + "loss": 7.482, + "step": 3900 + }, + { + "epoch": 0.005493916821958806, + "grad_norm": 0.9298391342163086, + "learning_rate": 1.6477448363074327e-05, + "loss": 7.5051, + "step": 3910 + }, + { + "epoch": 0.005507967760122383, + "grad_norm": 0.9993727207183838, + "learning_rate": 1.651960095545876e-05, + "loss": 7.4972, + "step": 3920 + }, + { + "epoch": 0.005522018698285961, + "grad_norm": 1.3055437803268433, + "learning_rate": 1.6561753547843193e-05, + "loss": 7.4705, + "step": 3930 + }, + { + "epoch": 0.005536069636449538, + "grad_norm": 1.0684734582901, + "learning_rate": 1.660390614022762e-05, + "loss": 7.45, + "step": 3940 + }, + { + "epoch": 0.005550120574613116, + "grad_norm": 1.1004630327224731, + "learning_rate": 1.6646058732612055e-05, + "loss": 7.4706, + "step": 3950 + }, + { + "epoch": 0.005564171512776694, + "grad_norm": 1.3142606019973755, + "learning_rate": 1.6688211324996487e-05, + "loss": 7.4427, + "step": 3960 + }, + { + "epoch": 0.005578222450940271, + "grad_norm": 0.9538849592208862, + "learning_rate": 1.6730363917380918e-05, + "loss": 7.4318, + "step": 3970 + }, + { + "epoch": 0.005592273389103849, + "grad_norm": 1.0903221368789673, + "learning_rate": 1.677251650976535e-05, + "loss": 7.5066, + "step": 3980 + }, + { + "epoch": 0.005606324327267426, + "grad_norm": 1.7417656183242798, + "learning_rate": 1.681466910214978e-05, + "loss": 7.5136, + "step": 3990 + }, + { + "epoch": 0.005620375265431004, + "grad_norm": 1.4083300828933716, + "learning_rate": 1.685682169453421e-05, + "loss": 7.5295, + "step": 4000 + }, + { + "epoch": 0.005634426203594581, + "grad_norm": 0.9326221942901611, + "learning_rate": 1.6898974286918646e-05, + "loss": 7.4544, + "step": 4010 + }, + { + "epoch": 0.005648477141758159, + "grad_norm": 0.936113715171814, + 
"learning_rate": 1.6941126879303074e-05, + "loss": 7.4514, + "step": 4020 + }, + { + "epoch": 0.005662528079921736, + "grad_norm": 0.8883742690086365, + "learning_rate": 1.6983279471687508e-05, + "loss": 7.4411, + "step": 4030 + }, + { + "epoch": 0.005676579018085314, + "grad_norm": 1.0120148658752441, + "learning_rate": 1.702543206407194e-05, + "loss": 7.4305, + "step": 4040 + }, + { + "epoch": 0.005690629956248892, + "grad_norm": 1.0194205045700073, + "learning_rate": 1.706758465645637e-05, + "loss": 7.4132, + "step": 4050 + }, + { + "epoch": 0.005704680894412469, + "grad_norm": 0.8495796918869019, + "learning_rate": 1.71097372488408e-05, + "loss": 7.4617, + "step": 4060 + }, + { + "epoch": 0.005718731832576047, + "grad_norm": 1.0692280530929565, + "learning_rate": 1.7151889841225233e-05, + "loss": 7.4526, + "step": 4070 + }, + { + "epoch": 0.005732782770739624, + "grad_norm": 0.9888425469398499, + "learning_rate": 1.7194042433609667e-05, + "loss": 7.4095, + "step": 4080 + }, + { + "epoch": 0.005746833708903202, + "grad_norm": 1.1134666204452515, + "learning_rate": 1.72361950259941e-05, + "loss": 7.4779, + "step": 4090 + }, + { + "epoch": 0.005760884647066779, + "grad_norm": 0.9555155634880066, + "learning_rate": 1.727834761837853e-05, + "loss": 7.4583, + "step": 4100 + }, + { + "epoch": 0.0057749355852303565, + "grad_norm": 1.1139003038406372, + "learning_rate": 1.732050021076296e-05, + "loss": 7.3599, + "step": 4110 + }, + { + "epoch": 0.0057889865233939335, + "grad_norm": 0.9886006116867065, + "learning_rate": 1.7362652803147392e-05, + "loss": 7.4064, + "step": 4120 + }, + { + "epoch": 0.0058030374615575114, + "grad_norm": 1.1849690675735474, + "learning_rate": 1.7404805395531823e-05, + "loss": 7.4462, + "step": 4130 + }, + { + "epoch": 0.0058170883997210885, + "grad_norm": 1.1874699592590332, + "learning_rate": 1.7446957987916258e-05, + "loss": 7.4456, + "step": 4140 + }, + { + "epoch": 0.005831139337884666, + "grad_norm": 1.0019316673278809, + 
"learning_rate": 1.7489110580300686e-05, + "loss": 7.4241, + "step": 4150 + }, + { + "epoch": 0.005845190276048244, + "grad_norm": 1.0301438570022583, + "learning_rate": 1.753126317268512e-05, + "loss": 7.3951, + "step": 4160 + }, + { + "epoch": 0.005859241214211821, + "grad_norm": 1.0591559410095215, + "learning_rate": 1.757341576506955e-05, + "loss": 7.4101, + "step": 4170 + }, + { + "epoch": 0.005873292152375399, + "grad_norm": 1.3416529893875122, + "learning_rate": 1.7615568357453982e-05, + "loss": 7.4434, + "step": 4180 + }, + { + "epoch": 0.005887343090538976, + "grad_norm": 1.153102159500122, + "learning_rate": 1.7657720949838414e-05, + "loss": 7.4831, + "step": 4190 + }, + { + "epoch": 0.005901394028702554, + "grad_norm": 1.0388206243515015, + "learning_rate": 1.7699873542222845e-05, + "loss": 7.4446, + "step": 4200 + }, + { + "epoch": 0.005915444966866131, + "grad_norm": 0.9760084748268127, + "learning_rate": 1.7742026134607276e-05, + "loss": 7.4868, + "step": 4210 + }, + { + "epoch": 0.005929495905029709, + "grad_norm": 1.2405914068222046, + "learning_rate": 1.778417872699171e-05, + "loss": 7.429, + "step": 4220 + }, + { + "epoch": 0.005943546843193286, + "grad_norm": 0.9897080659866333, + "learning_rate": 1.7826331319376138e-05, + "loss": 7.3724, + "step": 4230 + }, + { + "epoch": 0.005957597781356864, + "grad_norm": 0.85589200258255, + "learning_rate": 1.7868483911760573e-05, + "loss": 7.451, + "step": 4240 + }, + { + "epoch": 0.005971648719520442, + "grad_norm": 1.0381278991699219, + "learning_rate": 1.7910636504145004e-05, + "loss": 7.3856, + "step": 4250 + }, + { + "epoch": 0.005985699657684019, + "grad_norm": 1.2100075483322144, + "learning_rate": 1.7952789096529435e-05, + "loss": 7.4264, + "step": 4260 + }, + { + "epoch": 0.005999750595847597, + "grad_norm": 0.9492905139923096, + "learning_rate": 1.7994941688913866e-05, + "loss": 7.4114, + "step": 4270 + }, + { + "epoch": 0.006013801534011174, + "grad_norm": 0.92913419008255, + "learning_rate": 
1.8037094281298297e-05, + "loss": 7.4022, + "step": 4280 + }, + { + "epoch": 0.006027852472174752, + "grad_norm": 1.1783087253570557, + "learning_rate": 1.807924687368273e-05, + "loss": 7.3281, + "step": 4290 + }, + { + "epoch": 0.006041903410338329, + "grad_norm": 1.008894681930542, + "learning_rate": 1.8121399466067163e-05, + "loss": 7.39, + "step": 4300 + }, + { + "epoch": 0.006055954348501907, + "grad_norm": 0.9797470569610596, + "learning_rate": 1.816355205845159e-05, + "loss": 7.4415, + "step": 4310 + }, + { + "epoch": 0.006070005286665484, + "grad_norm": 1.3586163520812988, + "learning_rate": 1.8205704650836025e-05, + "loss": 7.3472, + "step": 4320 + }, + { + "epoch": 0.006084056224829062, + "grad_norm": 1.0115594863891602, + "learning_rate": 1.8247857243220457e-05, + "loss": 7.4058, + "step": 4330 + }, + { + "epoch": 0.006098107162992639, + "grad_norm": 1.0275291204452515, + "learning_rate": 1.8290009835604888e-05, + "loss": 7.2664, + "step": 4340 + }, + { + "epoch": 0.0061121581011562166, + "grad_norm": 0.9744628667831421, + "learning_rate": 1.833216242798932e-05, + "loss": 7.3819, + "step": 4350 + }, + { + "epoch": 0.0061262090393197945, + "grad_norm": 1.1602356433868408, + "learning_rate": 1.837431502037375e-05, + "loss": 7.3576, + "step": 4360 + }, + { + "epoch": 0.0061402599774833715, + "grad_norm": 1.18546462059021, + "learning_rate": 1.8416467612758185e-05, + "loss": 7.3605, + "step": 4370 + }, + { + "epoch": 0.006154310915646949, + "grad_norm": 0.9172095060348511, + "learning_rate": 1.8458620205142616e-05, + "loss": 7.3156, + "step": 4380 + }, + { + "epoch": 0.006168361853810526, + "grad_norm": 1.1265218257904053, + "learning_rate": 1.8500772797527047e-05, + "loss": 7.3078, + "step": 4390 + }, + { + "epoch": 0.006182412791974104, + "grad_norm": 0.9350690841674805, + "learning_rate": 1.8542925389911478e-05, + "loss": 7.3547, + "step": 4400 + }, + { + "epoch": 0.006196463730137681, + "grad_norm": 0.9449189901351929, + "learning_rate": 
1.858507798229591e-05, + "loss": 7.3303, + "step": 4410 + }, + { + "epoch": 0.006210514668301259, + "grad_norm": 1.1331335306167603, + "learning_rate": 1.862723057468034e-05, + "loss": 7.3691, + "step": 4420 + }, + { + "epoch": 0.006224565606464836, + "grad_norm": 1.1100194454193115, + "learning_rate": 1.8669383167064772e-05, + "loss": 7.3479, + "step": 4430 + }, + { + "epoch": 0.006238616544628414, + "grad_norm": 1.2014113664627075, + "learning_rate": 1.8711535759449203e-05, + "loss": 7.2566, + "step": 4440 + }, + { + "epoch": 0.006252667482791992, + "grad_norm": 1.2027499675750732, + "learning_rate": 1.8753688351833634e-05, + "loss": 7.3306, + "step": 4450 + }, + { + "epoch": 0.006266718420955569, + "grad_norm": 1.2113648653030396, + "learning_rate": 1.879584094421807e-05, + "loss": 7.2312, + "step": 4460 + }, + { + "epoch": 0.006280769359119147, + "grad_norm": 1.14470636844635, + "learning_rate": 1.88379935366025e-05, + "loss": 7.3603, + "step": 4470 + }, + { + "epoch": 0.006294820297282724, + "grad_norm": 1.0293289422988892, + "learning_rate": 1.888014612898693e-05, + "loss": 7.3057, + "step": 4480 + }, + { + "epoch": 0.006308871235446302, + "grad_norm": 1.1022226810455322, + "learning_rate": 1.8922298721371362e-05, + "loss": 7.2761, + "step": 4490 + }, + { + "epoch": 0.006322922173609879, + "grad_norm": 1.1800730228424072, + "learning_rate": 1.8964451313755793e-05, + "loss": 7.2929, + "step": 4500 + }, + { + "epoch": 0.006336973111773457, + "grad_norm": 0.8201963901519775, + "learning_rate": 1.9006603906140228e-05, + "loss": 7.3127, + "step": 4510 + }, + { + "epoch": 0.006351024049937034, + "grad_norm": 1.0580271482467651, + "learning_rate": 1.904875649852466e-05, + "loss": 7.2965, + "step": 4520 + }, + { + "epoch": 0.006365074988100612, + "grad_norm": 1.135973572731018, + "learning_rate": 1.9090909090909087e-05, + "loss": 7.3115, + "step": 4530 + }, + { + "epoch": 0.006379125926264189, + "grad_norm": 1.0042225122451782, + "learning_rate": 
1.913306168329352e-05, + "loss": 7.326, + "step": 4540 + }, + { + "epoch": 0.006393176864427767, + "grad_norm": 1.073927879333496, + "learning_rate": 1.9175214275677952e-05, + "loss": 7.2742, + "step": 4550 + }, + { + "epoch": 0.006407227802591345, + "grad_norm": 1.384734034538269, + "learning_rate": 1.9217366868062384e-05, + "loss": 7.2784, + "step": 4560 + }, + { + "epoch": 0.006421278740754922, + "grad_norm": 1.0155476331710815, + "learning_rate": 1.9259519460446818e-05, + "loss": 7.2515, + "step": 4570 + }, + { + "epoch": 0.0064353296789185, + "grad_norm": 1.076328992843628, + "learning_rate": 1.9301672052831246e-05, + "loss": 7.3195, + "step": 4580 + }, + { + "epoch": 0.006449380617082077, + "grad_norm": 1.0640712976455688, + "learning_rate": 1.934382464521568e-05, + "loss": 7.2912, + "step": 4590 + }, + { + "epoch": 0.0064634315552456545, + "grad_norm": 0.9461340308189392, + "learning_rate": 1.9385977237600112e-05, + "loss": 7.3366, + "step": 4600 + }, + { + "epoch": 0.0064774824934092316, + "grad_norm": 0.9778033494949341, + "learning_rate": 1.9428129829984543e-05, + "loss": 7.2975, + "step": 4610 + }, + { + "epoch": 0.0064915334315728095, + "grad_norm": 1.0978727340698242, + "learning_rate": 1.9470282422368974e-05, + "loss": 7.271, + "step": 4620 + }, + { + "epoch": 0.0065055843697363865, + "grad_norm": 1.1852045059204102, + "learning_rate": 1.9512435014753405e-05, + "loss": 7.2987, + "step": 4630 + }, + { + "epoch": 0.006519635307899964, + "grad_norm": 1.0794644355773926, + "learning_rate": 1.9554587607137836e-05, + "loss": 7.2204, + "step": 4640 + }, + { + "epoch": 0.006533686246063541, + "grad_norm": 1.1216827630996704, + "learning_rate": 1.959674019952227e-05, + "loss": 7.2575, + "step": 4650 + }, + { + "epoch": 0.006547737184227119, + "grad_norm": 0.9647300839424133, + "learning_rate": 1.96388927919067e-05, + "loss": 7.3514, + "step": 4660 + }, + { + "epoch": 0.006561788122390697, + "grad_norm": 1.1785202026367188, + "learning_rate": 
1.9681045384291133e-05, + "loss": 7.2722, + "step": 4670 + }, + { + "epoch": 0.006575839060554274, + "grad_norm": 1.2579607963562012, + "learning_rate": 1.9723197976675564e-05, + "loss": 7.2684, + "step": 4680 + }, + { + "epoch": 0.006589889998717852, + "grad_norm": 1.069290280342102, + "learning_rate": 1.9765350569059996e-05, + "loss": 7.2563, + "step": 4690 + }, + { + "epoch": 0.006603940936881429, + "grad_norm": 1.1216323375701904, + "learning_rate": 1.9807503161444427e-05, + "loss": 7.1934, + "step": 4700 + }, + { + "epoch": 0.006617991875045007, + "grad_norm": 1.0338348150253296, + "learning_rate": 1.9849655753828858e-05, + "loss": 7.2595, + "step": 4710 + }, + { + "epoch": 0.006632042813208584, + "grad_norm": 1.0657542943954468, + "learning_rate": 1.989180834621329e-05, + "loss": 7.2447, + "step": 4720 + }, + { + "epoch": 0.006646093751372162, + "grad_norm": 1.0090150833129883, + "learning_rate": 1.9933960938597724e-05, + "loss": 7.2676, + "step": 4730 + }, + { + "epoch": 0.006660144689535739, + "grad_norm": 1.244844913482666, + "learning_rate": 1.997611353098215e-05, + "loss": 7.2931, + "step": 4740 + }, + { + "epoch": 0.006674195627699317, + "grad_norm": 1.4052555561065674, + "learning_rate": 2.0018266123366586e-05, + "loss": 7.3218, + "step": 4750 + }, + { + "epoch": 0.006688246565862895, + "grad_norm": 1.0389999151229858, + "learning_rate": 2.0060418715751017e-05, + "loss": 7.3679, + "step": 4760 + }, + { + "epoch": 0.006702297504026472, + "grad_norm": 1.2123712301254272, + "learning_rate": 2.010257130813545e-05, + "loss": 7.3174, + "step": 4770 + }, + { + "epoch": 0.00671634844219005, + "grad_norm": 1.31826651096344, + "learning_rate": 2.014472390051988e-05, + "loss": 7.2676, + "step": 4780 + }, + { + "epoch": 0.006730399380353627, + "grad_norm": 1.0967237949371338, + "learning_rate": 2.018687649290431e-05, + "loss": 7.2652, + "step": 4790 + }, + { + "epoch": 0.006744450318517205, + "grad_norm": 1.1359235048294067, + "learning_rate": 
2.0229029085288742e-05, + "loss": 7.2868, + "step": 4800 + }, + { + "epoch": 0.006758501256680782, + "grad_norm": 1.052154779434204, + "learning_rate": 2.0271181677673176e-05, + "loss": 7.288, + "step": 4810 + }, + { + "epoch": 0.00677255219484436, + "grad_norm": 1.0705679655075073, + "learning_rate": 2.0313334270057608e-05, + "loss": 7.2405, + "step": 4820 + }, + { + "epoch": 0.006786603133007937, + "grad_norm": 0.9692743420600891, + "learning_rate": 2.035548686244204e-05, + "loss": 7.1738, + "step": 4830 + }, + { + "epoch": 0.006800654071171515, + "grad_norm": 1.2184343338012695, + "learning_rate": 2.039763945482647e-05, + "loss": 7.2294, + "step": 4840 + }, + { + "epoch": 0.006814705009335092, + "grad_norm": 1.4309422969818115, + "learning_rate": 2.04397920472109e-05, + "loss": 7.2412, + "step": 4850 + }, + { + "epoch": 0.0068287559474986695, + "grad_norm": 1.538222312927246, + "learning_rate": 2.0481944639595336e-05, + "loss": 7.1677, + "step": 4860 + }, + { + "epoch": 0.006842806885662247, + "grad_norm": 1.1325163841247559, + "learning_rate": 2.0524097231979763e-05, + "loss": 7.2538, + "step": 4870 + }, + { + "epoch": 0.0068568578238258245, + "grad_norm": 1.0051361322402954, + "learning_rate": 2.0566249824364198e-05, + "loss": 7.2056, + "step": 4880 + }, + { + "epoch": 0.006870908761989402, + "grad_norm": 1.08148193359375, + "learning_rate": 2.060840241674863e-05, + "loss": 7.1734, + "step": 4890 + }, + { + "epoch": 0.006884959700152979, + "grad_norm": 1.2696799039840698, + "learning_rate": 2.065055500913306e-05, + "loss": 7.2288, + "step": 4900 + }, + { + "epoch": 0.006899010638316557, + "grad_norm": 1.0847587585449219, + "learning_rate": 2.069270760151749e-05, + "loss": 7.2802, + "step": 4910 + }, + { + "epoch": 0.006913061576480134, + "grad_norm": 1.2094314098358154, + "learning_rate": 2.0734860193901923e-05, + "loss": 7.2064, + "step": 4920 + }, + { + "epoch": 0.006927112514643712, + "grad_norm": 1.1414620876312256, + "learning_rate": 
2.0777012786286354e-05, + "loss": 7.1366, + "step": 4930 + }, + { + "epoch": 0.006941163452807289, + "grad_norm": 1.2508331537246704, + "learning_rate": 2.081916537867079e-05, + "loss": 7.2477, + "step": 4940 + }, + { + "epoch": 0.006955214390970867, + "grad_norm": 1.1051875352859497, + "learning_rate": 2.0861317971055216e-05, + "loss": 7.2439, + "step": 4950 + }, + { + "epoch": 0.006969265329134445, + "grad_norm": 0.9596846103668213, + "learning_rate": 2.090347056343965e-05, + "loss": 7.1859, + "step": 4960 + }, + { + "epoch": 0.006983316267298022, + "grad_norm": 1.0876295566558838, + "learning_rate": 2.0945623155824082e-05, + "loss": 7.2189, + "step": 4970 + }, + { + "epoch": 0.0069973672054616, + "grad_norm": 1.1610974073410034, + "learning_rate": 2.0987775748208513e-05, + "loss": 7.1999, + "step": 4980 + }, + { + "epoch": 0.007011418143625177, + "grad_norm": 1.023998498916626, + "learning_rate": 2.1029928340592944e-05, + "loss": 7.1728, + "step": 4990 + }, + { + "epoch": 0.007025469081788755, + "grad_norm": 1.176242470741272, + "learning_rate": 2.1072080932977375e-05, + "loss": 7.162, + "step": 5000 + }, + { + "epoch": 0.007039520019952332, + "grad_norm": 1.1266804933547974, + "learning_rate": 2.1114233525361806e-05, + "loss": 7.2674, + "step": 5010 + }, + { + "epoch": 0.00705357095811591, + "grad_norm": 1.0107471942901611, + "learning_rate": 2.115638611774624e-05, + "loss": 7.168, + "step": 5020 + }, + { + "epoch": 0.007067621896279487, + "grad_norm": 0.9729942679405212, + "learning_rate": 2.119853871013067e-05, + "loss": 7.2244, + "step": 5030 + }, + { + "epoch": 0.007081672834443065, + "grad_norm": 1.117859125137329, + "learning_rate": 2.1240691302515103e-05, + "loss": 7.1621, + "step": 5040 + }, + { + "epoch": 0.007095723772606642, + "grad_norm": 1.206984281539917, + "learning_rate": 2.1282843894899535e-05, + "loss": 7.2517, + "step": 5050 + }, + { + "epoch": 0.00710977471077022, + "grad_norm": 1.356512188911438, + "learning_rate": 2.1324996487283966e-05, + 
"loss": 7.2202, + "step": 5060 + }, + { + "epoch": 0.007123825648933798, + "grad_norm": 1.0092519521713257, + "learning_rate": 2.13671490796684e-05, + "loss": 7.2292, + "step": 5070 + }, + { + "epoch": 0.007137876587097375, + "grad_norm": 1.4104299545288086, + "learning_rate": 2.1409301672052828e-05, + "loss": 7.1438, + "step": 5080 + }, + { + "epoch": 0.0071519275252609526, + "grad_norm": 1.2122313976287842, + "learning_rate": 2.145145426443726e-05, + "loss": 7.1562, + "step": 5090 + }, + { + "epoch": 0.00716597846342453, + "grad_norm": 1.1602479219436646, + "learning_rate": 2.1493606856821694e-05, + "loss": 7.166, + "step": 5100 + }, + { + "epoch": 0.0071800294015881075, + "grad_norm": 1.1590441465377808, + "learning_rate": 2.1535759449206125e-05, + "loss": 7.155, + "step": 5110 + }, + { + "epoch": 0.0071940803397516845, + "grad_norm": 0.9332759380340576, + "learning_rate": 2.1577912041590556e-05, + "loss": 7.1189, + "step": 5120 + }, + { + "epoch": 0.007208131277915262, + "grad_norm": 1.1661341190338135, + "learning_rate": 2.1620064633974987e-05, + "loss": 7.1424, + "step": 5130 + }, + { + "epoch": 0.0072221822160788395, + "grad_norm": 1.2383019924163818, + "learning_rate": 2.166221722635942e-05, + "loss": 7.1516, + "step": 5140 + }, + { + "epoch": 0.007236233154242417, + "grad_norm": 1.1211152076721191, + "learning_rate": 2.1704369818743853e-05, + "loss": 7.0884, + "step": 5150 + }, + { + "epoch": 0.007250284092405995, + "grad_norm": 1.0935217142105103, + "learning_rate": 2.174652241112828e-05, + "loss": 7.1914, + "step": 5160 + }, + { + "epoch": 0.007264335030569572, + "grad_norm": 1.0398094654083252, + "learning_rate": 2.1788675003512712e-05, + "loss": 7.1011, + "step": 5170 + }, + { + "epoch": 0.00727838596873315, + "grad_norm": 1.0619064569473267, + "learning_rate": 2.1830827595897146e-05, + "loss": 7.1611, + "step": 5180 + }, + { + "epoch": 0.007292436906896727, + "grad_norm": 1.0957516431808472, + "learning_rate": 2.1872980188281578e-05, + "loss": 7.0981, 
+ "step": 5190 + }, + { + "epoch": 0.007306487845060305, + "grad_norm": 1.1831027269363403, + "learning_rate": 2.191513278066601e-05, + "loss": 7.1843, + "step": 5200 + }, + { + "epoch": 0.007320538783223882, + "grad_norm": 1.323491096496582, + "learning_rate": 2.195728537305044e-05, + "loss": 7.1189, + "step": 5210 + }, + { + "epoch": 0.00733458972138746, + "grad_norm": 1.022131085395813, + "learning_rate": 2.199943796543487e-05, + "loss": 7.1855, + "step": 5220 + }, + { + "epoch": 0.007348640659551037, + "grad_norm": 1.081374168395996, + "learning_rate": 2.2041590557819306e-05, + "loss": 7.1447, + "step": 5230 + }, + { + "epoch": 0.007362691597714615, + "grad_norm": 1.0144031047821045, + "learning_rate": 2.2083743150203733e-05, + "loss": 7.1438, + "step": 5240 + }, + { + "epoch": 0.007376742535878192, + "grad_norm": 1.2261079549789429, + "learning_rate": 2.2125895742588168e-05, + "loss": 7.2035, + "step": 5250 + }, + { + "epoch": 0.00739079347404177, + "grad_norm": 1.2673313617706299, + "learning_rate": 2.21680483349726e-05, + "loss": 7.1767, + "step": 5260 + }, + { + "epoch": 0.007404844412205348, + "grad_norm": 1.063643455505371, + "learning_rate": 2.221020092735703e-05, + "loss": 7.1446, + "step": 5270 + }, + { + "epoch": 0.007418895350368925, + "grad_norm": 1.092332124710083, + "learning_rate": 2.225235351974146e-05, + "loss": 7.132, + "step": 5280 + }, + { + "epoch": 0.007432946288532503, + "grad_norm": 1.2160872220993042, + "learning_rate": 2.2294506112125893e-05, + "loss": 7.2127, + "step": 5290 + }, + { + "epoch": 0.00744699722669608, + "grad_norm": 1.1070367097854614, + "learning_rate": 2.2336658704510324e-05, + "loss": 7.1532, + "step": 5300 + }, + { + "epoch": 0.007461048164859658, + "grad_norm": 1.136398196220398, + "learning_rate": 2.237881129689476e-05, + "loss": 7.1131, + "step": 5310 + }, + { + "epoch": 0.007475099103023235, + "grad_norm": 1.0606321096420288, + "learning_rate": 2.242096388927919e-05, + "loss": 7.0342, + "step": 5320 + }, + { + 
"epoch": 0.007489150041186813, + "grad_norm": 1.138594150543213, + "learning_rate": 2.246311648166362e-05, + "loss": 7.0824, + "step": 5330 + }, + { + "epoch": 0.00750320097935039, + "grad_norm": 1.3768428564071655, + "learning_rate": 2.2505269074048052e-05, + "loss": 7.1352, + "step": 5340 + }, + { + "epoch": 0.0075172519175139675, + "grad_norm": 1.2241921424865723, + "learning_rate": 2.2547421666432483e-05, + "loss": 7.0765, + "step": 5350 + }, + { + "epoch": 0.0075313028556775454, + "grad_norm": 1.0443278551101685, + "learning_rate": 2.2589574258816918e-05, + "loss": 7.0452, + "step": 5360 + }, + { + "epoch": 0.0075453537938411225, + "grad_norm": 1.295654296875, + "learning_rate": 2.2631726851201345e-05, + "loss": 7.0193, + "step": 5370 + }, + { + "epoch": 0.0075594047320047, + "grad_norm": 1.6669979095458984, + "learning_rate": 2.2673879443585777e-05, + "loss": 7.0958, + "step": 5380 + }, + { + "epoch": 0.007573455670168277, + "grad_norm": 1.039218544960022, + "learning_rate": 2.271603203597021e-05, + "loss": 7.1377, + "step": 5390 + }, + { + "epoch": 0.007587506608331855, + "grad_norm": 1.0409983396530151, + "learning_rate": 2.2758184628354642e-05, + "loss": 7.0415, + "step": 5400 + }, + { + "epoch": 0.007601557546495432, + "grad_norm": 1.0646721124649048, + "learning_rate": 2.2800337220739073e-05, + "loss": 7.0761, + "step": 5410 + }, + { + "epoch": 0.00761560848465901, + "grad_norm": 1.143831729888916, + "learning_rate": 2.2842489813123505e-05, + "loss": 7.083, + "step": 5420 + }, + { + "epoch": 0.007629659422822587, + "grad_norm": 1.0598105192184448, + "learning_rate": 2.2884642405507936e-05, + "loss": 7.0908, + "step": 5430 + }, + { + "epoch": 0.007643710360986165, + "grad_norm": 1.2153172492980957, + "learning_rate": 2.292679499789237e-05, + "loss": 7.0801, + "step": 5440 + }, + { + "epoch": 0.007657761299149742, + "grad_norm": 1.0428032875061035, + "learning_rate": 2.2968947590276798e-05, + "loss": 7.1072, + "step": 5450 + }, + { + "epoch": 
0.00767181223731332, + "grad_norm": 1.0740203857421875, + "learning_rate": 2.301110018266123e-05, + "loss": 7.101, + "step": 5460 + }, + { + "epoch": 0.007685863175476898, + "grad_norm": 1.193075180053711, + "learning_rate": 2.3053252775045664e-05, + "loss": 7.1157, + "step": 5470 + }, + { + "epoch": 0.007699914113640475, + "grad_norm": 1.1059104204177856, + "learning_rate": 2.3095405367430095e-05, + "loss": 7.1466, + "step": 5480 + }, + { + "epoch": 0.007713965051804053, + "grad_norm": 1.121106743812561, + "learning_rate": 2.3137557959814526e-05, + "loss": 7.1207, + "step": 5490 + }, + { + "epoch": 0.00772801598996763, + "grad_norm": 1.1866158246994019, + "learning_rate": 2.3179710552198957e-05, + "loss": 7.0526, + "step": 5500 + }, + { + "epoch": 0.007742066928131208, + "grad_norm": 1.1638514995574951, + "learning_rate": 2.322186314458339e-05, + "loss": 7.0261, + "step": 5510 + }, + { + "epoch": 0.007756117866294785, + "grad_norm": 1.1464370489120483, + "learning_rate": 2.3264015736967823e-05, + "loss": 7.1148, + "step": 5520 + }, + { + "epoch": 0.007770168804458363, + "grad_norm": 1.263282060623169, + "learning_rate": 2.330616832935225e-05, + "loss": 7.1217, + "step": 5530 + }, + { + "epoch": 0.00778421974262194, + "grad_norm": 1.1215978860855103, + "learning_rate": 2.3348320921736685e-05, + "loss": 7.1119, + "step": 5540 + }, + { + "epoch": 0.007798270680785518, + "grad_norm": 1.093409776687622, + "learning_rate": 2.3390473514121117e-05, + "loss": 7.0892, + "step": 5550 + }, + { + "epoch": 0.007812321618949095, + "grad_norm": 1.2007728815078735, + "learning_rate": 2.3432626106505548e-05, + "loss": 7.051, + "step": 5560 + }, + { + "epoch": 0.007826372557112674, + "grad_norm": 1.0814827680587769, + "learning_rate": 2.3474778698889982e-05, + "loss": 7.1414, + "step": 5570 + }, + { + "epoch": 0.00784042349527625, + "grad_norm": 1.2719553709030151, + "learning_rate": 2.351693129127441e-05, + "loss": 7.0283, + "step": 5580 + }, + { + "epoch": 0.007854474433439828, + 
"grad_norm": 1.3586655855178833, + "learning_rate": 2.355908388365884e-05, + "loss": 7.0621, + "step": 5590 + }, + { + "epoch": 0.007868525371603406, + "grad_norm": 1.0803841352462769, + "learning_rate": 2.3601236476043276e-05, + "loss": 7.0976, + "step": 5600 + }, + { + "epoch": 0.007882576309766983, + "grad_norm": 1.3272629976272583, + "learning_rate": 2.3643389068427707e-05, + "loss": 7.0699, + "step": 5610 + }, + { + "epoch": 0.00789662724793056, + "grad_norm": 1.1906023025512695, + "learning_rate": 2.3685541660812138e-05, + "loss": 7.1024, + "step": 5620 + }, + { + "epoch": 0.007910678186094137, + "grad_norm": 1.0353652238845825, + "learning_rate": 2.372769425319657e-05, + "loss": 7.1339, + "step": 5630 + }, + { + "epoch": 0.007924729124257715, + "grad_norm": 1.1890437602996826, + "learning_rate": 2.3769846845581e-05, + "loss": 7.1143, + "step": 5640 + }, + { + "epoch": 0.007938780062421293, + "grad_norm": 1.1110581159591675, + "learning_rate": 2.3811999437965435e-05, + "loss": 7.0033, + "step": 5650 + }, + { + "epoch": 0.00795283100058487, + "grad_norm": 1.1018701791763306, + "learning_rate": 2.3854152030349863e-05, + "loss": 7.0789, + "step": 5660 + }, + { + "epoch": 0.007966881938748447, + "grad_norm": 1.2645286321640015, + "learning_rate": 2.3896304622734294e-05, + "loss": 7.0673, + "step": 5670 + }, + { + "epoch": 0.007980932876912025, + "grad_norm": 1.2000459432601929, + "learning_rate": 2.393845721511873e-05, + "loss": 7.0056, + "step": 5680 + }, + { + "epoch": 0.007994983815075603, + "grad_norm": 1.5909388065338135, + "learning_rate": 2.398060980750316e-05, + "loss": 7.0763, + "step": 5690 + }, + { + "epoch": 0.008009034753239181, + "grad_norm": 1.1488021612167358, + "learning_rate": 2.402276239988759e-05, + "loss": 7.0547, + "step": 5700 + }, + { + "epoch": 0.008023085691402757, + "grad_norm": 1.6401759386062622, + "learning_rate": 2.4064914992272022e-05, + "loss": 7.0965, + "step": 5710 + }, + { + "epoch": 0.008037136629566335, + "grad_norm": 
1.1876939535140991, + "learning_rate": 2.4107067584656453e-05, + "loss": 7.0763, + "step": 5720 + }, + { + "epoch": 0.008051187567729913, + "grad_norm": 1.2102808952331543, + "learning_rate": 2.4149220177040888e-05, + "loss": 6.9282, + "step": 5730 + }, + { + "epoch": 0.008065238505893491, + "grad_norm": 1.1134921312332153, + "learning_rate": 2.4191372769425316e-05, + "loss": 7.0256, + "step": 5740 + }, + { + "epoch": 0.008079289444057067, + "grad_norm": 1.1920967102050781, + "learning_rate": 2.4233525361809747e-05, + "loss": 7.0314, + "step": 5750 + }, + { + "epoch": 0.008093340382220645, + "grad_norm": 1.218900442123413, + "learning_rate": 2.427567795419418e-05, + "loss": 7.0789, + "step": 5760 + }, + { + "epoch": 0.008107391320384223, + "grad_norm": 1.160243034362793, + "learning_rate": 2.4317830546578612e-05, + "loss": 6.9588, + "step": 5770 + }, + { + "epoch": 0.0081214422585478, + "grad_norm": 1.1619845628738403, + "learning_rate": 2.4359983138963044e-05, + "loss": 7.0157, + "step": 5780 + }, + { + "epoch": 0.008135493196711379, + "grad_norm": 1.2614120244979858, + "learning_rate": 2.4402135731347475e-05, + "loss": 6.9232, + "step": 5790 + }, + { + "epoch": 0.008149544134874955, + "grad_norm": 1.1834042072296143, + "learning_rate": 2.4444288323731906e-05, + "loss": 7.0801, + "step": 5800 + }, + { + "epoch": 0.008163595073038533, + "grad_norm": 1.147611141204834, + "learning_rate": 2.448644091611634e-05, + "loss": 7.0824, + "step": 5810 + }, + { + "epoch": 0.00817764601120211, + "grad_norm": 1.121339201927185, + "learning_rate": 2.452859350850077e-05, + "loss": 6.9956, + "step": 5820 + }, + { + "epoch": 0.008191696949365689, + "grad_norm": 1.15691077709198, + "learning_rate": 2.45707461008852e-05, + "loss": 6.977, + "step": 5830 + }, + { + "epoch": 0.008205747887529265, + "grad_norm": 1.2041352987289429, + "learning_rate": 2.4612898693269634e-05, + "loss": 6.9522, + "step": 5840 + }, + { + "epoch": 0.008219798825692843, + "grad_norm": 1.0645039081573486, + 
"learning_rate": 2.4655051285654065e-05, + "loss": 7.0521, + "step": 5850 + }, + { + "epoch": 0.00823384976385642, + "grad_norm": 1.0398435592651367, + "learning_rate": 2.46972038780385e-05, + "loss": 6.9993, + "step": 5860 + }, + { + "epoch": 0.008247900702019998, + "grad_norm": 1.1055554151535034, + "learning_rate": 2.4739356470422927e-05, + "loss": 7.0278, + "step": 5870 + }, + { + "epoch": 0.008261951640183576, + "grad_norm": 1.3027024269104004, + "learning_rate": 2.478150906280736e-05, + "loss": 7.0749, + "step": 5880 + }, + { + "epoch": 0.008276002578347152, + "grad_norm": 1.2701828479766846, + "learning_rate": 2.4823661655191793e-05, + "loss": 6.9991, + "step": 5890 + }, + { + "epoch": 0.00829005351651073, + "grad_norm": 1.2022544145584106, + "learning_rate": 2.4865814247576224e-05, + "loss": 6.957, + "step": 5900 + }, + { + "epoch": 0.008304104454674308, + "grad_norm": 1.1316558122634888, + "learning_rate": 2.4907966839960656e-05, + "loss": 6.9934, + "step": 5910 + }, + { + "epoch": 0.008318155392837886, + "grad_norm": 1.539913535118103, + "learning_rate": 2.4950119432345087e-05, + "loss": 7.0327, + "step": 5920 + }, + { + "epoch": 0.008332206331001462, + "grad_norm": 1.088806390762329, + "learning_rate": 2.4992272024729518e-05, + "loss": 6.9959, + "step": 5930 + }, + { + "epoch": 0.00834625726916504, + "grad_norm": 1.0958342552185059, + "learning_rate": 2.5034424617113952e-05, + "loss": 7.0824, + "step": 5940 + }, + { + "epoch": 0.008360308207328618, + "grad_norm": 1.241213321685791, + "learning_rate": 2.507657720949838e-05, + "loss": 7.007, + "step": 5950 + }, + { + "epoch": 0.008374359145492196, + "grad_norm": 1.1807730197906494, + "learning_rate": 2.511872980188281e-05, + "loss": 7.0675, + "step": 5960 + }, + { + "epoch": 0.008388410083655774, + "grad_norm": 1.1051346063613892, + "learning_rate": 2.5160882394267246e-05, + "loss": 7.0387, + "step": 5970 + }, + { + "epoch": 0.00840246102181935, + "grad_norm": 1.1446903944015503, + "learning_rate": 
2.5203034986651677e-05, + "loss": 6.9076, + "step": 5980 + }, + { + "epoch": 0.008416511959982928, + "grad_norm": 1.0661766529083252, + "learning_rate": 2.5245187579036108e-05, + "loss": 6.9638, + "step": 5990 + }, + { + "epoch": 0.008430562898146506, + "grad_norm": 1.3106430768966675, + "learning_rate": 2.528734017142054e-05, + "loss": 6.9139, + "step": 6000 + }, + { + "epoch": 0.008444613836310084, + "grad_norm": 1.0190626382827759, + "learning_rate": 2.532949276380497e-05, + "loss": 7.0121, + "step": 6010 + }, + { + "epoch": 0.00845866477447366, + "grad_norm": 1.2163209915161133, + "learning_rate": 2.5371645356189405e-05, + "loss": 6.9522, + "step": 6020 + }, + { + "epoch": 0.008472715712637238, + "grad_norm": 1.3020004034042358, + "learning_rate": 2.5413797948573833e-05, + "loss": 7.0786, + "step": 6030 + }, + { + "epoch": 0.008486766650800816, + "grad_norm": 1.3286876678466797, + "learning_rate": 2.5455950540958264e-05, + "loss": 6.9549, + "step": 6040 + }, + { + "epoch": 0.008500817588964394, + "grad_norm": 1.3438900709152222, + "learning_rate": 2.54981031333427e-05, + "loss": 6.9829, + "step": 6050 + }, + { + "epoch": 0.00851486852712797, + "grad_norm": 1.0740140676498413, + "learning_rate": 2.554025572572713e-05, + "loss": 6.9899, + "step": 6060 + }, + { + "epoch": 0.008528919465291548, + "grad_norm": 1.1501201391220093, + "learning_rate": 2.5582408318111564e-05, + "loss": 7.0018, + "step": 6070 + }, + { + "epoch": 0.008542970403455126, + "grad_norm": 1.1902880668640137, + "learning_rate": 2.5624560910495992e-05, + "loss": 7.0088, + "step": 6080 + }, + { + "epoch": 0.008557021341618704, + "grad_norm": 1.0988359451293945, + "learning_rate": 2.5666713502880423e-05, + "loss": 6.9861, + "step": 6090 + }, + { + "epoch": 0.008571072279782281, + "grad_norm": 1.1356360912322998, + "learning_rate": 2.5708866095264858e-05, + "loss": 7.0025, + "step": 6100 + }, + { + "epoch": 0.008585123217945858, + "grad_norm": 1.2396526336669922, + "learning_rate": 
2.575101868764929e-05, + "loss": 6.9691, + "step": 6110 + }, + { + "epoch": 0.008599174156109436, + "grad_norm": 1.1919825077056885, + "learning_rate": 2.5793171280033717e-05, + "loss": 6.968, + "step": 6120 + }, + { + "epoch": 0.008613225094273013, + "grad_norm": 1.2087693214416504, + "learning_rate": 2.583532387241815e-05, + "loss": 6.9081, + "step": 6130 + }, + { + "epoch": 0.008627276032436591, + "grad_norm": 1.0366860628128052, + "learning_rate": 2.5877476464802583e-05, + "loss": 6.9781, + "step": 6140 + }, + { + "epoch": 0.008641326970600167, + "grad_norm": 1.0888359546661377, + "learning_rate": 2.5919629057187017e-05, + "loss": 7.0216, + "step": 6150 + }, + { + "epoch": 0.008655377908763745, + "grad_norm": 1.055550217628479, + "learning_rate": 2.5961781649571445e-05, + "loss": 6.9755, + "step": 6160 + }, + { + "epoch": 0.008669428846927323, + "grad_norm": 1.300152063369751, + "learning_rate": 2.6003934241955876e-05, + "loss": 6.9486, + "step": 6170 + }, + { + "epoch": 0.008683479785090901, + "grad_norm": 1.2063759565353394, + "learning_rate": 2.604608683434031e-05, + "loss": 6.9448, + "step": 6180 + }, + { + "epoch": 0.008697530723254479, + "grad_norm": 1.3412996530532837, + "learning_rate": 2.6088239426724742e-05, + "loss": 6.9007, + "step": 6190 + }, + { + "epoch": 0.008711581661418055, + "grad_norm": 1.2316458225250244, + "learning_rate": 2.613039201910917e-05, + "loss": 6.842, + "step": 6200 + }, + { + "epoch": 0.008725632599581633, + "grad_norm": 1.3386321067810059, + "learning_rate": 2.6172544611493604e-05, + "loss": 6.9621, + "step": 6210 + }, + { + "epoch": 0.008739683537745211, + "grad_norm": 1.2841423749923706, + "learning_rate": 2.6214697203878035e-05, + "loss": 6.9877, + "step": 6220 + }, + { + "epoch": 0.008753734475908789, + "grad_norm": 1.238442301750183, + "learning_rate": 2.625684979626247e-05, + "loss": 6.9483, + "step": 6230 + }, + { + "epoch": 0.008767785414072365, + "grad_norm": 1.2642581462860107, + "learning_rate": 
2.6299002388646898e-05, + "loss": 6.974, + "step": 6240 + }, + { + "epoch": 0.008781836352235943, + "grad_norm": 1.0157082080841064, + "learning_rate": 2.634115498103133e-05, + "loss": 6.9708, + "step": 6250 + }, + { + "epoch": 0.008795887290399521, + "grad_norm": 1.1940841674804688, + "learning_rate": 2.6383307573415763e-05, + "loss": 6.8949, + "step": 6260 + }, + { + "epoch": 0.008809938228563099, + "grad_norm": 1.1627519130706787, + "learning_rate": 2.6425460165800194e-05, + "loss": 6.9352, + "step": 6270 + }, + { + "epoch": 0.008823989166726677, + "grad_norm": 1.2034648656845093, + "learning_rate": 2.6467612758184626e-05, + "loss": 6.8588, + "step": 6280 + }, + { + "epoch": 0.008838040104890253, + "grad_norm": 1.2993476390838623, + "learning_rate": 2.6509765350569057e-05, + "loss": 6.8985, + "step": 6290 + }, + { + "epoch": 0.00885209104305383, + "grad_norm": 1.2249815464019775, + "learning_rate": 2.6551917942953488e-05, + "loss": 6.9706, + "step": 6300 + }, + { + "epoch": 0.008866141981217409, + "grad_norm": 1.2293840646743774, + "learning_rate": 2.6594070535337923e-05, + "loss": 6.8822, + "step": 6310 + }, + { + "epoch": 0.008880192919380987, + "grad_norm": 1.242686152458191, + "learning_rate": 2.6636223127722354e-05, + "loss": 6.8955, + "step": 6320 + }, + { + "epoch": 0.008894243857544563, + "grad_norm": 1.4114511013031006, + "learning_rate": 2.667837572010678e-05, + "loss": 6.8935, + "step": 6330 + }, + { + "epoch": 0.00890829479570814, + "grad_norm": 1.3672587871551514, + "learning_rate": 2.6720528312491216e-05, + "loss": 6.8912, + "step": 6340 + }, + { + "epoch": 0.008922345733871719, + "grad_norm": 1.2746614217758179, + "learning_rate": 2.6762680904875647e-05, + "loss": 6.8686, + "step": 6350 + }, + { + "epoch": 0.008936396672035296, + "grad_norm": 1.19154691696167, + "learning_rate": 2.6804833497260082e-05, + "loss": 6.9352, + "step": 6360 + }, + { + "epoch": 0.008950447610198874, + "grad_norm": 1.0989205837249756, + "learning_rate": 
2.684698608964451e-05, + "loss": 6.9164, + "step": 6370 + }, + { + "epoch": 0.00896449854836245, + "grad_norm": 1.2119799852371216, + "learning_rate": 2.688913868202894e-05, + "loss": 6.9376, + "step": 6380 + }, + { + "epoch": 0.008978549486526028, + "grad_norm": 1.1754270792007446, + "learning_rate": 2.6931291274413375e-05, + "loss": 7.0092, + "step": 6390 + }, + { + "epoch": 0.008992600424689606, + "grad_norm": 1.3046258687973022, + "learning_rate": 2.6973443866797806e-05, + "loss": 6.9119, + "step": 6400 + }, + { + "epoch": 0.009006651362853184, + "grad_norm": 1.185444712638855, + "learning_rate": 2.7015596459182234e-05, + "loss": 6.9164, + "step": 6410 + }, + { + "epoch": 0.00902070230101676, + "grad_norm": 1.089029312133789, + "learning_rate": 2.705774905156667e-05, + "loss": 6.9452, + "step": 6420 + }, + { + "epoch": 0.009034753239180338, + "grad_norm": 1.1562166213989258, + "learning_rate": 2.70999016439511e-05, + "loss": 6.9627, + "step": 6430 + }, + { + "epoch": 0.009048804177343916, + "grad_norm": 1.1243866682052612, + "learning_rate": 2.7142054236335534e-05, + "loss": 6.9085, + "step": 6440 + }, + { + "epoch": 0.009062855115507494, + "grad_norm": 1.2206236124038696, + "learning_rate": 2.7184206828719962e-05, + "loss": 6.9762, + "step": 6450 + }, + { + "epoch": 0.00907690605367107, + "grad_norm": 1.1540404558181763, + "learning_rate": 2.7226359421104393e-05, + "loss": 6.9419, + "step": 6460 + }, + { + "epoch": 0.009090956991834648, + "grad_norm": 1.1724482774734497, + "learning_rate": 2.7268512013488828e-05, + "loss": 6.8162, + "step": 6470 + }, + { + "epoch": 0.009105007929998226, + "grad_norm": 1.1414722204208374, + "learning_rate": 2.731066460587326e-05, + "loss": 6.85, + "step": 6480 + }, + { + "epoch": 0.009119058868161804, + "grad_norm": 1.3120932579040527, + "learning_rate": 2.7352817198257687e-05, + "loss": 6.883, + "step": 6490 + }, + { + "epoch": 0.009133109806325382, + "grad_norm": 1.4635417461395264, + "learning_rate": 2.739496979064212e-05, + 
"loss": 6.8511, + "step": 6500 + }, + { + "epoch": 0.009147160744488958, + "grad_norm": 1.1401208639144897, + "learning_rate": 2.7437122383026553e-05, + "loss": 6.8325, + "step": 6510 + }, + { + "epoch": 0.009161211682652536, + "grad_norm": 1.2455246448516846, + "learning_rate": 2.7479274975410987e-05, + "loss": 6.9031, + "step": 6520 + }, + { + "epoch": 0.009175262620816114, + "grad_norm": 1.1095499992370605, + "learning_rate": 2.752142756779542e-05, + "loss": 6.9847, + "step": 6530 + }, + { + "epoch": 0.009189313558979692, + "grad_norm": 1.1827448606491089, + "learning_rate": 2.7563580160179846e-05, + "loss": 6.8022, + "step": 6540 + }, + { + "epoch": 0.009203364497143268, + "grad_norm": 1.260500431060791, + "learning_rate": 2.760573275256428e-05, + "loss": 6.8418, + "step": 6550 + }, + { + "epoch": 0.009217415435306846, + "grad_norm": 1.2838935852050781, + "learning_rate": 2.7647885344948712e-05, + "loss": 6.7921, + "step": 6560 + }, + { + "epoch": 0.009231466373470424, + "grad_norm": 1.320967674255371, + "learning_rate": 2.7690037937333146e-05, + "loss": 6.9222, + "step": 6570 + }, + { + "epoch": 0.009245517311634002, + "grad_norm": 1.126835823059082, + "learning_rate": 2.7732190529717574e-05, + "loss": 6.8736, + "step": 6580 + }, + { + "epoch": 0.00925956824979758, + "grad_norm": 1.314828634262085, + "learning_rate": 2.7774343122102005e-05, + "loss": 6.9342, + "step": 6590 + }, + { + "epoch": 0.009273619187961156, + "grad_norm": 1.155680775642395, + "learning_rate": 2.781649571448644e-05, + "loss": 6.8753, + "step": 6600 + }, + { + "epoch": 0.009287670126124734, + "grad_norm": 1.1509275436401367, + "learning_rate": 2.785864830687087e-05, + "loss": 6.9379, + "step": 6610 + }, + { + "epoch": 0.009301721064288311, + "grad_norm": 1.267380952835083, + "learning_rate": 2.79008008992553e-05, + "loss": 6.729, + "step": 6620 + }, + { + "epoch": 0.00931577200245189, + "grad_norm": 1.0873631238937378, + "learning_rate": 2.7942953491639733e-05, + "loss": 6.9163, + "step": 
6630 + }, + { + "epoch": 0.009329822940615465, + "grad_norm": 1.1794565916061401, + "learning_rate": 2.7985106084024165e-05, + "loss": 6.8978, + "step": 6640 + }, + { + "epoch": 0.009343873878779043, + "grad_norm": 1.2664817571640015, + "learning_rate": 2.80272586764086e-05, + "loss": 6.8032, + "step": 6650 + }, + { + "epoch": 0.009357924816942621, + "grad_norm": 1.2982505559921265, + "learning_rate": 2.8069411268793027e-05, + "loss": 6.8812, + "step": 6660 + }, + { + "epoch": 0.0093719757551062, + "grad_norm": 1.0752772092819214, + "learning_rate": 2.8111563861177458e-05, + "loss": 6.8571, + "step": 6670 + }, + { + "epoch": 0.009386026693269777, + "grad_norm": 1.1509788036346436, + "learning_rate": 2.8153716453561893e-05, + "loss": 6.8439, + "step": 6680 + }, + { + "epoch": 0.009400077631433353, + "grad_norm": 1.1949745416641235, + "learning_rate": 2.8195869045946324e-05, + "loss": 6.8121, + "step": 6690 + }, + { + "epoch": 0.009414128569596931, + "grad_norm": 1.280206561088562, + "learning_rate": 2.823802163833075e-05, + "loss": 6.8162, + "step": 6700 + }, + { + "epoch": 0.009428179507760509, + "grad_norm": 1.1214113235473633, + "learning_rate": 2.8280174230715186e-05, + "loss": 6.9153, + "step": 6710 + }, + { + "epoch": 0.009442230445924087, + "grad_norm": 1.1382800340652466, + "learning_rate": 2.8322326823099617e-05, + "loss": 6.7568, + "step": 6720 + }, + { + "epoch": 0.009456281384087663, + "grad_norm": 1.1620829105377197, + "learning_rate": 2.8364479415484052e-05, + "loss": 6.8636, + "step": 6730 + }, + { + "epoch": 0.009470332322251241, + "grad_norm": 1.1612575054168701, + "learning_rate": 2.840663200786848e-05, + "loss": 6.8372, + "step": 6740 + }, + { + "epoch": 0.009484383260414819, + "grad_norm": 1.1702032089233398, + "learning_rate": 2.844878460025291e-05, + "loss": 6.9363, + "step": 6750 + }, + { + "epoch": 0.009498434198578397, + "grad_norm": 1.1699141263961792, + "learning_rate": 2.8490937192637345e-05, + "loss": 6.8231, + "step": 6760 + }, + { + 
"epoch": 0.009512485136741973, + "grad_norm": 1.2124075889587402, + "learning_rate": 2.8533089785021777e-05, + "loss": 6.9048, + "step": 6770 + }, + { + "epoch": 0.009526536074905551, + "grad_norm": 1.189448595046997, + "learning_rate": 2.857524237740621e-05, + "loss": 6.8002, + "step": 6780 + }, + { + "epoch": 0.009540587013069129, + "grad_norm": 1.2542109489440918, + "learning_rate": 2.861739496979064e-05, + "loss": 6.7874, + "step": 6790 + }, + { + "epoch": 0.009554637951232707, + "grad_norm": 1.1644352674484253, + "learning_rate": 2.865954756217507e-05, + "loss": 6.8212, + "step": 6800 + }, + { + "epoch": 0.009568688889396285, + "grad_norm": 1.0751014947891235, + "learning_rate": 2.8701700154559505e-05, + "loss": 6.8566, + "step": 6810 + }, + { + "epoch": 0.00958273982755986, + "grad_norm": 1.3088459968566895, + "learning_rate": 2.8743852746943936e-05, + "loss": 6.8754, + "step": 6820 + }, + { + "epoch": 0.009596790765723439, + "grad_norm": 1.2002370357513428, + "learning_rate": 2.8786005339328364e-05, + "loss": 6.7577, + "step": 6830 + }, + { + "epoch": 0.009610841703887017, + "grad_norm": 1.3408019542694092, + "learning_rate": 2.8828157931712798e-05, + "loss": 6.7329, + "step": 6840 + }, + { + "epoch": 0.009624892642050594, + "grad_norm": 1.1012219190597534, + "learning_rate": 2.887031052409723e-05, + "loss": 6.8618, + "step": 6850 + }, + { + "epoch": 0.00963894358021417, + "grad_norm": 1.610071063041687, + "learning_rate": 2.8912463116481664e-05, + "loss": 6.8109, + "step": 6860 + }, + { + "epoch": 0.009652994518377749, + "grad_norm": 1.3259707689285278, + "learning_rate": 2.895461570886609e-05, + "loss": 6.8621, + "step": 6870 + }, + { + "epoch": 0.009667045456541326, + "grad_norm": 1.3494741916656494, + "learning_rate": 2.8996768301250523e-05, + "loss": 6.8059, + "step": 6880 + }, + { + "epoch": 0.009681096394704904, + "grad_norm": 1.2349156141281128, + "learning_rate": 2.9038920893634957e-05, + "loss": 6.7744, + "step": 6890 + }, + { + "epoch": 
0.009695147332868482, + "grad_norm": 1.1511788368225098, + "learning_rate": 2.908107348601939e-05, + "loss": 6.8519, + "step": 6900 + }, + { + "epoch": 0.009709198271032058, + "grad_norm": 1.3229706287384033, + "learning_rate": 2.9123226078403816e-05, + "loss": 6.9277, + "step": 6910 + }, + { + "epoch": 0.009723249209195636, + "grad_norm": 1.3774425983428955, + "learning_rate": 2.916537867078825e-05, + "loss": 6.821, + "step": 6920 + }, + { + "epoch": 0.009737300147359214, + "grad_norm": 1.2297898530960083, + "learning_rate": 2.9207531263172682e-05, + "loss": 6.7814, + "step": 6930 + }, + { + "epoch": 0.009751351085522792, + "grad_norm": 1.274950385093689, + "learning_rate": 2.9249683855557117e-05, + "loss": 6.8029, + "step": 6940 + }, + { + "epoch": 0.009765402023686368, + "grad_norm": 1.3338391780853271, + "learning_rate": 2.9291836447941544e-05, + "loss": 6.772, + "step": 6950 + }, + { + "epoch": 0.009779452961849946, + "grad_norm": 1.3144457340240479, + "learning_rate": 2.9333989040325975e-05, + "loss": 6.8667, + "step": 6960 + }, + { + "epoch": 0.009793503900013524, + "grad_norm": 1.1216480731964111, + "learning_rate": 2.937614163271041e-05, + "loss": 6.8348, + "step": 6970 + }, + { + "epoch": 0.009807554838177102, + "grad_norm": 1.1729443073272705, + "learning_rate": 2.941829422509484e-05, + "loss": 6.8038, + "step": 6980 + }, + { + "epoch": 0.00982160577634068, + "grad_norm": 1.1787049770355225, + "learning_rate": 2.946044681747927e-05, + "loss": 6.8645, + "step": 6990 + }, + { + "epoch": 0.009835656714504256, + "grad_norm": 1.3254892826080322, + "learning_rate": 2.9502599409863704e-05, + "loss": 6.7894, + "step": 7000 + }, + { + "epoch": 0.009849707652667834, + "grad_norm": 1.362343668937683, + "learning_rate": 2.9544752002248135e-05, + "loss": 6.7402, + "step": 7010 + }, + { + "epoch": 0.009863758590831412, + "grad_norm": 1.3607126474380493, + "learning_rate": 2.958690459463257e-05, + "loss": 6.7859, + "step": 7020 + }, + { + "epoch": 0.00987780952899499, 
+ "grad_norm": 1.1887863874435425, + "learning_rate": 2.9629057187017e-05, + "loss": 6.847, + "step": 7030 + }, + { + "epoch": 0.009891860467158566, + "grad_norm": 3.412855625152588, + "learning_rate": 2.9671209779401428e-05, + "loss": 6.7606, + "step": 7040 + }, + { + "epoch": 0.009905911405322144, + "grad_norm": 1.3983832597732544, + "learning_rate": 2.9713362371785863e-05, + "loss": 6.8176, + "step": 7050 + }, + { + "epoch": 0.009919962343485722, + "grad_norm": 1.4330317974090576, + "learning_rate": 2.9755514964170294e-05, + "loss": 6.845, + "step": 7060 + }, + { + "epoch": 0.0099340132816493, + "grad_norm": 2.303821325302124, + "learning_rate": 2.979766755655473e-05, + "loss": 6.6737, + "step": 7070 + }, + { + "epoch": 0.009948064219812877, + "grad_norm": 1.079118013381958, + "learning_rate": 2.9839820148939156e-05, + "loss": 6.8816, + "step": 7080 + }, + { + "epoch": 0.009962115157976454, + "grad_norm": 1.1327508687973022, + "learning_rate": 2.9881972741323587e-05, + "loss": 6.8724, + "step": 7090 + }, + { + "epoch": 0.009976166096140032, + "grad_norm": 1.1694974899291992, + "learning_rate": 2.9924125333708022e-05, + "loss": 6.779, + "step": 7100 + }, + { + "epoch": 0.00999021703430361, + "grad_norm": 1.2839218378067017, + "learning_rate": 2.9966277926092453e-05, + "loss": 6.8217, + "step": 7110 + }, + { + "epoch": 0.010004267972467187, + "grad_norm": 1.1525654792785645, + "learning_rate": 3.000843051847688e-05, + "loss": 6.798, + "step": 7120 + }, + { + "epoch": 0.010018318910630764, + "grad_norm": 1.0679171085357666, + "learning_rate": 3.0050583110861315e-05, + "loss": 6.7938, + "step": 7130 + }, + { + "epoch": 0.010032369848794341, + "grad_norm": 1.440910816192627, + "learning_rate": 3.0092735703245747e-05, + "loss": 6.6892, + "step": 7140 + }, + { + "epoch": 0.01004642078695792, + "grad_norm": 1.1955666542053223, + "learning_rate": 3.013488829563018e-05, + "loss": 6.7765, + "step": 7150 + }, + { + "epoch": 0.010060471725121497, + "grad_norm": 
1.318375825881958, + "learning_rate": 3.017704088801461e-05, + "loss": 6.8462, + "step": 7160 + }, + { + "epoch": 0.010074522663285073, + "grad_norm": 1.328094244003296, + "learning_rate": 3.021919348039904e-05, + "loss": 6.7747, + "step": 7170 + }, + { + "epoch": 0.010088573601448651, + "grad_norm": 1.5209085941314697, + "learning_rate": 3.0261346072783475e-05, + "loss": 6.852, + "step": 7180 + }, + { + "epoch": 0.01010262453961223, + "grad_norm": 1.3846906423568726, + "learning_rate": 3.0303498665167906e-05, + "loss": 6.7635, + "step": 7190 + }, + { + "epoch": 0.010116675477775807, + "grad_norm": 1.2889553308486938, + "learning_rate": 3.0345651257552334e-05, + "loss": 6.7899, + "step": 7200 + }, + { + "epoch": 0.010130726415939385, + "grad_norm": 1.244968056678772, + "learning_rate": 3.0387803849936768e-05, + "loss": 6.8504, + "step": 7210 + }, + { + "epoch": 0.010144777354102961, + "grad_norm": 1.5130319595336914, + "learning_rate": 3.04299564423212e-05, + "loss": 6.7033, + "step": 7220 + }, + { + "epoch": 0.010158828292266539, + "grad_norm": 1.188735008239746, + "learning_rate": 3.0472109034705634e-05, + "loss": 6.7656, + "step": 7230 + }, + { + "epoch": 0.010172879230430117, + "grad_norm": 1.2299892902374268, + "learning_rate": 3.051426162709006e-05, + "loss": 6.7203, + "step": 7240 + }, + { + "epoch": 0.010186930168593695, + "grad_norm": 1.3426145315170288, + "learning_rate": 3.055641421947449e-05, + "loss": 6.7867, + "step": 7250 + }, + { + "epoch": 0.010200981106757271, + "grad_norm": 1.440065860748291, + "learning_rate": 3.0598566811858924e-05, + "loss": 6.6989, + "step": 7260 + }, + { + "epoch": 0.010215032044920849, + "grad_norm": 1.7373557090759277, + "learning_rate": 3.064071940424336e-05, + "loss": 6.762, + "step": 7270 + }, + { + "epoch": 0.010229082983084427, + "grad_norm": 1.1723196506500244, + "learning_rate": 3.068287199662779e-05, + "loss": 6.7585, + "step": 7280 + }, + { + "epoch": 0.010243133921248005, + "grad_norm": 1.1131891012191772, + 
"learning_rate": 3.072502458901222e-05, + "loss": 6.6601, + "step": 7290 + }, + { + "epoch": 0.010257184859411583, + "grad_norm": 1.3221385478973389, + "learning_rate": 3.0767177181396655e-05, + "loss": 6.798, + "step": 7300 + }, + { + "epoch": 0.010271235797575159, + "grad_norm": 1.1797014474868774, + "learning_rate": 3.080932977378109e-05, + "loss": 6.7014, + "step": 7310 + }, + { + "epoch": 0.010285286735738737, + "grad_norm": 1.2688157558441162, + "learning_rate": 3.085148236616552e-05, + "loss": 6.7095, + "step": 7320 + }, + { + "epoch": 0.010299337673902315, + "grad_norm": 1.3802592754364014, + "learning_rate": 3.089363495854995e-05, + "loss": 6.6367, + "step": 7330 + }, + { + "epoch": 0.010313388612065892, + "grad_norm": 1.2973464727401733, + "learning_rate": 3.093578755093438e-05, + "loss": 6.6415, + "step": 7340 + }, + { + "epoch": 0.010327439550229469, + "grad_norm": 1.2183587551116943, + "learning_rate": 3.097794014331881e-05, + "loss": 6.7612, + "step": 7350 + }, + { + "epoch": 0.010341490488393047, + "grad_norm": 1.2998961210250854, + "learning_rate": 3.102009273570324e-05, + "loss": 6.7186, + "step": 7360 + }, + { + "epoch": 0.010355541426556624, + "grad_norm": 1.2842721939086914, + "learning_rate": 3.1062245328087674e-05, + "loss": 6.796, + "step": 7370 + }, + { + "epoch": 0.010369592364720202, + "grad_norm": 1.5530544519424438, + "learning_rate": 3.1104397920472105e-05, + "loss": 6.7277, + "step": 7380 + }, + { + "epoch": 0.01038364330288378, + "grad_norm": 1.4051449298858643, + "learning_rate": 3.1146550512856536e-05, + "loss": 6.6586, + "step": 7390 + }, + { + "epoch": 0.010397694241047356, + "grad_norm": 1.4194215536117554, + "learning_rate": 3.1188703105240974e-05, + "loss": 6.7337, + "step": 7400 + }, + { + "epoch": 0.010411745179210934, + "grad_norm": 1.2050557136535645, + "learning_rate": 3.12308556976254e-05, + "loss": 6.7198, + "step": 7410 + }, + { + "epoch": 0.010425796117374512, + "grad_norm": 1.340126872062683, + "learning_rate": 
3.127300829000983e-05, + "loss": 6.6974, + "step": 7420 + }, + { + "epoch": 0.01043984705553809, + "grad_norm": 1.363870620727539, + "learning_rate": 3.131516088239427e-05, + "loss": 6.653, + "step": 7430 + }, + { + "epoch": 0.010453897993701666, + "grad_norm": 1.3523595333099365, + "learning_rate": 3.13573134747787e-05, + "loss": 6.6777, + "step": 7440 + }, + { + "epoch": 0.010467948931865244, + "grad_norm": 1.3135426044464111, + "learning_rate": 3.139946606716312e-05, + "loss": 6.7118, + "step": 7450 + }, + { + "epoch": 0.010481999870028822, + "grad_norm": 1.7983295917510986, + "learning_rate": 3.144161865954756e-05, + "loss": 6.5969, + "step": 7460 + }, + { + "epoch": 0.0104960508081924, + "grad_norm": 1.22897469997406, + "learning_rate": 3.148377125193199e-05, + "loss": 6.6821, + "step": 7470 + }, + { + "epoch": 0.010510101746355976, + "grad_norm": 1.2474480867385864, + "learning_rate": 3.152592384431642e-05, + "loss": 6.6954, + "step": 7480 + }, + { + "epoch": 0.010524152684519554, + "grad_norm": 1.1647069454193115, + "learning_rate": 3.1568076436700854e-05, + "loss": 6.7303, + "step": 7490 + }, + { + "epoch": 0.010538203622683132, + "grad_norm": 1.2561836242675781, + "learning_rate": 3.1610229029085286e-05, + "loss": 6.7037, + "step": 7500 + }, + { + "epoch": 0.01055225456084671, + "grad_norm": 1.7592111825942993, + "learning_rate": 3.165238162146972e-05, + "loss": 6.6393, + "step": 7510 + }, + { + "epoch": 0.010566305499010288, + "grad_norm": 1.4957785606384277, + "learning_rate": 3.169453421385415e-05, + "loss": 6.7021, + "step": 7520 + }, + { + "epoch": 0.010580356437173864, + "grad_norm": 1.2453256845474243, + "learning_rate": 3.1736686806238586e-05, + "loss": 6.7024, + "step": 7530 + }, + { + "epoch": 0.010594407375337442, + "grad_norm": 1.2314335107803345, + "learning_rate": 3.177883939862301e-05, + "loss": 6.6214, + "step": 7540 + }, + { + "epoch": 0.01060845831350102, + "grad_norm": 1.3570759296417236, + "learning_rate": 3.182099199100744e-05, + 
"loss": 6.6482, + "step": 7550 + }, + { + "epoch": 0.010622509251664598, + "grad_norm": 1.2058477401733398, + "learning_rate": 3.186314458339188e-05, + "loss": 6.7454, + "step": 7560 + }, + { + "epoch": 0.010636560189828174, + "grad_norm": 1.3977819681167603, + "learning_rate": 3.190529717577631e-05, + "loss": 6.7017, + "step": 7570 + }, + { + "epoch": 0.010650611127991752, + "grad_norm": 1.2504565715789795, + "learning_rate": 3.1947449768160735e-05, + "loss": 6.7633, + "step": 7580 + }, + { + "epoch": 0.01066466206615533, + "grad_norm": 1.3335305452346802, + "learning_rate": 3.198960236054517e-05, + "loss": 6.6588, + "step": 7590 + }, + { + "epoch": 0.010678713004318907, + "grad_norm": 1.2901426553726196, + "learning_rate": 3.2031754952929604e-05, + "loss": 6.6741, + "step": 7600 + }, + { + "epoch": 0.010692763942482485, + "grad_norm": 1.484226107597351, + "learning_rate": 3.2073907545314035e-05, + "loss": 6.684, + "step": 7610 + }, + { + "epoch": 0.010706814880646062, + "grad_norm": 1.2085685729980469, + "learning_rate": 3.2116060137698466e-05, + "loss": 6.7001, + "step": 7620 + }, + { + "epoch": 0.01072086581880964, + "grad_norm": 1.306835412979126, + "learning_rate": 3.21582127300829e-05, + "loss": 6.7055, + "step": 7630 + }, + { + "epoch": 0.010734916756973217, + "grad_norm": 1.3770405054092407, + "learning_rate": 3.220036532246733e-05, + "loss": 6.7178, + "step": 7640 + }, + { + "epoch": 0.010748967695136795, + "grad_norm": 1.4259955883026123, + "learning_rate": 3.224251791485176e-05, + "loss": 6.6999, + "step": 7650 + }, + { + "epoch": 0.010763018633300371, + "grad_norm": 1.2039424180984497, + "learning_rate": 3.228467050723619e-05, + "loss": 6.6817, + "step": 7660 + }, + { + "epoch": 0.01077706957146395, + "grad_norm": 1.3195371627807617, + "learning_rate": 3.232682309962062e-05, + "loss": 6.6196, + "step": 7670 + }, + { + "epoch": 0.010791120509627527, + "grad_norm": 1.228158950805664, + "learning_rate": 3.236897569200505e-05, + "loss": 6.6338, + "step": 
7680 + }, + { + "epoch": 0.010805171447791105, + "grad_norm": 1.4088062047958374, + "learning_rate": 3.241112828438949e-05, + "loss": 6.6519, + "step": 7690 + }, + { + "epoch": 0.010819222385954683, + "grad_norm": 1.2354357242584229, + "learning_rate": 3.2453280876773916e-05, + "loss": 6.6397, + "step": 7700 + }, + { + "epoch": 0.01083327332411826, + "grad_norm": 1.2311620712280273, + "learning_rate": 3.249543346915835e-05, + "loss": 6.6254, + "step": 7710 + }, + { + "epoch": 0.010847324262281837, + "grad_norm": 1.1504223346710205, + "learning_rate": 3.2537586061542785e-05, + "loss": 6.7407, + "step": 7720 + }, + { + "epoch": 0.010861375200445415, + "grad_norm": 1.3598875999450684, + "learning_rate": 3.2579738653927216e-05, + "loss": 6.7246, + "step": 7730 + }, + { + "epoch": 0.010875426138608993, + "grad_norm": 1.3000996112823486, + "learning_rate": 3.262189124631164e-05, + "loss": 6.7343, + "step": 7740 + }, + { + "epoch": 0.010889477076772569, + "grad_norm": 1.4005444049835205, + "learning_rate": 3.266404383869608e-05, + "loss": 6.6171, + "step": 7750 + }, + { + "epoch": 0.010903528014936147, + "grad_norm": 1.3075186014175415, + "learning_rate": 3.270619643108051e-05, + "loss": 6.7218, + "step": 7760 + }, + { + "epoch": 0.010917578953099725, + "grad_norm": 1.383267879486084, + "learning_rate": 3.274834902346494e-05, + "loss": 6.6246, + "step": 7770 + }, + { + "epoch": 0.010931629891263303, + "grad_norm": 1.3374590873718262, + "learning_rate": 3.279050161584937e-05, + "loss": 6.757, + "step": 7780 + }, + { + "epoch": 0.01094568082942688, + "grad_norm": 1.3575273752212524, + "learning_rate": 3.28326542082338e-05, + "loss": 6.7405, + "step": 7790 + }, + { + "epoch": 0.010959731767590457, + "grad_norm": 1.5748355388641357, + "learning_rate": 3.2874806800618234e-05, + "loss": 6.6514, + "step": 7800 + }, + { + "epoch": 0.010973782705754035, + "grad_norm": 1.6142196655273438, + "learning_rate": 3.2916959393002665e-05, + "loss": 6.5779, + "step": 7810 + }, + { + 
"epoch": 0.010987833643917613, + "grad_norm": 1.1203367710113525, + "learning_rate": 3.29591119853871e-05, + "loss": 6.7072, + "step": 7820 + }, + { + "epoch": 0.01100188458208119, + "grad_norm": 1.2387946844100952, + "learning_rate": 3.300126457777153e-05, + "loss": 6.5173, + "step": 7830 + }, + { + "epoch": 0.011015935520244767, + "grad_norm": 1.3735456466674805, + "learning_rate": 3.304341717015596e-05, + "loss": 6.7404, + "step": 7840 + }, + { + "epoch": 0.011029986458408345, + "grad_norm": 1.399306297302246, + "learning_rate": 3.30855697625404e-05, + "loss": 6.6327, + "step": 7850 + }, + { + "epoch": 0.011044037396571922, + "grad_norm": 1.3363549709320068, + "learning_rate": 3.312772235492483e-05, + "loss": 6.6128, + "step": 7860 + }, + { + "epoch": 0.0110580883347355, + "grad_norm": 1.1382472515106201, + "learning_rate": 3.316987494730925e-05, + "loss": 6.6297, + "step": 7870 + }, + { + "epoch": 0.011072139272899077, + "grad_norm": 1.407041072845459, + "learning_rate": 3.321202753969369e-05, + "loss": 6.5615, + "step": 7880 + }, + { + "epoch": 0.011086190211062654, + "grad_norm": 1.3194184303283691, + "learning_rate": 3.325418013207812e-05, + "loss": 6.7404, + "step": 7890 + }, + { + "epoch": 0.011100241149226232, + "grad_norm": 1.298169732093811, + "learning_rate": 3.329633272446255e-05, + "loss": 6.6713, + "step": 7900 + }, + { + "epoch": 0.01111429208738981, + "grad_norm": 1.251718282699585, + "learning_rate": 3.3338485316846984e-05, + "loss": 6.623, + "step": 7910 + }, + { + "epoch": 0.011128343025553388, + "grad_norm": 1.2488762140274048, + "learning_rate": 3.3380637909231415e-05, + "loss": 6.5778, + "step": 7920 + }, + { + "epoch": 0.011142393963716964, + "grad_norm": 1.156110405921936, + "learning_rate": 3.3422790501615846e-05, + "loss": 6.66, + "step": 7930 + }, + { + "epoch": 0.011156444901880542, + "grad_norm": 1.4375478029251099, + "learning_rate": 3.346494309400028e-05, + "loss": 6.5979, + "step": 7940 + }, + { + "epoch": 0.01117049584004412, + 
"grad_norm": 1.3894151449203491, + "learning_rate": 3.350709568638471e-05, + "loss": 6.6524, + "step": 7950 + }, + { + "epoch": 0.011184546778207698, + "grad_norm": 1.1708953380584717, + "learning_rate": 3.354924827876914e-05, + "loss": 6.6713, + "step": 7960 + }, + { + "epoch": 0.011198597716371274, + "grad_norm": 1.281219482421875, + "learning_rate": 3.359140087115357e-05, + "loss": 6.6162, + "step": 7970 + }, + { + "epoch": 0.011212648654534852, + "grad_norm": 1.3841021060943604, + "learning_rate": 3.363355346353801e-05, + "loss": 6.5542, + "step": 7980 + }, + { + "epoch": 0.01122669959269843, + "grad_norm": 1.1785082817077637, + "learning_rate": 3.367570605592243e-05, + "loss": 6.5345, + "step": 7990 + }, + { + "epoch": 0.011240750530862008, + "grad_norm": 1.361771821975708, + "learning_rate": 3.3717858648306864e-05, + "loss": 6.6326, + "step": 8000 + }, + { + "epoch": 0.011254801469025586, + "grad_norm": 1.6702193021774292, + "learning_rate": 3.37600112406913e-05, + "loss": 6.6394, + "step": 8010 + }, + { + "epoch": 0.011268852407189162, + "grad_norm": 1.1941388845443726, + "learning_rate": 3.380216383307573e-05, + "loss": 6.6372, + "step": 8020 + }, + { + "epoch": 0.01128290334535274, + "grad_norm": 1.274591326713562, + "learning_rate": 3.3844316425460164e-05, + "loss": 6.6055, + "step": 8030 + }, + { + "epoch": 0.011296954283516318, + "grad_norm": 1.3808879852294922, + "learning_rate": 3.3886469017844596e-05, + "loss": 6.5508, + "step": 8040 + }, + { + "epoch": 0.011311005221679896, + "grad_norm": 1.2351412773132324, + "learning_rate": 3.392862161022903e-05, + "loss": 6.6163, + "step": 8050 + }, + { + "epoch": 0.011325056159843472, + "grad_norm": 1.38816237449646, + "learning_rate": 3.397077420261346e-05, + "loss": 6.6613, + "step": 8060 + }, + { + "epoch": 0.01133910709800705, + "grad_norm": 1.3823046684265137, + "learning_rate": 3.401292679499789e-05, + "loss": 6.6828, + "step": 8070 + }, + { + "epoch": 0.011353158036170628, + "grad_norm": 
1.2434042692184448, + "learning_rate": 3.405507938738232e-05, + "loss": 6.5608, + "step": 8080 + }, + { + "epoch": 0.011367208974334206, + "grad_norm": 1.3518249988555908, + "learning_rate": 3.409723197976675e-05, + "loss": 6.6077, + "step": 8090 + }, + { + "epoch": 0.011381259912497783, + "grad_norm": 1.5739810466766357, + "learning_rate": 3.413938457215118e-05, + "loss": 6.7089, + "step": 8100 + }, + { + "epoch": 0.01139531085066136, + "grad_norm": 1.2564321756362915, + "learning_rate": 3.418153716453562e-05, + "loss": 6.6499, + "step": 8110 + }, + { + "epoch": 0.011409361788824937, + "grad_norm": 1.2737637758255005, + "learning_rate": 3.4223689756920045e-05, + "loss": 6.5284, + "step": 8120 + }, + { + "epoch": 0.011423412726988515, + "grad_norm": 1.3004697561264038, + "learning_rate": 3.4265842349304476e-05, + "loss": 6.6387, + "step": 8130 + }, + { + "epoch": 0.011437463665152093, + "grad_norm": 1.3879871368408203, + "learning_rate": 3.4307994941688914e-05, + "loss": 6.5927, + "step": 8140 + }, + { + "epoch": 0.01145151460331567, + "grad_norm": 1.338653802871704, + "learning_rate": 3.4350147534073345e-05, + "loss": 6.5564, + "step": 8150 + }, + { + "epoch": 0.011465565541479247, + "grad_norm": 1.4665417671203613, + "learning_rate": 3.439230012645777e-05, + "loss": 6.5217, + "step": 8160 + }, + { + "epoch": 0.011479616479642825, + "grad_norm": 1.3037580251693726, + "learning_rate": 3.443445271884221e-05, + "loss": 6.6446, + "step": 8170 + }, + { + "epoch": 0.011493667417806403, + "grad_norm": 1.2073763608932495, + "learning_rate": 3.447660531122664e-05, + "loss": 6.646, + "step": 8180 + }, + { + "epoch": 0.011507718355969981, + "grad_norm": 1.291408896446228, + "learning_rate": 3.451875790361107e-05, + "loss": 6.672, + "step": 8190 + }, + { + "epoch": 0.011521769294133557, + "grad_norm": 1.1927132606506348, + "learning_rate": 3.45609104959955e-05, + "loss": 6.5918, + "step": 8200 + }, + { + "epoch": 0.011535820232297135, + "grad_norm": 1.3004413843154907, + 
"learning_rate": 3.460306308837993e-05, + "loss": 6.4986, + "step": 8210 + }, + { + "epoch": 0.011549871170460713, + "grad_norm": 1.3016691207885742, + "learning_rate": 3.4645215680764363e-05, + "loss": 6.5268, + "step": 8220 + }, + { + "epoch": 0.011563922108624291, + "grad_norm": 1.3215467929840088, + "learning_rate": 3.4687368273148795e-05, + "loss": 6.5952, + "step": 8230 + }, + { + "epoch": 0.011577973046787867, + "grad_norm": 1.1828099489212036, + "learning_rate": 3.4729520865533226e-05, + "loss": 6.6606, + "step": 8240 + }, + { + "epoch": 0.011592023984951445, + "grad_norm": 1.3190696239471436, + "learning_rate": 3.477167345791766e-05, + "loss": 6.5955, + "step": 8250 + }, + { + "epoch": 0.011606074923115023, + "grad_norm": 1.2753030061721802, + "learning_rate": 3.481382605030209e-05, + "loss": 6.6255, + "step": 8260 + }, + { + "epoch": 0.0116201258612786, + "grad_norm": 1.3981962203979492, + "learning_rate": 3.4855978642686526e-05, + "loss": 6.4703, + "step": 8270 + }, + { + "epoch": 0.011634176799442177, + "grad_norm": 1.2518624067306519, + "learning_rate": 3.489813123507096e-05, + "loss": 6.5757, + "step": 8280 + }, + { + "epoch": 0.011648227737605755, + "grad_norm": 1.1934725046157837, + "learning_rate": 3.494028382745538e-05, + "loss": 6.5563, + "step": 8290 + }, + { + "epoch": 0.011662278675769333, + "grad_norm": 1.5110435485839844, + "learning_rate": 3.498243641983982e-05, + "loss": 6.565, + "step": 8300 + }, + { + "epoch": 0.01167632961393291, + "grad_norm": 1.2714707851409912, + "learning_rate": 3.502458901222425e-05, + "loss": 6.6103, + "step": 8310 + }, + { + "epoch": 0.011690380552096489, + "grad_norm": 1.3573921918869019, + "learning_rate": 3.506674160460868e-05, + "loss": 6.5801, + "step": 8320 + }, + { + "epoch": 0.011704431490260065, + "grad_norm": 1.2519656419754028, + "learning_rate": 3.510889419699311e-05, + "loss": 6.503, + "step": 8330 + }, + { + "epoch": 0.011718482428423643, + "grad_norm": 1.3184814453125, + "learning_rate": 
3.5151046789377544e-05, + "loss": 6.6102, + "step": 8340 + }, + { + "epoch": 0.01173253336658722, + "grad_norm": 1.2902504205703735, + "learning_rate": 3.5193199381761975e-05, + "loss": 6.4634, + "step": 8350 + }, + { + "epoch": 0.011746584304750798, + "grad_norm": 1.3389567136764526, + "learning_rate": 3.5235351974146407e-05, + "loss": 6.5529, + "step": 8360 + }, + { + "epoch": 0.011760635242914375, + "grad_norm": 1.684384822845459, + "learning_rate": 3.527750456653084e-05, + "loss": 6.4738, + "step": 8370 + }, + { + "epoch": 0.011774686181077952, + "grad_norm": 1.1865743398666382, + "learning_rate": 3.531965715891527e-05, + "loss": 6.4601, + "step": 8380 + }, + { + "epoch": 0.01178873711924153, + "grad_norm": 1.2686684131622314, + "learning_rate": 3.53618097512997e-05, + "loss": 6.5644, + "step": 8390 + }, + { + "epoch": 0.011802788057405108, + "grad_norm": 1.3220882415771484, + "learning_rate": 3.540396234368414e-05, + "loss": 6.5406, + "step": 8400 + }, + { + "epoch": 0.011816838995568686, + "grad_norm": 1.302966833114624, + "learning_rate": 3.544611493606856e-05, + "loss": 6.5661, + "step": 8410 + }, + { + "epoch": 0.011830889933732262, + "grad_norm": 1.270646572113037, + "learning_rate": 3.5488267528452994e-05, + "loss": 6.4866, + "step": 8420 + }, + { + "epoch": 0.01184494087189584, + "grad_norm": 1.3006125688552856, + "learning_rate": 3.553042012083743e-05, + "loss": 6.5404, + "step": 8430 + }, + { + "epoch": 0.011858991810059418, + "grad_norm": 1.5760924816131592, + "learning_rate": 3.557257271322186e-05, + "loss": 6.6259, + "step": 8440 + }, + { + "epoch": 0.011873042748222996, + "grad_norm": 1.215240240097046, + "learning_rate": 3.561472530560629e-05, + "loss": 6.4761, + "step": 8450 + }, + { + "epoch": 0.011887093686386572, + "grad_norm": 1.2556153535842896, + "learning_rate": 3.5656877897990725e-05, + "loss": 6.53, + "step": 8460 + }, + { + "epoch": 0.01190114462455015, + "grad_norm": 1.2683357000350952, + "learning_rate": 3.5699030490375156e-05, + 
"loss": 6.4377, + "step": 8470 + }, + { + "epoch": 0.011915195562713728, + "grad_norm": 1.187314748764038, + "learning_rate": 3.574118308275959e-05, + "loss": 6.5894, + "step": 8480 + }, + { + "epoch": 0.011929246500877306, + "grad_norm": 1.3034703731536865, + "learning_rate": 3.578333567514402e-05, + "loss": 6.5753, + "step": 8490 + }, + { + "epoch": 0.011943297439040884, + "grad_norm": 1.4902338981628418, + "learning_rate": 3.582548826752845e-05, + "loss": 6.5297, + "step": 8500 + }, + { + "epoch": 0.01195734837720446, + "grad_norm": 1.3260395526885986, + "learning_rate": 3.586764085991288e-05, + "loss": 6.6046, + "step": 8510 + }, + { + "epoch": 0.011971399315368038, + "grad_norm": 1.3407644033432007, + "learning_rate": 3.590979345229731e-05, + "loss": 6.4248, + "step": 8520 + }, + { + "epoch": 0.011985450253531616, + "grad_norm": 1.1548830270767212, + "learning_rate": 3.595194604468175e-05, + "loss": 6.5296, + "step": 8530 + }, + { + "epoch": 0.011999501191695194, + "grad_norm": 1.3393534421920776, + "learning_rate": 3.5994098637066174e-05, + "loss": 6.5889, + "step": 8540 + }, + { + "epoch": 0.01201355212985877, + "grad_norm": 1.3542684316635132, + "learning_rate": 3.6036251229450605e-05, + "loss": 6.5508, + "step": 8550 + }, + { + "epoch": 0.012027603068022348, + "grad_norm": 1.5744121074676514, + "learning_rate": 3.6078403821835043e-05, + "loss": 6.5049, + "step": 8560 + }, + { + "epoch": 0.012041654006185926, + "grad_norm": 1.1964699029922485, + "learning_rate": 3.6120556414219475e-05, + "loss": 6.538, + "step": 8570 + }, + { + "epoch": 0.012055704944349504, + "grad_norm": 1.3729437589645386, + "learning_rate": 3.61627090066039e-05, + "loss": 6.5814, + "step": 8580 + }, + { + "epoch": 0.01206975588251308, + "grad_norm": 1.190537691116333, + "learning_rate": 3.620486159898834e-05, + "loss": 6.503, + "step": 8590 + }, + { + "epoch": 0.012083806820676658, + "grad_norm": 1.2392871379852295, + "learning_rate": 3.624701419137277e-05, + "loss": 6.4875, + "step": 
8600 + }, + { + "epoch": 0.012097857758840235, + "grad_norm": 1.3132319450378418, + "learning_rate": 3.62891667837572e-05, + "loss": 6.4879, + "step": 8610 + }, + { + "epoch": 0.012111908697003813, + "grad_norm": 1.3270124197006226, + "learning_rate": 3.633131937614163e-05, + "loss": 6.415, + "step": 8620 + }, + { + "epoch": 0.012125959635167391, + "grad_norm": 1.2737349271774292, + "learning_rate": 3.637347196852606e-05, + "loss": 6.5448, + "step": 8630 + }, + { + "epoch": 0.012140010573330967, + "grad_norm": 1.1706392765045166, + "learning_rate": 3.641562456091049e-05, + "loss": 6.4907, + "step": 8640 + }, + { + "epoch": 0.012154061511494545, + "grad_norm": 1.3053278923034668, + "learning_rate": 3.6457777153294924e-05, + "loss": 6.6704, + "step": 8650 + }, + { + "epoch": 0.012168112449658123, + "grad_norm": 1.3305950164794922, + "learning_rate": 3.6499929745679355e-05, + "loss": 6.4386, + "step": 8660 + }, + { + "epoch": 0.012182163387821701, + "grad_norm": 1.307633638381958, + "learning_rate": 3.6542082338063786e-05, + "loss": 6.4467, + "step": 8670 + }, + { + "epoch": 0.012196214325985277, + "grad_norm": 1.3092464208602905, + "learning_rate": 3.658423493044822e-05, + "loss": 6.4597, + "step": 8680 + }, + { + "epoch": 0.012210265264148855, + "grad_norm": 1.368456244468689, + "learning_rate": 3.6626387522832655e-05, + "loss": 6.5467, + "step": 8690 + }, + { + "epoch": 0.012224316202312433, + "grad_norm": 1.2031217813491821, + "learning_rate": 3.666854011521708e-05, + "loss": 6.4379, + "step": 8700 + }, + { + "epoch": 0.012238367140476011, + "grad_norm": 1.300486445426941, + "learning_rate": 3.671069270760151e-05, + "loss": 6.4787, + "step": 8710 + }, + { + "epoch": 0.012252418078639589, + "grad_norm": 1.6316710710525513, + "learning_rate": 3.675284529998595e-05, + "loss": 6.4383, + "step": 8720 + }, + { + "epoch": 0.012266469016803165, + "grad_norm": 1.327189564704895, + "learning_rate": 3.679499789237038e-05, + "loss": 6.4901, + "step": 8730 + }, + { + "epoch": 
0.012280519954966743, + "grad_norm": 1.4074703454971313, + "learning_rate": 3.6837150484754804e-05, + "loss": 6.5223, + "step": 8740 + }, + { + "epoch": 0.012294570893130321, + "grad_norm": 1.3575947284698486, + "learning_rate": 3.687930307713924e-05, + "loss": 6.5039, + "step": 8750 + }, + { + "epoch": 0.012308621831293899, + "grad_norm": 1.2397525310516357, + "learning_rate": 3.6921455669523674e-05, + "loss": 6.3608, + "step": 8760 + }, + { + "epoch": 0.012322672769457475, + "grad_norm": 1.3196090459823608, + "learning_rate": 3.6963608261908105e-05, + "loss": 6.4237, + "step": 8770 + }, + { + "epoch": 0.012336723707621053, + "grad_norm": 1.4720513820648193, + "learning_rate": 3.7005760854292536e-05, + "loss": 6.4594, + "step": 8780 + }, + { + "epoch": 0.01235077464578463, + "grad_norm": 1.3891912698745728, + "learning_rate": 3.704791344667697e-05, + "loss": 6.4615, + "step": 8790 + }, + { + "epoch": 0.012364825583948209, + "grad_norm": 1.2116029262542725, + "learning_rate": 3.70900660390614e-05, + "loss": 6.4994, + "step": 8800 + }, + { + "epoch": 0.012378876522111787, + "grad_norm": 1.3657456636428833, + "learning_rate": 3.713221863144583e-05, + "loss": 6.5264, + "step": 8810 + }, + { + "epoch": 0.012392927460275363, + "grad_norm": 1.2680433988571167, + "learning_rate": 3.717437122383027e-05, + "loss": 6.4179, + "step": 8820 + }, + { + "epoch": 0.01240697839843894, + "grad_norm": 1.2071449756622314, + "learning_rate": 3.721652381621469e-05, + "loss": 6.6144, + "step": 8830 + }, + { + "epoch": 0.012421029336602519, + "grad_norm": 1.2629899978637695, + "learning_rate": 3.725867640859912e-05, + "loss": 6.5407, + "step": 8840 + }, + { + "epoch": 0.012435080274766096, + "grad_norm": 1.296008586883545, + "learning_rate": 3.730082900098356e-05, + "loss": 6.4203, + "step": 8850 + }, + { + "epoch": 0.012449131212929673, + "grad_norm": 1.4418439865112305, + "learning_rate": 3.734298159336799e-05, + "loss": 6.4575, + "step": 8860 + }, + { + "epoch": 0.01246318215109325, + 
"grad_norm": 1.3441669940948486, + "learning_rate": 3.7385134185752416e-05, + "loss": 6.4932, + "step": 8870 + }, + { + "epoch": 0.012477233089256828, + "grad_norm": 1.339735507965088, + "learning_rate": 3.7427286778136854e-05, + "loss": 6.5467, + "step": 8880 + }, + { + "epoch": 0.012491284027420406, + "grad_norm": 1.3307244777679443, + "learning_rate": 3.7469439370521285e-05, + "loss": 6.5179, + "step": 8890 + }, + { + "epoch": 0.012505334965583984, + "grad_norm": 1.3968795537948608, + "learning_rate": 3.751159196290571e-05, + "loss": 6.5239, + "step": 8900 + }, + { + "epoch": 0.01251938590374756, + "grad_norm": 1.3690930604934692, + "learning_rate": 3.755374455529015e-05, + "loss": 6.585, + "step": 8910 + }, + { + "epoch": 0.012533436841911138, + "grad_norm": 1.4065380096435547, + "learning_rate": 3.759589714767458e-05, + "loss": 6.5333, + "step": 8920 + }, + { + "epoch": 0.012547487780074716, + "grad_norm": 1.351392388343811, + "learning_rate": 3.7638049740059e-05, + "loss": 6.4869, + "step": 8930 + }, + { + "epoch": 0.012561538718238294, + "grad_norm": 1.3816670179367065, + "learning_rate": 3.768020233244344e-05, + "loss": 6.4295, + "step": 8940 + }, + { + "epoch": 0.01257558965640187, + "grad_norm": 1.369028091430664, + "learning_rate": 3.772235492482787e-05, + "loss": 6.504, + "step": 8950 + }, + { + "epoch": 0.012589640594565448, + "grad_norm": 1.4562000036239624, + "learning_rate": 3.776450751721231e-05, + "loss": 6.4471, + "step": 8960 + }, + { + "epoch": 0.012603691532729026, + "grad_norm": 1.8173776865005493, + "learning_rate": 3.7806660109596735e-05, + "loss": 6.5009, + "step": 8970 + }, + { + "epoch": 0.012617742470892604, + "grad_norm": 1.3316607475280762, + "learning_rate": 3.7848812701981166e-05, + "loss": 6.3643, + "step": 8980 + }, + { + "epoch": 0.01263179340905618, + "grad_norm": 1.2292101383209229, + "learning_rate": 3.7890965294365604e-05, + "loss": 6.3989, + "step": 8990 + }, + { + "epoch": 0.012645844347219758, + "grad_norm": 
1.34901762008667, + "learning_rate": 3.793311788675003e-05, + "loss": 6.4566, + "step": 9000 + }, + { + "epoch": 0.012659895285383336, + "grad_norm": 1.2581123113632202, + "learning_rate": 3.7975270479134466e-05, + "loss": 6.4794, + "step": 9010 + }, + { + "epoch": 0.012673946223546914, + "grad_norm": 1.2883479595184326, + "learning_rate": 3.80174230715189e-05, + "loss": 6.4325, + "step": 9020 + }, + { + "epoch": 0.012687997161710492, + "grad_norm": 1.5478203296661377, + "learning_rate": 3.805957566390332e-05, + "loss": 6.3938, + "step": 9030 + }, + { + "epoch": 0.012702048099874068, + "grad_norm": 1.318039059638977, + "learning_rate": 3.810172825628776e-05, + "loss": 6.5348, + "step": 9040 + }, + { + "epoch": 0.012716099038037646, + "grad_norm": 1.3280646800994873, + "learning_rate": 3.814388084867219e-05, + "loss": 6.5114, + "step": 9050 + }, + { + "epoch": 0.012730149976201224, + "grad_norm": 1.2793512344360352, + "learning_rate": 3.8186033441056615e-05, + "loss": 6.483, + "step": 9060 + }, + { + "epoch": 0.012744200914364802, + "grad_norm": 1.3596251010894775, + "learning_rate": 3.822818603344105e-05, + "loss": 6.449, + "step": 9070 + }, + { + "epoch": 0.012758251852528378, + "grad_norm": 1.2819275856018066, + "learning_rate": 3.8270338625825484e-05, + "loss": 6.554, + "step": 9080 + }, + { + "epoch": 0.012772302790691956, + "grad_norm": 1.3615672588348389, + "learning_rate": 3.831249121820992e-05, + "loss": 6.4931, + "step": 9090 + }, + { + "epoch": 0.012786353728855534, + "grad_norm": 1.944034218788147, + "learning_rate": 3.835464381059435e-05, + "loss": 6.4004, + "step": 9100 + }, + { + "epoch": 0.012800404667019111, + "grad_norm": 1.4013776779174805, + "learning_rate": 3.839679640297878e-05, + "loss": 6.3946, + "step": 9110 + }, + { + "epoch": 0.01281445560518269, + "grad_norm": 1.3120478391647339, + "learning_rate": 3.8438948995363216e-05, + "loss": 6.3586, + "step": 9120 + }, + { + "epoch": 0.012828506543346265, + "grad_norm": 1.2097243070602417, + 
"learning_rate": 3.848110158774764e-05, + "loss": 6.3219, + "step": 9130 + }, + { + "epoch": 0.012842557481509843, + "grad_norm": 1.2938241958618164, + "learning_rate": 3.852325418013207e-05, + "loss": 6.5543, + "step": 9140 + }, + { + "epoch": 0.012856608419673421, + "grad_norm": 1.4525402784347534, + "learning_rate": 3.856540677251651e-05, + "loss": 6.4622, + "step": 9150 + }, + { + "epoch": 0.012870659357837, + "grad_norm": 1.2755237817764282, + "learning_rate": 3.8607559364900934e-05, + "loss": 6.3855, + "step": 9160 + }, + { + "epoch": 0.012884710296000575, + "grad_norm": 1.2028285264968872, + "learning_rate": 3.864971195728537e-05, + "loss": 6.4834, + "step": 9170 + }, + { + "epoch": 0.012898761234164153, + "grad_norm": 1.3781890869140625, + "learning_rate": 3.86918645496698e-05, + "loss": 6.4325, + "step": 9180 + }, + { + "epoch": 0.012912812172327731, + "grad_norm": 1.3861724138259888, + "learning_rate": 3.873401714205423e-05, + "loss": 6.3593, + "step": 9190 + }, + { + "epoch": 0.012926863110491309, + "grad_norm": 1.4471116065979004, + "learning_rate": 3.8776169734438665e-05, + "loss": 6.4511, + "step": 9200 + }, + { + "epoch": 0.012940914048654887, + "grad_norm": 1.250730037689209, + "learning_rate": 3.8818322326823096e-05, + "loss": 6.4343, + "step": 9210 + }, + { + "epoch": 0.012954964986818463, + "grad_norm": 1.2801191806793213, + "learning_rate": 3.886047491920752e-05, + "loss": 6.3752, + "step": 9220 + }, + { + "epoch": 0.012969015924982041, + "grad_norm": 1.210339069366455, + "learning_rate": 3.890262751159196e-05, + "loss": 6.4224, + "step": 9230 + }, + { + "epoch": 0.012983066863145619, + "grad_norm": 1.3136581182479858, + "learning_rate": 3.894478010397639e-05, + "loss": 6.3563, + "step": 9240 + }, + { + "epoch": 0.012997117801309197, + "grad_norm": 1.5053766965866089, + "learning_rate": 3.898693269636083e-05, + "loss": 6.3402, + "step": 9250 + }, + { + "epoch": 0.013011168739472773, + "grad_norm": 1.661244511604309, + "learning_rate": 
3.902908528874525e-05, + "loss": 6.3517, + "step": 9260 + }, + { + "epoch": 0.013025219677636351, + "grad_norm": 1.3023381233215332, + "learning_rate": 3.907123788112968e-05, + "loss": 6.5525, + "step": 9270 + }, + { + "epoch": 0.013039270615799929, + "grad_norm": 1.444373369216919, + "learning_rate": 3.911339047351412e-05, + "loss": 6.5172, + "step": 9280 + }, + { + "epoch": 0.013053321553963507, + "grad_norm": 1.375860333442688, + "learning_rate": 3.9155543065898546e-05, + "loss": 6.4256, + "step": 9290 + }, + { + "epoch": 0.013067372492127083, + "grad_norm": 1.346632719039917, + "learning_rate": 3.9197695658282984e-05, + "loss": 6.3385, + "step": 9300 + }, + { + "epoch": 0.01308142343029066, + "grad_norm": 1.2784466743469238, + "learning_rate": 3.9239848250667415e-05, + "loss": 6.3262, + "step": 9310 + }, + { + "epoch": 0.013095474368454239, + "grad_norm": 1.4308885335922241, + "learning_rate": 3.928200084305184e-05, + "loss": 6.3977, + "step": 9320 + }, + { + "epoch": 0.013109525306617817, + "grad_norm": 1.379128336906433, + "learning_rate": 3.932415343543628e-05, + "loss": 6.3745, + "step": 9330 + }, + { + "epoch": 0.013123576244781394, + "grad_norm": 1.3541892766952515, + "learning_rate": 3.936630602782071e-05, + "loss": 6.5073, + "step": 9340 + }, + { + "epoch": 0.01313762718294497, + "grad_norm": 1.5439261198043823, + "learning_rate": 3.940845862020513e-05, + "loss": 6.4435, + "step": 9350 + }, + { + "epoch": 0.013151678121108549, + "grad_norm": 1.257303237915039, + "learning_rate": 3.945061121258957e-05, + "loss": 6.3503, + "step": 9360 + }, + { + "epoch": 0.013165729059272126, + "grad_norm": 1.358400583267212, + "learning_rate": 3.9492763804974e-05, + "loss": 6.3759, + "step": 9370 + }, + { + "epoch": 0.013179779997435704, + "grad_norm": 1.232875108718872, + "learning_rate": 3.953491639735844e-05, + "loss": 6.3632, + "step": 9380 + }, + { + "epoch": 0.01319383093559928, + "grad_norm": 1.3150238990783691, + "learning_rate": 3.9577068989742864e-05, + 
"loss": 6.4415, + "step": 9390 + }, + { + "epoch": 0.013207881873762858, + "grad_norm": 1.2941501140594482, + "learning_rate": 3.9619221582127295e-05, + "loss": 6.5106, + "step": 9400 + }, + { + "epoch": 0.013221932811926436, + "grad_norm": 1.3233859539031982, + "learning_rate": 3.966137417451173e-05, + "loss": 6.4211, + "step": 9410 + }, + { + "epoch": 0.013235983750090014, + "grad_norm": 1.309694528579712, + "learning_rate": 3.970352676689616e-05, + "loss": 6.4375, + "step": 9420 + }, + { + "epoch": 0.013250034688253592, + "grad_norm": 1.3564949035644531, + "learning_rate": 3.974567935928059e-05, + "loss": 6.3014, + "step": 9430 + }, + { + "epoch": 0.013264085626417168, + "grad_norm": 1.276855230331421, + "learning_rate": 3.978783195166503e-05, + "loss": 6.3429, + "step": 9440 + }, + { + "epoch": 0.013278136564580746, + "grad_norm": 1.356063961982727, + "learning_rate": 3.982998454404945e-05, + "loss": 6.4402, + "step": 9450 + }, + { + "epoch": 0.013292187502744324, + "grad_norm": 1.242181658744812, + "learning_rate": 3.987213713643389e-05, + "loss": 6.4431, + "step": 9460 + }, + { + "epoch": 0.013306238440907902, + "grad_norm": 1.3604857921600342, + "learning_rate": 3.991428972881832e-05, + "loss": 6.3414, + "step": 9470 + }, + { + "epoch": 0.013320289379071478, + "grad_norm": 1.2341139316558838, + "learning_rate": 3.9956442321202745e-05, + "loss": 6.4268, + "step": 9480 + }, + { + "epoch": 0.013334340317235056, + "grad_norm": 1.4619051218032837, + "learning_rate": 3.999859491358718e-05, + "loss": 6.4312, + "step": 9490 + }, + { + "epoch": 0.013348391255398634, + "grad_norm": 1.2710881233215332, + "learning_rate": 4.0040747505971614e-05, + "loss": 6.3772, + "step": 9500 + }, + { + "epoch": 0.013362442193562212, + "grad_norm": 1.3365315198898315, + "learning_rate": 4.008290009835605e-05, + "loss": 6.3996, + "step": 9510 + }, + { + "epoch": 0.01337649313172579, + "grad_norm": 1.246775507926941, + "learning_rate": 4.0125052690740476e-05, + "loss": 6.4799, + "step": 
9520 + }, + { + "epoch": 0.013390544069889366, + "grad_norm": 1.3018437623977661, + "learning_rate": 4.016720528312491e-05, + "loss": 6.3685, + "step": 9530 + }, + { + "epoch": 0.013404595008052944, + "grad_norm": 1.413640022277832, + "learning_rate": 4.0209357875509345e-05, + "loss": 6.3522, + "step": 9540 + }, + { + "epoch": 0.013418645946216522, + "grad_norm": 1.3587901592254639, + "learning_rate": 4.025151046789377e-05, + "loss": 6.3307, + "step": 9550 + }, + { + "epoch": 0.0134326968843801, + "grad_norm": 1.3375356197357178, + "learning_rate": 4.02936630602782e-05, + "loss": 6.423, + "step": 9560 + }, + { + "epoch": 0.013446747822543676, + "grad_norm": 1.5337927341461182, + "learning_rate": 4.033581565266264e-05, + "loss": 6.3995, + "step": 9570 + }, + { + "epoch": 0.013460798760707254, + "grad_norm": 1.299627661705017, + "learning_rate": 4.037796824504706e-05, + "loss": 6.3389, + "step": 9580 + }, + { + "epoch": 0.013474849698870832, + "grad_norm": 1.1940332651138306, + "learning_rate": 4.04201208374315e-05, + "loss": 6.3567, + "step": 9590 + }, + { + "epoch": 0.01348890063703441, + "grad_norm": 1.4569729566574097, + "learning_rate": 4.046227342981593e-05, + "loss": 6.3624, + "step": 9600 + }, + { + "epoch": 0.013502951575197987, + "grad_norm": 1.3093900680541992, + "learning_rate": 4.0504426022200357e-05, + "loss": 6.4537, + "step": 9610 + }, + { + "epoch": 0.013517002513361564, + "grad_norm": 1.5371428728103638, + "learning_rate": 4.0546578614584795e-05, + "loss": 6.414, + "step": 9620 + }, + { + "epoch": 0.013531053451525141, + "grad_norm": 1.2927149534225464, + "learning_rate": 4.0588731206969226e-05, + "loss": 6.3938, + "step": 9630 + }, + { + "epoch": 0.01354510438968872, + "grad_norm": 1.2334342002868652, + "learning_rate": 4.063088379935365e-05, + "loss": 6.36, + "step": 9640 + }, + { + "epoch": 0.013559155327852297, + "grad_norm": 1.3164347410202026, + "learning_rate": 4.067303639173809e-05, + "loss": 6.324, + "step": 9650 + }, + { + "epoch": 
0.013573206266015873, + "grad_norm": 1.3269118070602417, + "learning_rate": 4.071518898412252e-05, + "loss": 6.3407, + "step": 9660 + }, + { + "epoch": 0.013587257204179451, + "grad_norm": 1.3465888500213623, + "learning_rate": 4.075734157650696e-05, + "loss": 6.3591, + "step": 9670 + }, + { + "epoch": 0.01360130814234303, + "grad_norm": 1.2967374324798584, + "learning_rate": 4.079949416889138e-05, + "loss": 6.3319, + "step": 9680 + }, + { + "epoch": 0.013615359080506607, + "grad_norm": 1.2995193004608154, + "learning_rate": 4.084164676127581e-05, + "loss": 6.3171, + "step": 9690 + }, + { + "epoch": 0.013629410018670183, + "grad_norm": 1.4583133459091187, + "learning_rate": 4.088379935366025e-05, + "loss": 6.296, + "step": 9700 + }, + { + "epoch": 0.013643460956833761, + "grad_norm": 1.3236838579177856, + "learning_rate": 4.0925951946044675e-05, + "loss": 6.4108, + "step": 9710 + }, + { + "epoch": 0.013657511894997339, + "grad_norm": 1.3856712579727173, + "learning_rate": 4.0968104538429106e-05, + "loss": 6.3383, + "step": 9720 + }, + { + "epoch": 0.013671562833160917, + "grad_norm": 1.4215503931045532, + "learning_rate": 4.1010257130813544e-05, + "loss": 6.3143, + "step": 9730 + }, + { + "epoch": 0.013685613771324495, + "grad_norm": 1.3027349710464478, + "learning_rate": 4.105240972319797e-05, + "loss": 6.3573, + "step": 9740 + }, + { + "epoch": 0.013699664709488071, + "grad_norm": 1.5820553302764893, + "learning_rate": 4.1094562315582406e-05, + "loss": 6.3765, + "step": 9750 + }, + { + "epoch": 0.013713715647651649, + "grad_norm": 1.3663697242736816, + "learning_rate": 4.113671490796684e-05, + "loss": 6.3362, + "step": 9760 + }, + { + "epoch": 0.013727766585815227, + "grad_norm": 1.3366793394088745, + "learning_rate": 4.117886750035126e-05, + "loss": 6.3561, + "step": 9770 + }, + { + "epoch": 0.013741817523978805, + "grad_norm": 1.2957160472869873, + "learning_rate": 4.12210200927357e-05, + "loss": 6.402, + "step": 9780 + }, + { + "epoch": 0.013755868462142381, + 
"grad_norm": 1.2606065273284912, + "learning_rate": 4.126317268512013e-05, + "loss": 6.3366, + "step": 9790 + }, + { + "epoch": 0.013769919400305959, + "grad_norm": 1.3806688785552979, + "learning_rate": 4.130532527750457e-05, + "loss": 6.3261, + "step": 9800 + }, + { + "epoch": 0.013783970338469537, + "grad_norm": 1.4678658246994019, + "learning_rate": 4.1347477869888993e-05, + "loss": 6.3808, + "step": 9810 + }, + { + "epoch": 0.013798021276633115, + "grad_norm": 1.2760154008865356, + "learning_rate": 4.1389630462273425e-05, + "loss": 6.3305, + "step": 9820 + }, + { + "epoch": 0.013812072214796692, + "grad_norm": 1.4611419439315796, + "learning_rate": 4.143178305465786e-05, + "loss": 6.2193, + "step": 9830 + }, + { + "epoch": 0.013826123152960269, + "grad_norm": 1.3268877267837524, + "learning_rate": 4.147393564704229e-05, + "loss": 6.3524, + "step": 9840 + }, + { + "epoch": 0.013840174091123847, + "grad_norm": 1.3382116556167603, + "learning_rate": 4.151608823942672e-05, + "loss": 6.4381, + "step": 9850 + }, + { + "epoch": 0.013854225029287424, + "grad_norm": 1.4644103050231934, + "learning_rate": 4.1558240831811156e-05, + "loss": 6.2919, + "step": 9860 + }, + { + "epoch": 0.013868275967451002, + "grad_norm": 1.427890658378601, + "learning_rate": 4.160039342419558e-05, + "loss": 6.3372, + "step": 9870 + }, + { + "epoch": 0.013882326905614579, + "grad_norm": 1.189791202545166, + "learning_rate": 4.164254601658002e-05, + "loss": 6.3238, + "step": 9880 + }, + { + "epoch": 0.013896377843778156, + "grad_norm": 1.2701431512832642, + "learning_rate": 4.168469860896445e-05, + "loss": 6.3215, + "step": 9890 + }, + { + "epoch": 0.013910428781941734, + "grad_norm": 1.2288933992385864, + "learning_rate": 4.1726851201348874e-05, + "loss": 6.2862, + "step": 9900 + }, + { + "epoch": 0.013924479720105312, + "grad_norm": 1.2237406969070435, + "learning_rate": 4.176900379373331e-05, + "loss": 6.47, + "step": 9910 + }, + { + "epoch": 0.01393853065826889, + "grad_norm": 
1.292963981628418, + "learning_rate": 4.181115638611774e-05, + "loss": 6.4382, + "step": 9920 + }, + { + "epoch": 0.013952581596432466, + "grad_norm": 1.2853929996490479, + "learning_rate": 4.185330897850217e-05, + "loss": 6.3467, + "step": 9930 + }, + { + "epoch": 0.013966632534596044, + "grad_norm": 1.2729337215423584, + "learning_rate": 4.1895461570886605e-05, + "loss": 6.3551, + "step": 9940 + }, + { + "epoch": 0.013980683472759622, + "grad_norm": 1.2991608381271362, + "learning_rate": 4.1937614163271037e-05, + "loss": 6.3629, + "step": 9950 + }, + { + "epoch": 0.0139947344109232, + "grad_norm": 1.286419153213501, + "learning_rate": 4.1979766755655475e-05, + "loss": 6.4067, + "step": 9960 + }, + { + "epoch": 0.014008785349086776, + "grad_norm": 1.2718861103057861, + "learning_rate": 4.20219193480399e-05, + "loss": 6.3202, + "step": 9970 + }, + { + "epoch": 0.014022836287250354, + "grad_norm": 1.3436089754104614, + "learning_rate": 4.206407194042433e-05, + "loss": 6.219, + "step": 9980 + }, + { + "epoch": 0.014036887225413932, + "grad_norm": 1.2512117624282837, + "learning_rate": 4.210622453280877e-05, + "loss": 6.2775, + "step": 9990 + }, + { + "epoch": 0.01405093816357751, + "grad_norm": 1.3441169261932373, + "learning_rate": 4.214837712519319e-05, + "loss": 6.3914, + "step": 10000 + }, + { + "epoch": 0.014064989101741088, + "grad_norm": 1.3099346160888672, + "learning_rate": 4.219052971757763e-05, + "loss": 6.3451, + "step": 10010 + }, + { + "epoch": 0.014079040039904664, + "grad_norm": 1.3395233154296875, + "learning_rate": 4.223268230996206e-05, + "loss": 6.3383, + "step": 10020 + }, + { + "epoch": 0.014093090978068242, + "grad_norm": 1.3763487339019775, + "learning_rate": 4.2274834902346486e-05, + "loss": 6.289, + "step": 10030 + }, + { + "epoch": 0.01410714191623182, + "grad_norm": 1.429871916770935, + "learning_rate": 4.2316987494730924e-05, + "loss": 6.2917, + "step": 10040 + }, + { + "epoch": 0.014121192854395398, + "grad_norm": 1.394494891166687, + 
"learning_rate": 4.235492482787691e-05, + "loss": 6.3043, + "step": 10050 + }, + { + "epoch": 0.014135243792558974, + "grad_norm": 1.4214140176773071, + "learning_rate": 4.239707742026134e-05, + "loss": 6.3363, + "step": 10060 + }, + { + "epoch": 0.014149294730722552, + "grad_norm": 1.2173434495925903, + "learning_rate": 4.2439230012645776e-05, + "loss": 6.3484, + "step": 10070 + }, + { + "epoch": 0.01416334566888613, + "grad_norm": 1.3638588190078735, + "learning_rate": 4.248138260503021e-05, + "loss": 6.3495, + "step": 10080 + }, + { + "epoch": 0.014177396607049707, + "grad_norm": 1.300630807876587, + "learning_rate": 4.252353519741464e-05, + "loss": 6.3795, + "step": 10090 + }, + { + "epoch": 0.014191447545213284, + "grad_norm": 1.279421329498291, + "learning_rate": 4.256568778979907e-05, + "loss": 6.3479, + "step": 10100 + }, + { + "epoch": 0.014205498483376862, + "grad_norm": 1.2923552989959717, + "learning_rate": 4.26078403821835e-05, + "loss": 6.3892, + "step": 10110 + }, + { + "epoch": 0.01421954942154044, + "grad_norm": 1.2862358093261719, + "learning_rate": 4.264999297456793e-05, + "loss": 6.3228, + "step": 10120 + }, + { + "epoch": 0.014233600359704017, + "grad_norm": 1.3364365100860596, + "learning_rate": 4.269214556695236e-05, + "loss": 6.4024, + "step": 10130 + }, + { + "epoch": 0.014247651297867595, + "grad_norm": 1.2868720293045044, + "learning_rate": 4.27342981593368e-05, + "loss": 6.2095, + "step": 10140 + }, + { + "epoch": 0.014261702236031171, + "grad_norm": 1.306137204170227, + "learning_rate": 4.2776450751721225e-05, + "loss": 6.3935, + "step": 10150 + }, + { + "epoch": 0.01427575317419475, + "grad_norm": 1.35478675365448, + "learning_rate": 4.2818603344105656e-05, + "loss": 6.3092, + "step": 10160 + }, + { + "epoch": 0.014289804112358327, + "grad_norm": 1.340928316116333, + "learning_rate": 4.2860755936490094e-05, + "loss": 6.2858, + "step": 10170 + }, + { + "epoch": 0.014303855050521905, + "grad_norm": 1.4878336191177368, + "learning_rate": 
4.290290852887452e-05, + "loss": 6.297, + "step": 10180 + }, + { + "epoch": 0.014317905988685481, + "grad_norm": 1.3728898763656616, + "learning_rate": 4.294506112125895e-05, + "loss": 6.2908, + "step": 10190 + }, + { + "epoch": 0.01433195692684906, + "grad_norm": 1.3865792751312256, + "learning_rate": 4.298721371364339e-05, + "loss": 6.3898, + "step": 10200 + }, + { + "epoch": 0.014346007865012637, + "grad_norm": 1.3288946151733398, + "learning_rate": 4.302936630602782e-05, + "loss": 6.2661, + "step": 10210 + }, + { + "epoch": 0.014360058803176215, + "grad_norm": 1.5158910751342773, + "learning_rate": 4.306730363917381e-05, + "loss": 6.2648, + "step": 10220 + }, + { + "epoch": 0.014374109741339793, + "grad_norm": 1.41054368019104, + "learning_rate": 4.310945623155824e-05, + "loss": 6.3832, + "step": 10230 + }, + { + "epoch": 0.014388160679503369, + "grad_norm": 1.5252634286880493, + "learning_rate": 4.3151608823942664e-05, + "loss": 6.3621, + "step": 10240 + }, + { + "epoch": 0.014402211617666947, + "grad_norm": 1.2399297952651978, + "learning_rate": 4.31937614163271e-05, + "loss": 6.3802, + "step": 10250 + }, + { + "epoch": 0.014416262555830525, + "grad_norm": 1.2246840000152588, + "learning_rate": 4.323591400871153e-05, + "loss": 6.3442, + "step": 10260 + }, + { + "epoch": 0.014430313493994103, + "grad_norm": 1.349806785583496, + "learning_rate": 4.327806660109597e-05, + "loss": 6.2469, + "step": 10270 + }, + { + "epoch": 0.014444364432157679, + "grad_norm": 1.199388027191162, + "learning_rate": 4.3320219193480395e-05, + "loss": 6.3044, + "step": 10280 + }, + { + "epoch": 0.014458415370321257, + "grad_norm": 1.371137261390686, + "learning_rate": 4.3362371785864826e-05, + "loss": 6.2964, + "step": 10290 + }, + { + "epoch": 0.014472466308484835, + "grad_norm": 1.3206599950790405, + "learning_rate": 4.3404524378249264e-05, + "loss": 6.3184, + "step": 10300 + }, + { + "epoch": 0.014486517246648413, + "grad_norm": 1.4087791442871094, + "learning_rate": 
4.344667697063369e-05, + "loss": 6.2876, + "step": 10310 + }, + { + "epoch": 0.01450056818481199, + "grad_norm": 1.4454388618469238, + "learning_rate": 4.348882956301812e-05, + "loss": 6.2846, + "step": 10320 + }, + { + "epoch": 0.014514619122975567, + "grad_norm": 1.414621353149414, + "learning_rate": 4.353098215540256e-05, + "loss": 6.2277, + "step": 10330 + }, + { + "epoch": 0.014528670061139145, + "grad_norm": 1.3868155479431152, + "learning_rate": 4.357313474778698e-05, + "loss": 6.2959, + "step": 10340 + }, + { + "epoch": 0.014542720999302722, + "grad_norm": 1.3480900526046753, + "learning_rate": 4.361528734017142e-05, + "loss": 6.2991, + "step": 10350 + }, + { + "epoch": 0.0145567719374663, + "grad_norm": 1.3821889162063599, + "learning_rate": 4.365743993255585e-05, + "loss": 6.2858, + "step": 10360 + }, + { + "epoch": 0.014570822875629877, + "grad_norm": 1.3947319984436035, + "learning_rate": 4.3699592524940276e-05, + "loss": 6.2764, + "step": 10370 + }, + { + "epoch": 0.014584873813793454, + "grad_norm": 1.4054744243621826, + "learning_rate": 4.3741745117324714e-05, + "loss": 6.3224, + "step": 10380 + }, + { + "epoch": 0.014598924751957032, + "grad_norm": 1.2518731355667114, + "learning_rate": 4.3783897709709145e-05, + "loss": 6.3417, + "step": 10390 + }, + { + "epoch": 0.01461297569012061, + "grad_norm": 1.4203547239303589, + "learning_rate": 4.382605030209357e-05, + "loss": 6.3121, + "step": 10400 + }, + { + "epoch": 0.014627026628284186, + "grad_norm": 1.4533650875091553, + "learning_rate": 4.386820289447801e-05, + "loss": 6.2781, + "step": 10410 + }, + { + "epoch": 0.014641077566447764, + "grad_norm": 1.4527699947357178, + "learning_rate": 4.391035548686244e-05, + "loss": 6.3558, + "step": 10420 + }, + { + "epoch": 0.014655128504611342, + "grad_norm": 1.324060082435608, + "learning_rate": 4.3952508079246876e-05, + "loss": 6.3507, + "step": 10430 + }, + { + "epoch": 0.01466917944277492, + "grad_norm": 1.4087752103805542, + "learning_rate": 
4.39946606716313e-05, + "loss": 6.3084, + "step": 10440 + }, + { + "epoch": 0.014683230380938498, + "grad_norm": 1.2102359533309937, + "learning_rate": 4.403681326401573e-05, + "loss": 6.3034, + "step": 10450 + }, + { + "epoch": 0.014697281319102074, + "grad_norm": 1.3251889944076538, + "learning_rate": 4.407896585640017e-05, + "loss": 6.2459, + "step": 10460 + }, + { + "epoch": 0.014711332257265652, + "grad_norm": 1.3922446966171265, + "learning_rate": 4.4121118448784594e-05, + "loss": 6.3356, + "step": 10470 + }, + { + "epoch": 0.01472538319542923, + "grad_norm": 1.269550085067749, + "learning_rate": 4.416327104116903e-05, + "loss": 6.2387, + "step": 10480 + }, + { + "epoch": 0.014739434133592808, + "grad_norm": 1.3506200313568115, + "learning_rate": 4.420542363355346e-05, + "loss": 6.2655, + "step": 10490 + }, + { + "epoch": 0.014753485071756384, + "grad_norm": 1.3239065408706665, + "learning_rate": 4.424757622593789e-05, + "loss": 6.267, + "step": 10500 + }, + { + "epoch": 0.014767536009919962, + "grad_norm": 1.2084630727767944, + "learning_rate": 4.4289728818322325e-05, + "loss": 6.3075, + "step": 10510 + }, + { + "epoch": 0.01478158694808354, + "grad_norm": 1.3695273399353027, + "learning_rate": 4.433188141070676e-05, + "loss": 6.3784, + "step": 10520 + }, + { + "epoch": 0.014795637886247118, + "grad_norm": 1.4278502464294434, + "learning_rate": 4.437403400309118e-05, + "loss": 6.2445, + "step": 10530 + }, + { + "epoch": 0.014809688824410696, + "grad_norm": 1.606279969215393, + "learning_rate": 4.441618659547562e-05, + "loss": 6.2251, + "step": 10540 + }, + { + "epoch": 0.014823739762574272, + "grad_norm": 1.283535122871399, + "learning_rate": 4.445833918786005e-05, + "loss": 6.2737, + "step": 10550 + }, + { + "epoch": 0.01483779070073785, + "grad_norm": 1.2936784029006958, + "learning_rate": 4.450049178024449e-05, + "loss": 6.3372, + "step": 10560 + }, + { + "epoch": 0.014851841638901428, + "grad_norm": 1.4122823476791382, + "learning_rate": 
4.454264437262891e-05, + "loss": 6.3038, + "step": 10570 + }, + { + "epoch": 0.014865892577065005, + "grad_norm": 1.2987902164459229, + "learning_rate": 4.4584796965013344e-05, + "loss": 6.2672, + "step": 10580 + }, + { + "epoch": 0.014879943515228582, + "grad_norm": 1.2851322889328003, + "learning_rate": 4.462694955739778e-05, + "loss": 6.2606, + "step": 10590 + }, + { + "epoch": 0.01489399445339216, + "grad_norm": 1.359154224395752, + "learning_rate": 4.4669102149782206e-05, + "loss": 6.3153, + "step": 10600 + }, + { + "epoch": 0.014908045391555737, + "grad_norm": 1.3480968475341797, + "learning_rate": 4.471125474216664e-05, + "loss": 6.3336, + "step": 10610 + }, + { + "epoch": 0.014922096329719315, + "grad_norm": 1.3098258972167969, + "learning_rate": 4.4753407334551075e-05, + "loss": 6.302, + "step": 10620 + }, + { + "epoch": 0.014936147267882893, + "grad_norm": 1.3076599836349487, + "learning_rate": 4.47955599269355e-05, + "loss": 6.3044, + "step": 10630 + }, + { + "epoch": 0.01495019820604647, + "grad_norm": 1.2845678329467773, + "learning_rate": 4.483771251931994e-05, + "loss": 6.2815, + "step": 10640 + }, + { + "epoch": 0.014964249144210047, + "grad_norm": 1.2279037237167358, + "learning_rate": 4.487986511170437e-05, + "loss": 6.2752, + "step": 10650 + }, + { + "epoch": 0.014978300082373625, + "grad_norm": 1.3851795196533203, + "learning_rate": 4.492201770408879e-05, + "loss": 6.2336, + "step": 10660 + }, + { + "epoch": 0.014992351020537203, + "grad_norm": 1.5091443061828613, + "learning_rate": 4.496417029647323e-05, + "loss": 6.1372, + "step": 10670 + }, + { + "epoch": 0.01500640195870078, + "grad_norm": 1.263030767440796, + "learning_rate": 4.500632288885766e-05, + "loss": 6.2136, + "step": 10680 + }, + { + "epoch": 0.015020452896864357, + "grad_norm": 1.508408546447754, + "learning_rate": 4.5048475481242086e-05, + "loss": 6.2105, + "step": 10690 + }, + { + "epoch": 0.015034503835027935, + "grad_norm": 1.5163711309432983, + "learning_rate": 
4.5090628073626524e-05, + "loss": 6.1942, + "step": 10700 + }, + { + "epoch": 0.015048554773191513, + "grad_norm": 1.3551746606826782, + "learning_rate": 4.5132780666010956e-05, + "loss": 6.1646, + "step": 10710 + }, + { + "epoch": 0.015062605711355091, + "grad_norm": 1.3781139850616455, + "learning_rate": 4.5174933258395394e-05, + "loss": 6.2178, + "step": 10720 + }, + { + "epoch": 0.015076656649518667, + "grad_norm": 1.3036412000656128, + "learning_rate": 4.521708585077982e-05, + "loss": 6.2511, + "step": 10730 + }, + { + "epoch": 0.015090707587682245, + "grad_norm": 1.3681645393371582, + "learning_rate": 4.525923844316425e-05, + "loss": 6.1301, + "step": 10740 + }, + { + "epoch": 0.015104758525845823, + "grad_norm": 1.3642160892486572, + "learning_rate": 4.530139103554869e-05, + "loss": 6.228, + "step": 10750 + }, + { + "epoch": 0.0151188094640094, + "grad_norm": 1.410559058189392, + "learning_rate": 4.534354362793311e-05, + "loss": 6.2981, + "step": 10760 + }, + { + "epoch": 0.015132860402172977, + "grad_norm": 1.3881288766860962, + "learning_rate": 4.538569622031755e-05, + "loss": 6.1561, + "step": 10770 + }, + { + "epoch": 0.015146911340336555, + "grad_norm": 1.231331467628479, + "learning_rate": 4.542784881270198e-05, + "loss": 6.2037, + "step": 10780 + }, + { + "epoch": 0.015160962278500133, + "grad_norm": 1.5481162071228027, + "learning_rate": 4.5470001405086405e-05, + "loss": 6.1547, + "step": 10790 + }, + { + "epoch": 0.01517501321666371, + "grad_norm": 1.3833677768707275, + "learning_rate": 4.551215399747084e-05, + "loss": 6.2681, + "step": 10800 + }, + { + "epoch": 0.015189064154827287, + "grad_norm": 1.297667145729065, + "learning_rate": 4.5554306589855274e-05, + "loss": 6.1601, + "step": 10810 + }, + { + "epoch": 0.015203115092990865, + "grad_norm": 1.2731614112854004, + "learning_rate": 4.55964591822397e-05, + "loss": 6.2338, + "step": 10820 + }, + { + "epoch": 0.015217166031154443, + "grad_norm": 1.392951488494873, + "learning_rate": 
4.5638611774624136e-05, + "loss": 6.223, + "step": 10830 + }, + { + "epoch": 0.01523121696931802, + "grad_norm": 1.152622103691101, + "learning_rate": 4.568076436700857e-05, + "loss": 6.1884, + "step": 10840 + }, + { + "epoch": 0.015245267907481598, + "grad_norm": 1.287348985671997, + "learning_rate": 4.5722916959393005e-05, + "loss": 6.1756, + "step": 10850 + }, + { + "epoch": 0.015259318845645175, + "grad_norm": 1.3075429201126099, + "learning_rate": 4.576506955177743e-05, + "loss": 6.1748, + "step": 10860 + }, + { + "epoch": 0.015273369783808752, + "grad_norm": 1.3519240617752075, + "learning_rate": 4.580722214416186e-05, + "loss": 6.2611, + "step": 10870 + }, + { + "epoch": 0.01528742072197233, + "grad_norm": 1.257500410079956, + "learning_rate": 4.58493747365463e-05, + "loss": 6.1883, + "step": 10880 + }, + { + "epoch": 0.015301471660135908, + "grad_norm": 1.276843786239624, + "learning_rate": 4.589152732893072e-05, + "loss": 6.1289, + "step": 10890 + }, + { + "epoch": 0.015315522598299484, + "grad_norm": 1.397799015045166, + "learning_rate": 4.5933679921315155e-05, + "loss": 6.1006, + "step": 10900 + }, + { + "epoch": 0.015329573536463062, + "grad_norm": 1.3642454147338867, + "learning_rate": 4.597583251369959e-05, + "loss": 6.179, + "step": 10910 + }, + { + "epoch": 0.01534362447462664, + "grad_norm": 1.406509280204773, + "learning_rate": 4.601798510608402e-05, + "loss": 6.1252, + "step": 10920 + }, + { + "epoch": 0.015357675412790218, + "grad_norm": 1.4512871503829956, + "learning_rate": 4.6060137698468455e-05, + "loss": 6.2659, + "step": 10930 + }, + { + "epoch": 0.015371726350953796, + "grad_norm": 1.332694172859192, + "learning_rate": 4.6102290290852886e-05, + "loss": 6.2391, + "step": 10940 + }, + { + "epoch": 0.015385777289117372, + "grad_norm": 1.3527559041976929, + "learning_rate": 4.614444288323731e-05, + "loss": 6.2161, + "step": 10950 + }, + { + "epoch": 0.01539982822728095, + "grad_norm": 1.3626435995101929, + "learning_rate": 
4.618659547562175e-05, + "loss": 6.1904, + "step": 10960 + }, + { + "epoch": 0.015413879165444528, + "grad_norm": 1.4977718591690063, + "learning_rate": 4.622874806800618e-05, + "loss": 6.1898, + "step": 10970 + }, + { + "epoch": 0.015427930103608106, + "grad_norm": 1.512209415435791, + "learning_rate": 4.627090066039062e-05, + "loss": 6.139, + "step": 10980 + }, + { + "epoch": 0.015441981041771682, + "grad_norm": 1.363409161567688, + "learning_rate": 4.631305325277504e-05, + "loss": 6.1603, + "step": 10990 + }, + { + "epoch": 0.01545603197993526, + "grad_norm": 1.5099941492080688, + "learning_rate": 4.635520584515947e-05, + "loss": 6.1706, + "step": 11000 + }, + { + "epoch": 0.015470082918098838, + "grad_norm": 1.9168140888214111, + "learning_rate": 4.639735843754391e-05, + "loss": 6.2084, + "step": 11010 + }, + { + "epoch": 0.015484133856262416, + "grad_norm": 1.2205970287322998, + "learning_rate": 4.6439511029928335e-05, + "loss": 6.252, + "step": 11020 + }, + { + "epoch": 0.015498184794425994, + "grad_norm": 1.5673115253448486, + "learning_rate": 4.6481663622312766e-05, + "loss": 6.0759, + "step": 11030 + }, + { + "epoch": 0.01551223573258957, + "grad_norm": 1.4149396419525146, + "learning_rate": 4.6523816214697204e-05, + "loss": 6.2689, + "step": 11040 + }, + { + "epoch": 0.015526286670753148, + "grad_norm": 1.3954936265945435, + "learning_rate": 4.656596880708163e-05, + "loss": 6.2501, + "step": 11050 + }, + { + "epoch": 0.015540337608916726, + "grad_norm": 1.5006779432296753, + "learning_rate": 4.660812139946607e-05, + "loss": 6.1785, + "step": 11060 + }, + { + "epoch": 0.015554388547080304, + "grad_norm": 1.3999886512756348, + "learning_rate": 4.66502739918505e-05, + "loss": 6.15, + "step": 11070 + }, + { + "epoch": 0.01556843948524388, + "grad_norm": 2.2167131900787354, + "learning_rate": 4.669242658423492e-05, + "loss": 6.132, + "step": 11080 + }, + { + "epoch": 0.015582490423407458, + "grad_norm": 1.401848316192627, + "learning_rate": 
4.673457917661936e-05, + "loss": 6.281, + "step": 11090 + }, + { + "epoch": 0.015596541361571035, + "grad_norm": 1.3416310548782349, + "learning_rate": 4.677673176900379e-05, + "loss": 6.1515, + "step": 11100 + }, + { + "epoch": 0.015610592299734613, + "grad_norm": 1.5161608457565308, + "learning_rate": 4.6818884361388216e-05, + "loss": 6.1189, + "step": 11110 + }, + { + "epoch": 0.01562464323789819, + "grad_norm": 1.393735408782959, + "learning_rate": 4.6861036953772654e-05, + "loss": 6.2588, + "step": 11120 + }, + { + "epoch": 0.015638694176061767, + "grad_norm": 1.3404139280319214, + "learning_rate": 4.6903189546157085e-05, + "loss": 6.234, + "step": 11130 + }, + { + "epoch": 0.015652745114225347, + "grad_norm": 1.372836947441101, + "learning_rate": 4.694534213854152e-05, + "loss": 6.2106, + "step": 11140 + }, + { + "epoch": 0.015666796052388923, + "grad_norm": 1.3898221254348755, + "learning_rate": 4.698749473092595e-05, + "loss": 6.0849, + "step": 11150 + }, + { + "epoch": 0.0156808469905525, + "grad_norm": 1.336380124092102, + "learning_rate": 4.702964732331038e-05, + "loss": 6.1996, + "step": 11160 + }, + { + "epoch": 0.01569489792871608, + "grad_norm": 1.478607177734375, + "learning_rate": 4.7071799915694816e-05, + "loss": 6.1785, + "step": 11170 + }, + { + "epoch": 0.015708948866879655, + "grad_norm": 1.5594801902770996, + "learning_rate": 4.711395250807924e-05, + "loss": 6.2302, + "step": 11180 + }, + { + "epoch": 0.01572299980504323, + "grad_norm": 1.377120852470398, + "learning_rate": 4.715610510046367e-05, + "loss": 6.2416, + "step": 11190 + }, + { + "epoch": 0.01573705074320681, + "grad_norm": 1.5377556085586548, + "learning_rate": 4.719825769284811e-05, + "loss": 6.1337, + "step": 11200 + }, + { + "epoch": 0.015751101681370387, + "grad_norm": 1.7049307823181152, + "learning_rate": 4.7240410285232534e-05, + "loss": 6.1986, + "step": 11210 + }, + { + "epoch": 0.015765152619533967, + "grad_norm": 1.3457587957382202, + "learning_rate": 
4.728256287761697e-05, + "loss": 6.0877, + "step": 11220 + }, + { + "epoch": 0.015779203557697543, + "grad_norm": 1.3759392499923706, + "learning_rate": 4.73247154700014e-05, + "loss": 6.1598, + "step": 11230 + }, + { + "epoch": 0.01579325449586112, + "grad_norm": 1.3922202587127686, + "learning_rate": 4.736686806238583e-05, + "loss": 6.2017, + "step": 11240 + }, + { + "epoch": 0.0158073054340247, + "grad_norm": 1.3885234594345093, + "learning_rate": 4.7409020654770266e-05, + "loss": 6.2302, + "step": 11250 + }, + { + "epoch": 0.015821356372188275, + "grad_norm": 1.4035016298294067, + "learning_rate": 4.74511732471547e-05, + "loss": 6.204, + "step": 11260 + }, + { + "epoch": 0.015835407310351855, + "grad_norm": 1.3315571546554565, + "learning_rate": 4.749332583953913e-05, + "loss": 6.1677, + "step": 11270 + }, + { + "epoch": 0.01584945824851543, + "grad_norm": 1.2895710468292236, + "learning_rate": 4.753547843192356e-05, + "loss": 6.2274, + "step": 11280 + }, + { + "epoch": 0.015863509186679007, + "grad_norm": 1.3065532445907593, + "learning_rate": 4.757763102430799e-05, + "loss": 6.0836, + "step": 11290 + }, + { + "epoch": 0.015877560124842587, + "grad_norm": 1.392031192779541, + "learning_rate": 4.761978361669243e-05, + "loss": 6.1599, + "step": 11300 + }, + { + "epoch": 0.015891611063006163, + "grad_norm": 1.3048975467681885, + "learning_rate": 4.766193620907685e-05, + "loss": 6.1467, + "step": 11310 + }, + { + "epoch": 0.01590566200116974, + "grad_norm": 1.5152511596679688, + "learning_rate": 4.7704088801461284e-05, + "loss": 6.0913, + "step": 11320 + }, + { + "epoch": 0.01591971293933332, + "grad_norm": 1.3078099489212036, + "learning_rate": 4.774624139384572e-05, + "loss": 6.0885, + "step": 11330 + }, + { + "epoch": 0.015933763877496895, + "grad_norm": 1.2487436532974243, + "learning_rate": 4.7788393986230146e-05, + "loss": 6.2689, + "step": 11340 + }, + { + "epoch": 0.015947814815660474, + "grad_norm": 1.5608822107315063, + "learning_rate": 
4.7830546578614584e-05, + "loss": 6.1847, + "step": 11350 + }, + { + "epoch": 0.01596186575382405, + "grad_norm": 1.7839996814727783, + "learning_rate": 4.7872699170999015e-05, + "loss": 6.0849, + "step": 11360 + }, + { + "epoch": 0.015975916691987627, + "grad_norm": 1.5817362070083618, + "learning_rate": 4.791485176338344e-05, + "loss": 6.2027, + "step": 11370 + }, + { + "epoch": 0.015989967630151206, + "grad_norm": 1.7094651460647583, + "learning_rate": 4.795700435576788e-05, + "loss": 6.1865, + "step": 11380 + }, + { + "epoch": 0.016004018568314782, + "grad_norm": 1.3029943704605103, + "learning_rate": 4.799915694815231e-05, + "loss": 6.1732, + "step": 11390 + }, + { + "epoch": 0.016018069506478362, + "grad_norm": 1.6077370643615723, + "learning_rate": 4.804130954053673e-05, + "loss": 6.2421, + "step": 11400 + }, + { + "epoch": 0.016032120444641938, + "grad_norm": 1.7310693264007568, + "learning_rate": 4.808346213292117e-05, + "loss": 6.0574, + "step": 11410 + }, + { + "epoch": 0.016046171382805514, + "grad_norm": 1.3837238550186157, + "learning_rate": 4.81256147253056e-05, + "loss": 6.1435, + "step": 11420 + }, + { + "epoch": 0.016060222320969094, + "grad_norm": 1.3638036251068115, + "learning_rate": 4.816776731769004e-05, + "loss": 6.0729, + "step": 11430 + }, + { + "epoch": 0.01607427325913267, + "grad_norm": 1.391371726989746, + "learning_rate": 4.8209919910074465e-05, + "loss": 6.0995, + "step": 11440 + }, + { + "epoch": 0.01608832419729625, + "grad_norm": 1.3891973495483398, + "learning_rate": 4.8252072502458896e-05, + "loss": 6.0128, + "step": 11450 + }, + { + "epoch": 0.016102375135459826, + "grad_norm": 1.384169578552246, + "learning_rate": 4.8294225094843334e-05, + "loss": 6.2458, + "step": 11460 + }, + { + "epoch": 0.016116426073623402, + "grad_norm": 1.3138374090194702, + "learning_rate": 4.833637768722776e-05, + "loss": 6.1883, + "step": 11470 + }, + { + "epoch": 0.016130477011786982, + "grad_norm": 1.3187780380249023, + "learning_rate": 
4.8378530279612196e-05, + "loss": 6.128, + "step": 11480 + }, + { + "epoch": 0.016144527949950558, + "grad_norm": 1.3277584314346313, + "learning_rate": 4.842068287199663e-05, + "loss": 6.2424, + "step": 11490 + }, + { + "epoch": 0.016158578888114134, + "grad_norm": 1.2720946073532104, + "learning_rate": 4.846283546438105e-05, + "loss": 6.2401, + "step": 11500 + }, + { + "epoch": 0.016172629826277714, + "grad_norm": 1.4264894723892212, + "learning_rate": 4.850498805676549e-05, + "loss": 6.1807, + "step": 11510 + }, + { + "epoch": 0.01618668076444129, + "grad_norm": 1.3776253461837769, + "learning_rate": 4.854714064914992e-05, + "loss": 6.2413, + "step": 11520 + }, + { + "epoch": 0.01620073170260487, + "grad_norm": 1.2899820804595947, + "learning_rate": 4.8589293241534345e-05, + "loss": 6.1676, + "step": 11530 + }, + { + "epoch": 0.016214782640768446, + "grad_norm": 1.3107612133026123, + "learning_rate": 4.863144583391878e-05, + "loss": 6.1658, + "step": 11540 + }, + { + "epoch": 0.016228833578932022, + "grad_norm": 1.4616020917892456, + "learning_rate": 4.8673598426303214e-05, + "loss": 6.1274, + "step": 11550 + }, + { + "epoch": 0.0162428845170956, + "grad_norm": 1.3054602146148682, + "learning_rate": 4.8715751018687645e-05, + "loss": 6.119, + "step": 11560 + }, + { + "epoch": 0.016256935455259178, + "grad_norm": 1.3119100332260132, + "learning_rate": 4.8757903611072077e-05, + "loss": 6.0736, + "step": 11570 + }, + { + "epoch": 0.016270986393422757, + "grad_norm": 1.3544719219207764, + "learning_rate": 4.880005620345651e-05, + "loss": 6.1392, + "step": 11580 + }, + { + "epoch": 0.016285037331586334, + "grad_norm": 1.3637272119522095, + "learning_rate": 4.8842208795840946e-05, + "loss": 6.0982, + "step": 11590 + }, + { + "epoch": 0.01629908826974991, + "grad_norm": 1.431994915008545, + "learning_rate": 4.888436138822537e-05, + "loss": 6.07, + "step": 11600 + }, + { + "epoch": 0.01631313920791349, + "grad_norm": 1.322766661643982, + "learning_rate": 
4.89265139806098e-05, + "loss": 6.2458, + "step": 11610 + }, + { + "epoch": 0.016327190146077065, + "grad_norm": 1.5207664966583252, + "learning_rate": 4.896866657299424e-05, + "loss": 6.0894, + "step": 11620 + }, + { + "epoch": 0.016341241084240645, + "grad_norm": 1.950318455696106, + "learning_rate": 4.9010819165378664e-05, + "loss": 6.112, + "step": 11630 + }, + { + "epoch": 0.01635529202240422, + "grad_norm": 1.8058013916015625, + "learning_rate": 4.90529717577631e-05, + "loss": 6.1553, + "step": 11640 + }, + { + "epoch": 0.016369342960567797, + "grad_norm": 1.2825206518173218, + "learning_rate": 4.909512435014753e-05, + "loss": 6.185, + "step": 11650 + }, + { + "epoch": 0.016383393898731377, + "grad_norm": 1.4293164014816284, + "learning_rate": 4.913727694253196e-05, + "loss": 6.2412, + "step": 11660 + }, + { + "epoch": 0.016397444836894953, + "grad_norm": 1.2992738485336304, + "learning_rate": 4.9179429534916395e-05, + "loss": 6.1107, + "step": 11670 + }, + { + "epoch": 0.01641149577505853, + "grad_norm": 1.3839462995529175, + "learning_rate": 4.9221582127300826e-05, + "loss": 6.0917, + "step": 11680 + }, + { + "epoch": 0.01642554671322211, + "grad_norm": 1.3559750318527222, + "learning_rate": 4.926373471968525e-05, + "loss": 6.1816, + "step": 11690 + }, + { + "epoch": 0.016439597651385685, + "grad_norm": 1.3381038904190063, + "learning_rate": 4.930588731206969e-05, + "loss": 6.2072, + "step": 11700 + }, + { + "epoch": 0.016453648589549265, + "grad_norm": 1.3606219291687012, + "learning_rate": 4.934803990445412e-05, + "loss": 6.1239, + "step": 11710 + }, + { + "epoch": 0.01646769952771284, + "grad_norm": 1.2954307794570923, + "learning_rate": 4.939019249683856e-05, + "loss": 6.1274, + "step": 11720 + }, + { + "epoch": 0.016481750465876417, + "grad_norm": 1.3133169412612915, + "learning_rate": 4.943234508922298e-05, + "loss": 6.0767, + "step": 11730 + }, + { + "epoch": 0.016495801404039997, + "grad_norm": 1.675938367843628, + "learning_rate": 
4.947449768160741e-05, + "loss": 6.1603, + "step": 11740 + }, + { + "epoch": 0.016509852342203573, + "grad_norm": 1.3132497072219849, + "learning_rate": 4.951665027399185e-05, + "loss": 6.1483, + "step": 11750 + }, + { + "epoch": 0.016523903280367153, + "grad_norm": 1.491671085357666, + "learning_rate": 4.9558802866376276e-05, + "loss": 6.0745, + "step": 11760 + }, + { + "epoch": 0.01653795421853073, + "grad_norm": 1.487144112586975, + "learning_rate": 4.9600955458760713e-05, + "loss": 6.0128, + "step": 11770 + }, + { + "epoch": 0.016552005156694305, + "grad_norm": 1.572177767753601, + "learning_rate": 4.9643108051145145e-05, + "loss": 6.1975, + "step": 11780 + }, + { + "epoch": 0.016566056094857885, + "grad_norm": 1.355470061302185, + "learning_rate": 4.968526064352957e-05, + "loss": 6.0389, + "step": 11790 + }, + { + "epoch": 0.01658010703302146, + "grad_norm": 1.437559962272644, + "learning_rate": 4.972741323591401e-05, + "loss": 6.0687, + "step": 11800 + }, + { + "epoch": 0.016594157971185037, + "grad_norm": 1.2300171852111816, + "learning_rate": 4.976956582829844e-05, + "loss": 6.119, + "step": 11810 + }, + { + "epoch": 0.016608208909348617, + "grad_norm": 1.3405951261520386, + "learning_rate": 4.981171842068286e-05, + "loss": 6.2048, + "step": 11820 + }, + { + "epoch": 0.016622259847512193, + "grad_norm": 1.3226433992385864, + "learning_rate": 4.98538710130673e-05, + "loss": 6.0897, + "step": 11830 + }, + { + "epoch": 0.016636310785675772, + "grad_norm": 1.3560805320739746, + "learning_rate": 4.989602360545173e-05, + "loss": 6.0075, + "step": 11840 + }, + { + "epoch": 0.01665036172383935, + "grad_norm": 1.4106473922729492, + "learning_rate": 4.993817619783616e-05, + "loss": 5.9296, + "step": 11850 + }, + { + "epoch": 0.016664412662002925, + "grad_norm": 1.2530484199523926, + "learning_rate": 4.9980328790220594e-05, + "loss": 6.0896, + "step": 11860 + }, + { + "epoch": 0.016678463600166504, + "grad_norm": 1.28843092918396, + "learning_rate": 
5.0022481382605025e-05, + "loss": 6.1235, + "step": 11870 + }, + { + "epoch": 0.01669251453833008, + "grad_norm": 1.4062579870224, + "learning_rate": 5.006463397498946e-05, + "loss": 6.1136, + "step": 11880 + }, + { + "epoch": 0.01670656547649366, + "grad_norm": 1.275445818901062, + "learning_rate": 5.010678656737389e-05, + "loss": 6.0155, + "step": 11890 + }, + { + "epoch": 0.016720616414657236, + "grad_norm": 1.9027255773544312, + "learning_rate": 5.014893915975832e-05, + "loss": 6.1248, + "step": 11900 + }, + { + "epoch": 0.016734667352820812, + "grad_norm": 1.4187486171722412, + "learning_rate": 5.0191091752142757e-05, + "loss": 6.0323, + "step": 11910 + }, + { + "epoch": 0.016748718290984392, + "grad_norm": 1.3106961250305176, + "learning_rate": 5.023324434452718e-05, + "loss": 6.0743, + "step": 11920 + }, + { + "epoch": 0.016762769229147968, + "grad_norm": 1.2284607887268066, + "learning_rate": 5.027539693691162e-05, + "loss": 6.1276, + "step": 11930 + }, + { + "epoch": 0.016776820167311548, + "grad_norm": 1.403949499130249, + "learning_rate": 5.031754952929605e-05, + "loss": 6.0686, + "step": 11940 + }, + { + "epoch": 0.016790871105475124, + "grad_norm": 1.421419382095337, + "learning_rate": 5.0359702121680474e-05, + "loss": 6.241, + "step": 11950 + }, + { + "epoch": 0.0168049220436387, + "grad_norm": 1.5890074968338013, + "learning_rate": 5.040185471406491e-05, + "loss": 6.1183, + "step": 11960 + }, + { + "epoch": 0.01681897298180228, + "grad_norm": 1.38971745967865, + "learning_rate": 5.0444007306449344e-05, + "loss": 6.0829, + "step": 11970 + }, + { + "epoch": 0.016833023919965856, + "grad_norm": 1.4102171659469604, + "learning_rate": 5.0486159898833775e-05, + "loss": 6.1236, + "step": 11980 + }, + { + "epoch": 0.016847074858129432, + "grad_norm": 1.258589506149292, + "learning_rate": 5.0528312491218206e-05, + "loss": 6.1806, + "step": 11990 + }, + { + "epoch": 0.016861125796293012, + "grad_norm": 1.3471759557724, + "learning_rate": 5.057046508360264e-05, 
+ "loss": 6.0465, + "step": 12000 + }, + { + "epoch": 0.016875176734456588, + "grad_norm": 1.5169402360916138, + "learning_rate": 5.0612617675987075e-05, + "loss": 6.1002, + "step": 12010 + }, + { + "epoch": 0.016889227672620168, + "grad_norm": 1.5497729778289795, + "learning_rate": 5.06547702683715e-05, + "loss": 6.1487, + "step": 12020 + }, + { + "epoch": 0.016903278610783744, + "grad_norm": 1.364227294921875, + "learning_rate": 5.069692286075593e-05, + "loss": 6.0852, + "step": 12030 + }, + { + "epoch": 0.01691732954894732, + "grad_norm": 1.3978562355041504, + "learning_rate": 5.073907545314037e-05, + "loss": 6.0646, + "step": 12040 + }, + { + "epoch": 0.0169313804871109, + "grad_norm": 1.2657595872879028, + "learning_rate": 5.078122804552479e-05, + "loss": 6.0511, + "step": 12050 + }, + { + "epoch": 0.016945431425274476, + "grad_norm": 1.472236156463623, + "learning_rate": 5.082338063790923e-05, + "loss": 6.1274, + "step": 12060 + }, + { + "epoch": 0.016959482363438055, + "grad_norm": 1.381701946258545, + "learning_rate": 5.086553323029366e-05, + "loss": 6.0854, + "step": 12070 + }, + { + "epoch": 0.01697353330160163, + "grad_norm": 1.3113288879394531, + "learning_rate": 5.0907685822678086e-05, + "loss": 6.1267, + "step": 12080 + }, + { + "epoch": 0.016987584239765208, + "grad_norm": 1.3406379222869873, + "learning_rate": 5.0949838415062524e-05, + "loss": 6.0556, + "step": 12090 + }, + { + "epoch": 0.017001635177928787, + "grad_norm": 1.3727220296859741, + "learning_rate": 5.0991991007446956e-05, + "loss": 6.0826, + "step": 12100 + }, + { + "epoch": 0.017015686116092364, + "grad_norm": 1.3808116912841797, + "learning_rate": 5.103414359983138e-05, + "loss": 6.2429, + "step": 12110 + }, + { + "epoch": 0.01702973705425594, + "grad_norm": 1.4226034879684448, + "learning_rate": 5.107629619221582e-05, + "loss": 5.9922, + "step": 12120 + }, + { + "epoch": 0.01704378799241952, + "grad_norm": 1.3345751762390137, + "learning_rate": 5.111844878460025e-05, + "loss": 
6.1254, + "step": 12130 + }, + { + "epoch": 0.017057838930583095, + "grad_norm": 1.391182780265808, + "learning_rate": 5.116060137698468e-05, + "loss": 5.9991, + "step": 12140 + }, + { + "epoch": 0.017071889868746675, + "grad_norm": 1.3782682418823242, + "learning_rate": 5.120275396936911e-05, + "loss": 6.1427, + "step": 12150 + }, + { + "epoch": 0.01708594080691025, + "grad_norm": 1.4560561180114746, + "learning_rate": 5.124490656175354e-05, + "loss": 6.0616, + "step": 12160 + }, + { + "epoch": 0.017099991745073827, + "grad_norm": 1.3621957302093506, + "learning_rate": 5.128705915413798e-05, + "loss": 6.099, + "step": 12170 + }, + { + "epoch": 0.017114042683237407, + "grad_norm": 1.2872947454452515, + "learning_rate": 5.1329211746522405e-05, + "loss": 6.068, + "step": 12180 + }, + { + "epoch": 0.017128093621400983, + "grad_norm": 1.3031105995178223, + "learning_rate": 5.1371364338906836e-05, + "loss": 6.1765, + "step": 12190 + }, + { + "epoch": 0.017142144559564563, + "grad_norm": 1.4918880462646484, + "learning_rate": 5.1413516931291274e-05, + "loss": 6.0744, + "step": 12200 + }, + { + "epoch": 0.01715619549772814, + "grad_norm": 1.66692316532135, + "learning_rate": 5.14556695236757e-05, + "loss": 6.1172, + "step": 12210 + }, + { + "epoch": 0.017170246435891715, + "grad_norm": 1.3032985925674438, + "learning_rate": 5.1497822116060136e-05, + "loss": 6.0353, + "step": 12220 + }, + { + "epoch": 0.017184297374055295, + "grad_norm": 1.3920645713806152, + "learning_rate": 5.153997470844457e-05, + "loss": 6.1914, + "step": 12230 + }, + { + "epoch": 0.01719834831221887, + "grad_norm": 1.3095813989639282, + "learning_rate": 5.158212730082899e-05, + "loss": 6.0989, + "step": 12240 + }, + { + "epoch": 0.01721239925038245, + "grad_norm": 1.347722053527832, + "learning_rate": 5.162427989321343e-05, + "loss": 5.9789, + "step": 12250 + }, + { + "epoch": 0.017226450188546027, + "grad_norm": 1.4624723196029663, + "learning_rate": 5.166643248559786e-05, + "loss": 6.0599, + "step": 
12260 + }, + { + "epoch": 0.017240501126709603, + "grad_norm": 1.4272193908691406, + "learning_rate": 5.170858507798229e-05, + "loss": 6.1182, + "step": 12270 + }, + { + "epoch": 0.017254552064873183, + "grad_norm": 1.3339524269104004, + "learning_rate": 5.175073767036672e-05, + "loss": 6.0278, + "step": 12280 + }, + { + "epoch": 0.01726860300303676, + "grad_norm": 1.505544900894165, + "learning_rate": 5.1792890262751154e-05, + "loss": 6.089, + "step": 12290 + }, + { + "epoch": 0.017282653941200335, + "grad_norm": 1.400678277015686, + "learning_rate": 5.1835042855135586e-05, + "loss": 6.1141, + "step": 12300 + }, + { + "epoch": 0.017296704879363915, + "grad_norm": 1.316159963607788, + "learning_rate": 5.187719544752002e-05, + "loss": 6.1014, + "step": 12310 + }, + { + "epoch": 0.01731075581752749, + "grad_norm": 1.41743803024292, + "learning_rate": 5.191934803990445e-05, + "loss": 6.0915, + "step": 12320 + }, + { + "epoch": 0.01732480675569107, + "grad_norm": 1.7660351991653442, + "learning_rate": 5.1961500632288886e-05, + "loss": 6.0905, + "step": 12330 + }, + { + "epoch": 0.017338857693854647, + "grad_norm": 1.3657212257385254, + "learning_rate": 5.200365322467331e-05, + "loss": 6.0799, + "step": 12340 + }, + { + "epoch": 0.017352908632018223, + "grad_norm": 1.3705576658248901, + "learning_rate": 5.204580581705775e-05, + "loss": 6.013, + "step": 12350 + }, + { + "epoch": 0.017366959570181802, + "grad_norm": 1.4308192729949951, + "learning_rate": 5.208795840944218e-05, + "loss": 6.1158, + "step": 12360 + }, + { + "epoch": 0.01738101050834538, + "grad_norm": 1.3916292190551758, + "learning_rate": 5.2130111001826604e-05, + "loss": 5.9591, + "step": 12370 + }, + { + "epoch": 0.017395061446508958, + "grad_norm": 1.3628098964691162, + "learning_rate": 5.217226359421104e-05, + "loss": 6.1443, + "step": 12380 + }, + { + "epoch": 0.017409112384672534, + "grad_norm": 1.2999829053878784, + "learning_rate": 5.221441618659547e-05, + "loss": 6.0797, + "step": 12390 + }, + { + 
"epoch": 0.01742316332283611, + "grad_norm": 1.4970372915267944, + "learning_rate": 5.22565687789799e-05, + "loss": 6.0399, + "step": 12400 + }, + { + "epoch": 0.01743721426099969, + "grad_norm": 1.466609239578247, + "learning_rate": 5.2298721371364335e-05, + "loss": 6.0547, + "step": 12410 + }, + { + "epoch": 0.017451265199163266, + "grad_norm": 1.4185417890548706, + "learning_rate": 5.2340873963748766e-05, + "loss": 6.0352, + "step": 12420 + }, + { + "epoch": 0.017465316137326842, + "grad_norm": 1.6092323064804077, + "learning_rate": 5.23830265561332e-05, + "loss": 6.0051, + "step": 12430 + }, + { + "epoch": 0.017479367075490422, + "grad_norm": 1.5270646810531616, + "learning_rate": 5.242517914851763e-05, + "loss": 6.1214, + "step": 12440 + }, + { + "epoch": 0.017493418013653998, + "grad_norm": 1.3804599046707153, + "learning_rate": 5.246733174090206e-05, + "loss": 6.0227, + "step": 12450 + }, + { + "epoch": 0.017507468951817578, + "grad_norm": 1.4650616645812988, + "learning_rate": 5.25094843332865e-05, + "loss": 6.1227, + "step": 12460 + }, + { + "epoch": 0.017521519889981154, + "grad_norm": 1.3534270524978638, + "learning_rate": 5.255163692567092e-05, + "loss": 6.0883, + "step": 12470 + }, + { + "epoch": 0.01753557082814473, + "grad_norm": 1.298676609992981, + "learning_rate": 5.259378951805536e-05, + "loss": 6.047, + "step": 12480 + }, + { + "epoch": 0.01754962176630831, + "grad_norm": 1.342682123184204, + "learning_rate": 5.263594211043979e-05, + "loss": 6.0541, + "step": 12490 + }, + { + "epoch": 0.017563672704471886, + "grad_norm": 1.375024676322937, + "learning_rate": 5.2678094702824216e-05, + "loss": 6.0099, + "step": 12500 + }, + { + "epoch": 0.017577723642635466, + "grad_norm": 1.4682930707931519, + "learning_rate": 5.2720247295208654e-05, + "loss": 6.1075, + "step": 12510 + }, + { + "epoch": 0.017591774580799042, + "grad_norm": 1.4236594438552856, + "learning_rate": 5.2762399887593085e-05, + "loss": 6.052, + "step": 12520 + }, + { + "epoch": 
0.017605825518962618, + "grad_norm": 1.344035267829895, + "learning_rate": 5.280455247997751e-05, + "loss": 5.9737, + "step": 12530 + }, + { + "epoch": 0.017619876457126198, + "grad_norm": 1.3492854833602905, + "learning_rate": 5.284670507236195e-05, + "loss": 6.1154, + "step": 12540 + }, + { + "epoch": 0.017633927395289774, + "grad_norm": 1.4145216941833496, + "learning_rate": 5.288885766474638e-05, + "loss": 6.0093, + "step": 12550 + }, + { + "epoch": 0.017647978333453353, + "grad_norm": 1.3583636283874512, + "learning_rate": 5.293101025713081e-05, + "loss": 6.0203, + "step": 12560 + }, + { + "epoch": 0.01766202927161693, + "grad_norm": 1.3437408208847046, + "learning_rate": 5.297316284951524e-05, + "loss": 6.0749, + "step": 12570 + }, + { + "epoch": 0.017676080209780506, + "grad_norm": 1.3554259538650513, + "learning_rate": 5.301531544189967e-05, + "loss": 5.97, + "step": 12580 + }, + { + "epoch": 0.017690131147944085, + "grad_norm": 1.4989122152328491, + "learning_rate": 5.30574680342841e-05, + "loss": 6.0707, + "step": 12590 + }, + { + "epoch": 0.01770418208610766, + "grad_norm": 1.3016396760940552, + "learning_rate": 5.3099620626668534e-05, + "loss": 6.0922, + "step": 12600 + }, + { + "epoch": 0.017718233024271238, + "grad_norm": 1.3750371932983398, + "learning_rate": 5.3141773219052965e-05, + "loss": 5.9968, + "step": 12610 + }, + { + "epoch": 0.017732283962434817, + "grad_norm": 1.3117624521255493, + "learning_rate": 5.31839258114374e-05, + "loss": 5.9331, + "step": 12620 + }, + { + "epoch": 0.017746334900598393, + "grad_norm": 1.3047480583190918, + "learning_rate": 5.322607840382183e-05, + "loss": 6.1537, + "step": 12630 + }, + { + "epoch": 0.017760385838761973, + "grad_norm": 1.4629182815551758, + "learning_rate": 5.3268230996206266e-05, + "loss": 6.1317, + "step": 12640 + }, + { + "epoch": 0.01777443677692555, + "grad_norm": 1.5809321403503418, + "learning_rate": 5.33103835885907e-05, + "loss": 6.0821, + "step": 12650 + }, + { + "epoch": 
0.017788487715089125, + "grad_norm": 1.3663504123687744, + "learning_rate": 5.335253618097512e-05, + "loss": 6.0406, + "step": 12660 + }, + { + "epoch": 0.017802538653252705, + "grad_norm": 1.3569718599319458, + "learning_rate": 5.339468877335956e-05, + "loss": 6.0533, + "step": 12670 + }, + { + "epoch": 0.01781658959141628, + "grad_norm": 1.3136539459228516, + "learning_rate": 5.343684136574399e-05, + "loss": 5.9933, + "step": 12680 + }, + { + "epoch": 0.01783064052957986, + "grad_norm": 1.3268357515335083, + "learning_rate": 5.347899395812842e-05, + "loss": 6.0975, + "step": 12690 + }, + { + "epoch": 0.017844691467743437, + "grad_norm": 1.3501018285751343, + "learning_rate": 5.352114655051285e-05, + "loss": 6.0835, + "step": 12700 + }, + { + "epoch": 0.017858742405907013, + "grad_norm": 1.4120509624481201, + "learning_rate": 5.3563299142897284e-05, + "loss": 6.0582, + "step": 12710 + }, + { + "epoch": 0.017872793344070593, + "grad_norm": 1.6523367166519165, + "learning_rate": 5.3605451735281715e-05, + "loss": 5.9865, + "step": 12720 + }, + { + "epoch": 0.01788684428223417, + "grad_norm": 1.4812240600585938, + "learning_rate": 5.3647604327666146e-05, + "loss": 6.0202, + "step": 12730 + }, + { + "epoch": 0.01790089522039775, + "grad_norm": 1.4424370527267456, + "learning_rate": 5.368975692005058e-05, + "loss": 6.0952, + "step": 12740 + }, + { + "epoch": 0.017914946158561325, + "grad_norm": 1.420494556427002, + "learning_rate": 5.3731909512435015e-05, + "loss": 6.1514, + "step": 12750 + }, + { + "epoch": 0.0179289970967249, + "grad_norm": 1.4344902038574219, + "learning_rate": 5.377406210481944e-05, + "loss": 6.0853, + "step": 12760 + }, + { + "epoch": 0.01794304803488848, + "grad_norm": 1.3586077690124512, + "learning_rate": 5.381621469720388e-05, + "loss": 6.0114, + "step": 12770 + }, + { + "epoch": 0.017957098973052057, + "grad_norm": 1.310850977897644, + "learning_rate": 5.385836728958831e-05, + "loss": 5.9627, + "step": 12780 + }, + { + "epoch": 
0.017971149911215633, + "grad_norm": 1.4529310464859009, + "learning_rate": 5.390051988197273e-05, + "loss": 5.9186, + "step": 12790 + }, + { + "epoch": 0.017985200849379213, + "grad_norm": 1.4991663694381714, + "learning_rate": 5.394267247435717e-05, + "loss": 5.9598, + "step": 12800 + }, + { + "epoch": 0.01799925178754279, + "grad_norm": 1.3883793354034424, + "learning_rate": 5.39848250667416e-05, + "loss": 6.0834, + "step": 12810 + }, + { + "epoch": 0.01801330272570637, + "grad_norm": 1.35647451877594, + "learning_rate": 5.4026977659126027e-05, + "loss": 6.1214, + "step": 12820 + }, + { + "epoch": 0.018027353663869945, + "grad_norm": 1.3526296615600586, + "learning_rate": 5.4069130251510465e-05, + "loss": 6.0049, + "step": 12830 + }, + { + "epoch": 0.01804140460203352, + "grad_norm": 1.364187240600586, + "learning_rate": 5.4111282843894896e-05, + "loss": 5.9091, + "step": 12840 + }, + { + "epoch": 0.0180554555401971, + "grad_norm": 1.277514100074768, + "learning_rate": 5.415343543627933e-05, + "loss": 6.0787, + "step": 12850 + }, + { + "epoch": 0.018069506478360677, + "grad_norm": 1.7247467041015625, + "learning_rate": 5.419558802866376e-05, + "loss": 6.0347, + "step": 12860 + }, + { + "epoch": 0.018083557416524256, + "grad_norm": 1.2670753002166748, + "learning_rate": 5.423774062104819e-05, + "loss": 6.0311, + "step": 12870 + }, + { + "epoch": 0.018097608354687832, + "grad_norm": 1.3634384870529175, + "learning_rate": 5.427989321343262e-05, + "loss": 6.0208, + "step": 12880 + }, + { + "epoch": 0.01811165929285141, + "grad_norm": 1.5839406251907349, + "learning_rate": 5.432204580581705e-05, + "loss": 6.0122, + "step": 12890 + }, + { + "epoch": 0.018125710231014988, + "grad_norm": 1.3542689085006714, + "learning_rate": 5.436419839820148e-05, + "loss": 5.9979, + "step": 12900 + }, + { + "epoch": 0.018139761169178564, + "grad_norm": 1.4818419218063354, + "learning_rate": 5.440635099058592e-05, + "loss": 6.0396, + "step": 12910 + }, + { + "epoch": 
0.01815381210734214, + "grad_norm": 1.4347730875015259, + "learning_rate": 5.4448503582970345e-05, + "loss": 6.0252, + "step": 12920 + }, + { + "epoch": 0.01816786304550572, + "grad_norm": 1.463624119758606, + "learning_rate": 5.449065617535478e-05, + "loss": 6.0238, + "step": 12930 + }, + { + "epoch": 0.018181913983669296, + "grad_norm": 1.5159509181976318, + "learning_rate": 5.4532808767739214e-05, + "loss": 6.0117, + "step": 12940 + }, + { + "epoch": 0.018195964921832876, + "grad_norm": 1.306830644607544, + "learning_rate": 5.457496136012364e-05, + "loss": 6.1917, + "step": 12950 + }, + { + "epoch": 0.018210015859996452, + "grad_norm": 1.3079791069030762, + "learning_rate": 5.4617113952508076e-05, + "loss": 6.0244, + "step": 12960 + }, + { + "epoch": 0.018224066798160028, + "grad_norm": 1.3655420541763306, + "learning_rate": 5.465926654489251e-05, + "loss": 5.9953, + "step": 12970 + }, + { + "epoch": 0.018238117736323608, + "grad_norm": 1.5014837980270386, + "learning_rate": 5.470141913727694e-05, + "loss": 6.0585, + "step": 12980 + }, + { + "epoch": 0.018252168674487184, + "grad_norm": 1.595160722732544, + "learning_rate": 5.474357172966137e-05, + "loss": 5.9723, + "step": 12990 + }, + { + "epoch": 0.018266219612650764, + "grad_norm": 1.2678043842315674, + "learning_rate": 5.47857243220458e-05, + "loss": 6.03, + "step": 13000 + }, + { + "epoch": 0.01828027055081434, + "grad_norm": 1.397788405418396, + "learning_rate": 5.482787691443023e-05, + "loss": 5.9627, + "step": 13010 + }, + { + "epoch": 0.018294321488977916, + "grad_norm": 1.4957337379455566, + "learning_rate": 5.4870029506814663e-05, + "loss": 5.9015, + "step": 13020 + }, + { + "epoch": 0.018308372427141496, + "grad_norm": 1.3790498971939087, + "learning_rate": 5.4912182099199095e-05, + "loss": 5.9694, + "step": 13030 + }, + { + "epoch": 0.018322423365305072, + "grad_norm": 1.4078073501586914, + "learning_rate": 5.4954334691583526e-05, + "loss": 6.0007, + "step": 13040 + }, + { + "epoch": 
0.01833647430346865, + "grad_norm": 1.4309850931167603, + "learning_rate": 5.499648728396796e-05, + "loss": 5.8825, + "step": 13050 + }, + { + "epoch": 0.018350525241632228, + "grad_norm": 1.3303261995315552, + "learning_rate": 5.5038639876352395e-05, + "loss": 5.96, + "step": 13060 + }, + { + "epoch": 0.018364576179795804, + "grad_norm": 1.2958881855010986, + "learning_rate": 5.5080792468736826e-05, + "loss": 6.1374, + "step": 13070 + }, + { + "epoch": 0.018378627117959383, + "grad_norm": 1.2930759191513062, + "learning_rate": 5.512294506112125e-05, + "loss": 6.0608, + "step": 13080 + }, + { + "epoch": 0.01839267805612296, + "grad_norm": 1.3812742233276367, + "learning_rate": 5.516509765350569e-05, + "loss": 6.0503, + "step": 13090 + }, + { + "epoch": 0.018406728994286536, + "grad_norm": 1.5456600189208984, + "learning_rate": 5.520725024589012e-05, + "loss": 5.9067, + "step": 13100 + }, + { + "epoch": 0.018420779932450115, + "grad_norm": 1.3641170263290405, + "learning_rate": 5.5249402838274544e-05, + "loss": 6.0021, + "step": 13110 + }, + { + "epoch": 0.01843483087061369, + "grad_norm": 1.5792025327682495, + "learning_rate": 5.529155543065898e-05, + "loss": 5.9245, + "step": 13120 + }, + { + "epoch": 0.01844888180877727, + "grad_norm": 1.3822730779647827, + "learning_rate": 5.533370802304341e-05, + "loss": 5.9526, + "step": 13130 + }, + { + "epoch": 0.018462932746940847, + "grad_norm": 1.744850754737854, + "learning_rate": 5.5375860615427844e-05, + "loss": 6.0086, + "step": 13140 + }, + { + "epoch": 0.018476983685104423, + "grad_norm": 1.3281506299972534, + "learning_rate": 5.5418013207812275e-05, + "loss": 5.9487, + "step": 13150 + }, + { + "epoch": 0.018491034623268003, + "grad_norm": 1.401555061340332, + "learning_rate": 5.5460165800196707e-05, + "loss": 6.0319, + "step": 13160 + }, + { + "epoch": 0.01850508556143158, + "grad_norm": 1.3791970014572144, + "learning_rate": 5.550231839258114e-05, + "loss": 5.9704, + "step": 13170 + }, + { + "epoch": 
0.01851913649959516, + "grad_norm": 1.3170675039291382, + "learning_rate": 5.554447098496557e-05, + "loss": 5.9452, + "step": 13180 + }, + { + "epoch": 0.018533187437758735, + "grad_norm": 1.3928661346435547, + "learning_rate": 5.558662357735001e-05, + "loss": 5.9074, + "step": 13190 + }, + { + "epoch": 0.01854723837592231, + "grad_norm": 1.7144209146499634, + "learning_rate": 5.562877616973444e-05, + "loss": 5.9106, + "step": 13200 + }, + { + "epoch": 0.01856128931408589, + "grad_norm": 1.4539275169372559, + "learning_rate": 5.567092876211886e-05, + "loss": 6.0976, + "step": 13210 + }, + { + "epoch": 0.018575340252249467, + "grad_norm": 1.3310813903808594, + "learning_rate": 5.57130813545033e-05, + "loss": 6.0462, + "step": 13220 + }, + { + "epoch": 0.018589391190413043, + "grad_norm": 1.4120519161224365, + "learning_rate": 5.575523394688773e-05, + "loss": 5.9199, + "step": 13230 + }, + { + "epoch": 0.018603442128576623, + "grad_norm": 1.307513952255249, + "learning_rate": 5.5797386539272156e-05, + "loss": 5.9939, + "step": 13240 + }, + { + "epoch": 0.0186174930667402, + "grad_norm": 1.2844830751419067, + "learning_rate": 5.5839539131656594e-05, + "loss": 5.9712, + "step": 13250 + }, + { + "epoch": 0.01863154400490378, + "grad_norm": 1.4797707796096802, + "learning_rate": 5.5881691724041025e-05, + "loss": 6.0282, + "step": 13260 + }, + { + "epoch": 0.018645594943067355, + "grad_norm": 1.3347375392913818, + "learning_rate": 5.5923844316425456e-05, + "loss": 5.928, + "step": 13270 + }, + { + "epoch": 0.01865964588123093, + "grad_norm": 1.4141706228256226, + "learning_rate": 5.596599690880989e-05, + "loss": 5.965, + "step": 13280 + }, + { + "epoch": 0.01867369681939451, + "grad_norm": 1.3178112506866455, + "learning_rate": 5.600814950119432e-05, + "loss": 5.9698, + "step": 13290 + }, + { + "epoch": 0.018687747757558087, + "grad_norm": 1.3804385662078857, + "learning_rate": 5.605030209357875e-05, + "loss": 5.9723, + "step": 13300 + }, + { + "epoch": 
0.018701798695721666, + "grad_norm": 1.3772902488708496, + "learning_rate": 5.609245468596318e-05, + "loss": 5.9814, + "step": 13310 + }, + { + "epoch": 0.018715849633885243, + "grad_norm": 1.3835526704788208, + "learning_rate": 5.613460727834761e-05, + "loss": 6.0272, + "step": 13320 + }, + { + "epoch": 0.01872990057204882, + "grad_norm": 1.3807588815689087, + "learning_rate": 5.617675987073204e-05, + "loss": 6.0179, + "step": 13330 + }, + { + "epoch": 0.0187439515102124, + "grad_norm": 1.3371533155441284, + "learning_rate": 5.6218912463116474e-05, + "loss": 6.0128, + "step": 13340 + }, + { + "epoch": 0.018758002448375975, + "grad_norm": 1.4686000347137451, + "learning_rate": 5.626106505550091e-05, + "loss": 5.9024, + "step": 13350 + }, + { + "epoch": 0.018772053386539554, + "grad_norm": 1.7148891687393188, + "learning_rate": 5.6303217647885343e-05, + "loss": 6.0377, + "step": 13360 + }, + { + "epoch": 0.01878610432470313, + "grad_norm": 1.4156179428100586, + "learning_rate": 5.634537024026977e-05, + "loss": 6.0123, + "step": 13370 + }, + { + "epoch": 0.018800155262866707, + "grad_norm": 1.2572938203811646, + "learning_rate": 5.6387522832654206e-05, + "loss": 5.8405, + "step": 13380 + }, + { + "epoch": 0.018814206201030286, + "grad_norm": 1.3694168329238892, + "learning_rate": 5.642967542503864e-05, + "loss": 6.0598, + "step": 13390 + }, + { + "epoch": 0.018828257139193862, + "grad_norm": 1.4427056312561035, + "learning_rate": 5.647182801742306e-05, + "loss": 5.91, + "step": 13400 + }, + { + "epoch": 0.01884230807735744, + "grad_norm": 1.3949912786483765, + "learning_rate": 5.65139806098075e-05, + "loss": 5.941, + "step": 13410 + }, + { + "epoch": 0.018856359015521018, + "grad_norm": 2.0856781005859375, + "learning_rate": 5.655613320219193e-05, + "loss": 5.8934, + "step": 13420 + }, + { + "epoch": 0.018870409953684594, + "grad_norm": 1.295962929725647, + "learning_rate": 5.659828579457636e-05, + "loss": 6.0145, + "step": 13430 + }, + { + "epoch": 
0.018884460891848174, + "grad_norm": 1.350246787071228, + "learning_rate": 5.664043838696079e-05, + "loss": 5.9026, + "step": 13440 + }, + { + "epoch": 0.01889851183001175, + "grad_norm": 1.334167718887329, + "learning_rate": 5.6682590979345224e-05, + "loss": 6.0541, + "step": 13450 + }, + { + "epoch": 0.018912562768175326, + "grad_norm": 1.4749501943588257, + "learning_rate": 5.6724743571729655e-05, + "loss": 5.9257, + "step": 13460 + }, + { + "epoch": 0.018926613706338906, + "grad_norm": 1.4598968029022217, + "learning_rate": 5.6766896164114086e-05, + "loss": 5.9241, + "step": 13470 + }, + { + "epoch": 0.018940664644502482, + "grad_norm": 1.5799657106399536, + "learning_rate": 5.6809048756498524e-05, + "loss": 5.8404, + "step": 13480 + }, + { + "epoch": 0.01895471558266606, + "grad_norm": 1.455440878868103, + "learning_rate": 5.6851201348882955e-05, + "loss": 5.8688, + "step": 13490 + }, + { + "epoch": 0.018968766520829638, + "grad_norm": 1.424730896949768, + "learning_rate": 5.689335394126738e-05, + "loss": 5.9688, + "step": 13500 + }, + { + "epoch": 0.018982817458993214, + "grad_norm": 1.2700331211090088, + "learning_rate": 5.693550653365182e-05, + "loss": 5.9306, + "step": 13510 + }, + { + "epoch": 0.018996868397156794, + "grad_norm": 1.3546868562698364, + "learning_rate": 5.697765912603625e-05, + "loss": 5.9353, + "step": 13520 + }, + { + "epoch": 0.01901091933532037, + "grad_norm": 1.417958378791809, + "learning_rate": 5.701981171842067e-05, + "loss": 5.9517, + "step": 13530 + }, + { + "epoch": 0.019024970273483946, + "grad_norm": 1.3052090406417847, + "learning_rate": 5.706196431080511e-05, + "loss": 5.8374, + "step": 13540 + }, + { + "epoch": 0.019039021211647526, + "grad_norm": 1.298235297203064, + "learning_rate": 5.710411690318954e-05, + "loss": 5.9565, + "step": 13550 + }, + { + "epoch": 0.019053072149811102, + "grad_norm": 1.4889936447143555, + "learning_rate": 5.7146269495573974e-05, + "loss": 5.9748, + "step": 13560 + }, + { + "epoch": 
0.01906712308797468, + "grad_norm": 1.4061089754104614, + "learning_rate": 5.7188422087958405e-05, + "loss": 5.9171, + "step": 13570 + }, + { + "epoch": 0.019081174026138258, + "grad_norm": 1.4704773426055908, + "learning_rate": 5.7230574680342836e-05, + "loss": 6.0023, + "step": 13580 + }, + { + "epoch": 0.019095224964301834, + "grad_norm": 1.344780683517456, + "learning_rate": 5.727272727272727e-05, + "loss": 5.9356, + "step": 13590 + }, + { + "epoch": 0.019109275902465413, + "grad_norm": 1.3091164827346802, + "learning_rate": 5.73148798651117e-05, + "loss": 5.9137, + "step": 13600 + }, + { + "epoch": 0.01912332684062899, + "grad_norm": 1.3960378170013428, + "learning_rate": 5.735703245749613e-05, + "loss": 5.9779, + "step": 13610 + }, + { + "epoch": 0.01913737777879257, + "grad_norm": 1.5556848049163818, + "learning_rate": 5.739918504988056e-05, + "loss": 5.9581, + "step": 13620 + }, + { + "epoch": 0.019151428716956145, + "grad_norm": 1.3245570659637451, + "learning_rate": 5.744133764226499e-05, + "loss": 5.9721, + "step": 13630 + }, + { + "epoch": 0.01916547965511972, + "grad_norm": 1.578223705291748, + "learning_rate": 5.748349023464943e-05, + "loss": 5.9285, + "step": 13640 + }, + { + "epoch": 0.0191795305932833, + "grad_norm": 2.247985363006592, + "learning_rate": 5.752564282703386e-05, + "loss": 5.9884, + "step": 13650 + }, + { + "epoch": 0.019193581531446877, + "grad_norm": 1.6837748289108276, + "learning_rate": 5.7567795419418285e-05, + "loss": 6.0128, + "step": 13660 + }, + { + "epoch": 0.019207632469610457, + "grad_norm": 1.367041826248169, + "learning_rate": 5.760994801180272e-05, + "loss": 5.9926, + "step": 13670 + }, + { + "epoch": 0.019221683407774033, + "grad_norm": 1.4417003393173218, + "learning_rate": 5.7652100604187154e-05, + "loss": 5.9076, + "step": 13680 + }, + { + "epoch": 0.01923573434593761, + "grad_norm": 1.3726568222045898, + "learning_rate": 5.7694253196571586e-05, + "loss": 5.875, + "step": 13690 + }, + { + "epoch": 
0.01924978528410119, + "grad_norm": 1.3321431875228882, + "learning_rate": 5.773640578895602e-05, + "loss": 5.9552, + "step": 13700 + }, + { + "epoch": 0.019263836222264765, + "grad_norm": 1.372376561164856, + "learning_rate": 5.777855838134045e-05, + "loss": 5.955, + "step": 13710 + }, + { + "epoch": 0.01927788716042834, + "grad_norm": 1.4178439378738403, + "learning_rate": 5.782071097372488e-05, + "loss": 5.8555, + "step": 13720 + }, + { + "epoch": 0.01929193809859192, + "grad_norm": 1.3034266233444214, + "learning_rate": 5.786286356610931e-05, + "loss": 6.0031, + "step": 13730 + }, + { + "epoch": 0.019305989036755497, + "grad_norm": 1.7195543050765991, + "learning_rate": 5.790501615849374e-05, + "loss": 5.8395, + "step": 13740 + }, + { + "epoch": 0.019320039974919077, + "grad_norm": 1.3207449913024902, + "learning_rate": 5.794716875087817e-05, + "loss": 5.9347, + "step": 13750 + }, + { + "epoch": 0.019334090913082653, + "grad_norm": 1.3822189569473267, + "learning_rate": 5.7989321343262604e-05, + "loss": 5.8751, + "step": 13760 + }, + { + "epoch": 0.01934814185124623, + "grad_norm": 1.5890324115753174, + "learning_rate": 5.803147393564704e-05, + "loss": 5.8387, + "step": 13770 + }, + { + "epoch": 0.01936219278940981, + "grad_norm": 1.2956420183181763, + "learning_rate": 5.807362652803147e-05, + "loss": 5.8924, + "step": 13780 + }, + { + "epoch": 0.019376243727573385, + "grad_norm": 1.572546362876892, + "learning_rate": 5.81157791204159e-05, + "loss": 5.9325, + "step": 13790 + }, + { + "epoch": 0.019390294665736964, + "grad_norm": 1.4117943048477173, + "learning_rate": 5.8157931712800335e-05, + "loss": 5.8553, + "step": 13800 + }, + { + "epoch": 0.01940434560390054, + "grad_norm": 1.48485267162323, + "learning_rate": 5.8200084305184766e-05, + "loss": 5.8948, + "step": 13810 + }, + { + "epoch": 0.019418396542064117, + "grad_norm": 1.3272455930709839, + "learning_rate": 5.824223689756919e-05, + "loss": 5.8358, + "step": 13820 + }, + { + "epoch": 
0.019432447480227696, + "grad_norm": 1.4525662660598755, + "learning_rate": 5.828438948995363e-05, + "loss": 5.937, + "step": 13830 + }, + { + "epoch": 0.019446498418391273, + "grad_norm": 1.495578646659851, + "learning_rate": 5.832654208233806e-05, + "loss": 5.9296, + "step": 13840 + }, + { + "epoch": 0.01946054935655485, + "grad_norm": 1.4523495435714722, + "learning_rate": 5.836869467472249e-05, + "loss": 5.7838, + "step": 13850 + }, + { + "epoch": 0.01947460029471843, + "grad_norm": 1.3701502084732056, + "learning_rate": 5.841084726710692e-05, + "loss": 5.9265, + "step": 13860 + }, + { + "epoch": 0.019488651232882005, + "grad_norm": 1.435974359512329, + "learning_rate": 5.845299985949135e-05, + "loss": 5.7779, + "step": 13870 + }, + { + "epoch": 0.019502702171045584, + "grad_norm": 1.507411003112793, + "learning_rate": 5.8495152451875784e-05, + "loss": 6.0682, + "step": 13880 + }, + { + "epoch": 0.01951675310920916, + "grad_norm": 1.71627938747406, + "learning_rate": 5.8537305044260216e-05, + "loss": 5.8261, + "step": 13890 + }, + { + "epoch": 0.019530804047372737, + "grad_norm": 1.3306772708892822, + "learning_rate": 5.857945763664465e-05, + "loss": 5.879, + "step": 13900 + }, + { + "epoch": 0.019544854985536316, + "grad_norm": 1.433265209197998, + "learning_rate": 5.862161022902908e-05, + "loss": 5.8721, + "step": 13910 + }, + { + "epoch": 0.019558905923699892, + "grad_norm": 1.2996599674224854, + "learning_rate": 5.866376282141351e-05, + "loss": 6.0022, + "step": 13920 + }, + { + "epoch": 0.019572956861863472, + "grad_norm": 1.3266264200210571, + "learning_rate": 5.870591541379795e-05, + "loss": 5.8978, + "step": 13930 + }, + { + "epoch": 0.019587007800027048, + "grad_norm": 1.3822417259216309, + "learning_rate": 5.874806800618238e-05, + "loss": 5.9197, + "step": 13940 + }, + { + "epoch": 0.019601058738190624, + "grad_norm": 1.4425207376480103, + "learning_rate": 5.87902205985668e-05, + "loss": 5.8509, + "step": 13950 + }, + { + "epoch": 
0.019615109676354204, + "grad_norm": 1.3024529218673706, + "learning_rate": 5.883237319095124e-05, + "loss": 5.9852, + "step": 13960 + }, + { + "epoch": 0.01962916061451778, + "grad_norm": 1.3837809562683105, + "learning_rate": 5.887452578333567e-05, + "loss": 5.9071, + "step": 13970 + }, + { + "epoch": 0.01964321155268136, + "grad_norm": 1.37079918384552, + "learning_rate": 5.89166783757201e-05, + "loss": 5.9889, + "step": 13980 + }, + { + "epoch": 0.019657262490844936, + "grad_norm": 1.5206080675125122, + "learning_rate": 5.8958830968104534e-05, + "loss": 5.7997, + "step": 13990 + }, + { + "epoch": 0.019671313429008512, + "grad_norm": 1.3415592908859253, + "learning_rate": 5.9000983560488965e-05, + "loss": 5.9298, + "step": 14000 + }, + { + "epoch": 0.01968536436717209, + "grad_norm": 1.7559236288070679, + "learning_rate": 5.9043136152873396e-05, + "loss": 6.0002, + "step": 14010 + }, + { + "epoch": 0.019699415305335668, + "grad_norm": 1.3365631103515625, + "learning_rate": 5.908528874525783e-05, + "loss": 5.8709, + "step": 14020 + }, + { + "epoch": 0.019713466243499244, + "grad_norm": 1.3177831172943115, + "learning_rate": 5.912744133764226e-05, + "loss": 5.8005, + "step": 14030 + }, + { + "epoch": 0.019727517181662824, + "grad_norm": 1.3259848356246948, + "learning_rate": 5.916959393002669e-05, + "loss": 5.9384, + "step": 14040 + }, + { + "epoch": 0.0197415681198264, + "grad_norm": 1.5154435634613037, + "learning_rate": 5.921174652241112e-05, + "loss": 5.9317, + "step": 14050 + }, + { + "epoch": 0.01975561905798998, + "grad_norm": 1.3720771074295044, + "learning_rate": 5.925389911479556e-05, + "loss": 5.8961, + "step": 14060 + }, + { + "epoch": 0.019769669996153556, + "grad_norm": 1.6641851663589478, + "learning_rate": 5.9296051707179983e-05, + "loss": 5.9034, + "step": 14070 + }, + { + "epoch": 0.019783720934317132, + "grad_norm": 1.3994899988174438, + "learning_rate": 5.9338204299564415e-05, + "loss": 5.8884, + "step": 14080 + }, + { + "epoch": 
0.01979777187248071, + "grad_norm": 1.3691990375518799, + "learning_rate": 5.938035689194885e-05, + "loss": 5.8671, + "step": 14090 + }, + { + "epoch": 0.019811822810644288, + "grad_norm": 1.7563625574111938, + "learning_rate": 5.9422509484333284e-05, + "loss": 5.8736, + "step": 14100 + }, + { + "epoch": 0.019825873748807867, + "grad_norm": 1.3554989099502563, + "learning_rate": 5.946466207671771e-05, + "loss": 5.9775, + "step": 14110 + }, + { + "epoch": 0.019839924686971443, + "grad_norm": 1.4109904766082764, + "learning_rate": 5.9506814669102146e-05, + "loss": 5.9443, + "step": 14120 + }, + { + "epoch": 0.01985397562513502, + "grad_norm": 1.383750081062317, + "learning_rate": 5.954896726148658e-05, + "loss": 5.855, + "step": 14130 + }, + { + "epoch": 0.0198680265632986, + "grad_norm": 1.4050294160842896, + "learning_rate": 5.959111985387101e-05, + "loss": 5.9805, + "step": 14140 + }, + { + "epoch": 0.019882077501462175, + "grad_norm": 1.4362891912460327, + "learning_rate": 5.963327244625544e-05, + "loss": 5.8024, + "step": 14150 + }, + { + "epoch": 0.019896128439625755, + "grad_norm": 1.4630895853042603, + "learning_rate": 5.967542503863987e-05, + "loss": 5.9194, + "step": 14160 + }, + { + "epoch": 0.01991017937778933, + "grad_norm": 1.3018559217453003, + "learning_rate": 5.97175776310243e-05, + "loss": 5.9572, + "step": 14170 + }, + { + "epoch": 0.019924230315952907, + "grad_norm": 1.3787480592727661, + "learning_rate": 5.975973022340873e-05, + "loss": 5.9111, + "step": 14180 + }, + { + "epoch": 0.019938281254116487, + "grad_norm": 1.5770620107650757, + "learning_rate": 5.980188281579317e-05, + "loss": 5.949, + "step": 14190 + }, + { + "epoch": 0.019952332192280063, + "grad_norm": 1.32578706741333, + "learning_rate": 5.9844035408177595e-05, + "loss": 5.8478, + "step": 14200 + }, + { + "epoch": 0.01996638313044364, + "grad_norm": 1.3311973810195923, + "learning_rate": 5.9886188000562027e-05, + "loss": 5.7895, + "step": 14210 + }, + { + "epoch": 
0.01998043406860722, + "grad_norm": 1.4675605297088623, + "learning_rate": 5.9928340592946464e-05, + "loss": 5.8426, + "step": 14220 + }, + { + "epoch": 0.019994485006770795, + "grad_norm": 1.389006495475769, + "learning_rate": 5.9970493185330896e-05, + "loss": 5.8199, + "step": 14230 + }, + { + "epoch": 0.020008535944934375, + "grad_norm": 1.7180744409561157, + "learning_rate": 6.001264577771532e-05, + "loss": 5.8948, + "step": 14240 + }, + { + "epoch": 0.02002258688309795, + "grad_norm": 1.4792721271514893, + "learning_rate": 6.005479837009976e-05, + "loss": 5.8289, + "step": 14250 + }, + { + "epoch": 0.020036637821261527, + "grad_norm": 1.3706181049346924, + "learning_rate": 6.009695096248419e-05, + "loss": 5.9004, + "step": 14260 + }, + { + "epoch": 0.020050688759425107, + "grad_norm": 1.3976632356643677, + "learning_rate": 6.013910355486862e-05, + "loss": 5.837, + "step": 14270 + }, + { + "epoch": 0.020064739697588683, + "grad_norm": 1.3973368406295776, + "learning_rate": 6.018125614725305e-05, + "loss": 5.9126, + "step": 14280 + }, + { + "epoch": 0.020078790635752262, + "grad_norm": 1.5715432167053223, + "learning_rate": 6.022340873963748e-05, + "loss": 6.0113, + "step": 14290 + }, + { + "epoch": 0.02009284157391584, + "grad_norm": 1.3540730476379395, + "learning_rate": 6.0265561332021914e-05, + "loss": 5.9477, + "step": 14300 + }, + { + "epoch": 0.020106892512079415, + "grad_norm": 1.3370450735092163, + "learning_rate": 6.0307713924406345e-05, + "loss": 5.8446, + "step": 14310 + }, + { + "epoch": 0.020120943450242994, + "grad_norm": 1.3597146272659302, + "learning_rate": 6.0349866516790776e-05, + "loss": 5.8455, + "step": 14320 + }, + { + "epoch": 0.02013499438840657, + "grad_norm": 1.4030274152755737, + "learning_rate": 6.039201910917521e-05, + "loss": 5.8089, + "step": 14330 + }, + { + "epoch": 0.020149045326570147, + "grad_norm": 1.470107913017273, + "learning_rate": 6.043417170155964e-05, + "loss": 5.8458, + "step": 14340 + }, + { + "epoch": 
0.020163096264733726, + "grad_norm": 1.3324445486068726, + "learning_rate": 6.0476324293944076e-05, + "loss": 5.9478, + "step": 14350 + }, + { + "epoch": 0.020177147202897303, + "grad_norm": 1.3293122053146362, + "learning_rate": 6.05184768863285e-05, + "loss": 5.8075, + "step": 14360 + }, + { + "epoch": 0.020191198141060882, + "grad_norm": 1.3335965871810913, + "learning_rate": 6.056062947871293e-05, + "loss": 6.0072, + "step": 14370 + }, + { + "epoch": 0.02020524907922446, + "grad_norm": 1.4222981929779053, + "learning_rate": 6.060278207109737e-05, + "loss": 5.9405, + "step": 14380 + }, + { + "epoch": 0.020219300017388035, + "grad_norm": 1.292315125465393, + "learning_rate": 6.06449346634818e-05, + "loss": 5.959, + "step": 14390 + }, + { + "epoch": 0.020233350955551614, + "grad_norm": 1.3387219905853271, + "learning_rate": 6.0687087255866225e-05, + "loss": 5.9308, + "step": 14400 + }, + { + "epoch": 0.02024740189371519, + "grad_norm": 1.4554129838943481, + "learning_rate": 6.0729239848250663e-05, + "loss": 5.8909, + "step": 14410 + }, + { + "epoch": 0.02026145283187877, + "grad_norm": null, + "learning_rate": 6.0771392440635095e-05, + "loss": 5.927, + "step": 14420 + }, + { + "epoch": 0.020275503770042346, + "grad_norm": 1.4156428575515747, + "learning_rate": 6.0809329773781084e-05, + "loss": 5.964, + "step": 14430 + }, + { + "epoch": 0.020289554708205922, + "grad_norm": 1.5934478044509888, + "learning_rate": 6.0851482366165515e-05, + "loss": 5.7974, + "step": 14440 + }, + { + "epoch": 0.020303605646369502, + "grad_norm": 1.6974800825119019, + "learning_rate": 6.089363495854994e-05, + "loss": 5.913, + "step": 14450 + }, + { + "epoch": 0.020317656584533078, + "grad_norm": 1.4129645824432373, + "learning_rate": 6.093578755093438e-05, + "loss": 5.8084, + "step": 14460 + }, + { + "epoch": 0.020331707522696658, + "grad_norm": 1.3150914907455444, + "learning_rate": 6.097794014331881e-05, + "loss": 5.8431, + "step": 14470 + }, + { + "epoch": 0.020345758460860234, + 
"grad_norm": 1.4848755598068237, + "learning_rate": 6.1020092735703247e-05, + "loss": 5.8406, + "step": 14480 + }, + { + "epoch": 0.02035980939902381, + "grad_norm": 1.348905324935913, + "learning_rate": 6.106224532808767e-05, + "loss": 5.9378, + "step": 14490 + }, + { + "epoch": 0.02037386033718739, + "grad_norm": 1.266818881034851, + "learning_rate": 6.11043979204721e-05, + "loss": 5.8756, + "step": 14500 + }, + { + "epoch": 0.020387911275350966, + "grad_norm": 1.3369197845458984, + "learning_rate": 6.114655051285653e-05, + "loss": 5.8134, + "step": 14510 + }, + { + "epoch": 0.020401962213514542, + "grad_norm": 1.5117855072021484, + "learning_rate": 6.118870310524096e-05, + "loss": 5.8981, + "step": 14520 + }, + { + "epoch": 0.02041601315167812, + "grad_norm": 1.425666332244873, + "learning_rate": 6.12308556976254e-05, + "loss": 5.9648, + "step": 14530 + }, + { + "epoch": 0.020430064089841698, + "grad_norm": 1.3908796310424805, + "learning_rate": 6.127300829000983e-05, + "loss": 5.7853, + "step": 14540 + }, + { + "epoch": 0.020444115028005277, + "grad_norm": 1.3612520694732666, + "learning_rate": 6.131516088239426e-05, + "loss": 5.7987, + "step": 14550 + }, + { + "epoch": 0.020458165966168854, + "grad_norm": 1.5223125219345093, + "learning_rate": 6.13573134747787e-05, + "loss": 5.9002, + "step": 14560 + }, + { + "epoch": 0.02047221690433243, + "grad_norm": 1.5592213869094849, + "learning_rate": 6.139946606716312e-05, + "loss": 5.867, + "step": 14570 + }, + { + "epoch": 0.02048626784249601, + "grad_norm": 1.5414535999298096, + "learning_rate": 6.144161865954755e-05, + "loss": 5.7891, + "step": 14580 + }, + { + "epoch": 0.020500318780659586, + "grad_norm": 1.3643969297409058, + "learning_rate": 6.1483771251932e-05, + "loss": 5.8342, + "step": 14590 + }, + { + "epoch": 0.020514369718823165, + "grad_norm": 1.3756287097930908, + "learning_rate": 6.152592384431641e-05, + "loss": 5.9926, + "step": 14600 + }, + { + "epoch": 0.02052842065698674, + "grad_norm": 
1.461431622505188, + "learning_rate": 6.156807643670086e-05, + "loss": 5.8282, + "step": 14610 + }, + { + "epoch": 0.020542471595150318, + "grad_norm": 1.5052664279937744, + "learning_rate": 6.161022902908529e-05, + "loss": 5.867, + "step": 14620 + }, + { + "epoch": 0.020556522533313897, + "grad_norm": 1.815929651260376, + "learning_rate": 6.165238162146971e-05, + "loss": 5.875, + "step": 14630 + }, + { + "epoch": 0.020570573471477473, + "grad_norm": 1.991647481918335, + "learning_rate": 6.169453421385415e-05, + "loss": 5.767, + "step": 14640 + }, + { + "epoch": 0.02058462440964105, + "grad_norm": 1.380648136138916, + "learning_rate": 6.173668680623858e-05, + "loss": 5.9249, + "step": 14650 + }, + { + "epoch": 0.02059867534780463, + "grad_norm": 1.3880946636199951, + "learning_rate": 6.1778839398623e-05, + "loss": 6.0072, + "step": 14660 + }, + { + "epoch": 0.020612726285968205, + "grad_norm": 1.4181009531021118, + "learning_rate": 6.182099199100745e-05, + "loss": 5.7029, + "step": 14670 + }, + { + "epoch": 0.020626777224131785, + "grad_norm": 1.4614509344100952, + "learning_rate": 6.186314458339188e-05, + "loss": 5.778, + "step": 14680 + }, + { + "epoch": 0.02064082816229536, + "grad_norm": 1.4017057418823242, + "learning_rate": 6.190529717577631e-05, + "loss": 5.7973, + "step": 14690 + }, + { + "epoch": 0.020654879100458937, + "grad_norm": 1.4342793226242065, + "learning_rate": 6.194744976816074e-05, + "loss": 5.8359, + "step": 14700 + }, + { + "epoch": 0.020668930038622517, + "grad_norm": 1.4066189527511597, + "learning_rate": 6.198960236054517e-05, + "loss": 5.8003, + "step": 14710 + }, + { + "epoch": 0.020682980976786093, + "grad_norm": 1.3695813417434692, + "learning_rate": 6.20317549529296e-05, + "loss": 5.9072, + "step": 14720 + }, + { + "epoch": 0.020697031914949673, + "grad_norm": 1.7797951698303223, + "learning_rate": 6.207390754531403e-05, + "loss": 5.772, + "step": 14730 + }, + { + "epoch": 0.02071108285311325, + "grad_norm": 1.340478539466858, + 
"learning_rate": 6.211606013769846e-05, + "loss": 5.9631, + "step": 14740 + }, + { + "epoch": 0.020725133791276825, + "grad_norm": 1.3329967260360718, + "learning_rate": 6.21582127300829e-05, + "loss": 5.899, + "step": 14750 + }, + { + "epoch": 0.020739184729440405, + "grad_norm": 1.4106431007385254, + "learning_rate": 6.220036532246733e-05, + "loss": 5.9145, + "step": 14760 + }, + { + "epoch": 0.02075323566760398, + "grad_norm": 1.4221521615982056, + "learning_rate": 6.224251791485176e-05, + "loss": 5.9208, + "step": 14770 + }, + { + "epoch": 0.02076728660576756, + "grad_norm": 1.3601226806640625, + "learning_rate": 6.228467050723619e-05, + "loss": 5.8966, + "step": 14780 + }, + { + "epoch": 0.020781337543931137, + "grad_norm": 1.3591755628585815, + "learning_rate": 6.232682309962062e-05, + "loss": 5.7644, + "step": 14790 + }, + { + "epoch": 0.020795388482094713, + "grad_norm": 1.4168270826339722, + "learning_rate": 6.236897569200505e-05, + "loss": 5.8457, + "step": 14800 + }, + { + "epoch": 0.020809439420258292, + "grad_norm": 1.443789005279541, + "learning_rate": 6.241112828438948e-05, + "loss": 5.8221, + "step": 14810 + }, + { + "epoch": 0.02082349035842187, + "grad_norm": 1.4544609785079956, + "learning_rate": 6.245328087677393e-05, + "loss": 5.8429, + "step": 14820 + }, + { + "epoch": 0.020837541296585445, + "grad_norm": 1.2243056297302246, + "learning_rate": 6.249543346915834e-05, + "loss": 5.8578, + "step": 14830 + }, + { + "epoch": 0.020851592234749024, + "grad_norm": 1.3254730701446533, + "learning_rate": 6.253758606154278e-05, + "loss": 5.9038, + "step": 14840 + }, + { + "epoch": 0.0208656431729126, + "grad_norm": 1.5144115686416626, + "learning_rate": 6.257973865392722e-05, + "loss": 5.81, + "step": 14850 + }, + { + "epoch": 0.02087969411107618, + "grad_norm": 1.35421621799469, + "learning_rate": 6.262189124631164e-05, + "loss": 5.8403, + "step": 14860 + }, + { + "epoch": 0.020893745049239756, + "grad_norm": 1.3334990739822388, + "learning_rate": 
6.266404383869607e-05, + "loss": 5.7432, + "step": 14870 + }, + { + "epoch": 0.020907795987403333, + "grad_norm": 1.3845977783203125, + "learning_rate": 6.270619643108051e-05, + "loss": 5.7457, + "step": 14880 + }, + { + "epoch": 0.020921846925566912, + "grad_norm": 1.606353521347046, + "learning_rate": 6.274834902346493e-05, + "loss": 5.7664, + "step": 14890 + }, + { + "epoch": 0.02093589786373049, + "grad_norm": 1.4412970542907715, + "learning_rate": 6.279050161584938e-05, + "loss": 5.8228, + "step": 14900 + }, + { + "epoch": 0.020949948801894068, + "grad_norm": 1.57476007938385, + "learning_rate": 6.283265420823381e-05, + "loss": 5.8189, + "step": 14910 + }, + { + "epoch": 0.020963999740057644, + "grad_norm": 1.4712239503860474, + "learning_rate": 6.287480680061822e-05, + "loss": 5.8303, + "step": 14920 + }, + { + "epoch": 0.02097805067822122, + "grad_norm": 1.3110334873199463, + "learning_rate": 6.291695939300267e-05, + "loss": 5.8904, + "step": 14930 + }, + { + "epoch": 0.0209921016163848, + "grad_norm": 1.3854535818099976, + "learning_rate": 6.29591119853871e-05, + "loss": 5.7378, + "step": 14940 + }, + { + "epoch": 0.021006152554548376, + "grad_norm": 1.5063308477401733, + "learning_rate": 6.300126457777152e-05, + "loss": 5.8302, + "step": 14950 + }, + { + "epoch": 0.021020203492711952, + "grad_norm": 1.347753643989563, + "learning_rate": 6.304341717015596e-05, + "loss": 5.8284, + "step": 14960 + }, + { + "epoch": 0.021034254430875532, + "grad_norm": 1.7087639570236206, + "learning_rate": 6.30855697625404e-05, + "loss": 5.9404, + "step": 14970 + }, + { + "epoch": 0.021048305369039108, + "grad_norm": 1.4151090383529663, + "learning_rate": 6.312772235492483e-05, + "loss": 5.8551, + "step": 14980 + }, + { + "epoch": 0.021062356307202688, + "grad_norm": 1.3254867792129517, + "learning_rate": 6.316987494730926e-05, + "loss": 5.8843, + "step": 14990 + }, + { + "epoch": 0.021076407245366264, + "grad_norm": 1.3656538724899292, + "learning_rate": 
6.321202753969369e-05, + "loss": 5.8179, + "step": 15000 + }, + { + "epoch": 0.02109045818352984, + "grad_norm": 1.3731048107147217, + "learning_rate": 6.325418013207812e-05, + "loss": 5.776, + "step": 15010 + }, + { + "epoch": 0.02110450912169342, + "grad_norm": 1.6539822816848755, + "learning_rate": 6.329633272446255e-05, + "loss": 5.796, + "step": 15020 + }, + { + "epoch": 0.021118560059856996, + "grad_norm": 1.6321409940719604, + "learning_rate": 6.333848531684698e-05, + "loss": 5.7292, + "step": 15030 + }, + { + "epoch": 0.021132610998020575, + "grad_norm": 1.516080379486084, + "learning_rate": 6.338063790923141e-05, + "loss": 5.7586, + "step": 15040 + }, + { + "epoch": 0.02114666193618415, + "grad_norm": 1.3451189994812012, + "learning_rate": 6.342279050161584e-05, + "loss": 5.7364, + "step": 15050 + }, + { + "epoch": 0.021160712874347728, + "grad_norm": 1.4270164966583252, + "learning_rate": 6.346494309400027e-05, + "loss": 5.7612, + "step": 15060 + }, + { + "epoch": 0.021174763812511307, + "grad_norm": 1.353387475013733, + "learning_rate": 6.35070956863847e-05, + "loss": 5.8736, + "step": 15070 + }, + { + "epoch": 0.021188814750674884, + "grad_norm": 1.4181205034255981, + "learning_rate": 6.354924827876914e-05, + "loss": 5.9262, + "step": 15080 + }, + { + "epoch": 0.021202865688838463, + "grad_norm": 1.3350406885147095, + "learning_rate": 6.359140087115357e-05, + "loss": 5.843, + "step": 15090 + }, + { + "epoch": 0.02121691662700204, + "grad_norm": 1.676595687866211, + "learning_rate": 6.3633553463538e-05, + "loss": 5.806, + "step": 15100 + }, + { + "epoch": 0.021230967565165616, + "grad_norm": 1.579698920249939, + "learning_rate": 6.367570605592244e-05, + "loss": 5.8444, + "step": 15110 + }, + { + "epoch": 0.021245018503329195, + "grad_norm": 1.5201925039291382, + "learning_rate": 6.371785864830686e-05, + "loss": 5.948, + "step": 15120 + }, + { + "epoch": 0.02125906944149277, + "grad_norm": 1.3935141563415527, + "learning_rate": 6.376001124069129e-05, + 
"loss": 5.8822, + "step": 15130 + }, + { + "epoch": 0.021273120379656348, + "grad_norm": 1.358180284500122, + "learning_rate": 6.380216383307574e-05, + "loss": 5.8469, + "step": 15140 + }, + { + "epoch": 0.021287171317819927, + "grad_norm": 1.4216656684875488, + "learning_rate": 6.384431642546016e-05, + "loss": 5.8851, + "step": 15150 + }, + { + "epoch": 0.021301222255983503, + "grad_norm": 1.40565025806427, + "learning_rate": 6.388646901784459e-05, + "loss": 5.7583, + "step": 15160 + }, + { + "epoch": 0.021315273194147083, + "grad_norm": 1.30914306640625, + "learning_rate": 6.392862161022903e-05, + "loss": 5.9395, + "step": 15170 + }, + { + "epoch": 0.02132932413231066, + "grad_norm": 1.4431228637695312, + "learning_rate": 6.397077420261345e-05, + "loss": 5.776, + "step": 15180 + }, + { + "epoch": 0.021343375070474235, + "grad_norm": 1.397368311882019, + "learning_rate": 6.40129267949979e-05, + "loss": 5.7714, + "step": 15190 + }, + { + "epoch": 0.021357426008637815, + "grad_norm": 1.303511142730713, + "learning_rate": 6.405507938738232e-05, + "loss": 5.8594, + "step": 15200 + }, + { + "epoch": 0.02137147694680139, + "grad_norm": 1.359408974647522, + "learning_rate": 6.409723197976674e-05, + "loss": 5.7992, + "step": 15210 + }, + { + "epoch": 0.02138552788496497, + "grad_norm": 1.2777255773544312, + "learning_rate": 6.413938457215119e-05, + "loss": 5.8816, + "step": 15220 + }, + { + "epoch": 0.021399578823128547, + "grad_norm": 1.7330981492996216, + "learning_rate": 6.418153716453562e-05, + "loss": 5.7651, + "step": 15230 + }, + { + "epoch": 0.021413629761292123, + "grad_norm": 1.324683427810669, + "learning_rate": 6.42194744976816e-05, + "loss": 5.7404, + "step": 15240 + }, + { + "epoch": 0.021427680699455703, + "grad_norm": 1.3045562505722046, + "learning_rate": 6.426162709006603e-05, + "loss": 5.7965, + "step": 15250 + }, + { + "epoch": 0.02144173163761928, + "grad_norm": 1.3371610641479492, + "learning_rate": 6.430377968245046e-05, + "loss": 5.8492, + "step": 
15260 + }, + { + "epoch": 0.02145578257578286, + "grad_norm": 1.61046302318573, + "learning_rate": 6.43459322748349e-05, + "loss": 5.8089, + "step": 15270 + }, + { + "epoch": 0.021469833513946435, + "grad_norm": 1.5446206331253052, + "learning_rate": 6.438808486721933e-05, + "loss": 5.8823, + "step": 15280 + }, + { + "epoch": 0.02148388445211001, + "grad_norm": 1.337916612625122, + "learning_rate": 6.443023745960376e-05, + "loss": 5.8088, + "step": 15290 + }, + { + "epoch": 0.02149793539027359, + "grad_norm": 1.5150680541992188, + "learning_rate": 6.447239005198819e-05, + "loss": 5.7181, + "step": 15300 + }, + { + "epoch": 0.021511986328437167, + "grad_norm": 1.3614176511764526, + "learning_rate": 6.451454264437262e-05, + "loss": 5.8309, + "step": 15310 + }, + { + "epoch": 0.021526037266600743, + "grad_norm": 1.357839584350586, + "learning_rate": 6.455669523675706e-05, + "loss": 5.8946, + "step": 15320 + }, + { + "epoch": 0.021540088204764322, + "grad_norm": 1.4024747610092163, + "learning_rate": 6.459884782914148e-05, + "loss": 5.8192, + "step": 15330 + }, + { + "epoch": 0.0215541391429279, + "grad_norm": 1.4322363138198853, + "learning_rate": 6.464100042152591e-05, + "loss": 5.6945, + "step": 15340 + }, + { + "epoch": 0.021568190081091478, + "grad_norm": 1.4403985738754272, + "learning_rate": 6.468315301391036e-05, + "loss": 5.926, + "step": 15350 + }, + { + "epoch": 0.021582241019255054, + "grad_norm": 1.593833565711975, + "learning_rate": 6.472530560629477e-05, + "loss": 5.8134, + "step": 15360 + }, + { + "epoch": 0.02159629195741863, + "grad_norm": 1.4765543937683105, + "learning_rate": 6.47674581986792e-05, + "loss": 5.8854, + "step": 15370 + }, + { + "epoch": 0.02161034289558221, + "grad_norm": 1.3382881879806519, + "learning_rate": 6.480961079106365e-05, + "loss": 5.8777, + "step": 15380 + }, + { + "epoch": 0.021624393833745786, + "grad_norm": 1.6510887145996094, + "learning_rate": 6.485176338344807e-05, + "loss": 5.7864, + "step": 15390 + }, + { + "epoch": 
0.021638444771909366, + "grad_norm": 1.3946115970611572, + "learning_rate": 6.489391597583251e-05, + "loss": 5.7041, + "step": 15400 + }, + { + "epoch": 0.021652495710072942, + "grad_norm": 1.3984819650650024, + "learning_rate": 6.493606856821694e-05, + "loss": 5.7186, + "step": 15410 + }, + { + "epoch": 0.02166654664823652, + "grad_norm": 1.9340324401855469, + "learning_rate": 6.497822116060136e-05, + "loss": 5.7089, + "step": 15420 + }, + { + "epoch": 0.021680597586400098, + "grad_norm": 1.4083871841430664, + "learning_rate": 6.50203737529858e-05, + "loss": 5.8155, + "step": 15430 + }, + { + "epoch": 0.021694648524563674, + "grad_norm": 1.389730453491211, + "learning_rate": 6.506252634537024e-05, + "loss": 5.7463, + "step": 15440 + }, + { + "epoch": 0.02170869946272725, + "grad_norm": 1.2647806406021118, + "learning_rate": 6.510467893775467e-05, + "loss": 5.8177, + "step": 15450 + }, + { + "epoch": 0.02172275040089083, + "grad_norm": 1.4326426982879639, + "learning_rate": 6.51468315301391e-05, + "loss": 5.7414, + "step": 15460 + }, + { + "epoch": 0.021736801339054406, + "grad_norm": 1.3881163597106934, + "learning_rate": 6.518898412252353e-05, + "loss": 5.624, + "step": 15470 + }, + { + "epoch": 0.021750852277217986, + "grad_norm": 1.3433114290237427, + "learning_rate": 6.523113671490796e-05, + "loss": 5.7344, + "step": 15480 + }, + { + "epoch": 0.021764903215381562, + "grad_norm": 1.3703086376190186, + "learning_rate": 6.52732893072924e-05, + "loss": 5.7321, + "step": 15490 + }, + { + "epoch": 0.021778954153545138, + "grad_norm": 1.3004043102264404, + "learning_rate": 6.531544189967682e-05, + "loss": 5.7776, + "step": 15500 + }, + { + "epoch": 0.021793005091708718, + "grad_norm": 1.3912044763565063, + "learning_rate": 6.535759449206126e-05, + "loss": 5.75, + "step": 15510 + }, + { + "epoch": 0.021807056029872294, + "grad_norm": 1.3019267320632935, + "learning_rate": 6.539974708444569e-05, + "loss": 5.8134, + "step": 15520 + }, + { + "epoch": 
0.021821106968035874, + "grad_norm": 1.2605361938476562, + "learning_rate": 6.544189967683012e-05, + "loss": 5.7915, + "step": 15530 + }, + { + "epoch": 0.02183515790619945, + "grad_norm": 1.4486380815505981, + "learning_rate": 6.548405226921455e-05, + "loss": 5.8089, + "step": 15540 + }, + { + "epoch": 0.021849208844363026, + "grad_norm": 1.4889944791793823, + "learning_rate": 6.552620486159898e-05, + "loss": 5.8072, + "step": 15550 + }, + { + "epoch": 0.021863259782526605, + "grad_norm": 1.389483094215393, + "learning_rate": 6.556835745398341e-05, + "loss": 5.8426, + "step": 15560 + }, + { + "epoch": 0.02187731072069018, + "grad_norm": 1.465795874595642, + "learning_rate": 6.561051004636784e-05, + "loss": 5.7026, + "step": 15570 + }, + { + "epoch": 0.02189136165885376, + "grad_norm": 1.4371973276138306, + "learning_rate": 6.565266263875227e-05, + "loss": 5.8007, + "step": 15580 + }, + { + "epoch": 0.021905412597017337, + "grad_norm": 1.3100478649139404, + "learning_rate": 6.56948152311367e-05, + "loss": 5.7882, + "step": 15590 + }, + { + "epoch": 0.021919463535180914, + "grad_norm": 1.3720271587371826, + "learning_rate": 6.573696782352114e-05, + "loss": 5.7386, + "step": 15600 + }, + { + "epoch": 0.021933514473344493, + "grad_norm": 1.6056287288665771, + "learning_rate": 6.577912041590558e-05, + "loss": 5.7643, + "step": 15610 + }, + { + "epoch": 0.02194756541150807, + "grad_norm": 1.4117242097854614, + "learning_rate": 6.582127300829e-05, + "loss": 5.8054, + "step": 15620 + }, + { + "epoch": 0.021961616349671646, + "grad_norm": 1.3101048469543457, + "learning_rate": 6.586342560067443e-05, + "loss": 5.8407, + "step": 15630 + }, + { + "epoch": 0.021975667287835225, + "grad_norm": 1.535385251045227, + "learning_rate": 6.590557819305887e-05, + "loss": 5.7486, + "step": 15640 + }, + { + "epoch": 0.0219897182259988, + "grad_norm": 1.3539336919784546, + "learning_rate": 6.594773078544329e-05, + "loss": 5.7473, + "step": 15650 + }, + { + "epoch": 0.02200376916416238, + 
"grad_norm": 1.5434645414352417, + "learning_rate": 6.598988337782774e-05, + "loss": 5.8255, + "step": 15660 + }, + { + "epoch": 0.022017820102325957, + "grad_norm": 1.3859068155288696, + "learning_rate": 6.603203597021217e-05, + "loss": 5.8002, + "step": 15670 + }, + { + "epoch": 0.022031871040489533, + "grad_norm": 1.397512674331665, + "learning_rate": 6.607418856259659e-05, + "loss": 5.8374, + "step": 15680 + }, + { + "epoch": 0.022045921978653113, + "grad_norm": 1.4506967067718506, + "learning_rate": 6.611634115498103e-05, + "loss": 5.8459, + "step": 15690 + }, + { + "epoch": 0.02205997291681669, + "grad_norm": 1.3882638216018677, + "learning_rate": 6.615849374736546e-05, + "loss": 5.8247, + "step": 15700 + }, + { + "epoch": 0.02207402385498027, + "grad_norm": 1.308134913444519, + "learning_rate": 6.620064633974988e-05, + "loss": 5.7157, + "step": 15710 + }, + { + "epoch": 0.022088074793143845, + "grad_norm": 1.4445452690124512, + "learning_rate": 6.624279893213432e-05, + "loss": 5.7608, + "step": 15720 + }, + { + "epoch": 0.02210212573130742, + "grad_norm": 1.362390398979187, + "learning_rate": 6.628495152451875e-05, + "loss": 5.9047, + "step": 15730 + }, + { + "epoch": 0.022116176669471, + "grad_norm": 1.2991571426391602, + "learning_rate": 6.632710411690319e-05, + "loss": 5.7554, + "step": 15740 + }, + { + "epoch": 0.022130227607634577, + "grad_norm": 1.328895926475525, + "learning_rate": 6.636925670928762e-05, + "loss": 5.7317, + "step": 15750 + }, + { + "epoch": 0.022144278545798153, + "grad_norm": 1.303283929824829, + "learning_rate": 6.641140930167205e-05, + "loss": 5.7671, + "step": 15760 + }, + { + "epoch": 0.022158329483961733, + "grad_norm": 1.358275055885315, + "learning_rate": 6.645356189405648e-05, + "loss": 5.7322, + "step": 15770 + }, + { + "epoch": 0.02217238042212531, + "grad_norm": 1.4214792251586914, + "learning_rate": 6.649571448644091e-05, + "loss": 5.6888, + "step": 15780 + }, + { + "epoch": 0.02218643136028889, + "grad_norm": 
1.3279885053634644, + "learning_rate": 6.653786707882534e-05, + "loss": 5.7591, + "step": 15790 + }, + { + "epoch": 0.022200482298452465, + "grad_norm": 1.3507026433944702, + "learning_rate": 6.658001967120977e-05, + "loss": 5.7571, + "step": 15800 + }, + { + "epoch": 0.02221453323661604, + "grad_norm": 1.323520541191101, + "learning_rate": 6.66221722635942e-05, + "loss": 5.8356, + "step": 15810 + }, + { + "epoch": 0.02222858417477962, + "grad_norm": 1.2954471111297607, + "learning_rate": 6.666432485597864e-05, + "loss": 5.9235, + "step": 15820 + }, + { + "epoch": 0.022242635112943197, + "grad_norm": 1.3912978172302246, + "learning_rate": 6.670647744836307e-05, + "loss": 5.7872, + "step": 15830 + }, + { + "epoch": 0.022256686051106776, + "grad_norm": 1.5537766218185425, + "learning_rate": 6.67486300407475e-05, + "loss": 5.7668, + "step": 15840 + }, + { + "epoch": 0.022270736989270352, + "grad_norm": 1.3550326824188232, + "learning_rate": 6.679078263313193e-05, + "loss": 5.7059, + "step": 15850 + }, + { + "epoch": 0.02228478792743393, + "grad_norm": 1.3379523754119873, + "learning_rate": 6.683293522551636e-05, + "loss": 5.7736, + "step": 15860 + }, + { + "epoch": 0.022298838865597508, + "grad_norm": 1.4727954864501953, + "learning_rate": 6.687508781790079e-05, + "loss": 5.687, + "step": 15870 + }, + { + "epoch": 0.022312889803761084, + "grad_norm": 1.5783716440200806, + "learning_rate": 6.691724041028522e-05, + "loss": 5.6273, + "step": 15880 + }, + { + "epoch": 0.022326940741924664, + "grad_norm": 1.2996593713760376, + "learning_rate": 6.695939300266965e-05, + "loss": 5.809, + "step": 15890 + }, + { + "epoch": 0.02234099168008824, + "grad_norm": 1.3592562675476074, + "learning_rate": 6.70015455950541e-05, + "loss": 5.7186, + "step": 15900 + }, + { + "epoch": 0.022355042618251816, + "grad_norm": 1.363147258758545, + "learning_rate": 6.704369818743852e-05, + "loss": 5.7087, + "step": 15910 + }, + { + "epoch": 0.022369093556415396, + "grad_norm": 1.3323581218719482, + 
"learning_rate": 6.708585077982295e-05, + "loss": 5.7382, + "step": 15920 + }, + { + "epoch": 0.022383144494578972, + "grad_norm": 1.3843733072280884, + "learning_rate": 6.712800337220739e-05, + "loss": 5.7236, + "step": 15930 + }, + { + "epoch": 0.02239719543274255, + "grad_norm": 1.336782455444336, + "learning_rate": 6.717015596459181e-05, + "loss": 5.737, + "step": 15940 + }, + { + "epoch": 0.022411246370906128, + "grad_norm": 1.3597850799560547, + "learning_rate": 6.721230855697625e-05, + "loss": 5.6848, + "step": 15950 + }, + { + "epoch": 0.022425297309069704, + "grad_norm": 1.360641360282898, + "learning_rate": 6.725446114936069e-05, + "loss": 5.8756, + "step": 15960 + }, + { + "epoch": 0.022439348247233284, + "grad_norm": 1.4128172397613525, + "learning_rate": 6.72966137417451e-05, + "loss": 5.7612, + "step": 15970 + }, + { + "epoch": 0.02245339918539686, + "grad_norm": 1.4090365171432495, + "learning_rate": 6.733876633412955e-05, + "loss": 5.6862, + "step": 15980 + }, + { + "epoch": 0.022467450123560436, + "grad_norm": 1.555239200592041, + "learning_rate": 6.738091892651398e-05, + "loss": 5.6615, + "step": 15990 + }, + { + "epoch": 0.022481501061724016, + "grad_norm": 1.3207861185073853, + "learning_rate": 6.74230715188984e-05, + "loss": 5.7863, + "step": 16000 + }, + { + "epoch": 0.022495551999887592, + "grad_norm": 1.3086305856704712, + "learning_rate": 6.746522411128284e-05, + "loss": 5.7663, + "step": 16010 + }, + { + "epoch": 0.02250960293805117, + "grad_norm": 1.5462536811828613, + "learning_rate": 6.750737670366727e-05, + "loss": 5.7121, + "step": 16020 + }, + { + "epoch": 0.022523653876214748, + "grad_norm": 1.4813846349716187, + "learning_rate": 6.75495292960517e-05, + "loss": 5.7485, + "step": 16030 + }, + { + "epoch": 0.022537704814378324, + "grad_norm": 1.3974696397781372, + "learning_rate": 6.759168188843613e-05, + "loss": 5.6604, + "step": 16040 + }, + { + "epoch": 0.022551755752541904, + "grad_norm": 1.3554956912994385, + "learning_rate": 
6.763383448082057e-05, + "loss": 5.5456, + "step": 16050 + }, + { + "epoch": 0.02256580669070548, + "grad_norm": 1.4472044706344604, + "learning_rate": 6.7675987073205e-05, + "loss": 5.6876, + "step": 16060 + }, + { + "epoch": 0.022579857628869056, + "grad_norm": 1.4171758890151978, + "learning_rate": 6.771813966558943e-05, + "loss": 5.7768, + "step": 16070 + }, + { + "epoch": 0.022593908567032635, + "grad_norm": 1.4032776355743408, + "learning_rate": 6.776029225797386e-05, + "loss": 5.6466, + "step": 16080 + }, + { + "epoch": 0.02260795950519621, + "grad_norm": 1.2590081691741943, + "learning_rate": 6.780244485035829e-05, + "loss": 5.6616, + "step": 16090 + }, + { + "epoch": 0.02262201044335979, + "grad_norm": 1.3556309938430786, + "learning_rate": 6.784459744274272e-05, + "loss": 5.797, + "step": 16100 + }, + { + "epoch": 0.022636061381523367, + "grad_norm": 1.6029269695281982, + "learning_rate": 6.788675003512715e-05, + "loss": 5.7445, + "step": 16110 + }, + { + "epoch": 0.022650112319686944, + "grad_norm": 1.6440356969833374, + "learning_rate": 6.792890262751158e-05, + "loss": 5.6423, + "step": 16120 + }, + { + "epoch": 0.022664163257850523, + "grad_norm": 1.447268009185791, + "learning_rate": 6.797105521989602e-05, + "loss": 5.8162, + "step": 16130 + }, + { + "epoch": 0.0226782141960141, + "grad_norm": 1.3291443586349487, + "learning_rate": 6.801320781228045e-05, + "loss": 5.822, + "step": 16140 + }, + { + "epoch": 0.02269226513417768, + "grad_norm": 1.4091272354125977, + "learning_rate": 6.805536040466488e-05, + "loss": 5.7244, + "step": 16150 + }, + { + "epoch": 0.022706316072341255, + "grad_norm": 1.579410433769226, + "learning_rate": 6.809751299704932e-05, + "loss": 5.7525, + "step": 16160 + }, + { + "epoch": 0.02272036701050483, + "grad_norm": 1.368188500404358, + "learning_rate": 6.813966558943374e-05, + "loss": 5.7373, + "step": 16170 + }, + { + "epoch": 0.02273441794866841, + "grad_norm": 1.3030613660812378, + "learning_rate": 6.818181818181817e-05, + 
"loss": 5.7726, + "step": 16180 + }, + { + "epoch": 0.022748468886831987, + "grad_norm": 1.3315412998199463, + "learning_rate": 6.822397077420262e-05, + "loss": 5.7665, + "step": 16190 + }, + { + "epoch": 0.022762519824995567, + "grad_norm": 1.265251636505127, + "learning_rate": 6.826612336658703e-05, + "loss": 5.7164, + "step": 16200 + }, + { + "epoch": 0.022776570763159143, + "grad_norm": 1.3779250383377075, + "learning_rate": 6.830827595897146e-05, + "loss": 5.8774, + "step": 16210 + }, + { + "epoch": 0.02279062170132272, + "grad_norm": 1.3391507863998413, + "learning_rate": 6.835042855135591e-05, + "loss": 5.6763, + "step": 16220 + }, + { + "epoch": 0.0228046726394863, + "grad_norm": 1.2613348960876465, + "learning_rate": 6.839258114374033e-05, + "loss": 5.7929, + "step": 16230 + }, + { + "epoch": 0.022818723577649875, + "grad_norm": 1.3553537130355835, + "learning_rate": 6.843473373612477e-05, + "loss": 5.6879, + "step": 16240 + }, + { + "epoch": 0.02283277451581345, + "grad_norm": 1.3512293100357056, + "learning_rate": 6.84768863285092e-05, + "loss": 5.7638, + "step": 16250 + }, + { + "epoch": 0.02284682545397703, + "grad_norm": 1.3155027627944946, + "learning_rate": 6.851903892089362e-05, + "loss": 5.7267, + "step": 16260 + }, + { + "epoch": 0.022860876392140607, + "grad_norm": 1.4464770555496216, + "learning_rate": 6.856119151327806e-05, + "loss": 5.6172, + "step": 16270 + }, + { + "epoch": 0.022874927330304187, + "grad_norm": 1.4445135593414307, + "learning_rate": 6.86033441056625e-05, + "loss": 5.7, + "step": 16280 + }, + { + "epoch": 0.022888978268467763, + "grad_norm": 1.5342646837234497, + "learning_rate": 6.864549669804691e-05, + "loss": 5.6978, + "step": 16290 + }, + { + "epoch": 0.02290302920663134, + "grad_norm": 1.335935354232788, + "learning_rate": 6.868764929043136e-05, + "loss": 5.7446, + "step": 16300 + }, + { + "epoch": 0.02291708014479492, + "grad_norm": 1.326428771018982, + "learning_rate": 6.872980188281579e-05, + "loss": 5.6136, + "step": 
16310 + }, + { + "epoch": 0.022931131082958495, + "grad_norm": 1.3541918992996216, + "learning_rate": 6.877195447520022e-05, + "loss": 5.7738, + "step": 16320 + }, + { + "epoch": 0.022945182021122074, + "grad_norm": 1.8092317581176758, + "learning_rate": 6.881410706758465e-05, + "loss": 5.6128, + "step": 16330 + }, + { + "epoch": 0.02295923295928565, + "grad_norm": 1.2965271472930908, + "learning_rate": 6.885625965996908e-05, + "loss": 5.8584, + "step": 16340 + }, + { + "epoch": 0.022973283897449227, + "grad_norm": 1.3288991451263428, + "learning_rate": 6.889841225235351e-05, + "loss": 5.7374, + "step": 16350 + }, + { + "epoch": 0.022987334835612806, + "grad_norm": 1.3659766912460327, + "learning_rate": 6.894056484473795e-05, + "loss": 5.6953, + "step": 16360 + }, + { + "epoch": 0.023001385773776382, + "grad_norm": 1.3702479600906372, + "learning_rate": 6.898271743712238e-05, + "loss": 5.7156, + "step": 16370 + }, + { + "epoch": 0.023015436711939962, + "grad_norm": 1.3132061958312988, + "learning_rate": 6.902487002950681e-05, + "loss": 5.6502, + "step": 16380 + }, + { + "epoch": 0.023029487650103538, + "grad_norm": 1.3793644905090332, + "learning_rate": 6.906702262189124e-05, + "loss": 5.756, + "step": 16390 + }, + { + "epoch": 0.023043538588267114, + "grad_norm": 1.364650011062622, + "learning_rate": 6.910917521427567e-05, + "loss": 5.7081, + "step": 16400 + }, + { + "epoch": 0.023057589526430694, + "grad_norm": 1.303212285041809, + "learning_rate": 6.91513278066601e-05, + "loss": 5.7405, + "step": 16410 + }, + { + "epoch": 0.02307164046459427, + "grad_norm": 1.382996678352356, + "learning_rate": 6.919348039904453e-05, + "loss": 5.6833, + "step": 16420 + }, + { + "epoch": 0.023085691402757846, + "grad_norm": 1.4374713897705078, + "learning_rate": 6.923563299142896e-05, + "loss": 5.7699, + "step": 16430 + }, + { + "epoch": 0.023099742340921426, + "grad_norm": 1.286665439605713, + "learning_rate": 6.92777855838134e-05, + "loss": 5.7886, + "step": 16440 + }, + { + 
"epoch": 0.023113793279085002, + "grad_norm": 1.403110384941101, + "learning_rate": 6.931993817619784e-05, + "loss": 5.7172, + "step": 16450 + }, + { + "epoch": 0.023127844217248582, + "grad_norm": 1.7290191650390625, + "learning_rate": 6.936209076858226e-05, + "loss": 5.7493, + "step": 16460 + }, + { + "epoch": 0.023141895155412158, + "grad_norm": 1.2815217971801758, + "learning_rate": 6.940424336096669e-05, + "loss": 5.7261, + "step": 16470 + }, + { + "epoch": 0.023155946093575734, + "grad_norm": 1.5143299102783203, + "learning_rate": 6.944639595335113e-05, + "loss": 5.7154, + "step": 16480 + }, + { + "epoch": 0.023169997031739314, + "grad_norm": 1.3418786525726318, + "learning_rate": 6.948854854573555e-05, + "loss": 5.7851, + "step": 16490 + }, + { + "epoch": 0.02318404796990289, + "grad_norm": 1.296385407447815, + "learning_rate": 6.953070113811998e-05, + "loss": 5.7819, + "step": 16500 + }, + { + "epoch": 0.02319809890806647, + "grad_norm": 1.3354800939559937, + "learning_rate": 6.957285373050443e-05, + "loss": 5.7586, + "step": 16510 + }, + { + "epoch": 0.023212149846230046, + "grad_norm": 1.6433796882629395, + "learning_rate": 6.961500632288884e-05, + "loss": 5.6559, + "step": 16520 + }, + { + "epoch": 0.023226200784393622, + "grad_norm": 1.4096791744232178, + "learning_rate": 6.965715891527329e-05, + "loss": 5.7111, + "step": 16530 + }, + { + "epoch": 0.0232402517225572, + "grad_norm": 1.3959141969680786, + "learning_rate": 6.969931150765772e-05, + "loss": 5.7255, + "step": 16540 + }, + { + "epoch": 0.023254302660720778, + "grad_norm": 1.2790894508361816, + "learning_rate": 6.974146410004214e-05, + "loss": 5.6613, + "step": 16550 + }, + { + "epoch": 0.023268353598884354, + "grad_norm": 1.3713908195495605, + "learning_rate": 6.978361669242658e-05, + "loss": 5.6093, + "step": 16560 + }, + { + "epoch": 0.023282404537047934, + "grad_norm": 1.4066718816757202, + "learning_rate": 6.982576928481101e-05, + "loss": 5.5383, + "step": 16570 + }, + { + "epoch": 
0.02329645547521151, + "grad_norm": 1.5075857639312744, + "learning_rate": 6.986792187719543e-05, + "loss": 5.5824, + "step": 16580 + }, + { + "epoch": 0.02331050641337509, + "grad_norm": 1.4346543550491333, + "learning_rate": 6.991007446957988e-05, + "loss": 5.8348, + "step": 16590 + }, + { + "epoch": 0.023324557351538665, + "grad_norm": 1.4327467679977417, + "learning_rate": 6.995222706196431e-05, + "loss": 5.5361, + "step": 16600 + }, + { + "epoch": 0.02333860828970224, + "grad_norm": 1.403082251548767, + "learning_rate": 6.999437965434874e-05, + "loss": 5.6565, + "step": 16610 + }, + { + "epoch": 0.02335265922786582, + "grad_norm": 1.7867377996444702, + "learning_rate": 7.003653224673317e-05, + "loss": 5.7467, + "step": 16620 + }, + { + "epoch": 0.023366710166029397, + "grad_norm": 1.3568111658096313, + "learning_rate": 7.00786848391176e-05, + "loss": 5.624, + "step": 16630 + }, + { + "epoch": 0.023380761104192977, + "grad_norm": 1.4141385555267334, + "learning_rate": 7.012083743150203e-05, + "loss": 5.7333, + "step": 16640 + }, + { + "epoch": 0.023394812042356553, + "grad_norm": 1.526979923248291, + "learning_rate": 7.016299002388646e-05, + "loss": 5.6305, + "step": 16650 + }, + { + "epoch": 0.02340886298052013, + "grad_norm": 1.2974737882614136, + "learning_rate": 7.02051426162709e-05, + "loss": 5.7538, + "step": 16660 + }, + { + "epoch": 0.02342291391868371, + "grad_norm": 1.3604271411895752, + "learning_rate": 7.024729520865533e-05, + "loss": 5.7628, + "step": 16670 + }, + { + "epoch": 0.023436964856847285, + "grad_norm": 1.2945364713668823, + "learning_rate": 7.028944780103976e-05, + "loss": 5.6513, + "step": 16680 + }, + { + "epoch": 0.023451015795010865, + "grad_norm": 1.8330632448196411, + "learning_rate": 7.033160039342419e-05, + "loss": 5.7871, + "step": 16690 + }, + { + "epoch": 0.02346506673317444, + "grad_norm": 1.4950273036956787, + "learning_rate": 7.037375298580862e-05, + "loss": 5.7615, + "step": 16700 + }, + { + "epoch": 0.023479117671338017, 
+ "grad_norm": 1.4327980279922485, + "learning_rate": 7.041590557819305e-05, + "loss": 5.6334, + "step": 16710 + }, + { + "epoch": 0.023493168609501597, + "grad_norm": 1.2713170051574707, + "learning_rate": 7.045805817057748e-05, + "loss": 5.8051, + "step": 16720 + }, + { + "epoch": 0.023507219547665173, + "grad_norm": 1.3698358535766602, + "learning_rate": 7.050021076296191e-05, + "loss": 5.6809, + "step": 16730 + }, + { + "epoch": 0.02352127048582875, + "grad_norm": 1.40662682056427, + "learning_rate": 7.054236335534636e-05, + "loss": 5.6715, + "step": 16740 + }, + { + "epoch": 0.02353532142399233, + "grad_norm": 1.3832154273986816, + "learning_rate": 7.058451594773077e-05, + "loss": 5.7687, + "step": 16750 + }, + { + "epoch": 0.023549372362155905, + "grad_norm": 1.2708238363265991, + "learning_rate": 7.06266685401152e-05, + "loss": 5.7387, + "step": 16760 + }, + { + "epoch": 0.023563423300319485, + "grad_norm": 1.5633302927017212, + "learning_rate": 7.066882113249965e-05, + "loss": 5.6757, + "step": 16770 + }, + { + "epoch": 0.02357747423848306, + "grad_norm": 1.4342371225357056, + "learning_rate": 7.071097372488407e-05, + "loss": 5.7106, + "step": 16780 + }, + { + "epoch": 0.023591525176646637, + "grad_norm": 1.3297955989837646, + "learning_rate": 7.07531263172685e-05, + "loss": 5.664, + "step": 16790 + }, + { + "epoch": 0.023605576114810217, + "grad_norm": 1.9332789182662964, + "learning_rate": 7.079527890965294e-05, + "loss": 5.6539, + "step": 16800 + }, + { + "epoch": 0.023619627052973793, + "grad_norm": 1.3228834867477417, + "learning_rate": 7.083743150203736e-05, + "loss": 5.6064, + "step": 16810 + }, + { + "epoch": 0.023633677991137372, + "grad_norm": 1.4686253070831299, + "learning_rate": 7.08795840944218e-05, + "loss": 5.6387, + "step": 16820 + }, + { + "epoch": 0.02364772892930095, + "grad_norm": 1.4611742496490479, + "learning_rate": 7.092173668680624e-05, + "loss": 5.7412, + "step": 16830 + }, + { + "epoch": 0.023661779867464525, + "grad_norm": 
1.2942790985107422, + "learning_rate": 7.096388927919066e-05, + "loss": 5.6693, + "step": 16840 + }, + { + "epoch": 0.023675830805628104, + "grad_norm": 1.254348635673523, + "learning_rate": 7.10060418715751e-05, + "loss": 5.7693, + "step": 16850 + }, + { + "epoch": 0.02368988174379168, + "grad_norm": 1.426842212677002, + "learning_rate": 7.104819446395953e-05, + "loss": 5.6919, + "step": 16860 + }, + { + "epoch": 0.023703932681955257, + "grad_norm": 1.349927544593811, + "learning_rate": 7.109034705634395e-05, + "loss": 5.62, + "step": 16870 + }, + { + "epoch": 0.023717983620118836, + "grad_norm": 1.4106230735778809, + "learning_rate": 7.11324996487284e-05, + "loss": 5.8021, + "step": 16880 + }, + { + "epoch": 0.023732034558282412, + "grad_norm": 1.3730113506317139, + "learning_rate": 7.117465224111282e-05, + "loss": 5.6808, + "step": 16890 + }, + { + "epoch": 0.023746085496445992, + "grad_norm": 1.320567011833191, + "learning_rate": 7.121680483349726e-05, + "loss": 5.6301, + "step": 16900 + }, + { + "epoch": 0.023760136434609568, + "grad_norm": 1.3973300457000732, + "learning_rate": 7.125895742588169e-05, + "loss": 5.6409, + "step": 16910 + }, + { + "epoch": 0.023774187372773144, + "grad_norm": 1.354918122291565, + "learning_rate": 7.130111001826612e-05, + "loss": 5.6952, + "step": 16920 + }, + { + "epoch": 0.023788238310936724, + "grad_norm": 1.419696569442749, + "learning_rate": 7.134326261065055e-05, + "loss": 5.6368, + "step": 16930 + }, + { + "epoch": 0.0238022892491003, + "grad_norm": 1.403704285621643, + "learning_rate": 7.138541520303498e-05, + "loss": 5.6484, + "step": 16940 + }, + { + "epoch": 0.02381634018726388, + "grad_norm": 1.366337776184082, + "learning_rate": 7.142756779541941e-05, + "loss": 5.7407, + "step": 16950 + }, + { + "epoch": 0.023830391125427456, + "grad_norm": 1.469281554222107, + "learning_rate": 7.146972038780384e-05, + "loss": 5.719, + "step": 16960 + }, + { + "epoch": 0.023844442063591032, + "grad_norm": 1.321475625038147, + 
"learning_rate": 7.151187298018827e-05, + "loss": 5.517, + "step": 16970 + }, + { + "epoch": 0.023858493001754612, + "grad_norm": 1.3294295072555542, + "learning_rate": 7.15540255725727e-05, + "loss": 5.6871, + "step": 16980 + }, + { + "epoch": 0.023872543939918188, + "grad_norm": 1.3294674158096313, + "learning_rate": 7.159617816495714e-05, + "loss": 5.6561, + "step": 16990 + }, + { + "epoch": 0.023886594878081768, + "grad_norm": 1.4403409957885742, + "learning_rate": 7.163833075734157e-05, + "loss": 5.5484, + "step": 17000 + }, + { + "epoch": 0.023900645816245344, + "grad_norm": 1.3805781602859497, + "learning_rate": 7.1680483349726e-05, + "loss": 5.7715, + "step": 17010 + }, + { + "epoch": 0.02391469675440892, + "grad_norm": 1.2874103784561157, + "learning_rate": 7.172263594211043e-05, + "loss": 5.6586, + "step": 17020 + }, + { + "epoch": 0.0239287476925725, + "grad_norm": 1.2773886919021606, + "learning_rate": 7.176478853449487e-05, + "loss": 5.7795, + "step": 17030 + }, + { + "epoch": 0.023942798630736076, + "grad_norm": 1.4012190103530884, + "learning_rate": 7.180694112687929e-05, + "loss": 5.686, + "step": 17040 + }, + { + "epoch": 0.023956849568899652, + "grad_norm": 1.3619203567504883, + "learning_rate": 7.184909371926372e-05, + "loss": 5.7779, + "step": 17050 + }, + { + "epoch": 0.02397090050706323, + "grad_norm": 1.6374523639678955, + "learning_rate": 7.189124631164817e-05, + "loss": 5.7293, + "step": 17060 + }, + { + "epoch": 0.023984951445226808, + "grad_norm": 1.365061640739441, + "learning_rate": 7.193339890403259e-05, + "loss": 5.6493, + "step": 17070 + }, + { + "epoch": 0.023999002383390387, + "grad_norm": 1.458706021308899, + "learning_rate": 7.197555149641702e-05, + "loss": 5.6065, + "step": 17080 + }, + { + "epoch": 0.024013053321553963, + "grad_norm": 1.3380498886108398, + "learning_rate": 7.201770408880146e-05, + "loss": 5.575, + "step": 17090 + }, + { + "epoch": 0.02402710425971754, + "grad_norm": 1.2822099924087524, + "learning_rate": 
7.205985668118588e-05, + "loss": 5.6609, + "step": 17100 + }, + { + "epoch": 0.02404115519788112, + "grad_norm": 1.3940728902816772, + "learning_rate": 7.210200927357032e-05, + "loss": 5.6604, + "step": 17110 + }, + { + "epoch": 0.024055206136044695, + "grad_norm": 1.3046749830245972, + "learning_rate": 7.214416186595475e-05, + "loss": 5.6748, + "step": 17120 + }, + { + "epoch": 0.024069257074208275, + "grad_norm": 1.3955029249191284, + "learning_rate": 7.218631445833917e-05, + "loss": 5.6872, + "step": 17130 + }, + { + "epoch": 0.02408330801237185, + "grad_norm": 1.2928085327148438, + "learning_rate": 7.222846705072362e-05, + "loss": 5.6749, + "step": 17140 + }, + { + "epoch": 0.024097358950535427, + "grad_norm": 1.4071931838989258, + "learning_rate": 7.227061964310805e-05, + "loss": 5.6209, + "step": 17150 + }, + { + "epoch": 0.024111409888699007, + "grad_norm": 1.4092259407043457, + "learning_rate": 7.231277223549248e-05, + "loss": 5.616, + "step": 17160 + }, + { + "epoch": 0.024125460826862583, + "grad_norm": 1.3400150537490845, + "learning_rate": 7.235492482787691e-05, + "loss": 5.7729, + "step": 17170 + }, + { + "epoch": 0.02413951176502616, + "grad_norm": 1.7592744827270508, + "learning_rate": 7.239707742026134e-05, + "loss": 5.6002, + "step": 17180 + }, + { + "epoch": 0.02415356270318974, + "grad_norm": 1.4428009986877441, + "learning_rate": 7.243923001264577e-05, + "loss": 5.5269, + "step": 17190 + }, + { + "epoch": 0.024167613641353315, + "grad_norm": 1.266127347946167, + "learning_rate": 7.24813826050302e-05, + "loss": 5.6309, + "step": 17200 + }, + { + "epoch": 0.024181664579516895, + "grad_norm": 1.3543121814727783, + "learning_rate": 7.252353519741464e-05, + "loss": 5.5518, + "step": 17210 + }, + { + "epoch": 0.02419571551768047, + "grad_norm": 1.3485888242721558, + "learning_rate": 7.256568778979907e-05, + "loss": 5.6967, + "step": 17220 + }, + { + "epoch": 0.024209766455844047, + "grad_norm": 1.4433296918869019, + "learning_rate": 
7.26078403821835e-05, + "loss": 5.7349, + "step": 17230 + }, + { + "epoch": 0.024223817394007627, + "grad_norm": 1.47990882396698, + "learning_rate": 7.264999297456793e-05, + "loss": 5.7185, + "step": 17240 + }, + { + "epoch": 0.024237868332171203, + "grad_norm": 1.3784239292144775, + "learning_rate": 7.269214556695236e-05, + "loss": 5.6906, + "step": 17250 + }, + { + "epoch": 0.024251919270334783, + "grad_norm": 1.329148530960083, + "learning_rate": 7.273429815933679e-05, + "loss": 5.6781, + "step": 17260 + }, + { + "epoch": 0.02426597020849836, + "grad_norm": 1.3315861225128174, + "learning_rate": 7.277645075172122e-05, + "loss": 5.6335, + "step": 17270 + }, + { + "epoch": 0.024280021146661935, + "grad_norm": 1.4295711517333984, + "learning_rate": 7.281860334410565e-05, + "loss": 5.6599, + "step": 17280 + }, + { + "epoch": 0.024294072084825515, + "grad_norm": 1.3033784627914429, + "learning_rate": 7.286075593649008e-05, + "loss": 5.6773, + "step": 17290 + }, + { + "epoch": 0.02430812302298909, + "grad_norm": 1.5363476276397705, + "learning_rate": 7.290290852887452e-05, + "loss": 5.7043, + "step": 17300 + }, + { + "epoch": 0.02432217396115267, + "grad_norm": 1.4859110116958618, + "learning_rate": 7.294506112125895e-05, + "loss": 5.5239, + "step": 17310 + }, + { + "epoch": 0.024336224899316247, + "grad_norm": 1.3148119449615479, + "learning_rate": 7.298721371364339e-05, + "loss": 5.7742, + "step": 17320 + }, + { + "epoch": 0.024350275837479823, + "grad_norm": 1.4127575159072876, + "learning_rate": 7.302936630602781e-05, + "loss": 5.6235, + "step": 17330 + }, + { + "epoch": 0.024364326775643402, + "grad_norm": 1.7024136781692505, + "learning_rate": 7.307151889841224e-05, + "loss": 5.6536, + "step": 17340 + }, + { + "epoch": 0.02437837771380698, + "grad_norm": 1.4915293455123901, + "learning_rate": 7.311367149079669e-05, + "loss": 5.5354, + "step": 17350 + }, + { + "epoch": 0.024392428651970555, + "grad_norm": 1.431097149848938, + "learning_rate": 
7.31558240831811e-05, + "loss": 5.626, + "step": 17360 + }, + { + "epoch": 0.024406479590134134, + "grad_norm": 1.411797285079956, + "learning_rate": 7.319797667556555e-05, + "loss": 5.6458, + "step": 17370 + }, + { + "epoch": 0.02442053052829771, + "grad_norm": 1.374850869178772, + "learning_rate": 7.324012926794998e-05, + "loss": 5.6726, + "step": 17380 + }, + { + "epoch": 0.02443458146646129, + "grad_norm": 1.3310719728469849, + "learning_rate": 7.32822818603344e-05, + "loss": 5.6913, + "step": 17390 + }, + { + "epoch": 0.024448632404624866, + "grad_norm": 1.5239213705062866, + "learning_rate": 7.332443445271884e-05, + "loss": 5.7034, + "step": 17400 + }, + { + "epoch": 0.024462683342788442, + "grad_norm": 1.3302022218704224, + "learning_rate": 7.336658704510327e-05, + "loss": 5.6239, + "step": 17410 + }, + { + "epoch": 0.024476734280952022, + "grad_norm": 1.499558687210083, + "learning_rate": 7.340873963748769e-05, + "loss": 5.7574, + "step": 17420 + }, + { + "epoch": 0.024490785219115598, + "grad_norm": 1.323960304260254, + "learning_rate": 7.345089222987213e-05, + "loss": 5.6001, + "step": 17430 + }, + { + "epoch": 0.024504836157279178, + "grad_norm": 1.2899998426437378, + "learning_rate": 7.349304482225657e-05, + "loss": 5.5648, + "step": 17440 + }, + { + "epoch": 0.024518887095442754, + "grad_norm": 1.3837937116622925, + "learning_rate": 7.3535197414641e-05, + "loss": 5.6718, + "step": 17450 + }, + { + "epoch": 0.02453293803360633, + "grad_norm": 1.2842652797698975, + "learning_rate": 7.357735000702543e-05, + "loss": 5.7037, + "step": 17460 + }, + { + "epoch": 0.02454698897176991, + "grad_norm": 1.2789689302444458, + "learning_rate": 7.361950259940986e-05, + "loss": 5.6116, + "step": 17470 + }, + { + "epoch": 0.024561039909933486, + "grad_norm": 1.3312838077545166, + "learning_rate": 7.366165519179429e-05, + "loss": 5.7832, + "step": 17480 + }, + { + "epoch": 0.024575090848097062, + "grad_norm": 1.3789297342300415, + "learning_rate": 7.370380778417872e-05, 
+ "loss": 5.6628, + "step": 17490 + }, + { + "epoch": 0.024589141786260642, + "grad_norm": 1.552386999130249, + "learning_rate": 7.374596037656315e-05, + "loss": 5.7062, + "step": 17500 + }, + { + "epoch": 0.024603192724424218, + "grad_norm": 1.3529428243637085, + "learning_rate": 7.378811296894758e-05, + "loss": 5.5797, + "step": 17510 + }, + { + "epoch": 0.024617243662587798, + "grad_norm": 1.3707754611968994, + "learning_rate": 7.383026556133202e-05, + "loss": 5.6016, + "step": 17520 + }, + { + "epoch": 0.024631294600751374, + "grad_norm": 1.6769342422485352, + "learning_rate": 7.387241815371645e-05, + "loss": 5.6126, + "step": 17530 + }, + { + "epoch": 0.02464534553891495, + "grad_norm": 1.3697290420532227, + "learning_rate": 7.391457074610088e-05, + "loss": 5.5616, + "step": 17540 + }, + { + "epoch": 0.02465939647707853, + "grad_norm": 1.4338105916976929, + "learning_rate": 7.395672333848531e-05, + "loss": 5.6251, + "step": 17550 + }, + { + "epoch": 0.024673447415242106, + "grad_norm": 1.338873028755188, + "learning_rate": 7.399887593086974e-05, + "loss": 5.5995, + "step": 17560 + }, + { + "epoch": 0.024687498353405685, + "grad_norm": 1.6078602075576782, + "learning_rate": 7.404102852325417e-05, + "loss": 5.7184, + "step": 17570 + }, + { + "epoch": 0.02470154929156926, + "grad_norm": 1.374680995941162, + "learning_rate": 7.40831811156386e-05, + "loss": 5.54, + "step": 17580 + }, + { + "epoch": 0.024715600229732838, + "grad_norm": 1.3792024850845337, + "learning_rate": 7.41211184487846e-05, + "loss": 5.6917, + "step": 17590 + }, + { + "epoch": 0.024729651167896417, + "grad_norm": 1.3895095586776733, + "learning_rate": 7.416327104116903e-05, + "loss": 5.5612, + "step": 17600 + }, + { + "epoch": 0.024743702106059993, + "grad_norm": 1.251436710357666, + "learning_rate": 7.420542363355346e-05, + "loss": 5.6356, + "step": 17610 + }, + { + "epoch": 0.024757753044223573, + "grad_norm": 1.6639325618743896, + "learning_rate": 7.424757622593789e-05, + "loss": 5.613, + 
"step": 17620 + }, + { + "epoch": 0.02477180398238715, + "grad_norm": 1.2654144763946533, + "learning_rate": 7.428972881832232e-05, + "loss": 5.6399, + "step": 17630 + }, + { + "epoch": 0.024785854920550725, + "grad_norm": 1.3583436012268066, + "learning_rate": 7.433188141070675e-05, + "loss": 5.7586, + "step": 17640 + }, + { + "epoch": 0.024799905858714305, + "grad_norm": 1.3544237613677979, + "learning_rate": 7.437403400309119e-05, + "loss": 5.7153, + "step": 17650 + }, + { + "epoch": 0.02481395679687788, + "grad_norm": 1.3555983304977417, + "learning_rate": 7.441618659547562e-05, + "loss": 5.5901, + "step": 17660 + }, + { + "epoch": 0.024828007735041457, + "grad_norm": 1.5493931770324707, + "learning_rate": 7.445833918786005e-05, + "loss": 5.5702, + "step": 17670 + }, + { + "epoch": 0.024842058673205037, + "grad_norm": 1.4650983810424805, + "learning_rate": 7.450049178024448e-05, + "loss": 5.6018, + "step": 17680 + }, + { + "epoch": 0.024856109611368613, + "grad_norm": 1.2635927200317383, + "learning_rate": 7.454264437262891e-05, + "loss": 5.5942, + "step": 17690 + }, + { + "epoch": 0.024870160549532193, + "grad_norm": 1.5327472686767578, + "learning_rate": 7.458479696501334e-05, + "loss": 5.5997, + "step": 17700 + }, + { + "epoch": 0.02488421148769577, + "grad_norm": 1.350309133529663, + "learning_rate": 7.462694955739777e-05, + "loss": 5.582, + "step": 17710 + }, + { + "epoch": 0.024898262425859345, + "grad_norm": 1.321396827697754, + "learning_rate": 7.46691021497822e-05, + "loss": 5.582, + "step": 17720 + }, + { + "epoch": 0.024912313364022925, + "grad_norm": 1.3941783905029297, + "learning_rate": 7.471125474216663e-05, + "loss": 5.7505, + "step": 17730 + }, + { + "epoch": 0.0249263643021865, + "grad_norm": 1.573265790939331, + "learning_rate": 7.475340733455107e-05, + "loss": 5.4622, + "step": 17740 + }, + { + "epoch": 0.02494041524035008, + "grad_norm": 1.3758049011230469, + "learning_rate": 7.47955599269355e-05, + "loss": 5.5887, + "step": 17750 + }, + { 
+ "epoch": 0.024954466178513657, + "grad_norm": 1.3547019958496094, + "learning_rate": 7.483771251931993e-05, + "loss": 5.603, + "step": 17760 + }, + { + "epoch": 0.024968517116677233, + "grad_norm": 1.464343786239624, + "learning_rate": 7.487986511170436e-05, + "loss": 5.5724, + "step": 17770 + }, + { + "epoch": 0.024982568054840813, + "grad_norm": 1.409480333328247, + "learning_rate": 7.492201770408879e-05, + "loss": 5.6456, + "step": 17780 + }, + { + "epoch": 0.02499661899300439, + "grad_norm": 1.4030753374099731, + "learning_rate": 7.496417029647324e-05, + "loss": 5.6821, + "step": 17790 + }, + { + "epoch": 0.02501066993116797, + "grad_norm": 1.4202769994735718, + "learning_rate": 7.500632288885765e-05, + "loss": 5.6878, + "step": 17800 + }, + { + "epoch": 0.025024720869331545, + "grad_norm": 1.3402527570724487, + "learning_rate": 7.504847548124208e-05, + "loss": 5.5967, + "step": 17810 + }, + { + "epoch": 0.02503877180749512, + "grad_norm": 1.4031054973602295, + "learning_rate": 7.509062807362653e-05, + "loss": 5.6123, + "step": 17820 + }, + { + "epoch": 0.0250528227456587, + "grad_norm": 1.5110526084899902, + "learning_rate": 7.513278066601096e-05, + "loss": 5.6823, + "step": 17830 + }, + { + "epoch": 0.025066873683822277, + "grad_norm": 1.3663294315338135, + "learning_rate": 7.517493325839538e-05, + "loss": 5.6382, + "step": 17840 + }, + { + "epoch": 0.025080924621985853, + "grad_norm": 1.2902328968048096, + "learning_rate": 7.521708585077982e-05, + "loss": 5.6346, + "step": 17850 + }, + { + "epoch": 0.025094975560149432, + "grad_norm": 1.4534049034118652, + "learning_rate": 7.525923844316425e-05, + "loss": 5.5818, + "step": 17860 + }, + { + "epoch": 0.02510902649831301, + "grad_norm": 1.3165909051895142, + "learning_rate": 7.530139103554867e-05, + "loss": 5.6324, + "step": 17870 + }, + { + "epoch": 0.025123077436476588, + "grad_norm": 1.443505048751831, + "learning_rate": 7.534354362793312e-05, + "loss": 5.6841, + "step": 17880 + }, + { + "epoch": 
0.025137128374640164, + "grad_norm": 1.364423155784607, + "learning_rate": 7.538569622031755e-05, + "loss": 5.6508, + "step": 17890 + }, + { + "epoch": 0.02515117931280374, + "grad_norm": 1.3083369731903076, + "learning_rate": 7.542784881270196e-05, + "loss": 5.7182, + "step": 17900 + }, + { + "epoch": 0.02516523025096732, + "grad_norm": 1.2806565761566162, + "learning_rate": 7.547000140508641e-05, + "loss": 5.6075, + "step": 17910 + }, + { + "epoch": 0.025179281189130896, + "grad_norm": 1.3733397722244263, + "learning_rate": 7.551215399747084e-05, + "loss": 5.6118, + "step": 17920 + }, + { + "epoch": 0.025193332127294476, + "grad_norm": 1.3024393320083618, + "learning_rate": 7.555430658985526e-05, + "loss": 5.628, + "step": 17930 + }, + { + "epoch": 0.025207383065458052, + "grad_norm": 1.447492003440857, + "learning_rate": 7.55964591822397e-05, + "loss": 5.6351, + "step": 17940 + }, + { + "epoch": 0.025221434003621628, + "grad_norm": 1.2635152339935303, + "learning_rate": 7.563861177462413e-05, + "loss": 5.5923, + "step": 17950 + }, + { + "epoch": 0.025235484941785208, + "grad_norm": 1.2946640253067017, + "learning_rate": 7.568076436700855e-05, + "loss": 5.6809, + "step": 17960 + }, + { + "epoch": 0.025249535879948784, + "grad_norm": 1.283579707145691, + "learning_rate": 7.5722916959393e-05, + "loss": 5.7118, + "step": 17970 + }, + { + "epoch": 0.02526358681811236, + "grad_norm": 1.259277582168579, + "learning_rate": 7.576506955177743e-05, + "loss": 5.6226, + "step": 17980 + }, + { + "epoch": 0.02527763775627594, + "grad_norm": 1.519494891166687, + "learning_rate": 7.580722214416187e-05, + "loss": 5.6467, + "step": 17990 + }, + { + "epoch": 0.025291688694439516, + "grad_norm": 1.3044480085372925, + "learning_rate": 7.584937473654629e-05, + "loss": 5.7049, + "step": 18000 + }, + { + "epoch": 0.025305739632603096, + "grad_norm": 1.3547728061676025, + "learning_rate": 7.589152732893072e-05, + "loss": 5.5962, + "step": 18010 + }, + { + "epoch": 0.025319790570766672, + 
"grad_norm": 1.3373003005981445, + "learning_rate": 7.593367992131517e-05, + "loss": 5.6107, + "step": 18020 + }, + { + "epoch": 0.025333841508930248, + "grad_norm": 1.3276454210281372, + "learning_rate": 7.597583251369958e-05, + "loss": 5.5887, + "step": 18030 + }, + { + "epoch": 0.025347892447093828, + "grad_norm": 1.3012672662734985, + "learning_rate": 7.601798510608401e-05, + "loss": 5.6691, + "step": 18040 + }, + { + "epoch": 0.025361943385257404, + "grad_norm": 1.3194389343261719, + "learning_rate": 7.606013769846846e-05, + "loss": 5.6852, + "step": 18050 + }, + { + "epoch": 0.025375994323420983, + "grad_norm": 1.2897201776504517, + "learning_rate": 7.610229029085288e-05, + "loss": 5.6437, + "step": 18060 + }, + { + "epoch": 0.02539004526158456, + "grad_norm": 2.91089129447937, + "learning_rate": 7.614444288323731e-05, + "loss": 5.6226, + "step": 18070 + }, + { + "epoch": 0.025404096199748136, + "grad_norm": 1.2783751487731934, + "learning_rate": 7.618659547562175e-05, + "loss": 5.5288, + "step": 18080 + }, + { + "epoch": 0.025418147137911715, + "grad_norm": 1.5145020484924316, + "learning_rate": 7.622874806800617e-05, + "loss": 5.611, + "step": 18090 + }, + { + "epoch": 0.02543219807607529, + "grad_norm": 1.3364498615264893, + "learning_rate": 7.62709006603906e-05, + "loss": 5.5694, + "step": 18100 + }, + { + "epoch": 0.02544624901423887, + "grad_norm": 1.3756706714630127, + "learning_rate": 7.631305325277505e-05, + "loss": 5.5867, + "step": 18110 + }, + { + "epoch": 0.025460299952402447, + "grad_norm": 1.4030925035476685, + "learning_rate": 7.635520584515948e-05, + "loss": 5.5534, + "step": 18120 + }, + { + "epoch": 0.025474350890566023, + "grad_norm": 1.3178571462631226, + "learning_rate": 7.63973584375439e-05, + "loss": 5.6151, + "step": 18130 + }, + { + "epoch": 0.025488401828729603, + "grad_norm": 1.3697510957717896, + "learning_rate": 7.643951102992834e-05, + "loss": 5.648, + "step": 18140 + }, + { + "epoch": 0.02550245276689318, + "grad_norm": 
1.3539316654205322, + "learning_rate": 7.648166362231277e-05, + "loss": 5.5328, + "step": 18150 + }, + { + "epoch": 0.025516503705056755, + "grad_norm": 1.3074488639831543, + "learning_rate": 7.652381621469719e-05, + "loss": 5.6203, + "step": 18160 + }, + { + "epoch": 0.025530554643220335, + "grad_norm": 1.3524091243743896, + "learning_rate": 7.656596880708163e-05, + "loss": 5.6306, + "step": 18170 + }, + { + "epoch": 0.02554460558138391, + "grad_norm": 1.3374627828598022, + "learning_rate": 7.660812139946606e-05, + "loss": 5.6288, + "step": 18180 + }, + { + "epoch": 0.02555865651954749, + "grad_norm": 1.351073145866394, + "learning_rate": 7.665027399185048e-05, + "loss": 5.606, + "step": 18190 + }, + { + "epoch": 0.025572707457711067, + "grad_norm": 1.4369218349456787, + "learning_rate": 7.669242658423493e-05, + "loss": 5.5697, + "step": 18200 + }, + { + "epoch": 0.025586758395874643, + "grad_norm": 1.3648687601089478, + "learning_rate": 7.673457917661936e-05, + "loss": 5.5595, + "step": 18210 + }, + { + "epoch": 0.025600809334038223, + "grad_norm": 1.3496894836425781, + "learning_rate": 7.677673176900378e-05, + "loss": 5.528, + "step": 18220 + }, + { + "epoch": 0.0256148602722018, + "grad_norm": 1.3187439441680908, + "learning_rate": 7.681888436138822e-05, + "loss": 5.6865, + "step": 18230 + }, + { + "epoch": 0.02562891121036538, + "grad_norm": 1.4901741743087769, + "learning_rate": 7.686103695377265e-05, + "loss": 5.5185, + "step": 18240 + }, + { + "epoch": 0.025642962148528955, + "grad_norm": 1.4721450805664062, + "learning_rate": 7.69031895461571e-05, + "loss": 5.5971, + "step": 18250 + }, + { + "epoch": 0.02565701308669253, + "grad_norm": 1.3747284412384033, + "learning_rate": 7.694534213854151e-05, + "loss": 5.6086, + "step": 18260 + }, + { + "epoch": 0.02567106402485611, + "grad_norm": 1.4034130573272705, + "learning_rate": 7.698749473092594e-05, + "loss": 5.585, + "step": 18270 + }, + { + "epoch": 0.025685114963019687, + "grad_norm": 1.4254305362701416, + 
"learning_rate": 7.702964732331039e-05, + "loss": 5.5106, + "step": 18280 + }, + { + "epoch": 0.025699165901183263, + "grad_norm": 1.4483678340911865, + "learning_rate": 7.707179991569481e-05, + "loss": 5.5116, + "step": 18290 + }, + { + "epoch": 0.025713216839346843, + "grad_norm": 1.3134675025939941, + "learning_rate": 7.711395250807924e-05, + "loss": 5.5165, + "step": 18300 + }, + { + "epoch": 0.02572726777751042, + "grad_norm": 1.3608453273773193, + "learning_rate": 7.715610510046368e-05, + "loss": 5.4691, + "step": 18310 + }, + { + "epoch": 0.025741318715674, + "grad_norm": 1.3626971244812012, + "learning_rate": 7.71982576928481e-05, + "loss": 5.5634, + "step": 18320 + }, + { + "epoch": 0.025755369653837575, + "grad_norm": 1.3774510622024536, + "learning_rate": 7.724041028523253e-05, + "loss": 5.5621, + "step": 18330 + }, + { + "epoch": 0.02576942059200115, + "grad_norm": 1.361126184463501, + "learning_rate": 7.728256287761698e-05, + "loss": 5.5653, + "step": 18340 + }, + { + "epoch": 0.02578347153016473, + "grad_norm": 1.440781593322754, + "learning_rate": 7.73247154700014e-05, + "loss": 5.5529, + "step": 18350 + }, + { + "epoch": 0.025797522468328307, + "grad_norm": 1.3827389478683472, + "learning_rate": 7.736686806238583e-05, + "loss": 5.4872, + "step": 18360 + }, + { + "epoch": 0.025811573406491886, + "grad_norm": 1.3901987075805664, + "learning_rate": 7.740902065477027e-05, + "loss": 5.6825, + "step": 18370 + }, + { + "epoch": 0.025825624344655462, + "grad_norm": 1.549964189529419, + "learning_rate": 7.745117324715469e-05, + "loss": 5.5766, + "step": 18380 + }, + { + "epoch": 0.02583967528281904, + "grad_norm": 1.3051047325134277, + "learning_rate": 7.749332583953912e-05, + "loss": 5.5934, + "step": 18390 + }, + { + "epoch": 0.025853726220982618, + "grad_norm": 1.3041412830352783, + "learning_rate": 7.753547843192356e-05, + "loss": 5.5017, + "step": 18400 + }, + { + "epoch": 0.025867777159146194, + "grad_norm": 1.3146135807037354, + "learning_rate": 
7.7577631024308e-05, + "loss": 5.5809, + "step": 18410 + }, + { + "epoch": 0.025881828097309774, + "grad_norm": 1.3756433725357056, + "learning_rate": 7.761978361669241e-05, + "loss": 5.553, + "step": 18420 + }, + { + "epoch": 0.02589587903547335, + "grad_norm": 1.5293617248535156, + "learning_rate": 7.766193620907686e-05, + "loss": 5.6161, + "step": 18430 + }, + { + "epoch": 0.025909929973636926, + "grad_norm": 1.3322265148162842, + "learning_rate": 7.770408880146129e-05, + "loss": 5.6277, + "step": 18440 + }, + { + "epoch": 0.025923980911800506, + "grad_norm": 1.5310215950012207, + "learning_rate": 7.77462413938457e-05, + "loss": 5.7149, + "step": 18450 + }, + { + "epoch": 0.025938031849964082, + "grad_norm": 1.3711066246032715, + "learning_rate": 7.778839398623015e-05, + "loss": 5.5819, + "step": 18460 + }, + { + "epoch": 0.025952082788127658, + "grad_norm": 1.3732609748840332, + "learning_rate": 7.783054657861458e-05, + "loss": 5.557, + "step": 18470 + }, + { + "epoch": 0.025966133726291238, + "grad_norm": 1.3032290935516357, + "learning_rate": 7.7872699170999e-05, + "loss": 5.6042, + "step": 18480 + }, + { + "epoch": 0.025980184664454814, + "grad_norm": 1.4029079675674438, + "learning_rate": 7.791485176338344e-05, + "loss": 5.6057, + "step": 18490 + }, + { + "epoch": 0.025994235602618394, + "grad_norm": 1.2981117963790894, + "learning_rate": 7.795700435576788e-05, + "loss": 5.572, + "step": 18500 + }, + { + "epoch": 0.02600828654078197, + "grad_norm": 1.346895456314087, + "learning_rate": 7.799915694815229e-05, + "loss": 5.5795, + "step": 18510 + }, + { + "epoch": 0.026022337478945546, + "grad_norm": 1.3572524785995483, + "learning_rate": 7.804130954053674e-05, + "loss": 5.5995, + "step": 18520 + }, + { + "epoch": 0.026036388417109126, + "grad_norm": 1.4479949474334717, + "learning_rate": 7.808346213292117e-05, + "loss": 5.6112, + "step": 18530 + }, + { + "epoch": 0.026050439355272702, + "grad_norm": 1.5404094457626343, + "learning_rate": 
7.812561472530561e-05, + "loss": 5.6727, + "step": 18540 + }, + { + "epoch": 0.02606449029343628, + "grad_norm": 1.4202011823654175, + "learning_rate": 7.816776731769003e-05, + "loss": 5.61, + "step": 18550 + }, + { + "epoch": 0.026078541231599858, + "grad_norm": 1.4051263332366943, + "learning_rate": 7.820991991007446e-05, + "loss": 5.5492, + "step": 18560 + }, + { + "epoch": 0.026092592169763434, + "grad_norm": 1.3520148992538452, + "learning_rate": 7.82520725024589e-05, + "loss": 5.6385, + "step": 18570 + }, + { + "epoch": 0.026106643107927013, + "grad_norm": 1.389690637588501, + "learning_rate": 7.829422509484332e-05, + "loss": 5.482, + "step": 18580 + }, + { + "epoch": 0.02612069404609059, + "grad_norm": 1.3874398469924927, + "learning_rate": 7.833637768722776e-05, + "loss": 5.5605, + "step": 18590 + }, + { + "epoch": 0.026134744984254166, + "grad_norm": 1.4007757902145386, + "learning_rate": 7.83785302796122e-05, + "loss": 5.5494, + "step": 18600 + }, + { + "epoch": 0.026148795922417745, + "grad_norm": 1.5084589719772339, + "learning_rate": 7.842068287199662e-05, + "loss": 5.5122, + "step": 18610 + }, + { + "epoch": 0.02616284686058132, + "grad_norm": 1.4278961420059204, + "learning_rate": 7.846283546438105e-05, + "loss": 5.5855, + "step": 18620 + }, + { + "epoch": 0.0261768977987449, + "grad_norm": 1.4229931831359863, + "learning_rate": 7.85049880567655e-05, + "loss": 5.4738, + "step": 18630 + }, + { + "epoch": 0.026190948736908477, + "grad_norm": 1.4617115259170532, + "learning_rate": 7.854714064914991e-05, + "loss": 5.6064, + "step": 18640 + }, + { + "epoch": 0.026204999675072053, + "grad_norm": 1.335037350654602, + "learning_rate": 7.858929324153434e-05, + "loss": 5.5919, + "step": 18650 + }, + { + "epoch": 0.026219050613235633, + "grad_norm": 1.3201597929000854, + "learning_rate": 7.863144583391879e-05, + "loss": 5.6363, + "step": 18660 + }, + { + "epoch": 0.02623310155139921, + "grad_norm": 1.5056530237197876, + "learning_rate": 7.86735984263032e-05, + 
"loss": 5.5251, + "step": 18670 + }, + { + "epoch": 0.02624715248956279, + "grad_norm": 1.3666268587112427, + "learning_rate": 7.871575101868764e-05, + "loss": 5.5708, + "step": 18680 + }, + { + "epoch": 0.026261203427726365, + "grad_norm": 1.4931973218917847, + "learning_rate": 7.875790361107208e-05, + "loss": 5.5645, + "step": 18690 + }, + { + "epoch": 0.02627525436588994, + "grad_norm": 1.3463979959487915, + "learning_rate": 7.880005620345651e-05, + "loss": 5.4477, + "step": 18700 + }, + { + "epoch": 0.02628930530405352, + "grad_norm": 1.3805086612701416, + "learning_rate": 7.884220879584093e-05, + "loss": 5.6309, + "step": 18710 + }, + { + "epoch": 0.026303356242217097, + "grad_norm": 1.3408585786819458, + "learning_rate": 7.888436138822537e-05, + "loss": 5.5943, + "step": 18720 + }, + { + "epoch": 0.026317407180380677, + "grad_norm": 1.3854987621307373, + "learning_rate": 7.89265139806098e-05, + "loss": 5.6367, + "step": 18730 + }, + { + "epoch": 0.026331458118544253, + "grad_norm": 1.4840235710144043, + "learning_rate": 7.896866657299422e-05, + "loss": 5.6203, + "step": 18740 + }, + { + "epoch": 0.02634550905670783, + "grad_norm": 1.4539438486099243, + "learning_rate": 7.901081916537867e-05, + "loss": 5.6533, + "step": 18750 + }, + { + "epoch": 0.02635955999487141, + "grad_norm": 1.6064668893814087, + "learning_rate": 7.90529717577631e-05, + "loss": 5.57, + "step": 18760 + }, + { + "epoch": 0.026373610933034985, + "grad_norm": 1.3408623933792114, + "learning_rate": 7.909512435014752e-05, + "loss": 5.4955, + "step": 18770 + }, + { + "epoch": 0.02638766187119856, + "grad_norm": 1.9005026817321777, + "learning_rate": 7.913727694253196e-05, + "loss": 5.6209, + "step": 18780 + }, + { + "epoch": 0.02640171280936214, + "grad_norm": 1.2707617282867432, + "learning_rate": 7.917942953491639e-05, + "loss": 5.6083, + "step": 18790 + }, + { + "epoch": 0.026415763747525717, + "grad_norm": 1.494612216949463, + "learning_rate": 7.922158212730081e-05, + "loss": 5.5413, + 
"step": 18800 + }, + { + "epoch": 0.026429814685689296, + "grad_norm": 1.4194583892822266, + "learning_rate": 7.926373471968525e-05, + "loss": 5.5375, + "step": 18810 + }, + { + "epoch": 0.026443865623852873, + "grad_norm": 1.5205409526824951, + "learning_rate": 7.930588731206969e-05, + "loss": 5.5377, + "step": 18820 + }, + { + "epoch": 0.02645791656201645, + "grad_norm": 1.2400825023651123, + "learning_rate": 7.934803990445413e-05, + "loss": 5.5744, + "step": 18830 + }, + { + "epoch": 0.02647196750018003, + "grad_norm": 1.3689380884170532, + "learning_rate": 7.939019249683855e-05, + "loss": 5.618, + "step": 18840 + }, + { + "epoch": 0.026486018438343605, + "grad_norm": 1.4133222103118896, + "learning_rate": 7.943234508922298e-05, + "loss": 5.7025, + "step": 18850 + }, + { + "epoch": 0.026500069376507184, + "grad_norm": 1.4041091203689575, + "learning_rate": 7.947449768160742e-05, + "loss": 5.5289, + "step": 18860 + }, + { + "epoch": 0.02651412031467076, + "grad_norm": 1.2778563499450684, + "learning_rate": 7.951665027399184e-05, + "loss": 5.504, + "step": 18870 + }, + { + "epoch": 0.026528171252834336, + "grad_norm": 1.4318342208862305, + "learning_rate": 7.955880286637627e-05, + "loss": 5.5416, + "step": 18880 + }, + { + "epoch": 0.026542222190997916, + "grad_norm": 1.3022536039352417, + "learning_rate": 7.960095545876072e-05, + "loss": 5.4591, + "step": 18890 + }, + { + "epoch": 0.026556273129161492, + "grad_norm": 1.3003450632095337, + "learning_rate": 7.964310805114514e-05, + "loss": 5.5049, + "step": 18900 + }, + { + "epoch": 0.026570324067325072, + "grad_norm": 1.4249634742736816, + "learning_rate": 7.968526064352957e-05, + "loss": 5.5521, + "step": 18910 + }, + { + "epoch": 0.026584375005488648, + "grad_norm": 2.4674744606018066, + "learning_rate": 7.972741323591401e-05, + "loss": 5.5821, + "step": 18920 + }, + { + "epoch": 0.026598425943652224, + "grad_norm": 1.4836219549179077, + "learning_rate": 7.976956582829843e-05, + "loss": 5.6154, + "step": 18930 + 
}, + { + "epoch": 0.026612476881815804, + "grad_norm": 1.395345687866211, + "learning_rate": 7.981171842068286e-05, + "loss": 5.589, + "step": 18940 + }, + { + "epoch": 0.02662652781997938, + "grad_norm": 1.3151682615280151, + "learning_rate": 7.98538710130673e-05, + "loss": 5.4916, + "step": 18950 + }, + { + "epoch": 0.026640578758142956, + "grad_norm": 1.6428303718566895, + "learning_rate": 7.989602360545172e-05, + "loss": 5.6574, + "step": 18960 + }, + { + "epoch": 0.026654629696306536, + "grad_norm": 1.3292784690856934, + "learning_rate": 7.993817619783615e-05, + "loss": 5.6062, + "step": 18970 + }, + { + "epoch": 0.026668680634470112, + "grad_norm": 1.4000933170318604, + "learning_rate": 7.99803287902206e-05, + "loss": 5.5415, + "step": 18980 + }, + { + "epoch": 0.02668273157263369, + "grad_norm": 1.428321361541748, + "learning_rate": 8.002248138260503e-05, + "loss": 5.5416, + "step": 18990 + }, + { + "epoch": 0.026696782510797268, + "grad_norm": 1.3596489429473877, + "learning_rate": 8.006463397498945e-05, + "loss": 5.6136, + "step": 19000 + }, + { + "epoch": 0.026710833448960844, + "grad_norm": 1.3439053297042847, + "learning_rate": 8.010678656737389e-05, + "loss": 5.5557, + "step": 19010 + }, + { + "epoch": 0.026724884387124424, + "grad_norm": 1.3456841707229614, + "learning_rate": 8.014893915975832e-05, + "loss": 5.484, + "step": 19020 + }, + { + "epoch": 0.026738935325288, + "grad_norm": 1.317115306854248, + "learning_rate": 8.019109175214274e-05, + "loss": 5.6083, + "step": 19030 + }, + { + "epoch": 0.02675298626345158, + "grad_norm": 1.845428466796875, + "learning_rate": 8.023324434452719e-05, + "loss": 5.5963, + "step": 19040 + }, + { + "epoch": 0.026767037201615156, + "grad_norm": 1.309800386428833, + "learning_rate": 8.027539693691162e-05, + "loss": 5.5722, + "step": 19050 + }, + { + "epoch": 0.026781088139778732, + "grad_norm": 1.389327883720398, + "learning_rate": 8.031754952929603e-05, + "loss": 5.4975, + "step": 19060 + }, + { + "epoch": 
0.02679513907794231, + "grad_norm": 1.251827359199524, + "learning_rate": 8.035970212168048e-05, + "loss": 5.5743, + "step": 19070 + }, + { + "epoch": 0.026809190016105888, + "grad_norm": 1.2980040311813354, + "learning_rate": 8.040185471406491e-05, + "loss": 5.5124, + "step": 19080 + }, + { + "epoch": 0.026823240954269464, + "grad_norm": 1.3532140254974365, + "learning_rate": 8.044400730644933e-05, + "loss": 5.5087, + "step": 19090 + }, + { + "epoch": 0.026837291892433043, + "grad_norm": 1.3311684131622314, + "learning_rate": 8.048615989883377e-05, + "loss": 5.5773, + "step": 19100 + }, + { + "epoch": 0.02685134283059662, + "grad_norm": 1.3414361476898193, + "learning_rate": 8.05283124912182e-05, + "loss": 5.4767, + "step": 19110 + }, + { + "epoch": 0.0268653937687602, + "grad_norm": 1.47315514087677, + "learning_rate": 8.057046508360265e-05, + "loss": 5.5304, + "step": 19120 + }, + { + "epoch": 0.026879444706923775, + "grad_norm": 1.3902984857559204, + "learning_rate": 8.061261767598707e-05, + "loss": 5.5592, + "step": 19130 + }, + { + "epoch": 0.02689349564508735, + "grad_norm": 1.3104959726333618, + "learning_rate": 8.06547702683715e-05, + "loss": 5.5947, + "step": 19140 + }, + { + "epoch": 0.02690754658325093, + "grad_norm": 1.377226710319519, + "learning_rate": 8.069692286075594e-05, + "loss": 5.6055, + "step": 19150 + }, + { + "epoch": 0.026921597521414507, + "grad_norm": 1.3036943674087524, + "learning_rate": 8.073907545314036e-05, + "loss": 5.5622, + "step": 19160 + }, + { + "epoch": 0.026935648459578087, + "grad_norm": 1.395580768585205, + "learning_rate": 8.078122804552479e-05, + "loss": 5.4697, + "step": 19170 + }, + { + "epoch": 0.026949699397741663, + "grad_norm": 1.2811263799667358, + "learning_rate": 8.082338063790923e-05, + "loss": 5.5932, + "step": 19180 + }, + { + "epoch": 0.02696375033590524, + "grad_norm": 1.2819894552230835, + "learning_rate": 8.086553323029365e-05, + "loss": 5.5619, + "step": 19190 + }, + { + "epoch": 0.02697780127406882, + 
"grad_norm": 1.2503257989883423, + "learning_rate": 8.090768582267808e-05, + "loss": 5.6324, + "step": 19200 + }, + { + "epoch": 0.026991852212232395, + "grad_norm": 1.6546611785888672, + "learning_rate": 8.094983841506253e-05, + "loss": 5.451, + "step": 19210 + }, + { + "epoch": 0.027005903150395975, + "grad_norm": 1.3742631673812866, + "learning_rate": 8.099199100744695e-05, + "loss": 5.6103, + "step": 19220 + }, + { + "epoch": 0.02701995408855955, + "grad_norm": 1.5028876066207886, + "learning_rate": 8.103414359983138e-05, + "loss": 5.4252, + "step": 19230 + }, + { + "epoch": 0.027034005026723127, + "grad_norm": 1.4231295585632324, + "learning_rate": 8.107629619221582e-05, + "loss": 5.5802, + "step": 19240 + }, + { + "epoch": 0.027048055964886707, + "grad_norm": 1.3292512893676758, + "learning_rate": 8.111844878460025e-05, + "loss": 5.4812, + "step": 19250 + }, + { + "epoch": 0.027062106903050283, + "grad_norm": 1.4668748378753662, + "learning_rate": 8.116060137698467e-05, + "loss": 5.5739, + "step": 19260 + }, + { + "epoch": 0.02707615784121386, + "grad_norm": 1.3313186168670654, + "learning_rate": 8.120275396936912e-05, + "loss": 5.5789, + "step": 19270 + }, + { + "epoch": 0.02709020877937744, + "grad_norm": 1.3527790307998657, + "learning_rate": 8.124490656175355e-05, + "loss": 5.4486, + "step": 19280 + }, + { + "epoch": 0.027104259717541015, + "grad_norm": 1.3377543687820435, + "learning_rate": 8.128705915413796e-05, + "loss": 5.4651, + "step": 19290 + }, + { + "epoch": 0.027118310655704594, + "grad_norm": 1.3352538347244263, + "learning_rate": 8.132921174652241e-05, + "loss": 5.4763, + "step": 19300 + }, + { + "epoch": 0.02713236159386817, + "grad_norm": 1.3395129442214966, + "learning_rate": 8.137136433890684e-05, + "loss": 5.4819, + "step": 19310 + }, + { + "epoch": 0.027146412532031747, + "grad_norm": 1.3543874025344849, + "learning_rate": 8.141351693129126e-05, + "loss": 5.5129, + "step": 19320 + }, + { + "epoch": 0.027160463470195326, + "grad_norm": 
1.3789215087890625, + "learning_rate": 8.14556695236757e-05, + "loss": 5.5339, + "step": 19330 + }, + { + "epoch": 0.027174514408358903, + "grad_norm": 1.388160228729248, + "learning_rate": 8.149782211606013e-05, + "loss": 5.5459, + "step": 19340 + }, + { + "epoch": 0.027188565346522482, + "grad_norm": 1.3371630907058716, + "learning_rate": 8.153997470844455e-05, + "loss": 5.6077, + "step": 19350 + }, + { + "epoch": 0.02720261628468606, + "grad_norm": 1.3276069164276123, + "learning_rate": 8.1582127300829e-05, + "loss": 5.5951, + "step": 19360 + }, + { + "epoch": 0.027216667222849635, + "grad_norm": 1.3663477897644043, + "learning_rate": 8.162427989321343e-05, + "loss": 5.581, + "step": 19370 + }, + { + "epoch": 0.027230718161013214, + "grad_norm": 1.48804771900177, + "learning_rate": 8.166643248559784e-05, + "loss": 5.6145, + "step": 19380 + }, + { + "epoch": 0.02724476909917679, + "grad_norm": 1.3625435829162598, + "learning_rate": 8.170858507798229e-05, + "loss": 5.4211, + "step": 19390 + }, + { + "epoch": 0.027258820037340366, + "grad_norm": 1.324641227722168, + "learning_rate": 8.175073767036672e-05, + "loss": 5.429, + "step": 19400 + }, + { + "epoch": 0.027272870975503946, + "grad_norm": 1.3358043432235718, + "learning_rate": 8.179289026275117e-05, + "loss": 5.4692, + "step": 19410 + }, + { + "epoch": 0.027286921913667522, + "grad_norm": 1.48837411403656, + "learning_rate": 8.183504285513558e-05, + "loss": 5.5711, + "step": 19420 + }, + { + "epoch": 0.027300972851831102, + "grad_norm": 1.3000835180282593, + "learning_rate": 8.187719544752001e-05, + "loss": 5.4646, + "step": 19430 + }, + { + "epoch": 0.027315023789994678, + "grad_norm": 1.4490911960601807, + "learning_rate": 8.191934803990446e-05, + "loss": 5.495, + "step": 19440 + }, + { + "epoch": 0.027329074728158254, + "grad_norm": 1.3494820594787598, + "learning_rate": 8.196150063228888e-05, + "loss": 5.573, + "step": 19450 + }, + { + "epoch": 0.027343125666321834, + "grad_norm": 1.3359415531158447, + 
"learning_rate": 8.200365322467331e-05, + "loss": 5.6712, + "step": 19460 + }, + { + "epoch": 0.02735717660448541, + "grad_norm": 1.3212742805480957, + "learning_rate": 8.204580581705775e-05, + "loss": 5.6058, + "step": 19470 + }, + { + "epoch": 0.02737122754264899, + "grad_norm": 1.3946962356567383, + "learning_rate": 8.208795840944217e-05, + "loss": 5.4978, + "step": 19480 + }, + { + "epoch": 0.027385278480812566, + "grad_norm": 1.3919817209243774, + "learning_rate": 8.21301110018266e-05, + "loss": 5.5974, + "step": 19490 + }, + { + "epoch": 0.027399329418976142, + "grad_norm": 1.320351481437683, + "learning_rate": 8.217226359421105e-05, + "loss": 5.5079, + "step": 19500 + }, + { + "epoch": 0.02741338035713972, + "grad_norm": 1.326079249382019, + "learning_rate": 8.221441618659546e-05, + "loss": 5.5104, + "step": 19510 + }, + { + "epoch": 0.027427431295303298, + "grad_norm": 1.6833606958389282, + "learning_rate": 8.22565687789799e-05, + "loss": 5.5943, + "step": 19520 + }, + { + "epoch": 0.027441482233466877, + "grad_norm": 1.373024344444275, + "learning_rate": 8.229872137136434e-05, + "loss": 5.5817, + "step": 19530 + }, + { + "epoch": 0.027455533171630454, + "grad_norm": 1.2997173070907593, + "learning_rate": 8.234087396374877e-05, + "loss": 5.5115, + "step": 19540 + }, + { + "epoch": 0.02746958410979403, + "grad_norm": 1.349067211151123, + "learning_rate": 8.238302655613319e-05, + "loss": 5.5306, + "step": 19550 + }, + { + "epoch": 0.02748363504795761, + "grad_norm": 1.379137396812439, + "learning_rate": 8.242517914851763e-05, + "loss": 5.5341, + "step": 19560 + }, + { + "epoch": 0.027497685986121186, + "grad_norm": 1.4701979160308838, + "learning_rate": 8.246733174090206e-05, + "loss": 5.4278, + "step": 19570 + }, + { + "epoch": 0.027511736924284762, + "grad_norm": 1.373063087463379, + "learning_rate": 8.250948433328648e-05, + "loss": 5.4814, + "step": 19580 + }, + { + "epoch": 0.02752578786244834, + "grad_norm": 1.3411657810211182, + "learning_rate": 
8.255163692567093e-05, + "loss": 5.5641, + "step": 19590 + }, + { + "epoch": 0.027539838800611918, + "grad_norm": 1.2834957838058472, + "learning_rate": 8.259378951805536e-05, + "loss": 5.5242, + "step": 19600 + }, + { + "epoch": 0.027553889738775497, + "grad_norm": 1.3774769306182861, + "learning_rate": 8.263594211043978e-05, + "loss": 5.6078, + "step": 19610 + }, + { + "epoch": 0.027567940676939073, + "grad_norm": 1.3527755737304688, + "learning_rate": 8.267809470282422e-05, + "loss": 5.443, + "step": 19620 + }, + { + "epoch": 0.02758199161510265, + "grad_norm": 1.8241328001022339, + "learning_rate": 8.272024729520865e-05, + "loss": 5.62, + "step": 19630 + }, + { + "epoch": 0.02759604255326623, + "grad_norm": 1.3106392621994019, + "learning_rate": 8.276239988759307e-05, + "loss": 5.5135, + "step": 19640 + }, + { + "epoch": 0.027610093491429805, + "grad_norm": 1.3482909202575684, + "learning_rate": 8.280455247997751e-05, + "loss": 5.4549, + "step": 19650 + }, + { + "epoch": 0.027624144429593385, + "grad_norm": 1.3328837156295776, + "learning_rate": 8.284670507236194e-05, + "loss": 5.4759, + "step": 19660 + }, + { + "epoch": 0.02763819536775696, + "grad_norm": 1.3374454975128174, + "learning_rate": 8.288885766474636e-05, + "loss": 5.5537, + "step": 19670 + }, + { + "epoch": 0.027652246305920537, + "grad_norm": 1.3683466911315918, + "learning_rate": 8.293101025713081e-05, + "loss": 5.462, + "step": 19680 + }, + { + "epoch": 0.027666297244084117, + "grad_norm": 1.3449815511703491, + "learning_rate": 8.297316284951524e-05, + "loss": 5.5387, + "step": 19690 + }, + { + "epoch": 0.027680348182247693, + "grad_norm": 1.3451807498931885, + "learning_rate": 8.301531544189968e-05, + "loss": 5.5744, + "step": 19700 + }, + { + "epoch": 0.02769439912041127, + "grad_norm": 1.3333075046539307, + "learning_rate": 8.30574680342841e-05, + "loss": 5.5967, + "step": 19710 + }, + { + "epoch": 0.02770845005857485, + "grad_norm": 1.331404685974121, + "learning_rate": 
8.309962062666853e-05, + "loss": 5.5404, + "step": 19720 + }, + { + "epoch": 0.027722500996738425, + "grad_norm": 2.0781617164611816, + "learning_rate": 8.314177321905298e-05, + "loss": 5.4125, + "step": 19730 + }, + { + "epoch": 0.027736551934902005, + "grad_norm": 1.341507077217102, + "learning_rate": 8.31839258114374e-05, + "loss": 5.5681, + "step": 19740 + }, + { + "epoch": 0.02775060287306558, + "grad_norm": 1.3700333833694458, + "learning_rate": 8.322607840382183e-05, + "loss": 5.5813, + "step": 19750 + }, + { + "epoch": 0.027764653811229157, + "grad_norm": 1.3675099611282349, + "learning_rate": 8.326823099620627e-05, + "loss": 5.4456, + "step": 19760 + }, + { + "epoch": 0.027778704749392737, + "grad_norm": 1.3649263381958008, + "learning_rate": 8.331038358859069e-05, + "loss": 5.5941, + "step": 19770 + }, + { + "epoch": 0.027792755687556313, + "grad_norm": 1.3281179666519165, + "learning_rate": 8.335253618097512e-05, + "loss": 5.4776, + "step": 19780 + }, + { + "epoch": 0.027806806625719892, + "grad_norm": 1.314491629600525, + "learning_rate": 8.339468877335956e-05, + "loss": 5.6428, + "step": 19790 + }, + { + "epoch": 0.02782085756388347, + "grad_norm": 1.4704593420028687, + "learning_rate": 8.343684136574398e-05, + "loss": 5.5351, + "step": 19800 + }, + { + "epoch": 0.027834908502047045, + "grad_norm": 1.364715337753296, + "learning_rate": 8.347899395812841e-05, + "loss": 5.6232, + "step": 19810 + }, + { + "epoch": 0.027848959440210624, + "grad_norm": 1.2975870370864868, + "learning_rate": 8.352114655051286e-05, + "loss": 5.5265, + "step": 19820 + }, + { + "epoch": 0.0278630103783742, + "grad_norm": 1.2619905471801758, + "learning_rate": 8.356329914289729e-05, + "loss": 5.5378, + "step": 19830 + }, + { + "epoch": 0.02787706131653778, + "grad_norm": 1.4168630838394165, + "learning_rate": 8.36054517352817e-05, + "loss": 5.5177, + "step": 19840 + }, + { + "epoch": 0.027891112254701356, + "grad_norm": 1.4585949182510376, + "learning_rate": 
8.364760432766615e-05, + "loss": 5.4355, + "step": 19850 + }, + { + "epoch": 0.027905163192864933, + "grad_norm": 1.356927752494812, + "learning_rate": 8.368975692005058e-05, + "loss": 5.5808, + "step": 19860 + }, + { + "epoch": 0.027919214131028512, + "grad_norm": 1.495552659034729, + "learning_rate": 8.3731909512435e-05, + "loss": 5.5017, + "step": 19870 + }, + { + "epoch": 0.02793326506919209, + "grad_norm": 1.2983280420303345, + "learning_rate": 8.377406210481944e-05, + "loss": 5.4182, + "step": 19880 + }, + { + "epoch": 0.027947316007355665, + "grad_norm": 1.3719418048858643, + "learning_rate": 8.381621469720387e-05, + "loss": 5.5936, + "step": 19890 + }, + { + "epoch": 0.027961366945519244, + "grad_norm": 1.450944423675537, + "learning_rate": 8.385836728958829e-05, + "loss": 5.4513, + "step": 19900 + }, + { + "epoch": 0.02797541788368282, + "grad_norm": 1.314232587814331, + "learning_rate": 8.390051988197274e-05, + "loss": 5.4597, + "step": 19910 + }, + { + "epoch": 0.0279894688218464, + "grad_norm": 1.274675726890564, + "learning_rate": 8.394267247435717e-05, + "loss": 5.4734, + "step": 19920 + }, + { + "epoch": 0.028003519760009976, + "grad_norm": 1.3578556776046753, + "learning_rate": 8.398482506674159e-05, + "loss": 5.4294, + "step": 19930 + }, + { + "epoch": 0.028017570698173552, + "grad_norm": 1.3357514142990112, + "learning_rate": 8.402697765912603e-05, + "loss": 5.4595, + "step": 19940 + }, + { + "epoch": 0.028031621636337132, + "grad_norm": 1.3431594371795654, + "learning_rate": 8.406913025151046e-05, + "loss": 5.5151, + "step": 19950 + }, + { + "epoch": 0.028045672574500708, + "grad_norm": 1.5500048398971558, + "learning_rate": 8.41112828438949e-05, + "loss": 5.5227, + "step": 19960 + }, + { + "epoch": 0.028059723512664288, + "grad_norm": 1.3698660135269165, + "learning_rate": 8.415343543627932e-05, + "loss": 5.5025, + "step": 19970 + }, + { + "epoch": 0.028073774450827864, + "grad_norm": 1.332960844039917, + "learning_rate": 8.419558802866376e-05, 
+ "loss": 5.5337, + "step": 19980 + }, + { + "epoch": 0.02808782538899144, + "grad_norm": 1.2651346921920776, + "learning_rate": 8.42377406210482e-05, + "loss": 5.5576, + "step": 19990 + }, + { + "epoch": 0.02810187632715502, + "grad_norm": 1.3758925199508667, + "learning_rate": 8.427989321343262e-05, + "loss": 5.3875, + "step": 20000 + }, + { + "epoch": 0.028115927265318596, + "grad_norm": 1.412123203277588, + "learning_rate": 8.432204580581705e-05, + "loss": 5.6327, + "step": 20010 + }, + { + "epoch": 0.028129978203482175, + "grad_norm": 1.3514657020568848, + "learning_rate": 8.43641983982015e-05, + "loss": 5.5142, + "step": 20020 + }, + { + "epoch": 0.02814402914164575, + "grad_norm": 1.4179363250732422, + "learning_rate": 8.440635099058591e-05, + "loss": 5.5414, + "step": 20030 + }, + { + "epoch": 0.028158080079809328, + "grad_norm": 1.3700520992279053, + "learning_rate": 8.444850358297034e-05, + "loss": 5.5191, + "step": 20040 + }, + { + "epoch": 0.028172131017972907, + "grad_norm": 1.3933957815170288, + "learning_rate": 8.449065617535479e-05, + "loss": 5.5648, + "step": 20050 + }, + { + "epoch": 0.028186181956136484, + "grad_norm": 1.2371602058410645, + "learning_rate": 8.45328087677392e-05, + "loss": 5.5392, + "step": 20060 + }, + { + "epoch": 0.02820023289430006, + "grad_norm": 1.3389537334442139, + "learning_rate": 8.457496136012364e-05, + "loss": 5.4112, + "step": 20070 + }, + { + "epoch": 0.02821428383246364, + "grad_norm": 1.3131827116012573, + "learning_rate": 8.461711395250808e-05, + "loss": 5.5364, + "step": 20080 + }, + { + "epoch": 0.028228334770627216, + "grad_norm": 1.3104963302612305, + "learning_rate": 8.46592665448925e-05, + "loss": 5.4475, + "step": 20090 + }, + { + "epoch": 0.028242385708790795, + "grad_norm": 1.3459300994873047, + "learning_rate": 8.470141913727693e-05, + "loss": 5.5386, + "step": 20100 + }, + { + "epoch": 0.02825643664695437, + "grad_norm": 1.3658233880996704, + "learning_rate": 8.474357172966137e-05, + "loss": 5.5473, + 
"step": 20110 + }, + { + "epoch": 0.028270487585117948, + "grad_norm": 1.3540921211242676, + "learning_rate": 8.47857243220458e-05, + "loss": 5.4997, + "step": 20120 + }, + { + "epoch": 0.028284538523281527, + "grad_norm": 1.2725752592086792, + "learning_rate": 8.482787691443022e-05, + "loss": 5.5258, + "step": 20130 + }, + { + "epoch": 0.028298589461445103, + "grad_norm": 1.3213703632354736, + "learning_rate": 8.487002950681467e-05, + "loss": 5.5548, + "step": 20140 + }, + { + "epoch": 0.028312640399608683, + "grad_norm": 1.304655909538269, + "learning_rate": 8.49121820991991e-05, + "loss": 5.648, + "step": 20150 + }, + { + "epoch": 0.02832669133777226, + "grad_norm": 1.2564914226531982, + "learning_rate": 8.495433469158352e-05, + "loss": 5.5097, + "step": 20160 + }, + { + "epoch": 0.028340742275935835, + "grad_norm": 1.2632144689559937, + "learning_rate": 8.499648728396796e-05, + "loss": 5.4913, + "step": 20170 + }, + { + "epoch": 0.028354793214099415, + "grad_norm": 1.2769023180007935, + "learning_rate": 8.503863987635239e-05, + "loss": 5.504, + "step": 20180 + }, + { + "epoch": 0.02836884415226299, + "grad_norm": 1.3045737743377686, + "learning_rate": 8.508079246873681e-05, + "loss": 5.3591, + "step": 20190 + }, + { + "epoch": 0.028382895090426567, + "grad_norm": 1.3489607572555542, + "learning_rate": 8.512294506112125e-05, + "loss": 5.4323, + "step": 20200 + }, + { + "epoch": 0.028396946028590147, + "grad_norm": 1.3132845163345337, + "learning_rate": 8.516509765350569e-05, + "loss": 5.5967, + "step": 20210 + }, + { + "epoch": 0.028410996966753723, + "grad_norm": 1.3848180770874023, + "learning_rate": 8.52072502458901e-05, + "loss": 5.5421, + "step": 20220 + }, + { + "epoch": 0.028425047904917303, + "grad_norm": 1.3170721530914307, + "learning_rate": 8.524940283827455e-05, + "loss": 5.5576, + "step": 20230 + }, + { + "epoch": 0.02843909884308088, + "grad_norm": 1.2676606178283691, + "learning_rate": 8.529155543065898e-05, + "loss": 5.5059, + "step": 20240 + }, 
+ { + "epoch": 0.028453149781244455, + "grad_norm": 1.3508251905441284, + "learning_rate": 8.533370802304342e-05, + "loss": 5.3921, + "step": 20250 + }, + { + "epoch": 0.028467200719408035, + "grad_norm": 1.4981266260147095, + "learning_rate": 8.537586061542784e-05, + "loss": 5.4407, + "step": 20260 + }, + { + "epoch": 0.02848125165757161, + "grad_norm": 1.2797385454177856, + "learning_rate": 8.541801320781227e-05, + "loss": 5.4869, + "step": 20270 + }, + { + "epoch": 0.02849530259573519, + "grad_norm": 1.394260287284851, + "learning_rate": 8.546016580019672e-05, + "loss": 5.5474, + "step": 20280 + }, + { + "epoch": 0.028509353533898767, + "grad_norm": 1.4078445434570312, + "learning_rate": 8.550231839258114e-05, + "loss": 5.384, + "step": 20290 + }, + { + "epoch": 0.028523404472062343, + "grad_norm": 1.7833536863327026, + "learning_rate": 8.554447098496557e-05, + "loss": 5.4554, + "step": 20300 + }, + { + "epoch": 0.028537455410225922, + "grad_norm": 1.3692078590393066, + "learning_rate": 8.558662357735001e-05, + "loss": 5.4846, + "step": 20310 + }, + { + "epoch": 0.0285515063483895, + "grad_norm": 1.3532313108444214, + "learning_rate": 8.562877616973443e-05, + "loss": 5.3635, + "step": 20320 + }, + { + "epoch": 0.028565557286553078, + "grad_norm": 1.2888007164001465, + "learning_rate": 8.567092876211886e-05, + "loss": 5.5104, + "step": 20330 + }, + { + "epoch": 0.028579608224716654, + "grad_norm": 1.3509674072265625, + "learning_rate": 8.57130813545033e-05, + "loss": 5.5451, + "step": 20340 + }, + { + "epoch": 0.02859365916288023, + "grad_norm": 1.3917303085327148, + "learning_rate": 8.575523394688772e-05, + "loss": 5.4849, + "step": 20350 + }, + { + "epoch": 0.02860771010104381, + "grad_norm": 1.3092026710510254, + "learning_rate": 8.579738653927215e-05, + "loss": 5.3983, + "step": 20360 + }, + { + "epoch": 0.028621761039207386, + "grad_norm": 1.3765193223953247, + "learning_rate": 8.58395391316566e-05, + "loss": 5.5259, + "step": 20370 + }, + { + "epoch": 
0.028635811977370963, + "grad_norm": 1.38758385181427, + "learning_rate": 8.588169172404102e-05, + "loss": 5.4927, + "step": 20380 + }, + { + "epoch": 0.028649862915534542, + "grad_norm": 1.3264268636703491, + "learning_rate": 8.592384431642545e-05, + "loss": 5.5557, + "step": 20390 + }, + { + "epoch": 0.02866391385369812, + "grad_norm": 1.3711224794387817, + "learning_rate": 8.596599690880989e-05, + "loss": 5.4997, + "step": 20400 + }, + { + "epoch": 0.028677964791861698, + "grad_norm": 1.430208683013916, + "learning_rate": 8.600814950119432e-05, + "loss": 5.3837, + "step": 20410 + }, + { + "epoch": 0.028692015730025274, + "grad_norm": 1.2916505336761475, + "learning_rate": 8.605030209357874e-05, + "loss": 5.5252, + "step": 20420 + }, + { + "epoch": 0.02870606666818885, + "grad_norm": 1.300502061843872, + "learning_rate": 8.609245468596319e-05, + "loss": 5.4593, + "step": 20430 + }, + { + "epoch": 0.02872011760635243, + "grad_norm": 1.2897007465362549, + "learning_rate": 8.613460727834762e-05, + "loss": 5.4996, + "step": 20440 + }, + { + "epoch": 0.028734168544516006, + "grad_norm": 1.3202191591262817, + "learning_rate": 8.617675987073203e-05, + "loss": 5.3855, + "step": 20450 + }, + { + "epoch": 0.028748219482679586, + "grad_norm": 1.3272267580032349, + "learning_rate": 8.621891246311648e-05, + "loss": 5.5036, + "step": 20460 + }, + { + "epoch": 0.028762270420843162, + "grad_norm": 1.308984398841858, + "learning_rate": 8.626106505550091e-05, + "loss": 5.5322, + "step": 20470 + }, + { + "epoch": 0.028776321359006738, + "grad_norm": 1.3932908773422241, + "learning_rate": 8.630321764788533e-05, + "loss": 5.524, + "step": 20480 + }, + { + "epoch": 0.028790372297170318, + "grad_norm": 1.3947856426239014, + "learning_rate": 8.634537024026977e-05, + "loss": 5.5781, + "step": 20490 + }, + { + "epoch": 0.028804423235333894, + "grad_norm": 1.2810063362121582, + "learning_rate": 8.63875228326542e-05, + "loss": 5.4543, + "step": 20500 + }, + { + "epoch": 0.02881847417349747, 
+ "grad_norm": 1.3561229705810547, + "learning_rate": 8.642967542503862e-05, + "loss": 5.5105, + "step": 20510 + }, + { + "epoch": 0.02883252511166105, + "grad_norm": 1.3260747194290161, + "learning_rate": 8.647182801742307e-05, + "loss": 5.451, + "step": 20520 + }, + { + "epoch": 0.028846576049824626, + "grad_norm": 1.3854423761367798, + "learning_rate": 8.65139806098075e-05, + "loss": 5.4256, + "step": 20530 + }, + { + "epoch": 0.028860626987988205, + "grad_norm": 1.3165987730026245, + "learning_rate": 8.655613320219194e-05, + "loss": 5.3924, + "step": 20540 + }, + { + "epoch": 0.02887467792615178, + "grad_norm": 2.6763813495635986, + "learning_rate": 8.659828579457636e-05, + "loss": 5.524, + "step": 20550 + }, + { + "epoch": 0.028888728864315358, + "grad_norm": 1.459052324295044, + "learning_rate": 8.664043838696079e-05, + "loss": 5.4794, + "step": 20560 + }, + { + "epoch": 0.028902779802478937, + "grad_norm": 1.276800274848938, + "learning_rate": 8.668259097934523e-05, + "loss": 5.4635, + "step": 20570 + }, + { + "epoch": 0.028916830740642514, + "grad_norm": 1.3238308429718018, + "learning_rate": 8.672474357172965e-05, + "loss": 5.5032, + "step": 20580 + }, + { + "epoch": 0.028930881678806093, + "grad_norm": 1.308498740196228, + "learning_rate": 8.676689616411408e-05, + "loss": 5.5274, + "step": 20590 + }, + { + "epoch": 0.02894493261696967, + "grad_norm": 1.2564195394515991, + "learning_rate": 8.680904875649853e-05, + "loss": 5.5354, + "step": 20600 + }, + { + "epoch": 0.028958983555133246, + "grad_norm": 1.276263952255249, + "learning_rate": 8.685120134888295e-05, + "loss": 5.4732, + "step": 20610 + }, + { + "epoch": 0.028973034493296825, + "grad_norm": 1.257238745689392, + "learning_rate": 8.689335394126738e-05, + "loss": 5.5754, + "step": 20620 + }, + { + "epoch": 0.0289870854314604, + "grad_norm": 1.402646541595459, + "learning_rate": 8.693550653365182e-05, + "loss": 5.5832, + "step": 20630 + }, + { + "epoch": 0.02900113636962398, + "grad_norm": 
1.2543669939041138, + "learning_rate": 8.697765912603624e-05, + "loss": 5.5227, + "step": 20640 + }, + { + "epoch": 0.029015187307787557, + "grad_norm": 1.3000344038009644, + "learning_rate": 8.701981171842067e-05, + "loss": 5.3653, + "step": 20650 + }, + { + "epoch": 0.029029238245951133, + "grad_norm": 1.3282537460327148, + "learning_rate": 8.706196431080512e-05, + "loss": 5.4161, + "step": 20660 + }, + { + "epoch": 0.029043289184114713, + "grad_norm": 1.3235125541687012, + "learning_rate": 8.710411690318953e-05, + "loss": 5.5692, + "step": 20670 + }, + { + "epoch": 0.02905734012227829, + "grad_norm": 1.3211613893508911, + "learning_rate": 8.714626949557396e-05, + "loss": 5.4011, + "step": 20680 + }, + { + "epoch": 0.029071391060441865, + "grad_norm": 1.3463915586471558, + "learning_rate": 8.718842208795841e-05, + "loss": 5.5313, + "step": 20690 + }, + { + "epoch": 0.029085441998605445, + "grad_norm": 1.304567813873291, + "learning_rate": 8.723057468034284e-05, + "loss": 5.3108, + "step": 20700 + }, + { + "epoch": 0.02909949293676902, + "grad_norm": 1.2378467321395874, + "learning_rate": 8.727272727272726e-05, + "loss": 5.5691, + "step": 20710 + }, + { + "epoch": 0.0291135438749326, + "grad_norm": 1.2816358804702759, + "learning_rate": 8.73148798651117e-05, + "loss": 5.4171, + "step": 20720 + }, + { + "epoch": 0.029127594813096177, + "grad_norm": 1.467879056930542, + "learning_rate": 8.735703245749613e-05, + "loss": 5.424, + "step": 20730 + }, + { + "epoch": 0.029141645751259753, + "grad_norm": 1.2690702676773071, + "learning_rate": 8.739918504988055e-05, + "loss": 5.5673, + "step": 20740 + }, + { + "epoch": 0.029155696689423333, + "grad_norm": 1.2710336446762085, + "learning_rate": 8.7441337642265e-05, + "loss": 5.47, + "step": 20750 + }, + { + "epoch": 0.02916974762758691, + "grad_norm": 1.2570821046829224, + "learning_rate": 8.748349023464943e-05, + "loss": 5.5539, + "step": 20760 + }, + { + "epoch": 0.02918379856575049, + "grad_norm": 1.3138800859451294, + 
"learning_rate": 8.752564282703384e-05, + "loss": 5.4017, + "step": 20770 + }, + { + "epoch": 0.029197849503914065, + "grad_norm": 1.3933358192443848, + "learning_rate": 8.756779541941829e-05, + "loss": 5.3666, + "step": 20780 + }, + { + "epoch": 0.02921190044207764, + "grad_norm": 1.499240517616272, + "learning_rate": 8.760994801180272e-05, + "loss": 5.4957, + "step": 20790 + }, + { + "epoch": 0.02922595138024122, + "grad_norm": 1.332919955253601, + "learning_rate": 8.765210060418714e-05, + "loss": 5.4886, + "step": 20800 + }, + { + "epoch": 0.029240002318404797, + "grad_norm": 1.2993764877319336, + "learning_rate": 8.769425319657158e-05, + "loss": 5.5195, + "step": 20810 + }, + { + "epoch": 0.029254053256568373, + "grad_norm": 1.3067516088485718, + "learning_rate": 8.773640578895601e-05, + "loss": 5.5169, + "step": 20820 + }, + { + "epoch": 0.029268104194731952, + "grad_norm": 1.28152596950531, + "learning_rate": 8.777855838134046e-05, + "loss": 5.4082, + "step": 20830 + }, + { + "epoch": 0.02928215513289553, + "grad_norm": 1.3164806365966797, + "learning_rate": 8.782071097372488e-05, + "loss": 5.5273, + "step": 20840 + }, + { + "epoch": 0.029296206071059108, + "grad_norm": 1.3631839752197266, + "learning_rate": 8.786286356610931e-05, + "loss": 5.4297, + "step": 20850 + }, + { + "epoch": 0.029310257009222684, + "grad_norm": 1.3792098760604858, + "learning_rate": 8.790501615849375e-05, + "loss": 5.4204, + "step": 20860 + }, + { + "epoch": 0.02932430794738626, + "grad_norm": 1.3812150955200195, + "learning_rate": 8.794716875087817e-05, + "loss": 5.3696, + "step": 20870 + }, + { + "epoch": 0.02933835888554984, + "grad_norm": 1.2828326225280762, + "learning_rate": 8.79893213432626e-05, + "loss": 5.5572, + "step": 20880 + }, + { + "epoch": 0.029352409823713416, + "grad_norm": 1.4241127967834473, + "learning_rate": 8.803147393564705e-05, + "loss": 5.3305, + "step": 20890 + }, + { + "epoch": 0.029366460761876996, + "grad_norm": 1.386412501335144, + "learning_rate": 
8.807362652803146e-05, + "loss": 5.5092, + "step": 20900 + }, + { + "epoch": 0.029380511700040572, + "grad_norm": 1.2819172143936157, + "learning_rate": 8.81157791204159e-05, + "loss": 5.3611, + "step": 20910 + }, + { + "epoch": 0.02939456263820415, + "grad_norm": 1.305132269859314, + "learning_rate": 8.815793171280034e-05, + "loss": 5.5092, + "step": 20920 + }, + { + "epoch": 0.029408613576367728, + "grad_norm": 1.359618902206421, + "learning_rate": 8.820008430518476e-05, + "loss": 5.5222, + "step": 20930 + }, + { + "epoch": 0.029422664514531304, + "grad_norm": 1.3207499980926514, + "learning_rate": 8.824223689756919e-05, + "loss": 5.4073, + "step": 20940 + }, + { + "epoch": 0.029436715452694884, + "grad_norm": 1.4006059169769287, + "learning_rate": 8.828438948995363e-05, + "loss": 5.4006, + "step": 20950 + }, + { + "epoch": 0.02945076639085846, + "grad_norm": 1.3299939632415771, + "learning_rate": 8.832654208233806e-05, + "loss": 5.5144, + "step": 20960 + }, + { + "epoch": 0.029464817329022036, + "grad_norm": 1.2843600511550903, + "learning_rate": 8.836869467472248e-05, + "loss": 5.4639, + "step": 20970 + }, + { + "epoch": 0.029478868267185616, + "grad_norm": 1.2437927722930908, + "learning_rate": 8.841084726710693e-05, + "loss": 5.5152, + "step": 20980 + }, + { + "epoch": 0.029492919205349192, + "grad_norm": 1.3445243835449219, + "learning_rate": 8.845299985949136e-05, + "loss": 5.5151, + "step": 20990 + }, + { + "epoch": 0.029506970143512768, + "grad_norm": 1.3153207302093506, + "learning_rate": 8.849515245187578e-05, + "loss": 5.5303, + "step": 21000 + }, + { + "epoch": 0.029521021081676348, + "grad_norm": 1.3267221450805664, + "learning_rate": 8.853730504426022e-05, + "loss": 5.5593, + "step": 21010 + }, + { + "epoch": 0.029535072019839924, + "grad_norm": 1.3878754377365112, + "learning_rate": 8.857945763664465e-05, + "loss": 5.4331, + "step": 21020 + }, + { + "epoch": 0.029549122958003503, + "grad_norm": 1.3253564834594727, + "learning_rate": 
8.862161022902907e-05, + "loss": 5.4197, + "step": 21030 + }, + { + "epoch": 0.02956317389616708, + "grad_norm": 1.2962226867675781, + "learning_rate": 8.866376282141351e-05, + "loss": 5.4463, + "step": 21040 + }, + { + "epoch": 0.029577224834330656, + "grad_norm": 1.2892351150512695, + "learning_rate": 8.870591541379794e-05, + "loss": 5.4884, + "step": 21050 + }, + { + "epoch": 0.029591275772494235, + "grad_norm": 1.435368537902832, + "learning_rate": 8.874806800618236e-05, + "loss": 5.5157, + "step": 21060 + }, + { + "epoch": 0.02960532671065781, + "grad_norm": 1.3492792844772339, + "learning_rate": 8.879022059856681e-05, + "loss": 5.498, + "step": 21070 + }, + { + "epoch": 0.02961937764882139, + "grad_norm": 1.3389153480529785, + "learning_rate": 8.883237319095124e-05, + "loss": 5.4208, + "step": 21080 + }, + { + "epoch": 0.029633428586984967, + "grad_norm": 1.3750202655792236, + "learning_rate": 8.887452578333566e-05, + "loss": 5.3824, + "step": 21090 + }, + { + "epoch": 0.029647479525148544, + "grad_norm": 1.3427931070327759, + "learning_rate": 8.89166783757201e-05, + "loss": 5.4627, + "step": 21100 + }, + { + "epoch": 0.029661530463312123, + "grad_norm": 1.3169935941696167, + "learning_rate": 8.895883096810453e-05, + "loss": 5.4553, + "step": 21110 + }, + { + "epoch": 0.0296755814014757, + "grad_norm": 1.3012065887451172, + "learning_rate": 8.900098356048898e-05, + "loss": 5.4464, + "step": 21120 + }, + { + "epoch": 0.029689632339639276, + "grad_norm": 1.3812952041625977, + "learning_rate": 8.90431361528734e-05, + "loss": 5.4815, + "step": 21130 + }, + { + "epoch": 0.029703683277802855, + "grad_norm": 1.336200475692749, + "learning_rate": 8.908528874525782e-05, + "loss": 5.3232, + "step": 21140 + }, + { + "epoch": 0.02971773421596643, + "grad_norm": 1.2766376733779907, + "learning_rate": 8.912744133764227e-05, + "loss": 5.4727, + "step": 21150 + }, + { + "epoch": 0.02973178515413001, + "grad_norm": 1.391106367111206, + "learning_rate": 8.916959393002669e-05, 
+ "loss": 5.5038, + "step": 21160 + }, + { + "epoch": 0.029745836092293587, + "grad_norm": 1.3515926599502563, + "learning_rate": 8.921174652241112e-05, + "loss": 5.4136, + "step": 21170 + }, + { + "epoch": 0.029759887030457163, + "grad_norm": 1.3355499505996704, + "learning_rate": 8.925389911479556e-05, + "loss": 5.3543, + "step": 21180 + }, + { + "epoch": 0.029773937968620743, + "grad_norm": 1.2321969270706177, + "learning_rate": 8.929605170717998e-05, + "loss": 5.4438, + "step": 21190 + }, + { + "epoch": 0.02978798890678432, + "grad_norm": 1.378713607788086, + "learning_rate": 8.933820429956441e-05, + "loss": 5.3934, + "step": 21200 + }, + { + "epoch": 0.0298020398449479, + "grad_norm": 1.2393652200698853, + "learning_rate": 8.938035689194886e-05, + "loss": 5.5751, + "step": 21210 + }, + { + "epoch": 0.029816090783111475, + "grad_norm": 1.4586329460144043, + "learning_rate": 8.942250948433327e-05, + "loss": 5.2181, + "step": 21220 + }, + { + "epoch": 0.02983014172127505, + "grad_norm": 1.5103076696395874, + "learning_rate": 8.94646620767177e-05, + "loss": 5.4515, + "step": 21230 + }, + { + "epoch": 0.02984419265943863, + "grad_norm": 1.4935081005096436, + "learning_rate": 8.950681466910215e-05, + "loss": 5.4309, + "step": 21240 + }, + { + "epoch": 0.029858243597602207, + "grad_norm": 1.3122179508209229, + "learning_rate": 8.954896726148658e-05, + "loss": 5.3174, + "step": 21250 + }, + { + "epoch": 0.029872294535765787, + "grad_norm": 1.280525803565979, + "learning_rate": 8.9591119853871e-05, + "loss": 5.3517, + "step": 21260 + }, + { + "epoch": 0.029886345473929363, + "grad_norm": 1.3139269351959229, + "learning_rate": 8.963327244625544e-05, + "loss": 5.4784, + "step": 21270 + }, + { + "epoch": 0.02990039641209294, + "grad_norm": 1.2973021268844604, + "learning_rate": 8.967542503863987e-05, + "loss": 5.3769, + "step": 21280 + }, + { + "epoch": 0.02991444735025652, + "grad_norm": 1.4092621803283691, + "learning_rate": 8.971757763102429e-05, + "loss": 5.3972, + 
"step": 21290 + }, + { + "epoch": 0.029928498288420095, + "grad_norm": 1.364310383796692, + "learning_rate": 8.975973022340874e-05, + "loss": 5.4728, + "step": 21300 + }, + { + "epoch": 0.02994254922658367, + "grad_norm": 1.516514539718628, + "learning_rate": 8.980188281579317e-05, + "loss": 5.4473, + "step": 21310 + }, + { + "epoch": 0.02995660016474725, + "grad_norm": 1.413217306137085, + "learning_rate": 8.984403540817759e-05, + "loss": 5.5172, + "step": 21320 + }, + { + "epoch": 0.029970651102910827, + "grad_norm": 1.724693775177002, + "learning_rate": 8.988618800056203e-05, + "loss": 5.3085, + "step": 21330 + }, + { + "epoch": 0.029984702041074406, + "grad_norm": 1.2294498682022095, + "learning_rate": 8.992834059294646e-05, + "loss": 5.5309, + "step": 21340 + }, + { + "epoch": 0.029998752979237982, + "grad_norm": 1.2753736972808838, + "learning_rate": 8.997049318533088e-05, + "loss": 5.4041, + "step": 21350 + }, + { + "epoch": 0.03001280391740156, + "grad_norm": 1.3634488582611084, + "learning_rate": 9.001264577771532e-05, + "loss": 5.4316, + "step": 21360 + }, + { + "epoch": 0.030026854855565138, + "grad_norm": 1.3544750213623047, + "learning_rate": 9.005479837009976e-05, + "loss": 5.4193, + "step": 21370 + }, + { + "epoch": 0.030040905793728714, + "grad_norm": 1.3579610586166382, + "learning_rate": 9.009695096248417e-05, + "loss": 5.4289, + "step": 21380 + }, + { + "epoch": 0.030054956731892294, + "grad_norm": 1.3251279592514038, + "learning_rate": 9.013910355486862e-05, + "loss": 5.4037, + "step": 21390 + }, + { + "epoch": 0.03006900767005587, + "grad_norm": 1.4140822887420654, + "learning_rate": 9.018125614725305e-05, + "loss": 5.3694, + "step": 21400 + }, + { + "epoch": 0.030083058608219446, + "grad_norm": 1.3545302152633667, + "learning_rate": 9.02234087396375e-05, + "loss": 5.4191, + "step": 21410 + }, + { + "epoch": 0.030097109546383026, + "grad_norm": 1.3583157062530518, + "learning_rate": 9.026556133202191e-05, + "loss": 5.4124, + "step": 21420 + }, 
+ { + "epoch": 0.030111160484546602, + "grad_norm": 1.3296844959259033, + "learning_rate": 9.030771392440634e-05, + "loss": 5.3846, + "step": 21430 + }, + { + "epoch": 0.030125211422710182, + "grad_norm": 1.287698745727539, + "learning_rate": 9.034986651679079e-05, + "loss": 5.3998, + "step": 21440 + }, + { + "epoch": 0.030139262360873758, + "grad_norm": 1.4231420755386353, + "learning_rate": 9.03920191091752e-05, + "loss": 5.2575, + "step": 21450 + }, + { + "epoch": 0.030153313299037334, + "grad_norm": 1.347367286682129, + "learning_rate": 9.043417170155964e-05, + "loss": 5.4645, + "step": 21460 + }, + { + "epoch": 0.030167364237200914, + "grad_norm": 1.2664035558700562, + "learning_rate": 9.047632429394408e-05, + "loss": 5.4775, + "step": 21470 + }, + { + "epoch": 0.03018141517536449, + "grad_norm": 1.3278405666351318, + "learning_rate": 9.05184768863285e-05, + "loss": 5.5146, + "step": 21480 + }, + { + "epoch": 0.030195466113528066, + "grad_norm": 1.3342235088348389, + "learning_rate": 9.056062947871293e-05, + "loss": 5.438, + "step": 21490 + }, + { + "epoch": 0.030209517051691646, + "grad_norm": 1.2799886465072632, + "learning_rate": 9.060278207109737e-05, + "loss": 5.4206, + "step": 21500 + }, + { + "epoch": 0.030223567989855222, + "grad_norm": 1.3722513914108276, + "learning_rate": 9.064493466348179e-05, + "loss": 5.2671, + "step": 21510 + }, + { + "epoch": 0.0302376189280188, + "grad_norm": 1.2341845035552979, + "learning_rate": 9.068708725586622e-05, + "loss": 5.4003, + "step": 21520 + }, + { + "epoch": 0.030251669866182378, + "grad_norm": 1.3256713151931763, + "learning_rate": 9.072923984825067e-05, + "loss": 5.4125, + "step": 21530 + }, + { + "epoch": 0.030265720804345954, + "grad_norm": 1.2737789154052734, + "learning_rate": 9.07713924406351e-05, + "loss": 5.493, + "step": 21540 + }, + { + "epoch": 0.030279771742509533, + "grad_norm": 1.3018547296524048, + "learning_rate": 9.081354503301952e-05, + "loss": 5.4518, + "step": 21550 + }, + { + "epoch": 
0.03029382268067311, + "grad_norm": 1.2525168657302856, + "learning_rate": 9.085569762540396e-05, + "loss": 5.4166, + "step": 21560 + }, + { + "epoch": 0.03030787361883669, + "grad_norm": 1.5232125520706177, + "learning_rate": 9.089785021778839e-05, + "loss": 5.3974, + "step": 21570 + }, + { + "epoch": 0.030321924557000265, + "grad_norm": 1.332067608833313, + "learning_rate": 9.094000281017281e-05, + "loss": 5.2823, + "step": 21580 + }, + { + "epoch": 0.03033597549516384, + "grad_norm": 1.3199515342712402, + "learning_rate": 9.098215540255725e-05, + "loss": 5.5286, + "step": 21590 + }, + { + "epoch": 0.03035002643332742, + "grad_norm": 1.3310644626617432, + "learning_rate": 9.102430799494169e-05, + "loss": 5.3436, + "step": 21600 + }, + { + "epoch": 0.030364077371490997, + "grad_norm": 1.285497784614563, + "learning_rate": 9.10664605873261e-05, + "loss": 5.3442, + "step": 21610 + }, + { + "epoch": 0.030378128309654574, + "grad_norm": 1.5494871139526367, + "learning_rate": 9.110861317971055e-05, + "loss": 5.5381, + "step": 21620 + }, + { + "epoch": 0.030392179247818153, + "grad_norm": 1.3371659517288208, + "learning_rate": 9.114655051285653e-05, + "loss": 5.3604, + "step": 21630 + }, + { + "epoch": 0.03040623018598173, + "grad_norm": 1.322951078414917, + "learning_rate": 9.118870310524096e-05, + "loss": 5.4602, + "step": 21640 + }, + { + "epoch": 0.03042028112414531, + "grad_norm": 1.5651347637176514, + "learning_rate": 9.123085569762539e-05, + "loss": 5.4247, + "step": 21650 + }, + { + "epoch": 0.030434332062308885, + "grad_norm": 1.3033775091171265, + "learning_rate": 9.127300829000982e-05, + "loss": 5.4838, + "step": 21660 + }, + { + "epoch": 0.03044838300047246, + "grad_norm": 1.436387300491333, + "learning_rate": 9.131516088239427e-05, + "loss": 5.2139, + "step": 21670 + }, + { + "epoch": 0.03046243393863604, + "grad_norm": 1.3395622968673706, + "learning_rate": 9.135731347477869e-05, + "loss": 5.4355, + "step": 21680 + }, + { + "epoch": 0.030476484876799617, + 
"grad_norm": 1.457055926322937, + "learning_rate": 9.139946606716312e-05, + "loss": 5.3278, + "step": 21690 + }, + { + "epoch": 0.030490535814963197, + "grad_norm": 1.3071521520614624, + "learning_rate": 9.144161865954756e-05, + "loss": 5.2767, + "step": 21700 + }, + { + "epoch": 0.030504586753126773, + "grad_norm": 1.3810970783233643, + "learning_rate": 9.148377125193198e-05, + "loss": 5.4068, + "step": 21710 + }, + { + "epoch": 0.03051863769129035, + "grad_norm": 1.3929694890975952, + "learning_rate": 9.152592384431641e-05, + "loss": 5.4296, + "step": 21720 + }, + { + "epoch": 0.03053268862945393, + "grad_norm": 1.5036205053329468, + "learning_rate": 9.156807643670086e-05, + "loss": 5.4989, + "step": 21730 + }, + { + "epoch": 0.030546739567617505, + "grad_norm": 1.315721035003662, + "learning_rate": 9.161022902908527e-05, + "loss": 5.4409, + "step": 21740 + }, + { + "epoch": 0.030560790505781085, + "grad_norm": 1.323768138885498, + "learning_rate": 9.165238162146972e-05, + "loss": 5.4287, + "step": 21750 + }, + { + "epoch": 0.03057484144394466, + "grad_norm": 1.445924997329712, + "learning_rate": 9.169453421385415e-05, + "loss": 5.4339, + "step": 21760 + }, + { + "epoch": 0.030588892382108237, + "grad_norm": 1.287317156791687, + "learning_rate": 9.173668680623857e-05, + "loss": 5.3386, + "step": 21770 + }, + { + "epoch": 0.030602943320271817, + "grad_norm": 1.3343262672424316, + "learning_rate": 9.177883939862301e-05, + "loss": 5.4419, + "step": 21780 + }, + { + "epoch": 0.030616994258435393, + "grad_norm": 1.309372901916504, + "learning_rate": 9.182099199100744e-05, + "loss": 5.3939, + "step": 21790 + }, + { + "epoch": 0.03063104519659897, + "grad_norm": 1.3397578001022339, + "learning_rate": 9.186314458339186e-05, + "loss": 5.4396, + "step": 21800 + }, + { + "epoch": 0.03064509613476255, + "grad_norm": 1.3325626850128174, + "learning_rate": 9.19052971757763e-05, + "loss": 5.3904, + "step": 21810 + }, + { + "epoch": 0.030659147072926125, + "grad_norm": 
1.2755528688430786, + "learning_rate": 9.194744976816074e-05, + "loss": 5.352, + "step": 21820 + }, + { + "epoch": 0.030673198011089704, + "grad_norm": 1.3710780143737793, + "learning_rate": 9.198960236054517e-05, + "loss": 5.3976, + "step": 21830 + }, + { + "epoch": 0.03068724894925328, + "grad_norm": 1.29194176197052, + "learning_rate": 9.20317549529296e-05, + "loss": 5.4736, + "step": 21840 + }, + { + "epoch": 0.030701299887416857, + "grad_norm": 1.2661114931106567, + "learning_rate": 9.207390754531403e-05, + "loss": 5.5116, + "step": 21850 + }, + { + "epoch": 0.030715350825580436, + "grad_norm": 1.337958574295044, + "learning_rate": 9.211606013769846e-05, + "loss": 5.4852, + "step": 21860 + }, + { + "epoch": 0.030729401763744012, + "grad_norm": 1.3310383558273315, + "learning_rate": 9.215821273008289e-05, + "loss": 5.2673, + "step": 21870 + }, + { + "epoch": 0.030743452701907592, + "grad_norm": 1.2530386447906494, + "learning_rate": 9.220036532246732e-05, + "loss": 5.4167, + "step": 21880 + }, + { + "epoch": 0.030757503640071168, + "grad_norm": 1.273057460784912, + "learning_rate": 9.224251791485175e-05, + "loss": 5.4284, + "step": 21890 + }, + { + "epoch": 0.030771554578234744, + "grad_norm": 1.264639973640442, + "learning_rate": 9.228467050723619e-05, + "loss": 5.3107, + "step": 21900 + }, + { + "epoch": 0.030785605516398324, + "grad_norm": 1.8401588201522827, + "learning_rate": 9.232682309962062e-05, + "loss": 5.3392, + "step": 21910 + }, + { + "epoch": 0.0307996564545619, + "grad_norm": 1.2661328315734863, + "learning_rate": 9.236897569200505e-05, + "loss": 5.4274, + "step": 21920 + }, + { + "epoch": 0.030813707392725476, + "grad_norm": 1.2905445098876953, + "learning_rate": 9.241112828438948e-05, + "loss": 5.4376, + "step": 21930 + }, + { + "epoch": 0.030827758330889056, + "grad_norm": 1.395332932472229, + "learning_rate": 9.245328087677391e-05, + "loss": 5.3368, + "step": 21940 + }, + { + "epoch": 0.030841809269052632, + "grad_norm": 1.3216753005981445, + 
"learning_rate": 9.249543346915834e-05, + "loss": 5.4493, + "step": 21950 + }, + { + "epoch": 0.030855860207216212, + "grad_norm": 1.528210997581482, + "learning_rate": 9.253758606154279e-05, + "loss": 5.3529, + "step": 21960 + }, + { + "epoch": 0.030869911145379788, + "grad_norm": 1.333594560623169, + "learning_rate": 9.25797386539272e-05, + "loss": 5.2689, + "step": 21970 + }, + { + "epoch": 0.030883962083543364, + "grad_norm": 1.2748163938522339, + "learning_rate": 9.262189124631164e-05, + "loss": 5.2231, + "step": 21980 + }, + { + "epoch": 0.030898013021706944, + "grad_norm": 1.4222743511199951, + "learning_rate": 9.266404383869608e-05, + "loss": 5.5653, + "step": 21990 + }, + { + "epoch": 0.03091206395987052, + "grad_norm": 1.2896723747253418, + "learning_rate": 9.27061964310805e-05, + "loss": 5.4378, + "step": 22000 + }, + { + "epoch": 0.0309261148980341, + "grad_norm": 1.3336145877838135, + "learning_rate": 9.274834902346493e-05, + "loss": 5.4317, + "step": 22010 + }, + { + "epoch": 0.030940165836197676, + "grad_norm": 1.4646224975585938, + "learning_rate": 9.279050161584937e-05, + "loss": 5.331, + "step": 22020 + }, + { + "epoch": 0.030954216774361252, + "grad_norm": 1.4956231117248535, + "learning_rate": 9.283265420823379e-05, + "loss": 5.4396, + "step": 22030 + }, + { + "epoch": 0.03096826771252483, + "grad_norm": 1.3492366075515747, + "learning_rate": 9.287480680061824e-05, + "loss": 5.3148, + "step": 22040 + }, + { + "epoch": 0.030982318650688408, + "grad_norm": 1.420286774635315, + "learning_rate": 9.291695939300267e-05, + "loss": 5.3644, + "step": 22050 + }, + { + "epoch": 0.030996369588851987, + "grad_norm": 1.3561815023422241, + "learning_rate": 9.295911198538708e-05, + "loss": 5.3762, + "step": 22060 + }, + { + "epoch": 0.031010420527015563, + "grad_norm": 1.2883050441741943, + "learning_rate": 9.300126457777153e-05, + "loss": 5.3012, + "step": 22070 + }, + { + "epoch": 0.03102447146517914, + "grad_norm": 1.2738019227981567, + "learning_rate": 
9.304341717015596e-05, + "loss": 5.4794, + "step": 22080 + }, + { + "epoch": 0.03103852240334272, + "grad_norm": 1.4682445526123047, + "learning_rate": 9.308556976254039e-05, + "loss": 5.3829, + "step": 22090 + }, + { + "epoch": 0.031052573341506295, + "grad_norm": 1.2694358825683594, + "learning_rate": 9.312772235492482e-05, + "loss": 5.4007, + "step": 22100 + }, + { + "epoch": 0.03106662427966987, + "grad_norm": 1.3557790517807007, + "learning_rate": 9.316987494730925e-05, + "loss": 5.3658, + "step": 22110 + }, + { + "epoch": 0.03108067521783345, + "grad_norm": 1.38999342918396, + "learning_rate": 9.321202753969368e-05, + "loss": 5.3173, + "step": 22120 + }, + { + "epoch": 0.031094726155997027, + "grad_norm": 1.4154685735702515, + "learning_rate": 9.325418013207812e-05, + "loss": 5.296, + "step": 22130 + }, + { + "epoch": 0.031108777094160607, + "grad_norm": 1.3089263439178467, + "learning_rate": 9.329633272446255e-05, + "loss": 5.3943, + "step": 22140 + }, + { + "epoch": 0.031122828032324183, + "grad_norm": 1.258065104484558, + "learning_rate": 9.333848531684698e-05, + "loss": 5.4311, + "step": 22150 + }, + { + "epoch": 0.03113687897048776, + "grad_norm": 1.329082727432251, + "learning_rate": 9.338063790923141e-05, + "loss": 5.3696, + "step": 22160 + }, + { + "epoch": 0.03115092990865134, + "grad_norm": 1.3508650064468384, + "learning_rate": 9.342279050161584e-05, + "loss": 5.3758, + "step": 22170 + }, + { + "epoch": 0.031164980846814915, + "grad_norm": 1.3524917364120483, + "learning_rate": 9.346494309400027e-05, + "loss": 5.3318, + "step": 22180 + }, + { + "epoch": 0.031179031784978495, + "grad_norm": 1.2871861457824707, + "learning_rate": 9.35070956863847e-05, + "loss": 5.5478, + "step": 22190 + }, + { + "epoch": 0.03119308272314207, + "grad_norm": 1.3874597549438477, + "learning_rate": 9.354924827876913e-05, + "loss": 5.4872, + "step": 22200 + }, + { + "epoch": 0.031207133661305647, + "grad_norm": 1.3094698190689087, + "learning_rate": 9.359140087115357e-05, 
+ "loss": 5.3514, + "step": 22210 + }, + { + "epoch": 0.031221184599469227, + "grad_norm": 1.498294711112976, + "learning_rate": 9.3633553463538e-05, + "loss": 5.2953, + "step": 22220 + }, + { + "epoch": 0.031235235537632803, + "grad_norm": 1.7673845291137695, + "learning_rate": 9.367570605592243e-05, + "loss": 5.3295, + "step": 22230 + }, + { + "epoch": 0.03124928647579638, + "grad_norm": 1.4196926355361938, + "learning_rate": 9.371785864830686e-05, + "loss": 5.3922, + "step": 22240 + }, + { + "epoch": 0.03126333741395996, + "grad_norm": 1.6076236963272095, + "learning_rate": 9.37600112406913e-05, + "loss": 5.425, + "step": 22250 + }, + { + "epoch": 0.031277388352123535, + "grad_norm": 1.2859355211257935, + "learning_rate": 9.380216383307572e-05, + "loss": 5.4363, + "step": 22260 + }, + { + "epoch": 0.03129143929028711, + "grad_norm": 1.3375812768936157, + "learning_rate": 9.384431642546015e-05, + "loss": 5.5041, + "step": 22270 + }, + { + "epoch": 0.031305490228450694, + "grad_norm": 1.2610328197479248, + "learning_rate": 9.38864690178446e-05, + "loss": 5.3793, + "step": 22280 + }, + { + "epoch": 0.03131954116661427, + "grad_norm": 1.3440093994140625, + "learning_rate": 9.392862161022901e-05, + "loss": 5.2878, + "step": 22290 + }, + { + "epoch": 0.031333592104777847, + "grad_norm": 1.3581554889678955, + "learning_rate": 9.397077420261345e-05, + "loss": 5.4093, + "step": 22300 + }, + { + "epoch": 0.03134764304294142, + "grad_norm": 1.3433340787887573, + "learning_rate": 9.401292679499789e-05, + "loss": 5.391, + "step": 22310 + }, + { + "epoch": 0.031361693981105, + "grad_norm": 1.3540128469467163, + "learning_rate": 9.405507938738231e-05, + "loss": 5.3798, + "step": 22320 + }, + { + "epoch": 0.031375744919268575, + "grad_norm": 1.2961390018463135, + "learning_rate": 9.409723197976674e-05, + "loss": 5.316, + "step": 22330 + }, + { + "epoch": 0.03138979585743216, + "grad_norm": 1.2970017194747925, + "learning_rate": 9.413938457215118e-05, + "loss": 5.3238, + "step": 
22340 + }, + { + "epoch": 0.031403846795595734, + "grad_norm": 1.275976300239563, + "learning_rate": 9.41815371645356e-05, + "loss": 5.3903, + "step": 22350 + }, + { + "epoch": 0.03141789773375931, + "grad_norm": 1.329797625541687, + "learning_rate": 9.422368975692005e-05, + "loss": 5.3756, + "step": 22360 + }, + { + "epoch": 0.03143194867192289, + "grad_norm": 1.3486462831497192, + "learning_rate": 9.426584234930448e-05, + "loss": 5.4281, + "step": 22370 + }, + { + "epoch": 0.03144599961008646, + "grad_norm": 1.3495275974273682, + "learning_rate": 9.430799494168891e-05, + "loss": 5.3231, + "step": 22380 + }, + { + "epoch": 0.031460050548250046, + "grad_norm": 1.2206830978393555, + "learning_rate": 9.435014753407334e-05, + "loss": 5.3296, + "step": 22390 + }, + { + "epoch": 0.03147410148641362, + "grad_norm": 1.2643572092056274, + "learning_rate": 9.439230012645777e-05, + "loss": 5.3465, + "step": 22400 + }, + { + "epoch": 0.0314881524245772, + "grad_norm": 1.3389561176300049, + "learning_rate": 9.44344527188422e-05, + "loss": 5.3205, + "step": 22410 + }, + { + "epoch": 0.031502203362740774, + "grad_norm": 1.2590327262878418, + "learning_rate": 9.447660531122663e-05, + "loss": 5.3742, + "step": 22420 + }, + { + "epoch": 0.03151625430090435, + "grad_norm": 1.2709726095199585, + "learning_rate": 9.451875790361106e-05, + "loss": 5.4671, + "step": 22430 + }, + { + "epoch": 0.031530305239067934, + "grad_norm": 1.3207013607025146, + "learning_rate": 9.45609104959955e-05, + "loss": 5.2865, + "step": 22440 + }, + { + "epoch": 0.03154435617723151, + "grad_norm": 1.3502670526504517, + "learning_rate": 9.460306308837993e-05, + "loss": 5.2844, + "step": 22450 + }, + { + "epoch": 0.031558407115395086, + "grad_norm": 1.3033579587936401, + "learning_rate": 9.464521568076436e-05, + "loss": 5.5248, + "step": 22460 + }, + { + "epoch": 0.03157245805355866, + "grad_norm": 1.2336679697036743, + "learning_rate": 9.468736827314879e-05, + "loss": 5.4307, + "step": 22470 + }, + { + 
"epoch": 0.03158650899172224, + "grad_norm": 1.4275672435760498, + "learning_rate": 9.472952086553322e-05, + "loss": 5.4736, + "step": 22480 + }, + { + "epoch": 0.03160055992988582, + "grad_norm": 1.3759491443634033, + "learning_rate": 9.477167345791765e-05, + "loss": 5.38, + "step": 22490 + }, + { + "epoch": 0.0316146108680494, + "grad_norm": 1.3446354866027832, + "learning_rate": 9.481382605030208e-05, + "loss": 5.4072, + "step": 22500 + }, + { + "epoch": 0.031628661806212974, + "grad_norm": 1.3388227224349976, + "learning_rate": 9.485597864268651e-05, + "loss": 5.3462, + "step": 22510 + }, + { + "epoch": 0.03164271274437655, + "grad_norm": 1.3788104057312012, + "learning_rate": 9.489813123507095e-05, + "loss": 5.4326, + "step": 22520 + }, + { + "epoch": 0.031656763682540126, + "grad_norm": 1.424253225326538, + "learning_rate": 9.494028382745538e-05, + "loss": 5.4119, + "step": 22530 + }, + { + "epoch": 0.03167081462070371, + "grad_norm": 1.2999223470687866, + "learning_rate": 9.498243641983982e-05, + "loss": 5.3624, + "step": 22540 + }, + { + "epoch": 0.031684865558867285, + "grad_norm": 1.368601679801941, + "learning_rate": 9.502458901222424e-05, + "loss": 5.2332, + "step": 22550 + }, + { + "epoch": 0.03169891649703086, + "grad_norm": 1.4578156471252441, + "learning_rate": 9.506674160460867e-05, + "loss": 5.2938, + "step": 22560 + }, + { + "epoch": 0.03171296743519444, + "grad_norm": 1.274147391319275, + "learning_rate": 9.510889419699311e-05, + "loss": 5.3459, + "step": 22570 + }, + { + "epoch": 0.031727018373358014, + "grad_norm": 1.3094562292099, + "learning_rate": 9.515104678937753e-05, + "loss": 5.3795, + "step": 22580 + }, + { + "epoch": 0.0317410693115216, + "grad_norm": 1.2858974933624268, + "learning_rate": 9.519319938176196e-05, + "loss": 5.4598, + "step": 22590 + }, + { + "epoch": 0.03175512024968517, + "grad_norm": 1.267670750617981, + "learning_rate": 9.523535197414641e-05, + "loss": 5.3513, + "step": 22600 + }, + { + "epoch": 0.03176917118784875, 
+ "grad_norm": 1.3018923997879028, + "learning_rate": 9.527750456653083e-05, + "loss": 5.3053, + "step": 22610 + }, + { + "epoch": 0.031783222126012325, + "grad_norm": 1.5751721858978271, + "learning_rate": 9.531965715891526e-05, + "loss": 5.5364, + "step": 22620 + }, + { + "epoch": 0.0317972730641759, + "grad_norm": 1.3210186958312988, + "learning_rate": 9.53618097512997e-05, + "loss": 5.3663, + "step": 22630 + }, + { + "epoch": 0.03181132400233948, + "grad_norm": 1.3259351253509521, + "learning_rate": 9.540396234368412e-05, + "loss": 5.4101, + "step": 22640 + }, + { + "epoch": 0.03182537494050306, + "grad_norm": 1.2493022680282593, + "learning_rate": 9.544611493606856e-05, + "loss": 5.409, + "step": 22650 + }, + { + "epoch": 0.03183942587866664, + "grad_norm": 1.3749110698699951, + "learning_rate": 9.5488267528453e-05, + "loss": 5.4479, + "step": 22660 + }, + { + "epoch": 0.03185347681683021, + "grad_norm": 1.2747150659561157, + "learning_rate": 9.553042012083743e-05, + "loss": 5.3817, + "step": 22670 + }, + { + "epoch": 0.03186752775499379, + "grad_norm": 1.2162046432495117, + "learning_rate": 9.557257271322186e-05, + "loss": 5.3368, + "step": 22680 + }, + { + "epoch": 0.031881578693157366, + "grad_norm": 1.301879644393921, + "learning_rate": 9.561472530560629e-05, + "loss": 5.4536, + "step": 22690 + }, + { + "epoch": 0.03189562963132095, + "grad_norm": 1.389109492301941, + "learning_rate": 9.565687789799072e-05, + "loss": 5.2986, + "step": 22700 + }, + { + "epoch": 0.031909680569484525, + "grad_norm": 1.2711687088012695, + "learning_rate": 9.569903049037515e-05, + "loss": 5.488, + "step": 22710 + }, + { + "epoch": 0.0319237315076481, + "grad_norm": 1.2965320348739624, + "learning_rate": 9.573696782352113e-05, + "loss": 5.2603, + "step": 22720 + }, + { + "epoch": 0.03193778244581168, + "grad_norm": 1.325437307357788, + "learning_rate": 9.577912041590558e-05, + "loss": 5.4919, + "step": 22730 + }, + { + "epoch": 0.03195183338397525, + "grad_norm": 
1.2769930362701416, + "learning_rate": 9.582127300829e-05, + "loss": 5.329, + "step": 22740 + }, + { + "epoch": 0.031965884322138836, + "grad_norm": 1.3971630334854126, + "learning_rate": 9.586342560067443e-05, + "loss": 5.3242, + "step": 22750 + }, + { + "epoch": 0.03197993526030241, + "grad_norm": 1.2280455827713013, + "learning_rate": 9.590557819305887e-05, + "loss": 5.342, + "step": 22760 + }, + { + "epoch": 0.03199398619846599, + "grad_norm": 1.3401799201965332, + "learning_rate": 9.594773078544329e-05, + "loss": 5.4924, + "step": 22770 + }, + { + "epoch": 0.032008037136629565, + "grad_norm": 1.4017051458358765, + "learning_rate": 9.598988337782772e-05, + "loss": 5.277, + "step": 22780 + }, + { + "epoch": 0.03202208807479314, + "grad_norm": 1.3228023052215576, + "learning_rate": 9.603203597021217e-05, + "loss": 5.3887, + "step": 22790 + }, + { + "epoch": 0.032036139012956724, + "grad_norm": 1.2626895904541016, + "learning_rate": 9.60741885625966e-05, + "loss": 5.4167, + "step": 22800 + }, + { + "epoch": 0.0320501899511203, + "grad_norm": 1.3468395471572876, + "learning_rate": 9.611634115498101e-05, + "loss": 5.3809, + "step": 22810 + }, + { + "epoch": 0.032064240889283877, + "grad_norm": 1.2649286985397339, + "learning_rate": 9.615849374736546e-05, + "loss": 5.4244, + "step": 22820 + }, + { + "epoch": 0.03207829182744745, + "grad_norm": 1.2764794826507568, + "learning_rate": 9.620064633974989e-05, + "loss": 5.3179, + "step": 22830 + }, + { + "epoch": 0.03209234276561103, + "grad_norm": 1.2899025678634644, + "learning_rate": 9.624279893213431e-05, + "loss": 5.3826, + "step": 22840 + }, + { + "epoch": 0.03210639370377461, + "grad_norm": 1.3156379461288452, + "learning_rate": 9.628495152451875e-05, + "loss": 5.4528, + "step": 22850 + }, + { + "epoch": 0.03212044464193819, + "grad_norm": 1.3568313121795654, + "learning_rate": 9.632710411690318e-05, + "loss": 5.3015, + "step": 22860 + }, + { + "epoch": 0.032134495580101764, + "grad_norm": 1.3608930110931396, + 
"learning_rate": 9.63692567092876e-05, + "loss": 5.3645, + "step": 22870 + }, + { + "epoch": 0.03214854651826534, + "grad_norm": 1.3903223276138306, + "learning_rate": 9.641140930167205e-05, + "loss": 5.2674, + "step": 22880 + }, + { + "epoch": 0.03216259745642892, + "grad_norm": 1.372069001197815, + "learning_rate": 9.645356189405648e-05, + "loss": 5.3272, + "step": 22890 + }, + { + "epoch": 0.0321766483945925, + "grad_norm": 1.373261570930481, + "learning_rate": 9.64957144864409e-05, + "loss": 5.2367, + "step": 22900 + }, + { + "epoch": 0.032190699332756076, + "grad_norm": 1.2778481245040894, + "learning_rate": 9.653786707882534e-05, + "loss": 5.4686, + "step": 22910 + }, + { + "epoch": 0.03220475027091965, + "grad_norm": 1.4180645942687988, + "learning_rate": 9.658001967120977e-05, + "loss": 5.3619, + "step": 22920 + }, + { + "epoch": 0.03221880120908323, + "grad_norm": 1.3312855958938599, + "learning_rate": 9.662217226359422e-05, + "loss": 5.2804, + "step": 22930 + }, + { + "epoch": 0.032232852147246804, + "grad_norm": 1.3726269006729126, + "learning_rate": 9.666432485597863e-05, + "loss": 5.428, + "step": 22940 + }, + { + "epoch": 0.03224690308541038, + "grad_norm": 1.2660539150238037, + "learning_rate": 9.670647744836306e-05, + "loss": 5.3517, + "step": 22950 + }, + { + "epoch": 0.032260954023573964, + "grad_norm": 1.3094338178634644, + "learning_rate": 9.674863004074751e-05, + "loss": 5.3259, + "step": 22960 + }, + { + "epoch": 0.03227500496173754, + "grad_norm": 1.348771572113037, + "learning_rate": 9.679078263313193e-05, + "loss": 5.3373, + "step": 22970 + }, + { + "epoch": 0.032289055899901116, + "grad_norm": 1.3427238464355469, + "learning_rate": 9.683293522551636e-05, + "loss": 5.2462, + "step": 22980 + }, + { + "epoch": 0.03230310683806469, + "grad_norm": 1.2989392280578613, + "learning_rate": 9.68750878179008e-05, + "loss": 5.4482, + "step": 22990 + }, + { + "epoch": 0.03231715777622827, + "grad_norm": 1.3329780101776123, + "learning_rate": 
9.691724041028522e-05, + "loss": 5.2809, + "step": 23000 + }, + { + "epoch": 0.03233120871439185, + "grad_norm": 1.3921016454696655, + "learning_rate": 9.695939300266965e-05, + "loss": 5.5561, + "step": 23010 + }, + { + "epoch": 0.03234525965255543, + "grad_norm": 1.3452013731002808, + "learning_rate": 9.70015455950541e-05, + "loss": 5.3734, + "step": 23020 + }, + { + "epoch": 0.032359310590719004, + "grad_norm": 1.2437107563018799, + "learning_rate": 9.704369818743851e-05, + "loss": 5.4029, + "step": 23030 + }, + { + "epoch": 0.03237336152888258, + "grad_norm": 1.2906579971313477, + "learning_rate": 9.708585077982294e-05, + "loss": 5.1958, + "step": 23040 + }, + { + "epoch": 0.032387412467046156, + "grad_norm": 1.290738821029663, + "learning_rate": 9.712800337220739e-05, + "loss": 5.3823, + "step": 23050 + }, + { + "epoch": 0.03240146340520974, + "grad_norm": 1.3592157363891602, + "learning_rate": 9.71701559645918e-05, + "loss": 5.3865, + "step": 23060 + }, + { + "epoch": 0.032415514343373315, + "grad_norm": 1.4341562986373901, + "learning_rate": 9.721230855697624e-05, + "loss": 5.1396, + "step": 23070 + }, + { + "epoch": 0.03242956528153689, + "grad_norm": 1.7886507511138916, + "learning_rate": 9.725446114936068e-05, + "loss": 5.2892, + "step": 23080 + }, + { + "epoch": 0.03244361621970047, + "grad_norm": 1.2833921909332275, + "learning_rate": 9.729661374174511e-05, + "loss": 5.408, + "step": 23090 + }, + { + "epoch": 0.032457667157864044, + "grad_norm": 1.2953739166259766, + "learning_rate": 9.733876633412953e-05, + "loss": 5.2775, + "step": 23100 + }, + { + "epoch": 0.03247171809602763, + "grad_norm": 1.264936923980713, + "learning_rate": 9.738091892651398e-05, + "loss": 5.3672, + "step": 23110 + }, + { + "epoch": 0.0324857690341912, + "grad_norm": 1.324508786201477, + "learning_rate": 9.742307151889841e-05, + "loss": 5.2859, + "step": 23120 + }, + { + "epoch": 0.03249981997235478, + "grad_norm": 1.2716864347457886, + "learning_rate": 9.746522411128282e-05, + 
"loss": 5.2861, + "step": 23130 + }, + { + "epoch": 0.032513870910518355, + "grad_norm": 1.2742117643356323, + "learning_rate": 9.750737670366727e-05, + "loss": 5.3845, + "step": 23140 + }, + { + "epoch": 0.03252792184868193, + "grad_norm": 1.3381208181381226, + "learning_rate": 9.75495292960517e-05, + "loss": 5.3024, + "step": 23150 + }, + { + "epoch": 0.032541972786845515, + "grad_norm": 1.4787967205047607, + "learning_rate": 9.759168188843612e-05, + "loss": 5.3082, + "step": 23160 + }, + { + "epoch": 0.03255602372500909, + "grad_norm": 1.301733136177063, + "learning_rate": 9.763383448082056e-05, + "loss": 5.4468, + "step": 23170 + }, + { + "epoch": 0.03257007466317267, + "grad_norm": 1.271909236907959, + "learning_rate": 9.7675987073205e-05, + "loss": 5.2869, + "step": 23180 + }, + { + "epoch": 0.03258412560133624, + "grad_norm": 1.2851873636245728, + "learning_rate": 9.771813966558941e-05, + "loss": 5.2482, + "step": 23190 + }, + { + "epoch": 0.03259817653949982, + "grad_norm": 1.307614803314209, + "learning_rate": 9.776029225797386e-05, + "loss": 5.36, + "step": 23200 + }, + { + "epoch": 0.0326122274776634, + "grad_norm": 1.25319504737854, + "learning_rate": 9.780244485035829e-05, + "loss": 5.409, + "step": 23210 + }, + { + "epoch": 0.03262627841582698, + "grad_norm": 1.4005918502807617, + "learning_rate": 9.784459744274273e-05, + "loss": 5.3335, + "step": 23220 + }, + { + "epoch": 0.032640329353990555, + "grad_norm": 1.2450411319732666, + "learning_rate": 9.788675003512715e-05, + "loss": 5.3397, + "step": 23230 + }, + { + "epoch": 0.03265438029215413, + "grad_norm": 1.2542985677719116, + "learning_rate": 9.792890262751158e-05, + "loss": 5.4002, + "step": 23240 + }, + { + "epoch": 0.03266843123031771, + "grad_norm": 1.2760860919952393, + "learning_rate": 9.797105521989603e-05, + "loss": 5.4205, + "step": 23250 + }, + { + "epoch": 0.03268248216848129, + "grad_norm": 1.3349649906158447, + "learning_rate": 9.801320781228044e-05, + "loss": 5.3124, + "step": 23260 
+ }, + { + "epoch": 0.032696533106644866, + "grad_norm": 1.2851274013519287, + "learning_rate": 9.805536040466487e-05, + "loss": 5.2979, + "step": 23270 + }, + { + "epoch": 0.03271058404480844, + "grad_norm": 1.28581702709198, + "learning_rate": 9.809751299704932e-05, + "loss": 5.3434, + "step": 23280 + }, + { + "epoch": 0.03272463498297202, + "grad_norm": 1.2845999002456665, + "learning_rate": 9.813966558943374e-05, + "loss": 5.3271, + "step": 23290 + }, + { + "epoch": 0.032738685921135595, + "grad_norm": 1.4002037048339844, + "learning_rate": 9.818181818181817e-05, + "loss": 5.3637, + "step": 23300 + }, + { + "epoch": 0.03275273685929917, + "grad_norm": 1.2308608293533325, + "learning_rate": 9.822397077420261e-05, + "loss": 5.3833, + "step": 23310 + }, + { + "epoch": 0.032766787797462754, + "grad_norm": 1.3694802522659302, + "learning_rate": 9.826612336658703e-05, + "loss": 5.386, + "step": 23320 + }, + { + "epoch": 0.03278083873562633, + "grad_norm": 1.294423222541809, + "learning_rate": 9.830827595897146e-05, + "loss": 5.2552, + "step": 23330 + }, + { + "epoch": 0.032794889673789906, + "grad_norm": 1.3576372861862183, + "learning_rate": 9.83504285513559e-05, + "loss": 5.3575, + "step": 23340 + }, + { + "epoch": 0.03280894061195348, + "grad_norm": 1.276759386062622, + "learning_rate": 9.839258114374032e-05, + "loss": 5.4282, + "step": 23350 + }, + { + "epoch": 0.03282299155011706, + "grad_norm": 1.290276288986206, + "learning_rate": 9.843473373612476e-05, + "loss": 5.2632, + "step": 23360 + }, + { + "epoch": 0.03283704248828064, + "grad_norm": 1.3245643377304077, + "learning_rate": 9.84768863285092e-05, + "loss": 5.3792, + "step": 23370 + }, + { + "epoch": 0.03285109342644422, + "grad_norm": 1.3463149070739746, + "learning_rate": 9.851903892089363e-05, + "loss": 5.3489, + "step": 23380 + }, + { + "epoch": 0.032865144364607794, + "grad_norm": 1.3925325870513916, + "learning_rate": 9.856119151327805e-05, + "loss": 5.2327, + "step": 23390 + }, + { + "epoch": 
0.03287919530277137, + "grad_norm": 1.3809847831726074, + "learning_rate": 9.86033441056625e-05, + "loss": 5.44, + "step": 23400 + }, + { + "epoch": 0.03289324624093495, + "grad_norm": 1.2953121662139893, + "learning_rate": 9.864549669804692e-05, + "loss": 5.2537, + "step": 23410 + }, + { + "epoch": 0.03290729717909853, + "grad_norm": 1.2769551277160645, + "learning_rate": 9.868764929043134e-05, + "loss": 5.4532, + "step": 23420 + }, + { + "epoch": 0.032921348117262106, + "grad_norm": 1.3292839527130127, + "learning_rate": 9.872980188281579e-05, + "loss": 5.3824, + "step": 23430 + }, + { + "epoch": 0.03293539905542568, + "grad_norm": 1.2691352367401123, + "learning_rate": 9.877195447520022e-05, + "loss": 5.3559, + "step": 23440 + }, + { + "epoch": 0.03294944999358926, + "grad_norm": 1.4160397052764893, + "learning_rate": 9.881410706758464e-05, + "loss": 5.2673, + "step": 23450 + }, + { + "epoch": 0.032963500931752834, + "grad_norm": 1.3387774229049683, + "learning_rate": 9.885625965996908e-05, + "loss": 5.3447, + "step": 23460 + }, + { + "epoch": 0.03297755186991642, + "grad_norm": 1.3649985790252686, + "learning_rate": 9.889841225235351e-05, + "loss": 5.3222, + "step": 23470 + }, + { + "epoch": 0.032991602808079994, + "grad_norm": 1.2664110660552979, + "learning_rate": 9.894056484473793e-05, + "loss": 5.34, + "step": 23480 + }, + { + "epoch": 0.03300565374624357, + "grad_norm": 1.3583790063858032, + "learning_rate": 9.898271743712237e-05, + "loss": 5.2413, + "step": 23490 + }, + { + "epoch": 0.033019704684407146, + "grad_norm": 1.3465863466262817, + "learning_rate": 9.90248700295068e-05, + "loss": 5.245, + "step": 23500 + }, + { + "epoch": 0.03303375562257072, + "grad_norm": 1.3353192806243896, + "learning_rate": 9.906702262189125e-05, + "loss": 5.3347, + "step": 23510 + }, + { + "epoch": 0.033047806560734305, + "grad_norm": 1.3138058185577393, + "learning_rate": 9.910917521427567e-05, + "loss": 5.2903, + "step": 23520 + }, + { + "epoch": 0.03306185749889788, + 
"grad_norm": 1.308861494064331, + "learning_rate": 9.91513278066601e-05, + "loss": 5.2704, + "step": 23530 + }, + { + "epoch": 0.03307590843706146, + "grad_norm": 1.2975902557373047, + "learning_rate": 9.919348039904454e-05, + "loss": 5.3236, + "step": 23540 + }, + { + "epoch": 0.033089959375225034, + "grad_norm": 1.2685211896896362, + "learning_rate": 9.923563299142896e-05, + "loss": 5.3083, + "step": 23550 + }, + { + "epoch": 0.03310401031338861, + "grad_norm": 1.3065454959869385, + "learning_rate": 9.927778558381339e-05, + "loss": 5.3769, + "step": 23560 + }, + { + "epoch": 0.03311806125155219, + "grad_norm": 1.3842145204544067, + "learning_rate": 9.931993817619784e-05, + "loss": 5.3922, + "step": 23570 + }, + { + "epoch": 0.03313211218971577, + "grad_norm": 1.272464632987976, + "learning_rate": 9.936209076858225e-05, + "loss": 5.2546, + "step": 23580 + }, + { + "epoch": 0.033146163127879345, + "grad_norm": 1.3734886646270752, + "learning_rate": 9.940424336096669e-05, + "loss": 5.4634, + "step": 23590 + }, + { + "epoch": 0.03316021406604292, + "grad_norm": 1.3234834671020508, + "learning_rate": 9.944639595335113e-05, + "loss": 5.2879, + "step": 23600 + }, + { + "epoch": 0.0331742650042065, + "grad_norm": 1.3114765882492065, + "learning_rate": 9.948854854573555e-05, + "loss": 5.2448, + "step": 23610 + }, + { + "epoch": 0.033188315942370074, + "grad_norm": 1.239916443824768, + "learning_rate": 9.953070113811998e-05, + "loss": 5.1157, + "step": 23620 + }, + { + "epoch": 0.03320236688053366, + "grad_norm": 1.3152765035629272, + "learning_rate": 9.957285373050442e-05, + "loss": 5.2116, + "step": 23630 + }, + { + "epoch": 0.03321641781869723, + "grad_norm": 1.3076961040496826, + "learning_rate": 9.961500632288884e-05, + "loss": 5.3021, + "step": 23640 + }, + { + "epoch": 0.03323046875686081, + "grad_norm": 1.3413221836090088, + "learning_rate": 9.965715891527327e-05, + "loss": 5.3721, + "step": 23650 + }, + { + "epoch": 0.033244519695024385, + "grad_norm": 
1.2938029766082764, + "learning_rate": 9.969931150765772e-05, + "loss": 5.313, + "step": 23660 + }, + { + "epoch": 0.03325857063318796, + "grad_norm": 1.2781319618225098, + "learning_rate": 9.974146410004215e-05, + "loss": 5.2119, + "step": 23670 + }, + { + "epoch": 0.033272621571351545, + "grad_norm": 1.3234959840774536, + "learning_rate": 9.978361669242657e-05, + "loss": 5.4493, + "step": 23680 + }, + { + "epoch": 0.03328667250951512, + "grad_norm": 1.4533023834228516, + "learning_rate": 9.982576928481101e-05, + "loss": 5.317, + "step": 23690 + }, + { + "epoch": 0.0333007234476787, + "grad_norm": 1.276374101638794, + "learning_rate": 9.986792187719544e-05, + "loss": 5.3605, + "step": 23700 + }, + { + "epoch": 0.03331477438584227, + "grad_norm": 1.292145013809204, + "learning_rate": 9.991007446957986e-05, + "loss": 5.3252, + "step": 23710 + }, + { + "epoch": 0.03332882532400585, + "grad_norm": 1.2901628017425537, + "learning_rate": 9.99522270619643e-05, + "loss": 5.2686, + "step": 23720 + }, + { + "epoch": 0.03334287626216943, + "grad_norm": 1.3100521564483643, + "learning_rate": 9.999437965434874e-05, + "loss": 5.1247, + "step": 23730 + }, + { + "epoch": 0.03335692720033301, + "grad_norm": 1.344598650932312, + "learning_rate": 0.00010003653224673315, + "loss": 5.2823, + "step": 23740 + }, + { + "epoch": 0.033370978138496585, + "grad_norm": 1.4314061403274536, + "learning_rate": 0.0001000786848391176, + "loss": 5.2714, + "step": 23750 + }, + { + "epoch": 0.03338502907666016, + "grad_norm": 1.2748030424118042, + "learning_rate": 0.00010012083743150203, + "loss": 5.3582, + "step": 23760 + }, + { + "epoch": 0.03339908001482374, + "grad_norm": 1.3783010244369507, + "learning_rate": 0.00010016299002388645, + "loss": 5.1891, + "step": 23770 + }, + { + "epoch": 0.03341313095298732, + "grad_norm": 1.3372441530227661, + "learning_rate": 0.00010020514261627089, + "loss": 5.3044, + "step": 23780 + }, + { + "epoch": 0.033427181891150896, + "grad_norm": 1.2305197715759277, + 
"learning_rate": 0.00010024729520865532, + "loss": 5.3672, + "step": 23790 + }, + { + "epoch": 0.03344123282931447, + "grad_norm": 1.28226900100708, + "learning_rate": 0.00010028944780103977, + "loss": 5.3432, + "step": 23800 + }, + { + "epoch": 0.03345528376747805, + "grad_norm": 1.2784920930862427, + "learning_rate": 0.00010033160039342418, + "loss": 5.2558, + "step": 23810 + }, + { + "epoch": 0.033469334705641625, + "grad_norm": 1.298294186592102, + "learning_rate": 0.00010037375298580862, + "loss": 5.296, + "step": 23820 + }, + { + "epoch": 0.03348338564380521, + "grad_norm": 1.3087866306304932, + "learning_rate": 0.00010041590557819306, + "loss": 5.2793, + "step": 23830 + }, + { + "epoch": 0.033497436581968784, + "grad_norm": 1.3522417545318604, + "learning_rate": 0.00010045805817057748, + "loss": 5.369, + "step": 23840 + }, + { + "epoch": 0.03351148752013236, + "grad_norm": 1.2974127531051636, + "learning_rate": 0.00010050021076296191, + "loss": 5.3775, + "step": 23850 + }, + { + "epoch": 0.033525538458295936, + "grad_norm": 1.3162837028503418, + "learning_rate": 0.00010054236335534635, + "loss": 5.3664, + "step": 23860 + }, + { + "epoch": 0.03353958939645951, + "grad_norm": 1.2961353063583374, + "learning_rate": 0.00010058451594773077, + "loss": 5.3546, + "step": 23870 + }, + { + "epoch": 0.033553640334623096, + "grad_norm": 1.3011044263839722, + "learning_rate": 0.0001006266685401152, + "loss": 5.2702, + "step": 23880 + }, + { + "epoch": 0.03356769127278667, + "grad_norm": 1.2920984029769897, + "learning_rate": 0.00010066882113249965, + "loss": 5.2699, + "step": 23890 + }, + { + "epoch": 0.03358174221095025, + "grad_norm": 1.2577017545700073, + "learning_rate": 0.00010071097372488407, + "loss": 5.3344, + "step": 23900 + }, + { + "epoch": 0.033595793149113824, + "grad_norm": 2.0838184356689453, + "learning_rate": 0.0001007531263172685, + "loss": 5.1927, + "step": 23910 + }, + { + "epoch": 0.0336098440872774, + "grad_norm": 1.5343761444091797, + 
"learning_rate": 0.00010079527890965294, + "loss": 5.3123, + "step": 23920 + }, + { + "epoch": 0.03362389502544098, + "grad_norm": 1.3116449117660522, + "learning_rate": 0.00010083743150203737, + "loss": 5.3366, + "step": 23930 + }, + { + "epoch": 0.03363794596360456, + "grad_norm": 1.2814358472824097, + "learning_rate": 0.00010087958409442179, + "loss": 5.2453, + "step": 23940 + }, + { + "epoch": 0.033651996901768136, + "grad_norm": 1.2302325963974, + "learning_rate": 0.00010092173668680623, + "loss": 5.2506, + "step": 23950 + }, + { + "epoch": 0.03366604783993171, + "grad_norm": 1.3629188537597656, + "learning_rate": 0.00010096388927919067, + "loss": 5.3369, + "step": 23960 + }, + { + "epoch": 0.03368009877809529, + "grad_norm": 1.241894245147705, + "learning_rate": 0.00010100604187157508, + "loss": 5.3224, + "step": 23970 + }, + { + "epoch": 0.033694149716258864, + "grad_norm": 1.3450833559036255, + "learning_rate": 0.00010104819446395953, + "loss": 5.2227, + "step": 23980 + }, + { + "epoch": 0.03370820065442245, + "grad_norm": 1.3539717197418213, + "learning_rate": 0.00010109034705634396, + "loss": 5.1751, + "step": 23990 + }, + { + "epoch": 0.033722251592586024, + "grad_norm": 1.5781745910644531, + "learning_rate": 0.00010113249964872838, + "loss": 5.2189, + "step": 24000 + }, + { + "epoch": 0.0337363025307496, + "grad_norm": 1.2791818380355835, + "learning_rate": 0.00010117465224111282, + "loss": 5.3397, + "step": 24010 + }, + { + "epoch": 0.033750353468913176, + "grad_norm": 1.318710207939148, + "learning_rate": 0.00010121680483349725, + "loss": 5.2535, + "step": 24020 + }, + { + "epoch": 0.03376440440707675, + "grad_norm": 1.4116847515106201, + "learning_rate": 0.00010125895742588167, + "loss": 5.246, + "step": 24030 + }, + { + "epoch": 0.033778455345240335, + "grad_norm": 1.352486491203308, + "learning_rate": 0.00010130111001826612, + "loss": 5.3818, + "step": 24040 + }, + { + "epoch": 0.03379250628340391, + "grad_norm": 1.3490184545516968, + 
"learning_rate": 0.00010134326261065055, + "loss": 5.1459, + "step": 24050 + }, + { + "epoch": 0.03380655722156749, + "grad_norm": 1.25826895236969, + "learning_rate": 0.00010138541520303496, + "loss": 5.2441, + "step": 24060 + }, + { + "epoch": 0.033820608159731064, + "grad_norm": 1.2731109857559204, + "learning_rate": 0.00010142756779541941, + "loss": 5.382, + "step": 24070 + }, + { + "epoch": 0.03383465909789464, + "grad_norm": 1.2558667659759521, + "learning_rate": 0.00010146972038780384, + "loss": 5.2548, + "step": 24080 + }, + { + "epoch": 0.03384871003605822, + "grad_norm": 1.3731662034988403, + "learning_rate": 0.00010151187298018828, + "loss": 5.1889, + "step": 24090 + }, + { + "epoch": 0.0338627609742218, + "grad_norm": 1.2584823369979858, + "learning_rate": 0.0001015540255725727, + "loss": 5.2309, + "step": 24100 + }, + { + "epoch": 0.033876811912385375, + "grad_norm": 1.3157720565795898, + "learning_rate": 0.00010159617816495713, + "loss": 5.147, + "step": 24110 + }, + { + "epoch": 0.03389086285054895, + "grad_norm": 1.3942573070526123, + "learning_rate": 0.00010163833075734158, + "loss": 5.1946, + "step": 24120 + }, + { + "epoch": 0.03390491378871253, + "grad_norm": 1.218231201171875, + "learning_rate": 0.000101680483349726, + "loss": 5.3256, + "step": 24130 + }, + { + "epoch": 0.03391896472687611, + "grad_norm": 1.3514090776443481, + "learning_rate": 0.00010172263594211043, + "loss": 5.2472, + "step": 24140 + }, + { + "epoch": 0.03393301566503969, + "grad_norm": 1.4769346714019775, + "learning_rate": 0.00010176478853449487, + "loss": 5.3211, + "step": 24150 + }, + { + "epoch": 0.03394706660320326, + "grad_norm": 1.4301344156265259, + "learning_rate": 0.00010180694112687929, + "loss": 5.3223, + "step": 24160 + }, + { + "epoch": 0.03396111754136684, + "grad_norm": 1.3295481204986572, + "learning_rate": 0.00010184909371926372, + "loss": 5.2742, + "step": 24170 + }, + { + "epoch": 0.033975168479530415, + "grad_norm": 1.34248948097229, + "learning_rate": 
0.00010189124631164817, + "loss": 5.3578, + "step": 24180 + }, + { + "epoch": 0.033989219417694, + "grad_norm": 1.3010623455047607, + "learning_rate": 0.00010193339890403258, + "loss": 5.3045, + "step": 24190 + }, + { + "epoch": 0.034003270355857575, + "grad_norm": 1.3236461877822876, + "learning_rate": 0.00010197555149641701, + "loss": 5.2843, + "step": 24200 + }, + { + "epoch": 0.03401732129402115, + "grad_norm": 1.3258452415466309, + "learning_rate": 0.00010201770408880146, + "loss": 5.1747, + "step": 24210 + }, + { + "epoch": 0.03403137223218473, + "grad_norm": 1.411746621131897, + "learning_rate": 0.00010205985668118589, + "loss": 5.2508, + "step": 24220 + }, + { + "epoch": 0.0340454231703483, + "grad_norm": 1.399804949760437, + "learning_rate": 0.00010210200927357031, + "loss": 5.3071, + "step": 24230 + }, + { + "epoch": 0.03405947410851188, + "grad_norm": 1.3025681972503662, + "learning_rate": 0.00010214416186595475, + "loss": 5.1783, + "step": 24240 + }, + { + "epoch": 0.03407352504667546, + "grad_norm": 1.245466709136963, + "learning_rate": 0.00010218631445833918, + "loss": 5.2707, + "step": 24250 + }, + { + "epoch": 0.03408757598483904, + "grad_norm": 1.307504415512085, + "learning_rate": 0.0001022284670507236, + "loss": 5.418, + "step": 24260 + }, + { + "epoch": 0.034101626923002615, + "grad_norm": 1.331075668334961, + "learning_rate": 0.00010227061964310805, + "loss": 5.3077, + "step": 24270 + }, + { + "epoch": 0.03411567786116619, + "grad_norm": 1.3218508958816528, + "learning_rate": 0.00010231277223549248, + "loss": 5.4505, + "step": 24280 + }, + { + "epoch": 0.03412972879932977, + "grad_norm": 1.4919476509094238, + "learning_rate": 0.0001023549248278769, + "loss": 5.1643, + "step": 24290 + }, + { + "epoch": 0.03414377973749335, + "grad_norm": 1.511077642440796, + "learning_rate": 0.00010239707742026134, + "loss": 5.3138, + "step": 24300 + }, + { + "epoch": 0.034157830675656926, + "grad_norm": 1.2608944177627563, + "learning_rate": 
0.00010243923001264577, + "loss": 5.353, + "step": 24310 + }, + { + "epoch": 0.0341718816138205, + "grad_norm": 1.2811615467071533, + "learning_rate": 0.00010248138260503019, + "loss": 5.326, + "step": 24320 + }, + { + "epoch": 0.03418593255198408, + "grad_norm": 1.3658688068389893, + "learning_rate": 0.00010252353519741463, + "loss": 5.2803, + "step": 24330 + }, + { + "epoch": 0.034199983490147655, + "grad_norm": 1.2968345880508423, + "learning_rate": 0.00010256568778979906, + "loss": 5.2007, + "step": 24340 + }, + { + "epoch": 0.03421403442831124, + "grad_norm": 1.3284701108932495, + "learning_rate": 0.00010260784038218348, + "loss": 5.2224, + "step": 24350 + }, + { + "epoch": 0.034228085366474814, + "grad_norm": 1.2858362197875977, + "learning_rate": 0.00010264999297456793, + "loss": 5.3118, + "step": 24360 + }, + { + "epoch": 0.03424213630463839, + "grad_norm": 1.242591381072998, + "learning_rate": 0.00010269214556695236, + "loss": 5.2242, + "step": 24370 + }, + { + "epoch": 0.034256187242801966, + "grad_norm": 1.282493233680725, + "learning_rate": 0.0001027342981593368, + "loss": 5.4394, + "step": 24380 + }, + { + "epoch": 0.03427023818096554, + "grad_norm": 1.2927570343017578, + "learning_rate": 0.00010277645075172122, + "loss": 5.3704, + "step": 24390 + }, + { + "epoch": 0.034284289119129126, + "grad_norm": 1.2820286750793457, + "learning_rate": 0.00010281860334410565, + "loss": 5.2751, + "step": 24400 + }, + { + "epoch": 0.0342983400572927, + "grad_norm": 1.3351532220840454, + "learning_rate": 0.0001028607559364901, + "loss": 5.2028, + "step": 24410 + }, + { + "epoch": 0.03431239099545628, + "grad_norm": 1.3240007162094116, + "learning_rate": 0.00010290290852887451, + "loss": 5.1208, + "step": 24420 + }, + { + "epoch": 0.034326441933619854, + "grad_norm": 1.3099170923233032, + "learning_rate": 0.00010294506112125894, + "loss": 5.3686, + "step": 24430 + }, + { + "epoch": 0.03434049287178343, + "grad_norm": 1.3434085845947266, + "learning_rate": 
0.00010298721371364339, + "loss": 5.2695, + "step": 24440 + }, + { + "epoch": 0.034354543809947014, + "grad_norm": 1.2506746053695679, + "learning_rate": 0.0001030293663060278, + "loss": 5.4538, + "step": 24450 + }, + { + "epoch": 0.03436859474811059, + "grad_norm": 1.2607885599136353, + "learning_rate": 0.00010307151889841224, + "loss": 5.2082, + "step": 24460 + }, + { + "epoch": 0.034382645686274166, + "grad_norm": 1.2790932655334473, + "learning_rate": 0.00010311367149079668, + "loss": 5.3349, + "step": 24470 + }, + { + "epoch": 0.03439669662443774, + "grad_norm": 1.4397903680801392, + "learning_rate": 0.0001031558240831811, + "loss": 5.2953, + "step": 24480 + }, + { + "epoch": 0.03441074756260132, + "grad_norm": 1.254449725151062, + "learning_rate": 0.00010319797667556553, + "loss": 5.3391, + "step": 24490 + }, + { + "epoch": 0.0344247985007649, + "grad_norm": 1.3138560056686401, + "learning_rate": 0.00010324012926794998, + "loss": 5.2934, + "step": 24500 + }, + { + "epoch": 0.03443884943892848, + "grad_norm": 1.2442389726638794, + "learning_rate": 0.00010328228186033441, + "loss": 5.1722, + "step": 24510 + }, + { + "epoch": 0.034452900377092054, + "grad_norm": 1.3116623163223267, + "learning_rate": 0.00010332443445271882, + "loss": 5.2642, + "step": 24520 + }, + { + "epoch": 0.03446695131525563, + "grad_norm": 1.3116538524627686, + "learning_rate": 0.00010336658704510327, + "loss": 5.3113, + "step": 24530 + }, + { + "epoch": 0.034481002253419206, + "grad_norm": 1.3560738563537598, + "learning_rate": 0.0001034087396374877, + "loss": 5.2302, + "step": 24540 + }, + { + "epoch": 0.03449505319158278, + "grad_norm": 1.3531079292297363, + "learning_rate": 0.00010345089222987212, + "loss": 5.2305, + "step": 24550 + }, + { + "epoch": 0.034509104129746365, + "grad_norm": 1.26160728931427, + "learning_rate": 0.00010349304482225656, + "loss": 5.2376, + "step": 24560 + }, + { + "epoch": 0.03452315506790994, + "grad_norm": 1.4624823331832886, + "learning_rate": 
0.000103535197414641, + "loss": 5.1941, + "step": 24570 + }, + { + "epoch": 0.03453720600607352, + "grad_norm": 1.3396354913711548, + "learning_rate": 0.00010357735000702541, + "loss": 5.1731, + "step": 24580 + }, + { + "epoch": 0.034551256944237094, + "grad_norm": 1.3305305242538452, + "learning_rate": 0.00010361950259940986, + "loss": 5.3613, + "step": 24590 + }, + { + "epoch": 0.03456530788240067, + "grad_norm": 1.2732019424438477, + "learning_rate": 0.00010366165519179429, + "loss": 5.2624, + "step": 24600 + }, + { + "epoch": 0.03457935882056425, + "grad_norm": 1.428470253944397, + "learning_rate": 0.0001037038077841787, + "loss": 5.2881, + "step": 24610 + }, + { + "epoch": 0.03459340975872783, + "grad_norm": 1.3105640411376953, + "learning_rate": 0.00010374596037656315, + "loss": 5.3397, + "step": 24620 + }, + { + "epoch": 0.034607460696891405, + "grad_norm": 1.4078952074050903, + "learning_rate": 0.00010378811296894758, + "loss": 5.4407, + "step": 24630 + }, + { + "epoch": 0.03462151163505498, + "grad_norm": 1.3400802612304688, + "learning_rate": 0.00010383026556133203, + "loss": 5.3244, + "step": 24640 + }, + { + "epoch": 0.03463556257321856, + "grad_norm": 1.315213918685913, + "learning_rate": 0.00010387241815371644, + "loss": 5.3574, + "step": 24650 + }, + { + "epoch": 0.03464961351138214, + "grad_norm": 1.2948580980300903, + "learning_rate": 0.00010391457074610087, + "loss": 5.3071, + "step": 24660 + }, + { + "epoch": 0.03466366444954572, + "grad_norm": 1.3167226314544678, + "learning_rate": 0.00010395672333848532, + "loss": 5.2615, + "step": 24670 + }, + { + "epoch": 0.03467771538770929, + "grad_norm": 1.3595393896102905, + "learning_rate": 0.00010399887593086974, + "loss": 5.3395, + "step": 24680 + }, + { + "epoch": 0.03469176632587287, + "grad_norm": 1.3089308738708496, + "learning_rate": 0.00010404102852325417, + "loss": 5.1544, + "step": 24690 + }, + { + "epoch": 0.034705817264036445, + "grad_norm": 1.2565276622772217, + "learning_rate": 
0.00010408318111563861, + "loss": 5.3002, + "step": 24700 + }, + { + "epoch": 0.03471986820220003, + "grad_norm": 1.256977915763855, + "learning_rate": 0.00010412533370802303, + "loss": 5.287, + "step": 24710 + }, + { + "epoch": 0.034733919140363605, + "grad_norm": 1.2311729192733765, + "learning_rate": 0.00010416748630040746, + "loss": 5.39, + "step": 24720 + }, + { + "epoch": 0.03474797007852718, + "grad_norm": 1.2389764785766602, + "learning_rate": 0.0001042096388927919, + "loss": 5.3162, + "step": 24730 + }, + { + "epoch": 0.03476202101669076, + "grad_norm": 1.2789169549942017, + "learning_rate": 0.00010425179148517632, + "loss": 5.3211, + "step": 24740 + }, + { + "epoch": 0.03477607195485433, + "grad_norm": 1.225974202156067, + "learning_rate": 0.00010429394407756076, + "loss": 5.2767, + "step": 24750 + }, + { + "epoch": 0.034790122893017916, + "grad_norm": 1.2819294929504395, + "learning_rate": 0.0001043360966699452, + "loss": 5.335, + "step": 24760 + }, + { + "epoch": 0.03480417383118149, + "grad_norm": 1.329951524734497, + "learning_rate": 0.00010437824926232962, + "loss": 5.1979, + "step": 24770 + }, + { + "epoch": 0.03481822476934507, + "grad_norm": 1.2084249258041382, + "learning_rate": 0.00010442040185471405, + "loss": 5.3172, + "step": 24780 + }, + { + "epoch": 0.034832275707508645, + "grad_norm": 1.2773807048797607, + "learning_rate": 0.0001044625544470985, + "loss": 5.2737, + "step": 24790 + }, + { + "epoch": 0.03484632664567222, + "grad_norm": 1.302215576171875, + "learning_rate": 0.00010450470703948292, + "loss": 5.2025, + "step": 24800 + }, + { + "epoch": 0.034860377583835804, + "grad_norm": 1.317597508430481, + "learning_rate": 0.00010454685963186734, + "loss": 5.3506, + "step": 24810 + }, + { + "epoch": 0.03487442852199938, + "grad_norm": 1.2877779006958008, + "learning_rate": 0.00010458901222425179, + "loss": 5.2092, + "step": 24820 + }, + { + "epoch": 0.034888479460162956, + "grad_norm": 1.297938585281372, + "learning_rate": 
0.00010463116481663622, + "loss": 5.3321, + "step": 24830 + }, + { + "epoch": 0.03490253039832653, + "grad_norm": 1.4365018606185913, + "learning_rate": 0.00010467331740902064, + "loss": 5.1708, + "step": 24840 + }, + { + "epoch": 0.03491658133649011, + "grad_norm": 1.2824749946594238, + "learning_rate": 0.00010471547000140508, + "loss": 5.2512, + "step": 24850 + }, + { + "epoch": 0.034930632274653685, + "grad_norm": 1.3265628814697266, + "learning_rate": 0.00010475762259378951, + "loss": 5.1575, + "step": 24860 + }, + { + "epoch": 0.03494468321281727, + "grad_norm": 1.360796332359314, + "learning_rate": 0.00010479977518617393, + "loss": 5.2298, + "step": 24870 + }, + { + "epoch": 0.034958734150980844, + "grad_norm": 1.2341952323913574, + "learning_rate": 0.00010484192777855837, + "loss": 5.1945, + "step": 24880 + }, + { + "epoch": 0.03497278508914442, + "grad_norm": 1.2878468036651611, + "learning_rate": 0.0001048840803709428, + "loss": 5.2382, + "step": 24890 + }, + { + "epoch": 0.034986836027307996, + "grad_norm": 1.248975157737732, + "learning_rate": 0.00010492623296332722, + "loss": 5.4503, + "step": 24900 + }, + { + "epoch": 0.03500088696547157, + "grad_norm": 1.2369763851165771, + "learning_rate": 0.00010496838555571167, + "loss": 5.2011, + "step": 24910 + }, + { + "epoch": 0.035014937903635156, + "grad_norm": 1.4231282472610474, + "learning_rate": 0.0001050105381480961, + "loss": 5.3073, + "step": 24920 + }, + { + "epoch": 0.03502898884179873, + "grad_norm": 1.3121232986450195, + "learning_rate": 0.00010505269074048054, + "loss": 5.2135, + "step": 24930 + }, + { + "epoch": 0.03504303977996231, + "grad_norm": 1.2677346467971802, + "learning_rate": 0.00010509484333286496, + "loss": 5.2371, + "step": 24940 + }, + { + "epoch": 0.035057090718125884, + "grad_norm": 1.359431266784668, + "learning_rate": 0.00010513699592524939, + "loss": 5.2875, + "step": 24950 + }, + { + "epoch": 0.03507114165628946, + "grad_norm": 1.2087974548339844, + "learning_rate": 
0.00010517914851763384, + "loss": 5.1022, + "step": 24960 + }, + { + "epoch": 0.035085192594453044, + "grad_norm": 1.297215461730957, + "learning_rate": 0.00010522130111001825, + "loss": 5.3486, + "step": 24970 + }, + { + "epoch": 0.03509924353261662, + "grad_norm": 1.2546544075012207, + "learning_rate": 0.00010526345370240269, + "loss": 5.2505, + "step": 24980 + }, + { + "epoch": 0.035113294470780196, + "grad_norm": 1.3065543174743652, + "learning_rate": 0.00010530560629478713, + "loss": 5.1717, + "step": 24990 + }, + { + "epoch": 0.03512734540894377, + "grad_norm": 1.2734280824661255, + "learning_rate": 0.00010534775888717155, + "loss": 5.1809, + "step": 25000 + }, + { + "epoch": 0.03514139634710735, + "grad_norm": 1.3350660800933838, + "learning_rate": 0.00010538991147955598, + "loss": 5.3357, + "step": 25010 + }, + { + "epoch": 0.03515544728527093, + "grad_norm": 1.3462845087051392, + "learning_rate": 0.00010543206407194042, + "loss": 5.266, + "step": 25020 + }, + { + "epoch": 0.03516949822343451, + "grad_norm": 1.4067381620407104, + "learning_rate": 0.00010547000140508639, + "loss": 5.2521, + "step": 25030 + }, + { + "epoch": 0.035183549161598084, + "grad_norm": 1.3267967700958252, + "learning_rate": 0.00010551215399747084, + "loss": 5.2519, + "step": 25040 + }, + { + "epoch": 0.03519760009976166, + "grad_norm": 1.2769830226898193, + "learning_rate": 0.00010555430658985527, + "loss": 5.3735, + "step": 25050 + }, + { + "epoch": 0.035211651037925236, + "grad_norm": 1.4258005619049072, + "learning_rate": 0.0001055964591822397, + "loss": 5.3553, + "step": 25060 + }, + { + "epoch": 0.03522570197608882, + "grad_norm": 1.370233416557312, + "learning_rate": 0.00010563861177462413, + "loss": 5.2365, + "step": 25070 + }, + { + "epoch": 0.035239752914252395, + "grad_norm": 1.2431062459945679, + "learning_rate": 0.00010568076436700856, + "loss": 5.244, + "step": 25080 + }, + { + "epoch": 0.03525380385241597, + "grad_norm": 1.4300603866577148, + "learning_rate": 
0.00010572291695939301, + "loss": 5.1916, + "step": 25090 + }, + { + "epoch": 0.03526785479057955, + "grad_norm": 1.3223384618759155, + "learning_rate": 0.00010576506955177742, + "loss": 5.1435, + "step": 25100 + }, + { + "epoch": 0.035281905728743124, + "grad_norm": 1.2834317684173584, + "learning_rate": 0.00010580722214416186, + "loss": 5.3326, + "step": 25110 + }, + { + "epoch": 0.03529595666690671, + "grad_norm": 1.2990752458572388, + "learning_rate": 0.0001058493747365463, + "loss": 5.1775, + "step": 25120 + }, + { + "epoch": 0.03531000760507028, + "grad_norm": 1.2824280261993408, + "learning_rate": 0.00010589152732893072, + "loss": 5.2595, + "step": 25130 + }, + { + "epoch": 0.03532405854323386, + "grad_norm": 1.3103786706924438, + "learning_rate": 0.00010593367992131515, + "loss": 5.2454, + "step": 25140 + }, + { + "epoch": 0.035338109481397435, + "grad_norm": 1.2637652158737183, + "learning_rate": 0.0001059758325136996, + "loss": 5.1544, + "step": 25150 + }, + { + "epoch": 0.03535216041956101, + "grad_norm": 1.2903791666030884, + "learning_rate": 0.00010601798510608401, + "loss": 5.2983, + "step": 25160 + }, + { + "epoch": 0.03536621135772459, + "grad_norm": 1.268822193145752, + "learning_rate": 0.00010606013769846844, + "loss": 5.1807, + "step": 25170 + }, + { + "epoch": 0.03538026229588817, + "grad_norm": 1.3194992542266846, + "learning_rate": 0.00010610229029085289, + "loss": 5.1897, + "step": 25180 + }, + { + "epoch": 0.03539431323405175, + "grad_norm": 1.3022356033325195, + "learning_rate": 0.0001061444428832373, + "loss": 5.1931, + "step": 25190 + }, + { + "epoch": 0.03540836417221532, + "grad_norm": 1.4322096109390259, + "learning_rate": 0.00010618659547562174, + "loss": 5.1337, + "step": 25200 + }, + { + "epoch": 0.0354224151103789, + "grad_norm": 1.285513162612915, + "learning_rate": 0.00010622874806800618, + "loss": 5.3514, + "step": 25210 + }, + { + "epoch": 0.035436466048542475, + "grad_norm": 1.3835152387619019, + "learning_rate": 
0.00010627090066039061, + "loss": 5.2987, + "step": 25220 + }, + { + "epoch": 0.03545051698670606, + "grad_norm": 1.3201239109039307, + "learning_rate": 0.00010631305325277503, + "loss": 5.3481, + "step": 25230 + }, + { + "epoch": 0.035464567924869635, + "grad_norm": 1.3172633647918701, + "learning_rate": 0.00010635520584515947, + "loss": 5.2151, + "step": 25240 + }, + { + "epoch": 0.03547861886303321, + "grad_norm": 1.3208476305007935, + "learning_rate": 0.0001063973584375439, + "loss": 5.2665, + "step": 25250 + }, + { + "epoch": 0.03549266980119679, + "grad_norm": 1.2850348949432373, + "learning_rate": 0.00010643951102992832, + "loss": 5.2579, + "step": 25260 + }, + { + "epoch": 0.03550672073936036, + "grad_norm": 1.2918301820755005, + "learning_rate": 0.00010648166362231277, + "loss": 5.3202, + "step": 25270 + }, + { + "epoch": 0.035520771677523946, + "grad_norm": 1.3454853296279907, + "learning_rate": 0.0001065238162146972, + "loss": 5.302, + "step": 25280 + }, + { + "epoch": 0.03553482261568752, + "grad_norm": 1.23288893699646, + "learning_rate": 0.00010656596880708162, + "loss": 5.2014, + "step": 25290 + }, + { + "epoch": 0.0355488735538511, + "grad_norm": 1.2703696489334106, + "learning_rate": 0.00010660812139946606, + "loss": 5.2891, + "step": 25300 + }, + { + "epoch": 0.035562924492014675, + "grad_norm": 1.3757070302963257, + "learning_rate": 0.00010665027399185049, + "loss": 5.2369, + "step": 25310 + }, + { + "epoch": 0.03557697543017825, + "grad_norm": 1.3169059753417969, + "learning_rate": 0.00010669242658423491, + "loss": 5.2194, + "step": 25320 + }, + { + "epoch": 0.035591026368341834, + "grad_norm": 1.3210643529891968, + "learning_rate": 0.00010673457917661935, + "loss": 5.2344, + "step": 25330 + }, + { + "epoch": 0.03560507730650541, + "grad_norm": 1.2993870973587036, + "learning_rate": 0.00010677673176900379, + "loss": 5.2018, + "step": 25340 + }, + { + "epoch": 0.035619128244668986, + "grad_norm": 1.299553632736206, + "learning_rate": 
0.00010681888436138822, + "loss": 5.2323, + "step": 25350 + }, + { + "epoch": 0.03563317918283256, + "grad_norm": 1.4303969144821167, + "learning_rate": 0.00010686103695377265, + "loss": 5.1968, + "step": 25360 + }, + { + "epoch": 0.03564723012099614, + "grad_norm": 1.2443745136260986, + "learning_rate": 0.00010690318954615708, + "loss": 5.3775, + "step": 25370 + }, + { + "epoch": 0.03566128105915972, + "grad_norm": 1.5378773212432861, + "learning_rate": 0.00010694534213854152, + "loss": 5.1952, + "step": 25380 + }, + { + "epoch": 0.0356753319973233, + "grad_norm": 1.2778575420379639, + "learning_rate": 0.00010698749473092594, + "loss": 5.1902, + "step": 25390 + }, + { + "epoch": 0.035689382935486874, + "grad_norm": 1.241885781288147, + "learning_rate": 0.00010702964732331037, + "loss": 5.3114, + "step": 25400 + }, + { + "epoch": 0.03570343387365045, + "grad_norm": 1.250803828239441, + "learning_rate": 0.00010707179991569482, + "loss": 5.2635, + "step": 25410 + }, + { + "epoch": 0.035717484811814026, + "grad_norm": 1.1884135007858276, + "learning_rate": 0.00010711395250807924, + "loss": 5.2907, + "step": 25420 + }, + { + "epoch": 0.03573153574997761, + "grad_norm": 1.169700026512146, + "learning_rate": 0.00010715610510046367, + "loss": 5.2049, + "step": 25430 + }, + { + "epoch": 0.035745586688141186, + "grad_norm": 1.2907392978668213, + "learning_rate": 0.00010719825769284811, + "loss": 5.1974, + "step": 25440 + }, + { + "epoch": 0.03575963762630476, + "grad_norm": 1.3100101947784424, + "learning_rate": 0.00010724041028523253, + "loss": 5.1541, + "step": 25450 + }, + { + "epoch": 0.03577368856446834, + "grad_norm": 1.3046437501907349, + "learning_rate": 0.00010728256287761696, + "loss": 5.2203, + "step": 25460 + }, + { + "epoch": 0.035787739502631914, + "grad_norm": 1.2555994987487793, + "learning_rate": 0.0001073247154700014, + "loss": 5.2891, + "step": 25470 + }, + { + "epoch": 0.0358017904407955, + "grad_norm": 1.315011978149414, + "learning_rate": 
0.00010736686806238582, + "loss": 5.3418, + "step": 25480 + }, + { + "epoch": 0.035815841378959073, + "grad_norm": 1.3488926887512207, + "learning_rate": 0.00010740902065477025, + "loss": 5.082, + "step": 25490 + }, + { + "epoch": 0.03582989231712265, + "grad_norm": 1.2231671810150146, + "learning_rate": 0.0001074511732471547, + "loss": 5.2338, + "step": 25500 + }, + { + "epoch": 0.035843943255286226, + "grad_norm": 1.2394180297851562, + "learning_rate": 0.00010749332583953913, + "loss": 5.2079, + "step": 25510 + }, + { + "epoch": 0.0358579941934498, + "grad_norm": 1.2652291059494019, + "learning_rate": 0.00010753547843192355, + "loss": 5.09, + "step": 25520 + }, + { + "epoch": 0.03587204513161338, + "grad_norm": 1.3209153413772583, + "learning_rate": 0.00010757763102430799, + "loss": 5.1912, + "step": 25530 + }, + { + "epoch": 0.03588609606977696, + "grad_norm": 1.2716671228408813, + "learning_rate": 0.00010761978361669242, + "loss": 5.1782, + "step": 25540 + }, + { + "epoch": 0.03590014700794054, + "grad_norm": 1.2218382358551025, + "learning_rate": 0.00010766193620907684, + "loss": 5.3313, + "step": 25550 + }, + { + "epoch": 0.035914197946104114, + "grad_norm": 1.2553699016571045, + "learning_rate": 0.00010770408880146129, + "loss": 5.2144, + "step": 25560 + }, + { + "epoch": 0.03592824888426769, + "grad_norm": 1.3492262363433838, + "learning_rate": 0.00010774624139384572, + "loss": 5.2676, + "step": 25570 + }, + { + "epoch": 0.035942299822431266, + "grad_norm": 1.2544468641281128, + "learning_rate": 0.00010778839398623013, + "loss": 5.2419, + "step": 25580 + }, + { + "epoch": 0.03595635076059485, + "grad_norm": 1.228851079940796, + "learning_rate": 0.00010783054657861458, + "loss": 5.2684, + "step": 25590 + }, + { + "epoch": 0.035970401698758425, + "grad_norm": 1.2383124828338623, + "learning_rate": 0.00010787269917099901, + "loss": 5.2532, + "step": 25600 + }, + { + "epoch": 0.035984452636922, + "grad_norm": 1.3050917387008667, + "learning_rate": 
0.00010791485176338343, + "loss": 5.2751, + "step": 25610 + }, + { + "epoch": 0.03599850357508558, + "grad_norm": 1.26146399974823, + "learning_rate": 0.00010795700435576787, + "loss": 5.1677, + "step": 25620 + }, + { + "epoch": 0.036012554513249154, + "grad_norm": 1.2697525024414062, + "learning_rate": 0.0001079991569481523, + "loss": 5.3691, + "step": 25630 + }, + { + "epoch": 0.03602660545141274, + "grad_norm": 1.299674153327942, + "learning_rate": 0.00010804130954053673, + "loss": 5.198, + "step": 25640 + }, + { + "epoch": 0.03604065638957631, + "grad_norm": 1.3521478176116943, + "learning_rate": 0.00010808346213292117, + "loss": 5.0404, + "step": 25650 + }, + { + "epoch": 0.03605470732773989, + "grad_norm": 1.270524501800537, + "learning_rate": 0.0001081256147253056, + "loss": 5.1391, + "step": 25660 + }, + { + "epoch": 0.036068758265903465, + "grad_norm": 1.2725433111190796, + "learning_rate": 0.00010816776731769004, + "loss": 5.3264, + "step": 25670 + }, + { + "epoch": 0.03608280920406704, + "grad_norm": 1.283799171447754, + "learning_rate": 0.00010820991991007446, + "loss": 5.2389, + "step": 25680 + }, + { + "epoch": 0.036096860142230625, + "grad_norm": 1.2880244255065918, + "learning_rate": 0.00010825207250245889, + "loss": 5.2285, + "step": 25690 + }, + { + "epoch": 0.0361109110803942, + "grad_norm": 1.3934433460235596, + "learning_rate": 0.00010829422509484334, + "loss": 5.2087, + "step": 25700 + }, + { + "epoch": 0.03612496201855778, + "grad_norm": 1.244972825050354, + "learning_rate": 0.00010833637768722775, + "loss": 5.3292, + "step": 25710 + }, + { + "epoch": 0.03613901295672135, + "grad_norm": 1.2489376068115234, + "learning_rate": 0.00010837853027961218, + "loss": 5.4028, + "step": 25720 + }, + { + "epoch": 0.03615306389488493, + "grad_norm": 1.2495671510696411, + "learning_rate": 0.00010842068287199663, + "loss": 5.1091, + "step": 25730 + }, + { + "epoch": 0.03616711483304851, + "grad_norm": 1.2367503643035889, + "learning_rate": 
0.00010846283546438105, + "loss": 5.3419, + "step": 25740 + }, + { + "epoch": 0.03618116577121209, + "grad_norm": 1.247799038887024, + "learning_rate": 0.00010850498805676548, + "loss": 5.2247, + "step": 25750 + }, + { + "epoch": 0.036195216709375665, + "grad_norm": 1.3112616539001465, + "learning_rate": 0.00010854714064914992, + "loss": 5.3467, + "step": 25760 + }, + { + "epoch": 0.03620926764753924, + "grad_norm": 1.2683191299438477, + "learning_rate": 0.00010858929324153435, + "loss": 5.2628, + "step": 25770 + }, + { + "epoch": 0.03622331858570282, + "grad_norm": 1.295248031616211, + "learning_rate": 0.00010863144583391877, + "loss": 5.2255, + "step": 25780 + }, + { + "epoch": 0.0362373695238664, + "grad_norm": 1.3652722835540771, + "learning_rate": 0.00010867359842630322, + "loss": 5.1288, + "step": 25790 + }, + { + "epoch": 0.036251420462029976, + "grad_norm": 1.3258819580078125, + "learning_rate": 0.00010871575101868765, + "loss": 5.2706, + "step": 25800 + }, + { + "epoch": 0.03626547140019355, + "grad_norm": 1.213344931602478, + "learning_rate": 0.00010875790361107206, + "loss": 5.2729, + "step": 25810 + }, + { + "epoch": 0.03627952233835713, + "grad_norm": 1.3529800176620483, + "learning_rate": 0.00010880005620345651, + "loss": 5.1662, + "step": 25820 + }, + { + "epoch": 0.036293573276520705, + "grad_norm": 1.239091157913208, + "learning_rate": 0.00010884220879584094, + "loss": 5.1821, + "step": 25830 + }, + { + "epoch": 0.03630762421468428, + "grad_norm": 1.2374707460403442, + "learning_rate": 0.00010888436138822536, + "loss": 5.326, + "step": 25840 + }, + { + "epoch": 0.036321675152847864, + "grad_norm": 1.2616337537765503, + "learning_rate": 0.0001089265139806098, + "loss": 5.2543, + "step": 25850 + }, + { + "epoch": 0.03633572609101144, + "grad_norm": 1.256049633026123, + "learning_rate": 0.00010896866657299423, + "loss": 5.1544, + "step": 25860 + }, + { + "epoch": 0.036349777029175016, + "grad_norm": 1.4155590534210205, + "learning_rate": 
0.00010901081916537865, + "loss": 5.3104, + "step": 25870 + }, + { + "epoch": 0.03636382796733859, + "grad_norm": 1.2644518613815308, + "learning_rate": 0.0001090529717577631, + "loss": 5.198, + "step": 25880 + }, + { + "epoch": 0.03637787890550217, + "grad_norm": 1.264570951461792, + "learning_rate": 0.00010909512435014753, + "loss": 5.0051, + "step": 25890 + }, + { + "epoch": 0.03639192984366575, + "grad_norm": 1.208848237991333, + "learning_rate": 0.00010913727694253195, + "loss": 5.2239, + "step": 25900 + }, + { + "epoch": 0.03640598078182933, + "grad_norm": 1.2104336023330688, + "learning_rate": 0.00010917942953491639, + "loss": 5.2732, + "step": 25910 + }, + { + "epoch": 0.036420031719992904, + "grad_norm": 1.298794150352478, + "learning_rate": 0.00010922158212730082, + "loss": 5.1825, + "step": 25920 + }, + { + "epoch": 0.03643408265815648, + "grad_norm": 1.3972251415252686, + "learning_rate": 0.00010926373471968525, + "loss": 5.4068, + "step": 25930 + }, + { + "epoch": 0.036448133596320056, + "grad_norm": 1.3141639232635498, + "learning_rate": 0.00010930588731206968, + "loss": 5.3439, + "step": 25940 + }, + { + "epoch": 0.03646218453448364, + "grad_norm": 1.4204273223876953, + "learning_rate": 0.00010934803990445411, + "loss": 5.1509, + "step": 25950 + }, + { + "epoch": 0.036476235472647216, + "grad_norm": 1.2282731533050537, + "learning_rate": 0.00010939019249683855, + "loss": 5.1782, + "step": 25960 + }, + { + "epoch": 0.03649028641081079, + "grad_norm": 1.2781927585601807, + "learning_rate": 0.00010943234508922298, + "loss": 5.1435, + "step": 25970 + }, + { + "epoch": 0.03650433734897437, + "grad_norm": 1.2722803354263306, + "learning_rate": 0.00010947449768160741, + "loss": 5.1179, + "step": 25980 + }, + { + "epoch": 0.036518388287137944, + "grad_norm": 1.3081459999084473, + "learning_rate": 0.00010951665027399185, + "loss": 5.0402, + "step": 25990 + }, + { + "epoch": 0.03653243922530153, + "grad_norm": 1.235568881034851, + "learning_rate": 
0.00010955880286637627, + "loss": 5.22, + "step": 26000 + }, + { + "epoch": 0.036546490163465103, + "grad_norm": 1.3073358535766602, + "learning_rate": 0.0001096009554587607, + "loss": 5.2479, + "step": 26010 + }, + { + "epoch": 0.03656054110162868, + "grad_norm": 1.6806720495224, + "learning_rate": 0.00010964310805114515, + "loss": 5.1738, + "step": 26020 + }, + { + "epoch": 0.036574592039792256, + "grad_norm": 1.2512214183807373, + "learning_rate": 0.00010968526064352956, + "loss": 5.2327, + "step": 26030 + }, + { + "epoch": 0.03658864297795583, + "grad_norm": 1.2663812637329102, + "learning_rate": 0.000109727413235914, + "loss": 5.1307, + "step": 26040 + }, + { + "epoch": 0.036602693916119415, + "grad_norm": 1.2181764841079712, + "learning_rate": 0.00010976956582829844, + "loss": 5.2383, + "step": 26050 + }, + { + "epoch": 0.03661674485428299, + "grad_norm": 1.291024088859558, + "learning_rate": 0.00010981171842068287, + "loss": 5.1706, + "step": 26060 + }, + { + "epoch": 0.03663079579244657, + "grad_norm": 1.2037990093231201, + "learning_rate": 0.00010985387101306729, + "loss": 5.1271, + "step": 26070 + }, + { + "epoch": 0.036644846730610144, + "grad_norm": 1.1984660625457764, + "learning_rate": 0.00010989602360545173, + "loss": 5.291, + "step": 26080 + }, + { + "epoch": 0.03665889766877372, + "grad_norm": 1.301593542098999, + "learning_rate": 0.00010993817619783616, + "loss": 5.1974, + "step": 26090 + }, + { + "epoch": 0.0366729486069373, + "grad_norm": 1.249263882637024, + "learning_rate": 0.00010998032879022058, + "loss": 5.1413, + "step": 26100 + }, + { + "epoch": 0.03668699954510088, + "grad_norm": 1.3108259439468384, + "learning_rate": 0.00011002248138260503, + "loss": 5.3095, + "step": 26110 + }, + { + "epoch": 0.036701050483264455, + "grad_norm": 1.3736425638198853, + "learning_rate": 0.00011006463397498946, + "loss": 5.2484, + "step": 26120 + }, + { + "epoch": 0.03671510142142803, + "grad_norm": 1.2452633380889893, + "learning_rate": 
0.00011010678656737388, + "loss": 5.2032, + "step": 26130 + }, + { + "epoch": 0.03672915235959161, + "grad_norm": 1.235262393951416, + "learning_rate": 0.00011014893915975832, + "loss": 5.2425, + "step": 26140 + }, + { + "epoch": 0.036743203297755184, + "grad_norm": 1.2423274517059326, + "learning_rate": 0.00011019109175214275, + "loss": 5.1752, + "step": 26150 + }, + { + "epoch": 0.03675725423591877, + "grad_norm": 1.3279187679290771, + "learning_rate": 0.00011023324434452717, + "loss": 5.232, + "step": 26160 + }, + { + "epoch": 0.03677130517408234, + "grad_norm": 1.2280527353286743, + "learning_rate": 0.00011027539693691161, + "loss": 5.3038, + "step": 26170 + }, + { + "epoch": 0.03678535611224592, + "grad_norm": 1.324863314628601, + "learning_rate": 0.00011031754952929604, + "loss": 5.131, + "step": 26180 + }, + { + "epoch": 0.036799407050409495, + "grad_norm": 1.3295210599899292, + "learning_rate": 0.00011035970212168046, + "loss": 5.2175, + "step": 26190 + }, + { + "epoch": 0.03681345798857307, + "grad_norm": 1.2481762170791626, + "learning_rate": 0.00011040185471406491, + "loss": 5.1572, + "step": 26200 + }, + { + "epoch": 0.036827508926736655, + "grad_norm": 1.2397472858428955, + "learning_rate": 0.00011044400730644934, + "loss": 5.1869, + "step": 26210 + }, + { + "epoch": 0.03684155986490023, + "grad_norm": 1.243333339691162, + "learning_rate": 0.00011048615989883377, + "loss": 5.1236, + "step": 26220 + }, + { + "epoch": 0.03685561080306381, + "grad_norm": 1.2801663875579834, + "learning_rate": 0.0001105283124912182, + "loss": 5.2453, + "step": 26230 + }, + { + "epoch": 0.03686966174122738, + "grad_norm": 1.3085575103759766, + "learning_rate": 0.00011057046508360263, + "loss": 5.131, + "step": 26240 + }, + { + "epoch": 0.03688371267939096, + "grad_norm": 1.2628742456436157, + "learning_rate": 0.00011061261767598706, + "loss": 5.2099, + "step": 26250 + }, + { + "epoch": 0.03689776361755454, + "grad_norm": 1.223900556564331, + "learning_rate": 
0.0001106547702683715, + "loss": 5.234, + "step": 26260 + }, + { + "epoch": 0.03691181455571812, + "grad_norm": 1.3309193849563599, + "learning_rate": 0.00011069692286075593, + "loss": 5.3202, + "step": 26270 + }, + { + "epoch": 0.036925865493881695, + "grad_norm": 1.4030629396438599, + "learning_rate": 0.00011073907545314037, + "loss": 5.3066, + "step": 26280 + }, + { + "epoch": 0.03693991643204527, + "grad_norm": 1.2538514137268066, + "learning_rate": 0.00011078122804552479, + "loss": 5.157, + "step": 26290 + }, + { + "epoch": 0.03695396737020885, + "grad_norm": 1.26717209815979, + "learning_rate": 0.00011082338063790922, + "loss": 5.1852, + "step": 26300 + }, + { + "epoch": 0.03696801830837243, + "grad_norm": 1.2575275897979736, + "learning_rate": 0.00011086553323029366, + "loss": 5.3232, + "step": 26310 + }, + { + "epoch": 0.036982069246536006, + "grad_norm": 1.3039658069610596, + "learning_rate": 0.00011090768582267808, + "loss": 5.1781, + "step": 26320 + }, + { + "epoch": 0.03699612018469958, + "grad_norm": 1.3649189472198486, + "learning_rate": 0.00011094983841506251, + "loss": 5.2759, + "step": 26330 + }, + { + "epoch": 0.03701017112286316, + "grad_norm": 1.210462212562561, + "learning_rate": 0.00011099199100744696, + "loss": 5.0839, + "step": 26340 + }, + { + "epoch": 0.037024222061026735, + "grad_norm": 1.2349313497543335, + "learning_rate": 0.00011103414359983139, + "loss": 5.1527, + "step": 26350 + }, + { + "epoch": 0.03703827299919032, + "grad_norm": 1.326310634613037, + "learning_rate": 0.0001110762961922158, + "loss": 5.1428, + "step": 26360 + }, + { + "epoch": 0.037052323937353894, + "grad_norm": 1.2547760009765625, + "learning_rate": 0.00011111844878460025, + "loss": 5.1292, + "step": 26370 + }, + { + "epoch": 0.03706637487551747, + "grad_norm": 1.3274054527282715, + "learning_rate": 0.00011116060137698468, + "loss": 5.262, + "step": 26380 + }, + { + "epoch": 0.037080425813681046, + "grad_norm": 1.247420072555542, + "learning_rate": 
0.0001112027539693691, + "loss": 5.3095, + "step": 26390 + }, + { + "epoch": 0.03709447675184462, + "grad_norm": 1.4132479429244995, + "learning_rate": 0.00011124490656175354, + "loss": 5.2025, + "step": 26400 + }, + { + "epoch": 0.037108527690008206, + "grad_norm": 1.262511968612671, + "learning_rate": 0.00011128705915413798, + "loss": 5.1409, + "step": 26410 + }, + { + "epoch": 0.03712257862817178, + "grad_norm": 1.2406529188156128, + "learning_rate": 0.00011132921174652239, + "loss": 5.1738, + "step": 26420 + }, + { + "epoch": 0.03713662956633536, + "grad_norm": 1.169571042060852, + "learning_rate": 0.00011137136433890684, + "loss": 5.2443, + "step": 26430 + }, + { + "epoch": 0.037150680504498934, + "grad_norm": 1.268477201461792, + "learning_rate": 0.00011141351693129127, + "loss": 5.1545, + "step": 26440 + }, + { + "epoch": 0.03716473144266251, + "grad_norm": 1.2400575876235962, + "learning_rate": 0.00011145566952367569, + "loss": 5.2186, + "step": 26450 + }, + { + "epoch": 0.037178782380826086, + "grad_norm": 1.244472622871399, + "learning_rate": 0.00011149782211606013, + "loss": 5.1656, + "step": 26460 + }, + { + "epoch": 0.03719283331898967, + "grad_norm": 1.3057937622070312, + "learning_rate": 0.00011153997470844456, + "loss": 5.0956, + "step": 26470 + }, + { + "epoch": 0.037206884257153246, + "grad_norm": 1.222322702407837, + "learning_rate": 0.00011158212730082898, + "loss": 5.2298, + "step": 26480 + }, + { + "epoch": 0.03722093519531682, + "grad_norm": 1.2240188121795654, + "learning_rate": 0.00011162427989321342, + "loss": 5.1473, + "step": 26490 + }, + { + "epoch": 0.0372349861334804, + "grad_norm": 1.2890682220458984, + "learning_rate": 0.00011166643248559786, + "loss": 5.0835, + "step": 26500 + }, + { + "epoch": 0.037249037071643974, + "grad_norm": 1.3182138204574585, + "learning_rate": 0.00011170858507798229, + "loss": 5.3088, + "step": 26510 + }, + { + "epoch": 0.03726308800980756, + "grad_norm": 1.2409218549728394, + "learning_rate": 
0.00011175073767036672, + "loss": 5.2822, + "step": 26520 + }, + { + "epoch": 0.037277138947971133, + "grad_norm": 1.2367993593215942, + "learning_rate": 0.00011179289026275115, + "loss": 5.1691, + "step": 26530 + }, + { + "epoch": 0.03729118988613471, + "grad_norm": 1.2489789724349976, + "learning_rate": 0.00011183504285513558, + "loss": 5.2188, + "step": 26540 + }, + { + "epoch": 0.037305240824298286, + "grad_norm": 1.2142417430877686, + "learning_rate": 0.00011187719544752001, + "loss": 5.2871, + "step": 26550 + }, + { + "epoch": 0.03731929176246186, + "grad_norm": 1.2378629446029663, + "learning_rate": 0.00011191934803990444, + "loss": 5.0973, + "step": 26560 + }, + { + "epoch": 0.037333342700625445, + "grad_norm": 1.2806910276412964, + "learning_rate": 0.00011196150063228889, + "loss": 5.3434, + "step": 26570 + }, + { + "epoch": 0.03734739363878902, + "grad_norm": 1.3529186248779297, + "learning_rate": 0.0001120036532246733, + "loss": 5.1493, + "step": 26580 + }, + { + "epoch": 0.0373614445769526, + "grad_norm": 1.2549669742584229, + "learning_rate": 0.00011204580581705774, + "loss": 5.0873, + "step": 26590 + }, + { + "epoch": 0.037375495515116174, + "grad_norm": 1.2252702713012695, + "learning_rate": 0.00011208795840944218, + "loss": 5.1133, + "step": 26600 + }, + { + "epoch": 0.03738954645327975, + "grad_norm": 1.2540626525878906, + "learning_rate": 0.0001121301110018266, + "loss": 5.2449, + "step": 26610 + }, + { + "epoch": 0.03740359739144333, + "grad_norm": 1.178832769393921, + "learning_rate": 0.00011217226359421103, + "loss": 5.1955, + "step": 26620 + }, + { + "epoch": 0.03741764832960691, + "grad_norm": 1.2755051851272583, + "learning_rate": 0.00011221441618659547, + "loss": 5.0336, + "step": 26630 + }, + { + "epoch": 0.037431699267770485, + "grad_norm": 1.2864094972610474, + "learning_rate": 0.0001122565687789799, + "loss": 5.2242, + "step": 26640 + }, + { + "epoch": 0.03744575020593406, + "grad_norm": 1.2537206411361694, + "learning_rate": 
0.00011229872137136432, + "loss": 5.3039, + "step": 26650 + }, + { + "epoch": 0.03745980114409764, + "grad_norm": 1.2298046350479126, + "learning_rate": 0.00011234087396374877, + "loss": 5.1417, + "step": 26660 + }, + { + "epoch": 0.03747385208226122, + "grad_norm": 1.2369047403335571, + "learning_rate": 0.0001123830265561332, + "loss": 5.2243, + "step": 26670 + }, + { + "epoch": 0.0374879030204248, + "grad_norm": 1.2635163068771362, + "learning_rate": 0.00011242517914851762, + "loss": 5.2349, + "step": 26680 + }, + { + "epoch": 0.03750195395858837, + "grad_norm": 1.3052091598510742, + "learning_rate": 0.00011246733174090206, + "loss": 5.1844, + "step": 26690 + }, + { + "epoch": 0.03751600489675195, + "grad_norm": 1.2509127855300903, + "learning_rate": 0.00011250948433328649, + "loss": 5.1712, + "step": 26700 + }, + { + "epoch": 0.037530055834915525, + "grad_norm": 1.2266826629638672, + "learning_rate": 0.00011255163692567091, + "loss": 5.1993, + "step": 26710 + }, + { + "epoch": 0.03754410677307911, + "grad_norm": 1.3366031646728516, + "learning_rate": 0.00011259378951805535, + "loss": 5.178, + "step": 26720 + }, + { + "epoch": 0.037558157711242685, + "grad_norm": 1.2389544248580933, + "learning_rate": 0.00011263594211043979, + "loss": 5.282, + "step": 26730 + }, + { + "epoch": 0.03757220864940626, + "grad_norm": 1.2713171243667603, + "learning_rate": 0.0001126780947028242, + "loss": 5.1055, + "step": 26740 + }, + { + "epoch": 0.03758625958756984, + "grad_norm": 1.268943190574646, + "learning_rate": 0.00011272024729520865, + "loss": 5.2751, + "step": 26750 + }, + { + "epoch": 0.03760031052573341, + "grad_norm": 1.302060842514038, + "learning_rate": 0.00011276239988759308, + "loss": 5.1239, + "step": 26760 + }, + { + "epoch": 0.03761436146389699, + "grad_norm": 1.2425678968429565, + "learning_rate": 0.00011280455247997751, + "loss": 5.2777, + "step": 26770 + }, + { + "epoch": 0.03762841240206057, + "grad_norm": 1.297904133796692, + "learning_rate": 
0.00011284670507236194, + "loss": 5.3101, + "step": 26780 + }, + { + "epoch": 0.03764246334022415, + "grad_norm": 1.3182086944580078, + "learning_rate": 0.00011288885766474637, + "loss": 5.1886, + "step": 26790 + }, + { + "epoch": 0.037656514278387725, + "grad_norm": 1.1934064626693726, + "learning_rate": 0.0001129310102571308, + "loss": 5.1946, + "step": 26800 + }, + { + "epoch": 0.0376705652165513, + "grad_norm": 1.2613691091537476, + "learning_rate": 0.00011297316284951524, + "loss": 5.2152, + "step": 26810 + }, + { + "epoch": 0.03768461615471488, + "grad_norm": 1.2664833068847656, + "learning_rate": 0.00011301531544189967, + "loss": 5.3019, + "step": 26820 + }, + { + "epoch": 0.03769866709287846, + "grad_norm": 1.687343955039978, + "learning_rate": 0.0001130574680342841, + "loss": 5.081, + "step": 26830 + }, + { + "epoch": 0.037712718031042036, + "grad_norm": 1.2772016525268555, + "learning_rate": 0.00011309962062666853, + "loss": 5.1301, + "step": 26840 + }, + { + "epoch": 0.03772676896920561, + "grad_norm": 1.2881845235824585, + "learning_rate": 0.00011314177321905296, + "loss": 5.2073, + "step": 26850 + }, + { + "epoch": 0.03774081990736919, + "grad_norm": 1.4994240999221802, + "learning_rate": 0.0001131839258114374, + "loss": 5.2521, + "step": 26860 + }, + { + "epoch": 0.037754870845532765, + "grad_norm": 1.3068636655807495, + "learning_rate": 0.00011322607840382182, + "loss": 5.1688, + "step": 26870 + }, + { + "epoch": 0.03776892178369635, + "grad_norm": 1.2948259115219116, + "learning_rate": 0.00011326823099620625, + "loss": 5.1923, + "step": 26880 + }, + { + "epoch": 0.037782972721859924, + "grad_norm": 1.2383815050125122, + "learning_rate": 0.0001133103835885907, + "loss": 5.205, + "step": 26890 + }, + { + "epoch": 0.0377970236600235, + "grad_norm": 1.258082628250122, + "learning_rate": 0.00011335253618097512, + "loss": 5.1863, + "step": 26900 + }, + { + "epoch": 0.037811074598187076, + "grad_norm": 1.2574599981307983, + "learning_rate": 
0.00011339468877335955, + "loss": 5.1459, + "step": 26910 + }, + { + "epoch": 0.03782512553635065, + "grad_norm": 1.3260610103607178, + "learning_rate": 0.00011343684136574399, + "loss": 5.2764, + "step": 26920 + }, + { + "epoch": 0.037839176474514236, + "grad_norm": 1.1936684846878052, + "learning_rate": 0.00011347899395812842, + "loss": 5.2587, + "step": 26930 + }, + { + "epoch": 0.03785322741267781, + "grad_norm": 1.3640984296798706, + "learning_rate": 0.00011352114655051284, + "loss": 5.0741, + "step": 26940 + }, + { + "epoch": 0.03786727835084139, + "grad_norm": 1.3140736818313599, + "learning_rate": 0.00011356329914289729, + "loss": 5.1392, + "step": 26950 + }, + { + "epoch": 0.037881329289004964, + "grad_norm": 1.268675446510315, + "learning_rate": 0.00011360545173528172, + "loss": 5.1514, + "step": 26960 + }, + { + "epoch": 0.03789538022716854, + "grad_norm": 1.4075084924697876, + "learning_rate": 0.00011364760432766613, + "loss": 5.1024, + "step": 26970 + }, + { + "epoch": 0.03790943116533212, + "grad_norm": 1.2317532300949097, + "learning_rate": 0.00011368975692005058, + "loss": 5.2554, + "step": 26980 + }, + { + "epoch": 0.0379234821034957, + "grad_norm": 1.3724669218063354, + "learning_rate": 0.00011373190951243501, + "loss": 5.1995, + "step": 26990 + }, + { + "epoch": 0.037937533041659276, + "grad_norm": 1.2524470090866089, + "learning_rate": 0.00011377406210481943, + "loss": 5.2892, + "step": 27000 + }, + { + "epoch": 0.03795158397982285, + "grad_norm": 1.231868863105774, + "learning_rate": 0.00011381621469720387, + "loss": 5.2432, + "step": 27010 + }, + { + "epoch": 0.03796563491798643, + "grad_norm": 1.224562406539917, + "learning_rate": 0.0001138583672895883, + "loss": 5.3197, + "step": 27020 + }, + { + "epoch": 0.03797968585615001, + "grad_norm": 1.390925407409668, + "learning_rate": 0.00011390051988197272, + "loss": 5.2181, + "step": 27030 + }, + { + "epoch": 0.03799373679431359, + "grad_norm": 1.3121964931488037, + "learning_rate": 
0.00011394267247435717, + "loss": 5.1817, + "step": 27040 + }, + { + "epoch": 0.03800778773247716, + "grad_norm": 1.2776087522506714, + "learning_rate": 0.0001139848250667416, + "loss": 5.2207, + "step": 27050 + }, + { + "epoch": 0.03802183867064074, + "grad_norm": 1.2448629140853882, + "learning_rate": 0.00011402697765912603, + "loss": 5.1486, + "step": 27060 + }, + { + "epoch": 0.038035889608804316, + "grad_norm": 1.22585129737854, + "learning_rate": 0.00011406913025151046, + "loss": 5.1954, + "step": 27070 + }, + { + "epoch": 0.03804994054696789, + "grad_norm": 1.289219856262207, + "learning_rate": 0.00011411128284389489, + "loss": 5.1862, + "step": 27080 + }, + { + "epoch": 0.038063991485131475, + "grad_norm": 1.2818207740783691, + "learning_rate": 0.00011415343543627932, + "loss": 5.2999, + "step": 27090 + }, + { + "epoch": 0.03807804242329505, + "grad_norm": 1.237623691558838, + "learning_rate": 0.00011419558802866375, + "loss": 5.0843, + "step": 27100 + }, + { + "epoch": 0.03809209336145863, + "grad_norm": 1.233920931816101, + "learning_rate": 0.00011423774062104818, + "loss": 5.1438, + "step": 27110 + }, + { + "epoch": 0.038106144299622204, + "grad_norm": 1.782360315322876, + "learning_rate": 0.00011427989321343262, + "loss": 5.1636, + "step": 27120 + }, + { + "epoch": 0.03812019523778578, + "grad_norm": 1.2615609169006348, + "learning_rate": 0.00011432204580581705, + "loss": 5.1292, + "step": 27130 + }, + { + "epoch": 0.03813424617594936, + "grad_norm": 1.2505155801773071, + "learning_rate": 0.00011436419839820148, + "loss": 5.1816, + "step": 27140 + }, + { + "epoch": 0.03814829711411294, + "grad_norm": 1.231854796409607, + "learning_rate": 0.00011440635099058592, + "loss": 5.1576, + "step": 27150 + }, + { + "epoch": 0.038162348052276515, + "grad_norm": 1.2065684795379639, + "learning_rate": 0.00011444850358297034, + "loss": 5.1434, + "step": 27160 + }, + { + "epoch": 0.03817639899044009, + "grad_norm": 1.2832350730895996, + "learning_rate": 
0.00011449065617535477, + "loss": 5.1549, + "step": 27170 + }, + { + "epoch": 0.03819044992860367, + "grad_norm": 1.2158983945846558, + "learning_rate": 0.00011453280876773922, + "loss": 5.1136, + "step": 27180 + }, + { + "epoch": 0.03820450086676725, + "grad_norm": 1.2421070337295532, + "learning_rate": 0.00011457496136012363, + "loss": 5.1758, + "step": 27190 + }, + { + "epoch": 0.03821855180493083, + "grad_norm": 1.2263178825378418, + "learning_rate": 0.00011461711395250806, + "loss": 5.1627, + "step": 27200 + }, + { + "epoch": 0.0382326027430944, + "grad_norm": 1.2281807661056519, + "learning_rate": 0.00011465926654489251, + "loss": 5.1769, + "step": 27210 + }, + { + "epoch": 0.03824665368125798, + "grad_norm": 1.2824562788009644, + "learning_rate": 0.00011470141913727694, + "loss": 5.0437, + "step": 27220 + }, + { + "epoch": 0.038260704619421555, + "grad_norm": 1.2487444877624512, + "learning_rate": 0.00011474357172966136, + "loss": 5.1845, + "step": 27230 + }, + { + "epoch": 0.03827475555758514, + "grad_norm": 1.223210096359253, + "learning_rate": 0.0001147857243220458, + "loss": 5.1532, + "step": 27240 + }, + { + "epoch": 0.038288806495748715, + "grad_norm": 1.2843011617660522, + "learning_rate": 0.00011482787691443023, + "loss": 5.2197, + "step": 27250 + }, + { + "epoch": 0.03830285743391229, + "grad_norm": 1.2639187574386597, + "learning_rate": 0.00011487002950681465, + "loss": 5.1132, + "step": 27260 + }, + { + "epoch": 0.03831690837207587, + "grad_norm": 1.2122303247451782, + "learning_rate": 0.0001149121820991991, + "loss": 5.255, + "step": 27270 + }, + { + "epoch": 0.03833095931023944, + "grad_norm": 1.2370988130569458, + "learning_rate": 0.00011495433469158353, + "loss": 5.1655, + "step": 27280 + }, + { + "epoch": 0.038345010248403026, + "grad_norm": 1.2647359371185303, + "learning_rate": 0.00011499648728396794, + "loss": 5.2058, + "step": 27290 + }, + { + "epoch": 0.0383590611865666, + "grad_norm": 1.2288408279418945, + "learning_rate": 
0.00011503863987635239, + "loss": 5.2224, + "step": 27300 + }, + { + "epoch": 0.03837311212473018, + "grad_norm": 1.3430143594741821, + "learning_rate": 0.00011508079246873682, + "loss": 5.1902, + "step": 27310 + }, + { + "epoch": 0.038387163062893755, + "grad_norm": 1.2298165559768677, + "learning_rate": 0.00011512294506112124, + "loss": 5.2141, + "step": 27320 + }, + { + "epoch": 0.03840121400105733, + "grad_norm": 1.2337199449539185, + "learning_rate": 0.00011516509765350568, + "loss": 5.1845, + "step": 27330 + }, + { + "epoch": 0.038415264939220914, + "grad_norm": 1.1939727067947388, + "learning_rate": 0.00011520725024589011, + "loss": 5.2187, + "step": 27340 + }, + { + "epoch": 0.03842931587738449, + "grad_norm": 1.3627872467041016, + "learning_rate": 0.00011524940283827455, + "loss": 5.0741, + "step": 27350 + }, + { + "epoch": 0.038443366815548066, + "grad_norm": 1.3677321672439575, + "learning_rate": 0.00011529155543065898, + "loss": 5.1668, + "step": 27360 + }, + { + "epoch": 0.03845741775371164, + "grad_norm": 1.2773394584655762, + "learning_rate": 0.00011533370802304341, + "loss": 5.1311, + "step": 27370 + }, + { + "epoch": 0.03847146869187522, + "grad_norm": 1.2625548839569092, + "learning_rate": 0.00011537586061542784, + "loss": 5.2052, + "step": 27380 + }, + { + "epoch": 0.038485519630038795, + "grad_norm": 1.227333426475525, + "learning_rate": 0.00011541801320781227, + "loss": 5.3206, + "step": 27390 + }, + { + "epoch": 0.03849957056820238, + "grad_norm": 1.214938759803772, + "learning_rate": 0.0001154601658001967, + "loss": 5.0788, + "step": 27400 + }, + { + "epoch": 0.038513621506365954, + "grad_norm": 1.2600798606872559, + "learning_rate": 0.00011550231839258113, + "loss": 5.2539, + "step": 27410 + }, + { + "epoch": 0.03852767244452953, + "grad_norm": 1.2314510345458984, + "learning_rate": 0.00011554447098496556, + "loss": 5.2616, + "step": 27420 + }, + { + "epoch": 0.038541723382693106, + "grad_norm": 1.2492339611053467, + "learning_rate": 
0.00011558662357735, + "loss": 5.1835, + "step": 27430 + }, + { + "epoch": 0.03855577432085668, + "grad_norm": 1.3038430213928223, + "learning_rate": 0.00011562877616973444, + "loss": 5.1959, + "step": 27440 + }, + { + "epoch": 0.038569825259020266, + "grad_norm": 1.2819589376449585, + "learning_rate": 0.00011567092876211886, + "loss": 5.2702, + "step": 27450 + }, + { + "epoch": 0.03858387619718384, + "grad_norm": 1.2621688842773438, + "learning_rate": 0.00011571308135450329, + "loss": 5.2331, + "step": 27460 + }, + { + "epoch": 0.03859792713534742, + "grad_norm": 1.2393717765808105, + "learning_rate": 0.00011575523394688773, + "loss": 5.2205, + "step": 27470 + }, + { + "epoch": 0.038611978073510994, + "grad_norm": 1.407251238822937, + "learning_rate": 0.00011579738653927216, + "loss": 5.0831, + "step": 27480 + }, + { + "epoch": 0.03862602901167457, + "grad_norm": 1.196433663368225, + "learning_rate": 0.00011583953913165658, + "loss": 5.2502, + "step": 27490 + }, + { + "epoch": 0.03864007994983815, + "grad_norm": 1.2640856504440308, + "learning_rate": 0.00011588169172404103, + "loss": 5.1203, + "step": 27500 + }, + { + "epoch": 0.03865413088800173, + "grad_norm": 1.2602075338363647, + "learning_rate": 0.00011592384431642546, + "loss": 5.1076, + "step": 27510 + }, + { + "epoch": 0.038668181826165306, + "grad_norm": 1.226286768913269, + "learning_rate": 0.00011596599690880988, + "loss": 5.1437, + "step": 27520 + }, + { + "epoch": 0.03868223276432888, + "grad_norm": 1.2031055688858032, + "learning_rate": 0.00011600814950119432, + "loss": 5.1661, + "step": 27530 + }, + { + "epoch": 0.03869628370249246, + "grad_norm": 1.2152131795883179, + "learning_rate": 0.00011605030209357875, + "loss": 5.143, + "step": 27540 + }, + { + "epoch": 0.03871033464065604, + "grad_norm": 1.2977949380874634, + "learning_rate": 0.00011609245468596317, + "loss": 5.2466, + "step": 27550 + }, + { + "epoch": 0.03872438557881962, + "grad_norm": 1.2086836099624634, + "learning_rate": 
0.00011613460727834761, + "loss": 5.2369, + "step": 27560 + }, + { + "epoch": 0.03873843651698319, + "grad_norm": 1.1972341537475586, + "learning_rate": 0.00011617675987073204, + "loss": 5.2176, + "step": 27570 + }, + { + "epoch": 0.03875248745514677, + "grad_norm": 1.3312931060791016, + "learning_rate": 0.00011621891246311646, + "loss": 4.9767, + "step": 27580 + }, + { + "epoch": 0.038766538393310346, + "grad_norm": 1.2363582849502563, + "learning_rate": 0.00011626106505550091, + "loss": 5.1467, + "step": 27590 + }, + { + "epoch": 0.03878058933147393, + "grad_norm": 1.2445141077041626, + "learning_rate": 0.00011630321764788534, + "loss": 4.9751, + "step": 27600 + }, + { + "epoch": 0.038794640269637505, + "grad_norm": 1.3403300046920776, + "learning_rate": 0.00011634537024026976, + "loss": 5.1634, + "step": 27610 + }, + { + "epoch": 0.03880869120780108, + "grad_norm": 1.2853689193725586, + "learning_rate": 0.0001163875228326542, + "loss": 5.2343, + "step": 27620 + }, + { + "epoch": 0.03882274214596466, + "grad_norm": 1.2790160179138184, + "learning_rate": 0.00011642967542503863, + "loss": 5.3136, + "step": 27630 + }, + { + "epoch": 0.038836793084128234, + "grad_norm": 1.3508522510528564, + "learning_rate": 0.00011647182801742306, + "loss": 5.0613, + "step": 27640 + }, + { + "epoch": 0.03885084402229182, + "grad_norm": 1.274295687675476, + "learning_rate": 0.0001165139806098075, + "loss": 5.1176, + "step": 27650 + }, + { + "epoch": 0.03886489496045539, + "grad_norm": 1.3735030889511108, + "learning_rate": 0.00011655613320219193, + "loss": 5.2095, + "step": 27660 + }, + { + "epoch": 0.03887894589861897, + "grad_norm": 1.2918239831924438, + "learning_rate": 0.00011659828579457636, + "loss": 5.2064, + "step": 27670 + }, + { + "epoch": 0.038892996836782545, + "grad_norm": 1.2781789302825928, + "learning_rate": 0.00011664043838696079, + "loss": 5.0818, + "step": 27680 + }, + { + "epoch": 0.03890704777494612, + "grad_norm": 1.2447351217269897, + "learning_rate": 
0.00011668259097934522, + "loss": 5.085, + "step": 27690 + }, + { + "epoch": 0.0389210987131097, + "grad_norm": 1.2342867851257324, + "learning_rate": 0.00011672474357172965, + "loss": 5.1657, + "step": 27700 + }, + { + "epoch": 0.03893514965127328, + "grad_norm": 1.2419430017471313, + "learning_rate": 0.00011676689616411408, + "loss": 5.1151, + "step": 27710 + }, + { + "epoch": 0.03894920058943686, + "grad_norm": 1.261952519416809, + "learning_rate": 0.00011680904875649851, + "loss": 4.978, + "step": 27720 + }, + { + "epoch": 0.03896325152760043, + "grad_norm": 1.2440228462219238, + "learning_rate": 0.00011685120134888294, + "loss": 5.079, + "step": 27730 + }, + { + "epoch": 0.03897730246576401, + "grad_norm": 1.2461313009262085, + "learning_rate": 0.00011689335394126737, + "loss": 5.074, + "step": 27740 + }, + { + "epoch": 0.038991353403927585, + "grad_norm": 1.3326444625854492, + "learning_rate": 0.0001169355065336518, + "loss": 5.1748, + "step": 27750 + }, + { + "epoch": 0.03900540434209117, + "grad_norm": 1.1846364736557007, + "learning_rate": 0.00011697765912603625, + "loss": 5.1285, + "step": 27760 + }, + { + "epoch": 0.039019455280254745, + "grad_norm": 1.3407005071640015, + "learning_rate": 0.00011701981171842068, + "loss": 5.1625, + "step": 27770 + }, + { + "epoch": 0.03903350621841832, + "grad_norm": 1.1774260997772217, + "learning_rate": 0.0001170619643108051, + "loss": 5.1255, + "step": 27780 + }, + { + "epoch": 0.0390475571565819, + "grad_norm": 1.2277191877365112, + "learning_rate": 0.00011710411690318954, + "loss": 5.0722, + "step": 27790 + }, + { + "epoch": 0.03906160809474547, + "grad_norm": 1.2346806526184082, + "learning_rate": 0.00011714626949557398, + "loss": 5.3759, + "step": 27800 + }, + { + "epoch": 0.039075659032909056, + "grad_norm": 1.2467949390411377, + "learning_rate": 0.00011718842208795839, + "loss": 5.0743, + "step": 27810 + }, + { + "epoch": 0.03908970997107263, + "grad_norm": 1.2584148645401, + "learning_rate": 
0.00011723057468034284, + "loss": 5.2156, + "step": 27820 + }, + { + "epoch": 0.03910376090923621, + "grad_norm": 1.2830735445022583, + "learning_rate": 0.00011727272727272727, + "loss": 5.1202, + "step": 27830 + }, + { + "epoch": 0.039117811847399785, + "grad_norm": 1.2006511688232422, + "learning_rate": 0.00011731487986511169, + "loss": 5.1867, + "step": 27840 + }, + { + "epoch": 0.03913186278556336, + "grad_norm": 1.2319871187210083, + "learning_rate": 0.00011735703245749613, + "loss": 5.1403, + "step": 27850 + }, + { + "epoch": 0.039145913723726944, + "grad_norm": 1.2313419580459595, + "learning_rate": 0.00011739918504988056, + "loss": 5.1664, + "step": 27860 + }, + { + "epoch": 0.03915996466189052, + "grad_norm": 1.2943781614303589, + "learning_rate": 0.00011744133764226498, + "loss": 5.0664, + "step": 27870 + }, + { + "epoch": 0.039174015600054096, + "grad_norm": 1.2293378114700317, + "learning_rate": 0.00011748349023464942, + "loss": 5.0583, + "step": 27880 + }, + { + "epoch": 0.03918806653821767, + "grad_norm": 1.1826175451278687, + "learning_rate": 0.00011752564282703386, + "loss": 5.1246, + "step": 27890 + }, + { + "epoch": 0.03920211747638125, + "grad_norm": 1.1859021186828613, + "learning_rate": 0.00011756779541941827, + "loss": 5.166, + "step": 27900 + }, + { + "epoch": 0.03921616841454483, + "grad_norm": 1.2054551839828491, + "learning_rate": 0.00011760994801180272, + "loss": 5.0331, + "step": 27910 + }, + { + "epoch": 0.03923021935270841, + "grad_norm": 1.2477535009384155, + "learning_rate": 0.0001176478853449487, + "loss": 5.1473, + "step": 27920 + }, + { + "epoch": 0.039244270290871984, + "grad_norm": 1.1738812923431396, + "learning_rate": 0.00011769003793733315, + "loss": 5.2345, + "step": 27930 + }, + { + "epoch": 0.03925832122903556, + "grad_norm": 1.2477853298187256, + "learning_rate": 0.00011773219052971756, + "loss": 5.1245, + "step": 27940 + }, + { + "epoch": 0.039272372167199136, + "grad_norm": 1.1837849617004395, + "learning_rate": 
0.000117774343122102, + "loss": 5.2802, + "step": 27950 + }, + { + "epoch": 0.03928642310536272, + "grad_norm": 1.2485941648483276, + "learning_rate": 0.00011781649571448644, + "loss": 5.1342, + "step": 27960 + }, + { + "epoch": 0.039300474043526296, + "grad_norm": 1.2612384557724, + "learning_rate": 0.00011785864830687086, + "loss": 5.2221, + "step": 27970 + }, + { + "epoch": 0.03931452498168987, + "grad_norm": 1.2098948955535889, + "learning_rate": 0.00011790080089925529, + "loss": 5.0834, + "step": 27980 + }, + { + "epoch": 0.03932857591985345, + "grad_norm": 1.2792184352874756, + "learning_rate": 0.00011794295349163973, + "loss": 5.1232, + "step": 27990 + }, + { + "epoch": 0.039342626858017024, + "grad_norm": 1.1822669506072998, + "learning_rate": 0.00011798510608402415, + "loss": 5.1502, + "step": 28000 + }, + { + "epoch": 0.03935667779618061, + "grad_norm": 1.237909197807312, + "learning_rate": 0.00011802725867640858, + "loss": 5.0886, + "step": 28010 + }, + { + "epoch": 0.03937072873434418, + "grad_norm": 1.4007855653762817, + "learning_rate": 0.00011806941126879303, + "loss": 5.1382, + "step": 28020 + }, + { + "epoch": 0.03938477967250776, + "grad_norm": 1.2554928064346313, + "learning_rate": 0.00011811156386117744, + "loss": 5.1784, + "step": 28030 + }, + { + "epoch": 0.039398830610671336, + "grad_norm": 1.2145435810089111, + "learning_rate": 0.00011815371645356187, + "loss": 5.2392, + "step": 28040 + }, + { + "epoch": 0.03941288154883491, + "grad_norm": 1.194926381111145, + "learning_rate": 0.00011819586904594632, + "loss": 5.1088, + "step": 28050 + }, + { + "epoch": 0.03942693248699849, + "grad_norm": 1.2417168617248535, + "learning_rate": 0.00011823802163833075, + "loss": 5.2147, + "step": 28060 + }, + { + "epoch": 0.03944098342516207, + "grad_norm": 1.171698808670044, + "learning_rate": 0.00011828017423071517, + "loss": 5.1953, + "step": 28070 + }, + { + "epoch": 0.03945503436332565, + "grad_norm": 1.2243210077285767, + "learning_rate": 
0.00011832232682309961, + "loss": 5.243, + "step": 28080 + }, + { + "epoch": 0.03946908530148922, + "grad_norm": 1.2159587144851685, + "learning_rate": 0.00011836447941548404, + "loss": 5.0568, + "step": 28090 + }, + { + "epoch": 0.0394831362396528, + "grad_norm": 1.2428697347640991, + "learning_rate": 0.00011840663200786846, + "loss": 5.0406, + "step": 28100 + }, + { + "epoch": 0.039497187177816376, + "grad_norm": 1.2648569345474243, + "learning_rate": 0.0001184487846002529, + "loss": 5.222, + "step": 28110 + }, + { + "epoch": 0.03951123811597996, + "grad_norm": 1.2005157470703125, + "learning_rate": 0.00011849093719263734, + "loss": 5.1258, + "step": 28120 + }, + { + "epoch": 0.039525289054143535, + "grad_norm": 1.2685343027114868, + "learning_rate": 0.00011853308978502177, + "loss": 5.064, + "step": 28130 + }, + { + "epoch": 0.03953933999230711, + "grad_norm": 1.2965656518936157, + "learning_rate": 0.0001185752423774062, + "loss": 5.0134, + "step": 28140 + }, + { + "epoch": 0.03955339093047069, + "grad_norm": 1.264532208442688, + "learning_rate": 0.00011861739496979063, + "loss": 5.2289, + "step": 28150 + }, + { + "epoch": 0.039567441868634264, + "grad_norm": 1.3054733276367188, + "learning_rate": 0.00011865954756217506, + "loss": 5.1761, + "step": 28160 + }, + { + "epoch": 0.03958149280679785, + "grad_norm": 1.1878447532653809, + "learning_rate": 0.0001187017001545595, + "loss": 5.1396, + "step": 28170 + }, + { + "epoch": 0.03959554374496142, + "grad_norm": 1.3018479347229004, + "learning_rate": 0.00011874385274694392, + "loss": 5.1258, + "step": 28180 + }, + { + "epoch": 0.039609594683125, + "grad_norm": 1.2443947792053223, + "learning_rate": 0.00011878600533932837, + "loss": 5.1624, + "step": 28190 + }, + { + "epoch": 0.039623645621288575, + "grad_norm": 1.3728779554367065, + "learning_rate": 0.00011882815793171279, + "loss": 5.2995, + "step": 28200 + }, + { + "epoch": 0.03963769655945215, + "grad_norm": 1.1758084297180176, + "learning_rate": 
0.00011887031052409722, + "loss": 5.1263, + "step": 28210 + }, + { + "epoch": 0.039651747497615734, + "grad_norm": 1.2095502614974976, + "learning_rate": 0.00011891246311648166, + "loss": 5.0737, + "step": 28220 + }, + { + "epoch": 0.03966579843577931, + "grad_norm": 1.2369705438613892, + "learning_rate": 0.00011895461570886608, + "loss": 5.2049, + "step": 28230 + }, + { + "epoch": 0.03967984937394289, + "grad_norm": 1.2536842823028564, + "learning_rate": 0.00011899676830125051, + "loss": 5.1596, + "step": 28240 + }, + { + "epoch": 0.03969390031210646, + "grad_norm": 1.2963687181472778, + "learning_rate": 0.00011903892089363496, + "loss": 5.0851, + "step": 28250 + }, + { + "epoch": 0.03970795125027004, + "grad_norm": 1.3036259412765503, + "learning_rate": 0.00011908107348601937, + "loss": 5.2885, + "step": 28260 + }, + { + "epoch": 0.03972200218843362, + "grad_norm": 1.2262283563613892, + "learning_rate": 0.0001191232260784038, + "loss": 5.2086, + "step": 28270 + }, + { + "epoch": 0.0397360531265972, + "grad_norm": 1.234108805656433, + "learning_rate": 0.00011916537867078825, + "loss": 5.185, + "step": 28280 + }, + { + "epoch": 0.039750104064760775, + "grad_norm": 1.2334399223327637, + "learning_rate": 0.00011920753126317267, + "loss": 5.1078, + "step": 28290 + }, + { + "epoch": 0.03976415500292435, + "grad_norm": 1.2662261724472046, + "learning_rate": 0.0001192496838555571, + "loss": 5.1132, + "step": 28300 + }, + { + "epoch": 0.03977820594108793, + "grad_norm": 1.2096099853515625, + "learning_rate": 0.00011929183644794154, + "loss": 5.2598, + "step": 28310 + }, + { + "epoch": 0.03979225687925151, + "grad_norm": 1.2106359004974365, + "learning_rate": 0.00011933398904032596, + "loss": 4.9739, + "step": 28320 + }, + { + "epoch": 0.039806307817415086, + "grad_norm": 1.268173336982727, + "learning_rate": 0.00011937614163271039, + "loss": 5.3448, + "step": 28330 + }, + { + "epoch": 0.03982035875557866, + "grad_norm": 1.2181192636489868, + "learning_rate": 
0.00011941829422509484, + "loss": 5.0814, + "step": 28340 + }, + { + "epoch": 0.03983440969374224, + "grad_norm": 1.2973514795303345, + "learning_rate": 0.00011946044681747927, + "loss": 4.9829, + "step": 28350 + }, + { + "epoch": 0.039848460631905815, + "grad_norm": 1.1853080987930298, + "learning_rate": 0.00011950259940986369, + "loss": 5.1086, + "step": 28360 + }, + { + "epoch": 0.03986251157006939, + "grad_norm": 1.2887145280838013, + "learning_rate": 0.00011954475200224813, + "loss": 5.2579, + "step": 28370 + }, + { + "epoch": 0.039876562508232974, + "grad_norm": 1.3244291543960571, + "learning_rate": 0.00011958690459463256, + "loss": 5.0825, + "step": 28380 + }, + { + "epoch": 0.03989061344639655, + "grad_norm": 1.2618260383605957, + "learning_rate": 0.00011962905718701698, + "loss": 5.0181, + "step": 28390 + }, + { + "epoch": 0.039904664384560126, + "grad_norm": 1.179956078529358, + "learning_rate": 0.00011967120977940142, + "loss": 5.2237, + "step": 28400 + }, + { + "epoch": 0.0399187153227237, + "grad_norm": 1.2036820650100708, + "learning_rate": 0.00011971336237178585, + "loss": 5.0647, + "step": 28410 + }, + { + "epoch": 0.03993276626088728, + "grad_norm": 1.2143844366073608, + "learning_rate": 0.00011975551496417029, + "loss": 5.0621, + "step": 28420 + }, + { + "epoch": 0.03994681719905086, + "grad_norm": 1.1590876579284668, + "learning_rate": 0.00011979766755655472, + "loss": 5.0342, + "step": 28430 + }, + { + "epoch": 0.03996086813721444, + "grad_norm": 1.2688127756118774, + "learning_rate": 0.00011983982014893915, + "loss": 5.2687, + "step": 28440 + }, + { + "epoch": 0.039974919075378014, + "grad_norm": 1.236639142036438, + "learning_rate": 0.00011988197274132358, + "loss": 5.0454, + "step": 28450 + }, + { + "epoch": 0.03998897001354159, + "grad_norm": 1.2120410203933716, + "learning_rate": 0.00011992412533370801, + "loss": 5.1101, + "step": 28460 + }, + { + "epoch": 0.040003020951705166, + "grad_norm": 1.2371752262115479, + "learning_rate": 
0.00011996627792609244, + "loss": 5.1434, + "step": 28470 + }, + { + "epoch": 0.04001707188986875, + "grad_norm": 1.2628980875015259, + "learning_rate": 0.00012000843051847689, + "loss": 5.058, + "step": 28480 + }, + { + "epoch": 0.040031122828032326, + "grad_norm": 1.2172374725341797, + "learning_rate": 0.0001200505831108613, + "loss": 5.1972, + "step": 28490 + }, + { + "epoch": 0.0400451737661959, + "grad_norm": 1.1844308376312256, + "learning_rate": 0.00012009273570324574, + "loss": 5.1624, + "step": 28500 + }, + { + "epoch": 0.04005922470435948, + "grad_norm": 1.2529436349868774, + "learning_rate": 0.00012013488829563018, + "loss": 5.0565, + "step": 28510 + }, + { + "epoch": 0.040073275642523054, + "grad_norm": 1.2237712144851685, + "learning_rate": 0.0001201770408880146, + "loss": 5.1909, + "step": 28520 + }, + { + "epoch": 0.04008732658068664, + "grad_norm": 1.2147319316864014, + "learning_rate": 0.00012021919348039903, + "loss": 5.0974, + "step": 28530 + }, + { + "epoch": 0.04010137751885021, + "grad_norm": 1.194334626197815, + "learning_rate": 0.00012026134607278347, + "loss": 5.2433, + "step": 28540 + }, + { + "epoch": 0.04011542845701379, + "grad_norm": 1.2386977672576904, + "learning_rate": 0.00012030349866516789, + "loss": 5.2399, + "step": 28550 + }, + { + "epoch": 0.040129479395177366, + "grad_norm": 1.2008346319198608, + "learning_rate": 0.00012034565125755232, + "loss": 5.0889, + "step": 28560 + }, + { + "epoch": 0.04014353033334094, + "grad_norm": 1.1994179487228394, + "learning_rate": 0.00012038780384993677, + "loss": 5.1955, + "step": 28570 + }, + { + "epoch": 0.040157581271504525, + "grad_norm": 1.251625895500183, + "learning_rate": 0.00012042995644232118, + "loss": 5.1563, + "step": 28580 + }, + { + "epoch": 0.0401716322096681, + "grad_norm": 1.3374724388122559, + "learning_rate": 0.00012047210903470562, + "loss": 5.2259, + "step": 28590 + }, + { + "epoch": 0.04018568314783168, + "grad_norm": 1.184990644454956, + "learning_rate": 
0.00012051426162709006, + "loss": 5.1774, + "step": 28600 + }, + { + "epoch": 0.04019973408599525, + "grad_norm": 1.2657346725463867, + "learning_rate": 0.00012055641421947449, + "loss": 5.0466, + "step": 28610 + }, + { + "epoch": 0.04021378502415883, + "grad_norm": 1.301020860671997, + "learning_rate": 0.00012059435155262047, + "loss": 5.1848, + "step": 28620 + }, + { + "epoch": 0.04022783596232241, + "grad_norm": 1.2240586280822754, + "learning_rate": 0.0001206365041450049, + "loss": 5.1032, + "step": 28630 + }, + { + "epoch": 0.04024188690048599, + "grad_norm": 1.2186791896820068, + "learning_rate": 0.00012067865673738935, + "loss": 5.1814, + "step": 28640 + }, + { + "epoch": 0.040255937838649565, + "grad_norm": 1.2839056253433228, + "learning_rate": 0.00012072080932977377, + "loss": 5.2019, + "step": 28650 + }, + { + "epoch": 0.04026998877681314, + "grad_norm": 1.2120087146759033, + "learning_rate": 0.0001207629619221582, + "loss": 5.1894, + "step": 28660 + }, + { + "epoch": 0.04028403971497672, + "grad_norm": 1.3091609477996826, + "learning_rate": 0.00012080511451454264, + "loss": 5.2507, + "step": 28670 + }, + { + "epoch": 0.040298090653140294, + "grad_norm": 1.285378098487854, + "learning_rate": 0.00012084726710692706, + "loss": 5.1372, + "step": 28680 + }, + { + "epoch": 0.04031214159130388, + "grad_norm": 1.1991231441497803, + "learning_rate": 0.00012088941969931149, + "loss": 5.1167, + "step": 28690 + }, + { + "epoch": 0.04032619252946745, + "grad_norm": 1.2777354717254639, + "learning_rate": 0.00012093157229169594, + "loss": 5.0535, + "step": 28700 + }, + { + "epoch": 0.04034024346763103, + "grad_norm": 1.2229773998260498, + "learning_rate": 0.00012097372488408035, + "loss": 5.1478, + "step": 28710 + }, + { + "epoch": 0.040354294405794605, + "grad_norm": 1.2146847248077393, + "learning_rate": 0.00012101587747646479, + "loss": 5.1944, + "step": 28720 + }, + { + "epoch": 0.04036834534395818, + "grad_norm": 1.1953580379486084, + "learning_rate": 
0.00012105803006884923, + "loss": 5.1611, + "step": 28730 + }, + { + "epoch": 0.040382396282121764, + "grad_norm": 1.2256001234054565, + "learning_rate": 0.00012110018266123366, + "loss": 5.316, + "step": 28740 + }, + { + "epoch": 0.04039644722028534, + "grad_norm": 1.2031221389770508, + "learning_rate": 0.00012114233525361808, + "loss": 5.0734, + "step": 28750 + }, + { + "epoch": 0.04041049815844892, + "grad_norm": 1.2694804668426514, + "learning_rate": 0.00012118448784600252, + "loss": 5.1379, + "step": 28760 + }, + { + "epoch": 0.04042454909661249, + "grad_norm": 1.2258470058441162, + "learning_rate": 0.00012122664043838696, + "loss": 5.0536, + "step": 28770 + }, + { + "epoch": 0.04043860003477607, + "grad_norm": 1.2044484615325928, + "learning_rate": 0.00012126879303077137, + "loss": 5.2435, + "step": 28780 + }, + { + "epoch": 0.04045265097293965, + "grad_norm": 1.27933931350708, + "learning_rate": 0.00012131094562315582, + "loss": 5.1742, + "step": 28790 + }, + { + "epoch": 0.04046670191110323, + "grad_norm": 1.2397048473358154, + "learning_rate": 0.00012135309821554025, + "loss": 5.1537, + "step": 28800 + }, + { + "epoch": 0.040480752849266805, + "grad_norm": 1.3826310634613037, + "learning_rate": 0.00012139525080792467, + "loss": 5.1767, + "step": 28810 + }, + { + "epoch": 0.04049480378743038, + "grad_norm": 1.2300126552581787, + "learning_rate": 0.00012143740340030911, + "loss": 5.0833, + "step": 28820 + }, + { + "epoch": 0.04050885472559396, + "grad_norm": 1.2584890127182007, + "learning_rate": 0.00012147955599269354, + "loss": 5.1247, + "step": 28830 + }, + { + "epoch": 0.04052290566375754, + "grad_norm": 1.433069109916687, + "learning_rate": 0.00012152170858507796, + "loss": 5.133, + "step": 28840 + }, + { + "epoch": 0.040536956601921116, + "grad_norm": 1.2440624237060547, + "learning_rate": 0.0001215638611774624, + "loss": 5.0757, + "step": 28850 + }, + { + "epoch": 0.04055100754008469, + "grad_norm": 1.216599702835083, + "learning_rate": 
0.00012160601376984684, + "loss": 4.9827, + "step": 28860 + }, + { + "epoch": 0.04056505847824827, + "grad_norm": 1.3541535139083862, + "learning_rate": 0.00012164816636223125, + "loss": 5.112, + "step": 28870 + }, + { + "epoch": 0.040579109416411845, + "grad_norm": 1.1753549575805664, + "learning_rate": 0.0001216903189546157, + "loss": 5.1629, + "step": 28880 + }, + { + "epoch": 0.04059316035457543, + "grad_norm": 1.2143678665161133, + "learning_rate": 0.00012173247154700013, + "loss": 5.1621, + "step": 28890 + }, + { + "epoch": 0.040607211292739004, + "grad_norm": 1.2164275646209717, + "learning_rate": 0.00012177462413938457, + "loss": 5.1983, + "step": 28900 + }, + { + "epoch": 0.04062126223090258, + "grad_norm": 1.676349401473999, + "learning_rate": 0.00012181677673176899, + "loss": 5.1482, + "step": 28910 + }, + { + "epoch": 0.040635313169066156, + "grad_norm": 1.188805341720581, + "learning_rate": 0.00012185892932415342, + "loss": 5.1028, + "step": 28920 + }, + { + "epoch": 0.04064936410722973, + "grad_norm": 1.304349660873413, + "learning_rate": 0.00012190108191653787, + "loss": 5.1201, + "step": 28930 + }, + { + "epoch": 0.040663415045393315, + "grad_norm": 1.256514549255371, + "learning_rate": 0.00012194323450892229, + "loss": 5.1092, + "step": 28940 + }, + { + "epoch": 0.04067746598355689, + "grad_norm": 1.1946409940719604, + "learning_rate": 0.00012198538710130672, + "loss": 5.2572, + "step": 28950 + }, + { + "epoch": 0.04069151692172047, + "grad_norm": 1.2821025848388672, + "learning_rate": 0.00012202753969369116, + "loss": 5.16, + "step": 28960 + }, + { + "epoch": 0.040705567859884044, + "grad_norm": 1.2164089679718018, + "learning_rate": 0.00012206969228607558, + "loss": 5.1757, + "step": 28970 + }, + { + "epoch": 0.04071961879804762, + "grad_norm": 1.3499404191970825, + "learning_rate": 0.00012211184487846002, + "loss": 5.1214, + "step": 28980 + }, + { + "epoch": 0.040733669736211196, + "grad_norm": 1.1832070350646973, + "learning_rate": 
0.00012215399747084445, + "loss": 5.1601, + "step": 28990 + }, + { + "epoch": 0.04074772067437478, + "grad_norm": 1.2199169397354126, + "learning_rate": 0.00012219615006322886, + "loss": 5.046, + "step": 29000 + }, + { + "epoch": 0.040761771612538356, + "grad_norm": 1.2331231832504272, + "learning_rate": 0.00012223830265561332, + "loss": 5.0809, + "step": 29010 + }, + { + "epoch": 0.04077582255070193, + "grad_norm": 1.2529569864273071, + "learning_rate": 0.00012228045524799775, + "loss": 5.0944, + "step": 29020 + }, + { + "epoch": 0.04078987348886551, + "grad_norm": 1.2748099565505981, + "learning_rate": 0.00012232260784038218, + "loss": 4.9984, + "step": 29030 + }, + { + "epoch": 0.040803924427029084, + "grad_norm": 1.151564121246338, + "learning_rate": 0.0001223647604327666, + "loss": 5.0823, + "step": 29040 + }, + { + "epoch": 0.04081797536519267, + "grad_norm": 1.1809443235397339, + "learning_rate": 0.00012240691302515104, + "loss": 5.1298, + "step": 29050 + }, + { + "epoch": 0.04083202630335624, + "grad_norm": 1.2853679656982422, + "learning_rate": 0.00012244906561753547, + "loss": 5.1489, + "step": 29060 + }, + { + "epoch": 0.04084607724151982, + "grad_norm": 1.2451461553573608, + "learning_rate": 0.0001224912182099199, + "loss": 5.2294, + "step": 29070 + }, + { + "epoch": 0.040860128179683396, + "grad_norm": 1.1953366994857788, + "learning_rate": 0.00012253337080230434, + "loss": 5.1022, + "step": 29080 + }, + { + "epoch": 0.04087417911784697, + "grad_norm": 1.1801906824111938, + "learning_rate": 0.00012257552339468877, + "loss": 5.0872, + "step": 29090 + }, + { + "epoch": 0.040888230056010555, + "grad_norm": 1.2454745769500732, + "learning_rate": 0.0001226176759870732, + "loss": 5.1487, + "step": 29100 + }, + { + "epoch": 0.04090228099417413, + "grad_norm": 1.5468188524246216, + "learning_rate": 0.00012265982857945763, + "loss": 5.094, + "step": 29110 + }, + { + "epoch": 0.04091633193233771, + "grad_norm": 1.2361538410186768, + "learning_rate": 
0.00012270198117184206, + "loss": 5.1832, + "step": 29120 + }, + { + "epoch": 0.04093038287050128, + "grad_norm": 1.2000207901000977, + "learning_rate": 0.0001227441337642265, + "loss": 5.1678, + "step": 29130 + }, + { + "epoch": 0.04094443380866486, + "grad_norm": 1.1986608505249023, + "learning_rate": 0.00012278628635661092, + "loss": 5.1264, + "step": 29140 + }, + { + "epoch": 0.04095848474682844, + "grad_norm": 1.2702739238739014, + "learning_rate": 0.00012282843894899535, + "loss": 5.1283, + "step": 29150 + }, + { + "epoch": 0.04097253568499202, + "grad_norm": 1.256484031677246, + "learning_rate": 0.00012287059154137978, + "loss": 5.1021, + "step": 29160 + }, + { + "epoch": 0.040986586623155595, + "grad_norm": 1.2504194974899292, + "learning_rate": 0.00012291274413376422, + "loss": 5.1583, + "step": 29170 + }, + { + "epoch": 0.04100063756131917, + "grad_norm": 1.193787693977356, + "learning_rate": 0.00012295489672614865, + "loss": 5.1885, + "step": 29180 + }, + { + "epoch": 0.04101468849948275, + "grad_norm": 1.2166396379470825, + "learning_rate": 0.00012299704931853308, + "loss": 4.9795, + "step": 29190 + }, + { + "epoch": 0.04102873943764633, + "grad_norm": 1.3542882204055786, + "learning_rate": 0.0001230392019109175, + "loss": 5.0044, + "step": 29200 + }, + { + "epoch": 0.04104279037580991, + "grad_norm": 1.3186988830566406, + "learning_rate": 0.00012308135450330194, + "loss": 5.0079, + "step": 29210 + }, + { + "epoch": 0.04105684131397348, + "grad_norm": 1.174622893333435, + "learning_rate": 0.00012312350709568637, + "loss": 5.1067, + "step": 29220 + }, + { + "epoch": 0.04107089225213706, + "grad_norm": 1.1636348962783813, + "learning_rate": 0.0001231656596880708, + "loss": 5.1078, + "step": 29230 + }, + { + "epoch": 0.041084943190300635, + "grad_norm": 1.1850835084915161, + "learning_rate": 0.00012320781228045523, + "loss": 5.1331, + "step": 29240 + }, + { + "epoch": 0.04109899412846422, + "grad_norm": 1.2667229175567627, + "learning_rate": 
0.00012324996487283966, + "loss": 5.0366, + "step": 29250 + }, + { + "epoch": 0.041113045066627794, + "grad_norm": 1.217487096786499, + "learning_rate": 0.0001232921174652241, + "loss": 5.1203, + "step": 29260 + }, + { + "epoch": 0.04112709600479137, + "grad_norm": 1.2223021984100342, + "learning_rate": 0.00012333427005760853, + "loss": 5.0865, + "step": 29270 + }, + { + "epoch": 0.04114114694295495, + "grad_norm": 1.1677926778793335, + "learning_rate": 0.00012337642264999296, + "loss": 5.1227, + "step": 29280 + }, + { + "epoch": 0.04115519788111852, + "grad_norm": 1.197870135307312, + "learning_rate": 0.0001234185752423774, + "loss": 5.1118, + "step": 29290 + }, + { + "epoch": 0.0411692488192821, + "grad_norm": 1.2114304304122925, + "learning_rate": 0.00012346072783476182, + "loss": 5.1161, + "step": 29300 + }, + { + "epoch": 0.04118329975744568, + "grad_norm": 1.266849160194397, + "learning_rate": 0.00012350288042714625, + "loss": 5.0399, + "step": 29310 + }, + { + "epoch": 0.04119735069560926, + "grad_norm": 1.3081083297729492, + "learning_rate": 0.0001235450330195307, + "loss": 5.1199, + "step": 29320 + }, + { + "epoch": 0.041211401633772834, + "grad_norm": 1.2739002704620361, + "learning_rate": 0.00012358718561191511, + "loss": 5.0112, + "step": 29330 + }, + { + "epoch": 0.04122545257193641, + "grad_norm": 1.150550127029419, + "learning_rate": 0.00012362933820429955, + "loss": 5.0035, + "step": 29340 + }, + { + "epoch": 0.04123950351009999, + "grad_norm": 1.1802303791046143, + "learning_rate": 0.000123671490796684, + "loss": 5.0593, + "step": 29350 + }, + { + "epoch": 0.04125355444826357, + "grad_norm": 1.209800124168396, + "learning_rate": 0.0001237136433890684, + "loss": 5.0386, + "step": 29360 + }, + { + "epoch": 0.041267605386427146, + "grad_norm": 1.1998001337051392, + "learning_rate": 0.00012375579598145284, + "loss": 5.1109, + "step": 29370 + }, + { + "epoch": 0.04128165632459072, + "grad_norm": 1.1864341497421265, + "learning_rate": 
0.0001237979485738373, + "loss": 5.1374, + "step": 29380 + }, + { + "epoch": 0.0412957072627543, + "grad_norm": 1.1645110845565796, + "learning_rate": 0.0001238401011662217, + "loss": 5.0924, + "step": 29390 + }, + { + "epoch": 0.041309758200917875, + "grad_norm": 1.193508505821228, + "learning_rate": 0.00012388225375860613, + "loss": 5.1911, + "step": 29400 + }, + { + "epoch": 0.04132380913908146, + "grad_norm": 1.2677333354949951, + "learning_rate": 0.0001239244063509906, + "loss": 5.1231, + "step": 29410 + }, + { + "epoch": 0.041337860077245034, + "grad_norm": 1.1700749397277832, + "learning_rate": 0.000123966558943375, + "loss": 5.1528, + "step": 29420 + }, + { + "epoch": 0.04135191101540861, + "grad_norm": 1.1878379583358765, + "learning_rate": 0.00012400871153575943, + "loss": 5.2438, + "step": 29430 + }, + { + "epoch": 0.041365961953572186, + "grad_norm": 1.2088021039962769, + "learning_rate": 0.00012405086412814388, + "loss": 5.0927, + "step": 29440 + }, + { + "epoch": 0.04138001289173576, + "grad_norm": 1.279140830039978, + "learning_rate": 0.00012409301672052832, + "loss": 5.0007, + "step": 29450 + }, + { + "epoch": 0.041394063829899345, + "grad_norm": 1.22353196144104, + "learning_rate": 0.00012413516931291272, + "loss": 5.1005, + "step": 29460 + }, + { + "epoch": 0.04140811476806292, + "grad_norm": 1.2469463348388672, + "learning_rate": 0.00012417732190529718, + "loss": 5.0887, + "step": 29470 + }, + { + "epoch": 0.0414221657062265, + "grad_norm": 1.2041430473327637, + "learning_rate": 0.0001242194744976816, + "loss": 5.1795, + "step": 29480 + }, + { + "epoch": 0.041436216644390074, + "grad_norm": 1.2046658992767334, + "learning_rate": 0.000124261627090066, + "loss": 5.1795, + "step": 29490 + }, + { + "epoch": 0.04145026758255365, + "grad_norm": 1.1940624713897705, + "learning_rate": 0.00012430377968245047, + "loss": 5.1144, + "step": 29500 + }, + { + "epoch": 0.04146431852071723, + "grad_norm": 1.2514973878860474, + "learning_rate": 
0.0001243459322748349, + "loss": 5.0636, + "step": 29510 + }, + { + "epoch": 0.04147836945888081, + "grad_norm": 1.166250228881836, + "learning_rate": 0.0001243880848672193, + "loss": 5.1468, + "step": 29520 + }, + { + "epoch": 0.041492420397044386, + "grad_norm": 1.201289176940918, + "learning_rate": 0.00012443023745960376, + "loss": 5.0477, + "step": 29530 + }, + { + "epoch": 0.04150647133520796, + "grad_norm": 1.2102603912353516, + "learning_rate": 0.0001244723900519882, + "loss": 4.9634, + "step": 29540 + }, + { + "epoch": 0.04152052227337154, + "grad_norm": 1.303805947303772, + "learning_rate": 0.0001245145426443726, + "loss": 5.0351, + "step": 29550 + }, + { + "epoch": 0.04153457321153512, + "grad_norm": 1.160770058631897, + "learning_rate": 0.00012455669523675706, + "loss": 5.224, + "step": 29560 + }, + { + "epoch": 0.0415486241496987, + "grad_norm": 1.1994746923446655, + "learning_rate": 0.0001245988478291415, + "loss": 5.1209, + "step": 29570 + }, + { + "epoch": 0.04156267508786227, + "grad_norm": 1.2357219457626343, + "learning_rate": 0.0001246410004215259, + "loss": 5.0569, + "step": 29580 + }, + { + "epoch": 0.04157672602602585, + "grad_norm": 1.1528397798538208, + "learning_rate": 0.00012468315301391035, + "loss": 5.1259, + "step": 29590 + }, + { + "epoch": 0.041590776964189426, + "grad_norm": 1.1492503881454468, + "learning_rate": 0.00012472530560629478, + "loss": 5.2507, + "step": 29600 + }, + { + "epoch": 0.041604827902353, + "grad_norm": 1.1577057838439941, + "learning_rate": 0.00012476745819867921, + "loss": 5.2534, + "step": 29610 + }, + { + "epoch": 0.041618878840516585, + "grad_norm": 1.2352550029754639, + "learning_rate": 0.00012480961079106365, + "loss": 5.0825, + "step": 29620 + }, + { + "epoch": 0.04163292977868016, + "grad_norm": 1.359626054763794, + "learning_rate": 0.00012485176338344808, + "loss": 5.0889, + "step": 29630 + }, + { + "epoch": 0.04164698071684374, + "grad_norm": 1.2673550844192505, + "learning_rate": 0.0001248939159758325, 
+ "loss": 5.0325, + "step": 29640 + }, + { + "epoch": 0.04166103165500731, + "grad_norm": 1.1786859035491943, + "learning_rate": 0.00012493606856821694, + "loss": 5.1071, + "step": 29650 + }, + { + "epoch": 0.04167508259317089, + "grad_norm": 1.209380030632019, + "learning_rate": 0.00012497822116060137, + "loss": 5.172, + "step": 29660 + }, + { + "epoch": 0.04168913353133447, + "grad_norm": 1.3903775215148926, + "learning_rate": 0.0001250203737529858, + "loss": 5.128, + "step": 29670 + }, + { + "epoch": 0.04170318446949805, + "grad_norm": 1.2083300352096558, + "learning_rate": 0.00012506252634537023, + "loss": 5.2088, + "step": 29680 + }, + { + "epoch": 0.041717235407661625, + "grad_norm": 1.3227828741073608, + "learning_rate": 0.00012510467893775466, + "loss": 5.0573, + "step": 29690 + }, + { + "epoch": 0.0417312863458252, + "grad_norm": 1.19940185546875, + "learning_rate": 0.0001251468315301391, + "loss": 5.0194, + "step": 29700 + }, + { + "epoch": 0.04174533728398878, + "grad_norm": 1.232577919960022, + "learning_rate": 0.00012518898412252353, + "loss": 5.1452, + "step": 29710 + }, + { + "epoch": 0.04175938822215236, + "grad_norm": 1.2753794193267822, + "learning_rate": 0.00012523113671490796, + "loss": 4.9559, + "step": 29720 + }, + { + "epoch": 0.04177343916031594, + "grad_norm": 1.3034454584121704, + "learning_rate": 0.0001252732893072924, + "loss": 5.1681, + "step": 29730 + }, + { + "epoch": 0.04178749009847951, + "grad_norm": 1.1941345930099487, + "learning_rate": 0.00012531544189967682, + "loss": 5.0454, + "step": 29740 + }, + { + "epoch": 0.04180154103664309, + "grad_norm": 1.2708008289337158, + "learning_rate": 0.00012535759449206125, + "loss": 5.0656, + "step": 29750 + }, + { + "epoch": 0.041815591974806665, + "grad_norm": 1.1832579374313354, + "learning_rate": 0.00012539974708444568, + "loss": 5.0031, + "step": 29760 + }, + { + "epoch": 0.04182964291297025, + "grad_norm": 1.1988276243209839, + "learning_rate": 0.0001254418996768301, + "loss": 5.0703, + 
"step": 29770 + }, + { + "epoch": 0.041843693851133824, + "grad_norm": 1.2514029741287231, + "learning_rate": 0.00012548405226921454, + "loss": 5.1405, + "step": 29780 + }, + { + "epoch": 0.0418577447892974, + "grad_norm": 1.1804388761520386, + "learning_rate": 0.00012552620486159898, + "loss": 5.0957, + "step": 29790 + }, + { + "epoch": 0.04187179572746098, + "grad_norm": 1.2735486030578613, + "learning_rate": 0.0001255683574539834, + "loss": 5.0302, + "step": 29800 + }, + { + "epoch": 0.04188584666562455, + "grad_norm": 1.2295114994049072, + "learning_rate": 0.00012561051004636784, + "loss": 5.2056, + "step": 29810 + }, + { + "epoch": 0.041899897603788136, + "grad_norm": 1.1553417444229126, + "learning_rate": 0.00012565266263875227, + "loss": 5.0882, + "step": 29820 + }, + { + "epoch": 0.04191394854195171, + "grad_norm": 1.304266333580017, + "learning_rate": 0.0001256948152311367, + "loss": 5.2603, + "step": 29830 + }, + { + "epoch": 0.04192799948011529, + "grad_norm": 1.1928671598434448, + "learning_rate": 0.00012573696782352113, + "loss": 5.1606, + "step": 29840 + }, + { + "epoch": 0.041942050418278864, + "grad_norm": 1.2966556549072266, + "learning_rate": 0.00012577912041590556, + "loss": 5.1442, + "step": 29850 + }, + { + "epoch": 0.04195610135644244, + "grad_norm": 1.3299918174743652, + "learning_rate": 0.00012582127300829, + "loss": 5.0088, + "step": 29860 + }, + { + "epoch": 0.041970152294606024, + "grad_norm": 1.2035354375839233, + "learning_rate": 0.00012586342560067442, + "loss": 5.0605, + "step": 29870 + }, + { + "epoch": 0.0419842032327696, + "grad_norm": 1.1587860584259033, + "learning_rate": 0.00012590557819305886, + "loss": 5.262, + "step": 29880 + }, + { + "epoch": 0.041998254170933176, + "grad_norm": 1.212730884552002, + "learning_rate": 0.0001259477307854433, + "loss": 5.1807, + "step": 29890 + }, + { + "epoch": 0.04201230510909675, + "grad_norm": 1.2755563259124756, + "learning_rate": 0.00012598988337782774, + "loss": 5.1815, + "step": 29900 + 
}, + { + "epoch": 0.04202635604726033, + "grad_norm": 1.2221256494522095, + "learning_rate": 0.00012603203597021215, + "loss": 5.1726, + "step": 29910 + }, + { + "epoch": 0.042040406985423905, + "grad_norm": 1.246488332748413, + "learning_rate": 0.00012607418856259658, + "loss": 5.1286, + "step": 29920 + }, + { + "epoch": 0.04205445792358749, + "grad_norm": 1.2140997648239136, + "learning_rate": 0.00012611634115498104, + "loss": 5.1014, + "step": 29930 + }, + { + "epoch": 0.042068508861751064, + "grad_norm": 1.2002910375595093, + "learning_rate": 0.00012615849374736544, + "loss": 5.1418, + "step": 29940 + }, + { + "epoch": 0.04208255979991464, + "grad_norm": 1.1797535419464111, + "learning_rate": 0.00012620064633974987, + "loss": 5.2805, + "step": 29950 + }, + { + "epoch": 0.042096610738078216, + "grad_norm": 1.2059688568115234, + "learning_rate": 0.00012624279893213433, + "loss": 5.1886, + "step": 29960 + }, + { + "epoch": 0.04211066167624179, + "grad_norm": 1.22100031375885, + "learning_rate": 0.00012628495152451874, + "loss": 5.1398, + "step": 29970 + }, + { + "epoch": 0.042124712614405375, + "grad_norm": 1.2340439558029175, + "learning_rate": 0.00012632710411690317, + "loss": 5.1082, + "step": 29980 + }, + { + "epoch": 0.04213876355256895, + "grad_norm": 1.218373417854309, + "learning_rate": 0.00012636925670928763, + "loss": 5.1635, + "step": 29990 + }, + { + "epoch": 0.04215281449073253, + "grad_norm": 1.2156459093093872, + "learning_rate": 0.00012641140930167203, + "loss": 5.1422, + "step": 30000 + }, + { + "epoch": 0.042166865428896104, + "grad_norm": 1.1739648580551147, + "learning_rate": 0.00012645356189405646, + "loss": 5.2125, + "step": 30010 + }, + { + "epoch": 0.04218091636705968, + "grad_norm": 1.2138736248016357, + "learning_rate": 0.00012649571448644092, + "loss": 5.1365, + "step": 30020 + }, + { + "epoch": 0.04219496730522326, + "grad_norm": 1.1886487007141113, + "learning_rate": 0.00012653786707882535, + "loss": 5.0892, + "step": 30030 + }, + { + 
"epoch": 0.04220901824338684, + "grad_norm": 1.1931877136230469, + "learning_rate": 0.00012658001967120975, + "loss": 5.091, + "step": 30040 + }, + { + "epoch": 0.042223069181550416, + "grad_norm": 1.1976298093795776, + "learning_rate": 0.0001266221722635942, + "loss": 5.243, + "step": 30050 + }, + { + "epoch": 0.04223712011971399, + "grad_norm": 1.1765670776367188, + "learning_rate": 0.00012666432485597864, + "loss": 5.1197, + "step": 30060 + }, + { + "epoch": 0.04225117105787757, + "grad_norm": 1.182282567024231, + "learning_rate": 0.00012670647744836305, + "loss": 5.0915, + "step": 30070 + }, + { + "epoch": 0.04226522199604115, + "grad_norm": 1.2112866640090942, + "learning_rate": 0.0001267486300407475, + "loss": 5.1765, + "step": 30080 + }, + { + "epoch": 0.04227927293420473, + "grad_norm": 1.2055011987686157, + "learning_rate": 0.00012679078263313194, + "loss": 5.0592, + "step": 30090 + }, + { + "epoch": 0.0422933238723683, + "grad_norm": 1.2951544523239136, + "learning_rate": 0.00012683293522551634, + "loss": 5.0923, + "step": 30100 + }, + { + "epoch": 0.04230737481053188, + "grad_norm": 1.1470328569412231, + "learning_rate": 0.0001268750878179008, + "loss": 5.103, + "step": 30110 + }, + { + "epoch": 0.042321425748695456, + "grad_norm": 1.1909065246582031, + "learning_rate": 0.00012691724041028523, + "loss": 5.2148, + "step": 30120 + }, + { + "epoch": 0.04233547668685904, + "grad_norm": 1.2222651243209839, + "learning_rate": 0.00012695939300266963, + "loss": 5.07, + "step": 30130 + }, + { + "epoch": 0.042349527625022615, + "grad_norm": 1.1813335418701172, + "learning_rate": 0.0001270015455950541, + "loss": 5.0987, + "step": 30140 + }, + { + "epoch": 0.04236357856318619, + "grad_norm": 1.1861828565597534, + "learning_rate": 0.00012704369818743852, + "loss": 5.1389, + "step": 30150 + }, + { + "epoch": 0.04237762950134977, + "grad_norm": 1.2178916931152344, + "learning_rate": 0.00012708585077982293, + "loss": 5.1295, + "step": 30160 + }, + { + "epoch": 
0.04239168043951334, + "grad_norm": 1.1351721286773682, + "learning_rate": 0.0001271280033722074, + "loss": 5.069, + "step": 30170 + }, + { + "epoch": 0.042405731377676927, + "grad_norm": 1.2155929803848267, + "learning_rate": 0.00012717015596459182, + "loss": 5.1474, + "step": 30180 + }, + { + "epoch": 0.0424197823158405, + "grad_norm": 1.1644866466522217, + "learning_rate": 0.00012721230855697625, + "loss": 5.1563, + "step": 30190 + }, + { + "epoch": 0.04243383325400408, + "grad_norm": 1.1552120447158813, + "learning_rate": 0.00012725446114936068, + "loss": 5.2732, + "step": 30200 + }, + { + "epoch": 0.042447884192167655, + "grad_norm": 1.1935938596725464, + "learning_rate": 0.0001272966137417451, + "loss": 5.0557, + "step": 30210 + }, + { + "epoch": 0.04246193513033123, + "grad_norm": 1.207800030708313, + "learning_rate": 0.00012733876633412954, + "loss": 5.0201, + "step": 30220 + }, + { + "epoch": 0.04247598606849481, + "grad_norm": 1.2109053134918213, + "learning_rate": 0.00012738091892651397, + "loss": 5.0742, + "step": 30230 + }, + { + "epoch": 0.04249003700665839, + "grad_norm": 1.20108962059021, + "learning_rate": 0.0001274230715188984, + "loss": 5.1348, + "step": 30240 + }, + { + "epoch": 0.04250408794482197, + "grad_norm": 1.1453440189361572, + "learning_rate": 0.00012746522411128284, + "loss": 5.1966, + "step": 30250 + }, + { + "epoch": 0.04251813888298554, + "grad_norm": 1.1746257543563843, + "learning_rate": 0.00012750737670366727, + "loss": 5.0678, + "step": 30260 + }, + { + "epoch": 0.04253218982114912, + "grad_norm": 1.5763996839523315, + "learning_rate": 0.0001275495292960517, + "loss": 5.0624, + "step": 30270 + }, + { + "epoch": 0.042546240759312695, + "grad_norm": 1.2596639394760132, + "learning_rate": 0.00012759168188843613, + "loss": 5.0564, + "step": 30280 + }, + { + "epoch": 0.04256029169747628, + "grad_norm": 1.1996608972549438, + "learning_rate": 0.00012763383448082056, + "loss": 4.9513, + "step": 30290 + }, + { + "epoch": 
0.042574342635639854, + "grad_norm": 1.2479219436645508, + "learning_rate": 0.000127675987073205, + "loss": 5.0576, + "step": 30300 + }, + { + "epoch": 0.04258839357380343, + "grad_norm": 1.2073161602020264, + "learning_rate": 0.00012771813966558942, + "loss": 5.1667, + "step": 30310 + }, + { + "epoch": 0.04260244451196701, + "grad_norm": 1.2177608013153076, + "learning_rate": 0.00012776029225797385, + "loss": 4.9945, + "step": 30320 + }, + { + "epoch": 0.04261649545013058, + "grad_norm": 1.213683843612671, + "learning_rate": 0.00012780244485035829, + "loss": 5.1458, + "step": 30330 + }, + { + "epoch": 0.042630546388294166, + "grad_norm": 1.2280396223068237, + "learning_rate": 0.00012784459744274272, + "loss": 5.0936, + "step": 30340 + }, + { + "epoch": 0.04264459732645774, + "grad_norm": 1.2426551580429077, + "learning_rate": 0.00012788675003512715, + "loss": 5.0878, + "step": 30350 + }, + { + "epoch": 0.04265864826462132, + "grad_norm": 1.2090293169021606, + "learning_rate": 0.00012792890262751158, + "loss": 5.1355, + "step": 30360 + }, + { + "epoch": 0.042672699202784894, + "grad_norm": 1.1868354082107544, + "learning_rate": 0.000127971055219896, + "loss": 5.0465, + "step": 30370 + }, + { + "epoch": 0.04268675014094847, + "grad_norm": 1.1995588541030884, + "learning_rate": 0.00012801320781228044, + "loss": 5.1283, + "step": 30380 + }, + { + "epoch": 0.042700801079112054, + "grad_norm": 1.3185629844665527, + "learning_rate": 0.00012805536040466487, + "loss": 5.1045, + "step": 30390 + }, + { + "epoch": 0.04271485201727563, + "grad_norm": 1.1966207027435303, + "learning_rate": 0.0001280975129970493, + "loss": 4.9284, + "step": 30400 + }, + { + "epoch": 0.042728902955439206, + "grad_norm": 1.1545592546463013, + "learning_rate": 0.00012813966558943373, + "loss": 5.1675, + "step": 30410 + }, + { + "epoch": 0.04274295389360278, + "grad_norm": 1.1954106092453003, + "learning_rate": 0.00012818181818181817, + "loss": 5.1079, + "step": 30420 + }, + { + "epoch": 
0.04275700483176636, + "grad_norm": 1.1693055629730225, + "learning_rate": 0.0001282239707742026, + "loss": 5.2517, + "step": 30430 + }, + { + "epoch": 0.04277105576992994, + "grad_norm": 1.2295222282409668, + "learning_rate": 0.00012826612336658703, + "loss": 5.117, + "step": 30440 + }, + { + "epoch": 0.04278510670809352, + "grad_norm": 1.2061803340911865, + "learning_rate": 0.00012830827595897149, + "loss": 5.0885, + "step": 30450 + }, + { + "epoch": 0.042799157646257094, + "grad_norm": 1.1537740230560303, + "learning_rate": 0.0001283504285513559, + "loss": 5.1159, + "step": 30460 + }, + { + "epoch": 0.04281320858442067, + "grad_norm": 1.150728464126587, + "learning_rate": 0.00012839258114374032, + "loss": 5.2855, + "step": 30470 + }, + { + "epoch": 0.042827259522584246, + "grad_norm": 1.178875207901001, + "learning_rate": 0.00012843473373612478, + "loss": 5.2431, + "step": 30480 + }, + { + "epoch": 0.04284131046074783, + "grad_norm": 1.1979405879974365, + "learning_rate": 0.00012847688632850918, + "loss": 5.0651, + "step": 30490 + }, + { + "epoch": 0.042855361398911405, + "grad_norm": 1.142595648765564, + "learning_rate": 0.00012851903892089361, + "loss": 5.0605, + "step": 30500 + }, + { + "epoch": 0.04286941233707498, + "grad_norm": 1.142235279083252, + "learning_rate": 0.00012856119151327807, + "loss": 5.1623, + "step": 30510 + }, + { + "epoch": 0.04288346327523856, + "grad_norm": 1.2095253467559814, + "learning_rate": 0.00012860334410566248, + "loss": 5.0609, + "step": 30520 + }, + { + "epoch": 0.042897514213402134, + "grad_norm": 1.1336437463760376, + "learning_rate": 0.0001286454966980469, + "loss": 5.0318, + "step": 30530 + }, + { + "epoch": 0.04291156515156572, + "grad_norm": 1.2415790557861328, + "learning_rate": 0.00012868764929043137, + "loss": 5.2027, + "step": 30540 + }, + { + "epoch": 0.04292561608972929, + "grad_norm": 1.2634472846984863, + "learning_rate": 0.00012872980188281577, + "loss": 5.1282, + "step": 30550 + }, + { + "epoch": 
0.04293966702789287, + "grad_norm": 1.198366403579712, + "learning_rate": 0.0001287719544752002, + "loss": 5.1052, + "step": 30560 + }, + { + "epoch": 0.042953717966056446, + "grad_norm": 1.2169760465621948, + "learning_rate": 0.00012881410706758466, + "loss": 5.0278, + "step": 30570 + }, + { + "epoch": 0.04296776890422002, + "grad_norm": 1.2113124132156372, + "learning_rate": 0.00012885625965996906, + "loss": 5.0952, + "step": 30580 + }, + { + "epoch": 0.0429818198423836, + "grad_norm": 1.247658371925354, + "learning_rate": 0.0001288984122523535, + "loss": 5.1624, + "step": 30590 + }, + { + "epoch": 0.04299587078054718, + "grad_norm": 1.3335022926330566, + "learning_rate": 0.00012894056484473795, + "loss": 5.016, + "step": 30600 + }, + { + "epoch": 0.04300992171871076, + "grad_norm": 1.1578710079193115, + "learning_rate": 0.00012898271743712238, + "loss": 5.2555, + "step": 30610 + }, + { + "epoch": 0.04302397265687433, + "grad_norm": 1.1831419467926025, + "learning_rate": 0.0001290248700295068, + "loss": 5.2272, + "step": 30620 + }, + { + "epoch": 0.04303802359503791, + "grad_norm": 1.3762239217758179, + "learning_rate": 0.00012906702262189125, + "loss": 5.1915, + "step": 30630 + }, + { + "epoch": 0.043052074533201486, + "grad_norm": 1.204743504524231, + "learning_rate": 0.00012910917521427568, + "loss": 5.1658, + "step": 30640 + }, + { + "epoch": 0.04306612547136507, + "grad_norm": 1.2079994678497314, + "learning_rate": 0.00012915132780666008, + "loss": 5.0166, + "step": 30650 + }, + { + "epoch": 0.043080176409528645, + "grad_norm": 1.2461872100830078, + "learning_rate": 0.00012919348039904454, + "loss": 5.0831, + "step": 30660 + }, + { + "epoch": 0.04309422734769222, + "grad_norm": 1.1971070766448975, + "learning_rate": 0.00012923563299142897, + "loss": 5.1336, + "step": 30670 + }, + { + "epoch": 0.0431082782858558, + "grad_norm": 1.1303532123565674, + "learning_rate": 0.00012927778558381338, + "loss": 5.0732, + "step": 30680 + }, + { + "epoch": 
0.04312232922401937, + "grad_norm": 1.1667498350143433, + "learning_rate": 0.00012931993817619783, + "loss": 5.2159, + "step": 30690 + }, + { + "epoch": 0.043136380162182957, + "grad_norm": 1.159986972808838, + "learning_rate": 0.00012936209076858227, + "loss": 5.1623, + "step": 30700 + }, + { + "epoch": 0.04315043110034653, + "grad_norm": 1.1590168476104736, + "learning_rate": 0.00012940424336096667, + "loss": 5.1168, + "step": 30710 + }, + { + "epoch": 0.04316448203851011, + "grad_norm": 1.156524658203125, + "learning_rate": 0.00012944639595335113, + "loss": 5.0177, + "step": 30720 + }, + { + "epoch": 0.043178532976673685, + "grad_norm": 1.2327642440795898, + "learning_rate": 0.00012948854854573556, + "loss": 5.1663, + "step": 30730 + }, + { + "epoch": 0.04319258391483726, + "grad_norm": 1.215132236480713, + "learning_rate": 0.00012953070113812, + "loss": 5.1787, + "step": 30740 + }, + { + "epoch": 0.043206634853000844, + "grad_norm": 1.198944091796875, + "learning_rate": 0.00012957285373050442, + "loss": 5.1268, + "step": 30750 + }, + { + "epoch": 0.04322068579116442, + "grad_norm": 1.1945573091506958, + "learning_rate": 0.00012961500632288885, + "loss": 5.1569, + "step": 30760 + }, + { + "epoch": 0.043234736729328, + "grad_norm": 1.1851938962936401, + "learning_rate": 0.00012965715891527328, + "loss": 5.0766, + "step": 30770 + }, + { + "epoch": 0.04324878766749157, + "grad_norm": 1.2323837280273438, + "learning_rate": 0.00012969931150765771, + "loss": 5.0651, + "step": 30780 + }, + { + "epoch": 0.04326283860565515, + "grad_norm": 1.1993993520736694, + "learning_rate": 0.00012974146410004215, + "loss": 4.9883, + "step": 30790 + }, + { + "epoch": 0.04327688954381873, + "grad_norm": 1.2613015174865723, + "learning_rate": 0.00012978361669242658, + "loss": 5.0884, + "step": 30800 + }, + { + "epoch": 0.04329094048198231, + "grad_norm": 1.2269909381866455, + "learning_rate": 0.000129825769284811, + "loss": 5.1133, + "step": 30810 + }, + { + "epoch": 
0.043304991420145884, + "grad_norm": 1.2280735969543457, + "learning_rate": 0.00012986792187719544, + "loss": 4.8907, + "step": 30820 + }, + { + "epoch": 0.04331904235830946, + "grad_norm": 1.1935653686523438, + "learning_rate": 0.00012991007446957987, + "loss": 5.042, + "step": 30830 + }, + { + "epoch": 0.04333309329647304, + "grad_norm": 1.1728026866912842, + "learning_rate": 0.0001299522270619643, + "loss": 5.0976, + "step": 30840 + }, + { + "epoch": 0.04334714423463662, + "grad_norm": 1.1702702045440674, + "learning_rate": 0.00012999437965434873, + "loss": 5.0627, + "step": 30850 + }, + { + "epoch": 0.043361195172800196, + "grad_norm": 1.144583821296692, + "learning_rate": 0.00013003653224673316, + "loss": 4.9926, + "step": 30860 + }, + { + "epoch": 0.04337524611096377, + "grad_norm": 1.1992604732513428, + "learning_rate": 0.0001300786848391176, + "loss": 5.1402, + "step": 30870 + }, + { + "epoch": 0.04338929704912735, + "grad_norm": 1.2514524459838867, + "learning_rate": 0.00013012083743150203, + "loss": 5.2172, + "step": 30880 + }, + { + "epoch": 0.043403347987290924, + "grad_norm": 1.1470156908035278, + "learning_rate": 0.00013016299002388646, + "loss": 5.0893, + "step": 30890 + }, + { + "epoch": 0.0434173989254545, + "grad_norm": 1.1299411058425903, + "learning_rate": 0.0001302051426162709, + "loss": 5.1019, + "step": 30900 + }, + { + "epoch": 0.043431449863618084, + "grad_norm": 1.2057139873504639, + "learning_rate": 0.00013024729520865532, + "loss": 5.0939, + "step": 30910 + }, + { + "epoch": 0.04344550080178166, + "grad_norm": 1.108620285987854, + "learning_rate": 0.00013028944780103975, + "loss": 5.1344, + "step": 30920 + }, + { + "epoch": 0.043459551739945236, + "grad_norm": 1.22635817527771, + "learning_rate": 0.00013033160039342418, + "loss": 5.0597, + "step": 30930 + }, + { + "epoch": 0.04347360267810881, + "grad_norm": 1.2512584924697876, + "learning_rate": 0.0001303737529858086, + "loss": 4.9751, + "step": 30940 + }, + { + "epoch": 
0.04348765361627239, + "grad_norm": 1.198045253753662, + "learning_rate": 0.00013041590557819304, + "loss": 5.0859, + "step": 30950 + }, + { + "epoch": 0.04350170455443597, + "grad_norm": 1.21504545211792, + "learning_rate": 0.00013045805817057748, + "loss": 5.1187, + "step": 30960 + }, + { + "epoch": 0.04351575549259955, + "grad_norm": 1.1842291355133057, + "learning_rate": 0.0001305002107629619, + "loss": 5.0685, + "step": 30970 + }, + { + "epoch": 0.043529806430763124, + "grad_norm": 1.1496257781982422, + "learning_rate": 0.00013054236335534634, + "loss": 5.1684, + "step": 30980 + }, + { + "epoch": 0.0435438573689267, + "grad_norm": 1.2007911205291748, + "learning_rate": 0.00013058451594773077, + "loss": 5.1206, + "step": 30990 + }, + { + "epoch": 0.043557908307090276, + "grad_norm": 1.1559113264083862, + "learning_rate": 0.0001306266685401152, + "loss": 5.1135, + "step": 31000 + }, + { + "epoch": 0.04357195924525386, + "grad_norm": 1.2255867719650269, + "learning_rate": 0.00013066882113249963, + "loss": 5.0647, + "step": 31010 + }, + { + "epoch": 0.043586010183417435, + "grad_norm": 1.1707844734191895, + "learning_rate": 0.00013071097372488406, + "loss": 5.0317, + "step": 31020 + }, + { + "epoch": 0.04360006112158101, + "grad_norm": 1.1722240447998047, + "learning_rate": 0.00013075312631726852, + "loss": 4.9861, + "step": 31030 + }, + { + "epoch": 0.04361411205974459, + "grad_norm": 1.164713740348816, + "learning_rate": 0.00013079527890965293, + "loss": 5.2333, + "step": 31040 + }, + { + "epoch": 0.043628162997908164, + "grad_norm": 1.2219550609588623, + "learning_rate": 0.00013083743150203736, + "loss": 5.118, + "step": 31050 + }, + { + "epoch": 0.04364221393607175, + "grad_norm": 1.173423171043396, + "learning_rate": 0.00013087958409442181, + "loss": 5.1398, + "step": 31060 + }, + { + "epoch": 0.04365626487423532, + "grad_norm": 1.227458119392395, + "learning_rate": 0.00013092173668680622, + "loss": 5.0945, + "step": 31070 + }, + { + "epoch": 
0.0436703158123989, + "grad_norm": 1.2771947383880615, + "learning_rate": 0.00013096388927919065, + "loss": 5.0949, + "step": 31080 + }, + { + "epoch": 0.043684366750562476, + "grad_norm": 1.218270182609558, + "learning_rate": 0.0001310060418715751, + "loss": 5.1639, + "step": 31090 + }, + { + "epoch": 0.04369841768872605, + "grad_norm": 1.195109248161316, + "learning_rate": 0.0001310481944639595, + "loss": 5.1404, + "step": 31100 + }, + { + "epoch": 0.043712468626889635, + "grad_norm": 1.2469673156738281, + "learning_rate": 0.00013109034705634394, + "loss": 5.086, + "step": 31110 + }, + { + "epoch": 0.04372651956505321, + "grad_norm": 1.1276779174804688, + "learning_rate": 0.0001311324996487284, + "loss": 5.0138, + "step": 31120 + }, + { + "epoch": 0.04374057050321679, + "grad_norm": 1.1672612428665161, + "learning_rate": 0.0001311746522411128, + "loss": 5.0668, + "step": 31130 + }, + { + "epoch": 0.04375462144138036, + "grad_norm": 1.2316880226135254, + "learning_rate": 0.00013121680483349724, + "loss": 5.0723, + "step": 31140 + }, + { + "epoch": 0.04376867237954394, + "grad_norm": 1.3245317935943604, + "learning_rate": 0.0001312589574258817, + "loss": 5.0082, + "step": 31150 + }, + { + "epoch": 0.04378272331770752, + "grad_norm": 1.2602311372756958, + "learning_rate": 0.0001313011100182661, + "loss": 4.9338, + "step": 31160 + }, + { + "epoch": 0.0437967742558711, + "grad_norm": 1.1890308856964111, + "learning_rate": 0.00013134326261065053, + "loss": 5.1929, + "step": 31170 + }, + { + "epoch": 0.043810825194034675, + "grad_norm": 1.2277629375457764, + "learning_rate": 0.000131385415203035, + "loss": 4.9834, + "step": 31180 + }, + { + "epoch": 0.04382487613219825, + "grad_norm": 1.142812728881836, + "learning_rate": 0.00013142756779541942, + "loss": 5.1015, + "step": 31190 + }, + { + "epoch": 0.04383892707036183, + "grad_norm": 1.1541565656661987, + "learning_rate": 0.00013146972038780382, + "loss": 5.0397, + "step": 31200 + }, + { + "epoch": 0.0438529780085254, + 
"grad_norm": 1.1937357187271118, + "learning_rate": 0.00013151187298018828, + "loss": 5.0137, + "step": 31210 + }, + { + "epoch": 0.043867028946688987, + "grad_norm": 1.1668943166732788, + "learning_rate": 0.0001315540255725727, + "loss": 5.1796, + "step": 31220 + }, + { + "epoch": 0.04388107988485256, + "grad_norm": 1.1891443729400635, + "learning_rate": 0.00013159617816495712, + "loss": 5.059, + "step": 31230 + }, + { + "epoch": 0.04389513082301614, + "grad_norm": 1.1822489500045776, + "learning_rate": 0.00013163833075734158, + "loss": 5.1285, + "step": 31240 + }, + { + "epoch": 0.043909181761179715, + "grad_norm": 1.1516724824905396, + "learning_rate": 0.000131680483349726, + "loss": 5.1245, + "step": 31250 + }, + { + "epoch": 0.04392323269934329, + "grad_norm": 1.2096267938613892, + "learning_rate": 0.0001317226359421104, + "loss": 5.1178, + "step": 31260 + }, + { + "epoch": 0.043937283637506874, + "grad_norm": 1.2081648111343384, + "learning_rate": 0.00013176478853449487, + "loss": 5.1561, + "step": 31270 + }, + { + "epoch": 0.04395133457567045, + "grad_norm": 1.1660428047180176, + "learning_rate": 0.0001318069411268793, + "loss": 5.2086, + "step": 31280 + }, + { + "epoch": 0.04396538551383403, + "grad_norm": 1.153578519821167, + "learning_rate": 0.0001318490937192637, + "loss": 5.156, + "step": 31290 + }, + { + "epoch": 0.0439794364519976, + "grad_norm": 1.2928626537322998, + "learning_rate": 0.00013189124631164816, + "loss": 5.0846, + "step": 31300 + }, + { + "epoch": 0.04399348739016118, + "grad_norm": 1.1599982976913452, + "learning_rate": 0.0001319333989040326, + "loss": 5.1444, + "step": 31310 + }, + { + "epoch": 0.04400753832832476, + "grad_norm": 1.1558305025100708, + "learning_rate": 0.00013197555149641702, + "loss": 5.0282, + "step": 31320 + }, + { + "epoch": 0.04402158926648834, + "grad_norm": 1.168467402458191, + "learning_rate": 0.00013201770408880146, + "loss": 4.9011, + "step": 31330 + }, + { + "epoch": 0.044035640204651914, + "grad_norm": 
1.1910271644592285, + "learning_rate": 0.0001320598566811859, + "loss": 5.0972, + "step": 31340 + }, + { + "epoch": 0.04404969114281549, + "grad_norm": 1.1661889553070068, + "learning_rate": 0.00013210200927357032, + "loss": 5.0018, + "step": 31350 + }, + { + "epoch": 0.04406374208097907, + "grad_norm": 1.1902189254760742, + "learning_rate": 0.00013214416186595475, + "loss": 5.1285, + "step": 31360 + }, + { + "epoch": 0.04407779301914265, + "grad_norm": 1.191560983657837, + "learning_rate": 0.00013218631445833918, + "loss": 5.0772, + "step": 31370 + }, + { + "epoch": 0.044091843957306226, + "grad_norm": 1.108267903327942, + "learning_rate": 0.0001322284670507236, + "loss": 5.1525, + "step": 31380 + }, + { + "epoch": 0.0441058948954698, + "grad_norm": 1.2221838235855103, + "learning_rate": 0.00013227061964310804, + "loss": 5.2337, + "step": 31390 + }, + { + "epoch": 0.04411994583363338, + "grad_norm": 1.19728684425354, + "learning_rate": 0.00013231277223549247, + "loss": 5.1011, + "step": 31400 + }, + { + "epoch": 0.044133996771796954, + "grad_norm": 1.1746065616607666, + "learning_rate": 0.0001323549248278769, + "loss": 5.1121, + "step": 31410 + }, + { + "epoch": 0.04414804770996054, + "grad_norm": 1.2161784172058105, + "learning_rate": 0.00013239707742026134, + "loss": 5.1285, + "step": 31420 + }, + { + "epoch": 0.044162098648124114, + "grad_norm": 1.2407193183898926, + "learning_rate": 0.00013243923001264577, + "loss": 5.0633, + "step": 31430 + }, + { + "epoch": 0.04417614958628769, + "grad_norm": 1.1342822313308716, + "learning_rate": 0.0001324813826050302, + "loss": 5.0568, + "step": 31440 + }, + { + "epoch": 0.044190200524451266, + "grad_norm": 1.2115027904510498, + "learning_rate": 0.00013252353519741463, + "loss": 4.9452, + "step": 31450 + }, + { + "epoch": 0.04420425146261484, + "grad_norm": 1.20106840133667, + "learning_rate": 0.00013256568778979906, + "loss": 5.0924, + "step": 31460 + }, + { + "epoch": 0.044218302400778425, + "grad_norm": 
1.1099613904953003, + "learning_rate": 0.0001326078403821835, + "loss": 5.07, + "step": 31470 + }, + { + "epoch": 0.044232353338942, + "grad_norm": 1.1473101377487183, + "learning_rate": 0.00013264999297456792, + "loss": 5.069, + "step": 31480 + }, + { + "epoch": 0.04424640427710558, + "grad_norm": 1.1742898225784302, + "learning_rate": 0.00013269214556695235, + "loss": 5.1199, + "step": 31490 + }, + { + "epoch": 0.044260455215269154, + "grad_norm": 1.1407841444015503, + "learning_rate": 0.00013273429815933679, + "loss": 5.0929, + "step": 31500 + }, + { + "epoch": 0.04427450615343273, + "grad_norm": 1.17631196975708, + "learning_rate": 0.00013277645075172122, + "loss": 4.9963, + "step": 31510 + }, + { + "epoch": 0.044288557091596306, + "grad_norm": 1.1480594873428345, + "learning_rate": 0.00013281860334410565, + "loss": 5.11, + "step": 31520 + }, + { + "epoch": 0.04430260802975989, + "grad_norm": 1.2020182609558105, + "learning_rate": 0.00013286075593649008, + "loss": 5.0679, + "step": 31530 + }, + { + "epoch": 0.044316658967923465, + "grad_norm": 1.230470061302185, + "learning_rate": 0.0001329029085288745, + "loss": 4.9922, + "step": 31540 + }, + { + "epoch": 0.04433070990608704, + "grad_norm": 1.2092961072921753, + "learning_rate": 0.00013294506112125894, + "loss": 5.0695, + "step": 31550 + }, + { + "epoch": 0.04434476084425062, + "grad_norm": 1.2055702209472656, + "learning_rate": 0.00013298721371364337, + "loss": 5.1677, + "step": 31560 + }, + { + "epoch": 0.044358811782414194, + "grad_norm": 1.1955968141555786, + "learning_rate": 0.0001330293663060278, + "loss": 5.1062, + "step": 31570 + }, + { + "epoch": 0.04437286272057778, + "grad_norm": 1.1392838954925537, + "learning_rate": 0.00013307151889841224, + "loss": 5.0741, + "step": 31580 + }, + { + "epoch": 0.04438691365874135, + "grad_norm": 1.1732559204101562, + "learning_rate": 0.00013311367149079667, + "loss": 5.0425, + "step": 31590 + }, + { + "epoch": 0.04440096459690493, + "grad_norm": 1.1739134788513184, 
+ "learning_rate": 0.0001331558240831811, + "loss": 5.0015, + "step": 31600 + }, + { + "epoch": 0.044415015535068506, + "grad_norm": 1.175411581993103, + "learning_rate": 0.00013319797667556556, + "loss": 5.0818, + "step": 31610 + }, + { + "epoch": 0.04442906647323208, + "grad_norm": 1.2318931818008423, + "learning_rate": 0.00013324012926794996, + "loss": 4.9621, + "step": 31620 + }, + { + "epoch": 0.044443117411395665, + "grad_norm": 1.146024227142334, + "learning_rate": 0.0001332822818603344, + "loss": 5.061, + "step": 31630 + }, + { + "epoch": 0.04445716834955924, + "grad_norm": 1.1564781665802002, + "learning_rate": 0.00013332443445271885, + "loss": 5.2572, + "step": 31640 + }, + { + "epoch": 0.04447121928772282, + "grad_norm": 1.1591886281967163, + "learning_rate": 0.00013336658704510325, + "loss": 5.1789, + "step": 31650 + }, + { + "epoch": 0.04448527022588639, + "grad_norm": 1.172905445098877, + "learning_rate": 0.00013340873963748768, + "loss": 5.0811, + "step": 31660 + }, + { + "epoch": 0.04449932116404997, + "grad_norm": 1.1963396072387695, + "learning_rate": 0.00013345089222987214, + "loss": 5.15, + "step": 31670 + }, + { + "epoch": 0.04451337210221355, + "grad_norm": 1.234837532043457, + "learning_rate": 0.00013349304482225655, + "loss": 5.1728, + "step": 31680 + }, + { + "epoch": 0.04452742304037713, + "grad_norm": 1.179028868675232, + "learning_rate": 0.00013353519741464098, + "loss": 5.0332, + "step": 31690 + }, + { + "epoch": 0.044541473978540705, + "grad_norm": 1.1915132999420166, + "learning_rate": 0.00013357735000702544, + "loss": 5.1575, + "step": 31700 + }, + { + "epoch": 0.04455552491670428, + "grad_norm": 1.248386263847351, + "learning_rate": 0.00013361950259940984, + "loss": 5.1046, + "step": 31710 + }, + { + "epoch": 0.04456957585486786, + "grad_norm": 1.134567141532898, + "learning_rate": 0.00013366165519179427, + "loss": 5.0822, + "step": 31720 + }, + { + "epoch": 0.04458362679303144, + "grad_norm": 1.2228316068649292, + "learning_rate": 
0.00013370380778417873, + "loss": 4.9389, + "step": 31730 + }, + { + "epoch": 0.044597677731195016, + "grad_norm": 1.143936276435852, + "learning_rate": 0.00013374596037656316, + "loss": 5.0541, + "step": 31740 + }, + { + "epoch": 0.04461172866935859, + "grad_norm": 1.1601719856262207, + "learning_rate": 0.00013378811296894757, + "loss": 5.1395, + "step": 31750 + }, + { + "epoch": 0.04462577960752217, + "grad_norm": 1.1841803789138794, + "learning_rate": 0.00013383026556133202, + "loss": 5.0936, + "step": 31760 + }, + { + "epoch": 0.044639830545685745, + "grad_norm": 1.1612204313278198, + "learning_rate": 0.00013387241815371645, + "loss": 5.0203, + "step": 31770 + }, + { + "epoch": 0.04465388148384933, + "grad_norm": 1.2054232358932495, + "learning_rate": 0.00013391457074610086, + "loss": 5.0786, + "step": 31780 + }, + { + "epoch": 0.044667932422012904, + "grad_norm": 1.155108094215393, + "learning_rate": 0.00013395672333848532, + "loss": 4.9753, + "step": 31790 + }, + { + "epoch": 0.04468198336017648, + "grad_norm": 1.1976162195205688, + "learning_rate": 0.00013399887593086975, + "loss": 5.0861, + "step": 31800 + }, + { + "epoch": 0.04469603429834006, + "grad_norm": 1.157468557357788, + "learning_rate": 0.00013404102852325415, + "loss": 5.1192, + "step": 31810 + }, + { + "epoch": 0.04471008523650363, + "grad_norm": 1.1830439567565918, + "learning_rate": 0.0001340831811156386, + "loss": 5.1491, + "step": 31820 + }, + { + "epoch": 0.04472413617466721, + "grad_norm": 1.2510123252868652, + "learning_rate": 0.00013412533370802304, + "loss": 5.0752, + "step": 31830 + }, + { + "epoch": 0.04473818711283079, + "grad_norm": 1.151624083518982, + "learning_rate": 0.00013416748630040745, + "loss": 5.0527, + "step": 31840 + }, + { + "epoch": 0.04475223805099437, + "grad_norm": 1.1923450231552124, + "learning_rate": 0.0001342096388927919, + "loss": 5.0977, + "step": 31850 + }, + { + "epoch": 0.044766288989157944, + "grad_norm": 1.1659468412399292, + "learning_rate": 
0.00013425179148517633, + "loss": 4.9261, + "step": 31860 + }, + { + "epoch": 0.04478033992732152, + "grad_norm": 1.139060378074646, + "learning_rate": 0.00013429394407756074, + "loss": 5.0291, + "step": 31870 + }, + { + "epoch": 0.0447943908654851, + "grad_norm": 1.134356141090393, + "learning_rate": 0.0001343360966699452, + "loss": 5.1025, + "step": 31880 + }, + { + "epoch": 0.04480844180364868, + "grad_norm": 1.1306984424591064, + "learning_rate": 0.00013437824926232963, + "loss": 5.0889, + "step": 31890 + }, + { + "epoch": 0.044822492741812256, + "grad_norm": 1.1675775051116943, + "learning_rate": 0.00013442040185471406, + "loss": 4.949, + "step": 31900 + }, + { + "epoch": 0.04483654367997583, + "grad_norm": 1.2445342540740967, + "learning_rate": 0.0001344625544470985, + "loss": 4.9771, + "step": 31910 + }, + { + "epoch": 0.04485059461813941, + "grad_norm": 1.1660617589950562, + "learning_rate": 0.00013450470703948292, + "loss": 5.0768, + "step": 31920 + }, + { + "epoch": 0.044864645556302984, + "grad_norm": 1.2834054231643677, + "learning_rate": 0.00013454685963186735, + "loss": 4.9632, + "step": 31930 + }, + { + "epoch": 0.04487869649446657, + "grad_norm": 1.2516728639602661, + "learning_rate": 0.00013458901222425178, + "loss": 5.091, + "step": 31940 + }, + { + "epoch": 0.044892747432630144, + "grad_norm": 1.1859095096588135, + "learning_rate": 0.00013463116481663622, + "loss": 4.9886, + "step": 31950 + }, + { + "epoch": 0.04490679837079372, + "grad_norm": 1.140558123588562, + "learning_rate": 0.00013467331740902065, + "loss": 5.1474, + "step": 31960 + }, + { + "epoch": 0.044920849308957296, + "grad_norm": 1.1825441122055054, + "learning_rate": 0.00013471547000140508, + "loss": 5.0819, + "step": 31970 + }, + { + "epoch": 0.04493490024712087, + "grad_norm": 1.1421507596969604, + "learning_rate": 0.0001347576225937895, + "loss": 5.0934, + "step": 31980 + }, + { + "epoch": 0.044948951185284455, + "grad_norm": 1.1730067729949951, + "learning_rate": 
0.00013479977518617394, + "loss": 4.9571, + "step": 31990 + }, + { + "epoch": 0.04496300212344803, + "grad_norm": 1.1735901832580566, + "learning_rate": 0.00013484192777855837, + "loss": 5.0852, + "step": 32000 + }, + { + "epoch": 0.04497705306161161, + "grad_norm": 1.2650270462036133, + "learning_rate": 0.0001348840803709428, + "loss": 4.9872, + "step": 32010 + }, + { + "epoch": 0.044991103999775184, + "grad_norm": 1.2169119119644165, + "learning_rate": 0.00013492623296332723, + "loss": 5.0531, + "step": 32020 + }, + { + "epoch": 0.04500515493793876, + "grad_norm": 1.1884304285049438, + "learning_rate": 0.00013496838555571166, + "loss": 4.9522, + "step": 32030 + }, + { + "epoch": 0.04501920587610234, + "grad_norm": 1.131784439086914, + "learning_rate": 0.0001350105381480961, + "loss": 5.0842, + "step": 32040 + }, + { + "epoch": 0.04503325681426592, + "grad_norm": 1.1771255731582642, + "learning_rate": 0.00013505269074048053, + "loss": 5.1013, + "step": 32050 + }, + { + "epoch": 0.045047307752429495, + "grad_norm": 1.181799054145813, + "learning_rate": 0.00013509484333286496, + "loss": 5.0919, + "step": 32060 + }, + { + "epoch": 0.04506135869059307, + "grad_norm": 1.173443078994751, + "learning_rate": 0.0001351369959252494, + "loss": 4.9914, + "step": 32070 + }, + { + "epoch": 0.04507540962875665, + "grad_norm": 1.1408971548080444, + "learning_rate": 0.00013517914851763382, + "loss": 5.0476, + "step": 32080 + }, + { + "epoch": 0.04508946056692023, + "grad_norm": 1.1605677604675293, + "learning_rate": 0.00013522130111001825, + "loss": 5.1048, + "step": 32090 + }, + { + "epoch": 0.04510351150508381, + "grad_norm": 1.1556565761566162, + "learning_rate": 0.00013526345370240268, + "loss": 5.0675, + "step": 32100 + }, + { + "epoch": 0.04511756244324738, + "grad_norm": 1.1450446844100952, + "learning_rate": 0.00013530560629478711, + "loss": 5.0533, + "step": 32110 + }, + { + "epoch": 0.04513161338141096, + "grad_norm": 1.163759708404541, + "learning_rate": 
0.00013534775888717155, + "loss": 5.1082, + "step": 32120 + }, + { + "epoch": 0.045145664319574536, + "grad_norm": 1.2103548049926758, + "learning_rate": 0.00013538991147955598, + "loss": 5.0448, + "step": 32130 + }, + { + "epoch": 0.04515971525773811, + "grad_norm": 1.3155592679977417, + "learning_rate": 0.0001354320640719404, + "loss": 5.0333, + "step": 32140 + }, + { + "epoch": 0.045173766195901695, + "grad_norm": 1.169601321220398, + "learning_rate": 0.00013547421666432484, + "loss": 5.0035, + "step": 32150 + }, + { + "epoch": 0.04518781713406527, + "grad_norm": 1.1616172790527344, + "learning_rate": 0.0001355163692567093, + "loss": 5.0827, + "step": 32160 + }, + { + "epoch": 0.04520186807222885, + "grad_norm": 1.1275705099105835, + "learning_rate": 0.0001355585218490937, + "loss": 5.0115, + "step": 32170 + }, + { + "epoch": 0.04521591901039242, + "grad_norm": 1.1657685041427612, + "learning_rate": 0.00013560067444147813, + "loss": 5.0359, + "step": 32180 + }, + { + "epoch": 0.045229969948556, + "grad_norm": 1.1922658681869507, + "learning_rate": 0.0001356428270338626, + "loss": 5.0735, + "step": 32190 + }, + { + "epoch": 0.04524402088671958, + "grad_norm": 1.262093186378479, + "learning_rate": 0.000135684979626247, + "loss": 5.0809, + "step": 32200 + }, + { + "epoch": 0.04525807182488316, + "grad_norm": 1.1995415687561035, + "learning_rate": 0.00013572713221863143, + "loss": 5.0154, + "step": 32210 + }, + { + "epoch": 0.045272122763046735, + "grad_norm": 1.1763862371444702, + "learning_rate": 0.00013576928481101588, + "loss": 5.0166, + "step": 32220 + }, + { + "epoch": 0.04528617370121031, + "grad_norm": 1.1900880336761475, + "learning_rate": 0.0001358114374034003, + "loss": 5.0693, + "step": 32230 + }, + { + "epoch": 0.04530022463937389, + "grad_norm": 1.1440578699111938, + "learning_rate": 0.00013585358999578472, + "loss": 5.0902, + "step": 32240 + }, + { + "epoch": 0.04531427557753747, + "grad_norm": 1.1583417654037476, + "learning_rate": 
0.00013589574258816918, + "loss": 4.9809, + "step": 32250 + }, + { + "epoch": 0.045328326515701046, + "grad_norm": 1.2270220518112183, + "learning_rate": 0.00013593789518055358, + "loss": 5.099, + "step": 32260 + }, + { + "epoch": 0.04534237745386462, + "grad_norm": 1.1560698747634888, + "learning_rate": 0.000135980047772938, + "loss": 5.1224, + "step": 32270 + }, + { + "epoch": 0.0453564283920282, + "grad_norm": 1.1910443305969238, + "learning_rate": 0.00013602220036532247, + "loss": 5.0196, + "step": 32280 + }, + { + "epoch": 0.045370479330191775, + "grad_norm": 1.204832673072815, + "learning_rate": 0.00013606435295770688, + "loss": 5.0512, + "step": 32290 + }, + { + "epoch": 0.04538453026835536, + "grad_norm": 1.3012568950653076, + "learning_rate": 0.0001361065055500913, + "loss": 5.0248, + "step": 32300 + }, + { + "epoch": 0.045398581206518934, + "grad_norm": 1.1138745546340942, + "learning_rate": 0.00013614865814247576, + "loss": 5.1426, + "step": 32310 + }, + { + "epoch": 0.04541263214468251, + "grad_norm": 1.3530911207199097, + "learning_rate": 0.0001361908107348602, + "loss": 4.9985, + "step": 32320 + }, + { + "epoch": 0.04542668308284609, + "grad_norm": 1.1052865982055664, + "learning_rate": 0.0001362329633272446, + "loss": 5.0387, + "step": 32330 + }, + { + "epoch": 0.04544073402100966, + "grad_norm": 1.1460787057876587, + "learning_rate": 0.00013627511591962906, + "loss": 4.9708, + "step": 32340 + }, + { + "epoch": 0.045454784959173246, + "grad_norm": 1.1664546728134155, + "learning_rate": 0.0001363172685120135, + "loss": 4.9381, + "step": 32350 + }, + { + "epoch": 0.04546883589733682, + "grad_norm": 1.1577123403549194, + "learning_rate": 0.0001363594211043979, + "loss": 4.9815, + "step": 32360 + }, + { + "epoch": 0.0454828868355004, + "grad_norm": 1.1973512172698975, + "learning_rate": 0.00013640157369678235, + "loss": 5.1239, + "step": 32370 + }, + { + "epoch": 0.045496937773663974, + "grad_norm": 1.1406227350234985, + "learning_rate": 
0.00013644372628916678, + "loss": 5.0569, + "step": 32380 + }, + { + "epoch": 0.04551098871182755, + "grad_norm": 1.1365989446640015, + "learning_rate": 0.0001364858788815512, + "loss": 4.9423, + "step": 32390 + }, + { + "epoch": 0.045525039649991134, + "grad_norm": 1.1738041639328003, + "learning_rate": 0.00013652803147393565, + "loss": 5.1111, + "step": 32400 + }, + { + "epoch": 0.04553909058815471, + "grad_norm": 1.2123348712921143, + "learning_rate": 0.00013657018406632008, + "loss": 5.1021, + "step": 32410 + }, + { + "epoch": 0.045553141526318286, + "grad_norm": 1.1308833360671997, + "learning_rate": 0.00013661233665870448, + "loss": 5.0533, + "step": 32420 + }, + { + "epoch": 0.04556719246448186, + "grad_norm": 1.2196406126022339, + "learning_rate": 0.00013665448925108894, + "loss": 4.9591, + "step": 32430 + }, + { + "epoch": 0.04558124340264544, + "grad_norm": 1.1016089916229248, + "learning_rate": 0.00013669664184347337, + "loss": 5.0497, + "step": 32440 + }, + { + "epoch": 0.045595294340809014, + "grad_norm": 1.248906135559082, + "learning_rate": 0.0001367387944358578, + "loss": 5.0935, + "step": 32450 + }, + { + "epoch": 0.0456093452789726, + "grad_norm": 1.1859537363052368, + "learning_rate": 0.00013678094702824223, + "loss": 5.1309, + "step": 32460 + }, + { + "epoch": 0.045623396217136174, + "grad_norm": 1.2062934637069702, + "learning_rate": 0.00013682309962062666, + "loss": 5.0923, + "step": 32470 + }, + { + "epoch": 0.04563744715529975, + "grad_norm": 1.4972939491271973, + "learning_rate": 0.0001368652522130111, + "loss": 5.009, + "step": 32480 + }, + { + "epoch": 0.045651498093463326, + "grad_norm": 1.1737431287765503, + "learning_rate": 0.00013690740480539553, + "loss": 5.1537, + "step": 32490 + }, + { + "epoch": 0.0456655490316269, + "grad_norm": 1.1139081716537476, + "learning_rate": 0.00013694955739777996, + "loss": 5.1087, + "step": 32500 + }, + { + "epoch": 0.045679599969790485, + "grad_norm": 1.2105811834335327, + "learning_rate": 
0.0001369917099901644, + "loss": 5.0994, + "step": 32510 + }, + { + "epoch": 0.04569365090795406, + "grad_norm": 1.2313215732574463, + "learning_rate": 0.00013703386258254882, + "loss": 5.0346, + "step": 32520 + }, + { + "epoch": 0.04570770184611764, + "grad_norm": 1.1877297163009644, + "learning_rate": 0.00013707601517493325, + "loss": 5.132, + "step": 32530 + }, + { + "epoch": 0.045721752784281214, + "grad_norm": 1.1226810216903687, + "learning_rate": 0.00013711816776731768, + "loss": 5.0537, + "step": 32540 + }, + { + "epoch": 0.04573580372244479, + "grad_norm": 1.2374706268310547, + "learning_rate": 0.0001371603203597021, + "loss": 5.0868, + "step": 32550 + }, + { + "epoch": 0.04574985466060837, + "grad_norm": 1.1252933740615845, + "learning_rate": 0.00013720247295208654, + "loss": 5.1987, + "step": 32560 + }, + { + "epoch": 0.04576390559877195, + "grad_norm": 1.1599668264389038, + "learning_rate": 0.00013724462554447097, + "loss": 5.0369, + "step": 32570 + }, + { + "epoch": 0.045777956536935525, + "grad_norm": 1.2881426811218262, + "learning_rate": 0.0001372867781368554, + "loss": 5.0175, + "step": 32580 + }, + { + "epoch": 0.0457920074750991, + "grad_norm": 1.1907261610031128, + "learning_rate": 0.00013732893072923984, + "loss": 5.0749, + "step": 32590 + }, + { + "epoch": 0.04580605841326268, + "grad_norm": 1.1891196966171265, + "learning_rate": 0.00013737108332162427, + "loss": 5.1202, + "step": 32600 + }, + { + "epoch": 0.04582010935142626, + "grad_norm": 1.1137062311172485, + "learning_rate": 0.0001374132359140087, + "loss": 4.9954, + "step": 32610 + }, + { + "epoch": 0.04583416028958984, + "grad_norm": 1.1719475984573364, + "learning_rate": 0.00013745538850639313, + "loss": 4.9482, + "step": 32620 + }, + { + "epoch": 0.04584821122775341, + "grad_norm": 1.1451879739761353, + "learning_rate": 0.00013749754109877756, + "loss": 4.9943, + "step": 32630 + }, + { + "epoch": 0.04586226216591699, + "grad_norm": 1.144889235496521, + "learning_rate": 
0.000137539693691162, + "loss": 5.0704, + "step": 32640 + }, + { + "epoch": 0.045876313104080566, + "grad_norm": 1.1692876815795898, + "learning_rate": 0.00013758184628354642, + "loss": 4.9091, + "step": 32650 + }, + { + "epoch": 0.04589036404224415, + "grad_norm": 1.1342777013778687, + "learning_rate": 0.00013762399887593086, + "loss": 5.0762, + "step": 32660 + }, + { + "epoch": 0.045904414980407725, + "grad_norm": 1.1293963193893433, + "learning_rate": 0.0001376661514683153, + "loss": 5.1474, + "step": 32670 + }, + { + "epoch": 0.0459184659185713, + "grad_norm": 1.1727898120880127, + "learning_rate": 0.00013770830406069972, + "loss": 5.0395, + "step": 32680 + }, + { + "epoch": 0.04593251685673488, + "grad_norm": 1.1893597841262817, + "learning_rate": 0.00013775045665308415, + "loss": 5.0505, + "step": 32690 + }, + { + "epoch": 0.04594656779489845, + "grad_norm": 1.1680330038070679, + "learning_rate": 0.00013779260924546858, + "loss": 5.0726, + "step": 32700 + }, + { + "epoch": 0.045960618733062036, + "grad_norm": 1.1110119819641113, + "learning_rate": 0.000137834761837853, + "loss": 4.9678, + "step": 32710 + }, + { + "epoch": 0.04597466967122561, + "grad_norm": 1.1456838846206665, + "learning_rate": 0.00013787691443023744, + "loss": 5.0045, + "step": 32720 + }, + { + "epoch": 0.04598872060938919, + "grad_norm": 1.1601930856704712, + "learning_rate": 0.00013791906702262187, + "loss": 5.044, + "step": 32730 + }, + { + "epoch": 0.046002771547552765, + "grad_norm": 1.1178258657455444, + "learning_rate": 0.00013796121961500633, + "loss": 4.9794, + "step": 32740 + }, + { + "epoch": 0.04601682248571634, + "grad_norm": 1.1255162954330444, + "learning_rate": 0.00013800337220739074, + "loss": 5.0032, + "step": 32750 + }, + { + "epoch": 0.046030873423879924, + "grad_norm": 1.1232777833938599, + "learning_rate": 0.00013804552479977517, + "loss": 5.0948, + "step": 32760 + }, + { + "epoch": 0.0460449243620435, + "grad_norm": 1.158825159072876, + "learning_rate": 
0.00013808767739215963, + "loss": 5.0024, + "step": 32770 + }, + { + "epoch": 0.046058975300207076, + "grad_norm": 1.132293462753296, + "learning_rate": 0.00013812982998454403, + "loss": 5.0553, + "step": 32780 + }, + { + "epoch": 0.04607302623837065, + "grad_norm": 1.1477060317993164, + "learning_rate": 0.00013817198257692846, + "loss": 5.016, + "step": 32790 + }, + { + "epoch": 0.04608707717653423, + "grad_norm": 1.1272825002670288, + "learning_rate": 0.00013821413516931292, + "loss": 5.0562, + "step": 32800 + }, + { + "epoch": 0.046101128114697805, + "grad_norm": 1.1596038341522217, + "learning_rate": 0.00013825628776169732, + "loss": 5.0503, + "step": 32810 + }, + { + "epoch": 0.04611517905286139, + "grad_norm": 1.131719708442688, + "learning_rate": 0.00013829844035408175, + "loss": 4.9989, + "step": 32820 + }, + { + "epoch": 0.046129229991024964, + "grad_norm": 1.1962887048721313, + "learning_rate": 0.0001383405929464662, + "loss": 5.0298, + "step": 32830 + }, + { + "epoch": 0.04614328092918854, + "grad_norm": 1.133915901184082, + "learning_rate": 0.00013838274553885062, + "loss": 4.9067, + "step": 32840 + }, + { + "epoch": 0.04615733186735212, + "grad_norm": 1.2771695852279663, + "learning_rate": 0.00013842489813123505, + "loss": 5.0036, + "step": 32850 + }, + { + "epoch": 0.04617138280551569, + "grad_norm": 1.1963088512420654, + "learning_rate": 0.0001384670507236195, + "loss": 5.0803, + "step": 32860 + }, + { + "epoch": 0.046185433743679276, + "grad_norm": 1.0991750955581665, + "learning_rate": 0.0001385092033160039, + "loss": 5.0325, + "step": 32870 + }, + { + "epoch": 0.04619948468184285, + "grad_norm": 1.179234504699707, + "learning_rate": 0.00013855135590838834, + "loss": 5.0296, + "step": 32880 + }, + { + "epoch": 0.04621353562000643, + "grad_norm": 1.1082992553710938, + "learning_rate": 0.0001385935085007728, + "loss": 5.1175, + "step": 32890 + }, + { + "epoch": 0.046227586558170004, + "grad_norm": 1.2209382057189941, + "learning_rate": 
0.00013863566109315723, + "loss": 5.0361, + "step": 32900 + }, + { + "epoch": 0.04624163749633358, + "grad_norm": 1.1588517427444458, + "learning_rate": 0.00013867781368554163, + "loss": 5.1443, + "step": 32910 + }, + { + "epoch": 0.046255688434497164, + "grad_norm": 1.0982693433761597, + "learning_rate": 0.0001387199662779261, + "loss": 5.0724, + "step": 32920 + }, + { + "epoch": 0.04626973937266074, + "grad_norm": 1.1611078977584839, + "learning_rate": 0.00013876211887031052, + "loss": 5.127, + "step": 32930 + }, + { + "epoch": 0.046283790310824316, + "grad_norm": 1.1593987941741943, + "learning_rate": 0.00013880427146269493, + "loss": 5.1295, + "step": 32940 + }, + { + "epoch": 0.04629784124898789, + "grad_norm": 1.1332095861434937, + "learning_rate": 0.00013884642405507939, + "loss": 5.0213, + "step": 32950 + }, + { + "epoch": 0.04631189218715147, + "grad_norm": 1.1835927963256836, + "learning_rate": 0.00013888857664746382, + "loss": 5.0062, + "step": 32960 + }, + { + "epoch": 0.04632594312531505, + "grad_norm": 1.1010524034500122, + "learning_rate": 0.00013893072923984822, + "loss": 5.1005, + "step": 32970 + }, + { + "epoch": 0.04633999406347863, + "grad_norm": 1.1981157064437866, + "learning_rate": 0.00013897288183223268, + "loss": 4.8705, + "step": 32980 + }, + { + "epoch": 0.046354045001642204, + "grad_norm": 1.2006856203079224, + "learning_rate": 0.0001390150344246171, + "loss": 5.0685, + "step": 32990 + }, + { + "epoch": 0.04636809593980578, + "grad_norm": 1.1585785150527954, + "learning_rate": 0.00013905718701700152, + "loss": 5.0302, + "step": 33000 + }, + { + "epoch": 0.046382146877969356, + "grad_norm": 1.1677348613739014, + "learning_rate": 0.00013909933960938597, + "loss": 4.9233, + "step": 33010 + }, + { + "epoch": 0.04639619781613294, + "grad_norm": 1.1076332330703735, + "learning_rate": 0.0001391414922017704, + "loss": 5.0191, + "step": 33020 + }, + { + "epoch": 0.046410248754296515, + "grad_norm": 1.0935347080230713, + "learning_rate": 
0.00013918364479415484, + "loss": 5.1139, + "step": 33030 + }, + { + "epoch": 0.04642429969246009, + "grad_norm": 1.1202341318130493, + "learning_rate": 0.00013922579738653927, + "loss": 5.0648, + "step": 33040 + }, + { + "epoch": 0.04643835063062367, + "grad_norm": 1.1524226665496826, + "learning_rate": 0.0001392679499789237, + "loss": 4.8851, + "step": 33050 + }, + { + "epoch": 0.046452401568787244, + "grad_norm": 1.19994056224823, + "learning_rate": 0.00013931010257130813, + "loss": 5.0289, + "step": 33060 + }, + { + "epoch": 0.04646645250695083, + "grad_norm": 1.1779851913452148, + "learning_rate": 0.00013935225516369256, + "loss": 5.0094, + "step": 33070 + }, + { + "epoch": 0.0464805034451144, + "grad_norm": 1.186610460281372, + "learning_rate": 0.000139394407756077, + "loss": 5.0195, + "step": 33080 + }, + { + "epoch": 0.04649455438327798, + "grad_norm": 1.1642428636550903, + "learning_rate": 0.00013943656034846142, + "loss": 5.0686, + "step": 33090 + }, + { + "epoch": 0.046508605321441555, + "grad_norm": 1.1461211442947388, + "learning_rate": 0.00013947871294084585, + "loss": 5.1077, + "step": 33100 + }, + { + "epoch": 0.04652265625960513, + "grad_norm": 1.1319652795791626, + "learning_rate": 0.00013952086553323029, + "loss": 5.0228, + "step": 33110 + }, + { + "epoch": 0.04653670719776871, + "grad_norm": 1.2022271156311035, + "learning_rate": 0.00013956301812561472, + "loss": 5.0792, + "step": 33120 + }, + { + "epoch": 0.04655075813593229, + "grad_norm": 1.1919790506362915, + "learning_rate": 0.00013960517071799915, + "loss": 5.0433, + "step": 33130 + }, + { + "epoch": 0.04656480907409587, + "grad_norm": 1.1596547365188599, + "learning_rate": 0.00013964732331038358, + "loss": 5.0325, + "step": 33140 + }, + { + "epoch": 0.04657886001225944, + "grad_norm": 1.16820228099823, + "learning_rate": 0.000139689475902768, + "loss": 5.0048, + "step": 33150 + }, + { + "epoch": 0.04659291095042302, + "grad_norm": 1.422676682472229, + "learning_rate": 
0.00013973162849515244, + "loss": 4.9945, + "step": 33160 + }, + { + "epoch": 0.046606961888586595, + "grad_norm": 1.1482213735580444, + "learning_rate": 0.00013977378108753687, + "loss": 4.9878, + "step": 33170 + }, + { + "epoch": 0.04662101282675018, + "grad_norm": 1.2206376791000366, + "learning_rate": 0.0001398159336799213, + "loss": 5.1302, + "step": 33180 + }, + { + "epoch": 0.046635063764913755, + "grad_norm": 1.1734592914581299, + "learning_rate": 0.00013985808627230573, + "loss": 4.9858, + "step": 33190 + }, + { + "epoch": 0.04664911470307733, + "grad_norm": 1.1573774814605713, + "learning_rate": 0.00013990023886469017, + "loss": 5.0833, + "step": 33200 + }, + { + "epoch": 0.04666316564124091, + "grad_norm": 1.1239194869995117, + "learning_rate": 0.0001399423914570746, + "loss": 4.9798, + "step": 33210 + }, + { + "epoch": 0.04667721657940448, + "grad_norm": 1.2762165069580078, + "learning_rate": 0.00013998454404945903, + "loss": 4.9813, + "step": 33220 + }, + { + "epoch": 0.046691267517568066, + "grad_norm": 1.1590991020202637, + "learning_rate": 0.00014002669664184346, + "loss": 5.0678, + "step": 33230 + }, + { + "epoch": 0.04670531845573164, + "grad_norm": 1.2096633911132812, + "learning_rate": 0.0001400688492342279, + "loss": 5.2786, + "step": 33240 + }, + { + "epoch": 0.04671936939389522, + "grad_norm": 1.1752020120620728, + "learning_rate": 0.00014011100182661232, + "loss": 5.097, + "step": 33250 + }, + { + "epoch": 0.046733420332058795, + "grad_norm": 1.1489932537078857, + "learning_rate": 0.00014015315441899675, + "loss": 5.1115, + "step": 33260 + }, + { + "epoch": 0.04674747127022237, + "grad_norm": 1.2402178049087524, + "learning_rate": 0.00014019530701138118, + "loss": 5.0294, + "step": 33270 + }, + { + "epoch": 0.046761522208385954, + "grad_norm": 1.109150767326355, + "learning_rate": 0.00014023745960376561, + "loss": 5.1305, + "step": 33280 + }, + { + "epoch": 0.04677557314654953, + "grad_norm": 1.1309486627578735, + "learning_rate": 
0.00014027961219615005, + "loss": 4.9889, + "step": 33290 + }, + { + "epoch": 0.046789624084713106, + "grad_norm": 1.1985018253326416, + "learning_rate": 0.00014032176478853448, + "loss": 5.0472, + "step": 33300 + }, + { + "epoch": 0.04680367502287668, + "grad_norm": 1.231369972229004, + "learning_rate": 0.0001403639173809189, + "loss": 5.0311, + "step": 33310 + }, + { + "epoch": 0.04681772596104026, + "grad_norm": 1.1748765707015991, + "learning_rate": 0.00014040606997330337, + "loss": 4.9819, + "step": 33320 + }, + { + "epoch": 0.04683177689920384, + "grad_norm": 1.2126888036727905, + "learning_rate": 0.00014044822256568777, + "loss": 4.9947, + "step": 33330 + }, + { + "epoch": 0.04684582783736742, + "grad_norm": 1.2520742416381836, + "learning_rate": 0.0001404903751580722, + "loss": 4.9942, + "step": 33340 + }, + { + "epoch": 0.046859878775530994, + "grad_norm": 1.1721147298812866, + "learning_rate": 0.00014053252775045666, + "loss": 5.0119, + "step": 33350 + }, + { + "epoch": 0.04687392971369457, + "grad_norm": 1.1375186443328857, + "learning_rate": 0.00014057468034284106, + "loss": 5.1042, + "step": 33360 + }, + { + "epoch": 0.04688798065185815, + "grad_norm": 1.1971328258514404, + "learning_rate": 0.0001406168329352255, + "loss": 5.0726, + "step": 33370 + }, + { + "epoch": 0.04690203159002173, + "grad_norm": 1.4078949689865112, + "learning_rate": 0.00014065898552760995, + "loss": 5.0296, + "step": 33380 + }, + { + "epoch": 0.046916082528185306, + "grad_norm": 1.2646558284759521, + "learning_rate": 0.00014070113811999436, + "loss": 5.0794, + "step": 33390 + }, + { + "epoch": 0.04693013346634888, + "grad_norm": 1.107875943183899, + "learning_rate": 0.0001407432907123788, + "loss": 5.0813, + "step": 33400 + }, + { + "epoch": 0.04694418440451246, + "grad_norm": 1.1319985389709473, + "learning_rate": 0.00014078544330476325, + "loss": 5.1663, + "step": 33410 + }, + { + "epoch": 0.046958235342676034, + "grad_norm": 1.0848567485809326, + "learning_rate": 
0.00014082759589714765, + "loss": 5.1029, + "step": 33420 + }, + { + "epoch": 0.04697228628083961, + "grad_norm": 1.1160449981689453, + "learning_rate": 0.00014086974848953208, + "loss": 4.9525, + "step": 33430 + }, + { + "epoch": 0.046986337219003194, + "grad_norm": 1.1369086503982544, + "learning_rate": 0.00014091190108191654, + "loss": 5.0147, + "step": 33440 + }, + { + "epoch": 0.04700038815716677, + "grad_norm": 1.1825052499771118, + "learning_rate": 0.00014095405367430097, + "loss": 5.0531, + "step": 33450 + }, + { + "epoch": 0.047014439095330346, + "grad_norm": 1.1444425582885742, + "learning_rate": 0.00014099620626668538, + "loss": 5.0372, + "step": 33460 + }, + { + "epoch": 0.04702849003349392, + "grad_norm": 1.1329076290130615, + "learning_rate": 0.00014103835885906983, + "loss": 5.189, + "step": 33470 + }, + { + "epoch": 0.0470425409716575, + "grad_norm": 1.1967828273773193, + "learning_rate": 0.00014108051145145427, + "loss": 5.0807, + "step": 33480 + }, + { + "epoch": 0.04705659190982108, + "grad_norm": 1.1986347436904907, + "learning_rate": 0.00014112266404383867, + "loss": 5.0648, + "step": 33490 + }, + { + "epoch": 0.04707064284798466, + "grad_norm": 1.131181001663208, + "learning_rate": 0.00014116481663622313, + "loss": 5.0826, + "step": 33500 + }, + { + "epoch": 0.047084693786148234, + "grad_norm": 1.1698004007339478, + "learning_rate": 0.00014120696922860756, + "loss": 5.0187, + "step": 33510 + }, + { + "epoch": 0.04709874472431181, + "grad_norm": 1.186985731124878, + "learning_rate": 0.00014124912182099196, + "loss": 5.071, + "step": 33520 + }, + { + "epoch": 0.047112795662475386, + "grad_norm": 1.1888569593429565, + "learning_rate": 0.00014129127441337642, + "loss": 4.9117, + "step": 33530 + }, + { + "epoch": 0.04712684660063897, + "grad_norm": 1.1342166662216187, + "learning_rate": 0.00014133342700576085, + "loss": 5.0269, + "step": 33540 + }, + { + "epoch": 0.047140897538802545, + "grad_norm": 1.2070249319076538, + "learning_rate": 
0.00014137557959814526, + "loss": 4.9468, + "step": 33550 + }, + { + "epoch": 0.04715494847696612, + "grad_norm": 1.1142017841339111, + "learning_rate": 0.00014141773219052971, + "loss": 5.2021, + "step": 33560 + }, + { + "epoch": 0.0471689994151297, + "grad_norm": 1.092371940612793, + "learning_rate": 0.00014145988478291415, + "loss": 5.0756, + "step": 33570 + }, + { + "epoch": 0.047183050353293274, + "grad_norm": 1.1797840595245361, + "learning_rate": 0.00014150203737529855, + "loss": 5.0859, + "step": 33580 + }, + { + "epoch": 0.04719710129145686, + "grad_norm": 1.177193522453308, + "learning_rate": 0.000141544189967683, + "loss": 4.9599, + "step": 33590 + }, + { + "epoch": 0.04721115222962043, + "grad_norm": 1.1384317874908447, + "learning_rate": 0.00014158634256006744, + "loss": 5.0421, + "step": 33600 + }, + { + "epoch": 0.04722520316778401, + "grad_norm": 1.1723968982696533, + "learning_rate": 0.00014162849515245187, + "loss": 5.0571, + "step": 33610 + }, + { + "epoch": 0.047239254105947585, + "grad_norm": 1.1619266271591187, + "learning_rate": 0.0001416706477448363, + "loss": 5.114, + "step": 33620 + }, + { + "epoch": 0.04725330504411116, + "grad_norm": 1.1768392324447632, + "learning_rate": 0.00014171280033722073, + "loss": 4.9927, + "step": 33630 + }, + { + "epoch": 0.047267355982274745, + "grad_norm": 1.1438103914260864, + "learning_rate": 0.00014175495292960516, + "loss": 5.0783, + "step": 33640 + }, + { + "epoch": 0.04728140692043832, + "grad_norm": 1.165272831916809, + "learning_rate": 0.0001417971055219896, + "loss": 4.9889, + "step": 33650 + }, + { + "epoch": 0.0472954578586019, + "grad_norm": 1.2394903898239136, + "learning_rate": 0.00014183925811437403, + "loss": 5.0751, + "step": 33660 + }, + { + "epoch": 0.04730950879676547, + "grad_norm": 1.2130417823791504, + "learning_rate": 0.00014188141070675846, + "loss": 5.1452, + "step": 33670 + }, + { + "epoch": 0.04732355973492905, + "grad_norm": 1.100905418395996, + "learning_rate": 
0.0001419235632991429, + "loss": 5.1668, + "step": 33680 + }, + { + "epoch": 0.04733761067309263, + "grad_norm": 1.1132055521011353, + "learning_rate": 0.00014196571589152732, + "loss": 4.9762, + "step": 33690 + }, + { + "epoch": 0.04735166161125621, + "grad_norm": 1.1489577293395996, + "learning_rate": 0.00014200786848391175, + "loss": 5.0032, + "step": 33700 + }, + { + "epoch": 0.047365712549419785, + "grad_norm": 1.1104612350463867, + "learning_rate": 0.00014205002107629618, + "loss": 5.0617, + "step": 33710 + }, + { + "epoch": 0.04737976348758336, + "grad_norm": 1.148146629333496, + "learning_rate": 0.0001420921736686806, + "loss": 5.0499, + "step": 33720 + }, + { + "epoch": 0.04739381442574694, + "grad_norm": 1.1576919555664062, + "learning_rate": 0.00014213432626106504, + "loss": 5.0133, + "step": 33730 + }, + { + "epoch": 0.04740786536391051, + "grad_norm": 1.1263247728347778, + "learning_rate": 0.00014217647885344948, + "loss": 4.8634, + "step": 33740 + }, + { + "epoch": 0.047421916302074096, + "grad_norm": 1.168540358543396, + "learning_rate": 0.0001422186314458339, + "loss": 4.9549, + "step": 33750 + }, + { + "epoch": 0.04743596724023767, + "grad_norm": 1.1476012468338013, + "learning_rate": 0.00014226078403821834, + "loss": 5.0192, + "step": 33760 + }, + { + "epoch": 0.04745001817840125, + "grad_norm": 1.1322742700576782, + "learning_rate": 0.00014230293663060277, + "loss": 5.1356, + "step": 33770 + }, + { + "epoch": 0.047464069116564825, + "grad_norm": 1.147298812866211, + "learning_rate": 0.0001423450892229872, + "loss": 5.0278, + "step": 33780 + }, + { + "epoch": 0.0474781200547284, + "grad_norm": 1.1303203105926514, + "learning_rate": 0.00014238724181537163, + "loss": 5.0996, + "step": 33790 + }, + { + "epoch": 0.047492170992891984, + "grad_norm": 1.0890021324157715, + "learning_rate": 0.00014242939440775606, + "loss": 5.0753, + "step": 33800 + }, + { + "epoch": 0.04750622193105556, + "grad_norm": 1.163889765739441, + "learning_rate": 
0.0001424715470001405, + "loss": 5.0723, + "step": 33810 + }, + { + "epoch": 0.047520272869219136, + "grad_norm": 1.1471232175827026, + "learning_rate": 0.00014251369959252492, + "loss": 5.0793, + "step": 33820 + }, + { + "epoch": 0.04753432380738271, + "grad_norm": 1.135847568511963, + "learning_rate": 0.00014255585218490936, + "loss": 5.1346, + "step": 33830 + }, + { + "epoch": 0.04754837474554629, + "grad_norm": 1.1431607007980347, + "learning_rate": 0.0001425980047772938, + "loss": 5.1053, + "step": 33840 + }, + { + "epoch": 0.04756242568370987, + "grad_norm": 1.1063638925552368, + "learning_rate": 0.00014264015736967822, + "loss": 4.9736, + "step": 33850 + }, + { + "epoch": 0.04757647662187345, + "grad_norm": 1.1206468343734741, + "learning_rate": 0.00014268230996206265, + "loss": 4.9508, + "step": 33860 + }, + { + "epoch": 0.047590527560037024, + "grad_norm": 1.234711766242981, + "learning_rate": 0.00014272446255444708, + "loss": 4.9879, + "step": 33870 + }, + { + "epoch": 0.0476045784982006, + "grad_norm": 1.2123228311538696, + "learning_rate": 0.0001427666151468315, + "loss": 4.8591, + "step": 33880 + }, + { + "epoch": 0.04761862943636418, + "grad_norm": 1.330238938331604, + "learning_rate": 0.00014280876773921594, + "loss": 4.9129, + "step": 33890 + }, + { + "epoch": 0.04763268037452776, + "grad_norm": 1.1890876293182373, + "learning_rate": 0.0001428509203316004, + "loss": 5.018, + "step": 33900 + }, + { + "epoch": 0.047646731312691336, + "grad_norm": 1.2122526168823242, + "learning_rate": 0.0001428930729239848, + "loss": 5.1118, + "step": 33910 + }, + { + "epoch": 0.04766078225085491, + "grad_norm": 1.1024436950683594, + "learning_rate": 0.00014293522551636924, + "loss": 4.8631, + "step": 33920 + }, + { + "epoch": 0.04767483318901849, + "grad_norm": 1.12148118019104, + "learning_rate": 0.0001429773781087537, + "loss": 5.0444, + "step": 33930 + }, + { + "epoch": 0.047688884127182064, + "grad_norm": 1.1458978652954102, + "learning_rate": 
0.0001430195307011381, + "loss": 5.1447, + "step": 33940 + }, + { + "epoch": 0.04770293506534565, + "grad_norm": 1.1465718746185303, + "learning_rate": 0.00014306168329352253, + "loss": 4.968, + "step": 33950 + }, + { + "epoch": 0.047716986003509224, + "grad_norm": 1.162516713142395, + "learning_rate": 0.000143103835885907, + "loss": 5.0728, + "step": 33960 + }, + { + "epoch": 0.0477310369416728, + "grad_norm": 1.1164641380310059, + "learning_rate": 0.0001431459884782914, + "loss": 4.9858, + "step": 33970 + }, + { + "epoch": 0.047745087879836376, + "grad_norm": 1.1258015632629395, + "learning_rate": 0.00014318814107067582, + "loss": 5.0594, + "step": 33980 + }, + { + "epoch": 0.04775913881799995, + "grad_norm": 1.1459895372390747, + "learning_rate": 0.00014323029366306028, + "loss": 4.987, + "step": 33990 + }, + { + "epoch": 0.047773189756163535, + "grad_norm": 1.16275155544281, + "learning_rate": 0.00014327244625544469, + "loss": 5.0785, + "step": 34000 + }, + { + "epoch": 0.04778724069432711, + "grad_norm": 1.1919983625411987, + "learning_rate": 0.00014331459884782912, + "loss": 5.0735, + "step": 34010 + }, + { + "epoch": 0.04780129163249069, + "grad_norm": 1.0925780534744263, + "learning_rate": 0.00014335675144021358, + "loss": 5.1679, + "step": 34020 + }, + { + "epoch": 0.047815342570654264, + "grad_norm": 1.1478568315505981, + "learning_rate": 0.000143398904032598, + "loss": 5.1252, + "step": 34030 + }, + { + "epoch": 0.04782939350881784, + "grad_norm": 1.1364628076553345, + "learning_rate": 0.0001434410566249824, + "loss": 4.9149, + "step": 34040 + }, + { + "epoch": 0.047843444446981416, + "grad_norm": 1.130101203918457, + "learning_rate": 0.00014348320921736687, + "loss": 4.909, + "step": 34050 + }, + { + "epoch": 0.047857495385145, + "grad_norm": 1.151053547859192, + "learning_rate": 0.0001435253618097513, + "loss": 5.0317, + "step": 34060 + }, + { + "epoch": 0.047871546323308575, + "grad_norm": 1.158276081085205, + "learning_rate": 0.0001435675144021357, + 
"loss": 5.0723, + "step": 34070 + }, + { + "epoch": 0.04788559726147215, + "grad_norm": 1.2227519750595093, + "learning_rate": 0.00014360966699452016, + "loss": 5.0276, + "step": 34080 + }, + { + "epoch": 0.04789964819963573, + "grad_norm": 1.1406713724136353, + "learning_rate": 0.0001436518195869046, + "loss": 5.0192, + "step": 34090 + }, + { + "epoch": 0.047913699137799304, + "grad_norm": 1.1668028831481934, + "learning_rate": 0.000143693972179289, + "loss": 4.9819, + "step": 34100 + }, + { + "epoch": 0.04792775007596289, + "grad_norm": 1.1254831552505493, + "learning_rate": 0.00014373612477167346, + "loss": 5.0085, + "step": 34110 + }, + { + "epoch": 0.04794180101412646, + "grad_norm": 1.1782492399215698, + "learning_rate": 0.0001437782773640579, + "loss": 5.087, + "step": 34120 + }, + { + "epoch": 0.04795585195229004, + "grad_norm": 1.125497817993164, + "learning_rate": 0.0001438204299564423, + "loss": 5.0525, + "step": 34130 + }, + { + "epoch": 0.047969902890453615, + "grad_norm": 1.1250324249267578, + "learning_rate": 0.00014386258254882675, + "loss": 5.0964, + "step": 34140 + }, + { + "epoch": 0.04798395382861719, + "grad_norm": 1.1316850185394287, + "learning_rate": 0.00014390473514121118, + "loss": 5.0596, + "step": 34150 + }, + { + "epoch": 0.047998004766780775, + "grad_norm": 1.158501386642456, + "learning_rate": 0.0001439468877335956, + "loss": 5.0526, + "step": 34160 + }, + { + "epoch": 0.04801205570494435, + "grad_norm": 1.1655758619308472, + "learning_rate": 0.00014398904032598004, + "loss": 5.0539, + "step": 34170 + }, + { + "epoch": 0.04802610664310793, + "grad_norm": 1.1360056400299072, + "learning_rate": 0.00014403119291836447, + "loss": 5.0661, + "step": 34180 + }, + { + "epoch": 0.0480401575812715, + "grad_norm": 1.1133544445037842, + "learning_rate": 0.0001440733455107489, + "loss": 5.0339, + "step": 34190 + }, + { + "epoch": 0.04805420851943508, + "grad_norm": 1.0935070514678955, + "learning_rate": 0.00014411549810313334, + "loss": 4.9833, + 
"step": 34200 + }, + { + "epoch": 0.04806825945759866, + "grad_norm": 1.0671685934066772, + "learning_rate": 0.00014415765069551777, + "loss": 4.9763, + "step": 34210 + }, + { + "epoch": 0.04808231039576224, + "grad_norm": 1.1430456638336182, + "learning_rate": 0.0001441998032879022, + "loss": 4.9482, + "step": 34220 + }, + { + "epoch": 0.048096361333925815, + "grad_norm": 1.1322462558746338, + "learning_rate": 0.00014424195588028663, + "loss": 4.9501, + "step": 34230 + }, + { + "epoch": 0.04811041227208939, + "grad_norm": 1.121475100517273, + "learning_rate": 0.00014428410847267106, + "loss": 5.0683, + "step": 34240 + }, + { + "epoch": 0.04812446321025297, + "grad_norm": 1.1033834218978882, + "learning_rate": 0.0001443262610650555, + "loss": 4.9089, + "step": 34250 + }, + { + "epoch": 0.04813851414841655, + "grad_norm": 1.1717206239700317, + "learning_rate": 0.00014436841365743992, + "loss": 4.9873, + "step": 34260 + }, + { + "epoch": 0.048152565086580126, + "grad_norm": 1.1643035411834717, + "learning_rate": 0.00014441056624982435, + "loss": 5.0396, + "step": 34270 + }, + { + "epoch": 0.0481666160247437, + "grad_norm": 1.109316110610962, + "learning_rate": 0.00014445271884220879, + "loss": 5.0065, + "step": 34280 + }, + { + "epoch": 0.04818066696290728, + "grad_norm": 1.0868771076202393, + "learning_rate": 0.00014449487143459322, + "loss": 5.0783, + "step": 34290 + }, + { + "epoch": 0.048194717901070855, + "grad_norm": 1.1115925312042236, + "learning_rate": 0.00014453702402697765, + "loss": 4.9573, + "step": 34300 + }, + { + "epoch": 0.04820876883923444, + "grad_norm": 1.14295494556427, + "learning_rate": 0.00014457496136012363, + "loss": 5.016, + "step": 34310 + }, + { + "epoch": 0.048222819777398014, + "grad_norm": 1.148398518562317, + "learning_rate": 0.00014461711395250806, + "loss": 5.147, + "step": 34320 + }, + { + "epoch": 0.04823687071556159, + "grad_norm": 1.1277004480361938, + "learning_rate": 0.0001446592665448925, + "loss": 5.1473, + "step": 34330 + 
}, + { + "epoch": 0.048250921653725166, + "grad_norm": 1.186437964439392, + "learning_rate": 0.00014470141913727692, + "loss": 4.9353, + "step": 34340 + }, + { + "epoch": 0.04826497259188874, + "grad_norm": 1.131174921989441, + "learning_rate": 0.00014474357172966136, + "loss": 5.0711, + "step": 34350 + }, + { + "epoch": 0.04827902353005232, + "grad_norm": 1.1172866821289062, + "learning_rate": 0.0001447857243220458, + "loss": 5.1223, + "step": 34360 + }, + { + "epoch": 0.0482930744682159, + "grad_norm": 1.179067850112915, + "learning_rate": 0.00014482787691443022, + "loss": 4.9628, + "step": 34370 + }, + { + "epoch": 0.04830712540637948, + "grad_norm": 1.1836308240890503, + "learning_rate": 0.00014487002950681465, + "loss": 5.0934, + "step": 34380 + }, + { + "epoch": 0.048321176344543054, + "grad_norm": 1.1305451393127441, + "learning_rate": 0.00014491218209919908, + "loss": 5.0188, + "step": 34390 + }, + { + "epoch": 0.04833522728270663, + "grad_norm": 1.113901138305664, + "learning_rate": 0.0001449543346915835, + "loss": 5.0124, + "step": 34400 + }, + { + "epoch": 0.04834927822087021, + "grad_norm": 1.1200542449951172, + "learning_rate": 0.00014499648728396794, + "loss": 5.1159, + "step": 34410 + }, + { + "epoch": 0.04836332915903379, + "grad_norm": 1.1390150785446167, + "learning_rate": 0.00014503863987635237, + "loss": 5.0467, + "step": 34420 + }, + { + "epoch": 0.048377380097197366, + "grad_norm": 1.1669032573699951, + "learning_rate": 0.0001450807924687368, + "loss": 5.0509, + "step": 34430 + }, + { + "epoch": 0.04839143103536094, + "grad_norm": 1.1723250150680542, + "learning_rate": 0.00014512294506112124, + "loss": 4.8566, + "step": 34440 + }, + { + "epoch": 0.04840548197352452, + "grad_norm": 1.2041908502578735, + "learning_rate": 0.0001451650976535057, + "loss": 5.0341, + "step": 34450 + }, + { + "epoch": 0.048419532911688094, + "grad_norm": 1.1292054653167725, + "learning_rate": 0.0001452072502458901, + "loss": 5.0321, + "step": 34460 + }, + { + 
"epoch": 0.04843358384985168, + "grad_norm": 1.12458074092865, + "learning_rate": 0.00014524940283827453, + "loss": 5.0766, + "step": 34470 + }, + { + "epoch": 0.048447634788015254, + "grad_norm": 1.1062582731246948, + "learning_rate": 0.000145291555430659, + "loss": 5.0649, + "step": 34480 + }, + { + "epoch": 0.04846168572617883, + "grad_norm": 1.124700665473938, + "learning_rate": 0.0001453337080230434, + "loss": 4.9559, + "step": 34490 + }, + { + "epoch": 0.048475736664342406, + "grad_norm": 1.1494004726409912, + "learning_rate": 0.00014537586061542782, + "loss": 5.0064, + "step": 34500 + }, + { + "epoch": 0.04848978760250598, + "grad_norm": 1.1343144178390503, + "learning_rate": 0.00014541801320781228, + "loss": 5.0244, + "step": 34510 + }, + { + "epoch": 0.048503838540669565, + "grad_norm": 1.0855003595352173, + "learning_rate": 0.00014546016580019669, + "loss": 5.0812, + "step": 34520 + }, + { + "epoch": 0.04851788947883314, + "grad_norm": 1.1069141626358032, + "learning_rate": 0.00014550231839258112, + "loss": 5.0764, + "step": 34530 + }, + { + "epoch": 0.04853194041699672, + "grad_norm": 1.470924735069275, + "learning_rate": 0.00014554447098496557, + "loss": 5.0076, + "step": 34540 + }, + { + "epoch": 0.048545991355160294, + "grad_norm": 1.1423304080963135, + "learning_rate": 0.00014558662357734998, + "loss": 5.008, + "step": 34550 + }, + { + "epoch": 0.04856004229332387, + "grad_norm": 1.164551854133606, + "learning_rate": 0.00014562877616973444, + "loss": 4.9289, + "step": 34560 + }, + { + "epoch": 0.04857409323148745, + "grad_norm": 1.0787816047668457, + "learning_rate": 0.00014567092876211887, + "loss": 5.0759, + "step": 34570 + }, + { + "epoch": 0.04858814416965103, + "grad_norm": 1.1306999921798706, + "learning_rate": 0.0001457130813545033, + "loss": 5.0986, + "step": 34580 + }, + { + "epoch": 0.048602195107814605, + "grad_norm": 1.1416428089141846, + "learning_rate": 0.00014575523394688773, + "loss": 5.0618, + "step": 34590 + }, + { + "epoch": 
0.04861624604597818, + "grad_norm": 1.1224125623703003, + "learning_rate": 0.00014579738653927216, + "loss": 5.0685, + "step": 34600 + }, + { + "epoch": 0.04863029698414176, + "grad_norm": 1.1280869245529175, + "learning_rate": 0.0001458395391316566, + "loss": 4.9973, + "step": 34610 + }, + { + "epoch": 0.04864434792230534, + "grad_norm": 1.1328657865524292, + "learning_rate": 0.00014588169172404102, + "loss": 5.0089, + "step": 34620 + }, + { + "epoch": 0.04865839886046892, + "grad_norm": 1.183826208114624, + "learning_rate": 0.00014592384431642546, + "loss": 4.9232, + "step": 34630 + }, + { + "epoch": 0.04867244979863249, + "grad_norm": 1.1926147937774658, + "learning_rate": 0.00014596599690880989, + "loss": 4.9648, + "step": 34640 + }, + { + "epoch": 0.04868650073679607, + "grad_norm": 1.1898232698440552, + "learning_rate": 0.00014600814950119432, + "loss": 4.9844, + "step": 34650 + }, + { + "epoch": 0.048700551674959645, + "grad_norm": 1.1884058713912964, + "learning_rate": 0.00014605030209357875, + "loss": 5.0502, + "step": 34660 + }, + { + "epoch": 0.04871460261312322, + "grad_norm": 1.1177476644515991, + "learning_rate": 0.00014609245468596318, + "loss": 5.0569, + "step": 34670 + }, + { + "epoch": 0.048728653551286805, + "grad_norm": 1.3436133861541748, + "learning_rate": 0.0001461346072783476, + "loss": 5.0094, + "step": 34680 + }, + { + "epoch": 0.04874270448945038, + "grad_norm": 1.1321064233779907, + "learning_rate": 0.00014617675987073204, + "loss": 5.0142, + "step": 34690 + }, + { + "epoch": 0.04875675542761396, + "grad_norm": 1.104989767074585, + "learning_rate": 0.00014621891246311647, + "loss": 5.0279, + "step": 34700 + }, + { + "epoch": 0.04877080636577753, + "grad_norm": 1.1625908613204956, + "learning_rate": 0.0001462610650555009, + "loss": 4.9406, + "step": 34710 + }, + { + "epoch": 0.04878485730394111, + "grad_norm": 1.1442335844039917, + "learning_rate": 0.00014630321764788534, + "loss": 4.9705, + "step": 34720 + }, + { + "epoch": 
0.04879890824210469, + "grad_norm": 1.1111855506896973, + "learning_rate": 0.00014634537024026977, + "loss": 5.003, + "step": 34730 + }, + { + "epoch": 0.04881295918026827, + "grad_norm": 1.1129965782165527, + "learning_rate": 0.0001463875228326542, + "loss": 4.8758, + "step": 34740 + }, + { + "epoch": 0.048827010118431845, + "grad_norm": 1.1186314821243286, + "learning_rate": 0.00014642967542503863, + "loss": 5.0139, + "step": 34750 + }, + { + "epoch": 0.04884106105659542, + "grad_norm": 1.150681972503662, + "learning_rate": 0.00014647182801742306, + "loss": 4.9994, + "step": 34760 + }, + { + "epoch": 0.048855111994759, + "grad_norm": 1.1509026288986206, + "learning_rate": 0.0001465139806098075, + "loss": 5.0786, + "step": 34770 + }, + { + "epoch": 0.04886916293292258, + "grad_norm": 1.1005713939666748, + "learning_rate": 0.00014655613320219192, + "loss": 4.9926, + "step": 34780 + }, + { + "epoch": 0.048883213871086156, + "grad_norm": 1.176209807395935, + "learning_rate": 0.00014659828579457635, + "loss": 5.0281, + "step": 34790 + }, + { + "epoch": 0.04889726480924973, + "grad_norm": 1.1285464763641357, + "learning_rate": 0.00014664043838696078, + "loss": 4.8861, + "step": 34800 + }, + { + "epoch": 0.04891131574741331, + "grad_norm": 1.1435015201568604, + "learning_rate": 0.00014668259097934522, + "loss": 4.9688, + "step": 34810 + }, + { + "epoch": 0.048925366685576885, + "grad_norm": 1.1347907781600952, + "learning_rate": 0.00014672474357172965, + "loss": 5.0083, + "step": 34820 + }, + { + "epoch": 0.04893941762374047, + "grad_norm": 1.1737812757492065, + "learning_rate": 0.00014676689616411408, + "loss": 5.0881, + "step": 34830 + }, + { + "epoch": 0.048953468561904044, + "grad_norm": 1.0986801385879517, + "learning_rate": 0.0001468090487564985, + "loss": 5.0823, + "step": 34840 + }, + { + "epoch": 0.04896751950006762, + "grad_norm": 1.218393325805664, + "learning_rate": 0.00014685120134888294, + "loss": 5.0719, + "step": 34850 + }, + { + "epoch": 
0.048981570438231196, + "grad_norm": 1.0812214612960815, + "learning_rate": 0.00014689335394126737, + "loss": 5.0184, + "step": 34860 + }, + { + "epoch": 0.04899562137639477, + "grad_norm": 1.2031632661819458, + "learning_rate": 0.0001469355065336518, + "loss": 5.0449, + "step": 34870 + }, + { + "epoch": 0.049009672314558356, + "grad_norm": 1.109456181526184, + "learning_rate": 0.00014697765912603623, + "loss": 5.1284, + "step": 34880 + }, + { + "epoch": 0.04902372325272193, + "grad_norm": 1.1223009824752808, + "learning_rate": 0.00014701981171842067, + "loss": 4.9705, + "step": 34890 + }, + { + "epoch": 0.04903777419088551, + "grad_norm": 1.125307559967041, + "learning_rate": 0.0001470619643108051, + "loss": 4.9975, + "step": 34900 + }, + { + "epoch": 0.049051825129049084, + "grad_norm": 1.1051353216171265, + "learning_rate": 0.00014710411690318953, + "loss": 4.9682, + "step": 34910 + }, + { + "epoch": 0.04906587606721266, + "grad_norm": 1.0811420679092407, + "learning_rate": 0.00014714626949557396, + "loss": 5.0424, + "step": 34920 + }, + { + "epoch": 0.049079927005376243, + "grad_norm": 1.1198779344558716, + "learning_rate": 0.0001471884220879584, + "loss": 5.0246, + "step": 34930 + }, + { + "epoch": 0.04909397794353982, + "grad_norm": 1.1411025524139404, + "learning_rate": 0.00014723057468034282, + "loss": 5.0529, + "step": 34940 + }, + { + "epoch": 0.049108028881703396, + "grad_norm": 1.1323539018630981, + "learning_rate": 0.00014727272727272725, + "loss": 4.9568, + "step": 34950 + }, + { + "epoch": 0.04912207981986697, + "grad_norm": 1.1151742935180664, + "learning_rate": 0.00014731487986511168, + "loss": 4.9563, + "step": 34960 + }, + { + "epoch": 0.04913613075803055, + "grad_norm": 1.1359392404556274, + "learning_rate": 0.00014735703245749611, + "loss": 5.112, + "step": 34970 + }, + { + "epoch": 0.049150181696194124, + "grad_norm": 1.0861750841140747, + "learning_rate": 0.00014739918504988055, + "loss": 4.9891, + "step": 34980 + }, + { + "epoch": 
0.04916423263435771, + "grad_norm": 1.2048242092132568, + "learning_rate": 0.00014744133764226498, + "loss": 5.0255, + "step": 34990 + }, + { + "epoch": 0.049178283572521284, + "grad_norm": 1.1281739473342896, + "learning_rate": 0.00014748349023464944, + "loss": 5.1152, + "step": 35000 + }, + { + "epoch": 0.04919233451068486, + "grad_norm": 1.1170549392700195, + "learning_rate": 0.00014752564282703384, + "loss": 4.8994, + "step": 35010 + }, + { + "epoch": 0.049206385448848436, + "grad_norm": 1.1060420274734497, + "learning_rate": 0.00014756779541941827, + "loss": 5.1206, + "step": 35020 + }, + { + "epoch": 0.04922043638701201, + "grad_norm": 1.105675458908081, + "learning_rate": 0.00014760994801180273, + "loss": 4.9421, + "step": 35030 + }, + { + "epoch": 0.049234487325175595, + "grad_norm": 1.1171681880950928, + "learning_rate": 0.00014765210060418713, + "loss": 4.9546, + "step": 35040 + }, + { + "epoch": 0.04924853826333917, + "grad_norm": 1.1521064043045044, + "learning_rate": 0.00014769425319657156, + "loss": 4.8988, + "step": 35050 + }, + { + "epoch": 0.04926258920150275, + "grad_norm": 1.110176920890808, + "learning_rate": 0.00014773640578895602, + "loss": 4.9492, + "step": 35060 + }, + { + "epoch": 0.049276640139666324, + "grad_norm": 1.0732078552246094, + "learning_rate": 0.00014777855838134043, + "loss": 5.0328, + "step": 35070 + }, + { + "epoch": 0.0492906910778299, + "grad_norm": 1.1150749921798706, + "learning_rate": 0.00014782071097372486, + "loss": 5.1412, + "step": 35080 + }, + { + "epoch": 0.04930474201599348, + "grad_norm": 1.1571887731552124, + "learning_rate": 0.00014786286356610932, + "loss": 4.9802, + "step": 35090 + }, + { + "epoch": 0.04931879295415706, + "grad_norm": 1.0949782133102417, + "learning_rate": 0.00014790501615849372, + "loss": 4.9798, + "step": 35100 + }, + { + "epoch": 0.049332843892320635, + "grad_norm": 1.1114603281021118, + "learning_rate": 0.00014794716875087815, + "loss": 4.9468, + "step": 35110 + }, + { + "epoch": 
0.04934689483048421, + "grad_norm": 1.0757004022598267, + "learning_rate": 0.0001479893213432626, + "loss": 4.8796, + "step": 35120 + }, + { + "epoch": 0.04936094576864779, + "grad_norm": 1.1924103498458862, + "learning_rate": 0.000148031473935647, + "loss": 4.932, + "step": 35130 + }, + { + "epoch": 0.04937499670681137, + "grad_norm": 1.1277077198028564, + "learning_rate": 0.00014807362652803147, + "loss": 5.0101, + "step": 35140 + }, + { + "epoch": 0.04938904764497495, + "grad_norm": 1.1275465488433838, + "learning_rate": 0.0001481157791204159, + "loss": 4.9974, + "step": 35150 + }, + { + "epoch": 0.04940309858313852, + "grad_norm": 1.1476144790649414, + "learning_rate": 0.00014815793171280033, + "loss": 5.0018, + "step": 35160 + }, + { + "epoch": 0.0494171495213021, + "grad_norm": 1.1097756624221802, + "learning_rate": 0.00014820008430518477, + "loss": 5.0057, + "step": 35170 + }, + { + "epoch": 0.049431200459465675, + "grad_norm": 1.1443592309951782, + "learning_rate": 0.0001482422368975692, + "loss": 4.9678, + "step": 35180 + }, + { + "epoch": 0.04944525139762926, + "grad_norm": 1.1339356899261475, + "learning_rate": 0.00014828438948995363, + "loss": 4.9923, + "step": 35190 + }, + { + "epoch": 0.049459302335792835, + "grad_norm": 1.0916308164596558, + "learning_rate": 0.00014832654208233806, + "loss": 5.0241, + "step": 35200 + }, + { + "epoch": 0.04947335327395641, + "grad_norm": 1.1227270364761353, + "learning_rate": 0.0001483686946747225, + "loss": 4.9788, + "step": 35210 + }, + { + "epoch": 0.04948740421211999, + "grad_norm": 1.126926064491272, + "learning_rate": 0.00014841084726710692, + "loss": 4.8996, + "step": 35220 + }, + { + "epoch": 0.04950145515028356, + "grad_norm": 1.1398160457611084, + "learning_rate": 0.00014845299985949135, + "loss": 4.9572, + "step": 35230 + }, + { + "epoch": 0.049515506088447146, + "grad_norm": 1.1179039478302002, + "learning_rate": 0.00014849515245187578, + "loss": 4.9517, + "step": 35240 + }, + { + "epoch": 
0.04952955702661072, + "grad_norm": 1.1064491271972656, + "learning_rate": 0.00014853730504426021, + "loss": 4.8927, + "step": 35250 + }, + { + "epoch": 0.0495436079647743, + "grad_norm": 1.128804087638855, + "learning_rate": 0.00014857945763664465, + "loss": 5.0498, + "step": 35260 + }, + { + "epoch": 0.049557658902937875, + "grad_norm": 1.1051520109176636, + "learning_rate": 0.00014862161022902908, + "loss": 5.0604, + "step": 35270 + }, + { + "epoch": 0.04957170984110145, + "grad_norm": 1.1551424264907837, + "learning_rate": 0.0001486637628214135, + "loss": 4.917, + "step": 35280 + }, + { + "epoch": 0.049585760779265034, + "grad_norm": 1.081084132194519, + "learning_rate": 0.00014870591541379794, + "loss": 5.2189, + "step": 35290 + }, + { + "epoch": 0.04959981171742861, + "grad_norm": 1.1549731492996216, + "learning_rate": 0.00014874806800618237, + "loss": 4.9709, + "step": 35300 + }, + { + "epoch": 0.049613862655592186, + "grad_norm": 1.1668425798416138, + "learning_rate": 0.0001487902205985668, + "loss": 5.0498, + "step": 35310 + }, + { + "epoch": 0.04962791359375576, + "grad_norm": 1.143610954284668, + "learning_rate": 0.00014883237319095123, + "loss": 5.0319, + "step": 35320 + }, + { + "epoch": 0.04964196453191934, + "grad_norm": 1.1000237464904785, + "learning_rate": 0.00014887452578333566, + "loss": 5.0029, + "step": 35330 + }, + { + "epoch": 0.049656015470082915, + "grad_norm": 1.1435863971710205, + "learning_rate": 0.0001489166783757201, + "loss": 4.8895, + "step": 35340 + }, + { + "epoch": 0.0496700664082465, + "grad_norm": 1.1478296518325806, + "learning_rate": 0.00014895883096810453, + "loss": 5.0575, + "step": 35350 + }, + { + "epoch": 0.049684117346410074, + "grad_norm": 1.0952467918395996, + "learning_rate": 0.00014900098356048896, + "loss": 5.0144, + "step": 35360 + }, + { + "epoch": 0.04969816828457365, + "grad_norm": 1.0802350044250488, + "learning_rate": 0.0001490431361528734, + "loss": 5.0557, + "step": 35370 + }, + { + "epoch": 
0.049712219222737226, + "grad_norm": 1.1121940612792969, + "learning_rate": 0.00014908528874525782, + "loss": 5.0688, + "step": 35380 + }, + { + "epoch": 0.0497262701609008, + "grad_norm": 1.1207494735717773, + "learning_rate": 0.00014912744133764225, + "loss": 4.9295, + "step": 35390 + }, + { + "epoch": 0.049740321099064386, + "grad_norm": 1.1117680072784424, + "learning_rate": 0.00014916959393002668, + "loss": 5.0406, + "step": 35400 + }, + { + "epoch": 0.04975437203722796, + "grad_norm": 1.2448337078094482, + "learning_rate": 0.0001492117465224111, + "loss": 4.9559, + "step": 35410 + }, + { + "epoch": 0.04976842297539154, + "grad_norm": 1.0946520566940308, + "learning_rate": 0.00014925389911479554, + "loss": 4.9064, + "step": 35420 + }, + { + "epoch": 0.049782473913555114, + "grad_norm": 1.0836005210876465, + "learning_rate": 0.00014929605170717998, + "loss": 5.0171, + "step": 35430 + }, + { + "epoch": 0.04979652485171869, + "grad_norm": 1.1312141418457031, + "learning_rate": 0.0001493382042995644, + "loss": 5.103, + "step": 35440 + }, + { + "epoch": 0.04981057578988227, + "grad_norm": 1.2918745279312134, + "learning_rate": 0.00014938035689194884, + "loss": 5.056, + "step": 35450 + }, + { + "epoch": 0.04982462672804585, + "grad_norm": 1.1127320528030396, + "learning_rate": 0.00014942250948433327, + "loss": 4.988, + "step": 35460 + }, + { + "epoch": 0.049838677666209426, + "grad_norm": 1.1448490619659424, + "learning_rate": 0.0001494646620767177, + "loss": 5.141, + "step": 35470 + }, + { + "epoch": 0.049852728604373, + "grad_norm": 1.1456868648529053, + "learning_rate": 0.00014950681466910213, + "loss": 5.0021, + "step": 35480 + }, + { + "epoch": 0.04986677954253658, + "grad_norm": 1.1046836376190186, + "learning_rate": 0.00014954896726148656, + "loss": 5.0716, + "step": 35490 + }, + { + "epoch": 0.04988083048070016, + "grad_norm": 1.1139931678771973, + "learning_rate": 0.000149591119853871, + "loss": 5.1356, + "step": 35500 + }, + { + "epoch": 
0.04989488141886374, + "grad_norm": 1.103615164756775, + "learning_rate": 0.00014963327244625542, + "loss": 4.9488, + "step": 35510 + }, + { + "epoch": 0.049908932357027314, + "grad_norm": 1.1086006164550781, + "learning_rate": 0.00014967542503863986, + "loss": 5.1068, + "step": 35520 + }, + { + "epoch": 0.04992298329519089, + "grad_norm": 1.2253687381744385, + "learning_rate": 0.0001497175776310243, + "loss": 4.9127, + "step": 35530 + }, + { + "epoch": 0.049937034233354466, + "grad_norm": 1.0812503099441528, + "learning_rate": 0.00014975973022340872, + "loss": 5.0393, + "step": 35540 + }, + { + "epoch": 0.04995108517151805, + "grad_norm": 1.108386516571045, + "learning_rate": 0.00014980188281579315, + "loss": 4.9698, + "step": 35550 + }, + { + "epoch": 0.049965136109681625, + "grad_norm": 1.108074426651001, + "learning_rate": 0.00014984403540817758, + "loss": 5.0325, + "step": 35560 + }, + { + "epoch": 0.0499791870478452, + "grad_norm": 1.1028242111206055, + "learning_rate": 0.000149886188000562, + "loss": 5.0655, + "step": 35570 + }, + { + "epoch": 0.04999323798600878, + "grad_norm": 1.1405538320541382, + "learning_rate": 0.00014992834059294647, + "loss": 5.0859, + "step": 35580 + }, + { + "epoch": 0.050007288924172354, + "grad_norm": 1.1215912103652954, + "learning_rate": 0.00014997049318533087, + "loss": 4.9616, + "step": 35590 + }, + { + "epoch": 0.05002133986233594, + "grad_norm": 1.2131357192993164, + "learning_rate": 0.0001500126457777153, + "loss": 4.9392, + "step": 35600 + }, + { + "epoch": 0.05003539080049951, + "grad_norm": 1.1143559217453003, + "learning_rate": 0.00015005479837009974, + "loss": 4.9206, + "step": 35610 + }, + { + "epoch": 0.05004944173866309, + "grad_norm": 1.0911166667938232, + "learning_rate": 0.00015009695096248417, + "loss": 5.0549, + "step": 35620 + }, + { + "epoch": 0.050063492676826665, + "grad_norm": 1.0939656496047974, + "learning_rate": 0.0001501391035548686, + "loss": 5.0054, + "step": 35630 + }, + { + "epoch": 
0.05007754361499024, + "grad_norm": 1.123145341873169, + "learning_rate": 0.00015018125614725306, + "loss": 4.9684, + "step": 35640 + }, + { + "epoch": 0.05009159455315382, + "grad_norm": 1.072049856185913, + "learning_rate": 0.0001502234087396375, + "loss": 4.9857, + "step": 35650 + }, + { + "epoch": 0.0501056454913174, + "grad_norm": 1.1123992204666138, + "learning_rate": 0.00015026556133202192, + "loss": 5.0464, + "step": 35660 + }, + { + "epoch": 0.05011969642948098, + "grad_norm": 1.2838733196258545, + "learning_rate": 0.00015030771392440632, + "loss": 5.0244, + "step": 35670 + }, + { + "epoch": 0.05013374736764455, + "grad_norm": 1.0853863954544067, + "learning_rate": 0.00015034986651679075, + "loss": 5.0946, + "step": 35680 + }, + { + "epoch": 0.05014779830580813, + "grad_norm": 1.0802594423294067, + "learning_rate": 0.00015039201910917519, + "loss": 4.9547, + "step": 35690 + }, + { + "epoch": 0.050161849243971705, + "grad_norm": 1.1661789417266846, + "learning_rate": 0.00015043417170155964, + "loss": 5.0453, + "step": 35700 + }, + { + "epoch": 0.05017590018213529, + "grad_norm": 1.0816981792449951, + "learning_rate": 0.00015047632429394408, + "loss": 5.1652, + "step": 35710 + }, + { + "epoch": 0.050189951120298865, + "grad_norm": 1.1324831247329712, + "learning_rate": 0.0001505184768863285, + "loss": 4.9944, + "step": 35720 + }, + { + "epoch": 0.05020400205846244, + "grad_norm": 1.1340142488479614, + "learning_rate": 0.0001505606294787129, + "loss": 5.0676, + "step": 35730 + }, + { + "epoch": 0.05021805299662602, + "grad_norm": 1.0971640348434448, + "learning_rate": 0.00015060278207109734, + "loss": 4.9213, + "step": 35740 + }, + { + "epoch": 0.05023210393478959, + "grad_norm": 1.1202173233032227, + "learning_rate": 0.0001506449346634818, + "loss": 5.0313, + "step": 35750 + }, + { + "epoch": 0.050246154872953176, + "grad_norm": 1.1177239418029785, + "learning_rate": 0.00015068708725586623, + "loss": 5.0798, + "step": 35760 + }, + { + "epoch": 
0.05026020581111675, + "grad_norm": 1.1042288541793823, + "learning_rate": 0.00015072923984825066, + "loss": 5.1434, + "step": 35770 + }, + { + "epoch": 0.05027425674928033, + "grad_norm": 1.0804433822631836, + "learning_rate": 0.0001507713924406351, + "loss": 5.0876, + "step": 35780 + }, + { + "epoch": 0.050288307687443905, + "grad_norm": 1.0773561000823975, + "learning_rate": 0.00015081354503301952, + "loss": 5.0045, + "step": 35790 + }, + { + "epoch": 0.05030235862560748, + "grad_norm": 1.1056697368621826, + "learning_rate": 0.00015085569762540393, + "loss": 5.1274, + "step": 35800 + }, + { + "epoch": 0.050316409563771064, + "grad_norm": 1.1184196472167969, + "learning_rate": 0.0001508978502177884, + "loss": 4.973, + "step": 35810 + }, + { + "epoch": 0.05033046050193464, + "grad_norm": 1.1502302885055542, + "learning_rate": 0.00015094000281017282, + "loss": 4.8589, + "step": 35820 + }, + { + "epoch": 0.050344511440098216, + "grad_norm": 1.110656499862671, + "learning_rate": 0.00015098215540255725, + "loss": 5.1146, + "step": 35830 + }, + { + "epoch": 0.05035856237826179, + "grad_norm": 1.0898936986923218, + "learning_rate": 0.00015102430799494168, + "loss": 5.0243, + "step": 35840 + }, + { + "epoch": 0.05037261331642537, + "grad_norm": 1.1786738634109497, + "learning_rate": 0.0001510664605873261, + "loss": 5.0675, + "step": 35850 + }, + { + "epoch": 0.05038666425458895, + "grad_norm": 1.1352401971817017, + "learning_rate": 0.00015110861317971052, + "loss": 5.0869, + "step": 35860 + }, + { + "epoch": 0.05040071519275253, + "grad_norm": 1.2441682815551758, + "learning_rate": 0.00015115076577209497, + "loss": 4.9753, + "step": 35870 + }, + { + "epoch": 0.050414766130916104, + "grad_norm": 1.062424898147583, + "learning_rate": 0.0001511929183644794, + "loss": 5.1051, + "step": 35880 + }, + { + "epoch": 0.05042881706907968, + "grad_norm": 1.0863795280456543, + "learning_rate": 0.00015123507095686384, + "loss": 5.0136, + "step": 35890 + }, + { + "epoch": 
0.050442868007243256, + "grad_norm": 1.1157557964324951, + "learning_rate": 0.00015127722354924827, + "loss": 5.0345, + "step": 35900 + }, + { + "epoch": 0.05045691894540684, + "grad_norm": 1.0928047895431519, + "learning_rate": 0.0001513193761416327, + "loss": 5.0341, + "step": 35910 + }, + { + "epoch": 0.050470969883570416, + "grad_norm": 1.103150486946106, + "learning_rate": 0.0001513615287340171, + "loss": 5.0651, + "step": 35920 + }, + { + "epoch": 0.05048502082173399, + "grad_norm": 1.1346688270568848, + "learning_rate": 0.00015140368132640156, + "loss": 4.9009, + "step": 35930 + }, + { + "epoch": 0.05049907175989757, + "grad_norm": 1.1874549388885498, + "learning_rate": 0.000151445833918786, + "loss": 4.9587, + "step": 35940 + }, + { + "epoch": 0.050513122698061144, + "grad_norm": 1.10201895236969, + "learning_rate": 0.00015148798651117042, + "loss": 5.0994, + "step": 35950 + }, + { + "epoch": 0.05052717363622472, + "grad_norm": 1.1142152547836304, + "learning_rate": 0.00015153013910355485, + "loss": 4.9086, + "step": 35960 + }, + { + "epoch": 0.0505412245743883, + "grad_norm": 1.1207566261291504, + "learning_rate": 0.00015157229169593929, + "loss": 4.9371, + "step": 35970 + }, + { + "epoch": 0.05055527551255188, + "grad_norm": 1.146672248840332, + "learning_rate": 0.00015161444428832374, + "loss": 4.9947, + "step": 35980 + }, + { + "epoch": 0.050569326450715456, + "grad_norm": 1.1421736478805542, + "learning_rate": 0.00015165659688070815, + "loss": 5.1101, + "step": 35990 + }, + { + "epoch": 0.05058337738887903, + "grad_norm": 1.0799651145935059, + "learning_rate": 0.00015169874947309258, + "loss": 4.856, + "step": 36000 + }, + { + "epoch": 0.05059742832704261, + "grad_norm": 1.1083508729934692, + "learning_rate": 0.000151740902065477, + "loss": 4.9631, + "step": 36010 + }, + { + "epoch": 0.05061147926520619, + "grad_norm": 1.1054953336715698, + "learning_rate": 0.00015178305465786144, + "loss": 5.0091, + "step": 36020 + }, + { + "epoch": 
0.05062553020336977, + "grad_norm": 1.219901204109192, + "learning_rate": 0.00015182520725024587, + "loss": 4.9669, + "step": 36030 + }, + { + "epoch": 0.050639581141533344, + "grad_norm": 1.1822065114974976, + "learning_rate": 0.00015186735984263033, + "loss": 4.9951, + "step": 36040 + }, + { + "epoch": 0.05065363207969692, + "grad_norm": 1.0649558305740356, + "learning_rate": 0.00015190951243501473, + "loss": 4.9975, + "step": 36050 + }, + { + "epoch": 0.050667683017860496, + "grad_norm": 1.1625783443450928, + "learning_rate": 0.00015195166502739917, + "loss": 4.9153, + "step": 36060 + }, + { + "epoch": 0.05068173395602408, + "grad_norm": 1.0902538299560547, + "learning_rate": 0.0001519938176197836, + "loss": 4.9411, + "step": 36070 + }, + { + "epoch": 0.050695784894187655, + "grad_norm": 1.12613046169281, + "learning_rate": 0.00015203597021216803, + "loss": 4.9862, + "step": 36080 + }, + { + "epoch": 0.05070983583235123, + "grad_norm": 1.1342322826385498, + "learning_rate": 0.00015207812280455246, + "loss": 5.0452, + "step": 36090 + }, + { + "epoch": 0.05072388677051481, + "grad_norm": 1.0751118659973145, + "learning_rate": 0.00015212027539693692, + "loss": 5.0084, + "step": 36100 + }, + { + "epoch": 0.050737937708678384, + "grad_norm": 1.1124134063720703, + "learning_rate": 0.00015216242798932135, + "loss": 4.9795, + "step": 36110 + }, + { + "epoch": 0.05075198864684197, + "grad_norm": 1.1308832168579102, + "learning_rate": 0.00015220458058170575, + "loss": 5.0751, + "step": 36120 + }, + { + "epoch": 0.05076603958500554, + "grad_norm": 1.1083581447601318, + "learning_rate": 0.00015224673317409018, + "loss": 5.0698, + "step": 36130 + }, + { + "epoch": 0.05078009052316912, + "grad_norm": 1.140774130821228, + "learning_rate": 0.00015228888576647462, + "loss": 5.041, + "step": 36140 + }, + { + "epoch": 0.050794141461332695, + "grad_norm": 1.0934237241744995, + "learning_rate": 0.00015233103835885905, + "loss": 5.0086, + "step": 36150 + }, + { + "epoch": 
0.05080819239949627, + "grad_norm": 1.0771994590759277, + "learning_rate": 0.0001523731909512435, + "loss": 4.9932, + "step": 36160 + }, + { + "epoch": 0.050822243337659855, + "grad_norm": 1.076748251914978, + "learning_rate": 0.00015241534354362794, + "loss": 4.8132, + "step": 36170 + }, + { + "epoch": 0.05083629427582343, + "grad_norm": 1.0804756879806519, + "learning_rate": 0.00015245749613601234, + "loss": 4.9832, + "step": 36180 + }, + { + "epoch": 0.05085034521398701, + "grad_norm": 1.0959327220916748, + "learning_rate": 0.00015249964872839677, + "loss": 4.9817, + "step": 36190 + }, + { + "epoch": 0.05086439615215058, + "grad_norm": 1.185053825378418, + "learning_rate": 0.0001525418013207812, + "loss": 5.0076, + "step": 36200 + }, + { + "epoch": 0.05087844709031416, + "grad_norm": 1.1760470867156982, + "learning_rate": 0.00015258395391316563, + "loss": 4.9851, + "step": 36210 + }, + { + "epoch": 0.05089249802847774, + "grad_norm": 1.1177805662155151, + "learning_rate": 0.0001526261065055501, + "loss": 5.0122, + "step": 36220 + }, + { + "epoch": 0.05090654896664132, + "grad_norm": 1.0969743728637695, + "learning_rate": 0.00015266825909793452, + "loss": 4.8199, + "step": 36230 + }, + { + "epoch": 0.050920599904804895, + "grad_norm": 1.171985387802124, + "learning_rate": 0.00015271041169031895, + "loss": 4.9671, + "step": 36240 + }, + { + "epoch": 0.05093465084296847, + "grad_norm": 1.0644655227661133, + "learning_rate": 0.00015275256428270336, + "loss": 5.0531, + "step": 36250 + }, + { + "epoch": 0.05094870178113205, + "grad_norm": 1.0977424383163452, + "learning_rate": 0.0001527947168750878, + "loss": 4.9316, + "step": 36260 + }, + { + "epoch": 0.05096275271929562, + "grad_norm": 1.0816428661346436, + "learning_rate": 0.00015283686946747222, + "loss": 5.0497, + "step": 36270 + }, + { + "epoch": 0.050976803657459206, + "grad_norm": 1.1141808032989502, + "learning_rate": 0.00015287902205985668, + "loss": 4.8977, + "step": 36280 + }, + { + "epoch": 
0.05099085459562278, + "grad_norm": 1.135312557220459, + "learning_rate": 0.0001529211746522411, + "loss": 5.0077, + "step": 36290 + }, + { + "epoch": 0.05100490553378636, + "grad_norm": 1.0765386819839478, + "learning_rate": 0.00015296332724462554, + "loss": 4.9414, + "step": 36300 + }, + { + "epoch": 0.051018956471949935, + "grad_norm": 1.0505393743515015, + "learning_rate": 0.00015300547983700995, + "loss": 4.8978, + "step": 36310 + }, + { + "epoch": 0.05103300741011351, + "grad_norm": 1.127390742301941, + "learning_rate": 0.00015304763242939438, + "loss": 5.0173, + "step": 36320 + }, + { + "epoch": 0.051047058348277094, + "grad_norm": 1.1132893562316895, + "learning_rate": 0.00015308978502177883, + "loss": 5.0465, + "step": 36330 + }, + { + "epoch": 0.05106110928644067, + "grad_norm": 1.0964865684509277, + "learning_rate": 0.00015313193761416327, + "loss": 5.009, + "step": 36340 + }, + { + "epoch": 0.051075160224604246, + "grad_norm": 1.0559228658676147, + "learning_rate": 0.0001531740902065477, + "loss": 5.136, + "step": 36350 + }, + { + "epoch": 0.05108921116276782, + "grad_norm": 1.0860519409179688, + "learning_rate": 0.00015321624279893213, + "loss": 5.0737, + "step": 36360 + }, + { + "epoch": 0.0511032621009314, + "grad_norm": 1.1100194454193115, + "learning_rate": 0.00015325839539131656, + "loss": 4.9814, + "step": 36370 + }, + { + "epoch": 0.05111731303909498, + "grad_norm": 1.0740619897842407, + "learning_rate": 0.00015330054798370096, + "loss": 4.9415, + "step": 36380 + }, + { + "epoch": 0.05113136397725856, + "grad_norm": 1.1333972215652466, + "learning_rate": 0.00015334270057608542, + "loss": 5.037, + "step": 36390 + }, + { + "epoch": 0.051145414915422134, + "grad_norm": 1.1923198699951172, + "learning_rate": 0.00015338485316846985, + "loss": 4.9674, + "step": 36400 + }, + { + "epoch": 0.05115946585358571, + "grad_norm": 1.0857388973236084, + "learning_rate": 0.00015342700576085428, + "loss": 5.0191, + "step": 36410 + }, + { + "epoch": 
0.051173516791749286, + "grad_norm": 1.1336112022399902, + "learning_rate": 0.00015346494309400027, + "loss": 4.9396, + "step": 36420 + }, + { + "epoch": 0.05118756772991287, + "grad_norm": 1.1554714441299438, + "learning_rate": 0.0001535070956863847, + "loss": 4.9489, + "step": 36430 + }, + { + "epoch": 0.051201618668076446, + "grad_norm": 1.1246522665023804, + "learning_rate": 0.00015354924827876913, + "loss": 5.0433, + "step": 36440 + }, + { + "epoch": 0.05121566960624002, + "grad_norm": 1.1257320642471313, + "learning_rate": 0.00015359140087115356, + "loss": 4.9265, + "step": 36450 + }, + { + "epoch": 0.0512297205444036, + "grad_norm": 1.1546956300735474, + "learning_rate": 0.00015363355346353802, + "loss": 4.9957, + "step": 36460 + }, + { + "epoch": 0.051243771482567174, + "grad_norm": 1.1014739274978638, + "learning_rate": 0.00015367570605592242, + "loss": 4.9966, + "step": 36470 + }, + { + "epoch": 0.05125782242073076, + "grad_norm": 1.1065007448196411, + "learning_rate": 0.00015371785864830685, + "loss": 4.9691, + "step": 36480 + }, + { + "epoch": 0.05127187335889433, + "grad_norm": 1.0437864065170288, + "learning_rate": 0.00015376001124069128, + "loss": 5.0153, + "step": 36490 + }, + { + "epoch": 0.05128592429705791, + "grad_norm": 1.0865705013275146, + "learning_rate": 0.00015380216383307572, + "loss": 5.0422, + "step": 36500 + }, + { + "epoch": 0.051299975235221486, + "grad_norm": 1.0838688611984253, + "learning_rate": 0.00015384431642546015, + "loss": 4.9375, + "step": 36510 + }, + { + "epoch": 0.05131402617338506, + "grad_norm": 1.13599693775177, + "learning_rate": 0.0001538864690178446, + "loss": 4.9728, + "step": 36520 + }, + { + "epoch": 0.051328077111548645, + "grad_norm": 1.183531403541565, + "learning_rate": 0.00015392862161022904, + "loss": 5.066, + "step": 36530 + }, + { + "epoch": 0.05134212804971222, + "grad_norm": 1.1362754106521606, + "learning_rate": 0.00015397077420261344, + "loss": 5.0401, + "step": 36540 + }, + { + "epoch": 
0.0513561789878758, + "grad_norm": 1.1068400144577026, + "learning_rate": 0.00015401292679499787, + "loss": 4.9703, + "step": 36550 + }, + { + "epoch": 0.051370229926039374, + "grad_norm": 1.1438322067260742, + "learning_rate": 0.0001540550793873823, + "loss": 5.0719, + "step": 36560 + }, + { + "epoch": 0.05138428086420295, + "grad_norm": 1.0228193998336792, + "learning_rate": 0.00015409723197976673, + "loss": 4.9806, + "step": 36570 + }, + { + "epoch": 0.051398331802366526, + "grad_norm": 1.0660607814788818, + "learning_rate": 0.0001541393845721512, + "loss": 5.1565, + "step": 36580 + }, + { + "epoch": 0.05141238274053011, + "grad_norm": 1.1452786922454834, + "learning_rate": 0.00015418153716453562, + "loss": 5.0448, + "step": 36590 + }, + { + "epoch": 0.051426433678693685, + "grad_norm": 1.1097620725631714, + "learning_rate": 0.00015422368975692003, + "loss": 5.046, + "step": 36600 + }, + { + "epoch": 0.05144048461685726, + "grad_norm": 1.2317959070205688, + "learning_rate": 0.00015426584234930446, + "loss": 4.9258, + "step": 36610 + }, + { + "epoch": 0.05145453555502084, + "grad_norm": 1.0700747966766357, + "learning_rate": 0.0001543079949416889, + "loss": 5.0544, + "step": 36620 + }, + { + "epoch": 0.051468586493184414, + "grad_norm": 1.0716843605041504, + "learning_rate": 0.00015435014753407332, + "loss": 5.0264, + "step": 36630 + }, + { + "epoch": 0.051482637431348, + "grad_norm": 1.0859941244125366, + "learning_rate": 0.00015439230012645778, + "loss": 4.989, + "step": 36640 + }, + { + "epoch": 0.05149668836951157, + "grad_norm": 1.0916000604629517, + "learning_rate": 0.0001544344527188422, + "loss": 4.971, + "step": 36650 + }, + { + "epoch": 0.05151073930767515, + "grad_norm": 1.139979362487793, + "learning_rate": 0.00015447660531122664, + "loss": 4.9515, + "step": 36660 + }, + { + "epoch": 0.051524790245838725, + "grad_norm": 1.1469497680664062, + "learning_rate": 0.00015451875790361105, + "loss": 4.9998, + "step": 36670 + }, + { + "epoch": 
0.0515388411840023, + "grad_norm": 1.1082267761230469, + "learning_rate": 0.00015456091049599548, + "loss": 4.9369, + "step": 36680 + }, + { + "epoch": 0.051552892122165885, + "grad_norm": 1.1943933963775635, + "learning_rate": 0.0001546030630883799, + "loss": 4.9594, + "step": 36690 + }, + { + "epoch": 0.05156694306032946, + "grad_norm": 1.0887293815612793, + "learning_rate": 0.00015464521568076437, + "loss": 4.9823, + "step": 36700 + }, + { + "epoch": 0.05158099399849304, + "grad_norm": 1.1254092454910278, + "learning_rate": 0.0001546873682731488, + "loss": 4.9227, + "step": 36710 + }, + { + "epoch": 0.05159504493665661, + "grad_norm": 1.1450250148773193, + "learning_rate": 0.00015472952086553323, + "loss": 4.9345, + "step": 36720 + }, + { + "epoch": 0.05160909587482019, + "grad_norm": 1.0992549657821655, + "learning_rate": 0.00015477167345791763, + "loss": 4.9191, + "step": 36730 + }, + { + "epoch": 0.05162314681298377, + "grad_norm": 1.0670653581619263, + "learning_rate": 0.00015481382605030206, + "loss": 4.9883, + "step": 36740 + }, + { + "epoch": 0.05163719775114735, + "grad_norm": 1.074080467224121, + "learning_rate": 0.0001548559786426865, + "loss": 4.9899, + "step": 36750 + }, + { + "epoch": 0.051651248689310925, + "grad_norm": 1.1951225996017456, + "learning_rate": 0.00015489813123507095, + "loss": 5.037, + "step": 36760 + }, + { + "epoch": 0.0516652996274745, + "grad_norm": 1.0775877237319946, + "learning_rate": 0.00015494028382745538, + "loss": 5.0046, + "step": 36770 + }, + { + "epoch": 0.05167935056563808, + "grad_norm": 1.087863802909851, + "learning_rate": 0.00015498243641983982, + "loss": 4.9907, + "step": 36780 + }, + { + "epoch": 0.05169340150380166, + "grad_norm": 1.0674740076065063, + "learning_rate": 0.00015502458901222425, + "loss": 5.0147, + "step": 36790 + }, + { + "epoch": 0.051707452441965236, + "grad_norm": 1.069732904434204, + "learning_rate": 0.00015506674160460865, + "loss": 4.8633, + "step": 36800 + }, + { + "epoch": 
0.05172150338012881, + "grad_norm": 1.0820239782333374, + "learning_rate": 0.00015510889419699308, + "loss": 5.0354, + "step": 36810 + }, + { + "epoch": 0.05173555431829239, + "grad_norm": 1.1347521543502808, + "learning_rate": 0.00015515104678937754, + "loss": 4.8265, + "step": 36820 + }, + { + "epoch": 0.051749605256455965, + "grad_norm": 1.1913303136825562, + "learning_rate": 0.00015519319938176197, + "loss": 4.9479, + "step": 36830 + }, + { + "epoch": 0.05176365619461955, + "grad_norm": 1.1147087812423706, + "learning_rate": 0.0001552353519741464, + "loss": 4.9907, + "step": 36840 + }, + { + "epoch": 0.051777707132783124, + "grad_norm": 1.076640009880066, + "learning_rate": 0.00015527750456653083, + "loss": 4.9654, + "step": 36850 + }, + { + "epoch": 0.0517917580709467, + "grad_norm": 1.1436219215393066, + "learning_rate": 0.00015531965715891524, + "loss": 4.9585, + "step": 36860 + }, + { + "epoch": 0.051805809009110276, + "grad_norm": 1.0725477933883667, + "learning_rate": 0.00015536180975129967, + "loss": 5.0113, + "step": 36870 + }, + { + "epoch": 0.05181985994727385, + "grad_norm": 1.087472677230835, + "learning_rate": 0.00015540396234368413, + "loss": 4.8823, + "step": 36880 + }, + { + "epoch": 0.05183391088543743, + "grad_norm": 1.0537265539169312, + "learning_rate": 0.00015544611493606856, + "loss": 4.8053, + "step": 36890 + }, + { + "epoch": 0.05184796182360101, + "grad_norm": 1.0963505506515503, + "learning_rate": 0.000155488267528453, + "loss": 5.0603, + "step": 36900 + }, + { + "epoch": 0.05186201276176459, + "grad_norm": 1.1273267269134521, + "learning_rate": 0.00015553042012083742, + "loss": 5.0182, + "step": 36910 + }, + { + "epoch": 0.051876063699928164, + "grad_norm": 1.0548876523971558, + "learning_rate": 0.00015557257271322188, + "loss": 5.1116, + "step": 36920 + }, + { + "epoch": 0.05189011463809174, + "grad_norm": 1.1563364267349243, + "learning_rate": 0.00015561472530560626, + "loss": 4.9696, + "step": 36930 + }, + { + "epoch": 
0.051904165576255316, + "grad_norm": 1.0634068250656128, + "learning_rate": 0.00015565687789799071, + "loss": 4.9628, + "step": 36940 + }, + { + "epoch": 0.0519182165144189, + "grad_norm": 1.182931900024414, + "learning_rate": 0.00015569903049037515, + "loss": 5.021, + "step": 36950 + }, + { + "epoch": 0.051932267452582476, + "grad_norm": 1.0970754623413086, + "learning_rate": 0.00015574118308275958, + "loss": 5.0209, + "step": 36960 + }, + { + "epoch": 0.05194631839074605, + "grad_norm": 1.10125732421875, + "learning_rate": 0.000155783335675144, + "loss": 4.963, + "step": 36970 + }, + { + "epoch": 0.05196036932890963, + "grad_norm": 1.1242049932479858, + "learning_rate": 0.00015582548826752847, + "loss": 5.0308, + "step": 36980 + }, + { + "epoch": 0.051974420267073204, + "grad_norm": 1.114256739616394, + "learning_rate": 0.00015586764085991287, + "loss": 5.0974, + "step": 36990 + }, + { + "epoch": 0.05198847120523679, + "grad_norm": 1.080618143081665, + "learning_rate": 0.0001559097934522973, + "loss": 4.9911, + "step": 37000 + }, + { + "epoch": 0.05200252214340036, + "grad_norm": 1.097937822341919, + "learning_rate": 0.00015595194604468173, + "loss": 4.9756, + "step": 37010 + }, + { + "epoch": 0.05201657308156394, + "grad_norm": 1.2937126159667969, + "learning_rate": 0.00015599409863706616, + "loss": 5.0298, + "step": 37020 + }, + { + "epoch": 0.052030624019727516, + "grad_norm": 1.1030606031417847, + "learning_rate": 0.0001560362512294506, + "loss": 4.9976, + "step": 37030 + }, + { + "epoch": 0.05204467495789109, + "grad_norm": 1.1693426370620728, + "learning_rate": 0.00015607840382183505, + "loss": 4.9024, + "step": 37040 + }, + { + "epoch": 0.052058725896054675, + "grad_norm": 1.1187068223953247, + "learning_rate": 0.00015612055641421946, + "loss": 5.0483, + "step": 37050 + }, + { + "epoch": 0.05207277683421825, + "grad_norm": 1.0532891750335693, + "learning_rate": 0.0001561627090066039, + "loss": 4.9193, + "step": 37060 + }, + { + "epoch": 
0.05208682777238183, + "grad_norm": 1.1207010746002197, + "learning_rate": 0.00015620486159898832, + "loss": 4.9848, + "step": 37070 + }, + { + "epoch": 0.052100878710545404, + "grad_norm": 1.0693873167037964, + "learning_rate": 0.00015624701419137275, + "loss": 4.9215, + "step": 37080 + }, + { + "epoch": 0.05211492964870898, + "grad_norm": 1.0835233926773071, + "learning_rate": 0.00015628916678375718, + "loss": 4.7552, + "step": 37090 + }, + { + "epoch": 0.05212898058687256, + "grad_norm": 1.124945878982544, + "learning_rate": 0.00015633131937614164, + "loss": 5.0005, + "step": 37100 + }, + { + "epoch": 0.05214303152503614, + "grad_norm": 1.0725970268249512, + "learning_rate": 0.00015637347196852607, + "loss": 4.9894, + "step": 37110 + }, + { + "epoch": 0.052157082463199715, + "grad_norm": 1.1235997676849365, + "learning_rate": 0.00015641562456091048, + "loss": 5.0398, + "step": 37120 + }, + { + "epoch": 0.05217113340136329, + "grad_norm": 1.1638944149017334, + "learning_rate": 0.0001564577771532949, + "loss": 4.8523, + "step": 37130 + }, + { + "epoch": 0.05218518433952687, + "grad_norm": 1.1262266635894775, + "learning_rate": 0.00015649992974567934, + "loss": 4.8287, + "step": 37140 + }, + { + "epoch": 0.05219923527769045, + "grad_norm": 1.1217573881149292, + "learning_rate": 0.00015654208233806377, + "loss": 5.0582, + "step": 37150 + }, + { + "epoch": 0.05221328621585403, + "grad_norm": 1.1533286571502686, + "learning_rate": 0.00015658423493044823, + "loss": 4.939, + "step": 37160 + }, + { + "epoch": 0.0522273371540176, + "grad_norm": 1.063332438468933, + "learning_rate": 0.00015662638752283266, + "loss": 5.0078, + "step": 37170 + }, + { + "epoch": 0.05224138809218118, + "grad_norm": 1.0666674375534058, + "learning_rate": 0.00015666854011521706, + "loss": 4.8933, + "step": 37180 + }, + { + "epoch": 0.052255439030344755, + "grad_norm": 1.1299502849578857, + "learning_rate": 0.0001567106927076015, + "loss": 4.9698, + "step": 37190 + }, + { + "epoch": 
0.05226948996850833, + "grad_norm": 1.0899701118469238, + "learning_rate": 0.00015675284529998592, + "loss": 5.1382, + "step": 37200 + }, + { + "epoch": 0.052283540906671915, + "grad_norm": 1.0890862941741943, + "learning_rate": 0.00015679499789237036, + "loss": 4.9112, + "step": 37210 + }, + { + "epoch": 0.05229759184483549, + "grad_norm": 1.1074151992797852, + "learning_rate": 0.00015683715048475481, + "loss": 4.9958, + "step": 37220 + }, + { + "epoch": 0.05231164278299907, + "grad_norm": 1.047777771949768, + "learning_rate": 0.00015687930307713925, + "loss": 4.8429, + "step": 37230 + }, + { + "epoch": 0.05232569372116264, + "grad_norm": 1.0324103832244873, + "learning_rate": 0.00015692145566952368, + "loss": 4.9171, + "step": 37240 + }, + { + "epoch": 0.05233974465932622, + "grad_norm": 1.043428897857666, + "learning_rate": 0.00015696360826190808, + "loss": 5.0088, + "step": 37250 + }, + { + "epoch": 0.0523537955974898, + "grad_norm": 1.107131004333496, + "learning_rate": 0.0001570057608542925, + "loss": 5.0575, + "step": 37260 + }, + { + "epoch": 0.05236784653565338, + "grad_norm": 1.1451491117477417, + "learning_rate": 0.00015704791344667694, + "loss": 4.9536, + "step": 37270 + }, + { + "epoch": 0.052381897473816955, + "grad_norm": 1.1125915050506592, + "learning_rate": 0.0001570900660390614, + "loss": 4.987, + "step": 37280 + }, + { + "epoch": 0.05239594841198053, + "grad_norm": 1.1330432891845703, + "learning_rate": 0.00015713221863144583, + "loss": 4.9512, + "step": 37290 + }, + { + "epoch": 0.05240999935014411, + "grad_norm": 1.0728533267974854, + "learning_rate": 0.00015717437122383026, + "loss": 5.0056, + "step": 37300 + }, + { + "epoch": 0.05242405028830769, + "grad_norm": 1.1220287084579468, + "learning_rate": 0.00015721652381621467, + "loss": 4.8007, + "step": 37310 + }, + { + "epoch": 0.052438101226471266, + "grad_norm": 1.086573839187622, + "learning_rate": 0.0001572586764085991, + "loss": 4.8958, + "step": 37320 + }, + { + "epoch": 
0.05245215216463484, + "grad_norm": 1.034942865371704, + "learning_rate": 0.00015730082900098353, + "loss": 4.9862, + "step": 37330 + }, + { + "epoch": 0.05246620310279842, + "grad_norm": 1.2076352834701538, + "learning_rate": 0.000157342981593368, + "loss": 5.0167, + "step": 37340 + }, + { + "epoch": 0.052480254040961995, + "grad_norm": 1.0755456686019897, + "learning_rate": 0.00015738513418575242, + "loss": 4.9192, + "step": 37350 + }, + { + "epoch": 0.05249430497912558, + "grad_norm": 1.0850402116775513, + "learning_rate": 0.00015742728677813685, + "loss": 5.0214, + "step": 37360 + }, + { + "epoch": 0.052508355917289154, + "grad_norm": 1.1084506511688232, + "learning_rate": 0.00015746943937052128, + "loss": 5.02, + "step": 37370 + }, + { + "epoch": 0.05252240685545273, + "grad_norm": 1.1227412223815918, + "learning_rate": 0.00015751159196290569, + "loss": 5.046, + "step": 37380 + }, + { + "epoch": 0.052536457793616306, + "grad_norm": 1.1785093545913696, + "learning_rate": 0.00015755374455529012, + "loss": 4.9965, + "step": 37390 + }, + { + "epoch": 0.05255050873177988, + "grad_norm": 1.1120465993881226, + "learning_rate": 0.00015759589714767458, + "loss": 4.9608, + "step": 37400 + }, + { + "epoch": 0.052564559669943466, + "grad_norm": 1.1275166273117065, + "learning_rate": 0.000157638049740059, + "loss": 4.9256, + "step": 37410 + }, + { + "epoch": 0.05257861060810704, + "grad_norm": 1.0952259302139282, + "learning_rate": 0.00015768020233244344, + "loss": 4.9027, + "step": 37420 + }, + { + "epoch": 0.05259266154627062, + "grad_norm": 1.084206223487854, + "learning_rate": 0.00015772235492482787, + "loss": 4.9846, + "step": 37430 + }, + { + "epoch": 0.052606712484434194, + "grad_norm": 1.1174198389053345, + "learning_rate": 0.00015776450751721227, + "loss": 4.8721, + "step": 37440 + }, + { + "epoch": 0.05262076342259777, + "grad_norm": 1.077741265296936, + "learning_rate": 0.0001578066601095967, + "loss": 4.9004, + "step": 37450 + }, + { + "epoch": 
0.05263481436076135, + "grad_norm": 1.1274837255477905, + "learning_rate": 0.00015784881270198116, + "loss": 4.9527, + "step": 37460 + }, + { + "epoch": 0.05264886529892493, + "grad_norm": 1.098956823348999, + "learning_rate": 0.0001578909652943656, + "loss": 4.9924, + "step": 37470 + }, + { + "epoch": 0.052662916237088506, + "grad_norm": 1.2196323871612549, + "learning_rate": 0.00015793311788675002, + "loss": 4.9223, + "step": 37480 + }, + { + "epoch": 0.05267696717525208, + "grad_norm": 1.065298318862915, + "learning_rate": 0.00015797527047913446, + "loss": 4.9821, + "step": 37490 + }, + { + "epoch": 0.05269101811341566, + "grad_norm": 1.0735142230987549, + "learning_rate": 0.00015801742307151891, + "loss": 5.0825, + "step": 37500 + }, + { + "epoch": 0.052705069051579234, + "grad_norm": 1.075454831123352, + "learning_rate": 0.0001580595756639033, + "loss": 4.9954, + "step": 37510 + }, + { + "epoch": 0.05271911998974282, + "grad_norm": 1.0807503461837769, + "learning_rate": 0.00015810172825628775, + "loss": 4.9428, + "step": 37520 + }, + { + "epoch": 0.05273317092790639, + "grad_norm": 1.1390740871429443, + "learning_rate": 0.00015814388084867218, + "loss": 4.9437, + "step": 37530 + }, + { + "epoch": 0.05274722186606997, + "grad_norm": 1.0902714729309082, + "learning_rate": 0.0001581860334410566, + "loss": 5.0031, + "step": 37540 + }, + { + "epoch": 0.052761272804233546, + "grad_norm": 1.0692888498306274, + "learning_rate": 0.00015822818603344104, + "loss": 4.8607, + "step": 37550 + }, + { + "epoch": 0.05277532374239712, + "grad_norm": 1.0641309022903442, + "learning_rate": 0.0001582703386258255, + "loss": 5.0244, + "step": 37560 + }, + { + "epoch": 0.052789374680560705, + "grad_norm": 1.0614022016525269, + "learning_rate": 0.00015831249121820988, + "loss": 4.937, + "step": 37570 + }, + { + "epoch": 0.05280342561872428, + "grad_norm": 1.0764083862304688, + "learning_rate": 0.00015835464381059434, + "loss": 4.8395, + "step": 37580 + }, + { + "epoch": 
0.05281747655688786, + "grad_norm": 1.091915488243103, + "learning_rate": 0.00015839679640297877, + "loss": 5.0053, + "step": 37590 + }, + { + "epoch": 0.052831527495051434, + "grad_norm": 1.3076285123825073, + "learning_rate": 0.0001584389489953632, + "loss": 4.8942, + "step": 37600 + }, + { + "epoch": 0.05284557843321501, + "grad_norm": 1.0910054445266724, + "learning_rate": 0.00015848110158774763, + "loss": 4.9173, + "step": 37610 + }, + { + "epoch": 0.05285962937137859, + "grad_norm": 1.0928317308425903, + "learning_rate": 0.0001585232541801321, + "loss": 5.0224, + "step": 37620 + }, + { + "epoch": 0.05287368030954217, + "grad_norm": 1.0733320713043213, + "learning_rate": 0.00015856540677251652, + "loss": 4.9854, + "step": 37630 + }, + { + "epoch": 0.052887731247705745, + "grad_norm": 1.0689090490341187, + "learning_rate": 0.00015860755936490092, + "loss": 4.9577, + "step": 37640 + }, + { + "epoch": 0.05290178218586932, + "grad_norm": 1.1283597946166992, + "learning_rate": 0.00015864971195728535, + "loss": 5.0415, + "step": 37650 + }, + { + "epoch": 0.0529158331240329, + "grad_norm": 1.1071979999542236, + "learning_rate": 0.00015869186454966979, + "loss": 5.0478, + "step": 37660 + }, + { + "epoch": 0.05292988406219648, + "grad_norm": 1.0887236595153809, + "learning_rate": 0.00015873401714205422, + "loss": 5.0003, + "step": 37670 + }, + { + "epoch": 0.05294393500036006, + "grad_norm": 1.1010711193084717, + "learning_rate": 0.00015877616973443868, + "loss": 4.8824, + "step": 37680 + }, + { + "epoch": 0.05295798593852363, + "grad_norm": 1.0766741037368774, + "learning_rate": 0.0001588183223268231, + "loss": 5.1363, + "step": 37690 + }, + { + "epoch": 0.05297203687668721, + "grad_norm": 1.0775905847549438, + "learning_rate": 0.0001588604749192075, + "loss": 4.9825, + "step": 37700 + }, + { + "epoch": 0.052986087814850785, + "grad_norm": 1.0823009014129639, + "learning_rate": 0.00015890262751159194, + "loss": 5.2041, + "step": 37710 + }, + { + "epoch": 
0.05300013875301437, + "grad_norm": 1.0547107458114624, + "learning_rate": 0.00015894478010397637, + "loss": 5.036, + "step": 37720 + }, + { + "epoch": 0.053014189691177944, + "grad_norm": 1.0810598134994507, + "learning_rate": 0.0001589869326963608, + "loss": 4.7697, + "step": 37730 + }, + { + "epoch": 0.05302824062934152, + "grad_norm": 1.0776020288467407, + "learning_rate": 0.00015902908528874526, + "loss": 4.95, + "step": 37740 + }, + { + "epoch": 0.0530422915675051, + "grad_norm": 1.068782925605774, + "learning_rate": 0.0001590712378811297, + "loss": 4.9632, + "step": 37750 + }, + { + "epoch": 0.05305634250566867, + "grad_norm": 1.0810242891311646, + "learning_rate": 0.0001591133904735141, + "loss": 5.0325, + "step": 37760 + }, + { + "epoch": 0.053070393443832256, + "grad_norm": 1.0585830211639404, + "learning_rate": 0.00015915554306589853, + "loss": 4.9651, + "step": 37770 + }, + { + "epoch": 0.05308444438199583, + "grad_norm": 1.0817153453826904, + "learning_rate": 0.00015919769565828296, + "loss": 5.022, + "step": 37780 + }, + { + "epoch": 0.05309849532015941, + "grad_norm": 1.1465531587600708, + "learning_rate": 0.0001592398482506674, + "loss": 4.9242, + "step": 37790 + }, + { + "epoch": 0.053112546258322985, + "grad_norm": 1.1970518827438354, + "learning_rate": 0.00015928200084305185, + "loss": 5.0169, + "step": 37800 + }, + { + "epoch": 0.05312659719648656, + "grad_norm": 1.0447371006011963, + "learning_rate": 0.00015932415343543628, + "loss": 5.0174, + "step": 37810 + }, + { + "epoch": 0.053140648134650144, + "grad_norm": 1.117707371711731, + "learning_rate": 0.0001593663060278207, + "loss": 4.9709, + "step": 37820 + }, + { + "epoch": 0.05315469907281372, + "grad_norm": 1.101830005645752, + "learning_rate": 0.00015940845862020512, + "loss": 4.9622, + "step": 37830 + }, + { + "epoch": 0.053168750010977296, + "grad_norm": 1.0951014757156372, + "learning_rate": 0.00015945061121258955, + "loss": 5.005, + "step": 37840 + }, + { + "epoch": 
0.05318280094914087, + "grad_norm": 1.0685999393463135, + "learning_rate": 0.00015949276380497398, + "loss": 4.9262, + "step": 37850 + }, + { + "epoch": 0.05319685188730445, + "grad_norm": 1.1599674224853516, + "learning_rate": 0.00015953491639735844, + "loss": 4.9453, + "step": 37860 + }, + { + "epoch": 0.053210902825468025, + "grad_norm": 1.1290907859802246, + "learning_rate": 0.00015957706898974287, + "loss": 4.8283, + "step": 37870 + }, + { + "epoch": 0.05322495376363161, + "grad_norm": 1.1260758638381958, + "learning_rate": 0.0001596192215821273, + "loss": 4.8132, + "step": 37880 + }, + { + "epoch": 0.053239004701795184, + "grad_norm": 1.4716039896011353, + "learning_rate": 0.0001596613741745117, + "loss": 5.0606, + "step": 37890 + }, + { + "epoch": 0.05325305563995876, + "grad_norm": 1.08821439743042, + "learning_rate": 0.00015970352676689613, + "loss": 5.085, + "step": 37900 + }, + { + "epoch": 0.053267106578122336, + "grad_norm": 1.0185142755508423, + "learning_rate": 0.00015974567935928056, + "loss": 4.9683, + "step": 37910 + }, + { + "epoch": 0.05328115751628591, + "grad_norm": 1.0454940795898438, + "learning_rate": 0.00015978783195166502, + "loss": 5.0988, + "step": 37920 + }, + { + "epoch": 0.053295208454449496, + "grad_norm": 1.085282325744629, + "learning_rate": 0.00015982998454404945, + "loss": 4.873, + "step": 37930 + }, + { + "epoch": 0.05330925939261307, + "grad_norm": 1.1121746301651, + "learning_rate": 0.00015987213713643389, + "loss": 4.97, + "step": 37940 + }, + { + "epoch": 0.05332331033077665, + "grad_norm": 1.072363257408142, + "learning_rate": 0.00015991428972881832, + "loss": 4.9087, + "step": 37950 + }, + { + "epoch": 0.053337361268940224, + "grad_norm": 1.0807424783706665, + "learning_rate": 0.00015995644232120272, + "loss": 4.9707, + "step": 37960 + }, + { + "epoch": 0.0533514122071038, + "grad_norm": 1.1155600547790527, + "learning_rate": 0.00015999859491358715, + "loss": 4.9794, + "step": 37970 + }, + { + "epoch": 
0.05336546314526738, + "grad_norm": 1.1688952445983887, + "learning_rate": 0.0001600407475059716, + "loss": 4.8757, + "step": 37980 + }, + { + "epoch": 0.05337951408343096, + "grad_norm": 1.0695239305496216, + "learning_rate": 0.00016008290009835604, + "loss": 4.9425, + "step": 37990 + }, + { + "epoch": 0.053393565021594536, + "grad_norm": 1.1101523637771606, + "learning_rate": 0.00016012505269074047, + "loss": 4.9533, + "step": 38000 + }, + { + "epoch": 0.05340761595975811, + "grad_norm": 1.0731979608535767, + "learning_rate": 0.0001601672052831249, + "loss": 4.9356, + "step": 38010 + }, + { + "epoch": 0.05342166689792169, + "grad_norm": 1.0717893838882446, + "learning_rate": 0.0001602093578755093, + "loss": 4.9014, + "step": 38020 + }, + { + "epoch": 0.05343571783608527, + "grad_norm": 1.1218794584274292, + "learning_rate": 0.00016025151046789374, + "loss": 5.0567, + "step": 38030 + }, + { + "epoch": 0.05344976877424885, + "grad_norm": 1.0864797830581665, + "learning_rate": 0.0001602936630602782, + "loss": 4.964, + "step": 38040 + }, + { + "epoch": 0.05346381971241242, + "grad_norm": 1.1077324151992798, + "learning_rate": 0.00016033581565266263, + "loss": 5.0164, + "step": 38050 + }, + { + "epoch": 0.053477870650576, + "grad_norm": 1.0500108003616333, + "learning_rate": 0.00016037796824504706, + "loss": 4.9392, + "step": 38060 + }, + { + "epoch": 0.053491921588739576, + "grad_norm": 1.052202820777893, + "learning_rate": 0.0001604201208374315, + "loss": 5.1295, + "step": 38070 + }, + { + "epoch": 0.05350597252690316, + "grad_norm": 1.0734517574310303, + "learning_rate": 0.00016046227342981592, + "loss": 4.9639, + "step": 38080 + }, + { + "epoch": 0.053520023465066735, + "grad_norm": 1.0562039613723755, + "learning_rate": 0.00016050442602220033, + "loss": 5.0704, + "step": 38090 + }, + { + "epoch": 0.05353407440323031, + "grad_norm": 1.0819730758666992, + "learning_rate": 0.00016054657861458478, + "loss": 4.9824, + "step": 38100 + }, + { + "epoch": 
0.05354812534139389, + "grad_norm": 1.0542058944702148, + "learning_rate": 0.00016058873120696922, + "loss": 5.0282, + "step": 38110 + }, + { + "epoch": 0.053562176279557464, + "grad_norm": 1.0968252420425415, + "learning_rate": 0.00016063088379935365, + "loss": 4.9974, + "step": 38120 + }, + { + "epoch": 0.05357622721772105, + "grad_norm": 1.1440109014511108, + "learning_rate": 0.00016067303639173808, + "loss": 5.0201, + "step": 38130 + }, + { + "epoch": 0.05359027815588462, + "grad_norm": 1.1223822832107544, + "learning_rate": 0.00016071518898412254, + "loss": 4.885, + "step": 38140 + }, + { + "epoch": 0.0536043290940482, + "grad_norm": 1.0383020639419556, + "learning_rate": 0.0001607573415765069, + "loss": 5.0449, + "step": 38150 + }, + { + "epoch": 0.053618380032211775, + "grad_norm": 1.1103941202163696, + "learning_rate": 0.00016079949416889137, + "loss": 5.1299, + "step": 38160 + }, + { + "epoch": 0.05363243097037535, + "grad_norm": 1.1310505867004395, + "learning_rate": 0.0001608416467612758, + "loss": 4.8367, + "step": 38170 + }, + { + "epoch": 0.05364648190853893, + "grad_norm": 1.1387687921524048, + "learning_rate": 0.00016088379935366023, + "loss": 4.9226, + "step": 38180 + }, + { + "epoch": 0.05366053284670251, + "grad_norm": 1.082069993019104, + "learning_rate": 0.00016092595194604466, + "loss": 5.0986, + "step": 38190 + }, + { + "epoch": 0.05367458378486609, + "grad_norm": 1.1030524969100952, + "learning_rate": 0.00016096810453842912, + "loss": 5.0223, + "step": 38200 + }, + { + "epoch": 0.05368863472302966, + "grad_norm": 1.0807095766067505, + "learning_rate": 0.00016101025713081355, + "loss": 4.998, + "step": 38210 + }, + { + "epoch": 0.05370268566119324, + "grad_norm": 1.0664376020431519, + "learning_rate": 0.00016105240972319796, + "loss": 5.0521, + "step": 38220 + }, + { + "epoch": 0.053716736599356815, + "grad_norm": 1.1031697988510132, + "learning_rate": 0.0001610945623155824, + "loss": 4.9556, + "step": 38230 + }, + { + "epoch": 
0.0537307875375204, + "grad_norm": 1.0721416473388672, + "learning_rate": 0.00016113671490796682, + "loss": 5.0203, + "step": 38240 + }, + { + "epoch": 0.053744838475683974, + "grad_norm": 1.0892760753631592, + "learning_rate": 0.00016117886750035125, + "loss": 4.9218, + "step": 38250 + }, + { + "epoch": 0.05375888941384755, + "grad_norm": 1.0493428707122803, + "learning_rate": 0.0001612210200927357, + "loss": 5.0262, + "step": 38260 + }, + { + "epoch": 0.05377294035201113, + "grad_norm": 1.0725332498550415, + "learning_rate": 0.00016126317268512014, + "loss": 5.0425, + "step": 38270 + }, + { + "epoch": 0.0537869912901747, + "grad_norm": 1.0133819580078125, + "learning_rate": 0.00016130532527750455, + "loss": 5.1757, + "step": 38280 + }, + { + "epoch": 0.053801042228338286, + "grad_norm": 1.0614526271820068, + "learning_rate": 0.00016134747786988898, + "loss": 4.8729, + "step": 38290 + }, + { + "epoch": 0.05381509316650186, + "grad_norm": 1.0696353912353516, + "learning_rate": 0.0001613896304622734, + "loss": 4.9481, + "step": 38300 + }, + { + "epoch": 0.05382914410466544, + "grad_norm": 1.1202436685562134, + "learning_rate": 0.00016143178305465784, + "loss": 4.9642, + "step": 38310 + }, + { + "epoch": 0.053843195042829015, + "grad_norm": 1.1082916259765625, + "learning_rate": 0.0001614739356470423, + "loss": 4.9846, + "step": 38320 + }, + { + "epoch": 0.05385724598099259, + "grad_norm": 1.0403755903244019, + "learning_rate": 0.00016151608823942673, + "loss": 5.0774, + "step": 38330 + }, + { + "epoch": 0.053871296919156174, + "grad_norm": 1.1000362634658813, + "learning_rate": 0.00016155824083181116, + "loss": 4.9838, + "step": 38340 + }, + { + "epoch": 0.05388534785731975, + "grad_norm": 1.0491472482681274, + "learning_rate": 0.00016160039342419556, + "loss": 4.971, + "step": 38350 + }, + { + "epoch": 0.053899398795483326, + "grad_norm": 1.093478798866272, + "learning_rate": 0.00016164254601658, + "loss": 4.9692, + "step": 38360 + }, + { + "epoch": 
0.0539134497336469, + "grad_norm": 1.0910667181015015, + "learning_rate": 0.00016168469860896443, + "loss": 4.9924, + "step": 38370 + }, + { + "epoch": 0.05392750067181048, + "grad_norm": 1.0395450592041016, + "learning_rate": 0.00016172685120134888, + "loss": 5.0483, + "step": 38380 + }, + { + "epoch": 0.05394155160997406, + "grad_norm": 1.1088807582855225, + "learning_rate": 0.00016176900379373331, + "loss": 5.0126, + "step": 38390 + }, + { + "epoch": 0.05395560254813764, + "grad_norm": 1.096949815750122, + "learning_rate": 0.00016181115638611775, + "loss": 4.8924, + "step": 38400 + }, + { + "epoch": 0.053969653486301214, + "grad_norm": 1.0980435609817505, + "learning_rate": 0.00016185330897850215, + "loss": 4.9031, + "step": 38410 + }, + { + "epoch": 0.05398370442446479, + "grad_norm": 1.1065305471420288, + "learning_rate": 0.00016189546157088658, + "loss": 4.9263, + "step": 38420 + }, + { + "epoch": 0.053997755362628366, + "grad_norm": 1.0549814701080322, + "learning_rate": 0.000161937614163271, + "loss": 4.9421, + "step": 38430 + }, + { + "epoch": 0.05401180630079195, + "grad_norm": 1.0844722986221313, + "learning_rate": 0.00016197976675565547, + "loss": 5.0158, + "step": 38440 + }, + { + "epoch": 0.054025857238955526, + "grad_norm": 1.0721558332443237, + "learning_rate": 0.0001620219193480399, + "loss": 4.9792, + "step": 38450 + }, + { + "epoch": 0.0540399081771191, + "grad_norm": 1.0752381086349487, + "learning_rate": 0.00016206407194042433, + "loss": 4.834, + "step": 38460 + }, + { + "epoch": 0.05405395911528268, + "grad_norm": 1.0965216159820557, + "learning_rate": 0.00016210622453280874, + "loss": 4.9676, + "step": 38470 + }, + { + "epoch": 0.054068010053446254, + "grad_norm": 1.094175934791565, + "learning_rate": 0.00016214837712519317, + "loss": 4.9671, + "step": 38480 + }, + { + "epoch": 0.05408206099160983, + "grad_norm": 1.0462054014205933, + "learning_rate": 0.0001621905297175776, + "loss": 5.075, + "step": 38490 + }, + { + "epoch": 
0.05409611192977341, + "grad_norm": 1.0655064582824707, + "learning_rate": 0.00016223268230996206, + "loss": 4.9576, + "step": 38500 + }, + { + "epoch": 0.05411016286793699, + "grad_norm": 1.075216293334961, + "learning_rate": 0.0001622748349023465, + "loss": 5.0349, + "step": 38510 + }, + { + "epoch": 0.054124213806100566, + "grad_norm": 1.0411499738693237, + "learning_rate": 0.00016231698749473092, + "loss": 5.0538, + "step": 38520 + }, + { + "epoch": 0.05413826474426414, + "grad_norm": 1.0397356748580933, + "learning_rate": 0.00016235914008711535, + "loss": 5.0188, + "step": 38530 + }, + { + "epoch": 0.05415231568242772, + "grad_norm": 1.0994830131530762, + "learning_rate": 0.00016240129267949976, + "loss": 4.9885, + "step": 38540 + }, + { + "epoch": 0.0541663666205913, + "grad_norm": 1.059415578842163, + "learning_rate": 0.0001624434452718842, + "loss": 4.868, + "step": 38550 + }, + { + "epoch": 0.05418041755875488, + "grad_norm": 1.1089651584625244, + "learning_rate": 0.00016248559786426864, + "loss": 4.8518, + "step": 38560 + }, + { + "epoch": 0.05419446849691845, + "grad_norm": 1.064829707145691, + "learning_rate": 0.00016252775045665308, + "loss": 4.9925, + "step": 38570 + }, + { + "epoch": 0.05420851943508203, + "grad_norm": 1.0721412897109985, + "learning_rate": 0.0001625699030490375, + "loss": 4.9596, + "step": 38580 + }, + { + "epoch": 0.054222570373245606, + "grad_norm": 1.1417285203933716, + "learning_rate": 0.00016261205564142194, + "loss": 4.8477, + "step": 38590 + }, + { + "epoch": 0.05423662131140919, + "grad_norm": 1.2103323936462402, + "learning_rate": 0.00016265420823380634, + "loss": 5.13, + "step": 38600 + }, + { + "epoch": 0.054250672249572765, + "grad_norm": 1.1253100633621216, + "learning_rate": 0.00016269636082619077, + "loss": 4.9663, + "step": 38610 + }, + { + "epoch": 0.05426472318773634, + "grad_norm": 1.0597470998764038, + "learning_rate": 0.00016273851341857523, + "loss": 4.9221, + "step": 38620 + }, + { + "epoch": 
0.05427877412589992, + "grad_norm": 1.0937083959579468, + "learning_rate": 0.00016278066601095966, + "loss": 4.8933, + "step": 38630 + }, + { + "epoch": 0.054292825064063494, + "grad_norm": 1.0493961572647095, + "learning_rate": 0.0001628228186033441, + "loss": 5.0548, + "step": 38640 + }, + { + "epoch": 0.05430687600222708, + "grad_norm": 1.0712189674377441, + "learning_rate": 0.00016286497119572853, + "loss": 4.8846, + "step": 38650 + }, + { + "epoch": 0.05432092694039065, + "grad_norm": 1.1186167001724243, + "learning_rate": 0.00016290712378811296, + "loss": 5.0001, + "step": 38660 + }, + { + "epoch": 0.05433497787855423, + "grad_norm": 1.1650162935256958, + "learning_rate": 0.00016294927638049736, + "loss": 4.8981, + "step": 38670 + }, + { + "epoch": 0.054349028816717805, + "grad_norm": 1.0327341556549072, + "learning_rate": 0.00016299142897288182, + "loss": 5.0457, + "step": 38680 + }, + { + "epoch": 0.05436307975488138, + "grad_norm": 1.051758050918579, + "learning_rate": 0.00016303358156526625, + "loss": 4.9659, + "step": 38690 + }, + { + "epoch": 0.054377130693044964, + "grad_norm": 1.1347402334213257, + "learning_rate": 0.00016307573415765068, + "loss": 4.9157, + "step": 38700 + }, + { + "epoch": 0.05439118163120854, + "grad_norm": 1.0630327463150024, + "learning_rate": 0.0001631178867500351, + "loss": 5.0936, + "step": 38710 + }, + { + "epoch": 0.05440523256937212, + "grad_norm": 1.104245662689209, + "learning_rate": 0.00016316003934241957, + "loss": 5.0357, + "step": 38720 + }, + { + "epoch": 0.05441928350753569, + "grad_norm": 1.0906322002410889, + "learning_rate": 0.00016320219193480395, + "loss": 4.9834, + "step": 38730 + }, + { + "epoch": 0.05443333444569927, + "grad_norm": 1.0565413236618042, + "learning_rate": 0.0001632443445271884, + "loss": 4.9378, + "step": 38740 + }, + { + "epoch": 0.05444738538386285, + "grad_norm": 1.0998698472976685, + "learning_rate": 0.00016328649711957284, + "loss": 5.0077, + "step": 38750 + }, + { + "epoch": 
0.05446143632202643, + "grad_norm": 1.1021455526351929, + "learning_rate": 0.00016332864971195727, + "loss": 4.9545, + "step": 38760 + }, + { + "epoch": 0.054475487260190004, + "grad_norm": 1.0735493898391724, + "learning_rate": 0.0001633708023043417, + "loss": 4.9852, + "step": 38770 + }, + { + "epoch": 0.05448953819835358, + "grad_norm": 1.0596339702606201, + "learning_rate": 0.00016341295489672616, + "loss": 4.919, + "step": 38780 + }, + { + "epoch": 0.05450358913651716, + "grad_norm": 1.1908012628555298, + "learning_rate": 0.0001634551074891106, + "loss": 4.8531, + "step": 38790 + }, + { + "epoch": 0.05451764007468073, + "grad_norm": 1.1102972030639648, + "learning_rate": 0.000163497260081495, + "loss": 4.8526, + "step": 38800 + }, + { + "epoch": 0.054531691012844316, + "grad_norm": 1.1045284271240234, + "learning_rate": 0.00016353941267387942, + "loss": 4.9537, + "step": 38810 + }, + { + "epoch": 0.05454574195100789, + "grad_norm": 1.0729451179504395, + "learning_rate": 0.00016358156526626386, + "loss": 4.9667, + "step": 38820 + }, + { + "epoch": 0.05455979288917147, + "grad_norm": 1.0459548234939575, + "learning_rate": 0.00016362371785864829, + "loss": 4.913, + "step": 38830 + }, + { + "epoch": 0.054573843827335045, + "grad_norm": 1.0615431070327759, + "learning_rate": 0.00016366587045103274, + "loss": 4.9522, + "step": 38840 + }, + { + "epoch": 0.05458789476549862, + "grad_norm": 1.1227502822875977, + "learning_rate": 0.00016370802304341718, + "loss": 4.9627, + "step": 38850 + }, + { + "epoch": 0.054601945703662204, + "grad_norm": 1.0452730655670166, + "learning_rate": 0.00016375017563580158, + "loss": 5.0616, + "step": 38860 + }, + { + "epoch": 0.05461599664182578, + "grad_norm": 0.998740017414093, + "learning_rate": 0.000163792328228186, + "loss": 4.9609, + "step": 38870 + }, + { + "epoch": 0.054630047579989356, + "grad_norm": 1.1312004327774048, + "learning_rate": 0.00016383448082057044, + "loss": 4.9495, + "step": 38880 + }, + { + "epoch": 
0.05464409851815293, + "grad_norm": 1.0797289609909058, + "learning_rate": 0.00016387663341295487, + "loss": 4.946, + "step": 38890 + }, + { + "epoch": 0.05465814945631651, + "grad_norm": 1.0702334642410278, + "learning_rate": 0.00016391878600533933, + "loss": 5.0044, + "step": 38900 + }, + { + "epoch": 0.05467220039448009, + "grad_norm": 1.0228101015090942, + "learning_rate": 0.00016396093859772376, + "loss": 4.8987, + "step": 38910 + }, + { + "epoch": 0.05468625133264367, + "grad_norm": 1.0430359840393066, + "learning_rate": 0.0001640030911901082, + "loss": 5.0504, + "step": 38920 + }, + { + "epoch": 0.054700302270807244, + "grad_norm": 1.065110206604004, + "learning_rate": 0.0001640452437824926, + "loss": 4.9139, + "step": 38930 + }, + { + "epoch": 0.05471435320897082, + "grad_norm": 1.1111505031585693, + "learning_rate": 0.00016408739637487703, + "loss": 4.9611, + "step": 38940 + }, + { + "epoch": 0.054728404147134396, + "grad_norm": 1.0788532495498657, + "learning_rate": 0.00016412954896726146, + "loss": 4.8959, + "step": 38950 + }, + { + "epoch": 0.05474245508529798, + "grad_norm": 1.0466620922088623, + "learning_rate": 0.00016417170155964592, + "loss": 5.1461, + "step": 38960 + }, + { + "epoch": 0.054756506023461556, + "grad_norm": 1.115582823753357, + "learning_rate": 0.00016421385415203035, + "loss": 4.9678, + "step": 38970 + }, + { + "epoch": 0.05477055696162513, + "grad_norm": 1.0731269121170044, + "learning_rate": 0.00016425600674441478, + "loss": 4.9063, + "step": 38980 + }, + { + "epoch": 0.05478460789978871, + "grad_norm": 1.081316351890564, + "learning_rate": 0.00016429815933679918, + "loss": 4.9726, + "step": 38990 + }, + { + "epoch": 0.054798658837952284, + "grad_norm": 1.0765340328216553, + "learning_rate": 0.00016434031192918362, + "loss": 4.8929, + "step": 39000 + }, + { + "epoch": 0.05481270977611587, + "grad_norm": 1.0560728311538696, + "learning_rate": 0.00016438246452156805, + "loss": 4.9395, + "step": 39010 + }, + { + "epoch": 
0.05482676071427944, + "grad_norm": 1.069467306137085, + "learning_rate": 0.0001644246171139525, + "loss": 5.0583, + "step": 39020 + }, + { + "epoch": 0.05484081165244302, + "grad_norm": 1.0423463582992554, + "learning_rate": 0.00016446676970633694, + "loss": 4.993, + "step": 39030 + }, + { + "epoch": 0.054854862590606596, + "grad_norm": 1.064470648765564, + "learning_rate": 0.00016450892229872137, + "loss": 5.0129, + "step": 39040 + }, + { + "epoch": 0.05486891352877017, + "grad_norm": 1.0736603736877441, + "learning_rate": 0.0001645510748911058, + "loss": 5.0073, + "step": 39050 + }, + { + "epoch": 0.054882964466933755, + "grad_norm": 1.045296549797058, + "learning_rate": 0.0001645932274834902, + "loss": 4.9443, + "step": 39060 + }, + { + "epoch": 0.05489701540509733, + "grad_norm": 1.1380881071090698, + "learning_rate": 0.00016463538007587463, + "loss": 4.9244, + "step": 39070 + }, + { + "epoch": 0.05491106634326091, + "grad_norm": 1.0342426300048828, + "learning_rate": 0.0001646775326682591, + "loss": 4.9206, + "step": 39080 + }, + { + "epoch": 0.05492511728142448, + "grad_norm": 1.0200796127319336, + "learning_rate": 0.00016471968526064352, + "loss": 5.0105, + "step": 39090 + }, + { + "epoch": 0.05493916821958806, + "grad_norm": 1.0826342105865479, + "learning_rate": 0.00016476183785302795, + "loss": 5.0702, + "step": 39100 + }, + { + "epoch": 0.054953219157751636, + "grad_norm": 1.0333572626113892, + "learning_rate": 0.00016480399044541239, + "loss": 4.8559, + "step": 39110 + }, + { + "epoch": 0.05496727009591522, + "grad_norm": 1.1156673431396484, + "learning_rate": 0.0001648461430377968, + "loss": 5.0019, + "step": 39120 + }, + { + "epoch": 0.054981321034078795, + "grad_norm": 1.07420814037323, + "learning_rate": 0.00016488829563018122, + "loss": 5.0687, + "step": 39130 + }, + { + "epoch": 0.05499537197224237, + "grad_norm": 1.0014334917068481, + "learning_rate": 0.00016493044822256568, + "loss": 5.0476, + "step": 39140 + }, + { + "epoch": 
0.05500942291040595, + "grad_norm": 1.0669128894805908, + "learning_rate": 0.0001649726008149501, + "loss": 4.8751, + "step": 39150 + }, + { + "epoch": 0.055023473848569524, + "grad_norm": 1.0790259838104248, + "learning_rate": 0.00016501475340733454, + "loss": 5.0306, + "step": 39160 + }, + { + "epoch": 0.05503752478673311, + "grad_norm": 1.0873439311981201, + "learning_rate": 0.00016505690599971897, + "loss": 4.8936, + "step": 39170 + }, + { + "epoch": 0.05505157572489668, + "grad_norm": 1.0448416471481323, + "learning_rate": 0.00016509905859210338, + "loss": 4.8749, + "step": 39180 + }, + { + "epoch": 0.05506562666306026, + "grad_norm": 1.0575209856033325, + "learning_rate": 0.0001651412111844878, + "loss": 4.9797, + "step": 39190 + }, + { + "epoch": 0.055079677601223835, + "grad_norm": 1.1213315725326538, + "learning_rate": 0.00016518336377687227, + "loss": 4.9521, + "step": 39200 + }, + { + "epoch": 0.05509372853938741, + "grad_norm": 1.0260167121887207, + "learning_rate": 0.0001652255163692567, + "loss": 5.0328, + "step": 39210 + }, + { + "epoch": 0.055107779477550994, + "grad_norm": 1.0591065883636475, + "learning_rate": 0.00016526766896164113, + "loss": 4.9218, + "step": 39220 + }, + { + "epoch": 0.05512183041571457, + "grad_norm": 1.0388381481170654, + "learning_rate": 0.00016530982155402556, + "loss": 4.9386, + "step": 39230 + }, + { + "epoch": 0.05513588135387815, + "grad_norm": 1.0415207147598267, + "learning_rate": 0.00016535197414641, + "loss": 4.8885, + "step": 39240 + }, + { + "epoch": 0.05514993229204172, + "grad_norm": 1.0508288145065308, + "learning_rate": 0.0001653941267387944, + "loss": 5.0216, + "step": 39250 + }, + { + "epoch": 0.0551639832302053, + "grad_norm": 1.0683437585830688, + "learning_rate": 0.00016543627933117885, + "loss": 4.9671, + "step": 39260 + }, + { + "epoch": 0.05517803416836888, + "grad_norm": 1.0968106985092163, + "learning_rate": 0.00016547843192356328, + "loss": 4.9947, + "step": 39270 + }, + { + "epoch": 
0.05519208510653246, + "grad_norm": 1.0676803588867188, + "learning_rate": 0.00016552058451594772, + "loss": 4.973, + "step": 39280 + }, + { + "epoch": 0.055206136044696034, + "grad_norm": 1.1953974962234497, + "learning_rate": 0.00016556273710833215, + "loss": 5.0037, + "step": 39290 + }, + { + "epoch": 0.05522018698285961, + "grad_norm": 1.0898711681365967, + "learning_rate": 0.0001656048897007166, + "loss": 4.8778, + "step": 39300 + }, + { + "epoch": 0.05523423792102319, + "grad_norm": 1.0796436071395874, + "learning_rate": 0.00016564282703386256, + "loss": 5.0486, + "step": 39310 + }, + { + "epoch": 0.05524828885918677, + "grad_norm": 1.0733755826950073, + "learning_rate": 0.000165684979626247, + "loss": 4.9972, + "step": 39320 + }, + { + "epoch": 0.055262339797350346, + "grad_norm": 1.0305414199829102, + "learning_rate": 0.00016572713221863145, + "loss": 5.0652, + "step": 39330 + }, + { + "epoch": 0.05527639073551392, + "grad_norm": 1.0890687704086304, + "learning_rate": 0.00016576928481101588, + "loss": 5.0277, + "step": 39340 + }, + { + "epoch": 0.0552904416736775, + "grad_norm": 1.1217213869094849, + "learning_rate": 0.00016581143740340029, + "loss": 4.8927, + "step": 39350 + }, + { + "epoch": 0.055304492611841075, + "grad_norm": 1.1358542442321777, + "learning_rate": 0.00016585358999578472, + "loss": 4.9728, + "step": 39360 + }, + { + "epoch": 0.05531854355000466, + "grad_norm": 1.036201000213623, + "learning_rate": 0.00016589574258816915, + "loss": 4.9836, + "step": 39370 + }, + { + "epoch": 0.055332594488168234, + "grad_norm": 1.0400617122650146, + "learning_rate": 0.0001659378951805536, + "loss": 4.9628, + "step": 39380 + }, + { + "epoch": 0.05534664542633181, + "grad_norm": 1.030198097229004, + "learning_rate": 0.00016598004777293804, + "loss": 4.8906, + "step": 39390 + }, + { + "epoch": 0.055360696364495386, + "grad_norm": 1.0506401062011719, + "learning_rate": 0.00016602220036532247, + "loss": 4.8948, + "step": 39400 + }, + { + "epoch": 
0.05537474730265896, + "grad_norm": 1.0474390983581543, + "learning_rate": 0.00016606435295770687, + "loss": 4.9137, + "step": 39410 + }, + { + "epoch": 0.05538879824082254, + "grad_norm": 1.0166150331497192, + "learning_rate": 0.0001661065055500913, + "loss": 5.0177, + "step": 39420 + }, + { + "epoch": 0.05540284917898612, + "grad_norm": 1.0387684106826782, + "learning_rate": 0.00016614865814247573, + "loss": 4.9143, + "step": 39430 + }, + { + "epoch": 0.0554169001171497, + "grad_norm": 1.0397957563400269, + "learning_rate": 0.0001661908107348602, + "loss": 4.9583, + "step": 39440 + }, + { + "epoch": 0.055430951055313274, + "grad_norm": 1.0370898246765137, + "learning_rate": 0.00016623296332724462, + "loss": 4.9651, + "step": 39450 + }, + { + "epoch": 0.05544500199347685, + "grad_norm": 1.0566965341567993, + "learning_rate": 0.00016627511591962906, + "loss": 5.0027, + "step": 39460 + }, + { + "epoch": 0.055459052931640426, + "grad_norm": 1.054910659790039, + "learning_rate": 0.0001663172685120135, + "loss": 5.0484, + "step": 39470 + }, + { + "epoch": 0.05547310386980401, + "grad_norm": 1.0622105598449707, + "learning_rate": 0.0001663594211043979, + "loss": 4.9111, + "step": 39480 + }, + { + "epoch": 0.055487154807967586, + "grad_norm": 1.0327736139297485, + "learning_rate": 0.00016640157369678232, + "loss": 5.0337, + "step": 39490 + }, + { + "epoch": 0.05550120574613116, + "grad_norm": 1.0651620626449585, + "learning_rate": 0.00016644372628916678, + "loss": 4.9398, + "step": 39500 + }, + { + "epoch": 0.05551525668429474, + "grad_norm": 1.0605753660202026, + "learning_rate": 0.0001664858788815512, + "loss": 5.0069, + "step": 39510 + }, + { + "epoch": 0.055529307622458314, + "grad_norm": 1.0352438688278198, + "learning_rate": 0.00016652803147393564, + "loss": 4.9148, + "step": 39520 + }, + { + "epoch": 0.0555433585606219, + "grad_norm": 1.1176104545593262, + "learning_rate": 0.00016657018406632007, + "loss": 4.9403, + "step": 39530 + }, + { + "epoch": 
0.05555740949878547, + "grad_norm": 1.0405285358428955, + "learning_rate": 0.00016661233665870448, + "loss": 4.8954, + "step": 39540 + }, + { + "epoch": 0.05557146043694905, + "grad_norm": 1.0079467296600342, + "learning_rate": 0.0001666544892510889, + "loss": 5.0099, + "step": 39550 + }, + { + "epoch": 0.055585511375112626, + "grad_norm": 1.0810550451278687, + "learning_rate": 0.00016669664184347337, + "loss": 4.9254, + "step": 39560 + }, + { + "epoch": 0.0555995623132762, + "grad_norm": 1.0852543115615845, + "learning_rate": 0.0001667387944358578, + "loss": 4.9133, + "step": 39570 + }, + { + "epoch": 0.055613613251439785, + "grad_norm": 1.074148178100586, + "learning_rate": 0.00016678094702824223, + "loss": 4.8821, + "step": 39580 + }, + { + "epoch": 0.05562766418960336, + "grad_norm": 1.2053712606430054, + "learning_rate": 0.00016682309962062666, + "loss": 4.8644, + "step": 39590 + }, + { + "epoch": 0.05564171512776694, + "grad_norm": 1.3233577013015747, + "learning_rate": 0.00016686525221301106, + "loss": 4.9788, + "step": 39600 + }, + { + "epoch": 0.05565576606593051, + "grad_norm": 1.0858327150344849, + "learning_rate": 0.0001669074048053955, + "loss": 4.9615, + "step": 39610 + }, + { + "epoch": 0.05566981700409409, + "grad_norm": 1.0294164419174194, + "learning_rate": 0.00016694955739777995, + "loss": 4.8376, + "step": 39620 + }, + { + "epoch": 0.05568386794225767, + "grad_norm": 1.046635627746582, + "learning_rate": 0.00016699170999016439, + "loss": 4.9348, + "step": 39630 + }, + { + "epoch": 0.05569791888042125, + "grad_norm": 1.0553175210952759, + "learning_rate": 0.00016703386258254882, + "loss": 5.0127, + "step": 39640 + }, + { + "epoch": 0.055711969818584825, + "grad_norm": 1.0525593757629395, + "learning_rate": 0.00016707601517493325, + "loss": 4.9244, + "step": 39650 + }, + { + "epoch": 0.0557260207567484, + "grad_norm": 1.049413800239563, + "learning_rate": 0.00016711816776731768, + "loss": 4.9705, + "step": 39660 + }, + { + "epoch": 
0.05574007169491198, + "grad_norm": 1.0601624250411987, + "learning_rate": 0.00016716032035970208, + "loss": 4.9637, + "step": 39670 + }, + { + "epoch": 0.05575412263307556, + "grad_norm": 1.0123969316482544, + "learning_rate": 0.00016720247295208654, + "loss": 4.9764, + "step": 39680 + }, + { + "epoch": 0.05576817357123914, + "grad_norm": 1.0407583713531494, + "learning_rate": 0.00016724462554447097, + "loss": 4.9696, + "step": 39690 + }, + { + "epoch": 0.05578222450940271, + "grad_norm": 1.0655407905578613, + "learning_rate": 0.0001672867781368554, + "loss": 4.955, + "step": 39700 + }, + { + "epoch": 0.05579627544756629, + "grad_norm": 1.0213630199432373, + "learning_rate": 0.00016732893072923983, + "loss": 4.8745, + "step": 39710 + }, + { + "epoch": 0.055810326385729865, + "grad_norm": 1.0620572566986084, + "learning_rate": 0.00016737108332162427, + "loss": 4.925, + "step": 39720 + }, + { + "epoch": 0.05582437732389344, + "grad_norm": 1.0573869943618774, + "learning_rate": 0.00016741323591400867, + "loss": 4.9061, + "step": 39730 + }, + { + "epoch": 0.055838428262057024, + "grad_norm": 1.0326967239379883, + "learning_rate": 0.00016745538850639313, + "loss": 5.0449, + "step": 39740 + }, + { + "epoch": 0.0558524792002206, + "grad_norm": 1.082334280014038, + "learning_rate": 0.00016749754109877756, + "loss": 4.9577, + "step": 39750 + }, + { + "epoch": 0.05586653013838418, + "grad_norm": 1.0817333459854126, + "learning_rate": 0.000167539693691162, + "loss": 4.9172, + "step": 39760 + }, + { + "epoch": 0.05588058107654775, + "grad_norm": 1.0093368291854858, + "learning_rate": 0.00016758184628354642, + "loss": 4.8782, + "step": 39770 + }, + { + "epoch": 0.05589463201471133, + "grad_norm": 1.0431386232376099, + "learning_rate": 0.00016762399887593085, + "loss": 4.9162, + "step": 39780 + }, + { + "epoch": 0.05590868295287491, + "grad_norm": 1.052478313446045, + "learning_rate": 0.0001676661514683153, + "loss": 5.017, + "step": 39790 + }, + { + "epoch": 
0.05592273389103849, + "grad_norm": 1.0899916887283325, + "learning_rate": 0.00016770830406069972, + "loss": 4.9324, + "step": 39800 + }, + { + "epoch": 0.055936784829202064, + "grad_norm": 1.0490303039550781, + "learning_rate": 0.00016775045665308415, + "loss": 4.9905, + "step": 39810 + }, + { + "epoch": 0.05595083576736564, + "grad_norm": 1.129417896270752, + "learning_rate": 0.00016779260924546858, + "loss": 4.8127, + "step": 39820 + }, + { + "epoch": 0.05596488670552922, + "grad_norm": 1.0525208711624146, + "learning_rate": 0.000167834761837853, + "loss": 4.9774, + "step": 39830 + }, + { + "epoch": 0.0559789376436928, + "grad_norm": 1.0649760961532593, + "learning_rate": 0.00016787691443023744, + "loss": 4.8925, + "step": 39840 + }, + { + "epoch": 0.055992988581856376, + "grad_norm": 1.0432103872299194, + "learning_rate": 0.0001679190670226219, + "loss": 5.1254, + "step": 39850 + }, + { + "epoch": 0.05600703952001995, + "grad_norm": 1.046905517578125, + "learning_rate": 0.0001679612196150063, + "loss": 5.0319, + "step": 39860 + }, + { + "epoch": 0.05602109045818353, + "grad_norm": 0.994096040725708, + "learning_rate": 0.00016800337220739073, + "loss": 5.0441, + "step": 39870 + }, + { + "epoch": 0.056035141396347105, + "grad_norm": 1.0326930284500122, + "learning_rate": 0.00016804552479977516, + "loss": 4.9293, + "step": 39880 + }, + { + "epoch": 0.05604919233451069, + "grad_norm": 1.0579875707626343, + "learning_rate": 0.0001680876773921596, + "loss": 4.9475, + "step": 39890 + }, + { + "epoch": 0.056063243272674264, + "grad_norm": 1.1440166234970093, + "learning_rate": 0.00016812982998454403, + "loss": 4.8159, + "step": 39900 + }, + { + "epoch": 0.05607729421083784, + "grad_norm": 1.0132042169570923, + "learning_rate": 0.00016817198257692849, + "loss": 5.0357, + "step": 39910 + }, + { + "epoch": 0.056091345149001416, + "grad_norm": 1.0793256759643555, + "learning_rate": 0.00016821413516931292, + "loss": 4.8232, + "step": 39920 + }, + { + "epoch": 
0.05610539608716499, + "grad_norm": 1.0349791049957275, + "learning_rate": 0.00016825628776169732, + "loss": 5.0093, + "step": 39930 + }, + { + "epoch": 0.056119447025328575, + "grad_norm": 1.054980754852295, + "learning_rate": 0.00016829844035408175, + "loss": 4.905, + "step": 39940 + }, + { + "epoch": 0.05613349796349215, + "grad_norm": 1.1767756938934326, + "learning_rate": 0.00016834059294646618, + "loss": 4.9174, + "step": 39950 + }, + { + "epoch": 0.05614754890165573, + "grad_norm": 1.0445729494094849, + "learning_rate": 0.00016838274553885064, + "loss": 4.9582, + "step": 39960 + }, + { + "epoch": 0.056161599839819304, + "grad_norm": 1.0728813409805298, + "learning_rate": 0.00016842489813123507, + "loss": 4.8951, + "step": 39970 + }, + { + "epoch": 0.05617565077798288, + "grad_norm": 1.0347257852554321, + "learning_rate": 0.0001684670507236195, + "loss": 4.9209, + "step": 39980 + }, + { + "epoch": 0.05618970171614646, + "grad_norm": 1.0311264991760254, + "learning_rate": 0.0001685092033160039, + "loss": 4.9572, + "step": 39990 + }, + { + "epoch": 0.05620375265431004, + "grad_norm": 1.029685378074646, + "learning_rate": 0.00016855135590838834, + "loss": 4.8857, + "step": 40000 + }, + { + "epoch": 0.056217803592473616, + "grad_norm": 1.0317046642303467, + "learning_rate": 0.00016859350850077277, + "loss": 4.9465, + "step": 40010 + }, + { + "epoch": 0.05623185453063719, + "grad_norm": 1.0499831438064575, + "learning_rate": 0.00016863566109315723, + "loss": 4.914, + "step": 40020 + }, + { + "epoch": 0.05624590546880077, + "grad_norm": 1.0336376428604126, + "learning_rate": 0.00016867781368554166, + "loss": 5.013, + "step": 40030 + }, + { + "epoch": 0.05625995640696435, + "grad_norm": 1.024095892906189, + "learning_rate": 0.0001687199662779261, + "loss": 4.8756, + "step": 40040 + }, + { + "epoch": 0.05627400734512793, + "grad_norm": 1.0317274332046509, + "learning_rate": 0.00016876211887031052, + "loss": 4.9187, + "step": 40050 + }, + { + "epoch": 
0.0562880582832915, + "grad_norm": 1.834438681602478, + "learning_rate": 0.00016880427146269493, + "loss": 4.8996, + "step": 40060 + }, + { + "epoch": 0.05630210922145508, + "grad_norm": 1.0885308980941772, + "learning_rate": 0.00016884642405507936, + "loss": 5.0043, + "step": 40070 + }, + { + "epoch": 0.056316160159618656, + "grad_norm": 1.1309927701950073, + "learning_rate": 0.00016888857664746381, + "loss": 4.9811, + "step": 40080 + }, + { + "epoch": 0.05633021109778223, + "grad_norm": 1.034248948097229, + "learning_rate": 0.00016893072923984825, + "loss": 4.9089, + "step": 40090 + }, + { + "epoch": 0.056344262035945815, + "grad_norm": 1.0103942155838013, + "learning_rate": 0.00016897288183223268, + "loss": 4.8929, + "step": 40100 + }, + { + "epoch": 0.05635831297410939, + "grad_norm": 1.0718656778335571, + "learning_rate": 0.0001690150344246171, + "loss": 4.9839, + "step": 40110 + }, + { + "epoch": 0.05637236391227297, + "grad_norm": 1.044500470161438, + "learning_rate": 0.0001690571870170015, + "loss": 4.8699, + "step": 40120 + }, + { + "epoch": 0.05638641485043654, + "grad_norm": 1.024745225906372, + "learning_rate": 0.00016909933960938594, + "loss": 5.0166, + "step": 40130 + }, + { + "epoch": 0.05640046578860012, + "grad_norm": 1.0210412740707397, + "learning_rate": 0.0001691414922017704, + "loss": 4.958, + "step": 40140 + }, + { + "epoch": 0.0564145167267637, + "grad_norm": 1.0404855012893677, + "learning_rate": 0.00016918364479415483, + "loss": 4.9962, + "step": 40150 + }, + { + "epoch": 0.05642856766492728, + "grad_norm": 1.0807390213012695, + "learning_rate": 0.00016922579738653926, + "loss": 4.9206, + "step": 40160 + }, + { + "epoch": 0.056442618603090855, + "grad_norm": 1.0216072797775269, + "learning_rate": 0.0001692679499789237, + "loss": 4.9689, + "step": 40170 + }, + { + "epoch": 0.05645666954125443, + "grad_norm": 1.0541982650756836, + "learning_rate": 0.00016931010257130813, + "loss": 4.9347, + "step": 40180 + }, + { + "epoch": 
0.05647072047941801, + "grad_norm": 1.072575569152832, + "learning_rate": 0.00016935225516369253, + "loss": 5.0225, + "step": 40190 + }, + { + "epoch": 0.05648477141758159, + "grad_norm": 1.0966130495071411, + "learning_rate": 0.000169394407756077, + "loss": 4.9683, + "step": 40200 + }, + { + "epoch": 0.05649882235574517, + "grad_norm": 1.0755553245544434, + "learning_rate": 0.00016943656034846142, + "loss": 4.8777, + "step": 40210 + }, + { + "epoch": 0.05651287329390874, + "grad_norm": 1.0782699584960938, + "learning_rate": 0.00016947871294084585, + "loss": 4.9303, + "step": 40220 + }, + { + "epoch": 0.05652692423207232, + "grad_norm": 1.0849295854568481, + "learning_rate": 0.00016952086553323028, + "loss": 4.9465, + "step": 40230 + }, + { + "epoch": 0.056540975170235895, + "grad_norm": 1.0700656175613403, + "learning_rate": 0.0001695630181256147, + "loss": 4.9312, + "step": 40240 + }, + { + "epoch": 0.05655502610839948, + "grad_norm": 1.053043246269226, + "learning_rate": 0.00016960517071799912, + "loss": 4.9405, + "step": 40250 + }, + { + "epoch": 0.056569077046563054, + "grad_norm": 1.0179575681686401, + "learning_rate": 0.00016964732331038358, + "loss": 4.91, + "step": 40260 + }, + { + "epoch": 0.05658312798472663, + "grad_norm": 1.0473299026489258, + "learning_rate": 0.000169689475902768, + "loss": 5.0573, + "step": 40270 + }, + { + "epoch": 0.05659717892289021, + "grad_norm": 1.0085793733596802, + "learning_rate": 0.00016973162849515244, + "loss": 4.9596, + "step": 40280 + }, + { + "epoch": 0.05661122986105378, + "grad_norm": 1.210898518562317, + "learning_rate": 0.00016977378108753687, + "loss": 4.9208, + "step": 40290 + }, + { + "epoch": 0.056625280799217366, + "grad_norm": 1.0420124530792236, + "learning_rate": 0.0001698159336799213, + "loss": 4.8712, + "step": 40300 + }, + { + "epoch": 0.05663933173738094, + "grad_norm": 1.0410534143447876, + "learning_rate": 0.0001698580862723057, + "loss": 4.8958, + "step": 40310 + }, + { + "epoch": 
0.05665338267554452, + "grad_norm": 1.0174304246902466, + "learning_rate": 0.00016990023886469016, + "loss": 5.0153, + "step": 40320 + }, + { + "epoch": 0.056667433613708094, + "grad_norm": 1.1811450719833374, + "learning_rate": 0.0001699423914570746, + "loss": 4.8823, + "step": 40330 + }, + { + "epoch": 0.05668148455187167, + "grad_norm": 1.0314950942993164, + "learning_rate": 0.00016998454404945903, + "loss": 5.0111, + "step": 40340 + }, + { + "epoch": 0.056695535490035254, + "grad_norm": 1.0767461061477661, + "learning_rate": 0.00017002669664184346, + "loss": 5.0005, + "step": 40350 + }, + { + "epoch": 0.05670958642819883, + "grad_norm": 1.0583289861679077, + "learning_rate": 0.0001700688492342279, + "loss": 4.9518, + "step": 40360 + }, + { + "epoch": 0.056723637366362406, + "grad_norm": 1.068487286567688, + "learning_rate": 0.00017011100182661235, + "loss": 5.0675, + "step": 40370 + }, + { + "epoch": 0.05673768830452598, + "grad_norm": 1.0626734495162964, + "learning_rate": 0.00017015315441899675, + "loss": 4.979, + "step": 40380 + }, + { + "epoch": 0.05675173924268956, + "grad_norm": 1.0374014377593994, + "learning_rate": 0.00017019530701138118, + "loss": 4.7829, + "step": 40390 + }, + { + "epoch": 0.056765790180853135, + "grad_norm": 1.0241491794586182, + "learning_rate": 0.0001702374596037656, + "loss": 4.9447, + "step": 40400 + }, + { + "epoch": 0.05677984111901672, + "grad_norm": 1.0468249320983887, + "learning_rate": 0.00017027961219615004, + "loss": 4.8091, + "step": 40410 + }, + { + "epoch": 0.056793892057180294, + "grad_norm": 1.0289359092712402, + "learning_rate": 0.00017032176478853447, + "loss": 4.9062, + "step": 40420 + }, + { + "epoch": 0.05680794299534387, + "grad_norm": 1.02260422706604, + "learning_rate": 0.00017036391738091893, + "loss": 4.9701, + "step": 40430 + }, + { + "epoch": 0.056821993933507446, + "grad_norm": 1.0522793531417847, + "learning_rate": 0.00017040606997330334, + "loss": 4.9831, + "step": 40440 + }, + { + "epoch": 
0.05683604487167102, + "grad_norm": 1.0370879173278809, + "learning_rate": 0.00017044822256568777, + "loss": 4.8978, + "step": 40450 + }, + { + "epoch": 0.056850095809834605, + "grad_norm": 1.0785061120986938, + "learning_rate": 0.0001704903751580722, + "loss": 4.914, + "step": 40460 + }, + { + "epoch": 0.05686414674799818, + "grad_norm": 1.0285074710845947, + "learning_rate": 0.00017053252775045663, + "loss": 5.0287, + "step": 40470 + }, + { + "epoch": 0.05687819768616176, + "grad_norm": 1.0587258338928223, + "learning_rate": 0.00017057468034284106, + "loss": 4.8259, + "step": 40480 + }, + { + "epoch": 0.056892248624325334, + "grad_norm": 1.0602926015853882, + "learning_rate": 0.00017061683293522552, + "loss": 4.9808, + "step": 40490 + }, + { + "epoch": 0.05690629956248891, + "grad_norm": 1.0697758197784424, + "learning_rate": 0.00017065898552760995, + "loss": 5.0533, + "step": 40500 + }, + { + "epoch": 0.05692035050065249, + "grad_norm": 1.0304670333862305, + "learning_rate": 0.00017070113811999436, + "loss": 5.0054, + "step": 40510 + }, + { + "epoch": 0.05693440143881607, + "grad_norm": 1.0418838262557983, + "learning_rate": 0.00017074329071237879, + "loss": 4.9798, + "step": 40520 + }, + { + "epoch": 0.056948452376979646, + "grad_norm": 0.9913653135299683, + "learning_rate": 0.00017078544330476322, + "loss": 4.9925, + "step": 40530 + }, + { + "epoch": 0.05696250331514322, + "grad_norm": 0.988510012626648, + "learning_rate": 0.00017082759589714768, + "loss": 4.9377, + "step": 40540 + }, + { + "epoch": 0.0569765542533068, + "grad_norm": 1.019424557685852, + "learning_rate": 0.0001708697484895321, + "loss": 5.0321, + "step": 40550 + }, + { + "epoch": 0.05699060519147038, + "grad_norm": 1.0009490251541138, + "learning_rate": 0.00017091190108191654, + "loss": 4.9732, + "step": 40560 + }, + { + "epoch": 0.05700465612963396, + "grad_norm": 1.026256799697876, + "learning_rate": 0.00017095405367430094, + "loss": 4.912, + "step": 40570 + }, + { + "epoch": 
0.05701870706779753, + "grad_norm": 1.0279102325439453, + "learning_rate": 0.00017099620626668537, + "loss": 4.9669, + "step": 40580 + }, + { + "epoch": 0.05703275800596111, + "grad_norm": 1.028504490852356, + "learning_rate": 0.0001710383588590698, + "loss": 4.9007, + "step": 40590 + }, + { + "epoch": 0.057046808944124686, + "grad_norm": 1.0499134063720703, + "learning_rate": 0.00017108051145145426, + "loss": 4.9829, + "step": 40600 + }, + { + "epoch": 0.05706085988228827, + "grad_norm": 1.0060728788375854, + "learning_rate": 0.0001711226640438387, + "loss": 4.9123, + "step": 40610 + }, + { + "epoch": 0.057074910820451845, + "grad_norm": 1.0461713075637817, + "learning_rate": 0.00017116481663622312, + "loss": 5.0092, + "step": 40620 + }, + { + "epoch": 0.05708896175861542, + "grad_norm": 1.0552021265029907, + "learning_rate": 0.00017120696922860756, + "loss": 4.8915, + "step": 40630 + }, + { + "epoch": 0.057103012696779, + "grad_norm": 1.0912543535232544, + "learning_rate": 0.00017124912182099196, + "loss": 4.9294, + "step": 40640 + }, + { + "epoch": 0.05711706363494257, + "grad_norm": 1.0685486793518066, + "learning_rate": 0.0001712912744133764, + "loss": 4.9328, + "step": 40650 + }, + { + "epoch": 0.057131114573106156, + "grad_norm": 1.093159556388855, + "learning_rate": 0.00017133342700576085, + "loss": 5.0754, + "step": 40660 + }, + { + "epoch": 0.05714516551126973, + "grad_norm": 1.0238051414489746, + "learning_rate": 0.00017137557959814528, + "loss": 4.8441, + "step": 40670 + }, + { + "epoch": 0.05715921644943331, + "grad_norm": 1.0513436794281006, + "learning_rate": 0.0001714177321905297, + "loss": 4.9075, + "step": 40680 + }, + { + "epoch": 0.057173267387596885, + "grad_norm": 1.0914078950881958, + "learning_rate": 0.00017145988478291414, + "loss": 4.9667, + "step": 40690 + }, + { + "epoch": 0.05718731832576046, + "grad_norm": 1.0058307647705078, + "learning_rate": 0.00017150203737529855, + "loss": 5.0005, + "step": 40700 + }, + { + "epoch": 
0.05720136926392404, + "grad_norm": 0.9954103827476501, + "learning_rate": 0.00017154418996768298, + "loss": 5.0039, + "step": 40710 + }, + { + "epoch": 0.05721542020208762, + "grad_norm": 1.0139034986495972, + "learning_rate": 0.00017158634256006744, + "loss": 4.9052, + "step": 40720 + }, + { + "epoch": 0.0572294711402512, + "grad_norm": 1.0923491716384888, + "learning_rate": 0.00017162849515245187, + "loss": 4.8337, + "step": 40730 + }, + { + "epoch": 0.05724352207841477, + "grad_norm": 1.025458574295044, + "learning_rate": 0.0001716706477448363, + "loss": 4.941, + "step": 40740 + }, + { + "epoch": 0.05725757301657835, + "grad_norm": 1.1214048862457275, + "learning_rate": 0.00017171280033722073, + "loss": 4.9764, + "step": 40750 + }, + { + "epoch": 0.057271623954741925, + "grad_norm": 1.0020577907562256, + "learning_rate": 0.00017175495292960516, + "loss": 4.9437, + "step": 40760 + }, + { + "epoch": 0.05728567489290551, + "grad_norm": 1.0177772045135498, + "learning_rate": 0.00017179710552198957, + "loss": 4.8961, + "step": 40770 + }, + { + "epoch": 0.057299725831069084, + "grad_norm": 1.029328465461731, + "learning_rate": 0.00017183925811437402, + "loss": 4.8494, + "step": 40780 + }, + { + "epoch": 0.05731377676923266, + "grad_norm": 1.0189824104309082, + "learning_rate": 0.00017188141070675845, + "loss": 4.912, + "step": 40790 + }, + { + "epoch": 0.05732782770739624, + "grad_norm": 0.9939380884170532, + "learning_rate": 0.00017192356329914289, + "loss": 4.8773, + "step": 40800 + }, + { + "epoch": 0.05734187864555981, + "grad_norm": 1.076353907585144, + "learning_rate": 0.00017196571589152732, + "loss": 4.9416, + "step": 40810 + }, + { + "epoch": 0.057355929583723396, + "grad_norm": 1.049857258796692, + "learning_rate": 0.00017200786848391175, + "loss": 4.8408, + "step": 40820 + }, + { + "epoch": 0.05736998052188697, + "grad_norm": 1.0945760011672974, + "learning_rate": 0.00017205002107629615, + "loss": 4.928, + "step": 40830 + }, + { + "epoch": 
0.05738403146005055, + "grad_norm": 1.0334876775741577, + "learning_rate": 0.0001720921736686806, + "loss": 5.0276, + "step": 40840 + }, + { + "epoch": 0.057398082398214124, + "grad_norm": 1.0239782333374023, + "learning_rate": 0.00017213432626106504, + "loss": 5.0475, + "step": 40850 + }, + { + "epoch": 0.0574121333363777, + "grad_norm": 1.0112613439559937, + "learning_rate": 0.00017217647885344947, + "loss": 5.0883, + "step": 40860 + }, + { + "epoch": 0.057426184274541284, + "grad_norm": 1.0588569641113281, + "learning_rate": 0.0001722186314458339, + "loss": 4.9827, + "step": 40870 + }, + { + "epoch": 0.05744023521270486, + "grad_norm": 1.0560933351516724, + "learning_rate": 0.00017226078403821834, + "loss": 4.953, + "step": 40880 + }, + { + "epoch": 0.057454286150868436, + "grad_norm": 1.08881413936615, + "learning_rate": 0.00017230293663060274, + "loss": 4.9457, + "step": 40890 + }, + { + "epoch": 0.05746833708903201, + "grad_norm": 1.0113881826400757, + "learning_rate": 0.0001723450892229872, + "loss": 5.0579, + "step": 40900 + }, + { + "epoch": 0.05748238802719559, + "grad_norm": 0.9996208548545837, + "learning_rate": 0.00017238724181537163, + "loss": 4.8815, + "step": 40910 + }, + { + "epoch": 0.05749643896535917, + "grad_norm": 1.0179857015609741, + "learning_rate": 0.00017242939440775606, + "loss": 4.8752, + "step": 40920 + }, + { + "epoch": 0.05751048990352275, + "grad_norm": 1.0453020334243774, + "learning_rate": 0.0001724715470001405, + "loss": 4.8984, + "step": 40930 + }, + { + "epoch": 0.057524540841686324, + "grad_norm": 1.0377174615859985, + "learning_rate": 0.00017251369959252492, + "loss": 4.9946, + "step": 40940 + }, + { + "epoch": 0.0575385917798499, + "grad_norm": 1.2559747695922852, + "learning_rate": 0.00017255585218490938, + "loss": 4.9518, + "step": 40950 + }, + { + "epoch": 0.057552642718013476, + "grad_norm": 1.0141798257827759, + "learning_rate": 0.00017259800477729378, + "loss": 4.9145, + "step": 40960 + }, + { + "epoch": 
0.05756669365617706, + "grad_norm": 1.0602003335952759, + "learning_rate": 0.00017264015736967822, + "loss": 4.8659, + "step": 40970 + }, + { + "epoch": 0.057580744594340635, + "grad_norm": 1.0627175569534302, + "learning_rate": 0.00017268230996206265, + "loss": 5.0545, + "step": 40980 + }, + { + "epoch": 0.05759479553250421, + "grad_norm": 1.0080777406692505, + "learning_rate": 0.00017272446255444708, + "loss": 4.936, + "step": 40990 + }, + { + "epoch": 0.05760884647066779, + "grad_norm": 1.0756409168243408, + "learning_rate": 0.0001727666151468315, + "loss": 4.8759, + "step": 41000 + }, + { + "epoch": 0.057622897408831364, + "grad_norm": 1.0565085411071777, + "learning_rate": 0.00017280876773921597, + "loss": 4.9084, + "step": 41010 + }, + { + "epoch": 0.05763694834699494, + "grad_norm": 0.9990535378456116, + "learning_rate": 0.00017285092033160037, + "loss": 4.8277, + "step": 41020 + }, + { + "epoch": 0.05765099928515852, + "grad_norm": 1.066185712814331, + "learning_rate": 0.0001728930729239848, + "loss": 4.906, + "step": 41030 + }, + { + "epoch": 0.0576650502233221, + "grad_norm": 0.9987258315086365, + "learning_rate": 0.00017293522551636923, + "loss": 4.863, + "step": 41040 + }, + { + "epoch": 0.057679101161485676, + "grad_norm": 1.011893391609192, + "learning_rate": 0.00017297737810875367, + "loss": 4.931, + "step": 41050 + }, + { + "epoch": 0.05769315209964925, + "grad_norm": 1.0528898239135742, + "learning_rate": 0.0001730195307011381, + "loss": 5.0409, + "step": 41060 + }, + { + "epoch": 0.05770720303781283, + "grad_norm": 1.0291653871536255, + "learning_rate": 0.00017306168329352255, + "loss": 4.87, + "step": 41070 + }, + { + "epoch": 0.05772125397597641, + "grad_norm": 1.029781460762024, + "learning_rate": 0.00017310383588590699, + "loss": 5.0709, + "step": 41080 + }, + { + "epoch": 0.05773530491413999, + "grad_norm": 1.0627702474594116, + "learning_rate": 0.0001731459884782914, + "loss": 4.8982, + "step": 41090 + }, + { + "epoch": 0.05774935585230356, 
+ "grad_norm": 1.027671456336975, + "learning_rate": 0.00017318814107067582, + "loss": 4.8947, + "step": 41100 + }, + { + "epoch": 0.05776340679046714, + "grad_norm": 1.0236762762069702, + "learning_rate": 0.00017323029366306025, + "loss": 4.9642, + "step": 41110 + }, + { + "epoch": 0.057777457728630716, + "grad_norm": 1.0269731283187866, + "learning_rate": 0.00017327244625544468, + "loss": 4.9839, + "step": 41120 + }, + { + "epoch": 0.0577915086667943, + "grad_norm": 1.002811074256897, + "learning_rate": 0.00017331459884782914, + "loss": 4.96, + "step": 41130 + }, + { + "epoch": 0.057805559604957875, + "grad_norm": 1.0268176794052124, + "learning_rate": 0.00017335675144021357, + "loss": 4.9521, + "step": 41140 + }, + { + "epoch": 0.05781961054312145, + "grad_norm": 1.0485950708389282, + "learning_rate": 0.00017339890403259798, + "loss": 4.9922, + "step": 41150 + }, + { + "epoch": 0.05783366148128503, + "grad_norm": 1.0000367164611816, + "learning_rate": 0.0001734410566249824, + "loss": 4.8602, + "step": 41160 + }, + { + "epoch": 0.0578477124194486, + "grad_norm": 1.0004191398620605, + "learning_rate": 0.00017348320921736684, + "loss": 4.8732, + "step": 41170 + }, + { + "epoch": 0.057861763357612186, + "grad_norm": 1.0630460977554321, + "learning_rate": 0.0001735253618097513, + "loss": 5.0591, + "step": 41180 + }, + { + "epoch": 0.05787581429577576, + "grad_norm": 1.069863200187683, + "learning_rate": 0.00017356751440213573, + "loss": 4.9227, + "step": 41190 + }, + { + "epoch": 0.05788986523393934, + "grad_norm": 1.0146735906600952, + "learning_rate": 0.00017360966699452016, + "loss": 4.9153, + "step": 41200 + }, + { + "epoch": 0.057903916172102915, + "grad_norm": 1.0278013944625854, + "learning_rate": 0.0001736518195869046, + "loss": 4.9406, + "step": 41210 + }, + { + "epoch": 0.05791796711026649, + "grad_norm": 1.0474538803100586, + "learning_rate": 0.000173693972179289, + "loss": 4.8249, + "step": 41220 + }, + { + "epoch": 0.057932018048430074, + "grad_norm": 
1.0575395822525024, + "learning_rate": 0.00017373612477167343, + "loss": 4.9606, + "step": 41230 + }, + { + "epoch": 0.05794606898659365, + "grad_norm": 1.0785536766052246, + "learning_rate": 0.00017377827736405788, + "loss": 4.9818, + "step": 41240 + }, + { + "epoch": 0.05796011992475723, + "grad_norm": 1.034358024597168, + "learning_rate": 0.00017382042995644232, + "loss": 5.0435, + "step": 41250 + }, + { + "epoch": 0.0579741708629208, + "grad_norm": 1.0407671928405762, + "learning_rate": 0.00017386258254882675, + "loss": 4.9362, + "step": 41260 + }, + { + "epoch": 0.05798822180108438, + "grad_norm": 1.1351573467254639, + "learning_rate": 0.00017390473514121118, + "loss": 4.9251, + "step": 41270 + }, + { + "epoch": 0.05800227273924796, + "grad_norm": 1.0201154947280884, + "learning_rate": 0.00017394688773359558, + "loss": 4.9085, + "step": 41280 + }, + { + "epoch": 0.05801632367741154, + "grad_norm": 1.0584053993225098, + "learning_rate": 0.00017398904032598, + "loss": 4.9704, + "step": 41290 + }, + { + "epoch": 0.058030374615575114, + "grad_norm": 1.0306909084320068, + "learning_rate": 0.00017403119291836447, + "loss": 4.9058, + "step": 41300 + }, + { + "epoch": 0.05804442555373869, + "grad_norm": 1.0459680557250977, + "learning_rate": 0.0001740733455107489, + "loss": 4.949, + "step": 41310 + }, + { + "epoch": 0.05805847649190227, + "grad_norm": 0.9976636171340942, + "learning_rate": 0.00017411549810313333, + "loss": 5.0384, + "step": 41320 + }, + { + "epoch": 0.05807252743006584, + "grad_norm": 1.01653254032135, + "learning_rate": 0.00017415765069551776, + "loss": 4.9653, + "step": 41330 + }, + { + "epoch": 0.058086578368229426, + "grad_norm": 1.0457319021224976, + "learning_rate": 0.0001741998032879022, + "loss": 4.849, + "step": 41340 + }, + { + "epoch": 0.058100629306393, + "grad_norm": 1.0464872121810913, + "learning_rate": 0.0001742419558802866, + "loss": 4.8891, + "step": 41350 + }, + { + "epoch": 0.05811468024455658, + "grad_norm": 1.0535956621170044, + 
"learning_rate": 0.00017428410847267106, + "loss": 4.988, + "step": 41360 + }, + { + "epoch": 0.058128731182720154, + "grad_norm": 1.0425530672073364, + "learning_rate": 0.0001743262610650555, + "loss": 4.816, + "step": 41370 + }, + { + "epoch": 0.05814278212088373, + "grad_norm": 1.0837699174880981, + "learning_rate": 0.00017436841365743992, + "loss": 4.9787, + "step": 41380 + }, + { + "epoch": 0.058156833059047314, + "grad_norm": 1.025530457496643, + "learning_rate": 0.00017441056624982435, + "loss": 4.9268, + "step": 41390 + }, + { + "epoch": 0.05817088399721089, + "grad_norm": 1.0290592908859253, + "learning_rate": 0.00017445271884220878, + "loss": 4.9541, + "step": 41400 + }, + { + "epoch": 0.058184934935374466, + "grad_norm": 1.0412163734436035, + "learning_rate": 0.0001744948714345932, + "loss": 4.938, + "step": 41410 + }, + { + "epoch": 0.05819898587353804, + "grad_norm": 1.013218879699707, + "learning_rate": 0.00017453702402697765, + "loss": 5.0052, + "step": 41420 + }, + { + "epoch": 0.05821303681170162, + "grad_norm": 1.0787619352340698, + "learning_rate": 0.00017457917661936208, + "loss": 4.8859, + "step": 41430 + }, + { + "epoch": 0.0582270877498652, + "grad_norm": 1.0490294694900513, + "learning_rate": 0.0001746213292117465, + "loss": 4.8447, + "step": 41440 + }, + { + "epoch": 0.05824113868802878, + "grad_norm": 1.0566773414611816, + "learning_rate": 0.00017466348180413094, + "loss": 5.0328, + "step": 41450 + }, + { + "epoch": 0.058255189626192354, + "grad_norm": 1.0242851972579956, + "learning_rate": 0.00017470563439651537, + "loss": 4.9068, + "step": 41460 + }, + { + "epoch": 0.05826924056435593, + "grad_norm": 1.0676864385604858, + "learning_rate": 0.00017474778698889983, + "loss": 4.9384, + "step": 41470 + }, + { + "epoch": 0.058283291502519506, + "grad_norm": 1.0332306623458862, + "learning_rate": 0.00017478993958128423, + "loss": 5.0788, + "step": 41480 + }, + { + "epoch": 0.05829734244068309, + "grad_norm": 0.9824172258377075, + 
"learning_rate": 0.00017483209217366866, + "loss": 4.8847, + "step": 41490 + }, + { + "epoch": 0.058311393378846665, + "grad_norm": 1.071233868598938, + "learning_rate": 0.0001748742447660531, + "loss": 4.8921, + "step": 41500 + }, + { + "epoch": 0.05832544431701024, + "grad_norm": 1.0408722162246704, + "learning_rate": 0.00017491639735843753, + "loss": 5.0539, + "step": 41510 + }, + { + "epoch": 0.05833949525517382, + "grad_norm": 1.0348032712936401, + "learning_rate": 0.00017495854995082196, + "loss": 4.8807, + "step": 41520 + }, + { + "epoch": 0.058353546193337394, + "grad_norm": 1.0092370510101318, + "learning_rate": 0.00017500070254320642, + "loss": 4.9697, + "step": 41530 + }, + { + "epoch": 0.05836759713150098, + "grad_norm": 1.1517890691757202, + "learning_rate": 0.00017504285513559082, + "loss": 4.9005, + "step": 41540 + }, + { + "epoch": 0.05838164806966455, + "grad_norm": 1.056350827217102, + "learning_rate": 0.00017508500772797525, + "loss": 4.9908, + "step": 41550 + }, + { + "epoch": 0.05839569900782813, + "grad_norm": 1.0199683904647827, + "learning_rate": 0.00017512716032035968, + "loss": 4.9737, + "step": 41560 + }, + { + "epoch": 0.058409749945991705, + "grad_norm": 1.0125006437301636, + "learning_rate": 0.0001751693129127441, + "loss": 4.989, + "step": 41570 + }, + { + "epoch": 0.05842380088415528, + "grad_norm": 1.0040638446807861, + "learning_rate": 0.00017521146550512854, + "loss": 5.0114, + "step": 41580 + }, + { + "epoch": 0.058437851822318865, + "grad_norm": 1.3754082918167114, + "learning_rate": 0.000175253618097513, + "loss": 5.0118, + "step": 41590 + }, + { + "epoch": 0.05845190276048244, + "grad_norm": 0.9892797470092773, + "learning_rate": 0.0001752957706898974, + "loss": 4.9575, + "step": 41600 + }, + { + "epoch": 0.05846595369864602, + "grad_norm": 1.073715090751648, + "learning_rate": 0.00017533792328228184, + "loss": 4.9162, + "step": 41610 + }, + { + "epoch": 0.05848000463680959, + "grad_norm": 1.0172522068023682, + 
"learning_rate": 0.00017538007587466627, + "loss": 5.0024, + "step": 41620 + }, + { + "epoch": 0.05849405557497317, + "grad_norm": 1.0679402351379395, + "learning_rate": 0.0001754222284670507, + "loss": 4.9627, + "step": 41630 + }, + { + "epoch": 0.058508106513136746, + "grad_norm": 1.0129616260528564, + "learning_rate": 0.00017546438105943513, + "loss": 4.9732, + "step": 41640 + }, + { + "epoch": 0.05852215745130033, + "grad_norm": 1.0495364665985107, + "learning_rate": 0.0001755065336518196, + "loss": 4.8889, + "step": 41650 + }, + { + "epoch": 0.058536208389463905, + "grad_norm": 0.9987136721611023, + "learning_rate": 0.00017554868624420402, + "loss": 4.9349, + "step": 41660 + }, + { + "epoch": 0.05855025932762748, + "grad_norm": 1.0522116422653198, + "learning_rate": 0.00017559083883658842, + "loss": 4.9909, + "step": 41670 + }, + { + "epoch": 0.05856431026579106, + "grad_norm": 1.0374841690063477, + "learning_rate": 0.00017563299142897286, + "loss": 4.9103, + "step": 41680 + }, + { + "epoch": 0.05857836120395463, + "grad_norm": 1.0144661664962769, + "learning_rate": 0.0001756751440213573, + "loss": 4.9681, + "step": 41690 + }, + { + "epoch": 0.058592412142118216, + "grad_norm": 1.0426274538040161, + "learning_rate": 0.00017571729661374172, + "loss": 5.0318, + "step": 41700 + }, + { + "epoch": 0.05860646308028179, + "grad_norm": 1.0269166231155396, + "learning_rate": 0.00017575944920612618, + "loss": 5.0346, + "step": 41710 + }, + { + "epoch": 0.05862051401844537, + "grad_norm": 1.0402437448501587, + "learning_rate": 0.0001758016017985106, + "loss": 4.8424, + "step": 41720 + }, + { + "epoch": 0.058634564956608945, + "grad_norm": 0.9999310970306396, + "learning_rate": 0.000175843754390895, + "loss": 4.8544, + "step": 41730 + }, + { + "epoch": 0.05864861589477252, + "grad_norm": 1.0584388971328735, + "learning_rate": 0.00017588590698327944, + "loss": 4.976, + "step": 41740 + }, + { + "epoch": 0.058662666832936104, + "grad_norm": 1.0145403146743774, + 
"learning_rate": 0.00017592805957566387, + "loss": 4.9392, + "step": 41750 + }, + { + "epoch": 0.05867671777109968, + "grad_norm": 1.0221564769744873, + "learning_rate": 0.00017597021216804833, + "loss": 4.9297, + "step": 41760 + }, + { + "epoch": 0.05869076870926326, + "grad_norm": 1.0751060247421265, + "learning_rate": 0.00017601236476043276, + "loss": 4.9083, + "step": 41770 + }, + { + "epoch": 0.05870481964742683, + "grad_norm": 1.022549033164978, + "learning_rate": 0.0001760545173528172, + "loss": 4.9086, + "step": 41780 + }, + { + "epoch": 0.05871887058559041, + "grad_norm": 1.0539672374725342, + "learning_rate": 0.00017609666994520163, + "loss": 4.8246, + "step": 41790 + }, + { + "epoch": 0.05873292152375399, + "grad_norm": 1.0099142789840698, + "learning_rate": 0.00017613882253758603, + "loss": 4.8999, + "step": 41800 + }, + { + "epoch": 0.05874697246191757, + "grad_norm": 1.0530145168304443, + "learning_rate": 0.00017617675987073204, + "loss": 4.9517, + "step": 41810 + }, + { + "epoch": 0.058761023400081144, + "grad_norm": 1.0349324941635132, + "learning_rate": 0.00017621891246311647, + "loss": 4.9144, + "step": 41820 + }, + { + "epoch": 0.05877507433824472, + "grad_norm": 0.9777777791023254, + "learning_rate": 0.00017626106505550087, + "loss": 4.8404, + "step": 41830 + }, + { + "epoch": 0.0587891252764083, + "grad_norm": 1.0137258768081665, + "learning_rate": 0.00017630321764788533, + "loss": 4.7486, + "step": 41840 + }, + { + "epoch": 0.05880317621457188, + "grad_norm": 0.9945560097694397, + "learning_rate": 0.00017634537024026976, + "loss": 4.9688, + "step": 41850 + }, + { + "epoch": 0.058817227152735456, + "grad_norm": 1.0335863828659058, + "learning_rate": 0.0001763875228326542, + "loss": 4.9175, + "step": 41860 + }, + { + "epoch": 0.05883127809089903, + "grad_norm": 1.0477795600891113, + "learning_rate": 0.00017642967542503863, + "loss": 5.0537, + "step": 41870 + }, + { + "epoch": 0.05884532902906261, + "grad_norm": 1.0379626750946045, + 
"learning_rate": 0.00017647182801742306, + "loss": 4.8798, + "step": 41880 + }, + { + "epoch": 0.058859379967226184, + "grad_norm": 1.0096701383590698, + "learning_rate": 0.00017651398060980752, + "loss": 4.9598, + "step": 41890 + }, + { + "epoch": 0.05887343090538977, + "grad_norm": 0.9835436940193176, + "learning_rate": 0.00017655613320219192, + "loss": 4.8876, + "step": 41900 + }, + { + "epoch": 0.058887481843553344, + "grad_norm": 1.0108128786087036, + "learning_rate": 0.00017659828579457635, + "loss": 4.9465, + "step": 41910 + }, + { + "epoch": 0.05890153278171692, + "grad_norm": 0.9913715720176697, + "learning_rate": 0.00017664043838696078, + "loss": 4.8998, + "step": 41920 + }, + { + "epoch": 0.058915583719880496, + "grad_norm": 1.0140135288238525, + "learning_rate": 0.0001766825909793452, + "loss": 4.9478, + "step": 41930 + }, + { + "epoch": 0.05892963465804407, + "grad_norm": 1.0244427919387817, + "learning_rate": 0.00017672474357172964, + "loss": 4.8803, + "step": 41940 + }, + { + "epoch": 0.05894368559620765, + "grad_norm": 1.0371630191802979, + "learning_rate": 0.0001767668961641141, + "loss": 4.9136, + "step": 41950 + }, + { + "epoch": 0.05895773653437123, + "grad_norm": 1.0342624187469482, + "learning_rate": 0.0001768090487564985, + "loss": 5.0633, + "step": 41960 + }, + { + "epoch": 0.05897178747253481, + "grad_norm": 1.0081278085708618, + "learning_rate": 0.00017685120134888294, + "loss": 4.972, + "step": 41970 + }, + { + "epoch": 0.058985838410698384, + "grad_norm": 1.0183650255203247, + "learning_rate": 0.00017689335394126737, + "loss": 4.8956, + "step": 41980 + }, + { + "epoch": 0.05899988934886196, + "grad_norm": 1.0174623727798462, + "learning_rate": 0.0001769355065336518, + "loss": 4.7951, + "step": 41990 + }, + { + "epoch": 0.059013940287025536, + "grad_norm": 1.0399575233459473, + "learning_rate": 0.00017697765912603623, + "loss": 4.8732, + "step": 42000 + }, + { + "epoch": 0.05902799122518912, + "grad_norm": 1.0714610815048218, + 
"learning_rate": 0.0001770198117184207, + "loss": 4.8844, + "step": 42010 + }, + { + "epoch": 0.059042042163352695, + "grad_norm": 1.07644522190094, + "learning_rate": 0.00017706196431080512, + "loss": 4.8854, + "step": 42020 + }, + { + "epoch": 0.05905609310151627, + "grad_norm": 1.018179178237915, + "learning_rate": 0.00017710411690318953, + "loss": 4.9124, + "step": 42030 + }, + { + "epoch": 0.05907014403967985, + "grad_norm": 1.0028572082519531, + "learning_rate": 0.00017714626949557396, + "loss": 5.0101, + "step": 42040 + }, + { + "epoch": 0.059084194977843424, + "grad_norm": 1.0452194213867188, + "learning_rate": 0.0001771884220879584, + "loss": 4.9129, + "step": 42050 + }, + { + "epoch": 0.05909824591600701, + "grad_norm": 0.9863782525062561, + "learning_rate": 0.00017723057468034282, + "loss": 4.961, + "step": 42060 + }, + { + "epoch": 0.05911229685417058, + "grad_norm": 1.0302046537399292, + "learning_rate": 0.00017727272727272728, + "loss": 4.896, + "step": 42070 + }, + { + "epoch": 0.05912634779233416, + "grad_norm": 0.994310200214386, + "learning_rate": 0.0001773148798651117, + "loss": 4.9747, + "step": 42080 + }, + { + "epoch": 0.059140398730497735, + "grad_norm": 1.021562933921814, + "learning_rate": 0.0001773570324574961, + "loss": 4.8488, + "step": 42090 + }, + { + "epoch": 0.05915444966866131, + "grad_norm": 1.034226655960083, + "learning_rate": 0.00017739918504988054, + "loss": 4.8405, + "step": 42100 + }, + { + "epoch": 0.059168500606824895, + "grad_norm": 1.0121209621429443, + "learning_rate": 0.00017744133764226497, + "loss": 4.9374, + "step": 42110 + }, + { + "epoch": 0.05918255154498847, + "grad_norm": 0.9862031936645508, + "learning_rate": 0.0001774834902346494, + "loss": 4.8789, + "step": 42120 + }, + { + "epoch": 0.05919660248315205, + "grad_norm": 1.0321632623672485, + "learning_rate": 0.00017752564282703386, + "loss": 4.9114, + "step": 42130 + }, + { + "epoch": 0.05921065342131562, + "grad_norm": 1.0482895374298096, + "learning_rate": 
0.0001775677954194183, + "loss": 4.8856, + "step": 42140 + }, + { + "epoch": 0.0592247043594792, + "grad_norm": 1.1908639669418335, + "learning_rate": 0.0001776099480118027, + "loss": 4.8354, + "step": 42150 + }, + { + "epoch": 0.05923875529764278, + "grad_norm": 1.1106269359588623, + "learning_rate": 0.00017765210060418713, + "loss": 4.8703, + "step": 42160 + }, + { + "epoch": 0.05925280623580636, + "grad_norm": 1.042508840560913, + "learning_rate": 0.00017769425319657156, + "loss": 4.8856, + "step": 42170 + }, + { + "epoch": 0.059266857173969935, + "grad_norm": 1.398681640625, + "learning_rate": 0.000177736405788956, + "loss": 4.9736, + "step": 42180 + }, + { + "epoch": 0.05928090811213351, + "grad_norm": 1.0562206506729126, + "learning_rate": 0.00017777855838134045, + "loss": 4.9827, + "step": 42190 + }, + { + "epoch": 0.05929495905029709, + "grad_norm": 0.977316677570343, + "learning_rate": 0.00017782071097372488, + "loss": 4.9283, + "step": 42200 + }, + { + "epoch": 0.05930900998846067, + "grad_norm": 1.0145175457000732, + "learning_rate": 0.0001778628635661093, + "loss": 4.9636, + "step": 42210 + }, + { + "epoch": 0.059323060926624246, + "grad_norm": 1.0313082933425903, + "learning_rate": 0.00017790501615849372, + "loss": 4.7409, + "step": 42220 + }, + { + "epoch": 0.05933711186478782, + "grad_norm": 1.0246168375015259, + "learning_rate": 0.00017794716875087815, + "loss": 4.9663, + "step": 42230 + }, + { + "epoch": 0.0593511628029514, + "grad_norm": 1.1417378187179565, + "learning_rate": 0.00017798932134326258, + "loss": 4.9773, + "step": 42240 + }, + { + "epoch": 0.059365213741114975, + "grad_norm": 1.0039445161819458, + "learning_rate": 0.00017803147393564704, + "loss": 4.9977, + "step": 42250 + }, + { + "epoch": 0.05937926467927855, + "grad_norm": 1.012376070022583, + "learning_rate": 0.00017807362652803147, + "loss": 4.8092, + "step": 42260 + }, + { + "epoch": 0.059393315617442134, + "grad_norm": 1.0877492427825928, + "learning_rate": 
0.0001781157791204159, + "loss": 4.9357, + "step": 42270 + }, + { + "epoch": 0.05940736655560571, + "grad_norm": 1.0365524291992188, + "learning_rate": 0.0001781579317128003, + "loss": 4.8717, + "step": 42280 + }, + { + "epoch": 0.05942141749376929, + "grad_norm": 0.9986197352409363, + "learning_rate": 0.00017820008430518474, + "loss": 4.9256, + "step": 42290 + }, + { + "epoch": 0.05943546843193286, + "grad_norm": 1.0150564908981323, + "learning_rate": 0.00017824223689756917, + "loss": 4.8873, + "step": 42300 + }, + { + "epoch": 0.05944951937009644, + "grad_norm": 1.0646934509277344, + "learning_rate": 0.00017828438948995362, + "loss": 4.8436, + "step": 42310 + }, + { + "epoch": 0.05946357030826002, + "grad_norm": 1.0046230554580688, + "learning_rate": 0.00017832654208233806, + "loss": 5.0225, + "step": 42320 + }, + { + "epoch": 0.0594776212464236, + "grad_norm": 1.0435608625411987, + "learning_rate": 0.0001783686946747225, + "loss": 4.9509, + "step": 42330 + }, + { + "epoch": 0.059491672184587174, + "grad_norm": 1.004775047302246, + "learning_rate": 0.00017841084726710692, + "loss": 4.8448, + "step": 42340 + }, + { + "epoch": 0.05950572312275075, + "grad_norm": 1.0181119441986084, + "learning_rate": 0.00017845299985949132, + "loss": 4.9456, + "step": 42350 + }, + { + "epoch": 0.05951977406091433, + "grad_norm": 1.0563257932662964, + "learning_rate": 0.00017849515245187575, + "loss": 4.938, + "step": 42360 + }, + { + "epoch": 0.05953382499907791, + "grad_norm": 1.2527403831481934, + "learning_rate": 0.0001785373050442602, + "loss": 4.8606, + "step": 42370 + }, + { + "epoch": 0.059547875937241486, + "grad_norm": 0.9892441630363464, + "learning_rate": 0.00017857945763664464, + "loss": 4.9537, + "step": 42380 + }, + { + "epoch": 0.05956192687540506, + "grad_norm": 0.9748817086219788, + "learning_rate": 0.00017862161022902907, + "loss": 4.9919, + "step": 42390 + }, + { + "epoch": 0.05957597781356864, + "grad_norm": 1.0507982969284058, + "learning_rate": 
0.0001786637628214135, + "loss": 4.9252, + "step": 42400 + }, + { + "epoch": 0.059590028751732214, + "grad_norm": 1.108594536781311, + "learning_rate": 0.0001787059154137979, + "loss": 4.9004, + "step": 42410 + }, + { + "epoch": 0.0596040796898958, + "grad_norm": 1.0259724855422974, + "learning_rate": 0.00017874806800618237, + "loss": 4.7901, + "step": 42420 + }, + { + "epoch": 0.059618130628059374, + "grad_norm": 1.0419907569885254, + "learning_rate": 0.0001787902205985668, + "loss": 4.9264, + "step": 42430 + }, + { + "epoch": 0.05963218156622295, + "grad_norm": 1.0113157033920288, + "learning_rate": 0.00017883237319095123, + "loss": 4.9435, + "step": 42440 + }, + { + "epoch": 0.059646232504386526, + "grad_norm": 1.0024772882461548, + "learning_rate": 0.00017887452578333566, + "loss": 5.0503, + "step": 42450 + }, + { + "epoch": 0.0596602834425501, + "grad_norm": 1.024277687072754, + "learning_rate": 0.0001789166783757201, + "loss": 5.0154, + "step": 42460 + }, + { + "epoch": 0.059674334380713685, + "grad_norm": 0.9590503573417664, + "learning_rate": 0.00017895883096810455, + "loss": 4.9712, + "step": 42470 + }, + { + "epoch": 0.05968838531887726, + "grad_norm": 1.0252444744110107, + "learning_rate": 0.00017900098356048895, + "loss": 4.8889, + "step": 42480 + }, + { + "epoch": 0.05970243625704084, + "grad_norm": 0.9953712821006775, + "learning_rate": 0.00017904313615287339, + "loss": 5.0119, + "step": 42490 + }, + { + "epoch": 0.059716487195204414, + "grad_norm": 1.018610954284668, + "learning_rate": 0.00017908528874525782, + "loss": 4.8525, + "step": 42500 + }, + { + "epoch": 0.05973053813336799, + "grad_norm": 0.9951494336128235, + "learning_rate": 0.00017912744133764225, + "loss": 4.9818, + "step": 42510 + }, + { + "epoch": 0.05974458907153157, + "grad_norm": 1.0133929252624512, + "learning_rate": 0.00017916959393002668, + "loss": 5.0259, + "step": 42520 + }, + { + "epoch": 0.05975864000969515, + "grad_norm": 0.9819827079772949, + "learning_rate": 
0.00017921174652241114, + "loss": 4.9843, + "step": 42530 + }, + { + "epoch": 0.059772690947858725, + "grad_norm": 0.9785075187683105, + "learning_rate": 0.00017925389911479554, + "loss": 4.9552, + "step": 42540 + }, + { + "epoch": 0.0597867418860223, + "grad_norm": 1.0113465785980225, + "learning_rate": 0.00017929605170717997, + "loss": 4.9311, + "step": 42550 + }, + { + "epoch": 0.05980079282418588, + "grad_norm": 0.9836959838867188, + "learning_rate": 0.0001793382042995644, + "loss": 4.84, + "step": 42560 + }, + { + "epoch": 0.05981484376234946, + "grad_norm": 0.9870015382766724, + "learning_rate": 0.00017938035689194884, + "loss": 4.9337, + "step": 42570 + }, + { + "epoch": 0.05982889470051304, + "grad_norm": 0.9871739745140076, + "learning_rate": 0.00017942250948433327, + "loss": 4.8171, + "step": 42580 + }, + { + "epoch": 0.05984294563867661, + "grad_norm": 0.9959299564361572, + "learning_rate": 0.00017946466207671772, + "loss": 4.8464, + "step": 42590 + }, + { + "epoch": 0.05985699657684019, + "grad_norm": 1.0436452627182007, + "learning_rate": 0.00017950681466910216, + "loss": 4.941, + "step": 42600 + }, + { + "epoch": 0.059871047515003765, + "grad_norm": 1.0525795221328735, + "learning_rate": 0.00017954896726148656, + "loss": 4.8133, + "step": 42610 + }, + { + "epoch": 0.05988509845316734, + "grad_norm": 1.0073230266571045, + "learning_rate": 0.000179591119853871, + "loss": 4.8356, + "step": 42620 + }, + { + "epoch": 0.059899149391330925, + "grad_norm": 0.9857484102249146, + "learning_rate": 0.00017963327244625542, + "loss": 4.8508, + "step": 42630 + }, + { + "epoch": 0.0599132003294945, + "grad_norm": 1.013698935508728, + "learning_rate": 0.00017967542503863985, + "loss": 4.806, + "step": 42640 + }, + { + "epoch": 0.05992725126765808, + "grad_norm": 1.013667106628418, + "learning_rate": 0.0001797175776310243, + "loss": 4.9422, + "step": 42650 + }, + { + "epoch": 0.05994130220582165, + "grad_norm": 1.0081895589828491, + "learning_rate": 
0.00017975973022340874, + "loss": 4.9618, + "step": 42660 + }, + { + "epoch": 0.05995535314398523, + "grad_norm": 1.0109422206878662, + "learning_rate": 0.00017980188281579315, + "loss": 4.8748, + "step": 42670 + }, + { + "epoch": 0.05996940408214881, + "grad_norm": 1.035300374031067, + "learning_rate": 0.00017984403540817758, + "loss": 4.8972, + "step": 42680 + }, + { + "epoch": 0.05998345502031239, + "grad_norm": 0.994915783405304, + "learning_rate": 0.000179886188000562, + "loss": 4.889, + "step": 42690 + }, + { + "epoch": 0.059997505958475965, + "grad_norm": 1.0624393224716187, + "learning_rate": 0.00017992834059294644, + "loss": 4.9058, + "step": 42700 + }, + { + "epoch": 0.06001155689663954, + "grad_norm": 1.0263959169387817, + "learning_rate": 0.0001799704931853309, + "loss": 4.8691, + "step": 42710 + }, + { + "epoch": 0.06002560783480312, + "grad_norm": 1.0453038215637207, + "learning_rate": 0.00018001264577771533, + "loss": 4.6771, + "step": 42720 + }, + { + "epoch": 0.0600396587729667, + "grad_norm": 1.070002794265747, + "learning_rate": 0.00018005479837009973, + "loss": 4.9346, + "step": 42730 + }, + { + "epoch": 0.060053709711130276, + "grad_norm": 1.0732343196868896, + "learning_rate": 0.00018009695096248417, + "loss": 5.021, + "step": 42740 + }, + { + "epoch": 0.06006776064929385, + "grad_norm": 1.127801775932312, + "learning_rate": 0.0001801391035548686, + "loss": 4.9084, + "step": 42750 + }, + { + "epoch": 0.06008181158745743, + "grad_norm": 1.0170589685440063, + "learning_rate": 0.00018018125614725303, + "loss": 4.9463, + "step": 42760 + }, + { + "epoch": 0.060095862525621005, + "grad_norm": 1.0990118980407715, + "learning_rate": 0.00018022340873963749, + "loss": 4.9675, + "step": 42770 + }, + { + "epoch": 0.06010991346378459, + "grad_norm": 1.0004825592041016, + "learning_rate": 0.00018026556133202192, + "loss": 5.0224, + "step": 42780 + }, + { + "epoch": 0.060123964401948164, + "grad_norm": 1.0344843864440918, + "learning_rate": 
0.00018030771392440635, + "loss": 4.8195, + "step": 42790 + }, + { + "epoch": 0.06013801534011174, + "grad_norm": 1.0126923322677612, + "learning_rate": 0.00018034986651679075, + "loss": 4.9871, + "step": 42800 + }, + { + "epoch": 0.06015206627827532, + "grad_norm": 1.0566537380218506, + "learning_rate": 0.00018039201910917518, + "loss": 4.959, + "step": 42810 + }, + { + "epoch": 0.06016611721643889, + "grad_norm": 1.052660584449768, + "learning_rate": 0.00018043417170155961, + "loss": 4.995, + "step": 42820 + }, + { + "epoch": 0.060180168154602476, + "grad_norm": 1.0248833894729614, + "learning_rate": 0.00018047632429394407, + "loss": 4.8879, + "step": 42830 + }, + { + "epoch": 0.06019421909276605, + "grad_norm": 1.013227939605713, + "learning_rate": 0.0001805184768863285, + "loss": 4.9213, + "step": 42840 + }, + { + "epoch": 0.06020827003092963, + "grad_norm": 1.00045645236969, + "learning_rate": 0.00018056062947871294, + "loss": 5.0058, + "step": 42850 + }, + { + "epoch": 0.060222320969093204, + "grad_norm": 0.959171712398529, + "learning_rate": 0.00018060278207109734, + "loss": 4.9514, + "step": 42860 + }, + { + "epoch": 0.06023637190725678, + "grad_norm": 1.0098928213119507, + "learning_rate": 0.00018064493466348177, + "loss": 4.9365, + "step": 42870 + }, + { + "epoch": 0.060250422845420364, + "grad_norm": 1.0477204322814941, + "learning_rate": 0.0001806870872558662, + "loss": 4.9944, + "step": 42880 + }, + { + "epoch": 0.06026447378358394, + "grad_norm": 1.0132365226745605, + "learning_rate": 0.00018072923984825066, + "loss": 4.8793, + "step": 42890 + }, + { + "epoch": 0.060278524721747516, + "grad_norm": 1.0186618566513062, + "learning_rate": 0.0001807713924406351, + "loss": 4.9529, + "step": 42900 + }, + { + "epoch": 0.06029257565991109, + "grad_norm": 1.0929832458496094, + "learning_rate": 0.00018081354503301952, + "loss": 4.9407, + "step": 42910 + }, + { + "epoch": 0.06030662659807467, + "grad_norm": 1.0314265489578247, + "learning_rate": 
0.00018085569762540395, + "loss": 4.819, + "step": 42920 + }, + { + "epoch": 0.060320677536238244, + "grad_norm": 0.9774079918861389, + "learning_rate": 0.00018089785021778836, + "loss": 4.9518, + "step": 42930 + }, + { + "epoch": 0.06033472847440183, + "grad_norm": 0.9911931157112122, + "learning_rate": 0.0001809400028101728, + "loss": 5.0276, + "step": 42940 + }, + { + "epoch": 0.060348779412565404, + "grad_norm": 1.0376858711242676, + "learning_rate": 0.00018098215540255725, + "loss": 4.8859, + "step": 42950 + }, + { + "epoch": 0.06036283035072898, + "grad_norm": 0.9969538450241089, + "learning_rate": 0.00018102430799494168, + "loss": 4.9157, + "step": 42960 + }, + { + "epoch": 0.060376881288892556, + "grad_norm": 1.0148366689682007, + "learning_rate": 0.0001810664605873261, + "loss": 4.942, + "step": 42970 + }, + { + "epoch": 0.06039093222705613, + "grad_norm": 0.9672243595123291, + "learning_rate": 0.00018110861317971054, + "loss": 4.9629, + "step": 42980 + }, + { + "epoch": 0.060404983165219715, + "grad_norm": 1.0580873489379883, + "learning_rate": 0.00018115076577209494, + "loss": 4.8622, + "step": 42990 + }, + { + "epoch": 0.06041903410338329, + "grad_norm": 1.0431230068206787, + "learning_rate": 0.0001811929183644794, + "loss": 4.8916, + "step": 43000 + }, + { + "epoch": 0.06043308504154687, + "grad_norm": 1.0647096633911133, + "learning_rate": 0.00018123507095686383, + "loss": 4.9537, + "step": 43010 + }, + { + "epoch": 0.060447135979710444, + "grad_norm": 1.0209344625473022, + "learning_rate": 0.00018127722354924826, + "loss": 4.8759, + "step": 43020 + }, + { + "epoch": 0.06046118691787402, + "grad_norm": 1.0193747282028198, + "learning_rate": 0.0001813193761416327, + "loss": 4.8229, + "step": 43030 + }, + { + "epoch": 0.0604752378560376, + "grad_norm": 1.0239225625991821, + "learning_rate": 0.00018136152873401713, + "loss": 4.9135, + "step": 43040 + }, + { + "epoch": 0.06048928879420118, + "grad_norm": 1.0313153266906738, + "learning_rate": 
0.00018140368132640159, + "loss": 4.9518, + "step": 43050 + }, + { + "epoch": 0.060503339732364755, + "grad_norm": 1.0050175189971924, + "learning_rate": 0.000181445833918786, + "loss": 4.9435, + "step": 43060 + }, + { + "epoch": 0.06051739067052833, + "grad_norm": 0.9589523077011108, + "learning_rate": 0.00018148798651117042, + "loss": 4.8134, + "step": 43070 + }, + { + "epoch": 0.06053144160869191, + "grad_norm": 0.9744222164154053, + "learning_rate": 0.00018153013910355485, + "loss": 4.9269, + "step": 43080 + }, + { + "epoch": 0.06054549254685549, + "grad_norm": 0.9929285049438477, + "learning_rate": 0.00018157229169593928, + "loss": 4.9625, + "step": 43090 + }, + { + "epoch": 0.06055954348501907, + "grad_norm": 1.0369280576705933, + "learning_rate": 0.00018161444428832371, + "loss": 4.9153, + "step": 43100 + }, + { + "epoch": 0.06057359442318264, + "grad_norm": 0.994658350944519, + "learning_rate": 0.00018165659688070817, + "loss": 4.9614, + "step": 43110 + }, + { + "epoch": 0.06058764536134622, + "grad_norm": 1.1135982275009155, + "learning_rate": 0.00018169874947309258, + "loss": 4.897, + "step": 43120 + }, + { + "epoch": 0.060601696299509795, + "grad_norm": 0.9817977547645569, + "learning_rate": 0.000181740902065477, + "loss": 4.9774, + "step": 43130 + }, + { + "epoch": 0.06061574723767338, + "grad_norm": 1.008264422416687, + "learning_rate": 0.00018178305465786144, + "loss": 4.8703, + "step": 43140 + }, + { + "epoch": 0.060629798175836955, + "grad_norm": 1.0090116262435913, + "learning_rate": 0.00018182520725024587, + "loss": 4.7993, + "step": 43150 + }, + { + "epoch": 0.06064384911400053, + "grad_norm": 1.018884301185608, + "learning_rate": 0.0001818673598426303, + "loss": 5.0122, + "step": 43160 + }, + { + "epoch": 0.06065790005216411, + "grad_norm": 1.0300415754318237, + "learning_rate": 0.00018190951243501476, + "loss": 4.8523, + "step": 43170 + }, + { + "epoch": 0.06067195099032768, + "grad_norm": 1.0521844625473022, + "learning_rate": 
0.0001819516650273992, + "loss": 4.8799, + "step": 43180 + }, + { + "epoch": 0.060686001928491266, + "grad_norm": 1.026895523071289, + "learning_rate": 0.0001819938176197836, + "loss": 4.9328, + "step": 43190 + }, + { + "epoch": 0.06070005286665484, + "grad_norm": 1.0086530447006226, + "learning_rate": 0.00018203597021216803, + "loss": 4.8162, + "step": 43200 + }, + { + "epoch": 0.06071410380481842, + "grad_norm": 1.0305287837982178, + "learning_rate": 0.00018207812280455246, + "loss": 4.858, + "step": 43210 + }, + { + "epoch": 0.060728154742981995, + "grad_norm": 0.997772216796875, + "learning_rate": 0.0001821202753969369, + "loss": 4.9303, + "step": 43220 + }, + { + "epoch": 0.06074220568114557, + "grad_norm": 1.0172392129898071, + "learning_rate": 0.00018216242798932135, + "loss": 4.9411, + "step": 43230 + }, + { + "epoch": 0.06075625661930915, + "grad_norm": 0.9829245805740356, + "learning_rate": 0.00018220458058170578, + "loss": 5.0541, + "step": 43240 + }, + { + "epoch": 0.06077030755747273, + "grad_norm": 0.9785235524177551, + "learning_rate": 0.00018224673317409018, + "loss": 4.953, + "step": 43250 + }, + { + "epoch": 0.060784358495636306, + "grad_norm": 0.9770336747169495, + "learning_rate": 0.0001822888857664746, + "loss": 5.0195, + "step": 43260 + }, + { + "epoch": 0.06079840943379988, + "grad_norm": 0.994461178779602, + "learning_rate": 0.00018233103835885904, + "loss": 4.8224, + "step": 43270 + }, + { + "epoch": 0.06081246037196346, + "grad_norm": 0.9978111982345581, + "learning_rate": 0.00018237319095124348, + "loss": 4.8499, + "step": 43280 + }, + { + "epoch": 0.060826511310127035, + "grad_norm": 1.0382758378982544, + "learning_rate": 0.00018241534354362793, + "loss": 4.9601, + "step": 43290 + }, + { + "epoch": 0.06084056224829062, + "grad_norm": 1.0007946491241455, + "learning_rate": 0.00018245749613601236, + "loss": 4.8906, + "step": 43300 + }, + { + "epoch": 0.060854613186454194, + "grad_norm": 0.9689368009567261, + "learning_rate": 
0.0001824996487283968, + "loss": 4.9713, + "step": 43310 + }, + { + "epoch": 0.06086866412461777, + "grad_norm": 0.9696866273880005, + "learning_rate": 0.0001825418013207812, + "loss": 4.9007, + "step": 43320 + }, + { + "epoch": 0.06088271506278135, + "grad_norm": 0.9870903491973877, + "learning_rate": 0.00018258395391316563, + "loss": 4.9312, + "step": 43330 + }, + { + "epoch": 0.06089676600094492, + "grad_norm": 1.0196123123168945, + "learning_rate": 0.00018262610650555006, + "loss": 4.7852, + "step": 43340 + }, + { + "epoch": 0.060910816939108506, + "grad_norm": 0.9788631200790405, + "learning_rate": 0.00018266825909793452, + "loss": 4.8479, + "step": 43350 + }, + { + "epoch": 0.06092486787727208, + "grad_norm": 0.9982026219367981, + "learning_rate": 0.00018271041169031895, + "loss": 4.9585, + "step": 43360 + }, + { + "epoch": 0.06093891881543566, + "grad_norm": 1.0699291229248047, + "learning_rate": 0.00018275256428270338, + "loss": 4.9751, + "step": 43370 + }, + { + "epoch": 0.060952969753599234, + "grad_norm": 1.0497549772262573, + "learning_rate": 0.0001827947168750878, + "loss": 4.9279, + "step": 43380 + }, + { + "epoch": 0.06096702069176281, + "grad_norm": 0.9782308340072632, + "learning_rate": 0.00018283686946747222, + "loss": 4.9041, + "step": 43390 + }, + { + "epoch": 0.060981071629926394, + "grad_norm": 0.964344322681427, + "learning_rate": 0.00018287902205985665, + "loss": 4.9652, + "step": 43400 + }, + { + "epoch": 0.06099512256808997, + "grad_norm": 0.9865180253982544, + "learning_rate": 0.0001829211746522411, + "loss": 5.0092, + "step": 43410 + }, + { + "epoch": 0.061009173506253546, + "grad_norm": 1.071978211402893, + "learning_rate": 0.00018296332724462554, + "loss": 4.8976, + "step": 43420 + }, + { + "epoch": 0.06102322444441712, + "grad_norm": 0.9803053736686707, + "learning_rate": 0.00018300547983700997, + "loss": 4.8913, + "step": 43430 + }, + { + "epoch": 0.0610372753825807, + "grad_norm": 0.9943988919258118, + "learning_rate": 
0.00018304763242939437, + "loss": 5.0052, + "step": 43440 + }, + { + "epoch": 0.06105132632074428, + "grad_norm": 0.9977701306343079, + "learning_rate": 0.0001830897850217788, + "loss": 4.8374, + "step": 43450 + }, + { + "epoch": 0.06106537725890786, + "grad_norm": 1.0006359815597534, + "learning_rate": 0.00018313193761416324, + "loss": 4.9586, + "step": 43460 + }, + { + "epoch": 0.061079428197071434, + "grad_norm": 1.0693806409835815, + "learning_rate": 0.0001831740902065477, + "loss": 4.932, + "step": 43470 + }, + { + "epoch": 0.06109347913523501, + "grad_norm": 0.9895679950714111, + "learning_rate": 0.00018321624279893213, + "loss": 4.9462, + "step": 43480 + }, + { + "epoch": 0.061107530073398586, + "grad_norm": 1.0478029251098633, + "learning_rate": 0.00018325839539131656, + "loss": 4.9418, + "step": 43490 + }, + { + "epoch": 0.06112158101156217, + "grad_norm": 0.9845918416976929, + "learning_rate": 0.000183300547983701, + "loss": 4.9005, + "step": 43500 + }, + { + "epoch": 0.061135631949725745, + "grad_norm": 1.0808178186416626, + "learning_rate": 0.0001833427005760854, + "loss": 4.8593, + "step": 43510 + }, + { + "epoch": 0.06114968288788932, + "grad_norm": 0.9863511919975281, + "learning_rate": 0.00018338485316846982, + "loss": 4.9801, + "step": 43520 + }, + { + "epoch": 0.0611637338260529, + "grad_norm": 1.0400804281234741, + "learning_rate": 0.00018342700576085428, + "loss": 4.8457, + "step": 43530 + }, + { + "epoch": 0.061177784764216474, + "grad_norm": 1.01461660861969, + "learning_rate": 0.0001834691583532387, + "loss": 4.969, + "step": 43540 + }, + { + "epoch": 0.06119183570238005, + "grad_norm": 0.9691588282585144, + "learning_rate": 0.00018351131094562314, + "loss": 4.8088, + "step": 43550 + }, + { + "epoch": 0.06120588664054363, + "grad_norm": 1.038812279701233, + "learning_rate": 0.00018355346353800757, + "loss": 4.8867, + "step": 43560 + }, + { + "epoch": 0.06121993757870721, + "grad_norm": 0.9811944365501404, + "learning_rate": 
0.00018359561613039198, + "loss": 4.9723, + "step": 43570 + }, + { + "epoch": 0.061233988516870785, + "grad_norm": 1.0180128812789917, + "learning_rate": 0.00018363776872277644, + "loss": 4.6997, + "step": 43580 + }, + { + "epoch": 0.06124803945503436, + "grad_norm": 1.078423023223877, + "learning_rate": 0.00018367992131516087, + "loss": 4.8831, + "step": 43590 + }, + { + "epoch": 0.06126209039319794, + "grad_norm": 0.9817917346954346, + "learning_rate": 0.0001837220739075453, + "loss": 4.9102, + "step": 43600 + }, + { + "epoch": 0.06127614133136152, + "grad_norm": 0.977243959903717, + "learning_rate": 0.00018376422649992973, + "loss": 4.883, + "step": 43610 + }, + { + "epoch": 0.0612901922695251, + "grad_norm": 1.1097636222839355, + "learning_rate": 0.00018380637909231416, + "loss": 4.8, + "step": 43620 + }, + { + "epoch": 0.06130424320768867, + "grad_norm": 0.9828046560287476, + "learning_rate": 0.00018384853168469862, + "loss": 4.8724, + "step": 43630 + }, + { + "epoch": 0.06131829414585225, + "grad_norm": 0.9683628678321838, + "learning_rate": 0.00018389068427708302, + "loss": 4.9274, + "step": 43640 + }, + { + "epoch": 0.061332345084015825, + "grad_norm": 0.989050030708313, + "learning_rate": 0.00018393283686946746, + "loss": 4.9037, + "step": 43650 + }, + { + "epoch": 0.06134639602217941, + "grad_norm": 1.0087295770645142, + "learning_rate": 0.0001839749894618519, + "loss": 4.9268, + "step": 43660 + }, + { + "epoch": 0.061360446960342985, + "grad_norm": 1.0443772077560425, + "learning_rate": 0.00018401714205423632, + "loss": 4.9625, + "step": 43670 + }, + { + "epoch": 0.06137449789850656, + "grad_norm": 1.0253444910049438, + "learning_rate": 0.00018405929464662075, + "loss": 4.7028, + "step": 43680 + }, + { + "epoch": 0.06138854883667014, + "grad_norm": 1.0143331289291382, + "learning_rate": 0.0001841014472390052, + "loss": 4.8486, + "step": 43690 + }, + { + "epoch": 0.06140259977483371, + "grad_norm": 0.9996601939201355, + "learning_rate": 
0.0001841435998313896, + "loss": 4.9462, + "step": 43700 + }, + { + "epoch": 0.061416650712997296, + "grad_norm": 0.9822000861167908, + "learning_rate": 0.00018418575242377404, + "loss": 4.9844, + "step": 43710 + }, + { + "epoch": 0.06143070165116087, + "grad_norm": 1.0107418298721313, + "learning_rate": 0.00018422790501615847, + "loss": 5.0152, + "step": 43720 + }, + { + "epoch": 0.06144475258932445, + "grad_norm": 1.0787968635559082, + "learning_rate": 0.0001842700576085429, + "loss": 4.9467, + "step": 43730 + }, + { + "epoch": 0.061458803527488025, + "grad_norm": 1.0120429992675781, + "learning_rate": 0.00018431221020092734, + "loss": 4.8599, + "step": 43740 + }, + { + "epoch": 0.0614728544656516, + "grad_norm": 1.002541422843933, + "learning_rate": 0.0001843543627933118, + "loss": 5.0134, + "step": 43750 + }, + { + "epoch": 0.061486905403815184, + "grad_norm": 1.0086458921432495, + "learning_rate": 0.00018439651538569623, + "loss": 4.8738, + "step": 43760 + }, + { + "epoch": 0.06150095634197876, + "grad_norm": 1.0043725967407227, + "learning_rate": 0.00018443866797808063, + "loss": 4.9894, + "step": 43770 + }, + { + "epoch": 0.061515007280142336, + "grad_norm": 0.9925393462181091, + "learning_rate": 0.00018448082057046506, + "loss": 4.9451, + "step": 43780 + }, + { + "epoch": 0.06152905821830591, + "grad_norm": 0.9828112721443176, + "learning_rate": 0.0001845229731628495, + "loss": 4.8894, + "step": 43790 + }, + { + "epoch": 0.06154310915646949, + "grad_norm": 1.0312516689300537, + "learning_rate": 0.00018456512575523392, + "loss": 4.8535, + "step": 43800 + }, + { + "epoch": 0.06155716009463307, + "grad_norm": 1.0502887964248657, + "learning_rate": 0.00018460727834761838, + "loss": 4.8503, + "step": 43810 + }, + { + "epoch": 0.06157121103279665, + "grad_norm": 0.9553511738777161, + "learning_rate": 0.0001846494309400028, + "loss": 4.931, + "step": 43820 + }, + { + "epoch": 0.061585261970960224, + "grad_norm": 1.0687919855117798, + "learning_rate": 
0.00018469158353238722, + "loss": 4.858, + "step": 43830 + }, + { + "epoch": 0.0615993129091238, + "grad_norm": 0.9931618571281433, + "learning_rate": 0.00018473373612477165, + "loss": 4.9647, + "step": 43840 + }, + { + "epoch": 0.061613363847287377, + "grad_norm": 1.0885789394378662, + "learning_rate": 0.00018477588871715608, + "loss": 4.8131, + "step": 43850 + }, + { + "epoch": 0.06162741478545095, + "grad_norm": 0.9883633255958557, + "learning_rate": 0.0001848180413095405, + "loss": 4.8692, + "step": 43860 + }, + { + "epoch": 0.061641465723614536, + "grad_norm": 0.9546381235122681, + "learning_rate": 0.00018486019390192497, + "loss": 4.9282, + "step": 43870 + }, + { + "epoch": 0.06165551666177811, + "grad_norm": 0.9708349704742432, + "learning_rate": 0.0001849023464943094, + "loss": 4.9329, + "step": 43880 + }, + { + "epoch": 0.06166956759994169, + "grad_norm": 0.985491156578064, + "learning_rate": 0.00018494449908669383, + "loss": 5.066, + "step": 43890 + }, + { + "epoch": 0.061683618538105264, + "grad_norm": 1.0057768821716309, + "learning_rate": 0.00018498665167907823, + "loss": 5.0658, + "step": 43900 + }, + { + "epoch": 0.06169766947626884, + "grad_norm": 1.0529582500457764, + "learning_rate": 0.00018502880427146267, + "loss": 4.9211, + "step": 43910 + }, + { + "epoch": 0.061711720414432424, + "grad_norm": 0.9484959840774536, + "learning_rate": 0.0001850709568638471, + "loss": 4.8917, + "step": 43920 + }, + { + "epoch": 0.061725771352596, + "grad_norm": 0.9802437424659729, + "learning_rate": 0.00018511310945623156, + "loss": 4.853, + "step": 43930 + }, + { + "epoch": 0.061739822290759576, + "grad_norm": 1.045055627822876, + "learning_rate": 0.00018515526204861599, + "loss": 4.8402, + "step": 43940 + }, + { + "epoch": 0.06175387322892315, + "grad_norm": 1.073882818222046, + "learning_rate": 0.00018519741464100042, + "loss": 4.9227, + "step": 43950 + }, + { + "epoch": 0.06176792416708673, + "grad_norm": 0.9608358144760132, + "learning_rate": 
0.00018523956723338482, + "loss": 4.9701, + "step": 43960 + }, + { + "epoch": 0.06178197510525031, + "grad_norm": 0.9510095119476318, + "learning_rate": 0.00018528171982576925, + "loss": 4.9003, + "step": 43970 + }, + { + "epoch": 0.06179602604341389, + "grad_norm": 1.0726205110549927, + "learning_rate": 0.00018532387241815368, + "loss": 4.9263, + "step": 43980 + }, + { + "epoch": 0.061810076981577464, + "grad_norm": 0.9834354519844055, + "learning_rate": 0.00018536602501053814, + "loss": 5.0427, + "step": 43990 + }, + { + "epoch": 0.06182412791974104, + "grad_norm": 0.9882546067237854, + "learning_rate": 0.00018540817760292257, + "loss": 5.026, + "step": 44000 + }, + { + "epoch": 0.061838178857904616, + "grad_norm": 0.9732726216316223, + "learning_rate": 0.000185450330195307, + "loss": 4.9425, + "step": 44010 + }, + { + "epoch": 0.0618522297960682, + "grad_norm": 0.9791545271873474, + "learning_rate": 0.00018549248278769144, + "loss": 4.965, + "step": 44020 + }, + { + "epoch": 0.061866280734231775, + "grad_norm": 1.0380457639694214, + "learning_rate": 0.00018553463538007584, + "loss": 4.9099, + "step": 44030 + }, + { + "epoch": 0.06188033167239535, + "grad_norm": 1.010048747062683, + "learning_rate": 0.00018557678797246027, + "loss": 4.9624, + "step": 44040 + }, + { + "epoch": 0.06189438261055893, + "grad_norm": 0.9912928342819214, + "learning_rate": 0.00018561894056484473, + "loss": 4.9728, + "step": 44050 + }, + { + "epoch": 0.061908433548722504, + "grad_norm": 0.9759913682937622, + "learning_rate": 0.00018566109315722916, + "loss": 4.9517, + "step": 44060 + }, + { + "epoch": 0.06192248448688609, + "grad_norm": 0.997549295425415, + "learning_rate": 0.0001857032457496136, + "loss": 4.8432, + "step": 44070 + }, + { + "epoch": 0.06193653542504966, + "grad_norm": 1.0062215328216553, + "learning_rate": 0.00018574539834199802, + "loss": 4.9373, + "step": 44080 + }, + { + "epoch": 0.06195058636321324, + "grad_norm": 1.012075424194336, + "learning_rate": 
0.00018578755093438243, + "loss": 4.8813, + "step": 44090 + }, + { + "epoch": 0.061964637301376815, + "grad_norm": 0.95697021484375, + "learning_rate": 0.00018582970352676686, + "loss": 4.944, + "step": 44100 + }, + { + "epoch": 0.06197868823954039, + "grad_norm": 0.9985111951828003, + "learning_rate": 0.00018587185611915132, + "loss": 4.8227, + "step": 44110 + }, + { + "epoch": 0.061992739177703975, + "grad_norm": 0.9926140308380127, + "learning_rate": 0.00018591400871153575, + "loss": 5.0079, + "step": 44120 + }, + { + "epoch": 0.06200679011586755, + "grad_norm": 1.1736414432525635, + "learning_rate": 0.00018595616130392018, + "loss": 4.7146, + "step": 44130 + }, + { + "epoch": 0.06202084105403113, + "grad_norm": 1.0231581926345825, + "learning_rate": 0.0001859983138963046, + "loss": 5.0206, + "step": 44140 + }, + { + "epoch": 0.0620348919921947, + "grad_norm": 1.0800116062164307, + "learning_rate": 0.00018604046648868901, + "loss": 4.8657, + "step": 44150 + }, + { + "epoch": 0.06204894293035828, + "grad_norm": 0.9866999387741089, + "learning_rate": 0.00018608261908107347, + "loss": 4.9722, + "step": 44160 + }, + { + "epoch": 0.062062993868521855, + "grad_norm": 1.0231711864471436, + "learning_rate": 0.0001861247716734579, + "loss": 4.8777, + "step": 44170 + }, + { + "epoch": 0.06207704480668544, + "grad_norm": 0.9434810876846313, + "learning_rate": 0.00018616692426584233, + "loss": 4.9722, + "step": 44180 + }, + { + "epoch": 0.062091095744849015, + "grad_norm": 1.0458160638809204, + "learning_rate": 0.00018620907685822677, + "loss": 4.8806, + "step": 44190 + }, + { + "epoch": 0.06210514668301259, + "grad_norm": 0.9809548258781433, + "learning_rate": 0.0001862512294506112, + "loss": 4.7962, + "step": 44200 + }, + { + "epoch": 0.06211919762117617, + "grad_norm": 1.012259840965271, + "learning_rate": 0.00018629338204299565, + "loss": 4.9335, + "step": 44210 + }, + { + "epoch": 0.06213324855933974, + "grad_norm": 0.9664605855941772, + "learning_rate": 
0.00018633553463538006, + "loss": 4.9491, + "step": 44220 + }, + { + "epoch": 0.062147299497503326, + "grad_norm": 0.9853652715682983, + "learning_rate": 0.0001863776872277645, + "loss": 4.9029, + "step": 44230 + }, + { + "epoch": 0.0621613504356669, + "grad_norm": 1.0576822757720947, + "learning_rate": 0.00018641983982014892, + "loss": 4.8949, + "step": 44240 + }, + { + "epoch": 0.06217540137383048, + "grad_norm": 1.0025372505187988, + "learning_rate": 0.00018646199241253335, + "loss": 4.9004, + "step": 44250 + }, + { + "epoch": 0.062189452311994055, + "grad_norm": 1.0175806283950806, + "learning_rate": 0.00018650414500491778, + "loss": 5.03, + "step": 44260 + }, + { + "epoch": 0.06220350325015763, + "grad_norm": 1.0149425268173218, + "learning_rate": 0.00018654629759730224, + "loss": 4.9339, + "step": 44270 + }, + { + "epoch": 0.062217554188321214, + "grad_norm": 1.0029298067092896, + "learning_rate": 0.00018658845018968665, + "loss": 4.8234, + "step": 44280 + }, + { + "epoch": 0.06223160512648479, + "grad_norm": 0.9579691290855408, + "learning_rate": 0.00018663060278207108, + "loss": 5.1016, + "step": 44290 + }, + { + "epoch": 0.062245656064648366, + "grad_norm": 1.061834454536438, + "learning_rate": 0.0001866727553744555, + "loss": 4.9704, + "step": 44300 + }, + { + "epoch": 0.06225970700281194, + "grad_norm": 1.004465103149414, + "learning_rate": 0.00018671490796683994, + "loss": 4.9067, + "step": 44310 + }, + { + "epoch": 0.06227375794097552, + "grad_norm": 0.9837499856948853, + "learning_rate": 0.00018675706055922437, + "loss": 4.9309, + "step": 44320 + }, + { + "epoch": 0.0622878088791391, + "grad_norm": 1.000868558883667, + "learning_rate": 0.00018679921315160883, + "loss": 4.8866, + "step": 44330 + }, + { + "epoch": 0.06230185981730268, + "grad_norm": 0.9788700938224792, + "learning_rate": 0.00018684136574399326, + "loss": 4.9327, + "step": 44340 + }, + { + "epoch": 0.062315910755466254, + "grad_norm": 0.9754438400268555, + "learning_rate": 
0.00018688351833637766, + "loss": 4.8594, + "step": 44350 + }, + { + "epoch": 0.06232996169362983, + "grad_norm": 0.951196014881134, + "learning_rate": 0.0001869256709287621, + "loss": 4.9354, + "step": 44360 + }, + { + "epoch": 0.062344012631793407, + "grad_norm": 0.9627026915550232, + "learning_rate": 0.00018696782352114653, + "loss": 4.9621, + "step": 44370 + }, + { + "epoch": 0.06235806356995699, + "grad_norm": 1.063202977180481, + "learning_rate": 0.00018700997611353096, + "loss": 4.8807, + "step": 44380 + }, + { + "epoch": 0.062372114508120566, + "grad_norm": 0.9841533899307251, + "learning_rate": 0.00018705212870591542, + "loss": 4.9947, + "step": 44390 + }, + { + "epoch": 0.06238616544628414, + "grad_norm": 0.9734062552452087, + "learning_rate": 0.00018709428129829985, + "loss": 4.9473, + "step": 44400 + }, + { + "epoch": 0.06240021638444772, + "grad_norm": 0.9729593396186829, + "learning_rate": 0.00018713643389068425, + "loss": 5.1047, + "step": 44410 + }, + { + "epoch": 0.062414267322611294, + "grad_norm": 1.000727891921997, + "learning_rate": 0.00018717858648306868, + "loss": 4.9258, + "step": 44420 + }, + { + "epoch": 0.06242831826077488, + "grad_norm": 0.9832708835601807, + "learning_rate": 0.0001872207390754531, + "loss": 4.7362, + "step": 44430 + }, + { + "epoch": 0.062442369198938454, + "grad_norm": 1.075731635093689, + "learning_rate": 0.00018726289166783754, + "loss": 5.0231, + "step": 44440 + }, + { + "epoch": 0.06245642013710203, + "grad_norm": 0.965521514415741, + "learning_rate": 0.000187305044260222, + "loss": 4.8227, + "step": 44450 + }, + { + "epoch": 0.062470471075265606, + "grad_norm": 1.0088485479354858, + "learning_rate": 0.00018734719685260643, + "loss": 4.8802, + "step": 44460 + }, + { + "epoch": 0.06248452201342918, + "grad_norm": 0.9651148319244385, + "learning_rate": 0.00018738934944499087, + "loss": 4.8197, + "step": 44470 + }, + { + "epoch": 0.06249857295159276, + "grad_norm": 0.9971210360527039, + "learning_rate": 
0.00018743150203737527, + "loss": 4.9242, + "step": 44480 + }, + { + "epoch": 0.06251262388975634, + "grad_norm": 0.9815917015075684, + "learning_rate": 0.0001874736546297597, + "loss": 4.8368, + "step": 44490 + }, + { + "epoch": 0.06252667482791992, + "grad_norm": 0.9839165806770325, + "learning_rate": 0.00018751580722214413, + "loss": 4.8778, + "step": 44500 + }, + { + "epoch": 0.0625407257660835, + "grad_norm": 0.990401029586792, + "learning_rate": 0.0001875579598145286, + "loss": 4.9428, + "step": 44510 + }, + { + "epoch": 0.06255477670424707, + "grad_norm": 0.9693237543106079, + "learning_rate": 0.00018760011240691302, + "loss": 4.8596, + "step": 44520 + }, + { + "epoch": 0.06256882764241065, + "grad_norm": 1.0013371706008911, + "learning_rate": 0.00018764226499929745, + "loss": 4.9533, + "step": 44530 + }, + { + "epoch": 0.06258287858057422, + "grad_norm": 0.9999648928642273, + "learning_rate": 0.00018768441759168186, + "loss": 4.9845, + "step": 44540 + }, + { + "epoch": 0.0625969295187378, + "grad_norm": 0.982397198677063, + "learning_rate": 0.0001877265701840663, + "loss": 4.8334, + "step": 44550 + }, + { + "epoch": 0.06261098045690139, + "grad_norm": 0.9798036217689514, + "learning_rate": 0.0001877645075172123, + "loss": 4.8915, + "step": 44560 + }, + { + "epoch": 0.06262503139506496, + "grad_norm": 0.9902676343917847, + "learning_rate": 0.0001878066601095967, + "loss": 4.9485, + "step": 44570 + }, + { + "epoch": 0.06263908233322854, + "grad_norm": 0.9916227459907532, + "learning_rate": 0.00018784881270198113, + "loss": 4.8654, + "step": 44580 + }, + { + "epoch": 0.06265313327139212, + "grad_norm": 1.0029773712158203, + "learning_rate": 0.0001878909652943656, + "loss": 4.7969, + "step": 44590 + }, + { + "epoch": 0.06266718420955569, + "grad_norm": 0.985836386680603, + "learning_rate": 0.00018793311788675002, + "loss": 4.9815, + "step": 44600 + }, + { + "epoch": 0.06268123514771927, + "grad_norm": 1.05781888961792, + "learning_rate": 0.00018797527047913445, 
+ "loss": 4.9361, + "step": 44610 + }, + { + "epoch": 0.06269528608588285, + "grad_norm": 1.0255253314971924, + "learning_rate": 0.00018801742307151888, + "loss": 4.7969, + "step": 44620 + }, + { + "epoch": 0.06270933702404642, + "grad_norm": 0.9807506799697876, + "learning_rate": 0.00018805957566390332, + "loss": 4.8693, + "step": 44630 + }, + { + "epoch": 0.06272338796221, + "grad_norm": 0.9755240082740784, + "learning_rate": 0.00018810172825628772, + "loss": 4.8811, + "step": 44640 + }, + { + "epoch": 0.06273743890037357, + "grad_norm": 0.9982534050941467, + "learning_rate": 0.00018814388084867218, + "loss": 5.0296, + "step": 44650 + }, + { + "epoch": 0.06275148983853715, + "grad_norm": 0.9563882350921631, + "learning_rate": 0.0001881860334410566, + "loss": 4.9605, + "step": 44660 + }, + { + "epoch": 0.06276554077670074, + "grad_norm": 0.9405322074890137, + "learning_rate": 0.00018822818603344104, + "loss": 4.9641, + "step": 44670 + }, + { + "epoch": 0.06277959171486432, + "grad_norm": 0.9542915225028992, + "learning_rate": 0.00018827033862582547, + "loss": 4.99, + "step": 44680 + }, + { + "epoch": 0.06279364265302789, + "grad_norm": 0.9676512479782104, + "learning_rate": 0.0001883124912182099, + "loss": 4.9995, + "step": 44690 + }, + { + "epoch": 0.06280769359119147, + "grad_norm": 0.9878407120704651, + "learning_rate": 0.0001883546438105943, + "loss": 4.962, + "step": 44700 + }, + { + "epoch": 0.06282174452935504, + "grad_norm": 0.9911555647850037, + "learning_rate": 0.00018839679640297876, + "loss": 4.8163, + "step": 44710 + }, + { + "epoch": 0.06283579546751862, + "grad_norm": 1.0555427074432373, + "learning_rate": 0.0001884389489953632, + "loss": 5.0145, + "step": 44720 + }, + { + "epoch": 0.0628498464056822, + "grad_norm": 0.9912372827529907, + "learning_rate": 0.00018848110158774763, + "loss": 4.8577, + "step": 44730 + }, + { + "epoch": 0.06286389734384577, + "grad_norm": 0.9817824959754944, + "learning_rate": 0.00018852325418013206, + "loss": 4.8828, + 
"step": 44740 + }, + { + "epoch": 0.06287794828200935, + "grad_norm": 0.9970001578330994, + "learning_rate": 0.0001885654067725165, + "loss": 4.8374, + "step": 44750 + }, + { + "epoch": 0.06289199922017293, + "grad_norm": 1.0151931047439575, + "learning_rate": 0.00018860755936490095, + "loss": 4.9853, + "step": 44760 + }, + { + "epoch": 0.06290605015833652, + "grad_norm": 0.9986475110054016, + "learning_rate": 0.00018864971195728535, + "loss": 4.9118, + "step": 44770 + }, + { + "epoch": 0.06292010109650009, + "grad_norm": 1.008439064025879, + "learning_rate": 0.00018869186454966978, + "loss": 4.9462, + "step": 44780 + }, + { + "epoch": 0.06293415203466367, + "grad_norm": 1.03360116481781, + "learning_rate": 0.00018873401714205421, + "loss": 4.8674, + "step": 44790 + }, + { + "epoch": 0.06294820297282724, + "grad_norm": 0.9503140449523926, + "learning_rate": 0.00018877616973443865, + "loss": 4.9065, + "step": 44800 + }, + { + "epoch": 0.06296225391099082, + "grad_norm": 0.9909397959709167, + "learning_rate": 0.0001888183223268231, + "loss": 5.0051, + "step": 44810 + }, + { + "epoch": 0.0629763048491544, + "grad_norm": 0.9907909035682678, + "learning_rate": 0.00018886047491920753, + "loss": 4.8448, + "step": 44820 + }, + { + "epoch": 0.06299035578731797, + "grad_norm": 0.962094783782959, + "learning_rate": 0.00018890262751159194, + "loss": 4.962, + "step": 44830 + }, + { + "epoch": 0.06300440672548155, + "grad_norm": 0.9932377934455872, + "learning_rate": 0.00018894478010397637, + "loss": 5.0061, + "step": 44840 + }, + { + "epoch": 0.06301845766364512, + "grad_norm": 0.9481516480445862, + "learning_rate": 0.0001889869326963608, + "loss": 4.9518, + "step": 44850 + }, + { + "epoch": 0.0630325086018087, + "grad_norm": 0.9638423323631287, + "learning_rate": 0.00018902908528874523, + "loss": 4.9881, + "step": 44860 + }, + { + "epoch": 0.06304655953997229, + "grad_norm": 1.0749502182006836, + "learning_rate": 0.0001890712378811297, + "loss": 4.915, + "step": 44870 + }, + { 
+ "epoch": 0.06306061047813587, + "grad_norm": 0.9810838103294373, + "learning_rate": 0.00018911339047351412, + "loss": 4.9, + "step": 44880 + }, + { + "epoch": 0.06307466141629944, + "grad_norm": 1.0337384939193726, + "learning_rate": 0.00018915554306589855, + "loss": 4.845, + "step": 44890 + }, + { + "epoch": 0.06308871235446302, + "grad_norm": 0.9709575772285461, + "learning_rate": 0.00018919769565828296, + "loss": 4.8708, + "step": 44900 + }, + { + "epoch": 0.0631027632926266, + "grad_norm": 1.0215866565704346, + "learning_rate": 0.0001892398482506674, + "loss": 4.939, + "step": 44910 + }, + { + "epoch": 0.06311681423079017, + "grad_norm": 0.9717178344726562, + "learning_rate": 0.00018928200084305182, + "loss": 4.9069, + "step": 44920 + }, + { + "epoch": 0.06313086516895375, + "grad_norm": 0.9674050807952881, + "learning_rate": 0.00018932415343543628, + "loss": 4.8942, + "step": 44930 + }, + { + "epoch": 0.06314491610711732, + "grad_norm": 1.011043906211853, + "learning_rate": 0.0001893663060278207, + "loss": 4.9006, + "step": 44940 + }, + { + "epoch": 0.0631589670452809, + "grad_norm": 0.9711986184120178, + "learning_rate": 0.00018940845862020514, + "loss": 4.8699, + "step": 44950 + }, + { + "epoch": 0.06317301798344448, + "grad_norm": 1.0319536924362183, + "learning_rate": 0.00018945061121258954, + "loss": 4.8679, + "step": 44960 + }, + { + "epoch": 0.06318706892160805, + "grad_norm": 1.0072388648986816, + "learning_rate": 0.00018949276380497398, + "loss": 4.9792, + "step": 44970 + }, + { + "epoch": 0.06320111985977164, + "grad_norm": 0.9475632905960083, + "learning_rate": 0.0001895349163973584, + "loss": 4.8833, + "step": 44980 + }, + { + "epoch": 0.06321517079793522, + "grad_norm": 1.02812922000885, + "learning_rate": 0.00018957706898974286, + "loss": 4.9171, + "step": 44990 + }, + { + "epoch": 0.0632292217360988, + "grad_norm": 0.9555014967918396, + "learning_rate": 0.0001896192215821273, + "loss": 4.9232, + "step": 45000 + }, + { + "epoch": 
0.06324327267426237, + "grad_norm": 0.9676048755645752, + "learning_rate": 0.00018966137417451173, + "loss": 4.9209, + "step": 45010 + }, + { + "epoch": 0.06325732361242595, + "grad_norm": 1.054318904876709, + "learning_rate": 0.00018970352676689616, + "loss": 4.9587, + "step": 45020 + }, + { + "epoch": 0.06327137455058952, + "grad_norm": 0.9832106232643127, + "learning_rate": 0.00018974567935928056, + "loss": 4.8852, + "step": 45030 + }, + { + "epoch": 0.0632854254887531, + "grad_norm": 0.9964709281921387, + "learning_rate": 0.000189787831951665, + "loss": 4.8634, + "step": 45040 + }, + { + "epoch": 0.06329947642691668, + "grad_norm": 0.959078848361969, + "learning_rate": 0.00018982998454404945, + "loss": 4.9014, + "step": 45050 + }, + { + "epoch": 0.06331352736508025, + "grad_norm": 0.993674099445343, + "learning_rate": 0.00018987213713643388, + "loss": 4.7264, + "step": 45060 + }, + { + "epoch": 0.06332757830324383, + "grad_norm": 0.9697414636611938, + "learning_rate": 0.00018991428972881831, + "loss": 4.8757, + "step": 45070 + }, + { + "epoch": 0.06334162924140742, + "grad_norm": 0.9635653495788574, + "learning_rate": 0.00018995644232120275, + "loss": 4.9426, + "step": 45080 + }, + { + "epoch": 0.063355680179571, + "grad_norm": 0.9881270527839661, + "learning_rate": 0.00018999859491358715, + "loss": 4.9085, + "step": 45090 + }, + { + "epoch": 0.06336973111773457, + "grad_norm": 0.9849804639816284, + "learning_rate": 0.00019004074750597158, + "loss": 4.9223, + "step": 45100 + }, + { + "epoch": 0.06338378205589815, + "grad_norm": 0.953628659248352, + "learning_rate": 0.00019008290009835604, + "loss": 4.9461, + "step": 45110 + }, + { + "epoch": 0.06339783299406172, + "grad_norm": 0.9483082890510559, + "learning_rate": 0.00019012505269074047, + "loss": 4.9306, + "step": 45120 + }, + { + "epoch": 0.0634118839322253, + "grad_norm": 1.0407912731170654, + "learning_rate": 0.0001901672052831249, + "loss": 4.7811, + "step": 45130 + }, + { + "epoch": 0.06342593487038888, 
+ "grad_norm": 0.9633585214614868, + "learning_rate": 0.00019020935787550933, + "loss": 4.8575, + "step": 45140 + }, + { + "epoch": 0.06343998580855245, + "grad_norm": 0.9573111534118652, + "learning_rate": 0.00019025151046789376, + "loss": 4.873, + "step": 45150 + }, + { + "epoch": 0.06345403674671603, + "grad_norm": 1.0147194862365723, + "learning_rate": 0.00019029366306027817, + "loss": 4.7867, + "step": 45160 + }, + { + "epoch": 0.0634680876848796, + "grad_norm": 0.9639556407928467, + "learning_rate": 0.00019033581565266263, + "loss": 4.8389, + "step": 45170 + }, + { + "epoch": 0.0634821386230432, + "grad_norm": 0.9909672141075134, + "learning_rate": 0.00019037796824504706, + "loss": 4.9519, + "step": 45180 + }, + { + "epoch": 0.06349618956120677, + "grad_norm": 0.9403594732284546, + "learning_rate": 0.0001904201208374315, + "loss": 5.012, + "step": 45190 + }, + { + "epoch": 0.06351024049937035, + "grad_norm": 1.0044161081314087, + "learning_rate": 0.00019046227342981592, + "loss": 4.8788, + "step": 45200 + }, + { + "epoch": 0.06352429143753392, + "grad_norm": 0.9971259832382202, + "learning_rate": 0.00019050442602220035, + "loss": 4.9838, + "step": 45210 + }, + { + "epoch": 0.0635383423756975, + "grad_norm": 0.9722863435745239, + "learning_rate": 0.00019054657861458475, + "loss": 4.9108, + "step": 45220 + }, + { + "epoch": 0.06355239331386107, + "grad_norm": 0.967170774936676, + "learning_rate": 0.0001905887312069692, + "loss": 4.8078, + "step": 45230 + }, + { + "epoch": 0.06356644425202465, + "grad_norm": 0.9585663676261902, + "learning_rate": 0.00019063088379935364, + "loss": 4.9564, + "step": 45240 + }, + { + "epoch": 0.06358049519018823, + "grad_norm": 1.025834083557129, + "learning_rate": 0.00019067303639173807, + "loss": 4.8575, + "step": 45250 + }, + { + "epoch": 0.0635945461283518, + "grad_norm": 0.9579908847808838, + "learning_rate": 0.0001907151889841225, + "loss": 4.8684, + "step": 45260 + }, + { + "epoch": 0.06360859706651538, + "grad_norm": 
1.0075056552886963, + "learning_rate": 0.00019075734157650694, + "loss": 4.9019, + "step": 45270 + }, + { + "epoch": 0.06362264800467896, + "grad_norm": 0.9414964914321899, + "learning_rate": 0.00019079949416889134, + "loss": 4.8507, + "step": 45280 + }, + { + "epoch": 0.06363669894284255, + "grad_norm": 0.947604775428772, + "learning_rate": 0.0001908416467612758, + "loss": 4.9015, + "step": 45290 + }, + { + "epoch": 0.06365074988100612, + "grad_norm": 0.961848795413971, + "learning_rate": 0.00019088379935366023, + "loss": 4.9397, + "step": 45300 + }, + { + "epoch": 0.0636648008191697, + "grad_norm": 0.9270375370979309, + "learning_rate": 0.00019092595194604466, + "loss": 4.8831, + "step": 45310 + }, + { + "epoch": 0.06367885175733327, + "grad_norm": 0.9623255729675293, + "learning_rate": 0.0001909681045384291, + "loss": 4.9062, + "step": 45320 + }, + { + "epoch": 0.06369290269549685, + "grad_norm": 0.9620530605316162, + "learning_rate": 0.00019101025713081352, + "loss": 4.9139, + "step": 45330 + }, + { + "epoch": 0.06370695363366043, + "grad_norm": 0.9391373991966248, + "learning_rate": 0.00019105240972319798, + "loss": 4.9787, + "step": 45340 + }, + { + "epoch": 0.063721004571824, + "grad_norm": 1.0350898504257202, + "learning_rate": 0.0001910945623155824, + "loss": 4.7686, + "step": 45350 + }, + { + "epoch": 0.06373505550998758, + "grad_norm": 0.9722337126731873, + "learning_rate": 0.00019113671490796682, + "loss": 4.8961, + "step": 45360 + }, + { + "epoch": 0.06374910644815115, + "grad_norm": 0.9319054484367371, + "learning_rate": 0.00019117886750035125, + "loss": 5.0057, + "step": 45370 + }, + { + "epoch": 0.06376315738631473, + "grad_norm": 0.9770829081535339, + "learning_rate": 0.00019122102009273568, + "loss": 4.9013, + "step": 45380 + }, + { + "epoch": 0.06377720832447832, + "grad_norm": 0.9620832204818726, + "learning_rate": 0.00019126317268512014, + "loss": 4.974, + "step": 45390 + }, + { + "epoch": 0.0637912592626419, + "grad_norm": 1.0096343755722046, 
+ "learning_rate": 0.00019130532527750457, + "loss": 4.8912, + "step": 45400 + }, + { + "epoch": 0.06380531020080547, + "grad_norm": 0.962091326713562, + "learning_rate": 0.00019134747786988897, + "loss": 4.9313, + "step": 45410 + }, + { + "epoch": 0.06381936113896905, + "grad_norm": 0.9614576697349548, + "learning_rate": 0.0001913896304622734, + "loss": 4.9008, + "step": 45420 + }, + { + "epoch": 0.06383341207713263, + "grad_norm": 0.9405627846717834, + "learning_rate": 0.00019143178305465784, + "loss": 4.8902, + "step": 45430 + }, + { + "epoch": 0.0638474630152962, + "grad_norm": 1.075329065322876, + "learning_rate": 0.00019147393564704227, + "loss": 4.7951, + "step": 45440 + }, + { + "epoch": 0.06386151395345978, + "grad_norm": 0.9246218800544739, + "learning_rate": 0.00019151608823942673, + "loss": 4.8835, + "step": 45450 + }, + { + "epoch": 0.06387556489162335, + "grad_norm": 0.9820383191108704, + "learning_rate": 0.00019155824083181116, + "loss": 4.9678, + "step": 45460 + }, + { + "epoch": 0.06388961582978693, + "grad_norm": 0.9855993390083313, + "learning_rate": 0.0001916003934241956, + "loss": 4.9499, + "step": 45470 + }, + { + "epoch": 0.0639036667679505, + "grad_norm": 0.9557892084121704, + "learning_rate": 0.00019164254601658, + "loss": 4.9888, + "step": 45480 + }, + { + "epoch": 0.0639177177061141, + "grad_norm": 0.9590747356414795, + "learning_rate": 0.00019168469860896442, + "loss": 4.9599, + "step": 45490 + }, + { + "epoch": 0.06393176864427767, + "grad_norm": 0.9951782822608948, + "learning_rate": 0.00019172685120134885, + "loss": 4.8487, + "step": 45500 + }, + { + "epoch": 0.06394581958244125, + "grad_norm": 0.9474301338195801, + "learning_rate": 0.0001917690037937333, + "loss": 4.9257, + "step": 45510 + }, + { + "epoch": 0.06395987052060483, + "grad_norm": 0.9949396252632141, + "learning_rate": 0.00019181115638611774, + "loss": 4.8466, + "step": 45520 + }, + { + "epoch": 0.0639739214587684, + "grad_norm": 0.9865704774856567, + "learning_rate": 
0.00019185330897850217, + "loss": 4.9049, + "step": 45530 + }, + { + "epoch": 0.06398797239693198, + "grad_norm": 1.0222458839416504, + "learning_rate": 0.00019189546157088658, + "loss": 4.8955, + "step": 45540 + }, + { + "epoch": 0.06400202333509555, + "grad_norm": 0.9480180144309998, + "learning_rate": 0.000191937614163271, + "loss": 4.9021, + "step": 45550 + }, + { + "epoch": 0.06401607427325913, + "grad_norm": 0.9689090847969055, + "learning_rate": 0.00019197976675565544, + "loss": 5.0263, + "step": 45560 + }, + { + "epoch": 0.0640301252114227, + "grad_norm": 0.9644342064857483, + "learning_rate": 0.0001920219193480399, + "loss": 4.9652, + "step": 45570 + }, + { + "epoch": 0.06404417614958628, + "grad_norm": 0.9961534738540649, + "learning_rate": 0.00019206407194042433, + "loss": 4.8413, + "step": 45580 + }, + { + "epoch": 0.06405822708774986, + "grad_norm": 0.9438686966896057, + "learning_rate": 0.00019210622453280876, + "loss": 4.9344, + "step": 45590 + }, + { + "epoch": 0.06407227802591345, + "grad_norm": 0.9871533513069153, + "learning_rate": 0.0001921483771251932, + "loss": 4.9071, + "step": 45600 + }, + { + "epoch": 0.06408632896407702, + "grad_norm": 0.9871090650558472, + "learning_rate": 0.0001921905297175776, + "loss": 4.8008, + "step": 45610 + }, + { + "epoch": 0.0641003799022406, + "grad_norm": 1.0225098133087158, + "learning_rate": 0.00019223268230996203, + "loss": 5.0841, + "step": 45620 + }, + { + "epoch": 0.06411443084040418, + "grad_norm": 0.9514864087104797, + "learning_rate": 0.00019227483490234649, + "loss": 4.8433, + "step": 45630 + }, + { + "epoch": 0.06412848177856775, + "grad_norm": 0.9759644865989685, + "learning_rate": 0.00019231698749473092, + "loss": 4.9319, + "step": 45640 + }, + { + "epoch": 0.06414253271673133, + "grad_norm": 0.9882025122642517, + "learning_rate": 0.00019235914008711535, + "loss": 4.9566, + "step": 45650 + }, + { + "epoch": 0.0641565836548949, + "grad_norm": 0.9645729660987854, + "learning_rate": 
0.00019240129267949978, + "loss": 4.963, + "step": 45660 + }, + { + "epoch": 0.06417063459305848, + "grad_norm": 0.9985837936401367, + "learning_rate": 0.00019244344527188418, + "loss": 4.968, + "step": 45670 + }, + { + "epoch": 0.06418468553122206, + "grad_norm": 0.9520775675773621, + "learning_rate": 0.00019248559786426862, + "loss": 4.9343, + "step": 45680 + }, + { + "epoch": 0.06419873646938563, + "grad_norm": 0.947435736656189, + "learning_rate": 0.00019252775045665307, + "loss": 4.8316, + "step": 45690 + }, + { + "epoch": 0.06421278740754922, + "grad_norm": 1.099055528640747, + "learning_rate": 0.0001925699030490375, + "loss": 4.7128, + "step": 45700 + }, + { + "epoch": 0.0642268383457128, + "grad_norm": 0.9727609753608704, + "learning_rate": 0.00019261205564142194, + "loss": 4.8179, + "step": 45710 + }, + { + "epoch": 0.06424088928387638, + "grad_norm": 0.9793810248374939, + "learning_rate": 0.00019265420823380637, + "loss": 4.9058, + "step": 45720 + }, + { + "epoch": 0.06425494022203995, + "grad_norm": 0.9617207646369934, + "learning_rate": 0.0001926963608261908, + "loss": 4.8955, + "step": 45730 + }, + { + "epoch": 0.06426899116020353, + "grad_norm": 0.9949877262115479, + "learning_rate": 0.0001927385134185752, + "loss": 4.9032, + "step": 45740 + }, + { + "epoch": 0.0642830420983671, + "grad_norm": 0.9807795882225037, + "learning_rate": 0.00019278066601095966, + "loss": 4.9258, + "step": 45750 + }, + { + "epoch": 0.06429709303653068, + "grad_norm": 0.9766243696212769, + "learning_rate": 0.0001928228186033441, + "loss": 4.8925, + "step": 45760 + }, + { + "epoch": 0.06431114397469426, + "grad_norm": 0.971154510974884, + "learning_rate": 0.00019286497119572852, + "loss": 5.0002, + "step": 45770 + }, + { + "epoch": 0.06432519491285783, + "grad_norm": 0.966464102268219, + "learning_rate": 0.00019290712378811295, + "loss": 4.8888, + "step": 45780 + }, + { + "epoch": 0.06433924585102141, + "grad_norm": 0.9556533694267273, + "learning_rate": 
0.00019294927638049738, + "loss": 4.8204, + "step": 45790 + }, + { + "epoch": 0.064353296789185, + "grad_norm": 0.9517030715942383, + "learning_rate": 0.0001929914289728818, + "loss": 4.8391, + "step": 45800 + }, + { + "epoch": 0.06436734772734858, + "grad_norm": 1.0709158182144165, + "learning_rate": 0.00019303358156526625, + "loss": 4.8775, + "step": 45810 + }, + { + "epoch": 0.06438139866551215, + "grad_norm": 0.9663071036338806, + "learning_rate": 0.00019307573415765068, + "loss": 4.7677, + "step": 45820 + }, + { + "epoch": 0.06439544960367573, + "grad_norm": 0.9344285726547241, + "learning_rate": 0.0001931178867500351, + "loss": 4.9828, + "step": 45830 + }, + { + "epoch": 0.0644095005418393, + "grad_norm": 0.9900237917900085, + "learning_rate": 0.00019316003934241954, + "loss": 4.7965, + "step": 45840 + }, + { + "epoch": 0.06442355148000288, + "grad_norm": 0.9743140935897827, + "learning_rate": 0.00019320219193480397, + "loss": 4.963, + "step": 45850 + }, + { + "epoch": 0.06443760241816646, + "grad_norm": 0.9134804606437683, + "learning_rate": 0.00019324434452718843, + "loss": 4.8572, + "step": 45860 + }, + { + "epoch": 0.06445165335633003, + "grad_norm": 0.9742643237113953, + "learning_rate": 0.00019328649711957283, + "loss": 4.8383, + "step": 45870 + }, + { + "epoch": 0.06446570429449361, + "grad_norm": 0.9616056680679321, + "learning_rate": 0.00019332864971195727, + "loss": 4.7778, + "step": 45880 + }, + { + "epoch": 0.06447975523265718, + "grad_norm": 0.9398157596588135, + "learning_rate": 0.0001933708023043417, + "loss": 4.9049, + "step": 45890 + }, + { + "epoch": 0.06449380617082076, + "grad_norm": 1.0289732217788696, + "learning_rate": 0.00019341295489672613, + "loss": 4.9867, + "step": 45900 + }, + { + "epoch": 0.06450785710898435, + "grad_norm": 0.9898787140846252, + "learning_rate": 0.00019345510748911056, + "loss": 4.9202, + "step": 45910 + }, + { + "epoch": 0.06452190804714793, + "grad_norm": 0.9590668082237244, + "learning_rate": 
0.00019349726008149502, + "loss": 4.8323, + "step": 45920 + }, + { + "epoch": 0.0645359589853115, + "grad_norm": 0.9680622220039368, + "learning_rate": 0.00019353941267387942, + "loss": 4.9099, + "step": 45930 + }, + { + "epoch": 0.06455000992347508, + "grad_norm": 0.9715376496315002, + "learning_rate": 0.00019358156526626385, + "loss": 4.8441, + "step": 45940 + }, + { + "epoch": 0.06456406086163866, + "grad_norm": 1.0132158994674683, + "learning_rate": 0.00019362371785864828, + "loss": 4.9372, + "step": 45950 + }, + { + "epoch": 0.06457811179980223, + "grad_norm": 1.0213428735733032, + "learning_rate": 0.00019366587045103271, + "loss": 4.9999, + "step": 45960 + }, + { + "epoch": 0.06459216273796581, + "grad_norm": 0.9593140482902527, + "learning_rate": 0.00019370802304341717, + "loss": 4.9006, + "step": 45970 + }, + { + "epoch": 0.06460621367612938, + "grad_norm": 0.9583016037940979, + "learning_rate": 0.0001937501756358016, + "loss": 4.9547, + "step": 45980 + }, + { + "epoch": 0.06462026461429296, + "grad_norm": 1.043340802192688, + "learning_rate": 0.000193792328228186, + "loss": 4.8028, + "step": 45990 + }, + { + "epoch": 0.06463431555245654, + "grad_norm": 0.9959349036216736, + "learning_rate": 0.00019383448082057044, + "loss": 4.8156, + "step": 46000 + }, + { + "epoch": 0.06464836649062013, + "grad_norm": 0.9628585577011108, + "learning_rate": 0.00019387663341295487, + "loss": 4.9479, + "step": 46010 + }, + { + "epoch": 0.0646624174287837, + "grad_norm": 0.9335030317306519, + "learning_rate": 0.0001939187860053393, + "loss": 4.8707, + "step": 46020 + }, + { + "epoch": 0.06467646836694728, + "grad_norm": 0.9447938203811646, + "learning_rate": 0.00019396093859772376, + "loss": 4.8819, + "step": 46030 + }, + { + "epoch": 0.06469051930511086, + "grad_norm": 0.9535121917724609, + "learning_rate": 0.0001940030911901082, + "loss": 4.9952, + "step": 46040 + }, + { + "epoch": 0.06470457024327443, + "grad_norm": 0.9345034956932068, + "learning_rate": 
0.00019404524378249262, + "loss": 4.8327, + "step": 46050 + }, + { + "epoch": 0.06471862118143801, + "grad_norm": 0.9597700238227844, + "learning_rate": 0.00019408739637487703, + "loss": 4.9953, + "step": 46060 + }, + { + "epoch": 0.06473267211960158, + "grad_norm": 0.9878246188163757, + "learning_rate": 0.00019412954896726146, + "loss": 4.9171, + "step": 46070 + }, + { + "epoch": 0.06474672305776516, + "grad_norm": 0.9477070569992065, + "learning_rate": 0.0001941717015596459, + "loss": 4.8915, + "step": 46080 + }, + { + "epoch": 0.06476077399592874, + "grad_norm": 1.2421875, + "learning_rate": 0.00019421385415203035, + "loss": 4.9158, + "step": 46090 + }, + { + "epoch": 0.06477482493409231, + "grad_norm": 0.9720420837402344, + "learning_rate": 0.00019425600674441478, + "loss": 5.008, + "step": 46100 + }, + { + "epoch": 0.0647888758722559, + "grad_norm": 0.9742789268493652, + "learning_rate": 0.0001942981593367992, + "loss": 4.8, + "step": 46110 + }, + { + "epoch": 0.06480292681041948, + "grad_norm": 0.9673479795455933, + "learning_rate": 0.0001943403119291836, + "loss": 4.8455, + "step": 46120 + }, + { + "epoch": 0.06481697774858305, + "grad_norm": 0.9082801938056946, + "learning_rate": 0.00019438246452156804, + "loss": 4.8671, + "step": 46130 + }, + { + "epoch": 0.06483102868674663, + "grad_norm": 0.9455775022506714, + "learning_rate": 0.00019442461711395248, + "loss": 4.8722, + "step": 46140 + }, + { + "epoch": 0.0648450796249102, + "grad_norm": 0.9338746666908264, + "learning_rate": 0.00019446676970633693, + "loss": 4.8412, + "step": 46150 + }, + { + "epoch": 0.06485913056307378, + "grad_norm": 0.976826012134552, + "learning_rate": 0.00019450892229872137, + "loss": 4.7848, + "step": 46160 + }, + { + "epoch": 0.06487318150123736, + "grad_norm": 0.9593846797943115, + "learning_rate": 0.0001945510748911058, + "loss": 4.8292, + "step": 46170 + }, + { + "epoch": 0.06488723243940094, + "grad_norm": 0.9429964423179626, + "learning_rate": 0.00019459322748349023, + 
"loss": 4.8929, + "step": 46180 + }, + { + "epoch": 0.06490128337756451, + "grad_norm": 0.9602091312408447, + "learning_rate": 0.00019463538007587463, + "loss": 4.8737, + "step": 46190 + }, + { + "epoch": 0.06491533431572809, + "grad_norm": 0.9551581740379333, + "learning_rate": 0.00019467753266825906, + "loss": 4.8948, + "step": 46200 + }, + { + "epoch": 0.06492938525389168, + "grad_norm": 0.9924933314323425, + "learning_rate": 0.00019471968526064352, + "loss": 4.9313, + "step": 46210 + }, + { + "epoch": 0.06494343619205525, + "grad_norm": 0.9613593220710754, + "learning_rate": 0.00019476183785302795, + "loss": 4.8588, + "step": 46220 + }, + { + "epoch": 0.06495748713021883, + "grad_norm": 0.9572134017944336, + "learning_rate": 0.00019480399044541238, + "loss": 4.8414, + "step": 46230 + }, + { + "epoch": 0.0649715380683824, + "grad_norm": 0.9708147048950195, + "learning_rate": 0.00019484614303779681, + "loss": 4.8422, + "step": 46240 + }, + { + "epoch": 0.06498558900654598, + "grad_norm": 0.9796114563941956, + "learning_rate": 0.00019488829563018122, + "loss": 4.843, + "step": 46250 + }, + { + "epoch": 0.06499963994470956, + "grad_norm": 0.9643402099609375, + "learning_rate": 0.00019493044822256565, + "loss": 4.9446, + "step": 46260 + }, + { + "epoch": 0.06501369088287313, + "grad_norm": 0.9505442976951599, + "learning_rate": 0.0001949726008149501, + "loss": 5.0311, + "step": 46270 + }, + { + "epoch": 0.06502774182103671, + "grad_norm": 0.9346354007720947, + "learning_rate": 0.00019501475340733454, + "loss": 4.8329, + "step": 46280 + }, + { + "epoch": 0.06504179275920029, + "grad_norm": 1.1094235181808472, + "learning_rate": 0.00019505690599971897, + "loss": 4.9727, + "step": 46290 + }, + { + "epoch": 0.06505584369736386, + "grad_norm": 0.9481832385063171, + "learning_rate": 0.0001950990585921034, + "loss": 4.9542, + "step": 46300 + }, + { + "epoch": 0.06506989463552744, + "grad_norm": 0.9947605729103088, + "learning_rate": 0.00019514121118448783, + "loss": 4.979, 
+ "step": 46310 + }, + { + "epoch": 0.06508394557369103, + "grad_norm": 0.9515612125396729, + "learning_rate": 0.00019518336377687224, + "loss": 4.9198, + "step": 46320 + }, + { + "epoch": 0.0650979965118546, + "grad_norm": 0.9424987435340881, + "learning_rate": 0.0001952255163692567, + "loss": 4.8822, + "step": 46330 + }, + { + "epoch": 0.06511204745001818, + "grad_norm": 1.0159428119659424, + "learning_rate": 0.00019526766896164113, + "loss": 4.7915, + "step": 46340 + }, + { + "epoch": 0.06512609838818176, + "grad_norm": 0.993810772895813, + "learning_rate": 0.00019530982155402556, + "loss": 4.9791, + "step": 46350 + }, + { + "epoch": 0.06514014932634533, + "grad_norm": 1.029266357421875, + "learning_rate": 0.00019535197414641, + "loss": 4.7561, + "step": 46360 + }, + { + "epoch": 0.06515420026450891, + "grad_norm": 0.9550025463104248, + "learning_rate": 0.00019539412673879442, + "loss": 4.9486, + "step": 46370 + }, + { + "epoch": 0.06516825120267249, + "grad_norm": 0.9962372183799744, + "learning_rate": 0.00019543627933117882, + "loss": 4.8646, + "step": 46380 + }, + { + "epoch": 0.06518230214083606, + "grad_norm": 0.9664082527160645, + "learning_rate": 0.00019547843192356328, + "loss": 4.9335, + "step": 46390 + }, + { + "epoch": 0.06519635307899964, + "grad_norm": 1.0118764638900757, + "learning_rate": 0.0001955205845159477, + "loss": 4.8674, + "step": 46400 + }, + { + "epoch": 0.06521040401716321, + "grad_norm": 1.034801959991455, + "learning_rate": 0.00019556273710833214, + "loss": 4.8237, + "step": 46410 + }, + { + "epoch": 0.0652244549553268, + "grad_norm": 0.9353872537612915, + "learning_rate": 0.00019560488970071658, + "loss": 4.9462, + "step": 46420 + }, + { + "epoch": 0.06523850589349038, + "grad_norm": 1.0210577249526978, + "learning_rate": 0.000195647042293101, + "loss": 4.914, + "step": 46430 + }, + { + "epoch": 0.06525255683165396, + "grad_norm": 0.9545332789421082, + "learning_rate": 0.00019568919488548547, + "loss": 4.9091, + "step": 46440 + }, + 
{ + "epoch": 0.06526660776981753, + "grad_norm": 0.9154708981513977, + "learning_rate": 0.00019573134747786987, + "loss": 4.88, + "step": 46450 + }, + { + "epoch": 0.06528065870798111, + "grad_norm": 0.9885104894638062, + "learning_rate": 0.0001957735000702543, + "loss": 4.8147, + "step": 46460 + }, + { + "epoch": 0.06529470964614469, + "grad_norm": 1.0183314085006714, + "learning_rate": 0.00019581565266263873, + "loss": 4.8397, + "step": 46470 + }, + { + "epoch": 0.06530876058430826, + "grad_norm": 0.969292163848877, + "learning_rate": 0.00019585780525502316, + "loss": 4.82, + "step": 46480 + }, + { + "epoch": 0.06532281152247184, + "grad_norm": 0.9941924810409546, + "learning_rate": 0.0001958999578474076, + "loss": 4.8408, + "step": 46490 + }, + { + "epoch": 0.06533686246063541, + "grad_norm": 0.9590076804161072, + "learning_rate": 0.00019594211043979205, + "loss": 4.8889, + "step": 46500 + }, + { + "epoch": 0.06535091339879899, + "grad_norm": 0.9533814787864685, + "learning_rate": 0.00019598426303217646, + "loss": 4.8636, + "step": 46510 + }, + { + "epoch": 0.06536496433696258, + "grad_norm": 0.9973676800727844, + "learning_rate": 0.0001960264156245609, + "loss": 4.8238, + "step": 46520 + }, + { + "epoch": 0.06537901527512616, + "grad_norm": 0.9768134951591492, + "learning_rate": 0.00019606856821694532, + "loss": 4.8206, + "step": 46530 + }, + { + "epoch": 0.06539306621328973, + "grad_norm": 0.9430971145629883, + "learning_rate": 0.00019611072080932975, + "loss": 4.919, + "step": 46540 + }, + { + "epoch": 0.06540711715145331, + "grad_norm": 0.9878526329994202, + "learning_rate": 0.0001961528734017142, + "loss": 4.9512, + "step": 46550 + }, + { + "epoch": 0.06542116808961689, + "grad_norm": 0.9809000492095947, + "learning_rate": 0.00019619502599409864, + "loss": 4.8933, + "step": 46560 + }, + { + "epoch": 0.06543521902778046, + "grad_norm": 0.957922637462616, + "learning_rate": 0.00019623717858648307, + "loss": 4.8238, + "step": 46570 + }, + { + "epoch": 
0.06544926996594404, + "grad_norm": 0.9714646935462952, + "learning_rate": 0.00019627511591962905, + "loss": 4.928, + "step": 46580 + }, + { + "epoch": 0.06546332090410761, + "grad_norm": 0.9626190662384033, + "learning_rate": 0.00019631726851201348, + "loss": 4.7571, + "step": 46590 + }, + { + "epoch": 0.06547737184227119, + "grad_norm": 0.9654691815376282, + "learning_rate": 0.00019635942110439792, + "loss": 4.9651, + "step": 46600 + }, + { + "epoch": 0.06549142278043477, + "grad_norm": 0.9982614517211914, + "learning_rate": 0.00019640157369678232, + "loss": 5.0936, + "step": 46610 + }, + { + "epoch": 0.06550547371859834, + "grad_norm": 0.9625030159950256, + "learning_rate": 0.00019644372628916675, + "loss": 4.9313, + "step": 46620 + }, + { + "epoch": 0.06551952465676193, + "grad_norm": 0.9751782417297363, + "learning_rate": 0.0001964858788815512, + "loss": 4.9248, + "step": 46630 + }, + { + "epoch": 0.06553357559492551, + "grad_norm": 0.9599964022636414, + "learning_rate": 0.00019652803147393564, + "loss": 4.9034, + "step": 46640 + }, + { + "epoch": 0.06554762653308908, + "grad_norm": 0.9146353602409363, + "learning_rate": 0.00019657018406632007, + "loss": 4.9831, + "step": 46650 + }, + { + "epoch": 0.06556167747125266, + "grad_norm": 0.9709069728851318, + "learning_rate": 0.0001966123366587045, + "loss": 4.8672, + "step": 46660 + }, + { + "epoch": 0.06557572840941624, + "grad_norm": 0.9541670083999634, + "learning_rate": 0.0001966544892510889, + "loss": 4.9094, + "step": 46670 + }, + { + "epoch": 0.06558977934757981, + "grad_norm": 0.9834936857223511, + "learning_rate": 0.00019669664184347334, + "loss": 4.7472, + "step": 46680 + }, + { + "epoch": 0.06560383028574339, + "grad_norm": 0.919543445110321, + "learning_rate": 0.0001967387944358578, + "loss": 5.0209, + "step": 46690 + }, + { + "epoch": 0.06561788122390697, + "grad_norm": 0.9145947694778442, + "learning_rate": 0.00019678094702824223, + "loss": 4.7879, + "step": 46700 + }, + { + "epoch": 
0.06563193216207054, + "grad_norm": 0.9651200771331787, + "learning_rate": 0.00019682309962062666, + "loss": 4.756, + "step": 46710 + }, + { + "epoch": 0.06564598310023412, + "grad_norm": 0.9358447790145874, + "learning_rate": 0.0001968652522130111, + "loss": 4.9653, + "step": 46720 + }, + { + "epoch": 0.06566003403839771, + "grad_norm": 0.9291307330131531, + "learning_rate": 0.00019690740480539552, + "loss": 4.8989, + "step": 46730 + }, + { + "epoch": 0.06567408497656128, + "grad_norm": 0.9250601530075073, + "learning_rate": 0.00019694955739777992, + "loss": 4.7435, + "step": 46740 + }, + { + "epoch": 0.06568813591472486, + "grad_norm": 0.9375638961791992, + "learning_rate": 0.00019699170999016438, + "loss": 4.8347, + "step": 46750 + }, + { + "epoch": 0.06570218685288844, + "grad_norm": 0.9673593044281006, + "learning_rate": 0.00019703386258254881, + "loss": 4.8107, + "step": 46760 + }, + { + "epoch": 0.06571623779105201, + "grad_norm": 0.9529427886009216, + "learning_rate": 0.00019707601517493324, + "loss": 4.8632, + "step": 46770 + }, + { + "epoch": 0.06573028872921559, + "grad_norm": 0.9514247179031372, + "learning_rate": 0.00019711816776731768, + "loss": 4.9264, + "step": 46780 + }, + { + "epoch": 0.06574433966737916, + "grad_norm": 1.0021986961364746, + "learning_rate": 0.0001971603203597021, + "loss": 4.8666, + "step": 46790 + }, + { + "epoch": 0.06575839060554274, + "grad_norm": 1.545295000076294, + "learning_rate": 0.0001972024729520865, + "loss": 4.8099, + "step": 46800 + }, + { + "epoch": 0.06577244154370632, + "grad_norm": 0.9640814065933228, + "learning_rate": 0.00019724462554447097, + "loss": 4.9203, + "step": 46810 + }, + { + "epoch": 0.0657864924818699, + "grad_norm": 0.9138042330741882, + "learning_rate": 0.0001972867781368554, + "loss": 5.0355, + "step": 46820 + }, + { + "epoch": 0.06580054342003348, + "grad_norm": 0.9734580516815186, + "learning_rate": 0.00019732893072923983, + "loss": 4.8711, + "step": 46830 + }, + { + "epoch": 
0.06581459435819706, + "grad_norm": 1.0923768281936646, + "learning_rate": 0.00019737108332162426, + "loss": 4.9167, + "step": 46840 + }, + { + "epoch": 0.06582864529636064, + "grad_norm": 1.087605595588684, + "learning_rate": 0.0001974132359140087, + "loss": 4.9004, + "step": 46850 + }, + { + "epoch": 0.06584269623452421, + "grad_norm": 0.9761277437210083, + "learning_rate": 0.00019745538850639315, + "loss": 4.9228, + "step": 46860 + }, + { + "epoch": 0.06585674717268779, + "grad_norm": 1.0201919078826904, + "learning_rate": 0.00019749754109877756, + "loss": 4.9418, + "step": 46870 + }, + { + "epoch": 0.06587079811085136, + "grad_norm": 0.9499254822731018, + "learning_rate": 0.000197539693691162, + "loss": 4.9016, + "step": 46880 + }, + { + "epoch": 0.06588484904901494, + "grad_norm": 0.9977640509605408, + "learning_rate": 0.00019758184628354642, + "loss": 4.8839, + "step": 46890 + }, + { + "epoch": 0.06589889998717852, + "grad_norm": 0.9540080428123474, + "learning_rate": 0.00019762399887593085, + "loss": 4.8135, + "step": 46900 + }, + { + "epoch": 0.06591295092534209, + "grad_norm": 0.9515107274055481, + "learning_rate": 0.00019766615146831528, + "loss": 4.7421, + "step": 46910 + }, + { + "epoch": 0.06592700186350567, + "grad_norm": 0.9759949445724487, + "learning_rate": 0.00019770830406069974, + "loss": 4.9691, + "step": 46920 + }, + { + "epoch": 0.06594105280166924, + "grad_norm": 0.9368551969528198, + "learning_rate": 0.00019775045665308414, + "loss": 4.9767, + "step": 46930 + }, + { + "epoch": 0.06595510373983283, + "grad_norm": 0.9469462037086487, + "learning_rate": 0.00019779260924546857, + "loss": 4.8049, + "step": 46940 + }, + { + "epoch": 0.06596915467799641, + "grad_norm": 0.979142427444458, + "learning_rate": 0.000197834761837853, + "loss": 4.8257, + "step": 46950 + }, + { + "epoch": 0.06598320561615999, + "grad_norm": 0.9656456112861633, + "learning_rate": 0.00019787691443023744, + "loss": 4.9793, + "step": 46960 + }, + { + "epoch": 
0.06599725655432356, + "grad_norm": 0.9597137570381165, + "learning_rate": 0.00019791906702262187, + "loss": 4.882, + "step": 46970 + }, + { + "epoch": 0.06601130749248714, + "grad_norm": 0.9437071084976196, + "learning_rate": 0.00019796121961500633, + "loss": 4.8506, + "step": 46980 + }, + { + "epoch": 0.06602535843065072, + "grad_norm": 0.9833372235298157, + "learning_rate": 0.00019800337220739076, + "loss": 4.8654, + "step": 46990 + }, + { + "epoch": 0.06603940936881429, + "grad_norm": 0.9324950575828552, + "learning_rate": 0.00019804552479977516, + "loss": 4.8477, + "step": 47000 + }, + { + "epoch": 0.06605346030697787, + "grad_norm": 0.9253446459770203, + "learning_rate": 0.0001980876773921596, + "loss": 4.831, + "step": 47010 + }, + { + "epoch": 0.06606751124514144, + "grad_norm": 0.9337993264198303, + "learning_rate": 0.00019812982998454402, + "loss": 4.8415, + "step": 47020 + }, + { + "epoch": 0.06608156218330502, + "grad_norm": 0.9470283389091492, + "learning_rate": 0.00019817198257692846, + "loss": 4.9581, + "step": 47030 + }, + { + "epoch": 0.06609561312146861, + "grad_norm": 0.9741782546043396, + "learning_rate": 0.00019821413516931291, + "loss": 4.9539, + "step": 47040 + }, + { + "epoch": 0.06610966405963219, + "grad_norm": 0.9608268141746521, + "learning_rate": 0.00019825628776169734, + "loss": 4.8291, + "step": 47050 + }, + { + "epoch": 0.06612371499779576, + "grad_norm": 1.0971790552139282, + "learning_rate": 0.00019829844035408175, + "loss": 4.8117, + "step": 47060 + }, + { + "epoch": 0.06613776593595934, + "grad_norm": 0.9461948871612549, + "learning_rate": 0.00019834059294646618, + "loss": 4.9276, + "step": 47070 + }, + { + "epoch": 0.06615181687412292, + "grad_norm": 0.9624106287956238, + "learning_rate": 0.0001983827455388506, + "loss": 4.8393, + "step": 47080 + }, + { + "epoch": 0.06616586781228649, + "grad_norm": 0.9574129581451416, + "learning_rate": 0.00019842489813123504, + "loss": 4.7859, + "step": 47090 + }, + { + "epoch": 
0.06617991875045007, + "grad_norm": 0.9906713962554932, + "learning_rate": 0.0001984670507236195, + "loss": 4.9673, + "step": 47100 + }, + { + "epoch": 0.06619396968861364, + "grad_norm": 0.9565755128860474, + "learning_rate": 0.00019850920331600393, + "loss": 4.9318, + "step": 47110 + }, + { + "epoch": 0.06620802062677722, + "grad_norm": 0.9409468770027161, + "learning_rate": 0.00019855135590838834, + "loss": 4.7968, + "step": 47120 + }, + { + "epoch": 0.0662220715649408, + "grad_norm": 0.9226107001304626, + "learning_rate": 0.00019859350850077277, + "loss": 4.7978, + "step": 47130 + }, + { + "epoch": 0.06623612250310439, + "grad_norm": 0.919445812702179, + "learning_rate": 0.0001986356610931572, + "loss": 4.9299, + "step": 47140 + }, + { + "epoch": 0.06625017344126796, + "grad_norm": 0.9550936818122864, + "learning_rate": 0.00019867781368554163, + "loss": 4.8339, + "step": 47150 + }, + { + "epoch": 0.06626422437943154, + "grad_norm": 0.9679076075553894, + "learning_rate": 0.0001987199662779261, + "loss": 4.912, + "step": 47160 + }, + { + "epoch": 0.06627827531759511, + "grad_norm": 0.9844717979431152, + "learning_rate": 0.00019876211887031052, + "loss": 4.9034, + "step": 47170 + }, + { + "epoch": 0.06629232625575869, + "grad_norm": 0.9533633589744568, + "learning_rate": 0.00019880427146269495, + "loss": 4.9254, + "step": 47180 + }, + { + "epoch": 0.06630637719392227, + "grad_norm": 0.9623854160308838, + "learning_rate": 0.00019884642405507935, + "loss": 4.9148, + "step": 47190 + }, + { + "epoch": 0.06632042813208584, + "grad_norm": 0.9844478964805603, + "learning_rate": 0.00019888857664746379, + "loss": 4.901, + "step": 47200 + }, + { + "epoch": 0.06633447907024942, + "grad_norm": 0.9062860012054443, + "learning_rate": 0.00019893072923984824, + "loss": 4.8345, + "step": 47210 + }, + { + "epoch": 0.066348530008413, + "grad_norm": 0.9490434527397156, + "learning_rate": 0.00019897288183223267, + "loss": 4.8597, + "step": 47220 + }, + { + "epoch": 
0.06636258094657657, + "grad_norm": 0.9500983357429504, + "learning_rate": 0.0001990150344246171, + "loss": 4.8869, + "step": 47230 + }, + { + "epoch": 0.06637663188474015, + "grad_norm": 0.9309141039848328, + "learning_rate": 0.00019905718701700154, + "loss": 4.864, + "step": 47240 + }, + { + "epoch": 0.06639068282290374, + "grad_norm": 1.0119329690933228, + "learning_rate": 0.00019909933960938594, + "loss": 4.8581, + "step": 47250 + }, + { + "epoch": 0.06640473376106731, + "grad_norm": 0.9545915126800537, + "learning_rate": 0.00019914149220177037, + "loss": 5.0142, + "step": 47260 + }, + { + "epoch": 0.06641878469923089, + "grad_norm": 0.948043167591095, + "learning_rate": 0.00019918364479415483, + "loss": 4.9292, + "step": 47270 + }, + { + "epoch": 0.06643283563739447, + "grad_norm": 0.9219819903373718, + "learning_rate": 0.00019922579738653926, + "loss": 4.9284, + "step": 47280 + }, + { + "epoch": 0.06644688657555804, + "grad_norm": 0.9398238658905029, + "learning_rate": 0.0001992679499789237, + "loss": 4.8902, + "step": 47290 + }, + { + "epoch": 0.06646093751372162, + "grad_norm": 0.9383578896522522, + "learning_rate": 0.00019931010257130812, + "loss": 4.827, + "step": 47300 + }, + { + "epoch": 0.0664749884518852, + "grad_norm": 0.9550478458404541, + "learning_rate": 0.00019935225516369256, + "loss": 5.0553, + "step": 47310 + }, + { + "epoch": 0.06648903939004877, + "grad_norm": 0.9616899490356445, + "learning_rate": 0.00019939440775607696, + "loss": 4.8483, + "step": 47320 + }, + { + "epoch": 0.06650309032821235, + "grad_norm": 0.9487091898918152, + "learning_rate": 0.00019943656034846142, + "loss": 4.9295, + "step": 47330 + }, + { + "epoch": 0.06651714126637592, + "grad_norm": 0.939850926399231, + "learning_rate": 0.00019947871294084585, + "loss": 4.7803, + "step": 47340 + }, + { + "epoch": 0.06653119220453951, + "grad_norm": 0.9599955081939697, + "learning_rate": 0.00019952086553323028, + "loss": 4.8776, + "step": 47350 + }, + { + "epoch": 
0.06654524314270309, + "grad_norm": 0.9502904415130615, + "learning_rate": 0.0001995630181256147, + "loss": 4.8173, + "step": 47360 + }, + { + "epoch": 0.06655929408086667, + "grad_norm": 0.9396896958351135, + "learning_rate": 0.00019960517071799914, + "loss": 4.9296, + "step": 47370 + }, + { + "epoch": 0.06657334501903024, + "grad_norm": 0.9196685552597046, + "learning_rate": 0.00019964732331038355, + "loss": 5.0329, + "step": 47380 + }, + { + "epoch": 0.06658739595719382, + "grad_norm": 0.9280641078948975, + "learning_rate": 0.000199689475902768, + "loss": 4.8225, + "step": 47390 + }, + { + "epoch": 0.0666014468953574, + "grad_norm": 0.9672068357467651, + "learning_rate": 0.00019973162849515244, + "loss": 4.8844, + "step": 47400 + }, + { + "epoch": 0.06661549783352097, + "grad_norm": 0.9293968677520752, + "learning_rate": 0.00019977378108753687, + "loss": 4.828, + "step": 47410 + }, + { + "epoch": 0.06662954877168455, + "grad_norm": 0.9214613437652588, + "learning_rate": 0.0001998159336799213, + "loss": 4.8509, + "step": 47420 + }, + { + "epoch": 0.06664359970984812, + "grad_norm": 0.9712655544281006, + "learning_rate": 0.00019985808627230573, + "loss": 4.8579, + "step": 47430 + }, + { + "epoch": 0.0666576506480117, + "grad_norm": 0.9182402491569519, + "learning_rate": 0.0001999002388646902, + "loss": 4.789, + "step": 47440 + }, + { + "epoch": 0.06667170158617529, + "grad_norm": 0.9156343936920166, + "learning_rate": 0.0001999423914570746, + "loss": 4.8536, + "step": 47450 + }, + { + "epoch": 0.06668575252433886, + "grad_norm": 0.9341609477996826, + "learning_rate": 0.00019998454404945902, + "loss": 4.989, + "step": 47460 + }, + { + "epoch": 0.06669980346250244, + "grad_norm": 0.9106461405754089, + "learning_rate": 0.00020002669664184345, + "loss": 4.9627, + "step": 47470 + }, + { + "epoch": 0.06671385440066602, + "grad_norm": 1.2454720735549927, + "learning_rate": 0.00020006884923422788, + "loss": 4.8649, + "step": 47480 + }, + { + "epoch": 0.0667279053388296, + 
"grad_norm": 0.9427316784858704, + "learning_rate": 0.00020011100182661232, + "loss": 4.8691, + "step": 47490 + }, + { + "epoch": 0.06674195627699317, + "grad_norm": 0.9439139366149902, + "learning_rate": 0.00020015315441899677, + "loss": 4.9893, + "step": 47500 + }, + { + "epoch": 0.06675600721515675, + "grad_norm": 0.9804021120071411, + "learning_rate": 0.00020019530701138118, + "loss": 5.011, + "step": 47510 + }, + { + "epoch": 0.06677005815332032, + "grad_norm": 0.9366146922111511, + "learning_rate": 0.0002002374596037656, + "loss": 4.9722, + "step": 47520 + }, + { + "epoch": 0.0667841090914839, + "grad_norm": 0.9678098559379578, + "learning_rate": 0.00020027961219615004, + "loss": 4.91, + "step": 47530 + }, + { + "epoch": 0.06679816002964747, + "grad_norm": 0.97898268699646, + "learning_rate": 0.00020032176478853447, + "loss": 4.8503, + "step": 47540 + }, + { + "epoch": 0.06681221096781105, + "grad_norm": 0.9911873936653137, + "learning_rate": 0.0002003639173809189, + "loss": 4.8557, + "step": 47550 + }, + { + "epoch": 0.06682626190597464, + "grad_norm": 0.9423503875732422, + "learning_rate": 0.00020040606997330336, + "loss": 4.783, + "step": 47560 + }, + { + "epoch": 0.06684031284413822, + "grad_norm": 0.956548810005188, + "learning_rate": 0.0002004482225656878, + "loss": 5.0057, + "step": 47570 + }, + { + "epoch": 0.06685436378230179, + "grad_norm": 0.9642254114151001, + "learning_rate": 0.0002004903751580722, + "loss": 4.9451, + "step": 47580 + }, + { + "epoch": 0.06686841472046537, + "grad_norm": 0.9153860211372375, + "learning_rate": 0.00020053252775045663, + "loss": 4.8504, + "step": 47590 + }, + { + "epoch": 0.06688246565862895, + "grad_norm": 0.9987194538116455, + "learning_rate": 0.00020057468034284106, + "loss": 4.7961, + "step": 47600 + }, + { + "epoch": 0.06689651659679252, + "grad_norm": 0.8986924290657043, + "learning_rate": 0.0002006168329352255, + "loss": 4.8971, + "step": 47610 + }, + { + "epoch": 0.0669105675349561, + "grad_norm": 
0.9505689740180969, + "learning_rate": 0.00020065898552760995, + "loss": 4.9268, + "step": 47620 + }, + { + "epoch": 0.06692461847311967, + "grad_norm": 0.9821462631225586, + "learning_rate": 0.00020070113811999438, + "loss": 4.8518, + "step": 47630 + }, + { + "epoch": 0.06693866941128325, + "grad_norm": 0.9475671648979187, + "learning_rate": 0.00020074329071237878, + "loss": 4.9195, + "step": 47640 + }, + { + "epoch": 0.06695272034944683, + "grad_norm": 0.9619015455245972, + "learning_rate": 0.00020078544330476321, + "loss": 4.8156, + "step": 47650 + }, + { + "epoch": 0.06696677128761042, + "grad_norm": 0.9441516995429993, + "learning_rate": 0.00020082759589714765, + "loss": 4.9198, + "step": 47660 + }, + { + "epoch": 0.06698082222577399, + "grad_norm": 0.923410177230835, + "learning_rate": 0.00020086974848953208, + "loss": 4.9305, + "step": 47670 + }, + { + "epoch": 0.06699487316393757, + "grad_norm": 0.9332021474838257, + "learning_rate": 0.00020091190108191654, + "loss": 4.7413, + "step": 47680 + }, + { + "epoch": 0.06700892410210114, + "grad_norm": 0.9328610897064209, + "learning_rate": 0.00020095405367430097, + "loss": 4.847, + "step": 47690 + }, + { + "epoch": 0.06702297504026472, + "grad_norm": 0.9511117339134216, + "learning_rate": 0.0002009962062666854, + "loss": 4.8878, + "step": 47700 + }, + { + "epoch": 0.0670370259784283, + "grad_norm": 0.9161847829818726, + "learning_rate": 0.0002010383588590698, + "loss": 4.9897, + "step": 47710 + }, + { + "epoch": 0.06705107691659187, + "grad_norm": 0.9360916614532471, + "learning_rate": 0.00020108051145145423, + "loss": 4.9512, + "step": 47720 + }, + { + "epoch": 0.06706512785475545, + "grad_norm": 1.0616661310195923, + "learning_rate": 0.00020112266404383866, + "loss": 4.9006, + "step": 47730 + }, + { + "epoch": 0.06707917879291903, + "grad_norm": 0.9333809614181519, + "learning_rate": 0.00020116481663622312, + "loss": 4.7788, + "step": 47740 + }, + { + "epoch": 0.0670932297310826, + "grad_norm": 
0.9299507737159729, + "learning_rate": 0.00020120696922860755, + "loss": 4.9081, + "step": 47750 + }, + { + "epoch": 0.06710728066924619, + "grad_norm": 0.961584746837616, + "learning_rate": 0.00020124912182099198, + "loss": 4.8686, + "step": 47760 + }, + { + "epoch": 0.06712133160740977, + "grad_norm": 0.9510294198989868, + "learning_rate": 0.0002012912744133764, + "loss": 4.8925, + "step": 47770 + }, + { + "epoch": 0.06713538254557334, + "grad_norm": 0.9562880396842957, + "learning_rate": 0.00020133342700576082, + "loss": 4.8545, + "step": 47780 + }, + { + "epoch": 0.06714943348373692, + "grad_norm": 0.9377868175506592, + "learning_rate": 0.00020137557959814528, + "loss": 4.8412, + "step": 47790 + }, + { + "epoch": 0.0671634844219005, + "grad_norm": 0.9249438643455505, + "learning_rate": 0.0002014177321905297, + "loss": 4.9385, + "step": 47800 + }, + { + "epoch": 0.06717753536006407, + "grad_norm": 0.9281932711601257, + "learning_rate": 0.00020145988478291414, + "loss": 4.8762, + "step": 47810 + }, + { + "epoch": 0.06719158629822765, + "grad_norm": 0.9970051050186157, + "learning_rate": 0.00020150203737529857, + "loss": 4.7625, + "step": 47820 + }, + { + "epoch": 0.06720563723639122, + "grad_norm": 0.9395272731781006, + "learning_rate": 0.00020154418996768298, + "loss": 4.753, + "step": 47830 + }, + { + "epoch": 0.0672196881745548, + "grad_norm": 1.0339237451553345, + "learning_rate": 0.0002015863425600674, + "loss": 4.8091, + "step": 47840 + }, + { + "epoch": 0.06723373911271838, + "grad_norm": 0.9100208282470703, + "learning_rate": 0.00020162849515245187, + "loss": 5.0206, + "step": 47850 + }, + { + "epoch": 0.06724779005088195, + "grad_norm": 0.9560271501541138, + "learning_rate": 0.0002016706477448363, + "loss": 4.9883, + "step": 47860 + }, + { + "epoch": 0.06726184098904554, + "grad_norm": 0.9476458430290222, + "learning_rate": 0.00020171280033722073, + "loss": 4.7588, + "step": 47870 + }, + { + "epoch": 0.06727589192720912, + "grad_norm": 
0.9497449398040771, + "learning_rate": 0.00020175495292960516, + "loss": 4.8974, + "step": 47880 + }, + { + "epoch": 0.0672899428653727, + "grad_norm": 0.9444364309310913, + "learning_rate": 0.0002017971055219896, + "loss": 4.884, + "step": 47890 + }, + { + "epoch": 0.06730399380353627, + "grad_norm": 0.9414916634559631, + "learning_rate": 0.000201839258114374, + "loss": 4.9842, + "step": 47900 + }, + { + "epoch": 0.06731804474169985, + "grad_norm": 0.9677934646606445, + "learning_rate": 0.00020188141070675845, + "loss": 4.8616, + "step": 47910 + }, + { + "epoch": 0.06733209567986342, + "grad_norm": 0.9179354906082153, + "learning_rate": 0.00020192356329914288, + "loss": 4.9397, + "step": 47920 + }, + { + "epoch": 0.067346146618027, + "grad_norm": 0.9381024241447449, + "learning_rate": 0.00020196571589152731, + "loss": 4.957, + "step": 47930 + }, + { + "epoch": 0.06736019755619058, + "grad_norm": 0.9283067584037781, + "learning_rate": 0.00020200786848391175, + "loss": 4.6955, + "step": 47940 + }, + { + "epoch": 0.06737424849435415, + "grad_norm": 0.9692529439926147, + "learning_rate": 0.00020205002107629618, + "loss": 4.8843, + "step": 47950 + }, + { + "epoch": 0.06738829943251773, + "grad_norm": 0.9457141160964966, + "learning_rate": 0.00020209217366868058, + "loss": 4.955, + "step": 47960 + }, + { + "epoch": 0.06740235037068132, + "grad_norm": 0.9691393375396729, + "learning_rate": 0.00020213432626106504, + "loss": 4.8771, + "step": 47970 + }, + { + "epoch": 0.0674164013088449, + "grad_norm": 0.9532297253608704, + "learning_rate": 0.00020217647885344947, + "loss": 4.7826, + "step": 47980 + }, + { + "epoch": 0.06743045224700847, + "grad_norm": 0.9230175614356995, + "learning_rate": 0.0002022186314458339, + "loss": 4.8278, + "step": 47990 + }, + { + "epoch": 0.06744450318517205, + "grad_norm": 0.9743686318397522, + "learning_rate": 0.00020226078403821833, + "loss": 4.9041, + "step": 48000 + }, + { + "epoch": 0.06745855412333562, + "grad_norm": 0.942179799079895, + 
"learning_rate": 0.00020230293663060276, + "loss": 4.8774, + "step": 48010 + }, + { + "epoch": 0.0674726050614992, + "grad_norm": 0.9336681962013245, + "learning_rate": 0.00020234508922298722, + "loss": 4.9366, + "step": 48020 + }, + { + "epoch": 0.06748665599966278, + "grad_norm": 1.0104656219482422, + "learning_rate": 0.00020238724181537163, + "loss": 4.9661, + "step": 48030 + }, + { + "epoch": 0.06750070693782635, + "grad_norm": 0.9617270827293396, + "learning_rate": 0.00020242939440775606, + "loss": 4.8729, + "step": 48040 + }, + { + "epoch": 0.06751475787598993, + "grad_norm": 0.8925577402114868, + "learning_rate": 0.0002024715470001405, + "loss": 4.9942, + "step": 48050 + }, + { + "epoch": 0.0675288088141535, + "grad_norm": 0.9957881569862366, + "learning_rate": 0.00020251369959252492, + "loss": 4.8822, + "step": 48060 + }, + { + "epoch": 0.0675428597523171, + "grad_norm": 0.8969820737838745, + "learning_rate": 0.00020255585218490935, + "loss": 4.8443, + "step": 48070 + }, + { + "epoch": 0.06755691069048067, + "grad_norm": 0.8763751983642578, + "learning_rate": 0.0002025980047772938, + "loss": 4.8714, + "step": 48080 + }, + { + "epoch": 0.06757096162864425, + "grad_norm": 0.9305700659751892, + "learning_rate": 0.0002026401573696782, + "loss": 4.8867, + "step": 48090 + }, + { + "epoch": 0.06758501256680782, + "grad_norm": 0.9393258690834045, + "learning_rate": 0.00020268230996206264, + "loss": 4.9085, + "step": 48100 + }, + { + "epoch": 0.0675990635049714, + "grad_norm": 0.9406274557113647, + "learning_rate": 0.00020272446255444708, + "loss": 4.9558, + "step": 48110 + }, + { + "epoch": 0.06761311444313498, + "grad_norm": 0.9472900032997131, + "learning_rate": 0.0002027666151468315, + "loss": 4.8328, + "step": 48120 + }, + { + "epoch": 0.06762716538129855, + "grad_norm": 0.9359706044197083, + "learning_rate": 0.00020280876773921594, + "loss": 4.9482, + "step": 48130 + }, + { + "epoch": 0.06764121631946213, + "grad_norm": 0.9522212743759155, + "learning_rate": 
0.0002028509203316004, + "loss": 4.8838, + "step": 48140 + }, + { + "epoch": 0.0676552672576257, + "grad_norm": 0.9575254917144775, + "learning_rate": 0.00020289307292398483, + "loss": 4.9737, + "step": 48150 + }, + { + "epoch": 0.06766931819578928, + "grad_norm": 0.9331732988357544, + "learning_rate": 0.00020293522551636923, + "loss": 4.7875, + "step": 48160 + }, + { + "epoch": 0.06768336913395286, + "grad_norm": 0.9446621537208557, + "learning_rate": 0.00020297737810875366, + "loss": 4.8592, + "step": 48170 + }, + { + "epoch": 0.06769742007211645, + "grad_norm": 0.9311910271644592, + "learning_rate": 0.0002030195307011381, + "loss": 4.8407, + "step": 48180 + }, + { + "epoch": 0.06771147101028002, + "grad_norm": 0.9321035146713257, + "learning_rate": 0.00020306168329352252, + "loss": 4.9385, + "step": 48190 + }, + { + "epoch": 0.0677255219484436, + "grad_norm": 0.9516061544418335, + "learning_rate": 0.00020310383588590698, + "loss": 4.9021, + "step": 48200 + }, + { + "epoch": 0.06773957288660717, + "grad_norm": 0.9016589522361755, + "learning_rate": 0.00020314598847829141, + "loss": 5.0427, + "step": 48210 + }, + { + "epoch": 0.06775362382477075, + "grad_norm": 0.9039486646652222, + "learning_rate": 0.00020318814107067582, + "loss": 4.9687, + "step": 48220 + }, + { + "epoch": 0.06776767476293433, + "grad_norm": 0.9151666164398193, + "learning_rate": 0.00020323029366306025, + "loss": 4.9319, + "step": 48230 + }, + { + "epoch": 0.0677817257010979, + "grad_norm": 0.9722932577133179, + "learning_rate": 0.00020327244625544468, + "loss": 4.9052, + "step": 48240 + }, + { + "epoch": 0.06779577663926148, + "grad_norm": 0.9459034204483032, + "learning_rate": 0.0002033145988478291, + "loss": 4.7892, + "step": 48250 + }, + { + "epoch": 0.06780982757742506, + "grad_norm": 0.9252526760101318, + "learning_rate": 0.00020335675144021357, + "loss": 4.8437, + "step": 48260 + }, + { + "epoch": 0.06782387851558863, + "grad_norm": 0.9979690909385681, + "learning_rate": 
0.000203398904032598, + "loss": 4.9336, + "step": 48270 + }, + { + "epoch": 0.06783792945375222, + "grad_norm": 1.0930742025375366, + "learning_rate": 0.00020344105662498243, + "loss": 4.8809, + "step": 48280 + }, + { + "epoch": 0.0678519803919158, + "grad_norm": 0.9363275766372681, + "learning_rate": 0.00020348320921736684, + "loss": 4.829, + "step": 48290 + }, + { + "epoch": 0.06786603133007937, + "grad_norm": 0.9431924223899841, + "learning_rate": 0.00020352536180975127, + "loss": 4.8887, + "step": 48300 + }, + { + "epoch": 0.06788008226824295, + "grad_norm": 0.949468731880188, + "learning_rate": 0.0002035675144021357, + "loss": 4.9473, + "step": 48310 + }, + { + "epoch": 0.06789413320640653, + "grad_norm": 0.9422009587287903, + "learning_rate": 0.00020360966699452016, + "loss": 4.8516, + "step": 48320 + }, + { + "epoch": 0.0679081841445701, + "grad_norm": 0.9336867928504944, + "learning_rate": 0.0002036518195869046, + "loss": 4.8706, + "step": 48330 + }, + { + "epoch": 0.06792223508273368, + "grad_norm": 0.9072600603103638, + "learning_rate": 0.00020369397217928902, + "loss": 4.9884, + "step": 48340 + }, + { + "epoch": 0.06793628602089725, + "grad_norm": 0.9766586422920227, + "learning_rate": 0.00020373612477167342, + "loss": 4.8165, + "step": 48350 + }, + { + "epoch": 0.06795033695906083, + "grad_norm": 0.9324946403503418, + "learning_rate": 0.00020377827736405785, + "loss": 4.8802, + "step": 48360 + }, + { + "epoch": 0.06796438789722441, + "grad_norm": 0.949245035648346, + "learning_rate": 0.00020382042995644229, + "loss": 4.8723, + "step": 48370 + }, + { + "epoch": 0.067978438835388, + "grad_norm": 0.9230749011039734, + "learning_rate": 0.00020386258254882674, + "loss": 4.8103, + "step": 48380 + }, + { + "epoch": 0.06799248977355157, + "grad_norm": 0.9191190004348755, + "learning_rate": 0.00020390473514121118, + "loss": 4.8711, + "step": 48390 + }, + { + "epoch": 0.06800654071171515, + "grad_norm": 0.9459436535835266, + "learning_rate": 
0.0002039468877335956, + "loss": 4.8038, + "step": 48400 + }, + { + "epoch": 0.06802059164987873, + "grad_norm": 0.9071534872055054, + "learning_rate": 0.00020398904032598004, + "loss": 4.8562, + "step": 48410 + }, + { + "epoch": 0.0680346425880423, + "grad_norm": 0.9326480627059937, + "learning_rate": 0.00020403119291836444, + "loss": 4.9268, + "step": 48420 + }, + { + "epoch": 0.06804869352620588, + "grad_norm": 0.9467244744300842, + "learning_rate": 0.0002040733455107489, + "loss": 4.92, + "step": 48430 + }, + { + "epoch": 0.06806274446436945, + "grad_norm": 0.9642537832260132, + "learning_rate": 0.00020411549810313333, + "loss": 4.7312, + "step": 48440 + }, + { + "epoch": 0.06807679540253303, + "grad_norm": 0.9437421560287476, + "learning_rate": 0.00020415765069551776, + "loss": 4.8087, + "step": 48450 + }, + { + "epoch": 0.0680908463406966, + "grad_norm": 0.9311556816101074, + "learning_rate": 0.0002041998032879022, + "loss": 4.8312, + "step": 48460 + }, + { + "epoch": 0.06810489727886018, + "grad_norm": 1.035953164100647, + "learning_rate": 0.00020424195588028662, + "loss": 4.9495, + "step": 48470 + }, + { + "epoch": 0.06811894821702376, + "grad_norm": 0.9514090418815613, + "learning_rate": 0.00020428410847267103, + "loss": 4.8289, + "step": 48480 + }, + { + "epoch": 0.06813299915518735, + "grad_norm": 1.0147558450698853, + "learning_rate": 0.0002043262610650555, + "loss": 4.793, + "step": 48490 + }, + { + "epoch": 0.06814705009335092, + "grad_norm": 0.947508692741394, + "learning_rate": 0.00020436841365743992, + "loss": 4.836, + "step": 48500 + }, + { + "epoch": 0.0681611010315145, + "grad_norm": 0.9145636558532715, + "learning_rate": 0.00020441056624982435, + "loss": 4.7227, + "step": 48510 + }, + { + "epoch": 0.06817515196967808, + "grad_norm": 0.9304710030555725, + "learning_rate": 0.00020445271884220878, + "loss": 4.9376, + "step": 48520 + }, + { + "epoch": 0.06818920290784165, + "grad_norm": 0.9380136728286743, + "learning_rate": 0.0002044948714345932, 
+ "loss": 4.9731, + "step": 48530 + }, + { + "epoch": 0.06820325384600523, + "grad_norm": 0.9320715665817261, + "learning_rate": 0.00020453702402697762, + "loss": 4.7998, + "step": 48540 + }, + { + "epoch": 0.0682173047841688, + "grad_norm": 0.939115047454834, + "learning_rate": 0.00020457917661936207, + "loss": 4.9202, + "step": 48550 + }, + { + "epoch": 0.06823135572233238, + "grad_norm": 0.9177758693695068, + "learning_rate": 0.0002046213292117465, + "loss": 4.865, + "step": 48560 + }, + { + "epoch": 0.06824540666049596, + "grad_norm": 0.9489643573760986, + "learning_rate": 0.00020466348180413094, + "loss": 4.8896, + "step": 48570 + }, + { + "epoch": 0.06825945759865953, + "grad_norm": 0.9503336548805237, + "learning_rate": 0.00020470563439651537, + "loss": 4.7754, + "step": 48580 + }, + { + "epoch": 0.06827350853682312, + "grad_norm": 0.920091986656189, + "learning_rate": 0.0002047477869888998, + "loss": 4.9028, + "step": 48590 + }, + { + "epoch": 0.0682875594749867, + "grad_norm": 0.9541656970977783, + "learning_rate": 0.00020478993958128426, + "loss": 4.8475, + "step": 48600 + }, + { + "epoch": 0.06830161041315028, + "grad_norm": 0.9243221879005432, + "learning_rate": 0.00020483209217366866, + "loss": 4.8828, + "step": 48610 + }, + { + "epoch": 0.06831566135131385, + "grad_norm": 0.9231835603713989, + "learning_rate": 0.0002048742447660531, + "loss": 4.9098, + "step": 48620 + }, + { + "epoch": 0.06832971228947743, + "grad_norm": 0.9024877548217773, + "learning_rate": 0.00020491639735843752, + "loss": 4.8797, + "step": 48630 + }, + { + "epoch": 0.068343763227641, + "grad_norm": 0.9409641623497009, + "learning_rate": 0.00020495854995082195, + "loss": 4.864, + "step": 48640 + }, + { + "epoch": 0.06835781416580458, + "grad_norm": 0.9287962317466736, + "learning_rate": 0.00020500070254320639, + "loss": 4.8829, + "step": 48650 + }, + { + "epoch": 0.06837186510396816, + "grad_norm": 0.9256884455680847, + "learning_rate": 0.00020504285513559084, + "loss": 4.7937, + 
"step": 48660 + }, + { + "epoch": 0.06838591604213173, + "grad_norm": 0.9251719117164612, + "learning_rate": 0.00020508500772797525, + "loss": 4.9763, + "step": 48670 + }, + { + "epoch": 0.06839996698029531, + "grad_norm": 0.9523289203643799, + "learning_rate": 0.00020512716032035968, + "loss": 4.9023, + "step": 48680 + }, + { + "epoch": 0.0684140179184589, + "grad_norm": 0.9629274606704712, + "learning_rate": 0.0002051693129127441, + "loss": 4.9088, + "step": 48690 + }, + { + "epoch": 0.06842806885662248, + "grad_norm": 0.9680508971214294, + "learning_rate": 0.00020521146550512854, + "loss": 4.9068, + "step": 48700 + }, + { + "epoch": 0.06844211979478605, + "grad_norm": 0.989648699760437, + "learning_rate": 0.00020525361809751297, + "loss": 4.7496, + "step": 48710 + }, + { + "epoch": 0.06845617073294963, + "grad_norm": 0.9049251079559326, + "learning_rate": 0.00020529577068989743, + "loss": 4.9653, + "step": 48720 + }, + { + "epoch": 0.0684702216711132, + "grad_norm": 0.9711537957191467, + "learning_rate": 0.00020533792328228186, + "loss": 4.9024, + "step": 48730 + }, + { + "epoch": 0.06848427260927678, + "grad_norm": 0.987939178943634, + "learning_rate": 0.00020538007587466627, + "loss": 4.8989, + "step": 48740 + }, + { + "epoch": 0.06849832354744036, + "grad_norm": 0.9508666396141052, + "learning_rate": 0.0002054222284670507, + "loss": 4.8888, + "step": 48750 + }, + { + "epoch": 0.06851237448560393, + "grad_norm": 1.0984941720962524, + "learning_rate": 0.00020546438105943513, + "loss": 4.8828, + "step": 48760 + }, + { + "epoch": 0.06852642542376751, + "grad_norm": 0.9066498875617981, + "learning_rate": 0.00020550653365181956, + "loss": 4.7879, + "step": 48770 + }, + { + "epoch": 0.06854047636193109, + "grad_norm": 0.957650899887085, + "learning_rate": 0.00020554868624420402, + "loss": 4.9034, + "step": 48780 + }, + { + "epoch": 0.06855452730009466, + "grad_norm": 0.9184542894363403, + "learning_rate": 0.00020559083883658845, + "loss": 4.9151, + "step": 48790 + 
}, + { + "epoch": 0.06856857823825825, + "grad_norm": 0.9331673383712769, + "learning_rate": 0.00020563299142897285, + "loss": 4.764, + "step": 48800 + }, + { + "epoch": 0.06858262917642183, + "grad_norm": 0.9273619055747986, + "learning_rate": 0.00020567514402135728, + "loss": 4.8304, + "step": 48810 + }, + { + "epoch": 0.0685966801145854, + "grad_norm": 0.962047815322876, + "learning_rate": 0.00020571729661374172, + "loss": 4.7868, + "step": 48820 + }, + { + "epoch": 0.06861073105274898, + "grad_norm": 0.9674386382102966, + "learning_rate": 0.00020575944920612615, + "loss": 4.8832, + "step": 48830 + }, + { + "epoch": 0.06862478199091256, + "grad_norm": 0.925841212272644, + "learning_rate": 0.0002058016017985106, + "loss": 4.8394, + "step": 48840 + }, + { + "epoch": 0.06863883292907613, + "grad_norm": 0.9452566504478455, + "learning_rate": 0.00020584375439089504, + "loss": 4.8387, + "step": 48850 + }, + { + "epoch": 0.06865288386723971, + "grad_norm": 1.0907858610153198, + "learning_rate": 0.00020588590698327947, + "loss": 4.873, + "step": 48860 + }, + { + "epoch": 0.06866693480540328, + "grad_norm": 0.9650761485099792, + "learning_rate": 0.00020592805957566387, + "loss": 4.9087, + "step": 48870 + }, + { + "epoch": 0.06868098574356686, + "grad_norm": 0.8910475373268127, + "learning_rate": 0.0002059702121680483, + "loss": 4.8725, + "step": 48880 + }, + { + "epoch": 0.06869503668173044, + "grad_norm": 0.9114874005317688, + "learning_rate": 0.00020601236476043273, + "loss": 4.9084, + "step": 48890 + }, + { + "epoch": 0.06870908761989403, + "grad_norm": 0.9370610117912292, + "learning_rate": 0.0002060545173528172, + "loss": 4.8315, + "step": 48900 + }, + { + "epoch": 0.0687231385580576, + "grad_norm": 0.9323891401290894, + "learning_rate": 0.00020609666994520162, + "loss": 4.7051, + "step": 48910 + }, + { + "epoch": 0.06873718949622118, + "grad_norm": 1.0027904510498047, + "learning_rate": 0.00020613882253758605, + "loss": 4.87, + "step": 48920 + }, + { + "epoch": 
0.06875124043438476, + "grad_norm": 0.9343486428260803, + "learning_rate": 0.00020618097512997046, + "loss": 4.8427, + "step": 48930 + }, + { + "epoch": 0.06876529137254833, + "grad_norm": 0.9870249032974243, + "learning_rate": 0.0002062231277223549, + "loss": 4.8214, + "step": 48940 + }, + { + "epoch": 0.06877934231071191, + "grad_norm": 0.9833707213401794, + "learning_rate": 0.00020626528031473932, + "loss": 4.9394, + "step": 48950 + }, + { + "epoch": 0.06879339324887548, + "grad_norm": 0.9330481290817261, + "learning_rate": 0.00020630743290712378, + "loss": 4.8596, + "step": 48960 + }, + { + "epoch": 0.06880744418703906, + "grad_norm": 0.9530215859413147, + "learning_rate": 0.0002063495854995082, + "loss": 4.892, + "step": 48970 + }, + { + "epoch": 0.06882149512520264, + "grad_norm": 0.9184706807136536, + "learning_rate": 0.00020639173809189264, + "loss": 4.9649, + "step": 48980 + }, + { + "epoch": 0.06883554606336621, + "grad_norm": 0.9311974048614502, + "learning_rate": 0.00020643389068427707, + "loss": 4.9251, + "step": 48990 + }, + { + "epoch": 0.0688495970015298, + "grad_norm": 0.9431549310684204, + "learning_rate": 0.00020647604327666148, + "loss": 4.8339, + "step": 49000 + }, + { + "epoch": 0.06886364793969338, + "grad_norm": 1.0781751871109009, + "learning_rate": 0.00020651819586904593, + "loss": 4.915, + "step": 49010 + }, + { + "epoch": 0.06887769887785695, + "grad_norm": 0.9329824447631836, + "learning_rate": 0.00020656034846143037, + "loss": 4.8891, + "step": 49020 + }, + { + "epoch": 0.06889174981602053, + "grad_norm": 0.9006100296974182, + "learning_rate": 0.0002066025010538148, + "loss": 4.9299, + "step": 49030 + }, + { + "epoch": 0.06890580075418411, + "grad_norm": 0.8924393057823181, + "learning_rate": 0.00020664465364619923, + "loss": 4.7221, + "step": 49040 + }, + { + "epoch": 0.06891985169234768, + "grad_norm": 0.9394928216934204, + "learning_rate": 0.00020668680623858366, + "loss": 4.8564, + "step": 49050 + }, + { + "epoch": 
0.06893390263051126, + "grad_norm": 0.9694632291793823, + "learning_rate": 0.00020672895883096806, + "loss": 4.8552, + "step": 49060 + }, + { + "epoch": 0.06894795356867484, + "grad_norm": 0.9108564853668213, + "learning_rate": 0.00020677111142335252, + "loss": 4.8766, + "step": 49070 + }, + { + "epoch": 0.06896200450683841, + "grad_norm": 0.9487257599830627, + "learning_rate": 0.00020681326401573695, + "loss": 4.7678, + "step": 49080 + }, + { + "epoch": 0.06897605544500199, + "grad_norm": 0.9499151706695557, + "learning_rate": 0.00020685541660812138, + "loss": 4.822, + "step": 49090 + }, + { + "epoch": 0.06899010638316556, + "grad_norm": 1.0061317682266235, + "learning_rate": 0.00020689756920050582, + "loss": 4.8902, + "step": 49100 + }, + { + "epoch": 0.06900415732132915, + "grad_norm": 0.9726933240890503, + "learning_rate": 0.00020693972179289025, + "loss": 4.8946, + "step": 49110 + }, + { + "epoch": 0.06901820825949273, + "grad_norm": 0.8891914486885071, + "learning_rate": 0.00020698187438527465, + "loss": 4.9583, + "step": 49120 + }, + { + "epoch": 0.0690322591976563, + "grad_norm": 0.9320970773696899, + "learning_rate": 0.0002070240269776591, + "loss": 4.825, + "step": 49130 + }, + { + "epoch": 0.06904631013581988, + "grad_norm": 0.8775174021720886, + "learning_rate": 0.00020706617957004354, + "loss": 4.8636, + "step": 49140 + }, + { + "epoch": 0.06906036107398346, + "grad_norm": 0.9681304097175598, + "learning_rate": 0.00020710411690318955, + "loss": 4.8397, + "step": 49150 + }, + { + "epoch": 0.06907441201214704, + "grad_norm": 0.9166766405105591, + "learning_rate": 0.00020714626949557395, + "loss": 4.8822, + "step": 49160 + }, + { + "epoch": 0.06908846295031061, + "grad_norm": 0.9451488852500916, + "learning_rate": 0.00020718842208795838, + "loss": 4.9481, + "step": 49170 + }, + { + "epoch": 0.06910251388847419, + "grad_norm": 1.025514006614685, + "learning_rate": 0.00020723057468034282, + "loss": 4.7886, + "step": 49180 + }, + { + "epoch": 
0.06911656482663776, + "grad_norm": 1.0833834409713745, + "learning_rate": 0.00020727272727272725, + "loss": 4.8875, + "step": 49190 + }, + { + "epoch": 0.06913061576480134, + "grad_norm": 0.9185973405838013, + "learning_rate": 0.0002073148798651117, + "loss": 4.9811, + "step": 49200 + }, + { + "epoch": 0.06914466670296493, + "grad_norm": 0.9156882762908936, + "learning_rate": 0.00020735703245749614, + "loss": 4.8988, + "step": 49210 + }, + { + "epoch": 0.0691587176411285, + "grad_norm": 0.9855953454971313, + "learning_rate": 0.00020739918504988054, + "loss": 4.7664, + "step": 49220 + }, + { + "epoch": 0.06917276857929208, + "grad_norm": 0.9379180073738098, + "learning_rate": 0.00020744133764226497, + "loss": 4.7998, + "step": 49230 + }, + { + "epoch": 0.06918681951745566, + "grad_norm": 0.901465892791748, + "learning_rate": 0.0002074834902346494, + "loss": 4.9366, + "step": 49240 + }, + { + "epoch": 0.06920087045561923, + "grad_norm": 0.9280766844749451, + "learning_rate": 0.00020752564282703383, + "loss": 4.9037, + "step": 49250 + }, + { + "epoch": 0.06921492139378281, + "grad_norm": 1.0117437839508057, + "learning_rate": 0.0002075677954194183, + "loss": 4.8711, + "step": 49260 + }, + { + "epoch": 0.06922897233194639, + "grad_norm": 0.936126708984375, + "learning_rate": 0.00020760994801180272, + "loss": 4.8852, + "step": 49270 + }, + { + "epoch": 0.06924302327010996, + "grad_norm": 0.907888650894165, + "learning_rate": 0.00020765210060418715, + "loss": 4.8348, + "step": 49280 + }, + { + "epoch": 0.06925707420827354, + "grad_norm": 0.9258394241333008, + "learning_rate": 0.00020769425319657156, + "loss": 4.912, + "step": 49290 + }, + { + "epoch": 0.06927112514643712, + "grad_norm": 0.9378911852836609, + "learning_rate": 0.000207736405788956, + "loss": 4.9453, + "step": 49300 + }, + { + "epoch": 0.0692851760846007, + "grad_norm": 0.9445339441299438, + "learning_rate": 0.00020777855838134042, + "loss": 4.7425, + "step": 49310 + }, + { + "epoch": 0.06929922702276428, 
+ "grad_norm": 0.9408323168754578, + "learning_rate": 0.00020782071097372488, + "loss": 4.8162, + "step": 49320 + }, + { + "epoch": 0.06931327796092786, + "grad_norm": 0.9732528328895569, + "learning_rate": 0.0002078628635661093, + "loss": 4.8715, + "step": 49330 + }, + { + "epoch": 0.06932732889909143, + "grad_norm": 0.9590655565261841, + "learning_rate": 0.00020790501615849374, + "loss": 4.7933, + "step": 49340 + }, + { + "epoch": 0.06934137983725501, + "grad_norm": 0.9126473069190979, + "learning_rate": 0.00020794716875087815, + "loss": 4.8097, + "step": 49350 + }, + { + "epoch": 0.06935543077541859, + "grad_norm": 0.9477401375770569, + "learning_rate": 0.00020798932134326258, + "loss": 4.9911, + "step": 49360 + }, + { + "epoch": 0.06936948171358216, + "grad_norm": 0.897299587726593, + "learning_rate": 0.000208031473935647, + "loss": 4.9375, + "step": 49370 + }, + { + "epoch": 0.06938353265174574, + "grad_norm": 0.951884388923645, + "learning_rate": 0.00020807362652803147, + "loss": 4.78, + "step": 49380 + }, + { + "epoch": 0.06939758358990931, + "grad_norm": 0.9042193293571472, + "learning_rate": 0.0002081157791204159, + "loss": 4.8414, + "step": 49390 + }, + { + "epoch": 0.06941163452807289, + "grad_norm": 0.9469078183174133, + "learning_rate": 0.00020815793171280033, + "loss": 4.9658, + "step": 49400 + }, + { + "epoch": 0.06942568546623647, + "grad_norm": 0.9061809778213501, + "learning_rate": 0.00020820008430518476, + "loss": 4.7509, + "step": 49410 + }, + { + "epoch": 0.06943973640440006, + "grad_norm": 0.9353413581848145, + "learning_rate": 0.00020824223689756916, + "loss": 4.8421, + "step": 49420 + }, + { + "epoch": 0.06945378734256363, + "grad_norm": 0.9837194085121155, + "learning_rate": 0.0002082843894899536, + "loss": 4.8281, + "step": 49430 + }, + { + "epoch": 0.06946783828072721, + "grad_norm": 0.9294975996017456, + "learning_rate": 0.00020832654208233805, + "loss": 4.7543, + "step": 49440 + }, + { + "epoch": 0.06948188921889079, + "grad_norm": 
0.9024710059165955, + "learning_rate": 0.00020836869467472248, + "loss": 4.8502, + "step": 49450 + }, + { + "epoch": 0.06949594015705436, + "grad_norm": 0.9258474707603455, + "learning_rate": 0.00020841084726710692, + "loss": 4.9781, + "step": 49460 + }, + { + "epoch": 0.06950999109521794, + "grad_norm": 1.0224697589874268, + "learning_rate": 0.00020845299985949135, + "loss": 4.766, + "step": 49470 + }, + { + "epoch": 0.06952404203338151, + "grad_norm": 0.9513781070709229, + "learning_rate": 0.00020849515245187575, + "loss": 4.879, + "step": 49480 + }, + { + "epoch": 0.06953809297154509, + "grad_norm": 0.9261512160301208, + "learning_rate": 0.00020853730504426018, + "loss": 4.7467, + "step": 49490 + }, + { + "epoch": 0.06955214390970867, + "grad_norm": 0.9599387049674988, + "learning_rate": 0.00020857945763664464, + "loss": 4.9011, + "step": 49500 + }, + { + "epoch": 0.06956619484787224, + "grad_norm": 0.908785343170166, + "learning_rate": 0.00020862161022902907, + "loss": 4.8237, + "step": 49510 + }, + { + "epoch": 0.06958024578603583, + "grad_norm": 0.8983234167098999, + "learning_rate": 0.0002086637628214135, + "loss": 4.9282, + "step": 49520 + }, + { + "epoch": 0.06959429672419941, + "grad_norm": 0.8948048949241638, + "learning_rate": 0.00020870591541379793, + "loss": 4.9866, + "step": 49530 + }, + { + "epoch": 0.06960834766236298, + "grad_norm": 0.8868510127067566, + "learning_rate": 0.00020874806800618237, + "loss": 4.8474, + "step": 49540 + }, + { + "epoch": 0.06962239860052656, + "grad_norm": 1.2011126279830933, + "learning_rate": 0.00020879022059856677, + "loss": 4.9062, + "step": 49550 + }, + { + "epoch": 0.06963644953869014, + "grad_norm": 0.9646673202514648, + "learning_rate": 0.00020883237319095123, + "loss": 4.7145, + "step": 49560 + }, + { + "epoch": 0.06965050047685371, + "grad_norm": 0.9322654604911804, + "learning_rate": 0.00020887452578333566, + "loss": 4.9656, + "step": 49570 + }, + { + "epoch": 0.06966455141501729, + "grad_norm": 
0.9516728520393372, + "learning_rate": 0.0002089166783757201, + "loss": 4.8092, + "step": 49580 + }, + { + "epoch": 0.06967860235318087, + "grad_norm": 0.9191582202911377, + "learning_rate": 0.00020895883096810452, + "loss": 4.8314, + "step": 49590 + }, + { + "epoch": 0.06969265329134444, + "grad_norm": 0.9767720699310303, + "learning_rate": 0.00020900098356048898, + "loss": 4.9208, + "step": 49600 + }, + { + "epoch": 0.06970670422950802, + "grad_norm": 0.9100189208984375, + "learning_rate": 0.00020904313615287336, + "loss": 4.8623, + "step": 49610 + }, + { + "epoch": 0.06972075516767161, + "grad_norm": 0.9272692203521729, + "learning_rate": 0.00020908528874525781, + "loss": 4.7475, + "step": 49620 + }, + { + "epoch": 0.06973480610583518, + "grad_norm": 0.9022219181060791, + "learning_rate": 0.00020912744133764225, + "loss": 4.8501, + "step": 49630 + }, + { + "epoch": 0.06974885704399876, + "grad_norm": 0.9093071222305298, + "learning_rate": 0.00020916959393002668, + "loss": 4.9728, + "step": 49640 + }, + { + "epoch": 0.06976290798216234, + "grad_norm": 0.9411072134971619, + "learning_rate": 0.0002092117465224111, + "loss": 4.7739, + "step": 49650 + }, + { + "epoch": 0.06977695892032591, + "grad_norm": 0.9116350412368774, + "learning_rate": 0.00020925389911479557, + "loss": 4.8922, + "step": 49660 + }, + { + "epoch": 0.06979100985848949, + "grad_norm": 0.9344385266304016, + "learning_rate": 0.00020929605170717997, + "loss": 4.8928, + "step": 49670 + }, + { + "epoch": 0.06980506079665307, + "grad_norm": 0.9087563157081604, + "learning_rate": 0.0002093382042995644, + "loss": 4.818, + "step": 49680 + }, + { + "epoch": 0.06981911173481664, + "grad_norm": 0.8912515044212341, + "learning_rate": 0.00020938035689194883, + "loss": 4.8745, + "step": 49690 + }, + { + "epoch": 0.06983316267298022, + "grad_norm": 0.9308001399040222, + "learning_rate": 0.00020942250948433326, + "loss": 4.8263, + "step": 49700 + }, + { + "epoch": 0.0698472136111438, + "grad_norm": 
0.910260021686554, + "learning_rate": 0.0002094646620767177, + "loss": 4.753, + "step": 49710 + }, + { + "epoch": 0.06986126454930737, + "grad_norm": 0.9524611830711365, + "learning_rate": 0.00020950681466910215, + "loss": 4.8124, + "step": 49720 + }, + { + "epoch": 0.06987531548747096, + "grad_norm": 0.8876780271530151, + "learning_rate": 0.00020954896726148658, + "loss": 4.934, + "step": 49730 + }, + { + "epoch": 0.06988936642563454, + "grad_norm": 0.9967767596244812, + "learning_rate": 0.000209591119853871, + "loss": 4.8565, + "step": 49740 + }, + { + "epoch": 0.06990341736379811, + "grad_norm": 0.9247635006904602, + "learning_rate": 0.000209629057187017, + "loss": 4.8617, + "step": 49750 + }, + { + "epoch": 0.06991746830196169, + "grad_norm": 0.886182427406311, + "learning_rate": 0.00020967120977940143, + "loss": 4.9107, + "step": 49760 + }, + { + "epoch": 0.06993151924012526, + "grad_norm": 0.963778555393219, + "learning_rate": 0.00020971336237178583, + "loss": 4.8586, + "step": 49770 + }, + { + "epoch": 0.06994557017828884, + "grad_norm": 0.9098008275032043, + "learning_rate": 0.00020975551496417026, + "loss": 4.8441, + "step": 49780 + }, + { + "epoch": 0.06995962111645242, + "grad_norm": 0.9119469523429871, + "learning_rate": 0.0002097976675565547, + "loss": 4.8072, + "step": 49790 + }, + { + "epoch": 0.06997367205461599, + "grad_norm": 0.920755922794342, + "learning_rate": 0.00020983982014893915, + "loss": 4.8332, + "step": 49800 + }, + { + "epoch": 0.06998772299277957, + "grad_norm": 0.9371573328971863, + "learning_rate": 0.00020988197274132359, + "loss": 4.7674, + "step": 49810 + }, + { + "epoch": 0.07000177393094315, + "grad_norm": 0.926888108253479, + "learning_rate": 0.00020992412533370802, + "loss": 4.7776, + "step": 49820 + }, + { + "epoch": 0.07001582486910674, + "grad_norm": 0.9252434968948364, + "learning_rate": 0.00020996627792609245, + "loss": 4.8627, + "step": 49830 + }, + { + "epoch": 0.07002987580727031, + "grad_norm": 0.8703329563140869, + 
"learning_rate": 0.00021000843051847685, + "loss": 5.0105, + "step": 49840 + }, + { + "epoch": 0.07004392674543389, + "grad_norm": 0.9439218640327454, + "learning_rate": 0.00021005058311086128, + "loss": 4.7864, + "step": 49850 + }, + { + "epoch": 0.07005797768359746, + "grad_norm": 0.9055263996124268, + "learning_rate": 0.00021009273570324574, + "loss": 4.8109, + "step": 49860 + }, + { + "epoch": 0.07007202862176104, + "grad_norm": 0.9342459440231323, + "learning_rate": 0.00021013488829563017, + "loss": 4.8661, + "step": 49870 + }, + { + "epoch": 0.07008607955992462, + "grad_norm": 0.910137414932251, + "learning_rate": 0.0002101770408880146, + "loss": 4.8083, + "step": 49880 + }, + { + "epoch": 0.07010013049808819, + "grad_norm": 0.9078051447868347, + "learning_rate": 0.00021021919348039903, + "loss": 5.0483, + "step": 49890 + }, + { + "epoch": 0.07011418143625177, + "grad_norm": 0.9174194931983948, + "learning_rate": 0.00021026134607278344, + "loss": 4.9562, + "step": 49900 + }, + { + "epoch": 0.07012823237441534, + "grad_norm": 0.9276567101478577, + "learning_rate": 0.00021030349866516787, + "loss": 4.8575, + "step": 49910 + }, + { + "epoch": 0.07014228331257892, + "grad_norm": 0.9092664122581482, + "learning_rate": 0.00021034565125755233, + "loss": 4.673, + "step": 49920 + }, + { + "epoch": 0.07015633425074251, + "grad_norm": 0.9359709620475769, + "learning_rate": 0.00021038780384993676, + "loss": 4.8121, + "step": 49930 + }, + { + "epoch": 0.07017038518890609, + "grad_norm": 0.9264384508132935, + "learning_rate": 0.0002104299564423212, + "loss": 4.825, + "step": 49940 + }, + { + "epoch": 0.07018443612706966, + "grad_norm": 0.881436824798584, + "learning_rate": 0.00021047210903470562, + "loss": 4.7949, + "step": 49950 + }, + { + "epoch": 0.07019848706523324, + "grad_norm": 0.9295757412910461, + "learning_rate": 0.00021051426162709005, + "loss": 4.8378, + "step": 49960 + }, + { + "epoch": 0.07021253800339682, + "grad_norm": 0.9156924486160278, + "learning_rate": 
0.00021055641421947446, + "loss": 4.9212, + "step": 49970 + }, + { + "epoch": 0.07022658894156039, + "grad_norm": 0.9250329732894897, + "learning_rate": 0.00021059856681185892, + "loss": 4.8159, + "step": 49980 + }, + { + "epoch": 0.07024063987972397, + "grad_norm": 0.88548743724823, + "learning_rate": 0.00021064071940424335, + "loss": 4.858, + "step": 49990 + }, + { + "epoch": 0.07025469081788754, + "grad_norm": 0.9146063327789307, + "learning_rate": 0.00021068287199662778, + "loss": 4.8865, + "step": 50000 + }, + { + "epoch": 0.07026874175605112, + "grad_norm": 0.9075859785079956, + "learning_rate": 0.0002107250245890122, + "loss": 4.8735, + "step": 50010 + }, + { + "epoch": 0.0702827926942147, + "grad_norm": 0.9209066033363342, + "learning_rate": 0.00021076717718139664, + "loss": 4.8269, + "step": 50020 + }, + { + "epoch": 0.07029684363237827, + "grad_norm": 0.9381097555160522, + "learning_rate": 0.00021080932977378104, + "loss": 4.9601, + "step": 50030 + }, + { + "epoch": 0.07031089457054186, + "grad_norm": 0.8956311941146851, + "learning_rate": 0.0002108514823661655, + "loss": 4.9086, + "step": 50040 + }, + { + "epoch": 0.07032494550870544, + "grad_norm": 0.9148889780044556, + "learning_rate": 0.00021089363495854993, + "loss": 4.7701, + "step": 50050 + }, + { + "epoch": 0.07033899644686901, + "grad_norm": 0.9117963314056396, + "learning_rate": 0.00021093578755093436, + "loss": 4.9762, + "step": 50060 + }, + { + "epoch": 0.07035304738503259, + "grad_norm": 0.9547762870788574, + "learning_rate": 0.0002109779401433188, + "loss": 4.8947, + "step": 50070 + }, + { + "epoch": 0.07036709832319617, + "grad_norm": 0.9054858088493347, + "learning_rate": 0.00021102009273570323, + "loss": 4.8054, + "step": 50080 + }, + { + "epoch": 0.07038114926135974, + "grad_norm": 0.8751288652420044, + "learning_rate": 0.00021106224532808763, + "loss": 4.8845, + "step": 50090 + }, + { + "epoch": 0.07039520019952332, + "grad_norm": 0.9098647236824036, + "learning_rate": 
0.0002111043979204721, + "loss": 4.7945, + "step": 50100 + }, + { + "epoch": 0.0704092511376869, + "grad_norm": 0.9443636536598206, + "learning_rate": 0.00021114655051285652, + "loss": 4.8985, + "step": 50110 + }, + { + "epoch": 0.07042330207585047, + "grad_norm": 0.9238690137863159, + "learning_rate": 0.00021118870310524095, + "loss": 4.78, + "step": 50120 + }, + { + "epoch": 0.07043735301401405, + "grad_norm": 0.9531887173652649, + "learning_rate": 0.00021123085569762538, + "loss": 4.7975, + "step": 50130 + }, + { + "epoch": 0.07045140395217764, + "grad_norm": 0.9312260150909424, + "learning_rate": 0.00021127300829000981, + "loss": 4.8473, + "step": 50140 + }, + { + "epoch": 0.07046545489034121, + "grad_norm": 0.9800652861595154, + "learning_rate": 0.00021131516088239427, + "loss": 4.8799, + "step": 50150 + }, + { + "epoch": 0.07047950582850479, + "grad_norm": 0.8918007016181946, + "learning_rate": 0.00021135731347477868, + "loss": 4.967, + "step": 50160 + }, + { + "epoch": 0.07049355676666837, + "grad_norm": 0.9277655482292175, + "learning_rate": 0.0002113994660671631, + "loss": 4.7375, + "step": 50170 + }, + { + "epoch": 0.07050760770483194, + "grad_norm": 0.9381228089332581, + "learning_rate": 0.00021144161865954754, + "loss": 4.8449, + "step": 50180 + }, + { + "epoch": 0.07052165864299552, + "grad_norm": 0.9662410616874695, + "learning_rate": 0.00021148377125193197, + "loss": 4.8809, + "step": 50190 + }, + { + "epoch": 0.0705357095811591, + "grad_norm": 0.9184268116950989, + "learning_rate": 0.0002115259238443164, + "loss": 4.9633, + "step": 50200 + }, + { + "epoch": 0.07054976051932267, + "grad_norm": 0.9069191217422485, + "learning_rate": 0.00021156807643670086, + "loss": 4.8526, + "step": 50210 + }, + { + "epoch": 0.07056381145748625, + "grad_norm": 0.908988356590271, + "learning_rate": 0.00021161022902908526, + "loss": 4.7715, + "step": 50220 + }, + { + "epoch": 0.07057786239564982, + "grad_norm": 0.9809160828590393, + "learning_rate": 
0.0002116523816214697, + "loss": 4.9594, + "step": 50230 + }, + { + "epoch": 0.07059191333381341, + "grad_norm": 0.9043667912483215, + "learning_rate": 0.00021169453421385413, + "loss": 4.8035, + "step": 50240 + }, + { + "epoch": 0.07060596427197699, + "grad_norm": 1.00430166721344, + "learning_rate": 0.00021173668680623856, + "loss": 4.8976, + "step": 50250 + }, + { + "epoch": 0.07062001521014057, + "grad_norm": 0.9381590485572815, + "learning_rate": 0.00021177883939862301, + "loss": 4.8531, + "step": 50260 + }, + { + "epoch": 0.07063406614830414, + "grad_norm": 0.8797604441642761, + "learning_rate": 0.00021182099199100745, + "loss": 4.8615, + "step": 50270 + }, + { + "epoch": 0.07064811708646772, + "grad_norm": 0.8880553245544434, + "learning_rate": 0.00021186314458339188, + "loss": 4.8398, + "step": 50280 + }, + { + "epoch": 0.0706621680246313, + "grad_norm": 0.9292892217636108, + "learning_rate": 0.00021190529717577628, + "loss": 4.9145, + "step": 50290 + }, + { + "epoch": 0.07067621896279487, + "grad_norm": 0.9727832078933716, + "learning_rate": 0.0002119474497681607, + "loss": 4.7891, + "step": 50300 + }, + { + "epoch": 0.07069026990095845, + "grad_norm": 0.9402398467063904, + "learning_rate": 0.00021198960236054514, + "loss": 4.8532, + "step": 50310 + }, + { + "epoch": 0.07070432083912202, + "grad_norm": 0.9445501565933228, + "learning_rate": 0.0002120317549529296, + "loss": 4.8188, + "step": 50320 + }, + { + "epoch": 0.0707183717772856, + "grad_norm": 0.9223546385765076, + "learning_rate": 0.00021207390754531403, + "loss": 4.9161, + "step": 50330 + }, + { + "epoch": 0.07073242271544918, + "grad_norm": 0.9386568069458008, + "learning_rate": 0.00021211606013769846, + "loss": 4.9415, + "step": 50340 + }, + { + "epoch": 0.07074647365361277, + "grad_norm": 0.9313529133796692, + "learning_rate": 0.00021215821273008287, + "loss": 4.8792, + "step": 50350 + }, + { + "epoch": 0.07076052459177634, + "grad_norm": 0.9341327548027039, + "learning_rate": 
0.0002122003653224673, + "loss": 4.8473, + "step": 50360 + }, + { + "epoch": 0.07077457552993992, + "grad_norm": 0.8974583745002747, + "learning_rate": 0.00021224251791485173, + "loss": 4.919, + "step": 50370 + }, + { + "epoch": 0.0707886264681035, + "grad_norm": 0.9477220177650452, + "learning_rate": 0.0002122846705072362, + "loss": 4.9454, + "step": 50380 + }, + { + "epoch": 0.07080267740626707, + "grad_norm": 0.9393336176872253, + "learning_rate": 0.00021232682309962062, + "loss": 5.068, + "step": 50390 + }, + { + "epoch": 0.07081672834443065, + "grad_norm": 0.9357321858406067, + "learning_rate": 0.00021236897569200505, + "loss": 4.794, + "step": 50400 + }, + { + "epoch": 0.07083077928259422, + "grad_norm": 0.9102789163589478, + "learning_rate": 0.00021241112828438948, + "loss": 4.9233, + "step": 50410 + }, + { + "epoch": 0.0708448302207578, + "grad_norm": 0.9070646166801453, + "learning_rate": 0.00021245328087677389, + "loss": 4.9097, + "step": 50420 + }, + { + "epoch": 0.07085888115892137, + "grad_norm": 0.8896940350532532, + "learning_rate": 0.00021249543346915832, + "loss": 4.9443, + "step": 50430 + }, + { + "epoch": 0.07087293209708495, + "grad_norm": 0.9510523080825806, + "learning_rate": 0.00021253758606154278, + "loss": 4.7866, + "step": 50440 + }, + { + "epoch": 0.07088698303524854, + "grad_norm": 0.9242532253265381, + "learning_rate": 0.0002125797386539272, + "loss": 4.8039, + "step": 50450 + }, + { + "epoch": 0.07090103397341212, + "grad_norm": 0.9695941209793091, + "learning_rate": 0.00021262189124631164, + "loss": 4.859, + "step": 50460 + }, + { + "epoch": 0.0709150849115757, + "grad_norm": 0.8837727904319763, + "learning_rate": 0.00021266404383869607, + "loss": 4.9475, + "step": 50470 + }, + { + "epoch": 0.07092913584973927, + "grad_norm": 0.9693347811698914, + "learning_rate": 0.00021270619643108047, + "loss": 4.8531, + "step": 50480 + }, + { + "epoch": 0.07094318678790285, + "grad_norm": 0.9365562796592712, + "learning_rate": 
0.0002127483490234649, + "loss": 4.8912, + "step": 50490 + }, + { + "epoch": 0.07095723772606642, + "grad_norm": 0.8954295516014099, + "learning_rate": 0.00021279050161584936, + "loss": 4.8255, + "step": 50500 + }, + { + "epoch": 0.07097128866423, + "grad_norm": 0.9566641449928284, + "learning_rate": 0.0002128326542082338, + "loss": 4.9466, + "step": 50510 + }, + { + "epoch": 0.07098533960239357, + "grad_norm": 0.8944385647773743, + "learning_rate": 0.00021287480680061823, + "loss": 4.9177, + "step": 50520 + }, + { + "epoch": 0.07099939054055715, + "grad_norm": 0.8918579816818237, + "learning_rate": 0.00021291695939300266, + "loss": 4.7941, + "step": 50530 + }, + { + "epoch": 0.07101344147872073, + "grad_norm": 0.8937610387802124, + "learning_rate": 0.0002129591119853871, + "loss": 4.8411, + "step": 50540 + }, + { + "epoch": 0.07102749241688432, + "grad_norm": 0.9060569405555725, + "learning_rate": 0.0002130012645777715, + "loss": 4.7955, + "step": 50550 + }, + { + "epoch": 0.07104154335504789, + "grad_norm": 0.9308838248252869, + "learning_rate": 0.00021304341717015595, + "loss": 4.8368, + "step": 50560 + }, + { + "epoch": 0.07105559429321147, + "grad_norm": 0.8938664793968201, + "learning_rate": 0.00021308556976254038, + "loss": 4.8399, + "step": 50570 + }, + { + "epoch": 0.07106964523137504, + "grad_norm": 0.8650916814804077, + "learning_rate": 0.0002131277223549248, + "loss": 4.9222, + "step": 50580 + }, + { + "epoch": 0.07108369616953862, + "grad_norm": 0.9728739857673645, + "learning_rate": 0.00021316987494730924, + "loss": 4.7345, + "step": 50590 + }, + { + "epoch": 0.0710977471077022, + "grad_norm": 0.912001371383667, + "learning_rate": 0.00021321202753969367, + "loss": 4.88, + "step": 50600 + }, + { + "epoch": 0.07111179804586577, + "grad_norm": 0.9506505131721497, + "learning_rate": 0.00021325418013207808, + "loss": 4.8297, + "step": 50610 + }, + { + "epoch": 0.07112584898402935, + "grad_norm": 0.9795990586280823, + "learning_rate": 
0.00021329633272446254, + "loss": 4.8647, + "step": 50620 + }, + { + "epoch": 0.07113989992219293, + "grad_norm": 0.9694528579711914, + "learning_rate": 0.00021333848531684697, + "loss": 4.8235, + "step": 50630 + }, + { + "epoch": 0.0711539508603565, + "grad_norm": 0.9510214924812317, + "learning_rate": 0.0002133806379092314, + "loss": 4.9937, + "step": 50640 + }, + { + "epoch": 0.07116800179852008, + "grad_norm": 0.9015481472015381, + "learning_rate": 0.00021342279050161583, + "loss": 4.8967, + "step": 50650 + }, + { + "epoch": 0.07118205273668367, + "grad_norm": 0.8986161947250366, + "learning_rate": 0.00021346494309400026, + "loss": 4.9995, + "step": 50660 + }, + { + "epoch": 0.07119610367484724, + "grad_norm": 0.8803780674934387, + "learning_rate": 0.00021350709568638472, + "loss": 4.8779, + "step": 50670 + }, + { + "epoch": 0.07121015461301082, + "grad_norm": 0.893484354019165, + "learning_rate": 0.00021354924827876912, + "loss": 5.0183, + "step": 50680 + }, + { + "epoch": 0.0712242055511744, + "grad_norm": 0.9707047343254089, + "learning_rate": 0.00021359140087115355, + "loss": 4.8389, + "step": 50690 + }, + { + "epoch": 0.07123825648933797, + "grad_norm": 0.8995065689086914, + "learning_rate": 0.00021363355346353799, + "loss": 4.7205, + "step": 50700 + }, + { + "epoch": 0.07125230742750155, + "grad_norm": 1.0342682600021362, + "learning_rate": 0.00021367570605592242, + "loss": 4.7475, + "step": 50710 + }, + { + "epoch": 0.07126635836566513, + "grad_norm": 0.9065954685211182, + "learning_rate": 0.00021371785864830685, + "loss": 4.8589, + "step": 50720 + }, + { + "epoch": 0.0712804093038287, + "grad_norm": 0.9274217486381531, + "learning_rate": 0.0002137600112406913, + "loss": 4.9241, + "step": 50730 + }, + { + "epoch": 0.07129446024199228, + "grad_norm": 0.9372475743293762, + "learning_rate": 0.0002138021638330757, + "loss": 4.7475, + "step": 50740 + }, + { + "epoch": 0.07130851118015585, + "grad_norm": 0.8814763426780701, + "learning_rate": 
0.00021384431642546014, + "loss": 4.9215, + "step": 50750 + }, + { + "epoch": 0.07132256211831944, + "grad_norm": 0.9730595946311951, + "learning_rate": 0.00021388646901784457, + "loss": 4.9178, + "step": 50760 + }, + { + "epoch": 0.07133661305648302, + "grad_norm": 0.9046145081520081, + "learning_rate": 0.000213928621610229, + "loss": 4.7846, + "step": 50770 + }, + { + "epoch": 0.0713506639946466, + "grad_norm": 0.8996490240097046, + "learning_rate": 0.00021397077420261344, + "loss": 4.8804, + "step": 50780 + }, + { + "epoch": 0.07136471493281017, + "grad_norm": 0.9104477763175964, + "learning_rate": 0.0002140129267949979, + "loss": 5.0551, + "step": 50790 + }, + { + "epoch": 0.07137876587097375, + "grad_norm": 0.9244865775108337, + "learning_rate": 0.0002140550793873823, + "loss": 4.9711, + "step": 50800 + }, + { + "epoch": 0.07139281680913732, + "grad_norm": 0.9118911623954773, + "learning_rate": 0.00021409723197976673, + "loss": 4.9435, + "step": 50810 + }, + { + "epoch": 0.0714068677473009, + "grad_norm": 0.8913031816482544, + "learning_rate": 0.00021413938457215116, + "loss": 4.7714, + "step": 50820 + }, + { + "epoch": 0.07142091868546448, + "grad_norm": 0.9230485558509827, + "learning_rate": 0.0002141815371645356, + "loss": 4.8122, + "step": 50830 + }, + { + "epoch": 0.07143496962362805, + "grad_norm": 0.9266636967658997, + "learning_rate": 0.00021422368975692005, + "loss": 4.8106, + "step": 50840 + }, + { + "epoch": 0.07144902056179163, + "grad_norm": 0.9038143157958984, + "learning_rate": 0.00021426584234930448, + "loss": 4.8459, + "step": 50850 + }, + { + "epoch": 0.07146307149995522, + "grad_norm": 0.9343311190605164, + "learning_rate": 0.0002143079949416889, + "loss": 4.8616, + "step": 50860 + }, + { + "epoch": 0.0714771224381188, + "grad_norm": 0.8671455979347229, + "learning_rate": 0.00021435014753407332, + "loss": 4.8417, + "step": 50870 + }, + { + "epoch": 0.07149117337628237, + "grad_norm": 0.9161444306373596, + "learning_rate": 
0.00021439230012645775, + "loss": 4.7486, + "step": 50880 + }, + { + "epoch": 0.07150522431444595, + "grad_norm": 0.8902353644371033, + "learning_rate": 0.00021443445271884218, + "loss": 4.9153, + "step": 50890 + }, + { + "epoch": 0.07151927525260952, + "grad_norm": 0.9097145199775696, + "learning_rate": 0.00021447660531122664, + "loss": 4.8412, + "step": 50900 + }, + { + "epoch": 0.0715333261907731, + "grad_norm": 0.9682596921920776, + "learning_rate": 0.00021451875790361107, + "loss": 4.7094, + "step": 50910 + }, + { + "epoch": 0.07154737712893668, + "grad_norm": 0.9312680959701538, + "learning_rate": 0.0002145609104959955, + "loss": 4.8971, + "step": 50920 + }, + { + "epoch": 0.07156142806710025, + "grad_norm": 0.8974460959434509, + "learning_rate": 0.0002146030630883799, + "loss": 4.7942, + "step": 50930 + }, + { + "epoch": 0.07157547900526383, + "grad_norm": 0.9163692593574524, + "learning_rate": 0.00021464521568076433, + "loss": 4.8219, + "step": 50940 + }, + { + "epoch": 0.0715895299434274, + "grad_norm": 0.9357579946517944, + "learning_rate": 0.00021468736827314877, + "loss": 4.865, + "step": 50950 + }, + { + "epoch": 0.071603580881591, + "grad_norm": 0.9122851490974426, + "learning_rate": 0.00021472952086553322, + "loss": 4.9114, + "step": 50960 + }, + { + "epoch": 0.07161763181975457, + "grad_norm": 0.8831797242164612, + "learning_rate": 0.00021477167345791765, + "loss": 4.8798, + "step": 50970 + }, + { + "epoch": 0.07163168275791815, + "grad_norm": 0.9927180409431458, + "learning_rate": 0.00021481382605030209, + "loss": 4.7515, + "step": 50980 + }, + { + "epoch": 0.07164573369608172, + "grad_norm": 0.9619544148445129, + "learning_rate": 0.00021485597864268652, + "loss": 4.7979, + "step": 50990 + }, + { + "epoch": 0.0716597846342453, + "grad_norm": 0.92621248960495, + "learning_rate": 0.00021489813123507092, + "loss": 4.8098, + "step": 51000 + }, + { + "epoch": 0.07167383557240888, + "grad_norm": 0.9142638444900513, + "learning_rate": 
0.00021494028382745535, + "loss": 4.8371, + "step": 51010 + }, + { + "epoch": 0.07168788651057245, + "grad_norm": 0.9437655210494995, + "learning_rate": 0.0002149824364198398, + "loss": 4.8507, + "step": 51020 + }, + { + "epoch": 0.07170193744873603, + "grad_norm": 0.9370262026786804, + "learning_rate": 0.00021502458901222424, + "loss": 4.8096, + "step": 51030 + }, + { + "epoch": 0.0717159883868996, + "grad_norm": 0.8940436244010925, + "learning_rate": 0.00021506674160460867, + "loss": 4.7918, + "step": 51040 + }, + { + "epoch": 0.07173003932506318, + "grad_norm": 0.9073984026908875, + "learning_rate": 0.0002151088941969931, + "loss": 4.8127, + "step": 51050 + }, + { + "epoch": 0.07174409026322676, + "grad_norm": 0.9012202024459839, + "learning_rate": 0.0002151510467893775, + "loss": 4.7126, + "step": 51060 + }, + { + "epoch": 0.07175814120139035, + "grad_norm": 0.8825278282165527, + "learning_rate": 0.00021519319938176194, + "loss": 4.8573, + "step": 51070 + }, + { + "epoch": 0.07177219213955392, + "grad_norm": 0.9400650262832642, + "learning_rate": 0.0002152353519741464, + "loss": 4.8074, + "step": 51080 + }, + { + "epoch": 0.0717862430777175, + "grad_norm": 0.8937569260597229, + "learning_rate": 0.00021527750456653083, + "loss": 4.7746, + "step": 51090 + }, + { + "epoch": 0.07180029401588107, + "grad_norm": 0.9248559474945068, + "learning_rate": 0.00021531965715891526, + "loss": 4.889, + "step": 51100 + }, + { + "epoch": 0.07181434495404465, + "grad_norm": 0.8893438577651978, + "learning_rate": 0.0002153618097512997, + "loss": 4.7216, + "step": 51110 + }, + { + "epoch": 0.07182839589220823, + "grad_norm": 0.9111591577529907, + "learning_rate": 0.00021540396234368412, + "loss": 4.8645, + "step": 51120 + }, + { + "epoch": 0.0718424468303718, + "grad_norm": 1.1774877309799194, + "learning_rate": 0.00021544611493606853, + "loss": 4.8673, + "step": 51130 + }, + { + "epoch": 0.07185649776853538, + "grad_norm": 0.8930349946022034, + "learning_rate": 
0.00021548826752845298, + "loss": 4.9113, + "step": 51140 + }, + { + "epoch": 0.07187054870669896, + "grad_norm": 0.9083362221717834, + "learning_rate": 0.00021553042012083742, + "loss": 4.9166, + "step": 51150 + }, + { + "epoch": 0.07188459964486253, + "grad_norm": 0.9521656632423401, + "learning_rate": 0.00021557257271322185, + "loss": 4.8467, + "step": 51160 + }, + { + "epoch": 0.07189865058302612, + "grad_norm": 0.8885960578918457, + "learning_rate": 0.00021561472530560628, + "loss": 4.9143, + "step": 51170 + }, + { + "epoch": 0.0719127015211897, + "grad_norm": 0.9528680443763733, + "learning_rate": 0.0002156568778979907, + "loss": 4.936, + "step": 51180 + }, + { + "epoch": 0.07192675245935327, + "grad_norm": 0.9279723763465881, + "learning_rate": 0.0002156990304903751, + "loss": 4.9369, + "step": 51190 + }, + { + "epoch": 0.07194080339751685, + "grad_norm": 0.8902644515037537, + "learning_rate": 0.00021574118308275957, + "loss": 4.9081, + "step": 51200 + }, + { + "epoch": 0.07195485433568043, + "grad_norm": 0.8919149041175842, + "learning_rate": 0.000215783335675144, + "loss": 4.7189, + "step": 51210 + }, + { + "epoch": 0.071968905273844, + "grad_norm": 0.8641528487205505, + "learning_rate": 0.00021582548826752843, + "loss": 4.9004, + "step": 51220 + }, + { + "epoch": 0.07198295621200758, + "grad_norm": 0.9722802639007568, + "learning_rate": 0.00021586764085991287, + "loss": 4.8593, + "step": 51230 + }, + { + "epoch": 0.07199700715017116, + "grad_norm": 0.9114046692848206, + "learning_rate": 0.0002159097934522973, + "loss": 4.8033, + "step": 51240 + }, + { + "epoch": 0.07201105808833473, + "grad_norm": 0.9105363488197327, + "learning_rate": 0.00021595194604468175, + "loss": 4.929, + "step": 51250 + }, + { + "epoch": 0.07202510902649831, + "grad_norm": 0.9299967288970947, + "learning_rate": 0.00021599409863706616, + "loss": 4.8742, + "step": 51260 + }, + { + "epoch": 0.0720391599646619, + "grad_norm": 0.905307948589325, + "learning_rate": 0.0002160362512294506, 
+ "loss": 4.9712, + "step": 51270 + }, + { + "epoch": 0.07205321090282547, + "grad_norm": 0.8946942090988159, + "learning_rate": 0.00021607840382183502, + "loss": 4.8168, + "step": 51280 + }, + { + "epoch": 0.07206726184098905, + "grad_norm": 0.8690630197525024, + "learning_rate": 0.00021612055641421945, + "loss": 4.9673, + "step": 51290 + }, + { + "epoch": 0.07208131277915263, + "grad_norm": 0.9104512333869934, + "learning_rate": 0.00021616270900660388, + "loss": 4.8752, + "step": 51300 + }, + { + "epoch": 0.0720953637173162, + "grad_norm": 0.9049569368362427, + "learning_rate": 0.00021620486159898834, + "loss": 4.8247, + "step": 51310 + }, + { + "epoch": 0.07210941465547978, + "grad_norm": 0.9620559215545654, + "learning_rate": 0.00021624701419137275, + "loss": 4.8299, + "step": 51320 + }, + { + "epoch": 0.07212346559364335, + "grad_norm": 0.925479531288147, + "learning_rate": 0.00021628916678375718, + "loss": 4.9163, + "step": 51330 + }, + { + "epoch": 0.07213751653180693, + "grad_norm": 0.9238383769989014, + "learning_rate": 0.0002163313193761416, + "loss": 4.7718, + "step": 51340 + }, + { + "epoch": 0.0721515674699705, + "grad_norm": 0.9379529356956482, + "learning_rate": 0.00021637347196852604, + "loss": 4.8578, + "step": 51350 + }, + { + "epoch": 0.07216561840813408, + "grad_norm": 0.9757288694381714, + "learning_rate": 0.00021641562456091047, + "loss": 4.7786, + "step": 51360 + }, + { + "epoch": 0.07217966934629766, + "grad_norm": 0.9008384943008423, + "learning_rate": 0.00021645777715329493, + "loss": 4.7713, + "step": 51370 + }, + { + "epoch": 0.07219372028446125, + "grad_norm": 0.9475241899490356, + "learning_rate": 0.00021649992974567936, + "loss": 4.7283, + "step": 51380 + }, + { + "epoch": 0.07220777122262483, + "grad_norm": 0.8943010568618774, + "learning_rate": 0.00021654208233806376, + "loss": 4.9106, + "step": 51390 + }, + { + "epoch": 0.0722218221607884, + "grad_norm": 0.9169595241546631, + "learning_rate": 0.0002165842349304482, + "loss": 
4.8022, + "step": 51400 + }, + { + "epoch": 0.07223587309895198, + "grad_norm": 0.9076150059700012, + "learning_rate": 0.00021662638752283263, + "loss": 4.8884, + "step": 51410 + }, + { + "epoch": 0.07224992403711555, + "grad_norm": 0.8879525661468506, + "learning_rate": 0.00021666854011521708, + "loss": 4.8106, + "step": 51420 + }, + { + "epoch": 0.07226397497527913, + "grad_norm": 0.9336953163146973, + "learning_rate": 0.00021671069270760152, + "loss": 4.8502, + "step": 51430 + }, + { + "epoch": 0.0722780259134427, + "grad_norm": 0.9571032524108887, + "learning_rate": 0.00021675284529998595, + "loss": 4.8924, + "step": 51440 + }, + { + "epoch": 0.07229207685160628, + "grad_norm": 0.8778558969497681, + "learning_rate": 0.00021679499789237035, + "loss": 4.7895, + "step": 51450 + }, + { + "epoch": 0.07230612778976986, + "grad_norm": 0.9267377853393555, + "learning_rate": 0.00021683715048475478, + "loss": 4.8497, + "step": 51460 + }, + { + "epoch": 0.07232017872793343, + "grad_norm": 0.8599752187728882, + "learning_rate": 0.0002168793030771392, + "loss": 4.7963, + "step": 51470 + }, + { + "epoch": 0.07233422966609702, + "grad_norm": 0.9141520261764526, + "learning_rate": 0.00021692145566952367, + "loss": 4.889, + "step": 51480 + }, + { + "epoch": 0.0723482806042606, + "grad_norm": 0.9507299065589905, + "learning_rate": 0.0002169636082619081, + "loss": 4.9028, + "step": 51490 + }, + { + "epoch": 0.07236233154242418, + "grad_norm": 0.9014940857887268, + "learning_rate": 0.00021700576085429253, + "loss": 4.8435, + "step": 51500 + }, + { + "epoch": 0.07237638248058775, + "grad_norm": 0.9214157462120056, + "learning_rate": 0.00021704791344667694, + "loss": 4.8943, + "step": 51510 + }, + { + "epoch": 0.07239043341875133, + "grad_norm": 0.8904531598091125, + "learning_rate": 0.00021709006603906137, + "loss": 4.788, + "step": 51520 + }, + { + "epoch": 0.0724044843569149, + "grad_norm": 0.8769239187240601, + "learning_rate": 0.0002171322186314458, + "loss": 4.8987, + "step": 
51530 + }, + { + "epoch": 0.07241853529507848, + "grad_norm": 0.8964927196502686, + "learning_rate": 0.00021717437122383026, + "loss": 4.7834, + "step": 51540 + }, + { + "epoch": 0.07243258623324206, + "grad_norm": 0.9342873096466064, + "learning_rate": 0.0002172165238162147, + "loss": 4.8952, + "step": 51550 + }, + { + "epoch": 0.07244663717140563, + "grad_norm": 0.9120386838912964, + "learning_rate": 0.00021725867640859912, + "loss": 4.927, + "step": 51560 + }, + { + "epoch": 0.07246068810956921, + "grad_norm": 0.8827990889549255, + "learning_rate": 0.00021730082900098355, + "loss": 4.9028, + "step": 51570 + }, + { + "epoch": 0.0724747390477328, + "grad_norm": 0.8882848620414734, + "learning_rate": 0.00021734298159336796, + "loss": 4.8229, + "step": 51580 + }, + { + "epoch": 0.07248878998589638, + "grad_norm": 0.9415622353553772, + "learning_rate": 0.0002173851341857524, + "loss": 4.851, + "step": 51590 + }, + { + "epoch": 0.07250284092405995, + "grad_norm": 0.936032772064209, + "learning_rate": 0.00021742728677813685, + "loss": 4.7324, + "step": 51600 + }, + { + "epoch": 0.07251689186222353, + "grad_norm": 0.9080641865730286, + "learning_rate": 0.00021746943937052128, + "loss": 4.8478, + "step": 51610 + }, + { + "epoch": 0.0725309428003871, + "grad_norm": 0.8853969573974609, + "learning_rate": 0.0002175115919629057, + "loss": 4.8263, + "step": 51620 + }, + { + "epoch": 0.07254499373855068, + "grad_norm": 0.9154071807861328, + "learning_rate": 0.00021755374455529014, + "loss": 4.9162, + "step": 51630 + }, + { + "epoch": 0.07255904467671426, + "grad_norm": 0.894340991973877, + "learning_rate": 0.00021759589714767454, + "loss": 4.8624, + "step": 51640 + }, + { + "epoch": 0.07257309561487783, + "grad_norm": 0.9048279523849487, + "learning_rate": 0.00021763804974005897, + "loss": 4.9428, + "step": 51650 + }, + { + "epoch": 0.07258714655304141, + "grad_norm": 0.8889136910438538, + "learning_rate": 0.00021768020233244343, + "loss": 4.8428, + "step": 51660 + }, + { + 
"epoch": 0.07260119749120499, + "grad_norm": 1.0818347930908203, + "learning_rate": 0.00021772235492482786, + "loss": 4.9329, + "step": 51670 + }, + { + "epoch": 0.07261524842936856, + "grad_norm": 0.9882572889328003, + "learning_rate": 0.0002177645075172123, + "loss": 4.8468, + "step": 51680 + }, + { + "epoch": 0.07262929936753215, + "grad_norm": 0.8729943633079529, + "learning_rate": 0.00021780666010959673, + "loss": 5.0247, + "step": 51690 + }, + { + "epoch": 0.07264335030569573, + "grad_norm": 0.8866937160491943, + "learning_rate": 0.00021784881270198116, + "loss": 4.7614, + "step": 51700 + }, + { + "epoch": 0.0726574012438593, + "grad_norm": 0.9050795435905457, + "learning_rate": 0.00021789096529436556, + "loss": 4.9629, + "step": 51710 + }, + { + "epoch": 0.07267145218202288, + "grad_norm": 0.9173886775970459, + "learning_rate": 0.00021793311788675002, + "loss": 4.7605, + "step": 51720 + }, + { + "epoch": 0.07268550312018646, + "grad_norm": 0.8916087746620178, + "learning_rate": 0.00021797527047913445, + "loss": 4.8462, + "step": 51730 + }, + { + "epoch": 0.07269955405835003, + "grad_norm": 0.8887311220169067, + "learning_rate": 0.00021801742307151888, + "loss": 4.9066, + "step": 51740 + }, + { + "epoch": 0.07271360499651361, + "grad_norm": 0.871232271194458, + "learning_rate": 0.0002180595756639033, + "loss": 4.9546, + "step": 51750 + }, + { + "epoch": 0.07272765593467719, + "grad_norm": 0.8541275858879089, + "learning_rate": 0.00021810172825628774, + "loss": 4.8691, + "step": 51760 + }, + { + "epoch": 0.07274170687284076, + "grad_norm": 0.8941138982772827, + "learning_rate": 0.00021814388084867215, + "loss": 4.8881, + "step": 51770 + }, + { + "epoch": 0.07275575781100434, + "grad_norm": 0.9860665202140808, + "learning_rate": 0.0002181860334410566, + "loss": 4.7749, + "step": 51780 + }, + { + "epoch": 0.07276980874916793, + "grad_norm": 0.8882085680961609, + "learning_rate": 0.00021822818603344104, + "loss": 4.8333, + "step": 51790 + }, + { + "epoch": 
0.0727838596873315, + "grad_norm": 0.9299302697181702, + "learning_rate": 0.00021827033862582547, + "loss": 4.8661, + "step": 51800 + }, + { + "epoch": 0.07279791062549508, + "grad_norm": 0.8716930747032166, + "learning_rate": 0.0002183124912182099, + "loss": 4.8118, + "step": 51810 + }, + { + "epoch": 0.07281196156365866, + "grad_norm": 0.9178931713104248, + "learning_rate": 0.00021835464381059433, + "loss": 4.867, + "step": 51820 + }, + { + "epoch": 0.07282601250182223, + "grad_norm": 0.9466065764427185, + "learning_rate": 0.0002183967964029788, + "loss": 4.8518, + "step": 51830 + }, + { + "epoch": 0.07284006343998581, + "grad_norm": 0.9389024972915649, + "learning_rate": 0.0002184389489953632, + "loss": 4.7342, + "step": 51840 + }, + { + "epoch": 0.07285411437814938, + "grad_norm": 0.9677282571792603, + "learning_rate": 0.00021848110158774762, + "loss": 4.8186, + "step": 51850 + }, + { + "epoch": 0.07286816531631296, + "grad_norm": 0.9958227872848511, + "learning_rate": 0.00021852325418013206, + "loss": 4.9577, + "step": 51860 + }, + { + "epoch": 0.07288221625447654, + "grad_norm": 0.8936029076576233, + "learning_rate": 0.0002185654067725165, + "loss": 4.9129, + "step": 51870 + }, + { + "epoch": 0.07289626719264011, + "grad_norm": 0.9088529944419861, + "learning_rate": 0.00021860755936490092, + "loss": 4.8115, + "step": 51880 + }, + { + "epoch": 0.0729103181308037, + "grad_norm": 0.9063525795936584, + "learning_rate": 0.00021864971195728538, + "loss": 4.9166, + "step": 51890 + }, + { + "epoch": 0.07292436906896728, + "grad_norm": 0.9061357378959656, + "learning_rate": 0.00021869186454966978, + "loss": 4.8306, + "step": 51900 + }, + { + "epoch": 0.07293842000713086, + "grad_norm": 0.9639674425125122, + "learning_rate": 0.0002187340171420542, + "loss": 4.8523, + "step": 51910 + }, + { + "epoch": 0.07295247094529443, + "grad_norm": 0.9000791311264038, + "learning_rate": 0.00021877616973443864, + "loss": 4.7562, + "step": 51920 + }, + { + "epoch": 
0.07296652188345801, + "grad_norm": 0.9156561493873596, + "learning_rate": 0.00021881832232682307, + "loss": 5.0079, + "step": 51930 + }, + { + "epoch": 0.07298057282162158, + "grad_norm": 0.9364044666290283, + "learning_rate": 0.0002188604749192075, + "loss": 4.9567, + "step": 51940 + }, + { + "epoch": 0.07299462375978516, + "grad_norm": 0.8726660013198853, + "learning_rate": 0.00021890262751159196, + "loss": 4.9127, + "step": 51950 + }, + { + "epoch": 0.07300867469794874, + "grad_norm": 0.9417910575866699, + "learning_rate": 0.0002189447801039764, + "loss": 4.8765, + "step": 51960 + }, + { + "epoch": 0.07302272563611231, + "grad_norm": 0.8728827238082886, + "learning_rate": 0.0002189869326963608, + "loss": 4.7915, + "step": 51970 + }, + { + "epoch": 0.07303677657427589, + "grad_norm": 0.9182146787643433, + "learning_rate": 0.00021902908528874523, + "loss": 4.9794, + "step": 51980 + }, + { + "epoch": 0.07305082751243946, + "grad_norm": 0.8773520588874817, + "learning_rate": 0.00021907123788112966, + "loss": 4.863, + "step": 51990 + }, + { + "epoch": 0.07306487845060305, + "grad_norm": 0.9333748817443848, + "learning_rate": 0.0002191133904735141, + "loss": 4.7767, + "step": 52000 + }, + { + "epoch": 0.07307892938876663, + "grad_norm": 0.9405140280723572, + "learning_rate": 0.00021915554306589855, + "loss": 4.8579, + "step": 52010 + }, + { + "epoch": 0.07309298032693021, + "grad_norm": 0.8939602971076965, + "learning_rate": 0.00021919769565828298, + "loss": 4.9197, + "step": 52020 + }, + { + "epoch": 0.07310703126509378, + "grad_norm": 1.0516551733016968, + "learning_rate": 0.00021923984825066739, + "loss": 4.7807, + "step": 52030 + }, + { + "epoch": 0.07312108220325736, + "grad_norm": 0.9042669534683228, + "learning_rate": 0.00021928200084305182, + "loss": 4.8805, + "step": 52040 + }, + { + "epoch": 0.07313513314142094, + "grad_norm": 0.8705670833587646, + "learning_rate": 0.00021932415343543625, + "loss": 4.8592, + "step": 52050 + }, + { + "epoch": 
0.07314918407958451, + "grad_norm": 0.9007607698440552, + "learning_rate": 0.0002193663060278207, + "loss": 4.9571, + "step": 52060 + }, + { + "epoch": 0.07316323501774809, + "grad_norm": 0.9101800322532654, + "learning_rate": 0.00021940845862020514, + "loss": 4.8341, + "step": 52070 + }, + { + "epoch": 0.07317728595591166, + "grad_norm": 0.8472366333007812, + "learning_rate": 0.00021945061121258957, + "loss": 5.002, + "step": 52080 + }, + { + "epoch": 0.07319133689407524, + "grad_norm": 0.8725908994674683, + "learning_rate": 0.00021949276380497397, + "loss": 4.8479, + "step": 52090 + }, + { + "epoch": 0.07320538783223883, + "grad_norm": 0.8859214782714844, + "learning_rate": 0.0002195349163973584, + "loss": 4.8376, + "step": 52100 + }, + { + "epoch": 0.0732194387704024, + "grad_norm": 0.9071083068847656, + "learning_rate": 0.00021957706898974283, + "loss": 4.7523, + "step": 52110 + }, + { + "epoch": 0.07323348970856598, + "grad_norm": 0.8878831267356873, + "learning_rate": 0.0002196192215821273, + "loss": 4.937, + "step": 52120 + }, + { + "epoch": 0.07324754064672956, + "grad_norm": 0.924068033695221, + "learning_rate": 0.00021966137417451172, + "loss": 4.6908, + "step": 52130 + }, + { + "epoch": 0.07326159158489313, + "grad_norm": 0.9516233205795288, + "learning_rate": 0.00021970352676689616, + "loss": 4.8146, + "step": 52140 + }, + { + "epoch": 0.07327564252305671, + "grad_norm": 0.8874931931495667, + "learning_rate": 0.0002197456793592806, + "loss": 4.799, + "step": 52150 + }, + { + "epoch": 0.07328969346122029, + "grad_norm": 0.8723164200782776, + "learning_rate": 0.000219787831951665, + "loss": 4.6445, + "step": 52160 + }, + { + "epoch": 0.07330374439938386, + "grad_norm": 0.8742164373397827, + "learning_rate": 0.00021982998454404942, + "loss": 5.0286, + "step": 52170 + }, + { + "epoch": 0.07331779533754744, + "grad_norm": 1.0268549919128418, + "learning_rate": 0.00021987213713643388, + "loss": 4.7709, + "step": 52180 + }, + { + "epoch": 0.07333184627571102, 
+ "grad_norm": 0.8598684668540955, + "learning_rate": 0.0002199142897288183, + "loss": 4.9128, + "step": 52190 + }, + { + "epoch": 0.0733458972138746, + "grad_norm": 0.9072014689445496, + "learning_rate": 0.00021995644232120274, + "loss": 4.8774, + "step": 52200 + }, + { + "epoch": 0.07335994815203818, + "grad_norm": 0.9579162001609802, + "learning_rate": 0.00021999859491358717, + "loss": 4.8299, + "step": 52210 + }, + { + "epoch": 0.07337399909020176, + "grad_norm": 0.9101182222366333, + "learning_rate": 0.00022004074750597158, + "loss": 4.7819, + "step": 52220 + }, + { + "epoch": 0.07338805002836533, + "grad_norm": 0.8982588052749634, + "learning_rate": 0.000220082900098356, + "loss": 4.8396, + "step": 52230 + }, + { + "epoch": 0.07340210096652891, + "grad_norm": 0.9071044921875, + "learning_rate": 0.00022012505269074047, + "loss": 4.8595, + "step": 52240 + }, + { + "epoch": 0.07341615190469249, + "grad_norm": 0.952265739440918, + "learning_rate": 0.0002201672052831249, + "loss": 4.8164, + "step": 52250 + }, + { + "epoch": 0.07343020284285606, + "grad_norm": 0.9162461161613464, + "learning_rate": 0.00022020935787550933, + "loss": 4.8448, + "step": 52260 + }, + { + "epoch": 0.07344425378101964, + "grad_norm": 0.8878207802772522, + "learning_rate": 0.00022025151046789376, + "loss": 4.8139, + "step": 52270 + }, + { + "epoch": 0.07345830471918322, + "grad_norm": 0.8891446590423584, + "learning_rate": 0.0002202936630602782, + "loss": 4.9729, + "step": 52280 + }, + { + "epoch": 0.07347235565734679, + "grad_norm": 0.9611853957176208, + "learning_rate": 0.0002203358156526626, + "loss": 4.9082, + "step": 52290 + }, + { + "epoch": 0.07348640659551037, + "grad_norm": 0.8403993248939514, + "learning_rate": 0.00022037796824504705, + "loss": 4.904, + "step": 52300 + }, + { + "epoch": 0.07350045753367396, + "grad_norm": 0.873360276222229, + "learning_rate": 0.00022042012083743149, + "loss": 4.9462, + "step": 52310 + }, + { + "epoch": 0.07351450847183753, + "grad_norm": 
0.8688433766365051, + "learning_rate": 0.00022046227342981592, + "loss": 5.0245, + "step": 52320 + }, + { + "epoch": 0.07352855941000111, + "grad_norm": 0.899304986000061, + "learning_rate": 0.00022050442602220035, + "loss": 4.7631, + "step": 52330 + }, + { + "epoch": 0.07354261034816469, + "grad_norm": 0.8732203841209412, + "learning_rate": 0.00022054657861458478, + "loss": 4.9718, + "step": 52340 + }, + { + "epoch": 0.07355666128632826, + "grad_norm": 0.8838686943054199, + "learning_rate": 0.00022058873120696918, + "loss": 4.9258, + "step": 52350 + }, + { + "epoch": 0.07357071222449184, + "grad_norm": 0.9056248068809509, + "learning_rate": 0.00022063088379935364, + "loss": 4.789, + "step": 52360 + }, + { + "epoch": 0.07358476316265541, + "grad_norm": 0.8867168426513672, + "learning_rate": 0.00022067303639173807, + "loss": 4.8421, + "step": 52370 + }, + { + "epoch": 0.07359881410081899, + "grad_norm": 1.1361435651779175, + "learning_rate": 0.0002207151889841225, + "loss": 4.8528, + "step": 52380 + }, + { + "epoch": 0.07361286503898257, + "grad_norm": 0.897068977355957, + "learning_rate": 0.00022075734157650693, + "loss": 4.7161, + "step": 52390 + }, + { + "epoch": 0.07362691597714614, + "grad_norm": 0.9281494617462158, + "learning_rate": 0.00022079949416889137, + "loss": 4.7663, + "step": 52400 + }, + { + "epoch": 0.07364096691530973, + "grad_norm": 0.9386702179908752, + "learning_rate": 0.00022084164676127582, + "loss": 4.8699, + "step": 52410 + }, + { + "epoch": 0.07365501785347331, + "grad_norm": 0.8745991587638855, + "learning_rate": 0.00022088379935366023, + "loss": 4.7858, + "step": 52420 + }, + { + "epoch": 0.07366906879163689, + "grad_norm": 0.9027533531188965, + "learning_rate": 0.00022092595194604466, + "loss": 4.8626, + "step": 52430 + }, + { + "epoch": 0.07368311972980046, + "grad_norm": 0.8828858733177185, + "learning_rate": 0.0002209681045384291, + "loss": 4.6567, + "step": 52440 + }, + { + "epoch": 0.07369717066796404, + "grad_norm": 
0.8733296990394592, + "learning_rate": 0.00022101025713081352, + "loss": 4.8987, + "step": 52450 + }, + { + "epoch": 0.07371122160612761, + "grad_norm": 0.8760149478912354, + "learning_rate": 0.00022105240972319795, + "loss": 4.8488, + "step": 52460 + }, + { + "epoch": 0.07372527254429119, + "grad_norm": 0.9006288647651672, + "learning_rate": 0.0002210945623155824, + "loss": 4.8752, + "step": 52470 + }, + { + "epoch": 0.07373932348245477, + "grad_norm": 0.8950282335281372, + "learning_rate": 0.00022113671490796682, + "loss": 4.9373, + "step": 52480 + }, + { + "epoch": 0.07375337442061834, + "grad_norm": 0.8998293876647949, + "learning_rate": 0.00022117886750035125, + "loss": 4.8634, + "step": 52490 + }, + { + "epoch": 0.07376742535878192, + "grad_norm": 0.8735572099685669, + "learning_rate": 0.00022122102009273568, + "loss": 4.869, + "step": 52500 + }, + { + "epoch": 0.07378147629694551, + "grad_norm": 0.9324967265129089, + "learning_rate": 0.0002212631726851201, + "loss": 4.7894, + "step": 52510 + }, + { + "epoch": 0.07379552723510908, + "grad_norm": 0.8685128092765808, + "learning_rate": 0.00022130532527750454, + "loss": 4.8566, + "step": 52520 + }, + { + "epoch": 0.07380957817327266, + "grad_norm": 0.916084885597229, + "learning_rate": 0.000221347477869889, + "loss": 4.6707, + "step": 52530 + }, + { + "epoch": 0.07382362911143624, + "grad_norm": 0.8901728987693787, + "learning_rate": 0.00022138963046227343, + "loss": 4.8058, + "step": 52540 + }, + { + "epoch": 0.07383768004959981, + "grad_norm": 0.871108889579773, + "learning_rate": 0.00022143178305465783, + "loss": 4.9019, + "step": 52550 + }, + { + "epoch": 0.07385173098776339, + "grad_norm": 0.8851538300514221, + "learning_rate": 0.00022147393564704226, + "loss": 4.8358, + "step": 52560 + }, + { + "epoch": 0.07386578192592697, + "grad_norm": 0.9221569895744324, + "learning_rate": 0.0002215160882394267, + "loss": 4.9551, + "step": 52570 + }, + { + "epoch": 0.07387983286409054, + "grad_norm": 
0.9061564803123474, + "learning_rate": 0.00022155824083181113, + "loss": 4.6869, + "step": 52580 + }, + { + "epoch": 0.07389388380225412, + "grad_norm": 0.8911418914794922, + "learning_rate": 0.00022160039342419559, + "loss": 4.8726, + "step": 52590 + }, + { + "epoch": 0.0739079347404177, + "grad_norm": 0.9063213467597961, + "learning_rate": 0.00022164254601658002, + "loss": 4.7479, + "step": 52600 + }, + { + "epoch": 0.07392198567858127, + "grad_norm": 0.8305318355560303, + "learning_rate": 0.00022168469860896442, + "loss": 4.9001, + "step": 52610 + }, + { + "epoch": 0.07393603661674486, + "grad_norm": 0.871112585067749, + "learning_rate": 0.00022172685120134885, + "loss": 4.8549, + "step": 52620 + }, + { + "epoch": 0.07395008755490844, + "grad_norm": 0.8946985006332397, + "learning_rate": 0.00022176900379373328, + "loss": 4.8635, + "step": 52630 + }, + { + "epoch": 0.07396413849307201, + "grad_norm": 0.9011940360069275, + "learning_rate": 0.00022181115638611774, + "loss": 4.8687, + "step": 52640 + }, + { + "epoch": 0.07397818943123559, + "grad_norm": 0.8744351267814636, + "learning_rate": 0.00022185330897850217, + "loss": 4.9532, + "step": 52650 + }, + { + "epoch": 0.07399224036939916, + "grad_norm": 0.886094331741333, + "learning_rate": 0.0002218954615708866, + "loss": 4.9025, + "step": 52660 + }, + { + "epoch": 0.07400629130756274, + "grad_norm": 0.8723426461219788, + "learning_rate": 0.00022193761416327103, + "loss": 4.8615, + "step": 52670 + }, + { + "epoch": 0.07402034224572632, + "grad_norm": 0.8950220942497253, + "learning_rate": 0.00022197976675565544, + "loss": 4.825, + "step": 52680 + }, + { + "epoch": 0.0740343931838899, + "grad_norm": 0.8471283316612244, + "learning_rate": 0.00022202191934803987, + "loss": 4.7089, + "step": 52690 + }, + { + "epoch": 0.07404844412205347, + "grad_norm": 0.8990341424942017, + "learning_rate": 0.00022206407194042433, + "loss": 4.9334, + "step": 52700 + }, + { + "epoch": 0.07406249506021705, + "grad_norm": 
0.9074385762214661, + "learning_rate": 0.00022210622453280876, + "loss": 4.8409, + "step": 52710 + }, + { + "epoch": 0.07407654599838064, + "grad_norm": 0.8630465865135193, + "learning_rate": 0.0002221483771251932, + "loss": 4.8109, + "step": 52720 + }, + { + "epoch": 0.07409059693654421, + "grad_norm": 0.9616298675537109, + "learning_rate": 0.00022219052971757762, + "loss": 4.8484, + "step": 52730 + }, + { + "epoch": 0.07410464787470779, + "grad_norm": 0.9482759237289429, + "learning_rate": 0.00022223268230996203, + "loss": 4.886, + "step": 52740 + }, + { + "epoch": 0.07411869881287136, + "grad_norm": 0.8867536187171936, + "learning_rate": 0.00022227483490234646, + "loss": 4.8773, + "step": 52750 + }, + { + "epoch": 0.07413274975103494, + "grad_norm": 0.9389952421188354, + "learning_rate": 0.00022231698749473091, + "loss": 4.8021, + "step": 52760 + }, + { + "epoch": 0.07414680068919852, + "grad_norm": 0.8756046891212463, + "learning_rate": 0.00022235914008711535, + "loss": 4.708, + "step": 52770 + }, + { + "epoch": 0.07416085162736209, + "grad_norm": 0.8724471926689148, + "learning_rate": 0.00022240129267949978, + "loss": 4.794, + "step": 52780 + }, + { + "epoch": 0.07417490256552567, + "grad_norm": 0.895776629447937, + "learning_rate": 0.0002224434452718842, + "loss": 4.8679, + "step": 52790 + }, + { + "epoch": 0.07418895350368925, + "grad_norm": 0.9033660292625427, + "learning_rate": 0.0002224855978642686, + "loss": 4.6832, + "step": 52800 + }, + { + "epoch": 0.07420300444185282, + "grad_norm": 0.9038524627685547, + "learning_rate": 0.00022252775045665304, + "loss": 4.7711, + "step": 52810 + }, + { + "epoch": 0.07421705538001641, + "grad_norm": 0.8903810381889343, + "learning_rate": 0.0002225699030490375, + "loss": 4.8756, + "step": 52820 + }, + { + "epoch": 0.07423110631817999, + "grad_norm": 0.9286917448043823, + "learning_rate": 0.00022261205564142193, + "loss": 4.8495, + "step": 52830 + }, + { + "epoch": 0.07424515725634356, + "grad_norm": 
0.8560378551483154, + "learning_rate": 0.00022265420823380636, + "loss": 4.8827, + "step": 52840 + }, + { + "epoch": 0.07425920819450714, + "grad_norm": 1.5267508029937744, + "learning_rate": 0.0002226963608261908, + "loss": 4.7531, + "step": 52850 + }, + { + "epoch": 0.07427325913267072, + "grad_norm": 0.8825668096542358, + "learning_rate": 0.00022273851341857523, + "loss": 4.9219, + "step": 52860 + }, + { + "epoch": 0.07428731007083429, + "grad_norm": 0.8723922967910767, + "learning_rate": 0.00022278066601095963, + "loss": 4.7371, + "step": 52870 + }, + { + "epoch": 0.07430136100899787, + "grad_norm": 0.8615224361419678, + "learning_rate": 0.0002228228186033441, + "loss": 4.7961, + "step": 52880 + }, + { + "epoch": 0.07431541194716144, + "grad_norm": 0.8599326610565186, + "learning_rate": 0.00022286497119572852, + "loss": 4.9175, + "step": 52890 + }, + { + "epoch": 0.07432946288532502, + "grad_norm": 0.9164808988571167, + "learning_rate": 0.00022290712378811295, + "loss": 4.7722, + "step": 52900 + }, + { + "epoch": 0.0743435138234886, + "grad_norm": 0.8620802760124207, + "learning_rate": 0.00022294927638049738, + "loss": 4.8847, + "step": 52910 + }, + { + "epoch": 0.07435756476165217, + "grad_norm": 0.88958340883255, + "learning_rate": 0.0002229914289728818, + "loss": 4.8875, + "step": 52920 + }, + { + "epoch": 0.07437161569981576, + "grad_norm": 0.894120454788208, + "learning_rate": 0.00022303358156526622, + "loss": 4.8375, + "step": 52930 + }, + { + "epoch": 0.07438566663797934, + "grad_norm": 0.8956342935562134, + "learning_rate": 0.00022307573415765068, + "loss": 4.9163, + "step": 52940 + }, + { + "epoch": 0.07439971757614292, + "grad_norm": 0.8793826699256897, + "learning_rate": 0.0002231178867500351, + "loss": 4.8802, + "step": 52950 + }, + { + "epoch": 0.07441376851430649, + "grad_norm": 0.8765672445297241, + "learning_rate": 0.00022316003934241954, + "loss": 4.8051, + "step": 52960 + }, + { + "epoch": 0.07442781945247007, + "grad_norm": 
0.9064963459968567, + "learning_rate": 0.00022320219193480397, + "loss": 4.804, + "step": 52970 + }, + { + "epoch": 0.07444187039063364, + "grad_norm": 0.879390299320221, + "learning_rate": 0.0002232443445271884, + "loss": 5.0118, + "step": 52980 + }, + { + "epoch": 0.07445592132879722, + "grad_norm": 0.8609398603439331, + "learning_rate": 0.00022328649711957286, + "loss": 4.8256, + "step": 52990 + }, + { + "epoch": 0.0744699722669608, + "grad_norm": 0.9296293258666992, + "learning_rate": 0.00022332864971195726, + "loss": 4.8331, + "step": 53000 + }, + { + "epoch": 0.07448402320512437, + "grad_norm": 1.0869863033294678, + "learning_rate": 0.0002233708023043417, + "loss": 4.7803, + "step": 53010 + }, + { + "epoch": 0.07449807414328795, + "grad_norm": 0.9066222906112671, + "learning_rate": 0.00022341295489672613, + "loss": 4.868, + "step": 53020 + }, + { + "epoch": 0.07451212508145154, + "grad_norm": 0.889058530330658, + "learning_rate": 0.00022345510748911056, + "loss": 4.7838, + "step": 53030 + }, + { + "epoch": 0.07452617601961511, + "grad_norm": 1.2822096347808838, + "learning_rate": 0.000223497260081495, + "loss": 4.7383, + "step": 53040 + }, + { + "epoch": 0.07454022695777869, + "grad_norm": 0.8843401074409485, + "learning_rate": 0.00022353941267387945, + "loss": 4.8443, + "step": 53050 + }, + { + "epoch": 0.07455427789594227, + "grad_norm": 0.9470616579055786, + "learning_rate": 0.00022358156526626385, + "loss": 4.836, + "step": 53060 + }, + { + "epoch": 0.07456832883410584, + "grad_norm": 0.878521740436554, + "learning_rate": 0.00022362371785864828, + "loss": 4.9428, + "step": 53070 + }, + { + "epoch": 0.07458237977226942, + "grad_norm": 0.894644558429718, + "learning_rate": 0.0002236658704510327, + "loss": 4.911, + "step": 53080 + }, + { + "epoch": 0.074596430710433, + "grad_norm": 0.8709197044372559, + "learning_rate": 0.00022370802304341714, + "loss": 4.8799, + "step": 53090 + }, + { + "epoch": 0.07461048164859657, + "grad_norm": 0.8657320141792297, + 
"learning_rate": 0.00022375017563580157, + "loss": 4.9005, + "step": 53100 + }, + { + "epoch": 0.07462453258676015, + "grad_norm": 0.9332450032234192, + "learning_rate": 0.00022379232822818603, + "loss": 4.994, + "step": 53110 + }, + { + "epoch": 0.07463858352492372, + "grad_norm": 0.9028719067573547, + "learning_rate": 0.00022383448082057046, + "loss": 4.9586, + "step": 53120 + }, + { + "epoch": 0.07465263446308731, + "grad_norm": 0.9070411920547485, + "learning_rate": 0.00022387663341295487, + "loss": 5.001, + "step": 53130 + }, + { + "epoch": 0.07466668540125089, + "grad_norm": 0.8787438869476318, + "learning_rate": 0.0002239187860053393, + "loss": 4.9275, + "step": 53140 + }, + { + "epoch": 0.07468073633941447, + "grad_norm": 0.901603102684021, + "learning_rate": 0.00022396093859772373, + "loss": 4.9226, + "step": 53150 + }, + { + "epoch": 0.07469478727757804, + "grad_norm": 0.9032551646232605, + "learning_rate": 0.00022400309119010816, + "loss": 4.7562, + "step": 53160 + }, + { + "epoch": 0.07470883821574162, + "grad_norm": 0.9277701377868652, + "learning_rate": 0.00022404524378249262, + "loss": 4.9293, + "step": 53170 + }, + { + "epoch": 0.0747228891539052, + "grad_norm": 0.9006163477897644, + "learning_rate": 0.00022408739637487705, + "loss": 4.7839, + "step": 53180 + }, + { + "epoch": 0.07473694009206877, + "grad_norm": 0.9267420768737793, + "learning_rate": 0.00022412954896726146, + "loss": 4.7228, + "step": 53190 + }, + { + "epoch": 0.07475099103023235, + "grad_norm": 0.8903605937957764, + "learning_rate": 0.00022417170155964589, + "loss": 4.7873, + "step": 53200 + }, + { + "epoch": 0.07476504196839592, + "grad_norm": 0.8836460113525391, + "learning_rate": 0.00022421385415203032, + "loss": 4.9616, + "step": 53210 + }, + { + "epoch": 0.0747790929065595, + "grad_norm": 0.8987323045730591, + "learning_rate": 0.00022425600674441478, + "loss": 4.8712, + "step": 53220 + }, + { + "epoch": 0.07479314384472308, + "grad_norm": 0.8787208199501038, + "learning_rate": 
0.0002242981593367992, + "loss": 4.8302, + "step": 53230 + }, + { + "epoch": 0.07480719478288667, + "grad_norm": 0.8872378468513489, + "learning_rate": 0.00022434031192918364, + "loss": 4.7994, + "step": 53240 + }, + { + "epoch": 0.07482124572105024, + "grad_norm": 0.8948333263397217, + "learning_rate": 0.00022438246452156807, + "loss": 4.9177, + "step": 53250 + }, + { + "epoch": 0.07483529665921382, + "grad_norm": 0.8879920840263367, + "learning_rate": 0.00022442461711395247, + "loss": 4.8703, + "step": 53260 + }, + { + "epoch": 0.0748493475973774, + "grad_norm": 0.897305965423584, + "learning_rate": 0.0002244667697063369, + "loss": 4.8631, + "step": 53270 + }, + { + "epoch": 0.07486339853554097, + "grad_norm": 0.8983936905860901, + "learning_rate": 0.00022450892229872136, + "loss": 4.7904, + "step": 53280 + }, + { + "epoch": 0.07487744947370455, + "grad_norm": 0.9298288822174072, + "learning_rate": 0.0002245510748911058, + "loss": 4.9048, + "step": 53290 + }, + { + "epoch": 0.07489150041186812, + "grad_norm": 0.9018476009368896, + "learning_rate": 0.00022459322748349022, + "loss": 4.8721, + "step": 53300 + }, + { + "epoch": 0.0749055513500317, + "grad_norm": 0.8535480499267578, + "learning_rate": 0.00022463538007587466, + "loss": 4.9276, + "step": 53310 + }, + { + "epoch": 0.07491960228819528, + "grad_norm": 0.8795814514160156, + "learning_rate": 0.00022467753266825906, + "loss": 4.8589, + "step": 53320 + }, + { + "epoch": 0.07493365322635885, + "grad_norm": 0.9002819061279297, + "learning_rate": 0.0002247196852606435, + "loss": 5.0146, + "step": 53330 + }, + { + "epoch": 0.07494770416452244, + "grad_norm": 0.9197555780410767, + "learning_rate": 0.00022476183785302795, + "loss": 4.9211, + "step": 53340 + }, + { + "epoch": 0.07496175510268602, + "grad_norm": 0.9005763530731201, + "learning_rate": 0.00022480399044541238, + "loss": 4.9638, + "step": 53350 + }, + { + "epoch": 0.0749758060408496, + "grad_norm": 0.8709673881530762, + "learning_rate": 
0.0002248461430377968, + "loss": 4.8595, + "step": 53360 + }, + { + "epoch": 0.07498985697901317, + "grad_norm": 0.8983860611915588, + "learning_rate": 0.00022488829563018124, + "loss": 4.7567, + "step": 53370 + }, + { + "epoch": 0.07500390791717675, + "grad_norm": 0.8729245066642761, + "learning_rate": 0.00022493044822256567, + "loss": 4.8404, + "step": 53380 + }, + { + "epoch": 0.07501795885534032, + "grad_norm": 0.8825687170028687, + "learning_rate": 0.00022497260081495008, + "loss": 4.8511, + "step": 53390 + }, + { + "epoch": 0.0750320097935039, + "grad_norm": 1.1445651054382324, + "learning_rate": 0.00022501475340733454, + "loss": 4.8355, + "step": 53400 + }, + { + "epoch": 0.07504606073166747, + "grad_norm": 0.8848526477813721, + "learning_rate": 0.00022505690599971897, + "loss": 4.8923, + "step": 53410 + }, + { + "epoch": 0.07506011166983105, + "grad_norm": 0.9172847867012024, + "learning_rate": 0.0002250990585921034, + "loss": 4.9146, + "step": 53420 + }, + { + "epoch": 0.07507416260799463, + "grad_norm": 0.8731652498245239, + "learning_rate": 0.00022514121118448783, + "loss": 5.0147, + "step": 53430 + }, + { + "epoch": 0.07508821354615822, + "grad_norm": 0.8790530562400818, + "learning_rate": 0.00022518336377687226, + "loss": 4.8628, + "step": 53440 + }, + { + "epoch": 0.07510226448432179, + "grad_norm": 0.8736057281494141, + "learning_rate": 0.00022522551636925667, + "loss": 4.7773, + "step": 53450 + }, + { + "epoch": 0.07511631542248537, + "grad_norm": 0.9131947755813599, + "learning_rate": 0.00022526766896164112, + "loss": 4.7888, + "step": 53460 + }, + { + "epoch": 0.07513036636064895, + "grad_norm": 0.8734030723571777, + "learning_rate": 0.00022530982155402555, + "loss": 4.6335, + "step": 53470 + }, + { + "epoch": 0.07514441729881252, + "grad_norm": 0.8606353402137756, + "learning_rate": 0.00022535197414640999, + "loss": 4.7898, + "step": 53480 + }, + { + "epoch": 0.0751584682369761, + "grad_norm": 0.9181352853775024, + "learning_rate": 
0.00022539412673879442, + "loss": 4.7451, + "step": 53490 + }, + { + "epoch": 0.07517251917513967, + "grad_norm": 0.8751580119132996, + "learning_rate": 0.00022543627933117885, + "loss": 4.9376, + "step": 53500 + }, + { + "epoch": 0.07518657011330325, + "grad_norm": 0.8582903742790222, + "learning_rate": 0.00022547843192356325, + "loss": 4.8904, + "step": 53510 + }, + { + "epoch": 0.07520062105146683, + "grad_norm": 0.9157120585441589, + "learning_rate": 0.0002255205845159477, + "loss": 4.8151, + "step": 53520 + }, + { + "epoch": 0.0752146719896304, + "grad_norm": 0.866913914680481, + "learning_rate": 0.00022556273710833214, + "loss": 4.8619, + "step": 53530 + }, + { + "epoch": 0.07522872292779398, + "grad_norm": 0.9174304008483887, + "learning_rate": 0.00022560488970071657, + "loss": 4.8327, + "step": 53540 + }, + { + "epoch": 0.07524277386595757, + "grad_norm": 0.8768737316131592, + "learning_rate": 0.000225647042293101, + "loss": 4.7639, + "step": 53550 + }, + { + "epoch": 0.07525682480412114, + "grad_norm": 0.8870129585266113, + "learning_rate": 0.00022568919488548544, + "loss": 4.7549, + "step": 53560 + }, + { + "epoch": 0.07527087574228472, + "grad_norm": 0.8497852087020874, + "learning_rate": 0.0002257313474778699, + "loss": 4.6357, + "step": 53570 + }, + { + "epoch": 0.0752849266804483, + "grad_norm": 0.8594181537628174, + "learning_rate": 0.0002257735000702543, + "loss": 4.9172, + "step": 53580 + }, + { + "epoch": 0.07529897761861187, + "grad_norm": 0.8665555715560913, + "learning_rate": 0.00022581565266263873, + "loss": 4.8535, + "step": 53590 + }, + { + "epoch": 0.07531302855677545, + "grad_norm": 0.8849197030067444, + "learning_rate": 0.00022585780525502316, + "loss": 4.8296, + "step": 53600 + }, + { + "epoch": 0.07532707949493903, + "grad_norm": 0.8826219439506531, + "learning_rate": 0.0002258999578474076, + "loss": 4.8087, + "step": 53610 + }, + { + "epoch": 0.0753411304331026, + "grad_norm": 0.9253659248352051, + "learning_rate": 
0.00022594211043979202, + "loss": 4.8322, + "step": 53620 + }, + { + "epoch": 0.07535518137126618, + "grad_norm": 0.9128340482711792, + "learning_rate": 0.00022598426303217648, + "loss": 4.6608, + "step": 53630 + }, + { + "epoch": 0.07536923230942975, + "grad_norm": 0.8583227396011353, + "learning_rate": 0.00022602641562456088, + "loss": 4.7733, + "step": 53640 + }, + { + "epoch": 0.07538328324759334, + "grad_norm": 0.8400426506996155, + "learning_rate": 0.00022606856821694532, + "loss": 4.799, + "step": 53650 + }, + { + "epoch": 0.07539733418575692, + "grad_norm": 0.9156240224838257, + "learning_rate": 0.00022611072080932975, + "loss": 4.7519, + "step": 53660 + }, + { + "epoch": 0.0754113851239205, + "grad_norm": 0.902900218963623, + "learning_rate": 0.00022615287340171418, + "loss": 4.6597, + "step": 53670 + }, + { + "epoch": 0.07542543606208407, + "grad_norm": 0.8428563475608826, + "learning_rate": 0.0002261950259940986, + "loss": 4.8529, + "step": 53680 + }, + { + "epoch": 0.07543948700024765, + "grad_norm": 0.9369030594825745, + "learning_rate": 0.00022623717858648307, + "loss": 4.6786, + "step": 53690 + }, + { + "epoch": 0.07545353793841122, + "grad_norm": 0.8624921441078186, + "learning_rate": 0.0002262793311788675, + "loss": 4.8759, + "step": 53700 + }, + { + "epoch": 0.0754675888765748, + "grad_norm": 0.864963710308075, + "learning_rate": 0.0002263214837712519, + "loss": 4.8758, + "step": 53710 + }, + { + "epoch": 0.07548163981473838, + "grad_norm": 0.8992347121238708, + "learning_rate": 0.00022636363636363633, + "loss": 4.7599, + "step": 53720 + }, + { + "epoch": 0.07549569075290195, + "grad_norm": 0.9037030935287476, + "learning_rate": 0.00022640578895602077, + "loss": 4.8639, + "step": 53730 + }, + { + "epoch": 0.07550974169106553, + "grad_norm": 0.917029857635498, + "learning_rate": 0.0002264479415484052, + "loss": 4.7925, + "step": 53740 + }, + { + "epoch": 0.07552379262922912, + "grad_norm": 0.8903194665908813, + "learning_rate": 
0.00022649009414078965, + "loss": 4.7699, + "step": 53750 + }, + { + "epoch": 0.0755378435673927, + "grad_norm": 0.8409320712089539, + "learning_rate": 0.00022653224673317409, + "loss": 4.7887, + "step": 53760 + }, + { + "epoch": 0.07555189450555627, + "grad_norm": 0.9254723787307739, + "learning_rate": 0.0002265743993255585, + "loss": 4.7081, + "step": 53770 + }, + { + "epoch": 0.07556594544371985, + "grad_norm": 0.9033946990966797, + "learning_rate": 0.00022661655191794292, + "loss": 4.8571, + "step": 53780 + }, + { + "epoch": 0.07557999638188342, + "grad_norm": 0.8677847385406494, + "learning_rate": 0.00022665870451032735, + "loss": 4.8679, + "step": 53790 + }, + { + "epoch": 0.075594047320047, + "grad_norm": 0.871347963809967, + "learning_rate": 0.0002267008571027118, + "loss": 4.7151, + "step": 53800 + }, + { + "epoch": 0.07560809825821058, + "grad_norm": 0.8525243997573853, + "learning_rate": 0.00022674300969509624, + "loss": 4.9235, + "step": 53810 + }, + { + "epoch": 0.07562214919637415, + "grad_norm": 0.850013256072998, + "learning_rate": 0.00022678516228748067, + "loss": 4.9236, + "step": 53820 + }, + { + "epoch": 0.07563620013453773, + "grad_norm": 0.8850587606430054, + "learning_rate": 0.0002268273148798651, + "loss": 4.8146, + "step": 53830 + }, + { + "epoch": 0.0756502510727013, + "grad_norm": 0.9525415301322937, + "learning_rate": 0.0002268694674722495, + "loss": 4.6886, + "step": 53840 + }, + { + "epoch": 0.07566430201086488, + "grad_norm": 0.8798964619636536, + "learning_rate": 0.00022691162006463394, + "loss": 4.7848, + "step": 53850 + }, + { + "epoch": 0.07567835294902847, + "grad_norm": 0.8539289236068726, + "learning_rate": 0.0002269537726570184, + "loss": 4.7355, + "step": 53860 + }, + { + "epoch": 0.07569240388719205, + "grad_norm": 0.8648083806037903, + "learning_rate": 0.00022699592524940283, + "loss": 4.7876, + "step": 53870 + }, + { + "epoch": 0.07570645482535562, + "grad_norm": 0.906799852848053, + "learning_rate": 
0.00022703807784178726, + "loss": 4.8246, + "step": 53880 + }, + { + "epoch": 0.0757205057635192, + "grad_norm": 0.9276502132415771, + "learning_rate": 0.0002270802304341717, + "loss": 4.8731, + "step": 53890 + }, + { + "epoch": 0.07573455670168278, + "grad_norm": 0.8921993970870972, + "learning_rate": 0.0002271223830265561, + "loss": 4.8887, + "step": 53900 + }, + { + "epoch": 0.07574860763984635, + "grad_norm": 0.8780595064163208, + "learning_rate": 0.00022716453561894053, + "loss": 4.7841, + "step": 53910 + }, + { + "epoch": 0.07576265857800993, + "grad_norm": 0.9392471313476562, + "learning_rate": 0.00022720668821132498, + "loss": 4.8724, + "step": 53920 + }, + { + "epoch": 0.0757767095161735, + "grad_norm": 0.89900803565979, + "learning_rate": 0.00022724884080370942, + "loss": 4.7855, + "step": 53930 + }, + { + "epoch": 0.07579076045433708, + "grad_norm": 0.9714470505714417, + "learning_rate": 0.00022729099339609385, + "loss": 4.8981, + "step": 53940 + }, + { + "epoch": 0.07580481139250066, + "grad_norm": 0.8538759350776672, + "learning_rate": 0.00022733314598847828, + "loss": 4.9213, + "step": 53950 + }, + { + "epoch": 0.07581886233066425, + "grad_norm": 0.879152238368988, + "learning_rate": 0.0002273752985808627, + "loss": 4.8243, + "step": 53960 + }, + { + "epoch": 0.07583291326882782, + "grad_norm": 0.8458477258682251, + "learning_rate": 0.0002274174511732471, + "loss": 4.8997, + "step": 53970 + }, + { + "epoch": 0.0758469642069914, + "grad_norm": 0.9155076146125793, + "learning_rate": 0.00022745960376563157, + "loss": 4.8447, + "step": 53980 + }, + { + "epoch": 0.07586101514515498, + "grad_norm": 0.9187341332435608, + "learning_rate": 0.000227501756358016, + "loss": 4.9255, + "step": 53990 + }, + { + "epoch": 0.07587506608331855, + "grad_norm": 0.8751149773597717, + "learning_rate": 0.00022754390895040043, + "loss": 4.8512, + "step": 54000 + }, + { + "epoch": 0.07588911702148213, + "grad_norm": 0.8937033414840698, + "learning_rate": 
0.00022758606154278486, + "loss": 4.8653, + "step": 54010 + }, + { + "epoch": 0.0759031679596457, + "grad_norm": 0.862657904624939, + "learning_rate": 0.0002276282141351693, + "loss": 4.748, + "step": 54020 + }, + { + "epoch": 0.07591721889780928, + "grad_norm": 0.8705819845199585, + "learning_rate": 0.0002276703667275537, + "loss": 4.8872, + "step": 54030 + }, + { + "epoch": 0.07593126983597286, + "grad_norm": 0.8772851824760437, + "learning_rate": 0.00022771251931993816, + "loss": 4.9373, + "step": 54040 + }, + { + "epoch": 0.07594532077413643, + "grad_norm": 0.8797933459281921, + "learning_rate": 0.0002277546719123226, + "loss": 4.8121, + "step": 54050 + }, + { + "epoch": 0.07595937171230002, + "grad_norm": 0.8654122948646545, + "learning_rate": 0.00022779682450470702, + "loss": 4.76, + "step": 54060 + }, + { + "epoch": 0.0759734226504636, + "grad_norm": 0.875860869884491, + "learning_rate": 0.00022783897709709145, + "loss": 4.7232, + "step": 54070 + }, + { + "epoch": 0.07598747358862717, + "grad_norm": 0.9174049496650696, + "learning_rate": 0.00022788112968947588, + "loss": 4.7754, + "step": 54080 + }, + { + "epoch": 0.07600152452679075, + "grad_norm": 0.8807012438774109, + "learning_rate": 0.00022792328228186034, + "loss": 4.8693, + "step": 54090 + }, + { + "epoch": 0.07601557546495433, + "grad_norm": 0.8439174294471741, + "learning_rate": 0.00022796543487424475, + "loss": 4.8432, + "step": 54100 + }, + { + "epoch": 0.0760296264031179, + "grad_norm": 0.9406492710113525, + "learning_rate": 0.00022800758746662918, + "loss": 4.8385, + "step": 54110 + }, + { + "epoch": 0.07604367734128148, + "grad_norm": 0.8603804707527161, + "learning_rate": 0.0002280497400590136, + "loss": 4.7904, + "step": 54120 + }, + { + "epoch": 0.07605772827944506, + "grad_norm": 0.8673272132873535, + "learning_rate": 0.0002280876773921596, + "loss": 4.8706, + "step": 54130 + }, + { + "epoch": 0.07607177921760863, + "grad_norm": 0.8606926202774048, + "learning_rate": 0.00022812982998454402, 
+ "loss": 4.7967, + "step": 54140 + }, + { + "epoch": 0.07608583015577221, + "grad_norm": 0.8870376348495483, + "learning_rate": 0.00022817198257692845, + "loss": 4.8095, + "step": 54150 + }, + { + "epoch": 0.07609988109393578, + "grad_norm": 0.8442344665527344, + "learning_rate": 0.00022821413516931288, + "loss": 4.8743, + "step": 54160 + }, + { + "epoch": 0.07611393203209937, + "grad_norm": 0.8706654906272888, + "learning_rate": 0.00022825628776169734, + "loss": 4.839, + "step": 54170 + }, + { + "epoch": 0.07612798297026295, + "grad_norm": 0.8698878288269043, + "learning_rate": 0.00022829844035408177, + "loss": 4.8972, + "step": 54180 + }, + { + "epoch": 0.07614203390842653, + "grad_norm": 0.8605942726135254, + "learning_rate": 0.00022834059294646618, + "loss": 4.7414, + "step": 54190 + }, + { + "epoch": 0.0761560848465901, + "grad_norm": 0.8853992819786072, + "learning_rate": 0.0002283827455388506, + "loss": 4.7935, + "step": 54200 + }, + { + "epoch": 0.07617013578475368, + "grad_norm": 1.3746801614761353, + "learning_rate": 0.00022842489813123504, + "loss": 4.8538, + "step": 54210 + }, + { + "epoch": 0.07618418672291725, + "grad_norm": 0.8895174264907837, + "learning_rate": 0.00022846705072361947, + "loss": 4.8864, + "step": 54220 + }, + { + "epoch": 0.07619823766108083, + "grad_norm": 0.9199739098548889, + "learning_rate": 0.00022850920331600393, + "loss": 4.7924, + "step": 54230 + }, + { + "epoch": 0.07621228859924441, + "grad_norm": 0.9062716364860535, + "learning_rate": 0.00022855135590838836, + "loss": 4.8258, + "step": 54240 + }, + { + "epoch": 0.07622633953740798, + "grad_norm": 0.8298748731613159, + "learning_rate": 0.0002285935085007728, + "loss": 4.8469, + "step": 54250 + }, + { + "epoch": 0.07624039047557156, + "grad_norm": 0.866485059261322, + "learning_rate": 0.0002286356610931572, + "loss": 4.8599, + "step": 54260 + }, + { + "epoch": 0.07625444141373515, + "grad_norm": 0.8970810174942017, + "learning_rate": 0.00022867781368554163, + "loss": 
4.8573, + "step": 54270 + }, + { + "epoch": 0.07626849235189873, + "grad_norm": 0.8525856137275696, + "learning_rate": 0.00022871996627792606, + "loss": 4.7595, + "step": 54280 + }, + { + "epoch": 0.0762825432900623, + "grad_norm": 0.9844761490821838, + "learning_rate": 0.00022876211887031052, + "loss": 4.829, + "step": 54290 + }, + { + "epoch": 0.07629659422822588, + "grad_norm": 0.8804373741149902, + "learning_rate": 0.00022880427146269495, + "loss": 4.8698, + "step": 54300 + }, + { + "epoch": 0.07631064516638945, + "grad_norm": 0.9130948781967163, + "learning_rate": 0.00022884642405507938, + "loss": 4.8689, + "step": 54310 + }, + { + "epoch": 0.07632469610455303, + "grad_norm": 0.8662232160568237, + "learning_rate": 0.00022888857664746378, + "loss": 4.9434, + "step": 54320 + }, + { + "epoch": 0.0763387470427166, + "grad_norm": 0.8689405918121338, + "learning_rate": 0.00022893072923984821, + "loss": 4.892, + "step": 54330 + }, + { + "epoch": 0.07635279798088018, + "grad_norm": 0.8666510581970215, + "learning_rate": 0.00022897288183223264, + "loss": 4.9568, + "step": 54340 + }, + { + "epoch": 0.07636684891904376, + "grad_norm": 0.885850191116333, + "learning_rate": 0.0002290150344246171, + "loss": 4.8736, + "step": 54350 + }, + { + "epoch": 0.07638089985720733, + "grad_norm": 0.8362758159637451, + "learning_rate": 0.00022905718701700153, + "loss": 4.7962, + "step": 54360 + }, + { + "epoch": 0.07639495079537093, + "grad_norm": 0.8462679982185364, + "learning_rate": 0.00022909933960938597, + "loss": 4.7975, + "step": 54370 + }, + { + "epoch": 0.0764090017335345, + "grad_norm": 0.8839713335037231, + "learning_rate": 0.0002291414922017704, + "loss": 4.7665, + "step": 54380 + }, + { + "epoch": 0.07642305267169808, + "grad_norm": 0.8993695974349976, + "learning_rate": 0.0002291836447941548, + "loss": 4.8324, + "step": 54390 + }, + { + "epoch": 0.07643710360986165, + "grad_norm": 0.8608269095420837, + "learning_rate": 0.00022922579738653923, + "loss": 4.7931, + "step": 
54400 + }, + { + "epoch": 0.07645115454802523, + "grad_norm": 0.8435550928115845, + "learning_rate": 0.0002292679499789237, + "loss": 4.7409, + "step": 54410 + }, + { + "epoch": 0.0764652054861888, + "grad_norm": 0.9165601134300232, + "learning_rate": 0.00022931010257130812, + "loss": 4.7406, + "step": 54420 + }, + { + "epoch": 0.07647925642435238, + "grad_norm": 0.8588644862174988, + "learning_rate": 0.00022935225516369255, + "loss": 4.7544, + "step": 54430 + }, + { + "epoch": 0.07649330736251596, + "grad_norm": 0.9699477553367615, + "learning_rate": 0.00022939440775607698, + "loss": 4.8401, + "step": 54440 + }, + { + "epoch": 0.07650735830067953, + "grad_norm": 0.822721004486084, + "learning_rate": 0.0002294365603484614, + "loss": 4.9051, + "step": 54450 + }, + { + "epoch": 0.07652140923884311, + "grad_norm": 0.8678251504898071, + "learning_rate": 0.00022947871294084585, + "loss": 4.9236, + "step": 54460 + }, + { + "epoch": 0.07653546017700669, + "grad_norm": 0.8811358213424683, + "learning_rate": 0.00022952086553323028, + "loss": 4.9256, + "step": 54470 + }, + { + "epoch": 0.07654951111517028, + "grad_norm": 0.8711695075035095, + "learning_rate": 0.0002295630181256147, + "loss": 4.8998, + "step": 54480 + }, + { + "epoch": 0.07656356205333385, + "grad_norm": 0.8371874094009399, + "learning_rate": 0.00022960517071799914, + "loss": 4.7928, + "step": 54490 + }, + { + "epoch": 0.07657761299149743, + "grad_norm": 0.8641277551651001, + "learning_rate": 0.00022964732331038357, + "loss": 4.8052, + "step": 54500 + }, + { + "epoch": 0.076591663929661, + "grad_norm": 0.8634895086288452, + "learning_rate": 0.00022968947590276803, + "loss": 4.9161, + "step": 54510 + }, + { + "epoch": 0.07660571486782458, + "grad_norm": 0.9361057281494141, + "learning_rate": 0.00022973162849515243, + "loss": 4.8, + "step": 54520 + }, + { + "epoch": 0.07661976580598816, + "grad_norm": 0.8911561965942383, + "learning_rate": 0.00022977378108753686, + "loss": 4.8536, + "step": 54530 + }, + { + 
"epoch": 0.07663381674415173, + "grad_norm": 0.8985777497291565, + "learning_rate": 0.0002298159336799213, + "loss": 4.6861, + "step": 54540 + }, + { + "epoch": 0.07664786768231531, + "grad_norm": 0.8521196246147156, + "learning_rate": 0.00022985808627230573, + "loss": 4.8514, + "step": 54550 + }, + { + "epoch": 0.07666191862047889, + "grad_norm": 0.8784716129302979, + "learning_rate": 0.00022990023886469016, + "loss": 4.8231, + "step": 54560 + }, + { + "epoch": 0.07667596955864246, + "grad_norm": 0.8700553774833679, + "learning_rate": 0.00022994239145707462, + "loss": 4.8225, + "step": 54570 + }, + { + "epoch": 0.07669002049680605, + "grad_norm": 0.8594143986701965, + "learning_rate": 0.00022998454404945902, + "loss": 4.8293, + "step": 54580 + }, + { + "epoch": 0.07670407143496963, + "grad_norm": 0.8826280832290649, + "learning_rate": 0.00023002669664184345, + "loss": 4.8003, + "step": 54590 + }, + { + "epoch": 0.0767181223731332, + "grad_norm": 0.9344685673713684, + "learning_rate": 0.00023006884923422788, + "loss": 4.688, + "step": 54600 + }, + { + "epoch": 0.07673217331129678, + "grad_norm": 0.8549621105194092, + "learning_rate": 0.00023010678656737386, + "loss": 4.8083, + "step": 54610 + }, + { + "epoch": 0.07674622424946036, + "grad_norm": 0.8863023519515991, + "learning_rate": 0.0002301489391597583, + "loss": 4.681, + "step": 54620 + }, + { + "epoch": 0.07676027518762393, + "grad_norm": 0.8559615015983582, + "learning_rate": 0.00023019109175214273, + "loss": 4.8223, + "step": 54630 + }, + { + "epoch": 0.07677432612578751, + "grad_norm": 0.8779832720756531, + "learning_rate": 0.00023023324434452716, + "loss": 4.8405, + "step": 54640 + }, + { + "epoch": 0.07678837706395109, + "grad_norm": 0.8753707408905029, + "learning_rate": 0.00023027539693691162, + "loss": 4.8169, + "step": 54650 + }, + { + "epoch": 0.07680242800211466, + "grad_norm": 0.8928582668304443, + "learning_rate": 0.00023031754952929605, + "loss": 4.7668, + "step": 54660 + }, + { + "epoch": 
0.07681647894027824, + "grad_norm": 0.8623060584068298, + "learning_rate": 0.00023035970212168048, + "loss": 4.7727, + "step": 54670 + }, + { + "epoch": 0.07683052987844183, + "grad_norm": 0.9036771655082703, + "learning_rate": 0.00023040185471406488, + "loss": 4.7807, + "step": 54680 + }, + { + "epoch": 0.0768445808166054, + "grad_norm": 0.8482112884521484, + "learning_rate": 0.00023044400730644931, + "loss": 4.7243, + "step": 54690 + }, + { + "epoch": 0.07685863175476898, + "grad_norm": 0.8324148058891296, + "learning_rate": 0.00023048615989883375, + "loss": 4.8233, + "step": 54700 + }, + { + "epoch": 0.07687268269293256, + "grad_norm": 0.8993374109268188, + "learning_rate": 0.0002305283124912182, + "loss": 4.9474, + "step": 54710 + }, + { + "epoch": 0.07688673363109613, + "grad_norm": 0.8752866387367249, + "learning_rate": 0.00023057046508360263, + "loss": 4.8148, + "step": 54720 + }, + { + "epoch": 0.07690078456925971, + "grad_norm": 0.884166419506073, + "learning_rate": 0.00023061261767598707, + "loss": 4.9107, + "step": 54730 + }, + { + "epoch": 0.07691483550742328, + "grad_norm": 0.8490076661109924, + "learning_rate": 0.00023065477026837147, + "loss": 4.7861, + "step": 54740 + }, + { + "epoch": 0.07692888644558686, + "grad_norm": 0.8822079300880432, + "learning_rate": 0.0002306969228607559, + "loss": 4.959, + "step": 54750 + }, + { + "epoch": 0.07694293738375044, + "grad_norm": 0.8657336831092834, + "learning_rate": 0.00023073907545314033, + "loss": 4.7831, + "step": 54760 + }, + { + "epoch": 0.07695698832191401, + "grad_norm": 0.8511672019958496, + "learning_rate": 0.0002307812280455248, + "loss": 4.7675, + "step": 54770 + }, + { + "epoch": 0.07697103926007759, + "grad_norm": 0.8640127778053284, + "learning_rate": 0.00023082338063790922, + "loss": 4.8313, + "step": 54780 + }, + { + "epoch": 0.07698509019824118, + "grad_norm": 0.8668428659439087, + "learning_rate": 0.00023086553323029365, + "loss": 4.9068, + "step": 54790 + }, + { + "epoch": 
0.07699914113640476, + "grad_norm": 0.8341074585914612, + "learning_rate": 0.00023090768582267808, + "loss": 4.8259, + "step": 54800 + }, + { + "epoch": 0.07701319207456833, + "grad_norm": 0.8660095930099487, + "learning_rate": 0.0002309498384150625, + "loss": 4.7503, + "step": 54810 + }, + { + "epoch": 0.07702724301273191, + "grad_norm": 0.8701623678207397, + "learning_rate": 0.00023099199100744692, + "loss": 4.7477, + "step": 54820 + }, + { + "epoch": 0.07704129395089548, + "grad_norm": 0.8704190850257874, + "learning_rate": 0.00023103414359983138, + "loss": 4.7662, + "step": 54830 + }, + { + "epoch": 0.07705534488905906, + "grad_norm": 0.8704859614372253, + "learning_rate": 0.0002310762961922158, + "loss": 4.8061, + "step": 54840 + }, + { + "epoch": 0.07706939582722264, + "grad_norm": 0.8688639998435974, + "learning_rate": 0.00023111844878460024, + "loss": 4.6981, + "step": 54850 + }, + { + "epoch": 0.07708344676538621, + "grad_norm": 0.8302347660064697, + "learning_rate": 0.00023116060137698467, + "loss": 4.7741, + "step": 54860 + }, + { + "epoch": 0.07709749770354979, + "grad_norm": 0.8849836587905884, + "learning_rate": 0.00023120275396936908, + "loss": 4.8139, + "step": 54870 + }, + { + "epoch": 0.07711154864171336, + "grad_norm": 0.8696654438972473, + "learning_rate": 0.0002312449065617535, + "loss": 4.8226, + "step": 54880 + }, + { + "epoch": 0.07712559957987696, + "grad_norm": 0.8935073018074036, + "learning_rate": 0.00023128705915413796, + "loss": 4.7954, + "step": 54890 + }, + { + "epoch": 0.07713965051804053, + "grad_norm": 0.8748064637184143, + "learning_rate": 0.0002313292117465224, + "loss": 4.8052, + "step": 54900 + }, + { + "epoch": 0.07715370145620411, + "grad_norm": 0.8696599006652832, + "learning_rate": 0.00023137136433890683, + "loss": 4.8104, + "step": 54910 + }, + { + "epoch": 0.07716775239436768, + "grad_norm": 0.852148175239563, + "learning_rate": 0.00023141351693129126, + "loss": 4.7914, + "step": 54920 + }, + { + "epoch": 
0.07718180333253126, + "grad_norm": 0.849692702293396, + "learning_rate": 0.0002314556695236757, + "loss": 4.8389, + "step": 54930 + }, + { + "epoch": 0.07719585427069484, + "grad_norm": 0.8747760057449341, + "learning_rate": 0.0002314978221160601, + "loss": 4.8114, + "step": 54940 + }, + { + "epoch": 0.07720990520885841, + "grad_norm": 0.972870409488678, + "learning_rate": 0.00023153997470844455, + "loss": 4.9658, + "step": 54950 + }, + { + "epoch": 0.07722395614702199, + "grad_norm": 0.883500337600708, + "learning_rate": 0.00023158212730082898, + "loss": 4.915, + "step": 54960 + }, + { + "epoch": 0.07723800708518556, + "grad_norm": 0.860719621181488, + "learning_rate": 0.00023162427989321341, + "loss": 4.7428, + "step": 54970 + }, + { + "epoch": 0.07725205802334914, + "grad_norm": 0.8673244118690491, + "learning_rate": 0.00023166643248559785, + "loss": 4.9087, + "step": 54980 + }, + { + "epoch": 0.07726610896151273, + "grad_norm": 0.862847626209259, + "learning_rate": 0.00023170858507798228, + "loss": 4.7927, + "step": 54990 + }, + { + "epoch": 0.0772801598996763, + "grad_norm": 0.873631477355957, + "learning_rate": 0.00023175073767036668, + "loss": 4.7431, + "step": 55000 + }, + { + "epoch": 0.07729421083783988, + "grad_norm": 0.8591784834861755, + "learning_rate": 0.00023179289026275114, + "loss": 4.8536, + "step": 55010 + }, + { + "epoch": 0.07730826177600346, + "grad_norm": 0.8631607890129089, + "learning_rate": 0.00023183504285513557, + "loss": 4.7317, + "step": 55020 + }, + { + "epoch": 0.07732231271416704, + "grad_norm": 0.8369123339653015, + "learning_rate": 0.00023187719544752, + "loss": 4.7036, + "step": 55030 + }, + { + "epoch": 0.07733636365233061, + "grad_norm": 0.8408652544021606, + "learning_rate": 0.00023191934803990443, + "loss": 4.9028, + "step": 55040 + }, + { + "epoch": 0.07735041459049419, + "grad_norm": 0.8339114189147949, + "learning_rate": 0.0002319615006322889, + "loss": 4.7352, + "step": 55050 + }, + { + "epoch": 0.07736446552865776, + 
"grad_norm": 0.8363640308380127, + "learning_rate": 0.00023200365322467327, + "loss": 4.7126, + "step": 55060 + }, + { + "epoch": 0.07737851646682134, + "grad_norm": 0.8503546714782715, + "learning_rate": 0.00023204580581705773, + "loss": 4.8552, + "step": 55070 + }, + { + "epoch": 0.07739256740498492, + "grad_norm": 0.8623889088630676, + "learning_rate": 0.00023208795840944216, + "loss": 4.7625, + "step": 55080 + }, + { + "epoch": 0.07740661834314849, + "grad_norm": 0.8672446608543396, + "learning_rate": 0.0002321301110018266, + "loss": 4.703, + "step": 55090 + }, + { + "epoch": 0.07742066928131208, + "grad_norm": 0.83352130651474, + "learning_rate": 0.00023217226359421102, + "loss": 4.9044, + "step": 55100 + }, + { + "epoch": 0.07743472021947566, + "grad_norm": 0.8357213139533997, + "learning_rate": 0.00023221441618659548, + "loss": 4.7765, + "step": 55110 + }, + { + "epoch": 0.07744877115763923, + "grad_norm": 0.8617483973503113, + "learning_rate": 0.0002322565687789799, + "loss": 4.8928, + "step": 55120 + }, + { + "epoch": 0.07746282209580281, + "grad_norm": 0.8757510185241699, + "learning_rate": 0.0002322987213713643, + "loss": 4.9618, + "step": 55130 + }, + { + "epoch": 0.07747687303396639, + "grad_norm": 0.8584406971931458, + "learning_rate": 0.00023234087396374874, + "loss": 4.7733, + "step": 55140 + }, + { + "epoch": 0.07749092397212996, + "grad_norm": 0.8585185408592224, + "learning_rate": 0.00023238302655613318, + "loss": 4.8477, + "step": 55150 + }, + { + "epoch": 0.07750497491029354, + "grad_norm": 0.9038747549057007, + "learning_rate": 0.0002324251791485176, + "loss": 4.9543, + "step": 55160 + }, + { + "epoch": 0.07751902584845712, + "grad_norm": 0.8479253649711609, + "learning_rate": 0.00023246733174090206, + "loss": 4.7465, + "step": 55170 + }, + { + "epoch": 0.07753307678662069, + "grad_norm": 0.8460027575492859, + "learning_rate": 0.0002325094843332865, + "loss": 4.8782, + "step": 55180 + }, + { + "epoch": 0.07754712772478427, + "grad_norm": 
0.8895059823989868, + "learning_rate": 0.0002325516369256709, + "loss": 4.9051, + "step": 55190 + }, + { + "epoch": 0.07756117866294786, + "grad_norm": 0.8584068417549133, + "learning_rate": 0.00023259378951805533, + "loss": 4.8017, + "step": 55200 + }, + { + "epoch": 0.07757522960111143, + "grad_norm": 0.8654706478118896, + "learning_rate": 0.00023263594211043976, + "loss": 4.8685, + "step": 55210 + }, + { + "epoch": 0.07758928053927501, + "grad_norm": 0.8444214463233948, + "learning_rate": 0.0002326780947028242, + "loss": 4.7856, + "step": 55220 + }, + { + "epoch": 0.07760333147743859, + "grad_norm": 0.9347013235092163, + "learning_rate": 0.00023272024729520865, + "loss": 4.8631, + "step": 55230 + }, + { + "epoch": 0.07761738241560216, + "grad_norm": 0.8600069880485535, + "learning_rate": 0.00023276239988759308, + "loss": 4.953, + "step": 55240 + }, + { + "epoch": 0.07763143335376574, + "grad_norm": 0.8897565007209778, + "learning_rate": 0.00023280455247997751, + "loss": 4.8058, + "step": 55250 + }, + { + "epoch": 0.07764548429192931, + "grad_norm": 0.868504524230957, + "learning_rate": 0.00023284670507236192, + "loss": 4.7797, + "step": 55260 + }, + { + "epoch": 0.07765953523009289, + "grad_norm": 0.8592216372489929, + "learning_rate": 0.00023288885766474635, + "loss": 4.7845, + "step": 55270 + }, + { + "epoch": 0.07767358616825647, + "grad_norm": 0.8826098442077637, + "learning_rate": 0.00023293101025713078, + "loss": 4.7159, + "step": 55280 + }, + { + "epoch": 0.07768763710642004, + "grad_norm": 0.8514786958694458, + "learning_rate": 0.00023297316284951524, + "loss": 4.7917, + "step": 55290 + }, + { + "epoch": 0.07770168804458363, + "grad_norm": 0.9170621037483215, + "learning_rate": 0.00023301531544189967, + "loss": 4.7785, + "step": 55300 + }, + { + "epoch": 0.07771573898274721, + "grad_norm": 0.8372135162353516, + "learning_rate": 0.0002330574680342841, + "loss": 4.8629, + "step": 55310 + }, + { + "epoch": 0.07772978992091079, + "grad_norm": 
0.867605447769165, + "learning_rate": 0.0002330996206266685, + "loss": 4.8731, + "step": 55320 + }, + { + "epoch": 0.07774384085907436, + "grad_norm": 0.8326741456985474, + "learning_rate": 0.00023314177321905294, + "loss": 4.9606, + "step": 55330 + }, + { + "epoch": 0.07775789179723794, + "grad_norm": 0.8772097229957581, + "learning_rate": 0.00023318392581143737, + "loss": 4.7784, + "step": 55340 + }, + { + "epoch": 0.07777194273540151, + "grad_norm": 0.8476175665855408, + "learning_rate": 0.00023322607840382183, + "loss": 4.9128, + "step": 55350 + }, + { + "epoch": 0.07778599367356509, + "grad_norm": 0.8802474737167358, + "learning_rate": 0.00023326823099620626, + "loss": 4.7922, + "step": 55360 + }, + { + "epoch": 0.07780004461172867, + "grad_norm": 0.8601409792900085, + "learning_rate": 0.0002333103835885907, + "loss": 4.8804, + "step": 55370 + }, + { + "epoch": 0.07781409554989224, + "grad_norm": 0.8463119864463806, + "learning_rate": 0.00023335253618097512, + "loss": 4.8471, + "step": 55380 + }, + { + "epoch": 0.07782814648805582, + "grad_norm": 0.8972960114479065, + "learning_rate": 0.00023339468877335952, + "loss": 4.8792, + "step": 55390 + }, + { + "epoch": 0.0778421974262194, + "grad_norm": 0.9296767711639404, + "learning_rate": 0.00023343684136574395, + "loss": 4.7318, + "step": 55400 + }, + { + "epoch": 0.07785624836438299, + "grad_norm": 0.9014337062835693, + "learning_rate": 0.0002334789939581284, + "loss": 4.8008, + "step": 55410 + }, + { + "epoch": 0.07787029930254656, + "grad_norm": 0.8471907377243042, + "learning_rate": 0.00023352114655051284, + "loss": 4.9053, + "step": 55420 + }, + { + "epoch": 0.07788435024071014, + "grad_norm": 0.8656927347183228, + "learning_rate": 0.00023356329914289727, + "loss": 4.8065, + "step": 55430 + }, + { + "epoch": 0.07789840117887371, + "grad_norm": 0.8613098859786987, + "learning_rate": 0.0002336054517352817, + "loss": 5.0078, + "step": 55440 + }, + { + "epoch": 0.07791245211703729, + "grad_norm": 
0.8597536683082581, + "learning_rate": 0.0002336476043276661, + "loss": 4.8672, + "step": 55450 + }, + { + "epoch": 0.07792650305520087, + "grad_norm": 0.8532637357711792, + "learning_rate": 0.00023368975692005054, + "loss": 4.7714, + "step": 55460 + }, + { + "epoch": 0.07794055399336444, + "grad_norm": 0.9325604438781738, + "learning_rate": 0.000233731909512435, + "loss": 4.6782, + "step": 55470 + }, + { + "epoch": 0.07795460493152802, + "grad_norm": 0.8740974068641663, + "learning_rate": 0.00023377406210481943, + "loss": 4.7074, + "step": 55480 + }, + { + "epoch": 0.0779686558696916, + "grad_norm": 0.8521115183830261, + "learning_rate": 0.00023381621469720386, + "loss": 4.8376, + "step": 55490 + }, + { + "epoch": 0.07798270680785517, + "grad_norm": 0.8538252711296082, + "learning_rate": 0.0002338583672895883, + "loss": 4.8655, + "step": 55500 + }, + { + "epoch": 0.07799675774601876, + "grad_norm": 0.8827087879180908, + "learning_rate": 0.00023390051988197272, + "loss": 4.9127, + "step": 55510 + }, + { + "epoch": 0.07801080868418234, + "grad_norm": 0.8596060872077942, + "learning_rate": 0.00023394267247435713, + "loss": 4.7259, + "step": 55520 + }, + { + "epoch": 0.07802485962234591, + "grad_norm": 0.8459311723709106, + "learning_rate": 0.0002339848250667416, + "loss": 4.9033, + "step": 55530 + }, + { + "epoch": 0.07803891056050949, + "grad_norm": 0.8380910754203796, + "learning_rate": 0.00023402697765912602, + "loss": 4.7834, + "step": 55540 + }, + { + "epoch": 0.07805296149867307, + "grad_norm": 0.8815401792526245, + "learning_rate": 0.00023406913025151045, + "loss": 4.8435, + "step": 55550 + }, + { + "epoch": 0.07806701243683664, + "grad_norm": 0.8380628228187561, + "learning_rate": 0.00023411128284389488, + "loss": 4.8127, + "step": 55560 + }, + { + "epoch": 0.07808106337500022, + "grad_norm": 0.8409835696220398, + "learning_rate": 0.0002341534354362793, + "loss": 4.8411, + "step": 55570 + }, + { + "epoch": 0.0780951143131638, + "grad_norm": 
0.8814390897750854, + "learning_rate": 0.00023419558802866372, + "loss": 4.8431, + "step": 55580 + }, + { + "epoch": 0.07810916525132737, + "grad_norm": 0.8419836163520813, + "learning_rate": 0.00023423774062104817, + "loss": 4.8255, + "step": 55590 + }, + { + "epoch": 0.07812321618949095, + "grad_norm": 0.8572084903717041, + "learning_rate": 0.0002342798932134326, + "loss": 4.727, + "step": 55600 + }, + { + "epoch": 0.07813726712765454, + "grad_norm": 0.8860167860984802, + "learning_rate": 0.00023432204580581704, + "loss": 4.9694, + "step": 55610 + }, + { + "epoch": 0.07815131806581811, + "grad_norm": 0.8654281497001648, + "learning_rate": 0.00023436419839820147, + "loss": 4.8643, + "step": 55620 + }, + { + "epoch": 0.07816536900398169, + "grad_norm": 0.848604142665863, + "learning_rate": 0.0002344063509905859, + "loss": 4.9082, + "step": 55630 + }, + { + "epoch": 0.07817941994214526, + "grad_norm": 0.8291561007499695, + "learning_rate": 0.00023444850358297036, + "loss": 4.9041, + "step": 55640 + }, + { + "epoch": 0.07819347088030884, + "grad_norm": 0.8431739807128906, + "learning_rate": 0.00023449065617535476, + "loss": 4.8418, + "step": 55650 + }, + { + "epoch": 0.07820752181847242, + "grad_norm": 0.8272885084152222, + "learning_rate": 0.0002345328087677392, + "loss": 4.7878, + "step": 55660 + }, + { + "epoch": 0.078221572756636, + "grad_norm": 0.8414096236228943, + "learning_rate": 0.00023457496136012362, + "loss": 4.9341, + "step": 55670 + }, + { + "epoch": 0.07823562369479957, + "grad_norm": 0.8919362425804138, + "learning_rate": 0.00023461711395250805, + "loss": 4.9505, + "step": 55680 + }, + { + "epoch": 0.07824967463296315, + "grad_norm": 0.8313416242599487, + "learning_rate": 0.0002346592665448925, + "loss": 4.9542, + "step": 55690 + }, + { + "epoch": 0.07826372557112672, + "grad_norm": 0.902398943901062, + "learning_rate": 0.00023470141913727694, + "loss": 4.8456, + "step": 55700 + }, + { + "epoch": 0.07827777650929031, + "grad_norm": 0.8654336333274841, 
+ "learning_rate": 0.00023474357172966135, + "loss": 4.7756, + "step": 55710 + }, + { + "epoch": 0.07829182744745389, + "grad_norm": 0.848102867603302, + "learning_rate": 0.00023478572432204578, + "loss": 4.9353, + "step": 55720 + }, + { + "epoch": 0.07830587838561746, + "grad_norm": 0.8511942028999329, + "learning_rate": 0.0002348278769144302, + "loss": 4.7061, + "step": 55730 + }, + { + "epoch": 0.07831992932378104, + "grad_norm": 0.8495092988014221, + "learning_rate": 0.00023487002950681464, + "loss": 4.8499, + "step": 55740 + }, + { + "epoch": 0.07833398026194462, + "grad_norm": 0.8534985780715942, + "learning_rate": 0.0002349121820991991, + "loss": 4.7557, + "step": 55750 + }, + { + "epoch": 0.07834803120010819, + "grad_norm": 0.8438852429389954, + "learning_rate": 0.00023495433469158353, + "loss": 4.7647, + "step": 55760 + }, + { + "epoch": 0.07836208213827177, + "grad_norm": 0.8534382581710815, + "learning_rate": 0.00023499648728396793, + "loss": 4.7887, + "step": 55770 + }, + { + "epoch": 0.07837613307643534, + "grad_norm": 0.8537501096725464, + "learning_rate": 0.00023503863987635237, + "loss": 4.8719, + "step": 55780 + }, + { + "epoch": 0.07839018401459892, + "grad_norm": 0.862980306148529, + "learning_rate": 0.0002350807924687368, + "loss": 4.8047, + "step": 55790 + }, + { + "epoch": 0.0784042349527625, + "grad_norm": 0.8612253069877625, + "learning_rate": 0.00023512294506112123, + "loss": 4.8798, + "step": 55800 + }, + { + "epoch": 0.07841828589092607, + "grad_norm": 0.8723065257072449, + "learning_rate": 0.00023516509765350569, + "loss": 4.8036, + "step": 55810 + }, + { + "epoch": 0.07843233682908966, + "grad_norm": 0.8549647331237793, + "learning_rate": 0.00023520725024589012, + "loss": 4.9236, + "step": 55820 + }, + { + "epoch": 0.07844638776725324, + "grad_norm": 0.8528120517730713, + "learning_rate": 0.00023524940283827455, + "loss": 4.8222, + "step": 55830 + }, + { + "epoch": 0.07846043870541682, + "grad_norm": 0.8351045250892639, + 
"learning_rate": 0.00023529155543065895, + "loss": 4.8075, + "step": 55840 + }, + { + "epoch": 0.07847448964358039, + "grad_norm": 0.8468853831291199, + "learning_rate": 0.00023533370802304338, + "loss": 4.7578, + "step": 55850 + }, + { + "epoch": 0.07848854058174397, + "grad_norm": 0.9013736844062805, + "learning_rate": 0.00023537586061542781, + "loss": 4.7043, + "step": 55860 + }, + { + "epoch": 0.07850259151990754, + "grad_norm": 0.8511090874671936, + "learning_rate": 0.00023541801320781227, + "loss": 4.8785, + "step": 55870 + }, + { + "epoch": 0.07851664245807112, + "grad_norm": 0.8878459930419922, + "learning_rate": 0.0002354601658001967, + "loss": 4.8643, + "step": 55880 + }, + { + "epoch": 0.0785306933962347, + "grad_norm": 0.873618483543396, + "learning_rate": 0.00023550231839258114, + "loss": 4.7879, + "step": 55890 + }, + { + "epoch": 0.07854474433439827, + "grad_norm": 0.829479992389679, + "learning_rate": 0.00023554447098496554, + "loss": 4.6622, + "step": 55900 + }, + { + "epoch": 0.07855879527256185, + "grad_norm": 0.8406625986099243, + "learning_rate": 0.00023558662357734997, + "loss": 4.972, + "step": 55910 + }, + { + "epoch": 0.07857284621072544, + "grad_norm": 0.8553104400634766, + "learning_rate": 0.0002356287761697344, + "loss": 4.7771, + "step": 55920 + }, + { + "epoch": 0.07858689714888902, + "grad_norm": 0.8392889499664307, + "learning_rate": 0.00023567092876211886, + "loss": 4.7373, + "step": 55930 + }, + { + "epoch": 0.07860094808705259, + "grad_norm": 0.8430394530296326, + "learning_rate": 0.0002357130813545033, + "loss": 4.9143, + "step": 55940 + }, + { + "epoch": 0.07861499902521617, + "grad_norm": 0.8386268615722656, + "learning_rate": 0.00023575523394688772, + "loss": 4.8825, + "step": 55950 + }, + { + "epoch": 0.07862904996337974, + "grad_norm": 0.8521925210952759, + "learning_rate": 0.00023579738653927215, + "loss": 4.7222, + "step": 55960 + }, + { + "epoch": 0.07864310090154332, + "grad_norm": 0.8255799412727356, + "learning_rate": 
0.00023583953913165656, + "loss": 4.8464, + "step": 55970 + }, + { + "epoch": 0.0786571518397069, + "grad_norm": 0.9146137237548828, + "learning_rate": 0.000235881691724041, + "loss": 4.7756, + "step": 55980 + }, + { + "epoch": 0.07867120277787047, + "grad_norm": 0.9210695624351501, + "learning_rate": 0.00023592384431642545, + "loss": 4.7982, + "step": 55990 + }, + { + "epoch": 0.07868525371603405, + "grad_norm": 0.8495543003082275, + "learning_rate": 0.00023596599690880988, + "loss": 4.6518, + "step": 56000 + }, + { + "epoch": 0.07869930465419762, + "grad_norm": 0.8346798419952393, + "learning_rate": 0.0002360081495011943, + "loss": 4.8465, + "step": 56010 + }, + { + "epoch": 0.07871335559236121, + "grad_norm": 0.8985382914543152, + "learning_rate": 0.00023605030209357874, + "loss": 4.7423, + "step": 56020 + }, + { + "epoch": 0.07872740653052479, + "grad_norm": 0.8527060747146606, + "learning_rate": 0.00023609245468596314, + "loss": 4.9118, + "step": 56030 + }, + { + "epoch": 0.07874145746868837, + "grad_norm": 0.8444516658782959, + "learning_rate": 0.00023613460727834758, + "loss": 4.9117, + "step": 56040 + }, + { + "epoch": 0.07875550840685194, + "grad_norm": 0.879189133644104, + "learning_rate": 0.00023617675987073203, + "loss": 4.6753, + "step": 56050 + }, + { + "epoch": 0.07876955934501552, + "grad_norm": 0.8668951392173767, + "learning_rate": 0.00023621891246311647, + "loss": 4.8826, + "step": 56060 + }, + { + "epoch": 0.0787836102831791, + "grad_norm": 0.8374396562576294, + "learning_rate": 0.0002362610650555009, + "loss": 4.8564, + "step": 56070 + }, + { + "epoch": 0.07879766122134267, + "grad_norm": 0.8434193730354309, + "learning_rate": 0.00023630321764788533, + "loss": 4.85, + "step": 56080 + }, + { + "epoch": 0.07881171215950625, + "grad_norm": 0.8440457582473755, + "learning_rate": 0.00023634537024026976, + "loss": 4.8728, + "step": 56090 + }, + { + "epoch": 0.07882576309766982, + "grad_norm": 0.8732414245605469, + "learning_rate": 
0.00023638752283265416, + "loss": 4.9149, + "step": 56100 + }, + { + "epoch": 0.0788398140358334, + "grad_norm": 0.8345431685447693, + "learning_rate": 0.00023642967542503862, + "loss": 4.9031, + "step": 56110 + }, + { + "epoch": 0.07885386497399698, + "grad_norm": 0.8340375423431396, + "learning_rate": 0.00023647182801742305, + "loss": 4.8705, + "step": 56120 + }, + { + "epoch": 0.07886791591216057, + "grad_norm": 0.8727152943611145, + "learning_rate": 0.00023651398060980748, + "loss": 4.8331, + "step": 56130 + }, + { + "epoch": 0.07888196685032414, + "grad_norm": 0.8571959733963013, + "learning_rate": 0.00023655613320219191, + "loss": 4.8701, + "step": 56140 + }, + { + "epoch": 0.07889601778848772, + "grad_norm": 0.8859211802482605, + "learning_rate": 0.00023659828579457635, + "loss": 4.7979, + "step": 56150 + }, + { + "epoch": 0.0789100687266513, + "grad_norm": 0.8285291790962219, + "learning_rate": 0.00023664043838696075, + "loss": 4.862, + "step": 56160 + }, + { + "epoch": 0.07892411966481487, + "grad_norm": 0.8508478403091431, + "learning_rate": 0.0002366825909793452, + "loss": 4.8707, + "step": 56170 + }, + { + "epoch": 0.07893817060297845, + "grad_norm": 0.8419012427330017, + "learning_rate": 0.00023672474357172964, + "loss": 4.8833, + "step": 56180 + }, + { + "epoch": 0.07895222154114202, + "grad_norm": 0.9277992844581604, + "learning_rate": 0.00023676689616411407, + "loss": 4.8056, + "step": 56190 + }, + { + "epoch": 0.0789662724793056, + "grad_norm": 0.879784882068634, + "learning_rate": 0.0002368090487564985, + "loss": 4.9411, + "step": 56200 + }, + { + "epoch": 0.07898032341746918, + "grad_norm": 0.8691972494125366, + "learning_rate": 0.00023685120134888293, + "loss": 4.7991, + "step": 56210 + }, + { + "epoch": 0.07899437435563275, + "grad_norm": 0.8471894860267639, + "learning_rate": 0.0002368933539412674, + "loss": 4.6985, + "step": 56220 + }, + { + "epoch": 0.07900842529379634, + "grad_norm": 0.8567607402801514, + "learning_rate": 
0.0002369355065336518, + "loss": 4.8068, + "step": 56230 + }, + { + "epoch": 0.07902247623195992, + "grad_norm": 0.8540003895759583, + "learning_rate": 0.00023697765912603623, + "loss": 4.6384, + "step": 56240 + }, + { + "epoch": 0.0790365271701235, + "grad_norm": 0.886030912399292, + "learning_rate": 0.00023701981171842066, + "loss": 4.8265, + "step": 56250 + }, + { + "epoch": 0.07905057810828707, + "grad_norm": 0.9057062268257141, + "learning_rate": 0.0002370619643108051, + "loss": 4.881, + "step": 56260 + }, + { + "epoch": 0.07906462904645065, + "grad_norm": 0.8884521126747131, + "learning_rate": 0.00023710411690318955, + "loss": 4.7785, + "step": 56270 + }, + { + "epoch": 0.07907867998461422, + "grad_norm": 0.8333665132522583, + "learning_rate": 0.00023714626949557398, + "loss": 4.83, + "step": 56280 + }, + { + "epoch": 0.0790927309227778, + "grad_norm": 0.8531104922294617, + "learning_rate": 0.00023718842208795838, + "loss": 4.7392, + "step": 56290 + }, + { + "epoch": 0.07910678186094137, + "grad_norm": 0.853764533996582, + "learning_rate": 0.0002372305746803428, + "loss": 4.8426, + "step": 56300 + }, + { + "epoch": 0.07912083279910495, + "grad_norm": 0.8597157597541809, + "learning_rate": 0.00023727272727272724, + "loss": 4.9028, + "step": 56310 + }, + { + "epoch": 0.07913488373726853, + "grad_norm": 0.8901702165603638, + "learning_rate": 0.00023731487986511168, + "loss": 4.7871, + "step": 56320 + }, + { + "epoch": 0.07914893467543212, + "grad_norm": 0.8476951122283936, + "learning_rate": 0.00023735703245749613, + "loss": 4.7935, + "step": 56330 + }, + { + "epoch": 0.0791629856135957, + "grad_norm": 0.8251456022262573, + "learning_rate": 0.00023739918504988057, + "loss": 4.8088, + "step": 56340 + }, + { + "epoch": 0.07917703655175927, + "grad_norm": 0.8737718462944031, + "learning_rate": 0.000237441337642265, + "loss": 4.8629, + "step": 56350 + }, + { + "epoch": 0.07919108748992285, + "grad_norm": 0.847468912601471, + "learning_rate": 0.0002374834902346494, + 
"loss": 4.8208, + "step": 56360 + }, + { + "epoch": 0.07920513842808642, + "grad_norm": 0.8241137266159058, + "learning_rate": 0.00023752564282703383, + "loss": 4.7132, + "step": 56370 + }, + { + "epoch": 0.07921918936625, + "grad_norm": 0.8659803867340088, + "learning_rate": 0.00023756779541941826, + "loss": 4.7646, + "step": 56380 + }, + { + "epoch": 0.07923324030441357, + "grad_norm": 0.876525342464447, + "learning_rate": 0.00023760994801180272, + "loss": 4.8045, + "step": 56390 + }, + { + "epoch": 0.07924729124257715, + "grad_norm": 0.8721202611923218, + "learning_rate": 0.00023765210060418715, + "loss": 4.8596, + "step": 56400 + }, + { + "epoch": 0.07926134218074073, + "grad_norm": 0.8766302466392517, + "learning_rate": 0.00023769425319657158, + "loss": 4.8604, + "step": 56410 + }, + { + "epoch": 0.0792753931189043, + "grad_norm": 0.8864064812660217, + "learning_rate": 0.000237736405788956, + "loss": 4.7736, + "step": 56420 + }, + { + "epoch": 0.07928944405706788, + "grad_norm": 0.8010143041610718, + "learning_rate": 0.00023777855838134042, + "loss": 4.8483, + "step": 56430 + }, + { + "epoch": 0.07930349499523147, + "grad_norm": 0.8585301637649536, + "learning_rate": 0.00023782071097372485, + "loss": 4.8176, + "step": 56440 + }, + { + "epoch": 0.07931754593339504, + "grad_norm": 0.8217198848724365, + "learning_rate": 0.0002378628635661093, + "loss": 4.8178, + "step": 56450 + }, + { + "epoch": 0.07933159687155862, + "grad_norm": 0.8594200015068054, + "learning_rate": 0.00023790501615849374, + "loss": 4.7902, + "step": 56460 + }, + { + "epoch": 0.0793456478097222, + "grad_norm": 0.8369775414466858, + "learning_rate": 0.00023794716875087817, + "loss": 4.6128, + "step": 56470 + }, + { + "epoch": 0.07935969874788577, + "grad_norm": 0.8443807363510132, + "learning_rate": 0.00023798932134326257, + "loss": 4.9548, + "step": 56480 + }, + { + "epoch": 0.07937374968604935, + "grad_norm": 0.921536386013031, + "learning_rate": 0.000238031473935647, + "loss": 4.9101, + 
"step": 56490 + }, + { + "epoch": 0.07938780062421293, + "grad_norm": 0.842056155204773, + "learning_rate": 0.00023807362652803144, + "loss": 4.825, + "step": 56500 + }, + { + "epoch": 0.0794018515623765, + "grad_norm": 0.8614702224731445, + "learning_rate": 0.0002381157791204159, + "loss": 4.7713, + "step": 56510 + }, + { + "epoch": 0.07941590250054008, + "grad_norm": 0.8810839056968689, + "learning_rate": 0.00023815793171280033, + "loss": 4.9506, + "step": 56520 + }, + { + "epoch": 0.07942995343870365, + "grad_norm": 0.8495205640792847, + "learning_rate": 0.00023820008430518476, + "loss": 4.8596, + "step": 56530 + }, + { + "epoch": 0.07944400437686724, + "grad_norm": 0.829397439956665, + "learning_rate": 0.0002382422368975692, + "loss": 4.7906, + "step": 56540 + }, + { + "epoch": 0.07945805531503082, + "grad_norm": 0.8445591926574707, + "learning_rate": 0.0002382843894899536, + "loss": 4.8183, + "step": 56550 + }, + { + "epoch": 0.0794721062531944, + "grad_norm": 0.8689804673194885, + "learning_rate": 0.00023832654208233802, + "loss": 4.9633, + "step": 56560 + }, + { + "epoch": 0.07948615719135797, + "grad_norm": 0.8556300401687622, + "learning_rate": 0.00023836869467472248, + "loss": 4.8468, + "step": 56570 + }, + { + "epoch": 0.07950020812952155, + "grad_norm": 0.9189597964286804, + "learning_rate": 0.0002384108472671069, + "loss": 4.7865, + "step": 56580 + }, + { + "epoch": 0.07951425906768513, + "grad_norm": 0.8929227590560913, + "learning_rate": 0.00023845299985949134, + "loss": 4.9381, + "step": 56590 + }, + { + "epoch": 0.0795283100058487, + "grad_norm": 0.8549538254737854, + "learning_rate": 0.00023849515245187578, + "loss": 4.8449, + "step": 56600 + }, + { + "epoch": 0.07954236094401228, + "grad_norm": 0.8399584293365479, + "learning_rate": 0.00023853730504426018, + "loss": 4.8057, + "step": 56610 + }, + { + "epoch": 0.07955641188217585, + "grad_norm": 0.8562041521072388, + "learning_rate": 0.0002385794576366446, + "loss": 4.7622, + "step": 56620 + }, + 
{ + "epoch": 0.07957046282033943, + "grad_norm": 0.8549632430076599, + "learning_rate": 0.00023862161022902907, + "loss": 4.7727, + "step": 56630 + }, + { + "epoch": 0.07958451375850302, + "grad_norm": 0.8222755789756775, + "learning_rate": 0.0002386637628214135, + "loss": 4.8657, + "step": 56640 + }, + { + "epoch": 0.0795985646966666, + "grad_norm": 0.8658560514450073, + "learning_rate": 0.00023870591541379793, + "loss": 4.8183, + "step": 56650 + }, + { + "epoch": 0.07961261563483017, + "grad_norm": 0.8258546590805054, + "learning_rate": 0.00023874806800618236, + "loss": 4.8242, + "step": 56660 + }, + { + "epoch": 0.07962666657299375, + "grad_norm": 0.8417488932609558, + "learning_rate": 0.0002387902205985668, + "loss": 4.7378, + "step": 56670 + }, + { + "epoch": 0.07964071751115732, + "grad_norm": 0.8733723759651184, + "learning_rate": 0.0002388323731909512, + "loss": 4.7247, + "step": 56680 + }, + { + "epoch": 0.0796547684493209, + "grad_norm": 0.8495579361915588, + "learning_rate": 0.00023887452578333566, + "loss": 4.8123, + "step": 56690 + }, + { + "epoch": 0.07966881938748448, + "grad_norm": 0.8252209424972534, + "learning_rate": 0.0002389166783757201, + "loss": 4.791, + "step": 56700 + }, + { + "epoch": 0.07968287032564805, + "grad_norm": 0.8873644471168518, + "learning_rate": 0.00023895883096810452, + "loss": 4.7452, + "step": 56710 + }, + { + "epoch": 0.07969692126381163, + "grad_norm": 0.8369097113609314, + "learning_rate": 0.00023900098356048895, + "loss": 4.6592, + "step": 56720 + }, + { + "epoch": 0.0797109722019752, + "grad_norm": 0.9035923480987549, + "learning_rate": 0.00023904313615287338, + "loss": 4.646, + "step": 56730 + }, + { + "epoch": 0.07972502314013878, + "grad_norm": 0.87725430727005, + "learning_rate": 0.00023908528874525778, + "loss": 4.6869, + "step": 56740 + }, + { + "epoch": 0.07973907407830237, + "grad_norm": 0.8985244631767273, + "learning_rate": 0.00023912744133764224, + "loss": 4.7142, + "step": 56750 + }, + { + "epoch": 
0.07975312501646595, + "grad_norm": 0.8276535868644714, + "learning_rate": 0.00023916959393002667, + "loss": 4.8682, + "step": 56760 + }, + { + "epoch": 0.07976717595462952, + "grad_norm": 0.8182491660118103, + "learning_rate": 0.0002392117465224111, + "loss": 4.9084, + "step": 56770 + }, + { + "epoch": 0.0797812268927931, + "grad_norm": 0.8240747451782227, + "learning_rate": 0.00023925389911479554, + "loss": 4.957, + "step": 56780 + }, + { + "epoch": 0.07979527783095668, + "grad_norm": 0.8549389243125916, + "learning_rate": 0.00023929605170717997, + "loss": 4.8275, + "step": 56790 + }, + { + "epoch": 0.07980932876912025, + "grad_norm": 0.8908678293228149, + "learning_rate": 0.00023933820429956443, + "loss": 4.9102, + "step": 56800 + }, + { + "epoch": 0.07982337970728383, + "grad_norm": 0.8527509570121765, + "learning_rate": 0.00023938035689194883, + "loss": 4.9285, + "step": 56810 + }, + { + "epoch": 0.0798374306454474, + "grad_norm": 0.8254045248031616, + "learning_rate": 0.00023942250948433326, + "loss": 4.8112, + "step": 56820 + }, + { + "epoch": 0.07985148158361098, + "grad_norm": 0.850629448890686, + "learning_rate": 0.0002394646620767177, + "loss": 4.8258, + "step": 56830 + }, + { + "epoch": 0.07986553252177456, + "grad_norm": 0.8825765252113342, + "learning_rate": 0.00023950681466910212, + "loss": 4.8338, + "step": 56840 + }, + { + "epoch": 0.07987958345993815, + "grad_norm": 0.8650710582733154, + "learning_rate": 0.00023954896726148658, + "loss": 4.8267, + "step": 56850 + }, + { + "epoch": 0.07989363439810172, + "grad_norm": 0.8440834283828735, + "learning_rate": 0.000239591119853871, + "loss": 4.9396, + "step": 56860 + }, + { + "epoch": 0.0799076853362653, + "grad_norm": 0.8128529191017151, + "learning_rate": 0.00023963327244625542, + "loss": 4.9086, + "step": 56870 + }, + { + "epoch": 0.07992173627442888, + "grad_norm": 0.9077975749969482, + "learning_rate": 0.00023967542503863985, + "loss": 4.7803, + "step": 56880 + }, + { + "epoch": 
0.07993578721259245, + "grad_norm": 0.9005282521247864, + "learning_rate": 0.00023971757763102428, + "loss": 4.8147, + "step": 56890 + }, + { + "epoch": 0.07994983815075603, + "grad_norm": 0.8272696137428284, + "learning_rate": 0.0002397597302234087, + "loss": 4.7913, + "step": 56900 + }, + { + "epoch": 0.0799638890889196, + "grad_norm": 0.8787935972213745, + "learning_rate": 0.00023980188281579317, + "loss": 4.817, + "step": 56910 + }, + { + "epoch": 0.07997794002708318, + "grad_norm": 0.8999579548835754, + "learning_rate": 0.0002398440354081776, + "loss": 4.8105, + "step": 56920 + }, + { + "epoch": 0.07999199096524676, + "grad_norm": 0.8166282773017883, + "learning_rate": 0.00023988618800056203, + "loss": 4.8415, + "step": 56930 + }, + { + "epoch": 0.08000604190341033, + "grad_norm": 0.8887068033218384, + "learning_rate": 0.00023992834059294644, + "loss": 4.8513, + "step": 56940 + }, + { + "epoch": 0.08002009284157392, + "grad_norm": 0.8588322997093201, + "learning_rate": 0.00023997049318533087, + "loss": 4.7216, + "step": 56950 + }, + { + "epoch": 0.0800341437797375, + "grad_norm": 0.8577817678451538, + "learning_rate": 0.0002400126457777153, + "loss": 4.8628, + "step": 56960 + }, + { + "epoch": 0.08004819471790107, + "grad_norm": 0.8334136605262756, + "learning_rate": 0.00024005479837009976, + "loss": 4.8069, + "step": 56970 + }, + { + "epoch": 0.08006224565606465, + "grad_norm": 0.8446940779685974, + "learning_rate": 0.0002400969509624842, + "loss": 4.771, + "step": 56980 + }, + { + "epoch": 0.08007629659422823, + "grad_norm": 0.817052960395813, + "learning_rate": 0.00024013910355486862, + "loss": 4.8963, + "step": 56990 + }, + { + "epoch": 0.0800903475323918, + "grad_norm": 0.8921146392822266, + "learning_rate": 0.00024018125614725302, + "loss": 4.8283, + "step": 57000 + }, + { + "epoch": 0.08010439847055538, + "grad_norm": 0.8406715989112854, + "learning_rate": 0.00024022340873963745, + "loss": 4.759, + "step": 57010 + }, + { + "epoch": 0.08011844940871896, 
+ "grad_norm": 0.8852546811103821, + "learning_rate": 0.00024026556133202188, + "loss": 4.8704, + "step": 57020 + }, + { + "epoch": 0.08013250034688253, + "grad_norm": 0.8223075866699219, + "learning_rate": 0.00024030771392440634, + "loss": 4.7321, + "step": 57030 + }, + { + "epoch": 0.08014655128504611, + "grad_norm": 0.8500157594680786, + "learning_rate": 0.00024034986651679077, + "loss": 4.9298, + "step": 57040 + }, + { + "epoch": 0.08016060222320968, + "grad_norm": 0.8550832867622375, + "learning_rate": 0.0002403920191091752, + "loss": 4.8613, + "step": 57050 + }, + { + "epoch": 0.08017465316137327, + "grad_norm": 0.9191502928733826, + "learning_rate": 0.00024043417170155964, + "loss": 4.6645, + "step": 57060 + }, + { + "epoch": 0.08018870409953685, + "grad_norm": 0.8604241013526917, + "learning_rate": 0.00024047632429394404, + "loss": 4.7982, + "step": 57070 + }, + { + "epoch": 0.08020275503770043, + "grad_norm": 0.8823716640472412, + "learning_rate": 0.00024051847688632847, + "loss": 4.7106, + "step": 57080 + }, + { + "epoch": 0.080216805975864, + "grad_norm": 0.8595419526100159, + "learning_rate": 0.00024056062947871293, + "loss": 4.835, + "step": 57090 + }, + { + "epoch": 0.08023085691402758, + "grad_norm": 0.8353159427642822, + "learning_rate": 0.00024060278207109736, + "loss": 4.7358, + "step": 57100 + }, + { + "epoch": 0.08024490785219116, + "grad_norm": 0.8417918682098389, + "learning_rate": 0.0002406449346634818, + "loss": 4.9533, + "step": 57110 + }, + { + "epoch": 0.08025895879035473, + "grad_norm": 0.8293863534927368, + "learning_rate": 0.00024068708725586622, + "loss": 4.9028, + "step": 57120 + }, + { + "epoch": 0.08027300972851831, + "grad_norm": 0.8590785264968872, + "learning_rate": 0.00024072923984825063, + "loss": 4.8322, + "step": 57130 + }, + { + "epoch": 0.08028706066668188, + "grad_norm": 0.8248816728591919, + "learning_rate": 0.00024077139244063506, + "loss": 4.7921, + "step": 57140 + }, + { + "epoch": 0.08030111160484546, + "grad_norm": 
0.8333460688591003, + "learning_rate": 0.00024081354503301952, + "loss": 4.9145, + "step": 57150 + }, + { + "epoch": 0.08031516254300905, + "grad_norm": 0.8591793179512024, + "learning_rate": 0.00024085569762540395, + "loss": 4.8857, + "step": 57160 + }, + { + "epoch": 0.08032921348117263, + "grad_norm": 0.8233816623687744, + "learning_rate": 0.00024089785021778838, + "loss": 4.8737, + "step": 57170 + }, + { + "epoch": 0.0803432644193362, + "grad_norm": 0.8355622887611389, + "learning_rate": 0.0002409400028101728, + "loss": 4.7534, + "step": 57180 + }, + { + "epoch": 0.08035731535749978, + "grad_norm": 0.8862990140914917, + "learning_rate": 0.00024098215540255721, + "loss": 4.952, + "step": 57190 + }, + { + "epoch": 0.08037136629566335, + "grad_norm": 0.8540744781494141, + "learning_rate": 0.00024102430799494165, + "loss": 4.8611, + "step": 57200 + }, + { + "epoch": 0.08038541723382693, + "grad_norm": 0.8337244987487793, + "learning_rate": 0.0002410664605873261, + "loss": 4.8597, + "step": 57210 + }, + { + "epoch": 0.0803994681719905, + "grad_norm": 0.859093189239502, + "learning_rate": 0.00024110861317971053, + "loss": 4.7506, + "step": 57220 + }, + { + "epoch": 0.08041351911015408, + "grad_norm": 0.8373620510101318, + "learning_rate": 0.00024115076577209497, + "loss": 4.7512, + "step": 57230 + }, + { + "epoch": 0.08042757004831766, + "grad_norm": 0.9458513855934143, + "learning_rate": 0.0002411929183644794, + "loss": 4.849, + "step": 57240 + }, + { + "epoch": 0.08044162098648124, + "grad_norm": 0.830441415309906, + "learning_rate": 0.00024123507095686383, + "loss": 4.8426, + "step": 57250 + }, + { + "epoch": 0.08045567192464483, + "grad_norm": 0.8454274535179138, + "learning_rate": 0.00024127722354924823, + "loss": 4.931, + "step": 57260 + }, + { + "epoch": 0.0804697228628084, + "grad_norm": 0.8949636816978455, + "learning_rate": 0.0002413193761416327, + "loss": 4.8015, + "step": 57270 + }, + { + "epoch": 0.08048377380097198, + "grad_norm": 0.8301366567611694, + 
"learning_rate": 0.00024136152873401712, + "loss": 4.7237, + "step": 57280 + }, + { + "epoch": 0.08049782473913555, + "grad_norm": 0.8346551060676575, + "learning_rate": 0.00024140368132640155, + "loss": 4.8849, + "step": 57290 + }, + { + "epoch": 0.08051187567729913, + "grad_norm": 0.8517528772354126, + "learning_rate": 0.00024144583391878598, + "loss": 4.9193, + "step": 57300 + }, + { + "epoch": 0.0805259266154627, + "grad_norm": 0.8681390285491943, + "learning_rate": 0.00024148798651117042, + "loss": 4.8459, + "step": 57310 + }, + { + "epoch": 0.08053997755362628, + "grad_norm": 0.8334530591964722, + "learning_rate": 0.00024153013910355482, + "loss": 4.7559, + "step": 57320 + }, + { + "epoch": 0.08055402849178986, + "grad_norm": 0.844562828540802, + "learning_rate": 0.00024157229169593928, + "loss": 4.7291, + "step": 57330 + }, + { + "epoch": 0.08056807942995343, + "grad_norm": 0.8150333166122437, + "learning_rate": 0.0002416144442883237, + "loss": 4.8139, + "step": 57340 + }, + { + "epoch": 0.08058213036811701, + "grad_norm": 0.8444922566413879, + "learning_rate": 0.00024165659688070814, + "loss": 4.8298, + "step": 57350 + }, + { + "epoch": 0.08059618130628059, + "grad_norm": 0.832202672958374, + "learning_rate": 0.00024169874947309257, + "loss": 4.6788, + "step": 57360 + }, + { + "epoch": 0.08061023224444418, + "grad_norm": 0.819246232509613, + "learning_rate": 0.000241740902065477, + "loss": 4.8065, + "step": 57370 + }, + { + "epoch": 0.08062428318260775, + "grad_norm": 0.8348022103309631, + "learning_rate": 0.00024178305465786146, + "loss": 4.8509, + "step": 57380 + }, + { + "epoch": 0.08063833412077133, + "grad_norm": 0.85079425573349, + "learning_rate": 0.00024182520725024586, + "loss": 4.7028, + "step": 57390 + }, + { + "epoch": 0.0806523850589349, + "grad_norm": 0.8373984098434448, + "learning_rate": 0.0002418673598426303, + "loss": 4.7505, + "step": 57400 + }, + { + "epoch": 0.08066643599709848, + "grad_norm": 0.8485515713691711, + "learning_rate": 
0.00024190951243501473, + "loss": 4.8502, + "step": 57410 + }, + { + "epoch": 0.08068048693526206, + "grad_norm": 0.8618393540382385, + "learning_rate": 0.00024195166502739916, + "loss": 4.8078, + "step": 57420 + }, + { + "epoch": 0.08069453787342563, + "grad_norm": 0.8273009657859802, + "learning_rate": 0.00024199381761978362, + "loss": 4.8212, + "step": 57430 + }, + { + "epoch": 0.08070858881158921, + "grad_norm": 0.8302605748176575, + "learning_rate": 0.00024203597021216805, + "loss": 4.6817, + "step": 57440 + }, + { + "epoch": 0.08072263974975279, + "grad_norm": 0.8489684462547302, + "learning_rate": 0.00024207812280455245, + "loss": 4.79, + "step": 57450 + }, + { + "epoch": 0.08073669068791636, + "grad_norm": 0.842799186706543, + "learning_rate": 0.00024212027539693688, + "loss": 4.8192, + "step": 57460 + }, + { + "epoch": 0.08075074162607995, + "grad_norm": 0.8465150594711304, + "learning_rate": 0.00024216242798932131, + "loss": 4.8832, + "step": 57470 + }, + { + "epoch": 0.08076479256424353, + "grad_norm": 0.8409611582756042, + "learning_rate": 0.00024220458058170575, + "loss": 4.7202, + "step": 57480 + }, + { + "epoch": 0.0807788435024071, + "grad_norm": 0.8487250804901123, + "learning_rate": 0.0002422467331740902, + "loss": 4.748, + "step": 57490 + }, + { + "epoch": 0.08079289444057068, + "grad_norm": 0.845812976360321, + "learning_rate": 0.00024228888576647463, + "loss": 4.7638, + "step": 57500 + }, + { + "epoch": 0.08080694537873426, + "grad_norm": 0.8647302985191345, + "learning_rate": 0.00024233103835885907, + "loss": 4.8452, + "step": 57510 + }, + { + "epoch": 0.08082099631689783, + "grad_norm": 0.8373607993125916, + "learning_rate": 0.00024237319095124347, + "loss": 4.8014, + "step": 57520 + }, + { + "epoch": 0.08083504725506141, + "grad_norm": 0.8152607679367065, + "learning_rate": 0.0002424153435436279, + "loss": 4.8544, + "step": 57530 + }, + { + "epoch": 0.08084909819322499, + "grad_norm": 0.8385706543922424, + "learning_rate": 
0.00024245749613601233, + "loss": 4.8766, + "step": 57540 + }, + { + "epoch": 0.08086314913138856, + "grad_norm": 0.8651167154312134, + "learning_rate": 0.0002424996487283968, + "loss": 4.8027, + "step": 57550 + }, + { + "epoch": 0.08087720006955214, + "grad_norm": 0.837224543094635, + "learning_rate": 0.00024254180132078122, + "loss": 4.7422, + "step": 57560 + }, + { + "epoch": 0.08089125100771573, + "grad_norm": 0.8273807168006897, + "learning_rate": 0.00024258395391316565, + "loss": 4.7841, + "step": 57570 + }, + { + "epoch": 0.0809053019458793, + "grad_norm": 0.8563796281814575, + "learning_rate": 0.00024262610650555006, + "loss": 4.825, + "step": 57580 + }, + { + "epoch": 0.08091935288404288, + "grad_norm": 0.8224353790283203, + "learning_rate": 0.0002426682590979345, + "loss": 4.8982, + "step": 57590 + }, + { + "epoch": 0.08093340382220646, + "grad_norm": 0.8756199479103088, + "learning_rate": 0.00024271041169031892, + "loss": 4.7504, + "step": 57600 + }, + { + "epoch": 0.08094745476037003, + "grad_norm": 0.8563529253005981, + "learning_rate": 0.00024275256428270338, + "loss": 4.8682, + "step": 57610 + }, + { + "epoch": 0.08096150569853361, + "grad_norm": 0.8547507524490356, + "learning_rate": 0.0002427947168750878, + "loss": 4.8426, + "step": 57620 + }, + { + "epoch": 0.08097555663669719, + "grad_norm": 0.8033429980278015, + "learning_rate": 0.00024283686946747224, + "loss": 4.8511, + "step": 57630 + }, + { + "epoch": 0.08098960757486076, + "grad_norm": 0.8401547074317932, + "learning_rate": 0.00024287902205985667, + "loss": 4.7927, + "step": 57640 + }, + { + "epoch": 0.08100365851302434, + "grad_norm": 0.8384739756584167, + "learning_rate": 0.00024292117465224108, + "loss": 4.8009, + "step": 57650 + }, + { + "epoch": 0.08101770945118791, + "grad_norm": 0.840840220451355, + "learning_rate": 0.0002429633272446255, + "loss": 4.7733, + "step": 57660 + }, + { + "epoch": 0.08103176038935149, + "grad_norm": 0.8522785902023315, + "learning_rate": 
0.00024300547983700996, + "loss": 4.7992, + "step": 57670 + }, + { + "epoch": 0.08104581132751508, + "grad_norm": 0.8566286563873291, + "learning_rate": 0.0002430476324293944, + "loss": 4.8891, + "step": 57680 + }, + { + "epoch": 0.08105986226567866, + "grad_norm": 0.8864015340805054, + "learning_rate": 0.00024308978502177883, + "loss": 4.8248, + "step": 57690 + }, + { + "epoch": 0.08107391320384223, + "grad_norm": 0.8461414575576782, + "learning_rate": 0.00024313193761416326, + "loss": 4.8703, + "step": 57700 + }, + { + "epoch": 0.08108796414200581, + "grad_norm": 0.8322480916976929, + "learning_rate": 0.00024317409020654766, + "loss": 4.872, + "step": 57710 + }, + { + "epoch": 0.08110201508016938, + "grad_norm": 0.8064298033714294, + "learning_rate": 0.0002432162427989321, + "loss": 4.9469, + "step": 57720 + }, + { + "epoch": 0.08111606601833296, + "grad_norm": 0.8545445799827576, + "learning_rate": 0.00024325839539131655, + "loss": 4.8237, + "step": 57730 + }, + { + "epoch": 0.08113011695649654, + "grad_norm": 0.8561162948608398, + "learning_rate": 0.00024330054798370098, + "loss": 4.8889, + "step": 57740 + }, + { + "epoch": 0.08114416789466011, + "grad_norm": 0.862308919429779, + "learning_rate": 0.00024334270057608541, + "loss": 4.6817, + "step": 57750 + }, + { + "epoch": 0.08115821883282369, + "grad_norm": 0.832358717918396, + "learning_rate": 0.00024338485316846985, + "loss": 4.819, + "step": 57760 + }, + { + "epoch": 0.08117226977098727, + "grad_norm": 0.8506022095680237, + "learning_rate": 0.00024342700576085428, + "loss": 4.7971, + "step": 57770 + }, + { + "epoch": 0.08118632070915086, + "grad_norm": 0.8205908536911011, + "learning_rate": 0.00024346915835323868, + "loss": 4.7174, + "step": 57780 + }, + { + "epoch": 0.08120037164731443, + "grad_norm": 0.843183159828186, + "learning_rate": 0.00024351131094562314, + "loss": 4.8403, + "step": 57790 + }, + { + "epoch": 0.08121442258547801, + "grad_norm": 0.8414803147315979, + "learning_rate": 
0.00024355346353800757, + "loss": 4.7169, + "step": 57800 + }, + { + "epoch": 0.08122847352364158, + "grad_norm": 0.8241091370582581, + "learning_rate": 0.000243595616130392, + "loss": 4.7469, + "step": 57810 + }, + { + "epoch": 0.08124252446180516, + "grad_norm": 0.8284342288970947, + "learning_rate": 0.00024363776872277643, + "loss": 4.8104, + "step": 57820 + }, + { + "epoch": 0.08125657539996874, + "grad_norm": 0.8515377044677734, + "learning_rate": 0.00024367992131516086, + "loss": 4.7973, + "step": 57830 + }, + { + "epoch": 0.08127062633813231, + "grad_norm": 0.8532301187515259, + "learning_rate": 0.00024372207390754527, + "loss": 4.7666, + "step": 57840 + }, + { + "epoch": 0.08128467727629589, + "grad_norm": 0.8152916431427002, + "learning_rate": 0.00024376422649992973, + "loss": 4.797, + "step": 57850 + }, + { + "epoch": 0.08129872821445946, + "grad_norm": 0.834984540939331, + "learning_rate": 0.00024380637909231416, + "loss": 4.7428, + "step": 57860 + }, + { + "epoch": 0.08131277915262304, + "grad_norm": 0.8661616444587708, + "learning_rate": 0.0002438485316846986, + "loss": 4.8754, + "step": 57870 + }, + { + "epoch": 0.08132683009078663, + "grad_norm": 0.843447744846344, + "learning_rate": 0.00024389068427708302, + "loss": 5.0731, + "step": 57880 + }, + { + "epoch": 0.08134088102895021, + "grad_norm": 0.8291600346565247, + "learning_rate": 0.00024393283686946745, + "loss": 4.8219, + "step": 57890 + }, + { + "epoch": 0.08135493196711378, + "grad_norm": 0.8383052349090576, + "learning_rate": 0.00024397077420261343, + "loss": 4.7993, + "step": 57900 + }, + { + "epoch": 0.08136898290527736, + "grad_norm": 0.8022859692573547, + "learning_rate": 0.00024401292679499786, + "loss": 4.8967, + "step": 57910 + }, + { + "epoch": 0.08138303384344094, + "grad_norm": 0.8225985169410706, + "learning_rate": 0.00024405507938738232, + "loss": 4.7647, + "step": 57920 + }, + { + "epoch": 0.08139708478160451, + "grad_norm": 0.8517218828201294, + "learning_rate": 
0.00024409723197976675, + "loss": 4.7826, + "step": 57930 + }, + { + "epoch": 0.08141113571976809, + "grad_norm": 0.8240352869033813, + "learning_rate": 0.00024413938457215116, + "loss": 4.7381, + "step": 57940 + }, + { + "epoch": 0.08142518665793166, + "grad_norm": 0.8103241324424744, + "learning_rate": 0.0002441815371645356, + "loss": 4.7835, + "step": 57950 + }, + { + "epoch": 0.08143923759609524, + "grad_norm": 0.8353750705718994, + "learning_rate": 0.00024422368975692005, + "loss": 4.8407, + "step": 57960 + }, + { + "epoch": 0.08145328853425882, + "grad_norm": 0.8388403058052063, + "learning_rate": 0.0002442658423493045, + "loss": 4.8119, + "step": 57970 + }, + { + "epoch": 0.08146733947242239, + "grad_norm": 0.8061336278915405, + "learning_rate": 0.0002443079949416889, + "loss": 4.7948, + "step": 57980 + }, + { + "epoch": 0.08148139041058598, + "grad_norm": 0.9352821707725525, + "learning_rate": 0.00024435014753407334, + "loss": 4.8472, + "step": 57990 + }, + { + "epoch": 0.08149544134874956, + "grad_norm": 0.8587878346443176, + "learning_rate": 0.0002443923001264577, + "loss": 4.8216, + "step": 58000 + }, + { + "epoch": 0.08150949228691313, + "grad_norm": 0.8197136521339417, + "learning_rate": 0.0002444344527188422, + "loss": 4.7632, + "step": 58010 + }, + { + "epoch": 0.08152354322507671, + "grad_norm": 0.8324862718582153, + "learning_rate": 0.00024447660531122663, + "loss": 4.8624, + "step": 58020 + }, + { + "epoch": 0.08153759416324029, + "grad_norm": 0.8072025179862976, + "learning_rate": 0.00024451875790361107, + "loss": 4.8684, + "step": 58030 + }, + { + "epoch": 0.08155164510140386, + "grad_norm": 0.8620959520339966, + "learning_rate": 0.0002445609104959955, + "loss": 4.8949, + "step": 58040 + }, + { + "epoch": 0.08156569603956744, + "grad_norm": 0.8675084114074707, + "learning_rate": 0.00024460306308837993, + "loss": 4.7576, + "step": 58050 + }, + { + "epoch": 0.08157974697773102, + "grad_norm": 0.8540964126586914, + "learning_rate": 
0.00024464521568076436, + "loss": 4.7649, + "step": 58060 + }, + { + "epoch": 0.08159379791589459, + "grad_norm": 0.8450266122817993, + "learning_rate": 0.0002446873682731488, + "loss": 4.8923, + "step": 58070 + }, + { + "epoch": 0.08160784885405817, + "grad_norm": 0.8445958495140076, + "learning_rate": 0.0002447295208655332, + "loss": 4.7243, + "step": 58080 + }, + { + "epoch": 0.08162189979222176, + "grad_norm": 0.823774516582489, + "learning_rate": 0.00024477167345791765, + "loss": 4.7552, + "step": 58090 + }, + { + "epoch": 0.08163595073038533, + "grad_norm": 0.8373647928237915, + "learning_rate": 0.0002448138260503021, + "loss": 4.7746, + "step": 58100 + }, + { + "epoch": 0.08165000166854891, + "grad_norm": 0.8526290655136108, + "learning_rate": 0.0002448559786426865, + "loss": 4.7328, + "step": 58110 + }, + { + "epoch": 0.08166405260671249, + "grad_norm": 0.8044569492340088, + "learning_rate": 0.00024489813123507095, + "loss": 4.8296, + "step": 58120 + }, + { + "epoch": 0.08167810354487606, + "grad_norm": 0.8177915811538696, + "learning_rate": 0.0002449402838274554, + "loss": 4.8325, + "step": 58130 + }, + { + "epoch": 0.08169215448303964, + "grad_norm": 0.8297937512397766, + "learning_rate": 0.0002449824364198398, + "loss": 4.8649, + "step": 58140 + }, + { + "epoch": 0.08170620542120322, + "grad_norm": 0.8474819660186768, + "learning_rate": 0.00024502458901222424, + "loss": 4.7662, + "step": 58150 + }, + { + "epoch": 0.08172025635936679, + "grad_norm": 0.876619815826416, + "learning_rate": 0.00024506674160460867, + "loss": 4.8893, + "step": 58160 + }, + { + "epoch": 0.08173430729753037, + "grad_norm": 0.8410592675209045, + "learning_rate": 0.0002451088941969931, + "loss": 4.7708, + "step": 58170 + }, + { + "epoch": 0.08174835823569394, + "grad_norm": 0.8169021010398865, + "learning_rate": 0.00024515104678937753, + "loss": 4.892, + "step": 58180 + }, + { + "epoch": 0.08176240917385753, + "grad_norm": 0.8390880823135376, + "learning_rate": 
0.00024519319938176196, + "loss": 4.8694, + "step": 58190 + }, + { + "epoch": 0.08177646011202111, + "grad_norm": 0.8502917885780334, + "learning_rate": 0.0002452353519741464, + "loss": 4.7237, + "step": 58200 + }, + { + "epoch": 0.08179051105018469, + "grad_norm": 0.8233117461204529, + "learning_rate": 0.0002452775045665308, + "loss": 4.7619, + "step": 58210 + }, + { + "epoch": 0.08180456198834826, + "grad_norm": 0.8351155519485474, + "learning_rate": 0.00024531965715891526, + "loss": 4.8656, + "step": 58220 + }, + { + "epoch": 0.08181861292651184, + "grad_norm": 0.8331232666969299, + "learning_rate": 0.0002453618097512997, + "loss": 4.7766, + "step": 58230 + }, + { + "epoch": 0.08183266386467541, + "grad_norm": 0.8282060623168945, + "learning_rate": 0.0002454039623436841, + "loss": 4.7699, + "step": 58240 + }, + { + "epoch": 0.08184671480283899, + "grad_norm": 0.8700007200241089, + "learning_rate": 0.00024544611493606855, + "loss": 4.7489, + "step": 58250 + }, + { + "epoch": 0.08186076574100257, + "grad_norm": 0.8399953842163086, + "learning_rate": 0.000245488267528453, + "loss": 4.7302, + "step": 58260 + }, + { + "epoch": 0.08187481667916614, + "grad_norm": 0.8028297424316406, + "learning_rate": 0.0002455304201208374, + "loss": 4.9267, + "step": 58270 + }, + { + "epoch": 0.08188886761732972, + "grad_norm": 0.8101634979248047, + "learning_rate": 0.00024557257271322184, + "loss": 4.8224, + "step": 58280 + }, + { + "epoch": 0.0819029185554933, + "grad_norm": 0.8356708884239197, + "learning_rate": 0.0002456147253056063, + "loss": 4.7565, + "step": 58290 + }, + { + "epoch": 0.08191696949365689, + "grad_norm": 0.8344118595123291, + "learning_rate": 0.0002456568778979907, + "loss": 4.812, + "step": 58300 + }, + { + "epoch": 0.08193102043182046, + "grad_norm": 0.8281126022338867, + "learning_rate": 0.00024569903049037514, + "loss": 4.8933, + "step": 58310 + }, + { + "epoch": 0.08194507136998404, + "grad_norm": 0.8315061926841736, + "learning_rate": 
0.00024574118308275957, + "loss": 4.6364, + "step": 58320 + }, + { + "epoch": 0.08195912230814761, + "grad_norm": 0.8570355772972107, + "learning_rate": 0.000245783335675144, + "loss": 4.8018, + "step": 58330 + }, + { + "epoch": 0.08197317324631119, + "grad_norm": 0.8422298431396484, + "learning_rate": 0.00024582548826752843, + "loss": 4.7245, + "step": 58340 + }, + { + "epoch": 0.08198722418447477, + "grad_norm": 0.8204920887947083, + "learning_rate": 0.00024586764085991286, + "loss": 4.7966, + "step": 58350 + }, + { + "epoch": 0.08200127512263834, + "grad_norm": 0.8395097851753235, + "learning_rate": 0.0002459097934522973, + "loss": 4.7934, + "step": 58360 + }, + { + "epoch": 0.08201532606080192, + "grad_norm": 0.8092932105064392, + "learning_rate": 0.0002459519460446817, + "loss": 4.812, + "step": 58370 + }, + { + "epoch": 0.0820293769989655, + "grad_norm": 0.818016529083252, + "learning_rate": 0.00024599409863706616, + "loss": 4.8381, + "step": 58380 + }, + { + "epoch": 0.08204342793712907, + "grad_norm": 0.866174042224884, + "learning_rate": 0.0002460362512294506, + "loss": 4.8195, + "step": 58390 + }, + { + "epoch": 0.08205747887529266, + "grad_norm": 0.8038418292999268, + "learning_rate": 0.000246078403821835, + "loss": 4.8522, + "step": 58400 + }, + { + "epoch": 0.08207152981345624, + "grad_norm": 0.8310818672180176, + "learning_rate": 0.00024612055641421945, + "loss": 4.8415, + "step": 58410 + }, + { + "epoch": 0.08208558075161981, + "grad_norm": 0.8277154564857483, + "learning_rate": 0.0002461627090066039, + "loss": 4.738, + "step": 58420 + }, + { + "epoch": 0.08209963168978339, + "grad_norm": 0.8364769220352173, + "learning_rate": 0.0002462048615989883, + "loss": 4.8246, + "step": 58430 + }, + { + "epoch": 0.08211368262794697, + "grad_norm": 0.8868595361709595, + "learning_rate": 0.00024624701419137274, + "loss": 4.8204, + "step": 58440 + }, + { + "epoch": 0.08212773356611054, + "grad_norm": 0.8325658440589905, + "learning_rate": 0.0002462891667837572, + 
"loss": 4.7931, + "step": 58450 + }, + { + "epoch": 0.08214178450427412, + "grad_norm": 0.8351956605911255, + "learning_rate": 0.0002463313193761416, + "loss": 4.8991, + "step": 58460 + }, + { + "epoch": 0.0821558354424377, + "grad_norm": 0.8221379518508911, + "learning_rate": 0.00024637347196852604, + "loss": 4.867, + "step": 58470 + }, + { + "epoch": 0.08216988638060127, + "grad_norm": 0.827820897102356, + "learning_rate": 0.00024641562456091047, + "loss": 4.793, + "step": 58480 + }, + { + "epoch": 0.08218393731876485, + "grad_norm": 0.902683436870575, + "learning_rate": 0.0002464577771532949, + "loss": 4.7763, + "step": 58490 + }, + { + "epoch": 0.08219798825692844, + "grad_norm": 0.8596385717391968, + "learning_rate": 0.00024649992974567933, + "loss": 4.7973, + "step": 58500 + }, + { + "epoch": 0.08221203919509201, + "grad_norm": 0.8017410635948181, + "learning_rate": 0.00024654208233806376, + "loss": 4.8217, + "step": 58510 + }, + { + "epoch": 0.08222609013325559, + "grad_norm": 0.8284733891487122, + "learning_rate": 0.0002465842349304482, + "loss": 4.8301, + "step": 58520 + }, + { + "epoch": 0.08224014107141916, + "grad_norm": 0.8326720595359802, + "learning_rate": 0.0002466263875228326, + "loss": 4.7582, + "step": 58530 + }, + { + "epoch": 0.08225419200958274, + "grad_norm": 0.8211853504180908, + "learning_rate": 0.00024666854011521705, + "loss": 4.8276, + "step": 58540 + }, + { + "epoch": 0.08226824294774632, + "grad_norm": 0.8203056454658508, + "learning_rate": 0.0002467106927076015, + "loss": 4.8199, + "step": 58550 + }, + { + "epoch": 0.0822822938859099, + "grad_norm": 0.8489205837249756, + "learning_rate": 0.0002467528452999859, + "loss": 4.8442, + "step": 58560 + }, + { + "epoch": 0.08229634482407347, + "grad_norm": 0.8325367569923401, + "learning_rate": 0.00024679499789237035, + "loss": 4.8388, + "step": 58570 + }, + { + "epoch": 0.08231039576223705, + "grad_norm": 0.8521776795387268, + "learning_rate": 0.0002468371504847548, + "loss": 4.9109, + 
"step": 58580 + }, + { + "epoch": 0.08232444670040062, + "grad_norm": 0.8650569915771484, + "learning_rate": 0.0002468793030771392, + "loss": 4.8815, + "step": 58590 + }, + { + "epoch": 0.0823384976385642, + "grad_norm": 0.8365474343299866, + "learning_rate": 0.00024692145566952364, + "loss": 4.8285, + "step": 58600 + }, + { + "epoch": 0.08235254857672779, + "grad_norm": 0.8498002886772156, + "learning_rate": 0.00024696360826190807, + "loss": 4.8788, + "step": 58610 + }, + { + "epoch": 0.08236659951489136, + "grad_norm": 0.8518571257591248, + "learning_rate": 0.0002470057608542925, + "loss": 4.7712, + "step": 58620 + }, + { + "epoch": 0.08238065045305494, + "grad_norm": 0.8263545632362366, + "learning_rate": 0.00024704791344667694, + "loss": 4.9409, + "step": 58630 + }, + { + "epoch": 0.08239470139121852, + "grad_norm": 0.8218898177146912, + "learning_rate": 0.0002470900660390614, + "loss": 4.6849, + "step": 58640 + }, + { + "epoch": 0.08240875232938209, + "grad_norm": 0.8200307488441467, + "learning_rate": 0.0002471322186314458, + "loss": 4.7979, + "step": 58650 + }, + { + "epoch": 0.08242280326754567, + "grad_norm": 0.8574158549308777, + "learning_rate": 0.00024717437122383023, + "loss": 4.8704, + "step": 58660 + }, + { + "epoch": 0.08243685420570925, + "grad_norm": 0.8805510401725769, + "learning_rate": 0.00024721652381621466, + "loss": 4.8024, + "step": 58670 + }, + { + "epoch": 0.08245090514387282, + "grad_norm": 0.8428569436073303, + "learning_rate": 0.0002472586764085991, + "loss": 4.7948, + "step": 58680 + }, + { + "epoch": 0.0824649560820364, + "grad_norm": 0.8235595226287842, + "learning_rate": 0.0002473008290009835, + "loss": 4.709, + "step": 58690 + }, + { + "epoch": 0.08247900702019997, + "grad_norm": 0.8152335286140442, + "learning_rate": 0.000247342981593368, + "loss": 4.8758, + "step": 58700 + }, + { + "epoch": 0.08249305795836356, + "grad_norm": 0.8103764653205872, + "learning_rate": 0.0002473851341857524, + "loss": 4.7552, + "step": 58710 + }, + { 
+ "epoch": 0.08250710889652714, + "grad_norm": 0.8165539503097534, + "learning_rate": 0.0002474272867781368, + "loss": 4.8179, + "step": 58720 + }, + { + "epoch": 0.08252115983469072, + "grad_norm": 0.851290225982666, + "learning_rate": 0.00024746943937052125, + "loss": 4.9018, + "step": 58730 + }, + { + "epoch": 0.08253521077285429, + "grad_norm": 0.8036884665489197, + "learning_rate": 0.0002475115919629057, + "loss": 4.8221, + "step": 58740 + }, + { + "epoch": 0.08254926171101787, + "grad_norm": 0.8429344296455383, + "learning_rate": 0.0002475537445552901, + "loss": 4.8122, + "step": 58750 + }, + { + "epoch": 0.08256331264918144, + "grad_norm": 0.8305610418319702, + "learning_rate": 0.0002475958971476746, + "loss": 4.7718, + "step": 58760 + }, + { + "epoch": 0.08257736358734502, + "grad_norm": 0.8201099038124084, + "learning_rate": 0.000247638049740059, + "loss": 4.9241, + "step": 58770 + }, + { + "epoch": 0.0825914145255086, + "grad_norm": 0.8135986328125, + "learning_rate": 0.0002476802023324434, + "loss": 4.8023, + "step": 58780 + }, + { + "epoch": 0.08260546546367217, + "grad_norm": 0.8189361095428467, + "learning_rate": 0.00024772235492482783, + "loss": 4.8384, + "step": 58790 + }, + { + "epoch": 0.08261951640183575, + "grad_norm": 0.8032187223434448, + "learning_rate": 0.00024776450751721226, + "loss": 4.8052, + "step": 58800 + }, + { + "epoch": 0.08263356733999934, + "grad_norm": 0.8336382508277893, + "learning_rate": 0.0002478066601095967, + "loss": 4.755, + "step": 58810 + }, + { + "epoch": 0.08264761827816292, + "grad_norm": 0.8608211278915405, + "learning_rate": 0.0002478488127019812, + "loss": 4.9017, + "step": 58820 + }, + { + "epoch": 0.08266166921632649, + "grad_norm": 0.8575011491775513, + "learning_rate": 0.0002478909652943656, + "loss": 4.7826, + "step": 58830 + }, + { + "epoch": 0.08267572015449007, + "grad_norm": 0.8528128266334534, + "learning_rate": 0.00024793311788675, + "loss": 4.8539, + "step": 58840 + }, + { + "epoch": 
0.08268977109265364, + "grad_norm": 0.8594315052032471, + "learning_rate": 0.0002479752704791344, + "loss": 4.7169, + "step": 58850 + }, + { + "epoch": 0.08270382203081722, + "grad_norm": 0.8435611128807068, + "learning_rate": 0.00024801742307151885, + "loss": 4.7116, + "step": 58860 + }, + { + "epoch": 0.0827178729689808, + "grad_norm": 0.8331279158592224, + "learning_rate": 0.0002480595756639033, + "loss": 4.933, + "step": 58870 + }, + { + "epoch": 0.08273192390714437, + "grad_norm": 0.8534746766090393, + "learning_rate": 0.00024810172825628777, + "loss": 4.8714, + "step": 58880 + }, + { + "epoch": 0.08274597484530795, + "grad_norm": 0.8149212002754211, + "learning_rate": 0.0002481438808486722, + "loss": 4.917, + "step": 58890 + }, + { + "epoch": 0.08276002578347152, + "grad_norm": 0.8224017024040222, + "learning_rate": 0.00024818603344105663, + "loss": 4.7634, + "step": 58900 + }, + { + "epoch": 0.0827740767216351, + "grad_norm": 0.8118835687637329, + "learning_rate": 0.000248228186033441, + "loss": 4.782, + "step": 58910 + }, + { + "epoch": 0.08278812765979869, + "grad_norm": 0.8306914567947388, + "learning_rate": 0.00024827033862582544, + "loss": 4.8674, + "step": 58920 + }, + { + "epoch": 0.08280217859796227, + "grad_norm": 0.8186638355255127, + "learning_rate": 0.00024831249121820987, + "loss": 4.8705, + "step": 58930 + }, + { + "epoch": 0.08281622953612584, + "grad_norm": 0.8312268257141113, + "learning_rate": 0.00024835464381059436, + "loss": 4.8295, + "step": 58940 + }, + { + "epoch": 0.08283028047428942, + "grad_norm": 0.8370024561882019, + "learning_rate": 0.0002483967964029788, + "loss": 4.8434, + "step": 58950 + }, + { + "epoch": 0.082844331412453, + "grad_norm": 0.8207914233207703, + "learning_rate": 0.0002484389489953632, + "loss": 4.7712, + "step": 58960 + }, + { + "epoch": 0.08285838235061657, + "grad_norm": 0.8279985785484314, + "learning_rate": 0.0002484811015877476, + "loss": 4.8821, + "step": 58970 + }, + { + "epoch": 0.08287243328878015, + 
"grad_norm": 0.8251729011535645, + "learning_rate": 0.000248523254180132, + "loss": 4.6615, + "step": 58980 + }, + { + "epoch": 0.08288648422694372, + "grad_norm": 0.8180922269821167, + "learning_rate": 0.0002485654067725165, + "loss": 4.778, + "step": 58990 + }, + { + "epoch": 0.0829005351651073, + "grad_norm": 0.833755612373352, + "learning_rate": 0.00024860755936490094, + "loss": 4.8855, + "step": 59000 + }, + { + "epoch": 0.08291458610327088, + "grad_norm": 0.8239709734916687, + "learning_rate": 0.0002486497119572854, + "loss": 4.7605, + "step": 59010 + }, + { + "epoch": 0.08292863704143447, + "grad_norm": 0.8077985644340515, + "learning_rate": 0.0002486918645496698, + "loss": 4.8525, + "step": 59020 + }, + { + "epoch": 0.08294268797959804, + "grad_norm": 0.795622706413269, + "learning_rate": 0.0002487340171420542, + "loss": 4.7966, + "step": 59030 + }, + { + "epoch": 0.08295673891776162, + "grad_norm": 0.8494805693626404, + "learning_rate": 0.0002487761697344386, + "loss": 4.8679, + "step": 59040 + }, + { + "epoch": 0.0829707898559252, + "grad_norm": 0.8523160219192505, + "learning_rate": 0.0002488183223268231, + "loss": 4.824, + "step": 59050 + }, + { + "epoch": 0.08298484079408877, + "grad_norm": 0.8548047542572021, + "learning_rate": 0.00024886047491920753, + "loss": 4.7349, + "step": 59060 + }, + { + "epoch": 0.08299889173225235, + "grad_norm": 0.8346095681190491, + "learning_rate": 0.00024890262751159196, + "loss": 4.858, + "step": 59070 + }, + { + "epoch": 0.08301294267041592, + "grad_norm": 0.8644514679908752, + "learning_rate": 0.0002489447801039764, + "loss": 4.897, + "step": 59080 + }, + { + "epoch": 0.0830269936085795, + "grad_norm": 0.8482093811035156, + "learning_rate": 0.0002489869326963608, + "loss": 4.8537, + "step": 59090 + }, + { + "epoch": 0.08304104454674308, + "grad_norm": 0.8546515703201294, + "learning_rate": 0.0002490290852887452, + "loss": 4.7443, + "step": 59100 + }, + { + "epoch": 0.08305509548490665, + "grad_norm": 
0.8156818151473999, + "learning_rate": 0.0002490712378811297, + "loss": 4.7512, + "step": 59110 + }, + { + "epoch": 0.08306914642307024, + "grad_norm": 0.8068788647651672, + "learning_rate": 0.0002491133904735141, + "loss": 4.8865, + "step": 59120 + }, + { + "epoch": 0.08308319736123382, + "grad_norm": 0.8569709658622742, + "learning_rate": 0.00024915554306589855, + "loss": 4.8543, + "step": 59130 + }, + { + "epoch": 0.0830972482993974, + "grad_norm": 0.8595927357673645, + "learning_rate": 0.000249197695658283, + "loss": 4.9016, + "step": 59140 + }, + { + "epoch": 0.08311129923756097, + "grad_norm": 0.8086974024772644, + "learning_rate": 0.0002492398482506674, + "loss": 4.8776, + "step": 59150 + }, + { + "epoch": 0.08312535017572455, + "grad_norm": 0.8013572096824646, + "learning_rate": 0.0002492820008430518, + "loss": 4.7825, + "step": 59160 + }, + { + "epoch": 0.08313940111388812, + "grad_norm": 0.8331977725028992, + "learning_rate": 0.00024932415343543627, + "loss": 4.8848, + "step": 59170 + }, + { + "epoch": 0.0831534520520517, + "grad_norm": 0.8577580451965332, + "learning_rate": 0.0002493663060278207, + "loss": 4.8004, + "step": 59180 + }, + { + "epoch": 0.08316750299021528, + "grad_norm": 0.8222034573554993, + "learning_rate": 0.00024940845862020513, + "loss": 4.7706, + "step": 59190 + }, + { + "epoch": 0.08318155392837885, + "grad_norm": 0.8116173148155212, + "learning_rate": 0.00024945061121258957, + "loss": 4.925, + "step": 59200 + }, + { + "epoch": 0.08319560486654243, + "grad_norm": 0.8104360699653625, + "learning_rate": 0.000249492763804974, + "loss": 4.8693, + "step": 59210 + }, + { + "epoch": 0.083209655804706, + "grad_norm": 0.8252411484718323, + "learning_rate": 0.00024953491639735843, + "loss": 4.7542, + "step": 59220 + }, + { + "epoch": 0.0832237067428696, + "grad_norm": 0.8130877614021301, + "learning_rate": 0.00024957706898974286, + "loss": 4.8668, + "step": 59230 + }, + { + "epoch": 0.08323775768103317, + "grad_norm": 0.8746052980422974, + 
"learning_rate": 0.0002496192215821273, + "loss": 4.732, + "step": 59240 + }, + { + "epoch": 0.08325180861919675, + "grad_norm": 0.8369506001472473, + "learning_rate": 0.0002496613741745117, + "loss": 4.6947, + "step": 59250 + }, + { + "epoch": 0.08326585955736032, + "grad_norm": 0.8340523838996887, + "learning_rate": 0.00024970352676689615, + "loss": 4.8168, + "step": 59260 + }, + { + "epoch": 0.0832799104955239, + "grad_norm": 0.8198563456535339, + "learning_rate": 0.0002497456793592806, + "loss": 4.8669, + "step": 59270 + }, + { + "epoch": 0.08329396143368747, + "grad_norm": 0.7938926219940186, + "learning_rate": 0.000249787831951665, + "loss": 4.9119, + "step": 59280 + }, + { + "epoch": 0.08330801237185105, + "grad_norm": 0.8195042610168457, + "learning_rate": 0.00024982998454404945, + "loss": 4.7753, + "step": 59290 + }, + { + "epoch": 0.08332206331001463, + "grad_norm": 0.786556601524353, + "learning_rate": 0.0002498721371364339, + "loss": 4.8474, + "step": 59300 + }, + { + "epoch": 0.0833361142481782, + "grad_norm": 0.7999930381774902, + "learning_rate": 0.0002499142897288183, + "loss": 4.8823, + "step": 59310 + }, + { + "epoch": 0.08335016518634178, + "grad_norm": 0.8049028515815735, + "learning_rate": 0.00024995644232120274, + "loss": 4.758, + "step": 59320 + }, + { + "epoch": 0.08336421612450537, + "grad_norm": 0.8140138983726501, + "learning_rate": 0.00024999859491358717, + "loss": 4.8163, + "step": 59330 + }, + { + "epoch": 0.08337826706266895, + "grad_norm": 0.7999250292778015, + "learning_rate": 0.0002500407475059716, + "loss": 4.8743, + "step": 59340 + }, + { + "epoch": 0.08339231800083252, + "grad_norm": 0.8304258584976196, + "learning_rate": 0.00025008290009835603, + "loss": 4.8675, + "step": 59350 + }, + { + "epoch": 0.0834063689389961, + "grad_norm": 0.8012201189994812, + "learning_rate": 0.00025012505269074046, + "loss": 4.8525, + "step": 59360 + }, + { + "epoch": 0.08342041987715967, + "grad_norm": 0.7982044219970703, + "learning_rate": 
0.0002501672052831249, + "loss": 4.8047, + "step": 59370 + }, + { + "epoch": 0.08343447081532325, + "grad_norm": 0.8110559582710266, + "learning_rate": 0.0002502093578755093, + "loss": 4.83, + "step": 59380 + }, + { + "epoch": 0.08344852175348683, + "grad_norm": 0.8042994141578674, + "learning_rate": 0.00025025151046789376, + "loss": 4.6822, + "step": 59390 + }, + { + "epoch": 0.0834625726916504, + "grad_norm": 0.9221730828285217, + "learning_rate": 0.0002502936630602782, + "loss": 4.8834, + "step": 59400 + }, + { + "epoch": 0.08347662362981398, + "grad_norm": 0.8320198059082031, + "learning_rate": 0.0002503358156526626, + "loss": 4.7958, + "step": 59410 + }, + { + "epoch": 0.08349067456797755, + "grad_norm": 0.8428821563720703, + "learning_rate": 0.00025037796824504705, + "loss": 4.7707, + "step": 59420 + }, + { + "epoch": 0.08350472550614114, + "grad_norm": 0.7867874503135681, + "learning_rate": 0.0002504201208374315, + "loss": 4.8291, + "step": 59430 + }, + { + "epoch": 0.08351877644430472, + "grad_norm": 0.8074051737785339, + "learning_rate": 0.0002504622734298159, + "loss": 4.8084, + "step": 59440 + }, + { + "epoch": 0.0835328273824683, + "grad_norm": 0.8142467141151428, + "learning_rate": 0.00025050442602220034, + "loss": 4.8289, + "step": 59450 + }, + { + "epoch": 0.08354687832063187, + "grad_norm": 0.8267741203308105, + "learning_rate": 0.0002505465786145848, + "loss": 4.734, + "step": 59460 + }, + { + "epoch": 0.08356092925879545, + "grad_norm": 0.786417543888092, + "learning_rate": 0.0002505887312069692, + "loss": 4.7357, + "step": 59470 + }, + { + "epoch": 0.08357498019695903, + "grad_norm": 0.8515885472297668, + "learning_rate": 0.00025063088379935364, + "loss": 4.7698, + "step": 59480 + }, + { + "epoch": 0.0835890311351226, + "grad_norm": 0.8577207922935486, + "learning_rate": 0.00025067303639173807, + "loss": 4.7342, + "step": 59490 + }, + { + "epoch": 0.08360308207328618, + "grad_norm": 0.8472533226013184, + "learning_rate": 0.0002507151889841225, + 
"loss": 4.7212, + "step": 59500 + }, + { + "epoch": 0.08361713301144975, + "grad_norm": 0.8497911691665649, + "learning_rate": 0.00025075734157650693, + "loss": 4.6883, + "step": 59510 + }, + { + "epoch": 0.08363118394961333, + "grad_norm": 0.7981038689613342, + "learning_rate": 0.00025079949416889136, + "loss": 4.7996, + "step": 59520 + }, + { + "epoch": 0.0836452348877769, + "grad_norm": 0.8073769807815552, + "learning_rate": 0.0002508416467612758, + "loss": 4.8362, + "step": 59530 + }, + { + "epoch": 0.0836592858259405, + "grad_norm": 0.8505818247795105, + "learning_rate": 0.0002508837993536602, + "loss": 4.7964, + "step": 59540 + }, + { + "epoch": 0.08367333676410407, + "grad_norm": 0.822891891002655, + "learning_rate": 0.00025092595194604466, + "loss": 4.9098, + "step": 59550 + }, + { + "epoch": 0.08368738770226765, + "grad_norm": 0.8498794436454773, + "learning_rate": 0.0002509681045384291, + "loss": 4.8038, + "step": 59560 + }, + { + "epoch": 0.08370143864043122, + "grad_norm": 0.8308042883872986, + "learning_rate": 0.0002510102571308135, + "loss": 4.8735, + "step": 59570 + }, + { + "epoch": 0.0837154895785948, + "grad_norm": 0.8018273711204529, + "learning_rate": 0.00025105240972319795, + "loss": 4.908, + "step": 59580 + }, + { + "epoch": 0.08372954051675838, + "grad_norm": 0.8206508159637451, + "learning_rate": 0.0002510945623155824, + "loss": 4.7587, + "step": 59590 + }, + { + "epoch": 0.08374359145492195, + "grad_norm": 0.841090977191925, + "learning_rate": 0.0002511367149079668, + "loss": 4.7037, + "step": 59600 + }, + { + "epoch": 0.08375764239308553, + "grad_norm": 0.7994279265403748, + "learning_rate": 0.00025117886750035124, + "loss": 4.7894, + "step": 59610 + }, + { + "epoch": 0.0837716933312491, + "grad_norm": 0.7760536074638367, + "learning_rate": 0.0002512210200927357, + "loss": 4.8517, + "step": 59620 + }, + { + "epoch": 0.08378574426941268, + "grad_norm": 0.8113006949424744, + "learning_rate": 0.0002512631726851201, + "loss": 4.9551, + "step": 
59630 + }, + { + "epoch": 0.08379979520757627, + "grad_norm": 0.8211884498596191, + "learning_rate": 0.00025130532527750454, + "loss": 4.9197, + "step": 59640 + }, + { + "epoch": 0.08381384614573985, + "grad_norm": 0.8168864846229553, + "learning_rate": 0.00025134747786988897, + "loss": 4.8356, + "step": 59650 + }, + { + "epoch": 0.08382789708390342, + "grad_norm": 0.8315194845199585, + "learning_rate": 0.0002513896304622734, + "loss": 4.7761, + "step": 59660 + }, + { + "epoch": 0.083841948022067, + "grad_norm": 0.8301543593406677, + "learning_rate": 0.00025143178305465783, + "loss": 4.8829, + "step": 59670 + }, + { + "epoch": 0.08385599896023058, + "grad_norm": 0.8475062251091003, + "learning_rate": 0.00025147393564704226, + "loss": 4.7885, + "step": 59680 + }, + { + "epoch": 0.08387004989839415, + "grad_norm": 0.8310939073562622, + "learning_rate": 0.0002515160882394267, + "loss": 4.7185, + "step": 59690 + }, + { + "epoch": 0.08388410083655773, + "grad_norm": 0.8392112851142883, + "learning_rate": 0.0002515582408318111, + "loss": 4.7873, + "step": 59700 + }, + { + "epoch": 0.0838981517747213, + "grad_norm": 0.8230659365653992, + "learning_rate": 0.00025160039342419556, + "loss": 4.7909, + "step": 59710 + }, + { + "epoch": 0.08391220271288488, + "grad_norm": 0.834247350692749, + "learning_rate": 0.00025164254601658, + "loss": 4.7701, + "step": 59720 + }, + { + "epoch": 0.08392625365104846, + "grad_norm": 0.8113618493080139, + "learning_rate": 0.0002516846986089644, + "loss": 4.8853, + "step": 59730 + }, + { + "epoch": 0.08394030458921205, + "grad_norm": 0.8198868632316589, + "learning_rate": 0.00025172685120134885, + "loss": 4.8926, + "step": 59740 + }, + { + "epoch": 0.08395435552737562, + "grad_norm": 0.8316317200660706, + "learning_rate": 0.0002517690037937333, + "loss": 4.8605, + "step": 59750 + }, + { + "epoch": 0.0839684064655392, + "grad_norm": 0.9213061332702637, + "learning_rate": 0.0002518111563861177, + "loss": 4.8161, + "step": 59760 + }, + { + 
"epoch": 0.08398245740370278, + "grad_norm": 0.8067527413368225, + "learning_rate": 0.00025185330897850214, + "loss": 4.8305, + "step": 59770 + }, + { + "epoch": 0.08399650834186635, + "grad_norm": 0.8328394293785095, + "learning_rate": 0.0002518954615708866, + "loss": 4.8528, + "step": 59780 + }, + { + "epoch": 0.08401055928002993, + "grad_norm": 0.8063852190971375, + "learning_rate": 0.000251937614163271, + "loss": 4.7661, + "step": 59790 + }, + { + "epoch": 0.0840246102181935, + "grad_norm": 0.8150149583816528, + "learning_rate": 0.0002519797667556555, + "loss": 4.6904, + "step": 59800 + }, + { + "epoch": 0.08403866115635708, + "grad_norm": 0.8086451292037964, + "learning_rate": 0.00025202191934803987, + "loss": 4.8242, + "step": 59810 + }, + { + "epoch": 0.08405271209452066, + "grad_norm": 0.8651347160339355, + "learning_rate": 0.0002520640719404243, + "loss": 4.7535, + "step": 59820 + }, + { + "epoch": 0.08406676303268423, + "grad_norm": 0.8142220377922058, + "learning_rate": 0.00025210622453280873, + "loss": 4.8362, + "step": 59830 + }, + { + "epoch": 0.08408081397084781, + "grad_norm": 0.8657679557800293, + "learning_rate": 0.00025214837712519316, + "loss": 4.7861, + "step": 59840 + }, + { + "epoch": 0.0840948649090114, + "grad_norm": 0.87697833776474, + "learning_rate": 0.0002521905297175776, + "loss": 4.739, + "step": 59850 + }, + { + "epoch": 0.08410891584717498, + "grad_norm": 0.870245099067688, + "learning_rate": 0.0002522326823099621, + "loss": 4.7173, + "step": 59860 + }, + { + "epoch": 0.08412296678533855, + "grad_norm": 0.8389632701873779, + "learning_rate": 0.00025227483490234645, + "loss": 4.7693, + "step": 59870 + }, + { + "epoch": 0.08413701772350213, + "grad_norm": 0.8237364888191223, + "learning_rate": 0.0002523169874947309, + "loss": 4.8892, + "step": 59880 + }, + { + "epoch": 0.0841510686616657, + "grad_norm": 0.8023500442504883, + "learning_rate": 0.0002523591400871153, + "loss": 4.7923, + "step": 59890 + }, + { + "epoch": 
0.08416511959982928, + "grad_norm": 0.8338358402252197, + "learning_rate": 0.00025240129267949975, + "loss": 4.8156, + "step": 59900 + }, + { + "epoch": 0.08417917053799286, + "grad_norm": 0.7917497158050537, + "learning_rate": 0.00025243923001264573, + "loss": 4.7169, + "step": 59910 + }, + { + "epoch": 0.08419322147615643, + "grad_norm": 0.8231458067893982, + "learning_rate": 0.00025248138260503016, + "loss": 4.8951, + "step": 59920 + }, + { + "epoch": 0.08420727241432001, + "grad_norm": 0.8315156102180481, + "learning_rate": 0.0002525235351974146, + "loss": 4.8144, + "step": 59930 + }, + { + "epoch": 0.08422132335248358, + "grad_norm": 0.8496831059455872, + "learning_rate": 0.0002525656877897991, + "loss": 4.9156, + "step": 59940 + }, + { + "epoch": 0.08423537429064717, + "grad_norm": 0.8502749800682068, + "learning_rate": 0.0002526078403821835, + "loss": 4.9349, + "step": 59950 + }, + { + "epoch": 0.08424942522881075, + "grad_norm": 0.8212767243385315, + "learning_rate": 0.00025264999297456794, + "loss": 4.801, + "step": 59960 + }, + { + "epoch": 0.08426347616697433, + "grad_norm": 0.8079051971435547, + "learning_rate": 0.0002526921455669523, + "loss": 4.821, + "step": 59970 + }, + { + "epoch": 0.0842775271051379, + "grad_norm": 0.7926346063613892, + "learning_rate": 0.00025273429815933675, + "loss": 4.7747, + "step": 59980 + }, + { + "epoch": 0.08429157804330148, + "grad_norm": 0.8248520493507385, + "learning_rate": 0.0002527764507517212, + "loss": 4.8311, + "step": 59990 + }, + { + "epoch": 0.08430562898146506, + "grad_norm": 0.8126881122589111, + "learning_rate": 0.00025281860334410566, + "loss": 4.8472, + "step": 60000 + }, + { + "epoch": 0.08431967991962863, + "grad_norm": 0.8101698160171509, + "learning_rate": 0.0002528607559364901, + "loss": 4.7401, + "step": 60010 + }, + { + "epoch": 0.08433373085779221, + "grad_norm": 0.8176490664482117, + "learning_rate": 0.00025290290852887453, + "loss": 4.7447, + "step": 60020 + }, + { + "epoch": 
0.08434778179595578, + "grad_norm": 0.8406776189804077, + "learning_rate": 0.00025294506112125896, + "loss": 4.7898, + "step": 60030 + }, + { + "epoch": 0.08436183273411936, + "grad_norm": 0.8245927691459656, + "learning_rate": 0.00025298721371364334, + "loss": 4.6811, + "step": 60040 + }, + { + "epoch": 0.08437588367228295, + "grad_norm": 0.8441291451454163, + "learning_rate": 0.00025302936630602777, + "loss": 4.7632, + "step": 60050 + }, + { + "epoch": 0.08438993461044653, + "grad_norm": 0.8327770829200745, + "learning_rate": 0.00025307151889841225, + "loss": 4.8536, + "step": 60060 + }, + { + "epoch": 0.0844039855486101, + "grad_norm": 0.850288987159729, + "learning_rate": 0.0002531136714907967, + "loss": 4.8284, + "step": 60070 + }, + { + "epoch": 0.08441803648677368, + "grad_norm": 0.7964315414428711, + "learning_rate": 0.0002531558240831811, + "loss": 4.9407, + "step": 60080 + }, + { + "epoch": 0.08443208742493725, + "grad_norm": 0.8162838220596313, + "learning_rate": 0.00025319797667556555, + "loss": 4.8588, + "step": 60090 + }, + { + "epoch": 0.08444613836310083, + "grad_norm": 0.8507556319236755, + "learning_rate": 0.0002532401292679499, + "loss": 4.7351, + "step": 60100 + }, + { + "epoch": 0.08446018930126441, + "grad_norm": 0.7655229568481445, + "learning_rate": 0.00025328228186033435, + "loss": 4.9912, + "step": 60110 + }, + { + "epoch": 0.08447424023942798, + "grad_norm": 0.8107126951217651, + "learning_rate": 0.00025332443445271884, + "loss": 4.7462, + "step": 60120 + }, + { + "epoch": 0.08448829117759156, + "grad_norm": 0.8109584450721741, + "learning_rate": 0.00025336658704510327, + "loss": 4.8539, + "step": 60130 + }, + { + "epoch": 0.08450234211575514, + "grad_norm": 0.8400088548660278, + "learning_rate": 0.0002534087396374877, + "loss": 4.9024, + "step": 60140 + }, + { + "epoch": 0.08451639305391871, + "grad_norm": 0.8335424065589905, + "learning_rate": 0.00025345089222987213, + "loss": 4.8311, + "step": 60150 + }, + { + "epoch": 
0.0845304439920823, + "grad_norm": 0.8169661164283752, + "learning_rate": 0.0002534930448222565, + "loss": 4.8582, + "step": 60160 + }, + { + "epoch": 0.08454449493024588, + "grad_norm": 0.7944623827934265, + "learning_rate": 0.00025353519741464094, + "loss": 4.7601, + "step": 60170 + }, + { + "epoch": 0.08455854586840945, + "grad_norm": 0.8325213193893433, + "learning_rate": 0.0002535773500070254, + "loss": 4.819, + "step": 60180 + }, + { + "epoch": 0.08457259680657303, + "grad_norm": 0.8035892844200134, + "learning_rate": 0.00025361950259940986, + "loss": 4.8387, + "step": 60190 + }, + { + "epoch": 0.0845866477447366, + "grad_norm": 0.8531663417816162, + "learning_rate": 0.0002536616551917943, + "loss": 4.9727, + "step": 60200 + }, + { + "epoch": 0.08460069868290018, + "grad_norm": 0.8064233660697937, + "learning_rate": 0.0002537038077841787, + "loss": 4.739, + "step": 60210 + }, + { + "epoch": 0.08461474962106376, + "grad_norm": 0.8612556457519531, + "learning_rate": 0.00025374596037656315, + "loss": 4.763, + "step": 60220 + }, + { + "epoch": 0.08462880055922734, + "grad_norm": 0.8533815145492554, + "learning_rate": 0.0002537881129689476, + "loss": 4.7813, + "step": 60230 + }, + { + "epoch": 0.08464285149739091, + "grad_norm": 0.7971927523612976, + "learning_rate": 0.000253830265561332, + "loss": 4.7823, + "step": 60240 + }, + { + "epoch": 0.08465690243555449, + "grad_norm": 0.8081861734390259, + "learning_rate": 0.00025387241815371644, + "loss": 4.7512, + "step": 60250 + }, + { + "epoch": 0.08467095337371808, + "grad_norm": 0.8018068075180054, + "learning_rate": 0.0002539145707461009, + "loss": 4.7706, + "step": 60260 + }, + { + "epoch": 0.08468500431188165, + "grad_norm": 0.8136337995529175, + "learning_rate": 0.0002539567233384853, + "loss": 4.7486, + "step": 60270 + }, + { + "epoch": 0.08469905525004523, + "grad_norm": 0.837989091873169, + "learning_rate": 0.00025399887593086974, + "loss": 4.7813, + "step": 60280 + }, + { + "epoch": 0.0847131061882088, + 
"grad_norm": 0.8240730166435242, + "learning_rate": 0.00025404102852325417, + "loss": 4.8329, + "step": 60290 + }, + { + "epoch": 0.08472715712637238, + "grad_norm": 0.8388275504112244, + "learning_rate": 0.0002540831811156386, + "loss": 4.8039, + "step": 60300 + }, + { + "epoch": 0.08474120806453596, + "grad_norm": 0.8347736597061157, + "learning_rate": 0.00025412533370802303, + "loss": 4.6845, + "step": 60310 + }, + { + "epoch": 0.08475525900269953, + "grad_norm": 0.8188750147819519, + "learning_rate": 0.00025416748630040746, + "loss": 4.7612, + "step": 60320 + }, + { + "epoch": 0.08476930994086311, + "grad_norm": 0.8329417109489441, + "learning_rate": 0.0002542096388927919, + "loss": 4.8708, + "step": 60330 + }, + { + "epoch": 0.08478336087902669, + "grad_norm": 0.8301446437835693, + "learning_rate": 0.0002542517914851763, + "loss": 4.8245, + "step": 60340 + }, + { + "epoch": 0.08479741181719026, + "grad_norm": 0.8130600452423096, + "learning_rate": 0.00025429394407756076, + "loss": 4.8549, + "step": 60350 + }, + { + "epoch": 0.08481146275535385, + "grad_norm": 0.8038048148155212, + "learning_rate": 0.0002543360966699452, + "loss": 4.8175, + "step": 60360 + }, + { + "epoch": 0.08482551369351743, + "grad_norm": 0.7973541021347046, + "learning_rate": 0.0002543782492623296, + "loss": 4.852, + "step": 60370 + }, + { + "epoch": 0.084839564631681, + "grad_norm": 0.8406749367713928, + "learning_rate": 0.00025442040185471405, + "loss": 4.8618, + "step": 60380 + }, + { + "epoch": 0.08485361556984458, + "grad_norm": 0.8773977160453796, + "learning_rate": 0.0002544625544470985, + "loss": 4.7474, + "step": 60390 + }, + { + "epoch": 0.08486766650800816, + "grad_norm": 0.7995550632476807, + "learning_rate": 0.0002545047070394829, + "loss": 4.7697, + "step": 60400 + }, + { + "epoch": 0.08488171744617173, + "grad_norm": 0.8282455801963806, + "learning_rate": 0.00025454685963186734, + "loss": 4.8029, + "step": 60410 + }, + { + "epoch": 0.08489576838433531, + "grad_norm": 
0.8432449102401733, + "learning_rate": 0.0002545890122242518, + "loss": 4.872, + "step": 60420 + }, + { + "epoch": 0.08490981932249889, + "grad_norm": 0.8184887170791626, + "learning_rate": 0.0002546311648166362, + "loss": 4.7083, + "step": 60430 + }, + { + "epoch": 0.08492387026066246, + "grad_norm": 0.7973591685295105, + "learning_rate": 0.00025467331740902064, + "loss": 4.8724, + "step": 60440 + }, + { + "epoch": 0.08493792119882604, + "grad_norm": 0.8287274837493896, + "learning_rate": 0.00025471547000140507, + "loss": 4.9017, + "step": 60450 + }, + { + "epoch": 0.08495197213698961, + "grad_norm": 0.8042613863945007, + "learning_rate": 0.0002547576225937895, + "loss": 4.7804, + "step": 60460 + }, + { + "epoch": 0.0849660230751532, + "grad_norm": 0.8201180100440979, + "learning_rate": 0.00025479977518617393, + "loss": 4.8187, + "step": 60470 + }, + { + "epoch": 0.08498007401331678, + "grad_norm": 0.8212829232215881, + "learning_rate": 0.00025484192777855836, + "loss": 4.8745, + "step": 60480 + }, + { + "epoch": 0.08499412495148036, + "grad_norm": 0.8027827143669128, + "learning_rate": 0.0002548840803709428, + "loss": 4.7172, + "step": 60490 + }, + { + "epoch": 0.08500817588964393, + "grad_norm": 1.7494938373565674, + "learning_rate": 0.0002549262329633272, + "loss": 4.827, + "step": 60500 + }, + { + "epoch": 0.08502222682780751, + "grad_norm": 0.8321921825408936, + "learning_rate": 0.00025496838555571165, + "loss": 4.7602, + "step": 60510 + }, + { + "epoch": 0.08503627776597109, + "grad_norm": 0.8253518342971802, + "learning_rate": 0.0002550105381480961, + "loss": 4.7594, + "step": 60520 + }, + { + "epoch": 0.08505032870413466, + "grad_norm": 0.8793212175369263, + "learning_rate": 0.0002550526907404805, + "loss": 4.7981, + "step": 60530 + }, + { + "epoch": 0.08506437964229824, + "grad_norm": 0.8195023536682129, + "learning_rate": 0.00025509484333286495, + "loss": 4.7146, + "step": 60540 + }, + { + "epoch": 0.08507843058046181, + "grad_norm": 0.8089900016784668, 
+ "learning_rate": 0.0002551369959252494, + "loss": 4.835, + "step": 60550 + }, + { + "epoch": 0.08509248151862539, + "grad_norm": 0.798321008682251, + "learning_rate": 0.0002551791485176338, + "loss": 4.9411, + "step": 60560 + }, + { + "epoch": 0.08510653245678898, + "grad_norm": 0.7749428153038025, + "learning_rate": 0.00025522130111001824, + "loss": 4.7919, + "step": 60570 + }, + { + "epoch": 0.08512058339495256, + "grad_norm": 0.832476019859314, + "learning_rate": 0.00025526345370240267, + "loss": 4.8104, + "step": 60580 + }, + { + "epoch": 0.08513463433311613, + "grad_norm": 0.8051169514656067, + "learning_rate": 0.0002553056062947871, + "loss": 4.7459, + "step": 60590 + }, + { + "epoch": 0.08514868527127971, + "grad_norm": 0.8199285268783569, + "learning_rate": 0.00025534775888717153, + "loss": 4.7434, + "step": 60600 + }, + { + "epoch": 0.08516273620944328, + "grad_norm": 0.820178210735321, + "learning_rate": 0.00025538991147955597, + "loss": 4.8498, + "step": 60610 + }, + { + "epoch": 0.08517678714760686, + "grad_norm": 0.8192396759986877, + "learning_rate": 0.0002554320640719404, + "loss": 4.8455, + "step": 60620 + }, + { + "epoch": 0.08519083808577044, + "grad_norm": 0.8456838130950928, + "learning_rate": 0.00025547421666432483, + "loss": 4.9442, + "step": 60630 + }, + { + "epoch": 0.08520488902393401, + "grad_norm": 0.7862314581871033, + "learning_rate": 0.00025551636925670926, + "loss": 4.8109, + "step": 60640 + }, + { + "epoch": 0.08521893996209759, + "grad_norm": 0.781730592250824, + "learning_rate": 0.0002555585218490937, + "loss": 4.8463, + "step": 60650 + }, + { + "epoch": 0.08523299090026117, + "grad_norm": 0.8605624437332153, + "learning_rate": 0.0002556006744414781, + "loss": 4.7406, + "step": 60660 + }, + { + "epoch": 0.08524704183842476, + "grad_norm": 0.798784613609314, + "learning_rate": 0.00025564282703386255, + "loss": 4.9813, + "step": 60670 + }, + { + "epoch": 0.08526109277658833, + "grad_norm": 0.8223904967308044, + "learning_rate": 
0.000255684979626247, + "loss": 4.9145, + "step": 60680 + }, + { + "epoch": 0.08527514371475191, + "grad_norm": 0.8261276483535767, + "learning_rate": 0.0002557271322186314, + "loss": 4.815, + "step": 60690 + }, + { + "epoch": 0.08528919465291548, + "grad_norm": 0.7942939400672913, + "learning_rate": 0.00025576928481101585, + "loss": 4.7418, + "step": 60700 + }, + { + "epoch": 0.08530324559107906, + "grad_norm": 0.7955240607261658, + "learning_rate": 0.0002558114374034003, + "loss": 4.7581, + "step": 60710 + }, + { + "epoch": 0.08531729652924264, + "grad_norm": 0.8077800273895264, + "learning_rate": 0.0002558535899957847, + "loss": 4.8254, + "step": 60720 + }, + { + "epoch": 0.08533134746740621, + "grad_norm": 0.8321365714073181, + "learning_rate": 0.00025589574258816914, + "loss": 4.8636, + "step": 60730 + }, + { + "epoch": 0.08534539840556979, + "grad_norm": 0.8244563341140747, + "learning_rate": 0.0002559378951805536, + "loss": 4.8733, + "step": 60740 + }, + { + "epoch": 0.08535944934373337, + "grad_norm": 0.8096931576728821, + "learning_rate": 0.000255980047772938, + "loss": 4.8622, + "step": 60750 + }, + { + "epoch": 0.08537350028189694, + "grad_norm": 0.8346720337867737, + "learning_rate": 0.00025602220036532243, + "loss": 4.8147, + "step": 60760 + }, + { + "epoch": 0.08538755122006053, + "grad_norm": 0.7969753742218018, + "learning_rate": 0.00025606435295770686, + "loss": 4.803, + "step": 60770 + }, + { + "epoch": 0.08540160215822411, + "grad_norm": 0.7898983359336853, + "learning_rate": 0.0002561065055500913, + "loss": 4.7337, + "step": 60780 + }, + { + "epoch": 0.08541565309638768, + "grad_norm": 0.815297544002533, + "learning_rate": 0.0002561486581424757, + "loss": 4.8547, + "step": 60790 + }, + { + "epoch": 0.08542970403455126, + "grad_norm": 0.8263185620307922, + "learning_rate": 0.0002561908107348602, + "loss": 4.7106, + "step": 60800 + }, + { + "epoch": 0.08544375497271484, + "grad_norm": 0.7924615740776062, + "learning_rate": 0.0002562329633272446, + 
"loss": 4.628, + "step": 60810 + }, + { + "epoch": 0.08545780591087841, + "grad_norm": 0.8533799052238464, + "learning_rate": 0.000256275115919629, + "loss": 4.8976, + "step": 60820 + }, + { + "epoch": 0.08547185684904199, + "grad_norm": 0.7679029703140259, + "learning_rate": 0.00025631726851201345, + "loss": 4.8269, + "step": 60830 + }, + { + "epoch": 0.08548590778720556, + "grad_norm": 0.9749653935432434, + "learning_rate": 0.0002563594211043979, + "loss": 4.8443, + "step": 60840 + }, + { + "epoch": 0.08549995872536914, + "grad_norm": 0.8148539066314697, + "learning_rate": 0.0002564015736967823, + "loss": 4.8212, + "step": 60850 + }, + { + "epoch": 0.08551400966353272, + "grad_norm": 0.8042306900024414, + "learning_rate": 0.0002564437262891668, + "loss": 4.9523, + "step": 60860 + }, + { + "epoch": 0.08552806060169629, + "grad_norm": 0.8610725998878479, + "learning_rate": 0.0002564858788815512, + "loss": 4.822, + "step": 60870 + }, + { + "epoch": 0.08554211153985988, + "grad_norm": 0.8803948760032654, + "learning_rate": 0.0002565280314739356, + "loss": 4.7024, + "step": 60880 + }, + { + "epoch": 0.08555616247802346, + "grad_norm": 0.805877149105072, + "learning_rate": 0.00025657018406632004, + "loss": 4.7908, + "step": 60890 + }, + { + "epoch": 0.08557021341618704, + "grad_norm": 0.8154386878013611, + "learning_rate": 0.00025661233665870447, + "loss": 4.9642, + "step": 60900 + }, + { + "epoch": 0.08558426435435061, + "grad_norm": 0.8559610843658447, + "learning_rate": 0.0002566544892510889, + "loss": 4.7765, + "step": 60910 + }, + { + "epoch": 0.08559831529251419, + "grad_norm": 0.8591551184654236, + "learning_rate": 0.0002566966418434734, + "loss": 4.7464, + "step": 60920 + }, + { + "epoch": 0.08561236623067776, + "grad_norm": 0.8162509202957153, + "learning_rate": 0.0002567387944358578, + "loss": 4.7748, + "step": 60930 + }, + { + "epoch": 0.08562641716884134, + "grad_norm": 0.8140776753425598, + "learning_rate": 0.0002567809470282422, + "loss": 4.888, + "step": 
60940 + }, + { + "epoch": 0.08564046810700492, + "grad_norm": 0.8785784244537354, + "learning_rate": 0.0002568230996206266, + "loss": 4.9443, + "step": 60950 + }, + { + "epoch": 0.08565451904516849, + "grad_norm": 0.7831446528434753, + "learning_rate": 0.00025686525221301106, + "loss": 4.9672, + "step": 60960 + }, + { + "epoch": 0.08566856998333207, + "grad_norm": 0.8083533048629761, + "learning_rate": 0.0002569074048053955, + "loss": 4.7751, + "step": 60970 + }, + { + "epoch": 0.08568262092149566, + "grad_norm": 0.8388922810554504, + "learning_rate": 0.00025694955739778, + "loss": 4.8792, + "step": 60980 + }, + { + "epoch": 0.08569667185965923, + "grad_norm": 0.8060362339019775, + "learning_rate": 0.0002569917099901644, + "loss": 4.796, + "step": 60990 + }, + { + "epoch": 0.08571072279782281, + "grad_norm": 0.8279473185539246, + "learning_rate": 0.0002570338625825488, + "loss": 4.7601, + "step": 61000 + }, + { + "epoch": 0.08572477373598639, + "grad_norm": 0.8248935341835022, + "learning_rate": 0.0002570760151749332, + "loss": 4.8548, + "step": 61010 + }, + { + "epoch": 0.08573882467414996, + "grad_norm": 0.8512712717056274, + "learning_rate": 0.00025711816776731764, + "loss": 4.7709, + "step": 61020 + }, + { + "epoch": 0.08575287561231354, + "grad_norm": 0.7810263633728027, + "learning_rate": 0.0002571603203597021, + "loss": 4.8958, + "step": 61030 + }, + { + "epoch": 0.08576692655047712, + "grad_norm": 0.8753952980041504, + "learning_rate": 0.00025720247295208656, + "loss": 4.6734, + "step": 61040 + }, + { + "epoch": 0.08578097748864069, + "grad_norm": 0.8158033490180969, + "learning_rate": 0.000257244625544471, + "loss": 4.8609, + "step": 61050 + }, + { + "epoch": 0.08579502842680427, + "grad_norm": 0.8370834589004517, + "learning_rate": 0.0002572867781368554, + "loss": 4.8195, + "step": 61060 + }, + { + "epoch": 0.08580907936496784, + "grad_norm": 0.792068362236023, + "learning_rate": 0.0002573289307292398, + "loss": 4.7818, + "step": 61070 + }, + { + "epoch": 
0.08582313030313143, + "grad_norm": 0.8027522563934326, + "learning_rate": 0.00025737108332162423, + "loss": 4.7769, + "step": 61080 + }, + { + "epoch": 0.08583718124129501, + "grad_norm": 0.8090192079544067, + "learning_rate": 0.00025741323591400866, + "loss": 4.6341, + "step": 61090 + }, + { + "epoch": 0.08585123217945859, + "grad_norm": 0.7874997854232788, + "learning_rate": 0.00025745538850639315, + "loss": 4.9727, + "step": 61100 + }, + { + "epoch": 0.08586528311762216, + "grad_norm": 0.7994272708892822, + "learning_rate": 0.0002574975410987776, + "loss": 4.7984, + "step": 61110 + }, + { + "epoch": 0.08587933405578574, + "grad_norm": 0.8127740025520325, + "learning_rate": 0.000257539693691162, + "loss": 4.8371, + "step": 61120 + }, + { + "epoch": 0.08589338499394931, + "grad_norm": 0.7839046120643616, + "learning_rate": 0.0002575818462835464, + "loss": 4.9259, + "step": 61130 + }, + { + "epoch": 0.08590743593211289, + "grad_norm": 0.812605619430542, + "learning_rate": 0.0002576239988759308, + "loss": 4.7804, + "step": 61140 + }, + { + "epoch": 0.08592148687027647, + "grad_norm": 0.8251674175262451, + "learning_rate": 0.00025766615146831525, + "loss": 4.7948, + "step": 61150 + }, + { + "epoch": 0.08593553780844004, + "grad_norm": 0.7972586750984192, + "learning_rate": 0.00025770830406069973, + "loss": 4.7815, + "step": 61160 + }, + { + "epoch": 0.08594958874660362, + "grad_norm": 0.8652247190475464, + "learning_rate": 0.00025775045665308417, + "loss": 4.9054, + "step": 61170 + }, + { + "epoch": 0.0859636396847672, + "grad_norm": 0.7779868245124817, + "learning_rate": 0.00025778839398623015, + "loss": 4.8678, + "step": 61180 + }, + { + "epoch": 0.08597769062293079, + "grad_norm": 0.8390598893165588, + "learning_rate": 0.0002578305465786146, + "loss": 4.796, + "step": 61190 + }, + { + "epoch": 0.08599174156109436, + "grad_norm": 0.7976048588752747, + "learning_rate": 0.000257872699170999, + "loss": 4.8985, + "step": 61200 + }, + { + "epoch": 0.08600579249925794, 
+ "grad_norm": 0.8002855777740479, + "learning_rate": 0.00025791485176338344, + "loss": 4.8251, + "step": 61210 + }, + { + "epoch": 0.08601984343742151, + "grad_norm": 0.787406861782074, + "learning_rate": 0.00025795700435576787, + "loss": 4.8762, + "step": 61220 + }, + { + "epoch": 0.08603389437558509, + "grad_norm": 0.8275022506713867, + "learning_rate": 0.00025799915694815225, + "loss": 4.8124, + "step": 61230 + }, + { + "epoch": 0.08604794531374867, + "grad_norm": 0.8134986162185669, + "learning_rate": 0.00025804130954053674, + "loss": 4.6739, + "step": 61240 + }, + { + "epoch": 0.08606199625191224, + "grad_norm": 0.8022597432136536, + "learning_rate": 0.00025808346213292117, + "loss": 4.8794, + "step": 61250 + }, + { + "epoch": 0.08607604719007582, + "grad_norm": 0.789439857006073, + "learning_rate": 0.0002581256147253056, + "loss": 4.8165, + "step": 61260 + }, + { + "epoch": 0.0860900981282394, + "grad_norm": 0.7730726003646851, + "learning_rate": 0.00025816776731769003, + "loss": 4.8367, + "step": 61270 + }, + { + "epoch": 0.08610414906640297, + "grad_norm": 0.8278162479400635, + "learning_rate": 0.00025820991991007446, + "loss": 4.6802, + "step": 61280 + }, + { + "epoch": 0.08611820000456656, + "grad_norm": 0.8260532021522522, + "learning_rate": 0.00025825207250245884, + "loss": 4.7331, + "step": 61290 + }, + { + "epoch": 0.08613225094273014, + "grad_norm": 0.8389633297920227, + "learning_rate": 0.0002582942250948433, + "loss": 4.8219, + "step": 61300 + }, + { + "epoch": 0.08614630188089371, + "grad_norm": 0.7943553328514099, + "learning_rate": 0.00025833637768722775, + "loss": 4.9142, + "step": 61310 + }, + { + "epoch": 0.08616035281905729, + "grad_norm": 0.815380334854126, + "learning_rate": 0.0002583785302796122, + "loss": 4.7978, + "step": 61320 + }, + { + "epoch": 0.08617440375722087, + "grad_norm": 0.7899944186210632, + "learning_rate": 0.0002584206828719966, + "loss": 4.7829, + "step": 61330 + }, + { + "epoch": 0.08618845469538444, + "grad_norm": 
0.8026753664016724, + "learning_rate": 0.00025846283546438105, + "loss": 4.7086, + "step": 61340 + }, + { + "epoch": 0.08620250563354802, + "grad_norm": 0.8403603434562683, + "learning_rate": 0.0002585049880567655, + "loss": 4.8447, + "step": 61350 + }, + { + "epoch": 0.0862165565717116, + "grad_norm": 0.7885310649871826, + "learning_rate": 0.0002585471406491499, + "loss": 4.818, + "step": 61360 + }, + { + "epoch": 0.08623060750987517, + "grad_norm": 0.7821835279464722, + "learning_rate": 0.00025858929324153434, + "loss": 4.7951, + "step": 61370 + }, + { + "epoch": 0.08624465844803875, + "grad_norm": 0.8090482354164124, + "learning_rate": 0.00025863144583391877, + "loss": 4.7686, + "step": 61380 + }, + { + "epoch": 0.08625870938620234, + "grad_norm": 0.8011608719825745, + "learning_rate": 0.0002586735984263032, + "loss": 4.8789, + "step": 61390 + }, + { + "epoch": 0.08627276032436591, + "grad_norm": 0.7893279194831848, + "learning_rate": 0.00025871575101868763, + "loss": 4.8815, + "step": 61400 + }, + { + "epoch": 0.08628681126252949, + "grad_norm": 0.8958577513694763, + "learning_rate": 0.00025875790361107206, + "loss": 4.6991, + "step": 61410 + }, + { + "epoch": 0.08630086220069307, + "grad_norm": 0.8177340030670166, + "learning_rate": 0.0002588000562034565, + "loss": 4.8222, + "step": 61420 + }, + { + "epoch": 0.08631491313885664, + "grad_norm": 0.8018973469734192, + "learning_rate": 0.00025884220879584093, + "loss": 4.9089, + "step": 61430 + }, + { + "epoch": 0.08632896407702022, + "grad_norm": 0.7807876467704773, + "learning_rate": 0.00025888436138822536, + "loss": 4.826, + "step": 61440 + }, + { + "epoch": 0.0863430150151838, + "grad_norm": 0.8257559537887573, + "learning_rate": 0.0002589265139806098, + "loss": 4.8224, + "step": 61450 + }, + { + "epoch": 0.08635706595334737, + "grad_norm": 0.8308781385421753, + "learning_rate": 0.0002589686665729942, + "loss": 4.6974, + "step": 61460 + }, + { + "epoch": 0.08637111689151095, + "grad_norm": 0.7814127206802368, 
+ "learning_rate": 0.00025901081916537865, + "loss": 4.8122, + "step": 61470 + }, + { + "epoch": 0.08638516782967452, + "grad_norm": 0.7803528904914856, + "learning_rate": 0.0002590529717577631, + "loss": 4.8447, + "step": 61480 + }, + { + "epoch": 0.0863992187678381, + "grad_norm": 0.8149372935295105, + "learning_rate": 0.0002590951243501475, + "loss": 4.7555, + "step": 61490 + }, + { + "epoch": 0.08641326970600169, + "grad_norm": 0.8497858047485352, + "learning_rate": 0.00025913727694253195, + "loss": 4.7493, + "step": 61500 + }, + { + "epoch": 0.08642732064416526, + "grad_norm": 0.8034449219703674, + "learning_rate": 0.0002591794295349164, + "loss": 4.8812, + "step": 61510 + }, + { + "epoch": 0.08644137158232884, + "grad_norm": 0.8024821877479553, + "learning_rate": 0.0002592215821273008, + "loss": 4.928, + "step": 61520 + }, + { + "epoch": 0.08645542252049242, + "grad_norm": 0.7921434640884399, + "learning_rate": 0.00025926373471968524, + "loss": 4.751, + "step": 61530 + }, + { + "epoch": 0.086469473458656, + "grad_norm": 0.8643134832382202, + "learning_rate": 0.00025930588731206967, + "loss": 4.8568, + "step": 61540 + }, + { + "epoch": 0.08648352439681957, + "grad_norm": 0.7830716967582703, + "learning_rate": 0.0002593480399044541, + "loss": 4.853, + "step": 61550 + }, + { + "epoch": 0.08649757533498315, + "grad_norm": 0.7742080092430115, + "learning_rate": 0.00025939019249683853, + "loss": 4.8559, + "step": 61560 + }, + { + "epoch": 0.08651162627314672, + "grad_norm": 0.8120469450950623, + "learning_rate": 0.00025943234508922296, + "loss": 4.856, + "step": 61570 + }, + { + "epoch": 0.0865256772113103, + "grad_norm": 0.827467679977417, + "learning_rate": 0.0002594744976816074, + "loss": 4.7491, + "step": 61580 + }, + { + "epoch": 0.08653972814947387, + "grad_norm": 0.7826569080352783, + "learning_rate": 0.0002595166502739918, + "loss": 4.7901, + "step": 61590 + }, + { + "epoch": 0.08655377908763746, + "grad_norm": 0.7950142621994019, + "learning_rate": 
0.00025955880286637626, + "loss": 4.7976, + "step": 61600 + }, + { + "epoch": 0.08656783002580104, + "grad_norm": 0.821763277053833, + "learning_rate": 0.0002596009554587607, + "loss": 4.821, + "step": 61610 + }, + { + "epoch": 0.08658188096396462, + "grad_norm": 0.8673090934753418, + "learning_rate": 0.0002596431080511451, + "loss": 4.7369, + "step": 61620 + }, + { + "epoch": 0.08659593190212819, + "grad_norm": 0.7934688925743103, + "learning_rate": 0.00025968526064352955, + "loss": 4.6503, + "step": 61630 + }, + { + "epoch": 0.08660998284029177, + "grad_norm": 0.7949123978614807, + "learning_rate": 0.000259727413235914, + "loss": 4.8444, + "step": 61640 + }, + { + "epoch": 0.08662403377845534, + "grad_norm": 0.7871426939964294, + "learning_rate": 0.0002597695658282984, + "loss": 4.8069, + "step": 61650 + }, + { + "epoch": 0.08663808471661892, + "grad_norm": 0.8564624786376953, + "learning_rate": 0.00025981171842068284, + "loss": 4.863, + "step": 61660 + }, + { + "epoch": 0.0866521356547825, + "grad_norm": 0.8065398931503296, + "learning_rate": 0.0002598538710130673, + "loss": 4.9085, + "step": 61670 + }, + { + "epoch": 0.08666618659294607, + "grad_norm": 0.79127436876297, + "learning_rate": 0.0002598960236054517, + "loss": 4.7051, + "step": 61680 + }, + { + "epoch": 0.08668023753110965, + "grad_norm": 0.7989571690559387, + "learning_rate": 0.00025993817619783614, + "loss": 4.6685, + "step": 61690 + }, + { + "epoch": 0.08669428846927324, + "grad_norm": 0.794864296913147, + "learning_rate": 0.00025998032879022057, + "loss": 4.8343, + "step": 61700 + }, + { + "epoch": 0.08670833940743682, + "grad_norm": 0.803548276424408, + "learning_rate": 0.000260022481382605, + "loss": 4.7731, + "step": 61710 + }, + { + "epoch": 0.08672239034560039, + "grad_norm": 0.8012824654579163, + "learning_rate": 0.00026006463397498943, + "loss": 4.8466, + "step": 61720 + }, + { + "epoch": 0.08673644128376397, + "grad_norm": 0.792219340801239, + "learning_rate": 0.00026010678656737386, + 
"loss": 4.7721, + "step": 61730 + }, + { + "epoch": 0.08675049222192754, + "grad_norm": 0.7680630087852478, + "learning_rate": 0.0002601489391597583, + "loss": 4.7905, + "step": 61740 + }, + { + "epoch": 0.08676454316009112, + "grad_norm": 0.7854152917861938, + "learning_rate": 0.0002601910917521427, + "loss": 4.7413, + "step": 61750 + }, + { + "epoch": 0.0867785940982547, + "grad_norm": 0.8030341267585754, + "learning_rate": 0.00026023324434452716, + "loss": 4.8224, + "step": 61760 + }, + { + "epoch": 0.08679264503641827, + "grad_norm": 0.807295560836792, + "learning_rate": 0.0002602753969369116, + "loss": 4.8628, + "step": 61770 + }, + { + "epoch": 0.08680669597458185, + "grad_norm": 0.7911593317985535, + "learning_rate": 0.000260317549529296, + "loss": 4.6736, + "step": 61780 + }, + { + "epoch": 0.08682074691274543, + "grad_norm": 0.8324936032295227, + "learning_rate": 0.00026035970212168045, + "loss": 4.8149, + "step": 61790 + }, + { + "epoch": 0.086834797850909, + "grad_norm": 0.8072502017021179, + "learning_rate": 0.0002604018547140649, + "loss": 4.8918, + "step": 61800 + }, + { + "epoch": 0.08684884878907259, + "grad_norm": 0.8104373216629028, + "learning_rate": 0.0002604440073064493, + "loss": 4.8896, + "step": 61810 + }, + { + "epoch": 0.08686289972723617, + "grad_norm": 1.0080177783966064, + "learning_rate": 0.00026048615989883374, + "loss": 4.8292, + "step": 61820 + }, + { + "epoch": 0.08687695066539974, + "grad_norm": 0.8252009749412537, + "learning_rate": 0.0002605283124912182, + "loss": 4.7111, + "step": 61830 + }, + { + "epoch": 0.08689100160356332, + "grad_norm": 0.8061153888702393, + "learning_rate": 0.0002605704650836026, + "loss": 4.781, + "step": 61840 + }, + { + "epoch": 0.0869050525417269, + "grad_norm": 0.8102174401283264, + "learning_rate": 0.00026061261767598704, + "loss": 4.8626, + "step": 61850 + }, + { + "epoch": 0.08691910347989047, + "grad_norm": 0.8249738812446594, + "learning_rate": 0.00026065477026837147, + "loss": 4.9965, + "step": 
61860 + }, + { + "epoch": 0.08693315441805405, + "grad_norm": 0.8087854385375977, + "learning_rate": 0.00026069692286075595, + "loss": 4.8119, + "step": 61870 + }, + { + "epoch": 0.08694720535621762, + "grad_norm": 0.7888401746749878, + "learning_rate": 0.00026073907545314033, + "loss": 4.7734, + "step": 61880 + }, + { + "epoch": 0.0869612562943812, + "grad_norm": 0.7908345460891724, + "learning_rate": 0.00026078122804552476, + "loss": 4.792, + "step": 61890 + }, + { + "epoch": 0.08697530723254478, + "grad_norm": 0.8059920072555542, + "learning_rate": 0.0002608233806379092, + "loss": 4.8439, + "step": 61900 + }, + { + "epoch": 0.08698935817070837, + "grad_norm": 0.8107774257659912, + "learning_rate": 0.0002608655332302936, + "loss": 4.8511, + "step": 61910 + }, + { + "epoch": 0.08700340910887194, + "grad_norm": 0.7921923995018005, + "learning_rate": 0.00026090768582267805, + "loss": 4.6943, + "step": 61920 + }, + { + "epoch": 0.08701746004703552, + "grad_norm": 0.8148934245109558, + "learning_rate": 0.00026094983841506254, + "loss": 4.7229, + "step": 61930 + }, + { + "epoch": 0.0870315109851991, + "grad_norm": 0.7990559935569763, + "learning_rate": 0.0002609919910074469, + "loss": 4.7258, + "step": 61940 + }, + { + "epoch": 0.08704556192336267, + "grad_norm": 0.7900142073631287, + "learning_rate": 0.00026103414359983135, + "loss": 4.7876, + "step": 61950 + }, + { + "epoch": 0.08705961286152625, + "grad_norm": 0.7979053258895874, + "learning_rate": 0.0002610762961922158, + "loss": 4.7788, + "step": 61960 + }, + { + "epoch": 0.08707366379968982, + "grad_norm": 0.8106703162193298, + "learning_rate": 0.0002611184487846002, + "loss": 4.7476, + "step": 61970 + }, + { + "epoch": 0.0870877147378534, + "grad_norm": 0.8180921673774719, + "learning_rate": 0.00026116060137698464, + "loss": 4.6959, + "step": 61980 + }, + { + "epoch": 0.08710176567601698, + "grad_norm": 0.7843952178955078, + "learning_rate": 0.0002612027539693691, + "loss": 4.7037, + "step": 61990 + }, + { + 
"epoch": 0.08711581661418055, + "grad_norm": 0.7980206608772278, + "learning_rate": 0.0002612449065617535, + "loss": 4.8514, + "step": 62000 + }, + { + "epoch": 0.08712986755234414, + "grad_norm": 0.8227792382240295, + "learning_rate": 0.00026128705915413793, + "loss": 4.7442, + "step": 62010 + }, + { + "epoch": 0.08714391849050772, + "grad_norm": 0.8404500484466553, + "learning_rate": 0.00026132921174652237, + "loss": 4.89, + "step": 62020 + }, + { + "epoch": 0.0871579694286713, + "grad_norm": 0.8126966953277588, + "learning_rate": 0.0002613713643389068, + "loss": 4.8632, + "step": 62030 + }, + { + "epoch": 0.08717202036683487, + "grad_norm": 0.817398726940155, + "learning_rate": 0.0002614135169312913, + "loss": 4.8315, + "step": 62040 + }, + { + "epoch": 0.08718607130499845, + "grad_norm": 0.81786048412323, + "learning_rate": 0.0002614556695236757, + "loss": 4.7713, + "step": 62050 + }, + { + "epoch": 0.08720012224316202, + "grad_norm": 0.8178216814994812, + "learning_rate": 0.00026149782211606014, + "loss": 4.7856, + "step": 62060 + }, + { + "epoch": 0.0872141731813256, + "grad_norm": 0.8115580081939697, + "learning_rate": 0.0002615399747084445, + "loss": 4.7314, + "step": 62070 + }, + { + "epoch": 0.08722822411948918, + "grad_norm": 0.7866354584693909, + "learning_rate": 0.00026158212730082895, + "loss": 4.839, + "step": 62080 + }, + { + "epoch": 0.08724227505765275, + "grad_norm": 0.808581531047821, + "learning_rate": 0.0002616242798932134, + "loss": 4.8308, + "step": 62090 + }, + { + "epoch": 0.08725632599581633, + "grad_norm": 0.7981185913085938, + "learning_rate": 0.00026166643248559787, + "loss": 4.7796, + "step": 62100 + }, + { + "epoch": 0.0872703769339799, + "grad_norm": 0.8379929065704346, + "learning_rate": 0.0002617085850779823, + "loss": 4.737, + "step": 62110 + }, + { + "epoch": 0.0872844278721435, + "grad_norm": 0.8175838589668274, + "learning_rate": 0.00026175073767036673, + "loss": 4.8416, + "step": 62120 + }, + { + "epoch": 0.08729847881030707, 
+ "grad_norm": 0.8132398724555969, + "learning_rate": 0.0002617928902627511, + "loss": 4.8088, + "step": 62130 + }, + { + "epoch": 0.08731252974847065, + "grad_norm": 0.8097195625305176, + "learning_rate": 0.00026183504285513554, + "loss": 4.8075, + "step": 62140 + }, + { + "epoch": 0.08732658068663422, + "grad_norm": 0.8951746225357056, + "learning_rate": 0.00026187719544751997, + "loss": 4.8201, + "step": 62150 + }, + { + "epoch": 0.0873406316247978, + "grad_norm": 0.8372669219970703, + "learning_rate": 0.00026191934803990446, + "loss": 4.8856, + "step": 62160 + }, + { + "epoch": 0.08735468256296137, + "grad_norm": 0.7923863530158997, + "learning_rate": 0.0002619615006322889, + "loss": 4.8744, + "step": 62170 + }, + { + "epoch": 0.08736873350112495, + "grad_norm": 0.7726845741271973, + "learning_rate": 0.0002620036532246733, + "loss": 4.8476, + "step": 62180 + }, + { + "epoch": 0.08738278443928853, + "grad_norm": 0.7641924619674683, + "learning_rate": 0.00026204580581705775, + "loss": 4.8731, + "step": 62190 + }, + { + "epoch": 0.0873968353774521, + "grad_norm": 0.8116229772567749, + "learning_rate": 0.00026208795840944213, + "loss": 4.8149, + "step": 62200 + }, + { + "epoch": 0.08741088631561568, + "grad_norm": 0.7808107137680054, + "learning_rate": 0.00026213011100182656, + "loss": 4.7596, + "step": 62210 + }, + { + "epoch": 0.08742493725377927, + "grad_norm": 0.7787986397743225, + "learning_rate": 0.00026217226359421104, + "loss": 4.7875, + "step": 62220 + }, + { + "epoch": 0.08743898819194285, + "grad_norm": 0.7925445437431335, + "learning_rate": 0.0002622144161865955, + "loss": 4.7452, + "step": 62230 + }, + { + "epoch": 0.08745303913010642, + "grad_norm": 0.8389007449150085, + "learning_rate": 0.0002622565687789799, + "loss": 4.7547, + "step": 62240 + }, + { + "epoch": 0.08746709006827, + "grad_norm": 0.7930507659912109, + "learning_rate": 0.00026229872137136434, + "loss": 4.7961, + "step": 62250 + }, + { + "epoch": 0.08748114100643357, + "grad_norm": 
0.7937973737716675, + "learning_rate": 0.0002623408739637487, + "loss": 4.7547, + "step": 62260 + }, + { + "epoch": 0.08749519194459715, + "grad_norm": 0.797887921333313, + "learning_rate": 0.00026238302655613315, + "loss": 4.9243, + "step": 62270 + }, + { + "epoch": 0.08750924288276073, + "grad_norm": 0.7737236618995667, + "learning_rate": 0.00026242517914851763, + "loss": 4.8146, + "step": 62280 + }, + { + "epoch": 0.0875232938209243, + "grad_norm": 0.8101477026939392, + "learning_rate": 0.00026246733174090206, + "loss": 4.7729, + "step": 62290 + }, + { + "epoch": 0.08753734475908788, + "grad_norm": 0.8078815340995789, + "learning_rate": 0.0002625094843332865, + "loss": 4.6863, + "step": 62300 + }, + { + "epoch": 0.08755139569725146, + "grad_norm": 0.7906801700592041, + "learning_rate": 0.0002625516369256709, + "loss": 4.7801, + "step": 62310 + }, + { + "epoch": 0.08756544663541505, + "grad_norm": 0.8077771663665771, + "learning_rate": 0.00026259378951805536, + "loss": 4.7214, + "step": 62320 + }, + { + "epoch": 0.08757949757357862, + "grad_norm": 0.8051345348358154, + "learning_rate": 0.00026263594211043973, + "loss": 4.8615, + "step": 62330 + }, + { + "epoch": 0.0875935485117422, + "grad_norm": 0.798250675201416, + "learning_rate": 0.0002626780947028242, + "loss": 4.7257, + "step": 62340 + }, + { + "epoch": 0.08760759944990577, + "grad_norm": 0.7771165370941162, + "learning_rate": 0.00026272024729520865, + "loss": 4.7493, + "step": 62350 + }, + { + "epoch": 0.08762165038806935, + "grad_norm": 0.7755945920944214, + "learning_rate": 0.0002627623998875931, + "loss": 4.9485, + "step": 62360 + }, + { + "epoch": 0.08763570132623293, + "grad_norm": 0.7809829115867615, + "learning_rate": 0.0002628045524799775, + "loss": 4.7587, + "step": 62370 + }, + { + "epoch": 0.0876497522643965, + "grad_norm": 0.80907142162323, + "learning_rate": 0.00026284670507236194, + "loss": 4.7375, + "step": 62380 + }, + { + "epoch": 0.08766380320256008, + "grad_norm": 0.8076225519180298, + 
"learning_rate": 0.0002628888576647463, + "loss": 4.898, + "step": 62390 + }, + { + "epoch": 0.08767785414072365, + "grad_norm": 0.7816614508628845, + "learning_rate": 0.0002629310102571308, + "loss": 4.689, + "step": 62400 + }, + { + "epoch": 0.08769190507888723, + "grad_norm": 0.8814823031425476, + "learning_rate": 0.00026297316284951524, + "loss": 4.7298, + "step": 62410 + }, + { + "epoch": 0.0877059560170508, + "grad_norm": 0.8320509791374207, + "learning_rate": 0.00026301531544189967, + "loss": 4.7035, + "step": 62420 + }, + { + "epoch": 0.0877200069552144, + "grad_norm": 0.8074789643287659, + "learning_rate": 0.0002630574680342841, + "loss": 4.8431, + "step": 62430 + }, + { + "epoch": 0.08773405789337797, + "grad_norm": 0.7870790362358093, + "learning_rate": 0.00026309962062666853, + "loss": 4.8224, + "step": 62440 + }, + { + "epoch": 0.08774810883154155, + "grad_norm": 0.8190731406211853, + "learning_rate": 0.00026314177321905296, + "loss": 4.7692, + "step": 62450 + }, + { + "epoch": 0.08776215976970513, + "grad_norm": 0.8550110459327698, + "learning_rate": 0.0002631839258114374, + "loss": 4.8563, + "step": 62460 + }, + { + "epoch": 0.0877762107078687, + "grad_norm": 0.7676360607147217, + "learning_rate": 0.0002632260784038218, + "loss": 4.7988, + "step": 62470 + }, + { + "epoch": 0.08779026164603228, + "grad_norm": 0.8471920490264893, + "learning_rate": 0.00026326823099620625, + "loss": 4.7623, + "step": 62480 + }, + { + "epoch": 0.08780431258419585, + "grad_norm": 0.7891804575920105, + "learning_rate": 0.0002633103835885907, + "loss": 4.8314, + "step": 62490 + }, + { + "epoch": 0.08781836352235943, + "grad_norm": 0.8159024119377136, + "learning_rate": 0.0002633525361809751, + "loss": 4.8448, + "step": 62500 + }, + { + "epoch": 0.087832414460523, + "grad_norm": 0.7949129939079285, + "learning_rate": 0.00026339468877335955, + "loss": 4.7038, + "step": 62510 + }, + { + "epoch": 0.08784646539868658, + "grad_norm": 0.8131535649299622, + "learning_rate": 
0.000263436841365744, + "loss": 4.8487, + "step": 62520 + }, + { + "epoch": 0.08786051633685017, + "grad_norm": 0.856124222278595, + "learning_rate": 0.0002634789939581284, + "loss": 4.7969, + "step": 62530 + }, + { + "epoch": 0.08787456727501375, + "grad_norm": 0.7902341485023499, + "learning_rate": 0.00026352114655051284, + "loss": 4.7348, + "step": 62540 + }, + { + "epoch": 0.08788861821317732, + "grad_norm": 0.7894250154495239, + "learning_rate": 0.00026356329914289727, + "loss": 4.7934, + "step": 62550 + }, + { + "epoch": 0.0879026691513409, + "grad_norm": 0.7873814702033997, + "learning_rate": 0.0002636054517352817, + "loss": 4.8012, + "step": 62560 + }, + { + "epoch": 0.08791672008950448, + "grad_norm": 0.8066341876983643, + "learning_rate": 0.00026364760432766613, + "loss": 4.8533, + "step": 62570 + }, + { + "epoch": 0.08793077102766805, + "grad_norm": 0.7610378861427307, + "learning_rate": 0.00026368975692005057, + "loss": 4.7305, + "step": 62580 + }, + { + "epoch": 0.08794482196583163, + "grad_norm": 0.8013112545013428, + "learning_rate": 0.000263731909512435, + "loss": 4.7925, + "step": 62590 + }, + { + "epoch": 0.0879588729039952, + "grad_norm": 0.7849205732345581, + "learning_rate": 0.00026377406210481943, + "loss": 4.793, + "step": 62600 + }, + { + "epoch": 0.08797292384215878, + "grad_norm": 0.8190938234329224, + "learning_rate": 0.00026381621469720386, + "loss": 4.7288, + "step": 62610 + }, + { + "epoch": 0.08798697478032236, + "grad_norm": 0.8454349040985107, + "learning_rate": 0.0002638583672895883, + "loss": 4.8616, + "step": 62620 + }, + { + "epoch": 0.08800102571848595, + "grad_norm": 0.7780125737190247, + "learning_rate": 0.0002639005198819727, + "loss": 4.7913, + "step": 62630 + }, + { + "epoch": 0.08801507665664952, + "grad_norm": 0.8072920441627502, + "learning_rate": 0.00026394267247435715, + "loss": 4.8829, + "step": 62640 + }, + { + "epoch": 0.0880291275948131, + "grad_norm": 0.8076897263526917, + "learning_rate": 0.0002639848250667416, 
+ "loss": 4.7353, + "step": 62650 + }, + { + "epoch": 0.08804317853297668, + "grad_norm": 0.7699461579322815, + "learning_rate": 0.000264026977659126, + "loss": 4.8364, + "step": 62660 + }, + { + "epoch": 0.08805722947114025, + "grad_norm": 0.8059861063957214, + "learning_rate": 0.00026406913025151045, + "loss": 4.779, + "step": 62670 + }, + { + "epoch": 0.08807128040930383, + "grad_norm": 0.7961968779563904, + "learning_rate": 0.0002641112828438949, + "loss": 4.8785, + "step": 62680 + }, + { + "epoch": 0.0880853313474674, + "grad_norm": 0.8171684741973877, + "learning_rate": 0.0002641534354362793, + "loss": 4.7068, + "step": 62690 + }, + { + "epoch": 0.08809938228563098, + "grad_norm": 0.7811867594718933, + "learning_rate": 0.00026419558802866374, + "loss": 4.971, + "step": 62700 + }, + { + "epoch": 0.08811343322379456, + "grad_norm": 0.7805197834968567, + "learning_rate": 0.00026423774062104817, + "loss": 4.9694, + "step": 62710 + }, + { + "epoch": 0.08812748416195813, + "grad_norm": 0.8082363605499268, + "learning_rate": 0.0002642798932134326, + "loss": 4.7084, + "step": 62720 + }, + { + "epoch": 0.08814153510012171, + "grad_norm": 0.8304884433746338, + "learning_rate": 0.00026432204580581703, + "loss": 4.7155, + "step": 62730 + }, + { + "epoch": 0.0881555860382853, + "grad_norm": 0.8257015943527222, + "learning_rate": 0.00026436419839820146, + "loss": 4.8118, + "step": 62740 + }, + { + "epoch": 0.08816963697644888, + "grad_norm": 0.7988607883453369, + "learning_rate": 0.0002644063509905859, + "loss": 4.7102, + "step": 62750 + }, + { + "epoch": 0.08818368791461245, + "grad_norm": 0.7881811857223511, + "learning_rate": 0.0002644485035829703, + "loss": 4.8997, + "step": 62760 + }, + { + "epoch": 0.08819773885277603, + "grad_norm": 0.859298825263977, + "learning_rate": 0.00026449065617535476, + "loss": 4.8189, + "step": 62770 + }, + { + "epoch": 0.0882117897909396, + "grad_norm": 0.781113862991333, + "learning_rate": 0.0002645328087677392, + "loss": 4.7552, + 
"step": 62780 + }, + { + "epoch": 0.08822584072910318, + "grad_norm": 0.7915051579475403, + "learning_rate": 0.0002645749613601236, + "loss": 4.7368, + "step": 62790 + }, + { + "epoch": 0.08823989166726676, + "grad_norm": 0.8124852180480957, + "learning_rate": 0.00026461711395250805, + "loss": 4.8497, + "step": 62800 + }, + { + "epoch": 0.08825394260543033, + "grad_norm": 0.8054794073104858, + "learning_rate": 0.0002646592665448925, + "loss": 4.7871, + "step": 62810 + }, + { + "epoch": 0.08826799354359391, + "grad_norm": 0.8467390537261963, + "learning_rate": 0.0002647014191372769, + "loss": 4.6806, + "step": 62820 + }, + { + "epoch": 0.08828204448175749, + "grad_norm": 0.8043726682662964, + "learning_rate": 0.00026474357172966134, + "loss": 4.8889, + "step": 62830 + }, + { + "epoch": 0.08829609541992108, + "grad_norm": 0.7874152660369873, + "learning_rate": 0.0002647857243220458, + "loss": 4.8625, + "step": 62840 + }, + { + "epoch": 0.08831014635808465, + "grad_norm": 0.8490023016929626, + "learning_rate": 0.0002648278769144302, + "loss": 4.7956, + "step": 62850 + }, + { + "epoch": 0.08832419729624823, + "grad_norm": 0.8019155263900757, + "learning_rate": 0.00026487002950681464, + "loss": 4.7038, + "step": 62860 + }, + { + "epoch": 0.0883382482344118, + "grad_norm": 0.7915338277816772, + "learning_rate": 0.00026491218209919907, + "loss": 4.8408, + "step": 62870 + }, + { + "epoch": 0.08835229917257538, + "grad_norm": 0.7774483561515808, + "learning_rate": 0.0002649543346915835, + "loss": 4.8746, + "step": 62880 + }, + { + "epoch": 0.08836635011073896, + "grad_norm": 0.7984682321548462, + "learning_rate": 0.00026499648728396793, + "loss": 4.7794, + "step": 62890 + }, + { + "epoch": 0.08838040104890253, + "grad_norm": 0.803041398525238, + "learning_rate": 0.00026503863987635236, + "loss": 4.7, + "step": 62900 + }, + { + "epoch": 0.08839445198706611, + "grad_norm": 0.7951576113700867, + "learning_rate": 0.0002650807924687368, + "loss": 4.7453, + "step": 62910 + }, + { 
+ "epoch": 0.08840850292522968, + "grad_norm": 0.7813413143157959, + "learning_rate": 0.0002651229450611212, + "loss": 4.7845, + "step": 62920 + }, + { + "epoch": 0.08842255386339326, + "grad_norm": 0.7739083170890808, + "learning_rate": 0.00026516509765350566, + "loss": 4.7668, + "step": 62930 + }, + { + "epoch": 0.08843660480155685, + "grad_norm": 0.8090055584907532, + "learning_rate": 0.0002652072502458901, + "loss": 4.8, + "step": 62940 + }, + { + "epoch": 0.08845065573972043, + "grad_norm": 0.7967948317527771, + "learning_rate": 0.0002652494028382745, + "loss": 4.9125, + "step": 62950 + }, + { + "epoch": 0.088464706677884, + "grad_norm": 0.7919458150863647, + "learning_rate": 0.00026529155543065895, + "loss": 4.8298, + "step": 62960 + }, + { + "epoch": 0.08847875761604758, + "grad_norm": 0.7871581315994263, + "learning_rate": 0.0002653337080230434, + "loss": 4.8399, + "step": 62970 + }, + { + "epoch": 0.08849280855421116, + "grad_norm": 0.7785304188728333, + "learning_rate": 0.0002653758606154278, + "loss": 4.856, + "step": 62980 + }, + { + "epoch": 0.08850685949237473, + "grad_norm": 0.7989861369132996, + "learning_rate": 0.00026541801320781224, + "loss": 4.7617, + "step": 62990 + }, + { + "epoch": 0.08852091043053831, + "grad_norm": 0.7797976732254028, + "learning_rate": 0.0002654601658001967, + "loss": 4.9164, + "step": 63000 + }, + { + "epoch": 0.08853496136870188, + "grad_norm": 0.766815721988678, + "learning_rate": 0.0002655023183925811, + "loss": 4.809, + "step": 63010 + }, + { + "epoch": 0.08854901230686546, + "grad_norm": 0.8306536078453064, + "learning_rate": 0.00026554447098496554, + "loss": 4.8476, + "step": 63020 + }, + { + "epoch": 0.08856306324502904, + "grad_norm": 0.8039864897727966, + "learning_rate": 0.00026558662357735, + "loss": 4.7684, + "step": 63030 + }, + { + "epoch": 0.08857711418319261, + "grad_norm": 0.8326762914657593, + "learning_rate": 0.0002656287761697344, + "loss": 4.7807, + "step": 63040 + }, + { + "epoch": 
0.0885911651213562, + "grad_norm": 0.7910907864570618, + "learning_rate": 0.00026567092876211883, + "loss": 4.7499, + "step": 63050 + }, + { + "epoch": 0.08860521605951978, + "grad_norm": 0.7897930145263672, + "learning_rate": 0.00026571308135450326, + "loss": 4.8467, + "step": 63060 + }, + { + "epoch": 0.08861926699768335, + "grad_norm": 0.7836660146713257, + "learning_rate": 0.0002657552339468877, + "loss": 4.7958, + "step": 63070 + }, + { + "epoch": 0.08863331793584693, + "grad_norm": 0.7851306796073914, + "learning_rate": 0.0002657973865392721, + "loss": 4.7329, + "step": 63080 + }, + { + "epoch": 0.08864736887401051, + "grad_norm": 0.7947384715080261, + "learning_rate": 0.0002658395391316566, + "loss": 4.8723, + "step": 63090 + }, + { + "epoch": 0.08866141981217408, + "grad_norm": 0.8200597167015076, + "learning_rate": 0.000265881691724041, + "loss": 4.8698, + "step": 63100 + }, + { + "epoch": 0.08867547075033766, + "grad_norm": 0.7728275060653687, + "learning_rate": 0.0002659238443164254, + "loss": 4.7902, + "step": 63110 + }, + { + "epoch": 0.08868952168850124, + "grad_norm": 0.7961565852165222, + "learning_rate": 0.00026596599690880985, + "loss": 4.736, + "step": 63120 + }, + { + "epoch": 0.08870357262666481, + "grad_norm": 0.7639411091804504, + "learning_rate": 0.0002660081495011943, + "loss": 4.9944, + "step": 63130 + }, + { + "epoch": 0.08871762356482839, + "grad_norm": 0.8078753352165222, + "learning_rate": 0.0002660503020935787, + "loss": 4.7914, + "step": 63140 + }, + { + "epoch": 0.08873167450299198, + "grad_norm": 0.8076030611991882, + "learning_rate": 0.0002660924546859632, + "loss": 4.9144, + "step": 63150 + }, + { + "epoch": 0.08874572544115555, + "grad_norm": 0.8135642409324646, + "learning_rate": 0.00026613460727834763, + "loss": 4.7739, + "step": 63160 + }, + { + "epoch": 0.08875977637931913, + "grad_norm": 0.81586754322052, + "learning_rate": 0.000266176759870732, + "loss": 4.7433, + "step": 63170 + }, + { + "epoch": 0.0887738273174827, + 
"grad_norm": 0.8089296817779541, + "learning_rate": 0.00026621891246311644, + "loss": 4.8924, + "step": 63180 + }, + { + "epoch": 0.08878787825564628, + "grad_norm": 0.7918955683708191, + "learning_rate": 0.00026626106505550087, + "loss": 4.7475, + "step": 63190 + }, + { + "epoch": 0.08880192919380986, + "grad_norm": 0.7879622578620911, + "learning_rate": 0.00026630321764788535, + "loss": 4.8674, + "step": 63200 + }, + { + "epoch": 0.08881598013197343, + "grad_norm": 0.7970558404922485, + "learning_rate": 0.0002663453702402698, + "loss": 4.8004, + "step": 63210 + }, + { + "epoch": 0.08883003107013701, + "grad_norm": 0.7629973292350769, + "learning_rate": 0.0002663875228326542, + "loss": 4.715, + "step": 63220 + }, + { + "epoch": 0.08884408200830059, + "grad_norm": 0.8390489816665649, + "learning_rate": 0.0002664296754250386, + "loss": 4.7549, + "step": 63230 + }, + { + "epoch": 0.08885813294646416, + "grad_norm": 0.7946515679359436, + "learning_rate": 0.000266471828017423, + "loss": 4.846, + "step": 63240 + }, + { + "epoch": 0.08887218388462775, + "grad_norm": 0.790104866027832, + "learning_rate": 0.00026651398060980745, + "loss": 5.0164, + "step": 63250 + }, + { + "epoch": 0.08888623482279133, + "grad_norm": 0.7630163431167603, + "learning_rate": 0.00026655613320219194, + "loss": 4.895, + "step": 63260 + }, + { + "epoch": 0.0889002857609549, + "grad_norm": 0.8079057931900024, + "learning_rate": 0.00026659828579457637, + "loss": 4.8883, + "step": 63270 + }, + { + "epoch": 0.08891433669911848, + "grad_norm": 0.7824141383171082, + "learning_rate": 0.0002666404383869608, + "loss": 4.7367, + "step": 63280 + }, + { + "epoch": 0.08892838763728206, + "grad_norm": 1.028652310371399, + "learning_rate": 0.0002666825909793452, + "loss": 4.8906, + "step": 63290 + }, + { + "epoch": 0.08894243857544563, + "grad_norm": 0.8163354396820068, + "learning_rate": 0.0002667247435717296, + "loss": 4.8534, + "step": 63300 + }, + { + "epoch": 0.08895648951360921, + "grad_norm": 
0.7730701565742493, + "learning_rate": 0.00026676689616411404, + "loss": 4.7245, + "step": 63310 + }, + { + "epoch": 0.08897054045177279, + "grad_norm": 0.7872650623321533, + "learning_rate": 0.0002668090487564985, + "loss": 4.799, + "step": 63320 + }, + { + "epoch": 0.08898459138993636, + "grad_norm": 0.7956480979919434, + "learning_rate": 0.00026685120134888296, + "loss": 4.8097, + "step": 63330 + }, + { + "epoch": 0.08899864232809994, + "grad_norm": 0.7865951061248779, + "learning_rate": 0.0002668933539412674, + "loss": 4.9028, + "step": 63340 + }, + { + "epoch": 0.08901269326626352, + "grad_norm": 0.7755113244056702, + "learning_rate": 0.0002669355065336518, + "loss": 4.8193, + "step": 63350 + }, + { + "epoch": 0.0890267442044271, + "grad_norm": 0.7839175462722778, + "learning_rate": 0.0002669776591260362, + "loss": 4.7572, + "step": 63360 + }, + { + "epoch": 0.08904079514259068, + "grad_norm": 0.801013708114624, + "learning_rate": 0.00026701981171842063, + "loss": 4.7782, + "step": 63370 + }, + { + "epoch": 0.08905484608075426, + "grad_norm": 0.8248993754386902, + "learning_rate": 0.0002670619643108051, + "loss": 4.8563, + "step": 63380 + }, + { + "epoch": 0.08906889701891783, + "grad_norm": 0.8062582612037659, + "learning_rate": 0.00026710411690318954, + "loss": 4.8467, + "step": 63390 + }, + { + "epoch": 0.08908294795708141, + "grad_norm": 0.7893478274345398, + "learning_rate": 0.000267146269495574, + "loss": 4.8678, + "step": 63400 + }, + { + "epoch": 0.08909699889524499, + "grad_norm": 0.8045288920402527, + "learning_rate": 0.0002671884220879584, + "loss": 4.7405, + "step": 63410 + }, + { + "epoch": 0.08911104983340856, + "grad_norm": 0.7915014028549194, + "learning_rate": 0.0002672305746803428, + "loss": 4.7565, + "step": 63420 + }, + { + "epoch": 0.08912510077157214, + "grad_norm": 0.8055747747421265, + "learning_rate": 0.0002672727272727272, + "loss": 4.8634, + "step": 63430 + }, + { + "epoch": 0.08913915170973571, + "grad_norm": 0.8131148815155029, + 
"learning_rate": 0.0002673148798651117, + "loss": 4.7419, + "step": 63440 + }, + { + "epoch": 0.08915320264789929, + "grad_norm": 0.7881912589073181, + "learning_rate": 0.00026735703245749613, + "loss": 4.8812, + "step": 63450 + }, + { + "epoch": 0.08916725358606288, + "grad_norm": 0.7696599960327148, + "learning_rate": 0.00026739918504988056, + "loss": 4.7284, + "step": 63460 + }, + { + "epoch": 0.08918130452422646, + "grad_norm": 0.7833225727081299, + "learning_rate": 0.000267441337642265, + "loss": 4.8861, + "step": 63470 + }, + { + "epoch": 0.08919535546239003, + "grad_norm": 0.7945125102996826, + "learning_rate": 0.0002674834902346494, + "loss": 4.8175, + "step": 63480 + }, + { + "epoch": 0.08920940640055361, + "grad_norm": 0.8102738857269287, + "learning_rate": 0.0002675256428270338, + "loss": 4.8052, + "step": 63490 + }, + { + "epoch": 0.08922345733871719, + "grad_norm": 0.7833125591278076, + "learning_rate": 0.0002675677954194183, + "loss": 4.8178, + "step": 63500 + }, + { + "epoch": 0.08923750827688076, + "grad_norm": 0.7774162888526917, + "learning_rate": 0.0002676099480118027, + "loss": 4.9522, + "step": 63510 + }, + { + "epoch": 0.08925155921504434, + "grad_norm": 0.7914357781410217, + "learning_rate": 0.00026765210060418715, + "loss": 4.7583, + "step": 63520 + }, + { + "epoch": 0.08926561015320791, + "grad_norm": 0.780647337436676, + "learning_rate": 0.0002676942531965716, + "loss": 4.7975, + "step": 63530 + }, + { + "epoch": 0.08927966109137149, + "grad_norm": 0.7892720699310303, + "learning_rate": 0.000267736405788956, + "loss": 4.8141, + "step": 63540 + }, + { + "epoch": 0.08929371202953507, + "grad_norm": 0.7910761833190918, + "learning_rate": 0.0002677785583813404, + "loss": 4.7316, + "step": 63550 + }, + { + "epoch": 0.08930776296769866, + "grad_norm": 0.7864344716072083, + "learning_rate": 0.0002678207109737249, + "loss": 4.6778, + "step": 63560 + }, + { + "epoch": 0.08932181390586223, + "grad_norm": 0.8073357343673706, + "learning_rate": 
0.0002678628635661093, + "loss": 4.7731, + "step": 63570 + }, + { + "epoch": 0.08933586484402581, + "grad_norm": 0.784953236579895, + "learning_rate": 0.00026790501615849374, + "loss": 4.7903, + "step": 63580 + }, + { + "epoch": 0.08934991578218938, + "grad_norm": 0.7666168808937073, + "learning_rate": 0.00026794716875087817, + "loss": 4.8284, + "step": 63590 + }, + { + "epoch": 0.08936396672035296, + "grad_norm": 0.8397361636161804, + "learning_rate": 0.0002679893213432626, + "loss": 4.872, + "step": 63600 + }, + { + "epoch": 0.08937801765851654, + "grad_norm": 0.801026463508606, + "learning_rate": 0.00026803147393564703, + "loss": 4.9165, + "step": 63610 + }, + { + "epoch": 0.08939206859668011, + "grad_norm": 0.7671746015548706, + "learning_rate": 0.00026807362652803146, + "loss": 4.9064, + "step": 63620 + }, + { + "epoch": 0.08940611953484369, + "grad_norm": 0.791368305683136, + "learning_rate": 0.0002681157791204159, + "loss": 4.8467, + "step": 63630 + }, + { + "epoch": 0.08942017047300727, + "grad_norm": 0.7889533638954163, + "learning_rate": 0.0002681579317128003, + "loss": 4.7742, + "step": 63640 + }, + { + "epoch": 0.08943422141117084, + "grad_norm": 0.8022910356521606, + "learning_rate": 0.00026820008430518475, + "loss": 4.7715, + "step": 63650 + }, + { + "epoch": 0.08944827234933442, + "grad_norm": 0.8055729866027832, + "learning_rate": 0.0002682422368975692, + "loss": 4.8834, + "step": 63660 + }, + { + "epoch": 0.08946232328749801, + "grad_norm": 0.8192464709281921, + "learning_rate": 0.0002682843894899536, + "loss": 4.8719, + "step": 63670 + }, + { + "epoch": 0.08947637422566158, + "grad_norm": 0.7914919257164001, + "learning_rate": 0.00026832654208233805, + "loss": 4.8239, + "step": 63680 + }, + { + "epoch": 0.08949042516382516, + "grad_norm": 0.7857105135917664, + "learning_rate": 0.0002683686946747225, + "loss": 4.798, + "step": 63690 + }, + { + "epoch": 0.08950447610198874, + "grad_norm": 0.7843837738037109, + "learning_rate": 0.0002684108472671069, 
+ "loss": 4.8482, + "step": 63700 + }, + { + "epoch": 0.08951852704015231, + "grad_norm": 0.7536832690238953, + "learning_rate": 0.00026845299985949134, + "loss": 4.9153, + "step": 63710 + }, + { + "epoch": 0.08953257797831589, + "grad_norm": 0.8098011016845703, + "learning_rate": 0.00026849515245187577, + "loss": 4.7243, + "step": 63720 + }, + { + "epoch": 0.08954662891647946, + "grad_norm": 0.7772601842880249, + "learning_rate": 0.0002685373050442602, + "loss": 4.7708, + "step": 63730 + }, + { + "epoch": 0.08956067985464304, + "grad_norm": 0.7769831418991089, + "learning_rate": 0.00026857945763664464, + "loss": 4.8495, + "step": 63740 + }, + { + "epoch": 0.08957473079280662, + "grad_norm": 0.7833921909332275, + "learning_rate": 0.00026862161022902907, + "loss": 4.7753, + "step": 63750 + }, + { + "epoch": 0.0895887817309702, + "grad_norm": 0.7920560836791992, + "learning_rate": 0.0002686637628214135, + "loss": 4.892, + "step": 63760 + }, + { + "epoch": 0.08960283266913378, + "grad_norm": 0.7888614535331726, + "learning_rate": 0.00026870591541379793, + "loss": 4.8269, + "step": 63770 + }, + { + "epoch": 0.08961688360729736, + "grad_norm": 0.7918998599052429, + "learning_rate": 0.00026874806800618236, + "loss": 4.7976, + "step": 63780 + }, + { + "epoch": 0.08963093454546094, + "grad_norm": 0.8044359683990479, + "learning_rate": 0.0002687902205985668, + "loss": 4.6607, + "step": 63790 + }, + { + "epoch": 0.08964498548362451, + "grad_norm": 0.8149663209915161, + "learning_rate": 0.0002688323731909512, + "loss": 4.9424, + "step": 63800 + }, + { + "epoch": 0.08965903642178809, + "grad_norm": 0.8094214797019958, + "learning_rate": 0.00026887452578333565, + "loss": 4.7464, + "step": 63810 + }, + { + "epoch": 0.08967308735995166, + "grad_norm": 0.8028861284255981, + "learning_rate": 0.0002689166783757201, + "loss": 4.7546, + "step": 63820 + }, + { + "epoch": 0.08968713829811524, + "grad_norm": 0.7879270315170288, + "learning_rate": 0.0002689588309681045, + "loss": 4.7557, 
+ "step": 63830 + }, + { + "epoch": 0.08970118923627882, + "grad_norm": 0.802869439125061, + "learning_rate": 0.00026900098356048895, + "loss": 4.7707, + "step": 63840 + }, + { + "epoch": 0.08971524017444239, + "grad_norm": 0.825872004032135, + "learning_rate": 0.0002690431361528734, + "loss": 4.7376, + "step": 63850 + }, + { + "epoch": 0.08972929111260597, + "grad_norm": 0.7914631366729736, + "learning_rate": 0.0002690852887452578, + "loss": 4.7492, + "step": 63860 + }, + { + "epoch": 0.08974334205076956, + "grad_norm": 0.8055576682090759, + "learning_rate": 0.00026912744133764224, + "loss": 4.8859, + "step": 63870 + }, + { + "epoch": 0.08975739298893314, + "grad_norm": 0.7824854254722595, + "learning_rate": 0.00026916959393002667, + "loss": 4.6911, + "step": 63880 + }, + { + "epoch": 0.08977144392709671, + "grad_norm": 0.7842464447021484, + "learning_rate": 0.0002692117465224111, + "loss": 4.8765, + "step": 63890 + }, + { + "epoch": 0.08978549486526029, + "grad_norm": 0.7833605408668518, + "learning_rate": 0.00026925389911479553, + "loss": 4.9597, + "step": 63900 + }, + { + "epoch": 0.08979954580342386, + "grad_norm": 0.8475800156593323, + "learning_rate": 0.00026929605170717997, + "loss": 4.8712, + "step": 63910 + }, + { + "epoch": 0.08981359674158744, + "grad_norm": 0.7852799296379089, + "learning_rate": 0.0002693382042995644, + "loss": 4.7098, + "step": 63920 + }, + { + "epoch": 0.08982764767975102, + "grad_norm": 0.7918947339057922, + "learning_rate": 0.00026938035689194883, + "loss": 4.7586, + "step": 63930 + }, + { + "epoch": 0.08984169861791459, + "grad_norm": 0.7760900855064392, + "learning_rate": 0.00026942250948433326, + "loss": 4.6896, + "step": 63940 + }, + { + "epoch": 0.08985574955607817, + "grad_norm": 0.7669195532798767, + "learning_rate": 0.0002694646620767177, + "loss": 4.7557, + "step": 63950 + }, + { + "epoch": 0.08986980049424174, + "grad_norm": 0.7910531163215637, + "learning_rate": 0.0002695068146691021, + "loss": 4.7617, + "step": 63960 + 
}, + { + "epoch": 0.08988385143240532, + "grad_norm": 0.7762371301651001, + "learning_rate": 0.00026954896726148655, + "loss": 4.8493, + "step": 63970 + }, + { + "epoch": 0.08989790237056891, + "grad_norm": 0.7670621275901794, + "learning_rate": 0.000269591119853871, + "loss": 4.8319, + "step": 63980 + }, + { + "epoch": 0.08991195330873249, + "grad_norm": 0.7959233522415161, + "learning_rate": 0.0002696332724462554, + "loss": 4.8357, + "step": 63990 + }, + { + "epoch": 0.08992600424689606, + "grad_norm": 0.7903863191604614, + "learning_rate": 0.00026967542503863985, + "loss": 4.9134, + "step": 64000 + }, + { + "epoch": 0.08994005518505964, + "grad_norm": 0.7668089866638184, + "learning_rate": 0.0002697175776310243, + "loss": 4.8917, + "step": 64010 + }, + { + "epoch": 0.08995410612322322, + "grad_norm": 0.7648235559463501, + "learning_rate": 0.0002697597302234087, + "loss": 4.7716, + "step": 64020 + }, + { + "epoch": 0.08996815706138679, + "grad_norm": 0.7938455939292908, + "learning_rate": 0.00026980188281579314, + "loss": 4.8386, + "step": 64030 + }, + { + "epoch": 0.08998220799955037, + "grad_norm": 0.7752029895782471, + "learning_rate": 0.00026984403540817757, + "loss": 4.7078, + "step": 64040 + }, + { + "epoch": 0.08999625893771394, + "grad_norm": 0.7659651041030884, + "learning_rate": 0.000269886188000562, + "loss": 4.83, + "step": 64050 + }, + { + "epoch": 0.09001030987587752, + "grad_norm": 0.7914566993713379, + "learning_rate": 0.00026992834059294643, + "loss": 4.8551, + "step": 64060 + }, + { + "epoch": 0.0900243608140411, + "grad_norm": 0.7842085957527161, + "learning_rate": 0.00026997049318533086, + "loss": 4.7521, + "step": 64070 + }, + { + "epoch": 0.09003841175220469, + "grad_norm": 0.8064524531364441, + "learning_rate": 0.0002700126457777153, + "loss": 4.6835, + "step": 64080 + }, + { + "epoch": 0.09005246269036826, + "grad_norm": 0.7767420411109924, + "learning_rate": 0.0002700547983700997, + "loss": 4.8526, + "step": 64090 + }, + { + "epoch": 
0.09006651362853184, + "grad_norm": 0.8287174105644226, + "learning_rate": 0.00027009695096248416, + "loss": 4.8894, + "step": 64100 + }, + { + "epoch": 0.09008056456669541, + "grad_norm": 0.7760927677154541, + "learning_rate": 0.0002701391035548686, + "loss": 4.8314, + "step": 64110 + }, + { + "epoch": 0.09009461550485899, + "grad_norm": 0.7992680072784424, + "learning_rate": 0.000270181256147253, + "loss": 4.6976, + "step": 64120 + }, + { + "epoch": 0.09010866644302257, + "grad_norm": 0.7589333057403564, + "learning_rate": 0.00027022340873963745, + "loss": 4.7346, + "step": 64130 + }, + { + "epoch": 0.09012271738118614, + "grad_norm": 0.8577151298522949, + "learning_rate": 0.0002702655613320219, + "loss": 4.8193, + "step": 64140 + }, + { + "epoch": 0.09013676831934972, + "grad_norm": 0.8156781792640686, + "learning_rate": 0.0002703077139244063, + "loss": 4.8104, + "step": 64150 + }, + { + "epoch": 0.0901508192575133, + "grad_norm": 0.780096709728241, + "learning_rate": 0.00027034986651679074, + "loss": 4.8663, + "step": 64160 + }, + { + "epoch": 0.09016487019567687, + "grad_norm": 0.798434853553772, + "learning_rate": 0.0002703920191091752, + "loss": 4.971, + "step": 64170 + }, + { + "epoch": 0.09017892113384046, + "grad_norm": 0.7590034008026123, + "learning_rate": 0.0002704341717015596, + "loss": 4.7796, + "step": 64180 + }, + { + "epoch": 0.09019297207200404, + "grad_norm": 0.7929971814155579, + "learning_rate": 0.0002704763242939441, + "loss": 4.8087, + "step": 64190 + }, + { + "epoch": 0.09020702301016761, + "grad_norm": 0.8228213787078857, + "learning_rate": 0.00027051847688632847, + "loss": 4.7715, + "step": 64200 + }, + { + "epoch": 0.09022107394833119, + "grad_norm": 0.7692803144454956, + "learning_rate": 0.0002705606294787129, + "loss": 4.7081, + "step": 64210 + }, + { + "epoch": 0.09023512488649477, + "grad_norm": 0.7841417789459229, + "learning_rate": 0.00027060278207109733, + "loss": 4.8686, + "step": 64220 + }, + { + "epoch": 0.09024917582465834, + 
"grad_norm": 0.7873497009277344, + "learning_rate": 0.00027064493466348176, + "loss": 4.8794, + "step": 64230 + }, + { + "epoch": 0.09026322676282192, + "grad_norm": 0.7846882343292236, + "learning_rate": 0.0002706870872558662, + "loss": 4.671, + "step": 64240 + }, + { + "epoch": 0.0902772777009855, + "grad_norm": 0.7908580303192139, + "learning_rate": 0.0002707292398482507, + "loss": 4.7099, + "step": 64250 + }, + { + "epoch": 0.09029132863914907, + "grad_norm": 0.8587797284126282, + "learning_rate": 0.00027077139244063506, + "loss": 4.7138, + "step": 64260 + }, + { + "epoch": 0.09030537957731265, + "grad_norm": 0.8411611914634705, + "learning_rate": 0.0002708135450330195, + "loss": 4.8517, + "step": 64270 + }, + { + "epoch": 0.09031943051547622, + "grad_norm": 0.8374800086021423, + "learning_rate": 0.0002708556976254039, + "loss": 4.7404, + "step": 64280 + }, + { + "epoch": 0.09033348145363981, + "grad_norm": 0.74876469373703, + "learning_rate": 0.00027089785021778835, + "loss": 4.8666, + "step": 64290 + }, + { + "epoch": 0.09034753239180339, + "grad_norm": 0.7894213199615479, + "learning_rate": 0.0002709400028101728, + "loss": 4.6872, + "step": 64300 + }, + { + "epoch": 0.09036158332996697, + "grad_norm": 0.7926090955734253, + "learning_rate": 0.00027098215540255727, + "loss": 4.7816, + "step": 64310 + }, + { + "epoch": 0.09037563426813054, + "grad_norm": 0.7763543725013733, + "learning_rate": 0.0002710243079949417, + "loss": 4.8655, + "step": 64320 + }, + { + "epoch": 0.09038968520629412, + "grad_norm": 0.7839246988296509, + "learning_rate": 0.0002710664605873261, + "loss": 4.9012, + "step": 64330 + }, + { + "epoch": 0.0904037361444577, + "grad_norm": 0.8784212470054626, + "learning_rate": 0.0002711086131797105, + "loss": 4.8065, + "step": 64340 + }, + { + "epoch": 0.09041778708262127, + "grad_norm": 0.7844157218933105, + "learning_rate": 0.00027115076577209494, + "loss": 4.7653, + "step": 64350 + }, + { + "epoch": 0.09043183802078485, + "grad_norm": 
0.7894060015678406, + "learning_rate": 0.00027119291836447937, + "loss": 4.7516, + "step": 64360 + }, + { + "epoch": 0.09044588895894842, + "grad_norm": 0.7836933732032776, + "learning_rate": 0.00027123085569762535, + "loss": 4.7689, + "step": 64370 + }, + { + "epoch": 0.090459939897112, + "grad_norm": 0.7689288854598999, + "learning_rate": 0.0002712730082900098, + "loss": 4.8232, + "step": 64380 + }, + { + "epoch": 0.09047399083527559, + "grad_norm": 0.83738112449646, + "learning_rate": 0.00027131516088239427, + "loss": 4.8224, + "step": 64390 + }, + { + "epoch": 0.09048804177343917, + "grad_norm": 0.7634382843971252, + "learning_rate": 0.0002713573134747787, + "loss": 4.9363, + "step": 64400 + }, + { + "epoch": 0.09050209271160274, + "grad_norm": 0.7892853617668152, + "learning_rate": 0.00027139946606716313, + "loss": 4.8402, + "step": 64410 + }, + { + "epoch": 0.09051614364976632, + "grad_norm": 0.7666093111038208, + "learning_rate": 0.0002714416186595475, + "loss": 4.8348, + "step": 64420 + }, + { + "epoch": 0.0905301945879299, + "grad_norm": 0.780320405960083, + "learning_rate": 0.00027148377125193194, + "loss": 4.695, + "step": 64430 + }, + { + "epoch": 0.09054424552609347, + "grad_norm": 0.7658125758171082, + "learning_rate": 0.00027152592384431637, + "loss": 4.8615, + "step": 64440 + }, + { + "epoch": 0.09055829646425705, + "grad_norm": 0.7964998483657837, + "learning_rate": 0.00027156807643670085, + "loss": 4.7311, + "step": 64450 + }, + { + "epoch": 0.09057234740242062, + "grad_norm": 0.7794137001037598, + "learning_rate": 0.0002716102290290853, + "loss": 4.8909, + "step": 64460 + }, + { + "epoch": 0.0905863983405842, + "grad_norm": 0.763118326663971, + "learning_rate": 0.0002716523816214697, + "loss": 4.6842, + "step": 64470 + }, + { + "epoch": 0.09060044927874777, + "grad_norm": 0.758752167224884, + "learning_rate": 0.00027169453421385415, + "loss": 4.7289, + "step": 64480 + }, + { + "epoch": 0.09061450021691136, + "grad_norm": 0.8015297055244446, + 
"learning_rate": 0.0002717366868062385, + "loss": 4.8431, + "step": 64490 + }, + { + "epoch": 0.09062855115507494, + "grad_norm": 0.7830663323402405, + "learning_rate": 0.000271778839398623, + "loss": 4.8457, + "step": 64500 + }, + { + "epoch": 0.09064260209323852, + "grad_norm": 0.7865242958068848, + "learning_rate": 0.00027182099199100744, + "loss": 4.7674, + "step": 64510 + }, + { + "epoch": 0.09065665303140209, + "grad_norm": 0.829883337020874, + "learning_rate": 0.00027186314458339187, + "loss": 4.7804, + "step": 64520 + }, + { + "epoch": 0.09067070396956567, + "grad_norm": 0.7968289852142334, + "learning_rate": 0.0002719052971757763, + "loss": 4.8857, + "step": 64530 + }, + { + "epoch": 0.09068475490772925, + "grad_norm": 0.7666770815849304, + "learning_rate": 0.00027194744976816073, + "loss": 4.7485, + "step": 64540 + }, + { + "epoch": 0.09069880584589282, + "grad_norm": 0.7687329053878784, + "learning_rate": 0.0002719896023605451, + "loss": 4.8631, + "step": 64550 + }, + { + "epoch": 0.0907128567840564, + "grad_norm": 0.8181332945823669, + "learning_rate": 0.0002720317549529296, + "loss": 4.8318, + "step": 64560 + }, + { + "epoch": 0.09072690772221997, + "grad_norm": 0.7772801518440247, + "learning_rate": 0.00027207390754531403, + "loss": 4.8091, + "step": 64570 + }, + { + "epoch": 0.09074095866038355, + "grad_norm": 0.7849490642547607, + "learning_rate": 0.00027211606013769846, + "loss": 4.8445, + "step": 64580 + }, + { + "epoch": 0.09075500959854713, + "grad_norm": 0.7753223180770874, + "learning_rate": 0.0002721582127300829, + "loss": 4.9714, + "step": 64590 + }, + { + "epoch": 0.09076906053671072, + "grad_norm": 0.7721008062362671, + "learning_rate": 0.0002722003653224673, + "loss": 4.8823, + "step": 64600 + }, + { + "epoch": 0.09078311147487429, + "grad_norm": 0.7952477335929871, + "learning_rate": 0.00027224251791485175, + "loss": 4.8502, + "step": 64610 + }, + { + "epoch": 0.09079716241303787, + "grad_norm": 0.7919043302536011, + "learning_rate": 
0.0002722846705072362, + "loss": 4.757, + "step": 64620 + }, + { + "epoch": 0.09081121335120144, + "grad_norm": 0.7945954203605652, + "learning_rate": 0.0002723268230996206, + "loss": 4.8094, + "step": 64630 + }, + { + "epoch": 0.09082526428936502, + "grad_norm": 0.768685519695282, + "learning_rate": 0.00027236897569200505, + "loss": 4.796, + "step": 64640 + }, + { + "epoch": 0.0908393152275286, + "grad_norm": 0.7738785743713379, + "learning_rate": 0.0002724111282843895, + "loss": 4.8588, + "step": 64650 + }, + { + "epoch": 0.09085336616569217, + "grad_norm": 0.76154625415802, + "learning_rate": 0.0002724532808767739, + "loss": 4.8627, + "step": 64660 + }, + { + "epoch": 0.09086741710385575, + "grad_norm": 0.7614012956619263, + "learning_rate": 0.00027249543346915834, + "loss": 4.9136, + "step": 64670 + }, + { + "epoch": 0.09088146804201933, + "grad_norm": 0.7839372754096985, + "learning_rate": 0.00027253758606154277, + "loss": 4.8921, + "step": 64680 + }, + { + "epoch": 0.0908955189801829, + "grad_norm": 0.8074344396591187, + "learning_rate": 0.0002725797386539272, + "loss": 4.8253, + "step": 64690 + }, + { + "epoch": 0.09090956991834649, + "grad_norm": 0.7729949951171875, + "learning_rate": 0.00027262189124631163, + "loss": 4.6527, + "step": 64700 + }, + { + "epoch": 0.09092362085651007, + "grad_norm": 0.7652543187141418, + "learning_rate": 0.00027266404383869606, + "loss": 4.7971, + "step": 64710 + }, + { + "epoch": 0.09093767179467364, + "grad_norm": 0.7983514070510864, + "learning_rate": 0.0002727061964310805, + "loss": 4.7203, + "step": 64720 + }, + { + "epoch": 0.09095172273283722, + "grad_norm": 0.7539568543434143, + "learning_rate": 0.0002727483490234649, + "loss": 4.8282, + "step": 64730 + }, + { + "epoch": 0.0909657736710008, + "grad_norm": 0.7831068634986877, + "learning_rate": 0.00027279050161584936, + "loss": 4.7831, + "step": 64740 + }, + { + "epoch": 0.09097982460916437, + "grad_norm": 0.7707744836807251, + "learning_rate": 0.0002728326542082338, + 
"loss": 4.8327, + "step": 64750 + }, + { + "epoch": 0.09099387554732795, + "grad_norm": 0.7709292769432068, + "learning_rate": 0.0002728748068006182, + "loss": 4.7731, + "step": 64760 + }, + { + "epoch": 0.09100792648549152, + "grad_norm": 0.8126934766769409, + "learning_rate": 0.00027291695939300265, + "loss": 4.7199, + "step": 64770 + }, + { + "epoch": 0.0910219774236551, + "grad_norm": 0.7724310755729675, + "learning_rate": 0.0002729591119853871, + "loss": 4.9449, + "step": 64780 + }, + { + "epoch": 0.09103602836181868, + "grad_norm": 0.787074089050293, + "learning_rate": 0.0002730012645777715, + "loss": 4.8887, + "step": 64790 + }, + { + "epoch": 0.09105007929998227, + "grad_norm": 0.8085893988609314, + "learning_rate": 0.00027304341717015594, + "loss": 4.6988, + "step": 64800 + }, + { + "epoch": 0.09106413023814584, + "grad_norm": 0.7805407047271729, + "learning_rate": 0.0002730855697625404, + "loss": 4.752, + "step": 64810 + }, + { + "epoch": 0.09107818117630942, + "grad_norm": 0.8128443360328674, + "learning_rate": 0.0002731277223549248, + "loss": 4.8374, + "step": 64820 + }, + { + "epoch": 0.091092232114473, + "grad_norm": 0.745376467704773, + "learning_rate": 0.00027316987494730924, + "loss": 4.684, + "step": 64830 + }, + { + "epoch": 0.09110628305263657, + "grad_norm": 0.7663626670837402, + "learning_rate": 0.00027321202753969367, + "loss": 4.897, + "step": 64840 + }, + { + "epoch": 0.09112033399080015, + "grad_norm": 0.7975637316703796, + "learning_rate": 0.0002732541801320781, + "loss": 4.8247, + "step": 64850 + }, + { + "epoch": 0.09113438492896372, + "grad_norm": 0.7657324075698853, + "learning_rate": 0.00027329633272446253, + "loss": 4.7546, + "step": 64860 + }, + { + "epoch": 0.0911484358671273, + "grad_norm": 0.7604333162307739, + "learning_rate": 0.00027333848531684696, + "loss": 4.6362, + "step": 64870 + }, + { + "epoch": 0.09116248680529088, + "grad_norm": 0.8322310447692871, + "learning_rate": 0.0002733806379092314, + "loss": 4.7038, + "step": 
64880 + }, + { + "epoch": 0.09117653774345445, + "grad_norm": 0.7785294055938721, + "learning_rate": 0.0002734227905016158, + "loss": 4.7769, + "step": 64890 + }, + { + "epoch": 0.09119058868161803, + "grad_norm": 0.763119637966156, + "learning_rate": 0.00027346494309400026, + "loss": 4.8997, + "step": 64900 + }, + { + "epoch": 0.09120463961978162, + "grad_norm": 0.7625258564949036, + "learning_rate": 0.0002735070956863847, + "loss": 4.8272, + "step": 64910 + }, + { + "epoch": 0.0912186905579452, + "grad_norm": 0.777725100517273, + "learning_rate": 0.0002735492482787691, + "loss": 4.7371, + "step": 64920 + }, + { + "epoch": 0.09123274149610877, + "grad_norm": 0.8185275793075562, + "learning_rate": 0.00027359140087115355, + "loss": 4.7327, + "step": 64930 + }, + { + "epoch": 0.09124679243427235, + "grad_norm": 0.7645296454429626, + "learning_rate": 0.000273633553463538, + "loss": 4.7193, + "step": 64940 + }, + { + "epoch": 0.09126084337243592, + "grad_norm": 0.7671161890029907, + "learning_rate": 0.0002736757060559224, + "loss": 4.8717, + "step": 64950 + }, + { + "epoch": 0.0912748943105995, + "grad_norm": 0.7883368730545044, + "learning_rate": 0.00027371785864830684, + "loss": 4.8019, + "step": 64960 + }, + { + "epoch": 0.09128894524876308, + "grad_norm": 0.7796012759208679, + "learning_rate": 0.0002737600112406913, + "loss": 4.8924, + "step": 64970 + }, + { + "epoch": 0.09130299618692665, + "grad_norm": 0.7437483072280884, + "learning_rate": 0.0002738021638330757, + "loss": 4.7397, + "step": 64980 + }, + { + "epoch": 0.09131704712509023, + "grad_norm": 0.7617444396018982, + "learning_rate": 0.00027384431642546014, + "loss": 4.7948, + "step": 64990 + }, + { + "epoch": 0.0913310980632538, + "grad_norm": 0.7927044034004211, + "learning_rate": 0.00027388646901784457, + "loss": 4.7253, + "step": 65000 + }, + { + "epoch": 0.0913451490014174, + "grad_norm": 0.8112998604774475, + "learning_rate": 0.000273928621610229, + "loss": 4.7289, + "step": 65010 + }, + { + "epoch": 
0.09135919993958097, + "grad_norm": 0.7460951805114746, + "learning_rate": 0.00027397077420261343, + "loss": 4.6933, + "step": 65020 + }, + { + "epoch": 0.09137325087774455, + "grad_norm": 0.7493196129798889, + "learning_rate": 0.00027401292679499786, + "loss": 4.8195, + "step": 65030 + }, + { + "epoch": 0.09138730181590812, + "grad_norm": 0.7835866808891296, + "learning_rate": 0.0002740550793873823, + "loss": 4.7021, + "step": 65040 + }, + { + "epoch": 0.0914013527540717, + "grad_norm": 0.7718499302864075, + "learning_rate": 0.0002740972319797667, + "loss": 4.8433, + "step": 65050 + }, + { + "epoch": 0.09141540369223528, + "grad_norm": 0.7729384899139404, + "learning_rate": 0.00027413938457215115, + "loss": 4.6984, + "step": 65060 + }, + { + "epoch": 0.09142945463039885, + "grad_norm": 0.7586498260498047, + "learning_rate": 0.0002741815371645356, + "loss": 4.836, + "step": 65070 + }, + { + "epoch": 0.09144350556856243, + "grad_norm": 0.8164992332458496, + "learning_rate": 0.00027422368975692, + "loss": 4.7512, + "step": 65080 + }, + { + "epoch": 0.091457556506726, + "grad_norm": 0.7848567962646484, + "learning_rate": 0.00027426584234930445, + "loss": 4.8578, + "step": 65090 + }, + { + "epoch": 0.09147160744488958, + "grad_norm": 0.7660877704620361, + "learning_rate": 0.0002743079949416889, + "loss": 4.8616, + "step": 65100 + }, + { + "epoch": 0.09148565838305317, + "grad_norm": 0.7506645321846008, + "learning_rate": 0.0002743501475340733, + "loss": 4.6539, + "step": 65110 + }, + { + "epoch": 0.09149970932121675, + "grad_norm": 0.7743892669677734, + "learning_rate": 0.00027439230012645774, + "loss": 4.7529, + "step": 65120 + }, + { + "epoch": 0.09151376025938032, + "grad_norm": 0.7679518461227417, + "learning_rate": 0.0002744344527188422, + "loss": 4.768, + "step": 65130 + }, + { + "epoch": 0.0915278111975439, + "grad_norm": 0.7692804336547852, + "learning_rate": 0.0002744766053112266, + "loss": 4.8296, + "step": 65140 + }, + { + "epoch": 0.09154186213570747, + 
"grad_norm": 0.7586258053779602, + "learning_rate": 0.00027451875790361104, + "loss": 4.7978, + "step": 65150 + }, + { + "epoch": 0.09155591307387105, + "grad_norm": 0.799343466758728, + "learning_rate": 0.00027456091049599547, + "loss": 4.6653, + "step": 65160 + }, + { + "epoch": 0.09156996401203463, + "grad_norm": 0.8051402568817139, + "learning_rate": 0.0002746030630883799, + "loss": 4.8429, + "step": 65170 + }, + { + "epoch": 0.0915840149501982, + "grad_norm": 0.7682332396507263, + "learning_rate": 0.00027464521568076433, + "loss": 4.7701, + "step": 65180 + }, + { + "epoch": 0.09159806588836178, + "grad_norm": 0.7532578110694885, + "learning_rate": 0.0002746873682731488, + "loss": 4.9643, + "step": 65190 + }, + { + "epoch": 0.09161211682652536, + "grad_norm": 0.7631083130836487, + "learning_rate": 0.0002747295208655332, + "loss": 4.8076, + "step": 65200 + }, + { + "epoch": 0.09162616776468893, + "grad_norm": 0.7771109342575073, + "learning_rate": 0.0002747716734579176, + "loss": 4.894, + "step": 65210 + }, + { + "epoch": 0.09164021870285252, + "grad_norm": 0.8161442279815674, + "learning_rate": 0.00027481382605030205, + "loss": 4.7906, + "step": 65220 + }, + { + "epoch": 0.0916542696410161, + "grad_norm": 0.7733320593833923, + "learning_rate": 0.0002748559786426865, + "loss": 4.7649, + "step": 65230 + }, + { + "epoch": 0.09166832057917967, + "grad_norm": 0.7682567238807678, + "learning_rate": 0.0002748981312350709, + "loss": 4.6973, + "step": 65240 + }, + { + "epoch": 0.09168237151734325, + "grad_norm": 0.7778883576393127, + "learning_rate": 0.0002749402838274554, + "loss": 4.7464, + "step": 65250 + }, + { + "epoch": 0.09169642245550683, + "grad_norm": 0.7723760008811951, + "learning_rate": 0.0002749824364198398, + "loss": 4.8092, + "step": 65260 + }, + { + "epoch": 0.0917104733936704, + "grad_norm": 0.7590900659561157, + "learning_rate": 0.0002750245890122242, + "loss": 4.7138, + "step": 65270 + }, + { + "epoch": 0.09172452433183398, + "grad_norm": 
0.7957313656806946, + "learning_rate": 0.00027506674160460864, + "loss": 4.7174, + "step": 65280 + }, + { + "epoch": 0.09173857526999755, + "grad_norm": 0.7801516056060791, + "learning_rate": 0.00027510889419699307, + "loss": 4.8837, + "step": 65290 + }, + { + "epoch": 0.09175262620816113, + "grad_norm": 0.7635098099708557, + "learning_rate": 0.0002751510467893775, + "loss": 4.7774, + "step": 65300 + }, + { + "epoch": 0.09176667714632471, + "grad_norm": 0.8163349032402039, + "learning_rate": 0.000275193199381762, + "loss": 4.902, + "step": 65310 + }, + { + "epoch": 0.0917807280844883, + "grad_norm": 0.7679600715637207, + "learning_rate": 0.0002752353519741464, + "loss": 4.8026, + "step": 65320 + }, + { + "epoch": 0.09179477902265187, + "grad_norm": 0.794219970703125, + "learning_rate": 0.0002752775045665308, + "loss": 4.8016, + "step": 65330 + }, + { + "epoch": 0.09180882996081545, + "grad_norm": 0.8230299353599548, + "learning_rate": 0.00027531965715891523, + "loss": 4.7612, + "step": 65340 + }, + { + "epoch": 0.09182288089897903, + "grad_norm": 0.7828049659729004, + "learning_rate": 0.00027536180975129966, + "loss": 4.8781, + "step": 65350 + }, + { + "epoch": 0.0918369318371426, + "grad_norm": 0.7481945753097534, + "learning_rate": 0.0002754039623436841, + "loss": 4.818, + "step": 65360 + }, + { + "epoch": 0.09185098277530618, + "grad_norm": 0.8005225658416748, + "learning_rate": 0.0002754461149360686, + "loss": 4.7186, + "step": 65370 + }, + { + "epoch": 0.09186503371346975, + "grad_norm": 0.8023760318756104, + "learning_rate": 0.000275488267528453, + "loss": 4.7263, + "step": 65380 + }, + { + "epoch": 0.09187908465163333, + "grad_norm": 0.8292908072471619, + "learning_rate": 0.0002755304201208374, + "loss": 4.9251, + "step": 65390 + }, + { + "epoch": 0.0918931355897969, + "grad_norm": 0.7676294445991516, + "learning_rate": 0.0002755725727132218, + "loss": 4.7774, + "step": 65400 + }, + { + "epoch": 0.09190718652796048, + "grad_norm": 0.7877136468887329, + 
"learning_rate": 0.00027561472530560625, + "loss": 4.8057, + "step": 65410 + }, + { + "epoch": 0.09192123746612407, + "grad_norm": 0.7636573314666748, + "learning_rate": 0.0002756568778979907, + "loss": 4.9751, + "step": 65420 + }, + { + "epoch": 0.09193528840428765, + "grad_norm": 0.7771406173706055, + "learning_rate": 0.00027569903049037516, + "loss": 4.785, + "step": 65430 + }, + { + "epoch": 0.09194933934245123, + "grad_norm": 0.7473503351211548, + "learning_rate": 0.0002757411830827596, + "loss": 4.7884, + "step": 65440 + }, + { + "epoch": 0.0919633902806148, + "grad_norm": 0.7925262451171875, + "learning_rate": 0.000275783335675144, + "loss": 4.7761, + "step": 65450 + }, + { + "epoch": 0.09197744121877838, + "grad_norm": 0.7829357385635376, + "learning_rate": 0.0002758254882675284, + "loss": 4.7649, + "step": 65460 + }, + { + "epoch": 0.09199149215694195, + "grad_norm": 0.8279735445976257, + "learning_rate": 0.00027586764085991283, + "loss": 4.7296, + "step": 65470 + }, + { + "epoch": 0.09200554309510553, + "grad_norm": 0.7664202451705933, + "learning_rate": 0.00027590979345229726, + "loss": 4.8287, + "step": 65480 + }, + { + "epoch": 0.0920195940332691, + "grad_norm": 0.7685601711273193, + "learning_rate": 0.00027595194604468175, + "loss": 4.7864, + "step": 65490 + }, + { + "epoch": 0.09203364497143268, + "grad_norm": 0.784343421459198, + "learning_rate": 0.0002759940986370662, + "loss": 4.817, + "step": 65500 + }, + { + "epoch": 0.09204769590959626, + "grad_norm": 0.8135119080543518, + "learning_rate": 0.0002760362512294506, + "loss": 4.8879, + "step": 65510 + }, + { + "epoch": 0.09206174684775985, + "grad_norm": 0.7757298350334167, + "learning_rate": 0.000276078403821835, + "loss": 4.793, + "step": 65520 + }, + { + "epoch": 0.09207579778592342, + "grad_norm": 0.8407139778137207, + "learning_rate": 0.0002761205564142194, + "loss": 4.8615, + "step": 65530 + }, + { + "epoch": 0.092089848724087, + "grad_norm": 0.7666981220245361, + "learning_rate": 
0.00027616270900660385, + "loss": 4.721, + "step": 65540 + }, + { + "epoch": 0.09210389966225058, + "grad_norm": 0.781695544719696, + "learning_rate": 0.00027620486159898834, + "loss": 4.7766, + "step": 65550 + }, + { + "epoch": 0.09211795060041415, + "grad_norm": 0.7683060169219971, + "learning_rate": 0.00027624701419137277, + "loss": 4.8538, + "step": 65560 + }, + { + "epoch": 0.09213200153857773, + "grad_norm": 0.7697662115097046, + "learning_rate": 0.0002762891667837572, + "loss": 4.8836, + "step": 65570 + }, + { + "epoch": 0.0921460524767413, + "grad_norm": 0.7881425023078918, + "learning_rate": 0.00027633131937614163, + "loss": 4.7934, + "step": 65580 + }, + { + "epoch": 0.09216010341490488, + "grad_norm": 0.7787525653839111, + "learning_rate": 0.000276373471968526, + "loss": 4.8257, + "step": 65590 + }, + { + "epoch": 0.09217415435306846, + "grad_norm": 0.757805585861206, + "learning_rate": 0.00027641562456091044, + "loss": 4.8827, + "step": 65600 + }, + { + "epoch": 0.09218820529123203, + "grad_norm": 0.7534720301628113, + "learning_rate": 0.0002764577771532949, + "loss": 4.7524, + "step": 65610 + }, + { + "epoch": 0.09220225622939561, + "grad_norm": 0.7800748348236084, + "learning_rate": 0.00027649992974567935, + "loss": 4.8452, + "step": 65620 + }, + { + "epoch": 0.0922163071675592, + "grad_norm": 0.7849797010421753, + "learning_rate": 0.0002765420823380638, + "loss": 4.7981, + "step": 65630 + }, + { + "epoch": 0.09223035810572278, + "grad_norm": 0.7599306106567383, + "learning_rate": 0.0002765842349304482, + "loss": 4.7777, + "step": 65640 + }, + { + "epoch": 0.09224440904388635, + "grad_norm": 0.789528489112854, + "learning_rate": 0.0002766263875228326, + "loss": 4.7558, + "step": 65650 + }, + { + "epoch": 0.09225845998204993, + "grad_norm": 0.7765876650810242, + "learning_rate": 0.0002766685401152171, + "loss": 4.8782, + "step": 65660 + }, + { + "epoch": 0.0922725109202135, + "grad_norm": 0.7817826271057129, + "learning_rate": 0.0002767106927076015, + 
"loss": 4.8516, + "step": 65670 + }, + { + "epoch": 0.09228656185837708, + "grad_norm": 0.7477931380271912, + "learning_rate": 0.00027675284529998594, + "loss": 4.9344, + "step": 65680 + }, + { + "epoch": 0.09230061279654066, + "grad_norm": 0.7491680979728699, + "learning_rate": 0.00027679499789237037, + "loss": 4.8218, + "step": 65690 + }, + { + "epoch": 0.09231466373470423, + "grad_norm": 0.7636551260948181, + "learning_rate": 0.0002768371504847548, + "loss": 4.8828, + "step": 65700 + }, + { + "epoch": 0.09232871467286781, + "grad_norm": 0.7482210993766785, + "learning_rate": 0.00027687930307713923, + "loss": 4.7725, + "step": 65710 + }, + { + "epoch": 0.09234276561103139, + "grad_norm": 0.7747960090637207, + "learning_rate": 0.00027692145566952367, + "loss": 4.7205, + "step": 65720 + }, + { + "epoch": 0.09235681654919498, + "grad_norm": 0.7718808650970459, + "learning_rate": 0.0002769636082619081, + "loss": 4.7258, + "step": 65730 + }, + { + "epoch": 0.09237086748735855, + "grad_norm": 0.7397331595420837, + "learning_rate": 0.00027700576085429253, + "loss": 4.7103, + "step": 65740 + }, + { + "epoch": 0.09238491842552213, + "grad_norm": 0.7860981822013855, + "learning_rate": 0.00027704791344667696, + "loss": 4.752, + "step": 65750 + }, + { + "epoch": 0.0923989693636857, + "grad_norm": 0.7629745006561279, + "learning_rate": 0.0002770900660390614, + "loss": 4.8679, + "step": 65760 + }, + { + "epoch": 0.09241302030184928, + "grad_norm": 0.8002822995185852, + "learning_rate": 0.0002771322186314458, + "loss": 4.6234, + "step": 65770 + }, + { + "epoch": 0.09242707124001286, + "grad_norm": 0.7651919722557068, + "learning_rate": 0.00027717437122383025, + "loss": 4.8368, + "step": 65780 + }, + { + "epoch": 0.09244112217817643, + "grad_norm": 0.7951807379722595, + "learning_rate": 0.0002772165238162147, + "loss": 4.7385, + "step": 65790 + }, + { + "epoch": 0.09245517311634001, + "grad_norm": 0.7759065628051758, + "learning_rate": 0.0002772586764085991, + "loss": 4.8701, + 
"step": 65800 + }, + { + "epoch": 0.09246922405450358, + "grad_norm": 0.753464937210083, + "learning_rate": 0.00027730082900098355, + "loss": 4.7545, + "step": 65810 + }, + { + "epoch": 0.09248327499266716, + "grad_norm": 0.7959068417549133, + "learning_rate": 0.000277342981593368, + "loss": 4.8609, + "step": 65820 + }, + { + "epoch": 0.09249732593083075, + "grad_norm": 0.7836405038833618, + "learning_rate": 0.0002773851341857524, + "loss": 4.768, + "step": 65830 + }, + { + "epoch": 0.09251137686899433, + "grad_norm": 0.7547979950904846, + "learning_rate": 0.00027742728677813684, + "loss": 4.7868, + "step": 65840 + }, + { + "epoch": 0.0925254278071579, + "grad_norm": 0.793201744556427, + "learning_rate": 0.00027746943937052127, + "loss": 4.7997, + "step": 65850 + }, + { + "epoch": 0.09253947874532148, + "grad_norm": 0.7729663252830505, + "learning_rate": 0.0002775115919629057, + "loss": 4.7825, + "step": 65860 + }, + { + "epoch": 0.09255352968348506, + "grad_norm": 0.8203871846199036, + "learning_rate": 0.00027755374455529013, + "loss": 4.7756, + "step": 65870 + }, + { + "epoch": 0.09256758062164863, + "grad_norm": 0.8373938798904419, + "learning_rate": 0.00027759589714767456, + "loss": 4.7466, + "step": 65880 + }, + { + "epoch": 0.09258163155981221, + "grad_norm": 0.77327960729599, + "learning_rate": 0.000277638049740059, + "loss": 4.812, + "step": 65890 + }, + { + "epoch": 0.09259568249797578, + "grad_norm": 0.7683082818984985, + "learning_rate": 0.0002776802023324434, + "loss": 4.8128, + "step": 65900 + }, + { + "epoch": 0.09260973343613936, + "grad_norm": 0.7626813650131226, + "learning_rate": 0.00027772235492482786, + "loss": 4.8501, + "step": 65910 + }, + { + "epoch": 0.09262378437430294, + "grad_norm": 0.7827718257904053, + "learning_rate": 0.0002777645075172123, + "loss": 4.7699, + "step": 65920 + }, + { + "epoch": 0.09263783531246651, + "grad_norm": 0.7622679471969604, + "learning_rate": 0.0002778066601095967, + "loss": 4.6728, + "step": 65930 + }, + { + 
"epoch": 0.0926518862506301, + "grad_norm": 0.7683987021446228, + "learning_rate": 0.00027784881270198115, + "loss": 4.9069, + "step": 65940 + }, + { + "epoch": 0.09266593718879368, + "grad_norm": 0.7734614610671997, + "learning_rate": 0.0002778909652943656, + "loss": 4.8696, + "step": 65950 + }, + { + "epoch": 0.09267998812695726, + "grad_norm": 1.3228286504745483, + "learning_rate": 0.00027793311788675, + "loss": 4.7319, + "step": 65960 + }, + { + "epoch": 0.09269403906512083, + "grad_norm": 0.7824644446372986, + "learning_rate": 0.00027797527047913445, + "loss": 4.7125, + "step": 65970 + }, + { + "epoch": 0.09270809000328441, + "grad_norm": 0.7849623560905457, + "learning_rate": 0.0002780174230715189, + "loss": 4.8618, + "step": 65980 + }, + { + "epoch": 0.09272214094144798, + "grad_norm": 0.7580630779266357, + "learning_rate": 0.0002780595756639033, + "loss": 4.7867, + "step": 65990 + }, + { + "epoch": 0.09273619187961156, + "grad_norm": 0.7988112568855286, + "learning_rate": 0.00027810172825628774, + "loss": 4.7798, + "step": 66000 + }, + { + "epoch": 0.09275024281777514, + "grad_norm": 0.7714882493019104, + "learning_rate": 0.00027814388084867217, + "loss": 4.8462, + "step": 66010 + }, + { + "epoch": 0.09276429375593871, + "grad_norm": 0.7701422572135925, + "learning_rate": 0.0002781860334410566, + "loss": 4.7374, + "step": 66020 + }, + { + "epoch": 0.09277834469410229, + "grad_norm": 0.7310877442359924, + "learning_rate": 0.00027822818603344103, + "loss": 4.8362, + "step": 66030 + }, + { + "epoch": 0.09279239563226588, + "grad_norm": 0.7519302368164062, + "learning_rate": 0.00027827033862582546, + "loss": 4.8217, + "step": 66040 + }, + { + "epoch": 0.09280644657042945, + "grad_norm": 0.7439327239990234, + "learning_rate": 0.0002783124912182099, + "loss": 4.899, + "step": 66050 + }, + { + "epoch": 0.09282049750859303, + "grad_norm": 0.7650001645088196, + "learning_rate": 0.0002783546438105943, + "loss": 4.8668, + "step": 66060 + }, + { + "epoch": 
0.0928345484467566, + "grad_norm": 0.7906643152236938, + "learning_rate": 0.00027839679640297876, + "loss": 4.85, + "step": 66070 + }, + { + "epoch": 0.09284859938492018, + "grad_norm": 0.772290825843811, + "learning_rate": 0.0002784389489953632, + "loss": 4.8879, + "step": 66080 + }, + { + "epoch": 0.09286265032308376, + "grad_norm": 0.7907188534736633, + "learning_rate": 0.0002784811015877476, + "loss": 4.7498, + "step": 66090 + }, + { + "epoch": 0.09287670126124734, + "grad_norm": 0.7830239534378052, + "learning_rate": 0.00027852325418013205, + "loss": 4.8478, + "step": 66100 + }, + { + "epoch": 0.09289075219941091, + "grad_norm": 0.7706902027130127, + "learning_rate": 0.0002785654067725165, + "loss": 4.8905, + "step": 66110 + }, + { + "epoch": 0.09290480313757449, + "grad_norm": 0.7300134301185608, + "learning_rate": 0.0002786075593649009, + "loss": 4.7552, + "step": 66120 + }, + { + "epoch": 0.09291885407573806, + "grad_norm": 0.7618225812911987, + "learning_rate": 0.00027864971195728534, + "loss": 4.7571, + "step": 66130 + }, + { + "epoch": 0.09293290501390165, + "grad_norm": 0.8037118911743164, + "learning_rate": 0.0002786918645496698, + "loss": 4.749, + "step": 66140 + }, + { + "epoch": 0.09294695595206523, + "grad_norm": 0.7755313515663147, + "learning_rate": 0.0002787340171420542, + "loss": 4.8419, + "step": 66150 + }, + { + "epoch": 0.0929610068902288, + "grad_norm": 0.753610372543335, + "learning_rate": 0.00027877616973443864, + "loss": 4.8936, + "step": 66160 + }, + { + "epoch": 0.09297505782839238, + "grad_norm": 0.8028730750083923, + "learning_rate": 0.00027881832232682307, + "loss": 4.8275, + "step": 66170 + }, + { + "epoch": 0.09298910876655596, + "grad_norm": 0.7614244818687439, + "learning_rate": 0.0002788604749192075, + "loss": 4.8163, + "step": 66180 + }, + { + "epoch": 0.09300315970471953, + "grad_norm": 0.7953831553459167, + "learning_rate": 0.00027890262751159193, + "loss": 4.9137, + "step": 66190 + }, + { + "epoch": 0.09301721064288311, + 
"grad_norm": 0.7856250405311584, + "learning_rate": 0.00027894478010397636, + "loss": 4.8262, + "step": 66200 + }, + { + "epoch": 0.09303126158104669, + "grad_norm": 0.749713659286499, + "learning_rate": 0.0002789869326963608, + "loss": 4.8025, + "step": 66210 + }, + { + "epoch": 0.09304531251921026, + "grad_norm": 0.7905395030975342, + "learning_rate": 0.0002790290852887452, + "loss": 4.8134, + "step": 66220 + }, + { + "epoch": 0.09305936345737384, + "grad_norm": 0.7537453174591064, + "learning_rate": 0.00027907123788112966, + "loss": 4.6562, + "step": 66230 + }, + { + "epoch": 0.09307341439553742, + "grad_norm": 0.7715054154396057, + "learning_rate": 0.0002791133904735141, + "loss": 4.8059, + "step": 66240 + }, + { + "epoch": 0.093087465333701, + "grad_norm": 0.7600757479667664, + "learning_rate": 0.0002791555430658985, + "loss": 4.7877, + "step": 66250 + }, + { + "epoch": 0.09310151627186458, + "grad_norm": 0.7605622410774231, + "learning_rate": 0.00027919769565828295, + "loss": 4.8044, + "step": 66260 + }, + { + "epoch": 0.09311556721002816, + "grad_norm": 0.7521841526031494, + "learning_rate": 0.0002792398482506674, + "loss": 4.771, + "step": 66270 + }, + { + "epoch": 0.09312961814819173, + "grad_norm": 0.751024067401886, + "learning_rate": 0.0002792820008430518, + "loss": 4.7984, + "step": 66280 + }, + { + "epoch": 0.09314366908635531, + "grad_norm": 0.7688322067260742, + "learning_rate": 0.0002793241534354363, + "loss": 4.8356, + "step": 66290 + }, + { + "epoch": 0.09315772002451889, + "grad_norm": 0.7705650329589844, + "learning_rate": 0.0002793663060278207, + "loss": 4.7877, + "step": 66300 + }, + { + "epoch": 0.09317177096268246, + "grad_norm": 0.769523024559021, + "learning_rate": 0.0002794084586202051, + "loss": 4.8034, + "step": 66310 + }, + { + "epoch": 0.09318582190084604, + "grad_norm": 0.7682741284370422, + "learning_rate": 0.00027945061121258954, + "loss": 4.6817, + "step": 66320 + }, + { + "epoch": 0.09319987283900961, + "grad_norm": 
0.765863835811615, + "learning_rate": 0.00027949276380497397, + "loss": 4.7184, + "step": 66330 + }, + { + "epoch": 0.09321392377717319, + "grad_norm": 0.7482745051383972, + "learning_rate": 0.0002795349163973584, + "loss": 4.766, + "step": 66340 + }, + { + "epoch": 0.09322797471533678, + "grad_norm": 0.7587314248085022, + "learning_rate": 0.0002795770689897429, + "loss": 4.8509, + "step": 66350 + }, + { + "epoch": 0.09324202565350036, + "grad_norm": 0.7830142378807068, + "learning_rate": 0.00027961922158212726, + "loss": 4.6339, + "step": 66360 + }, + { + "epoch": 0.09325607659166393, + "grad_norm": 0.7715080380439758, + "learning_rate": 0.0002796613741745117, + "loss": 4.7213, + "step": 66370 + }, + { + "epoch": 0.09327012752982751, + "grad_norm": 0.7741284966468811, + "learning_rate": 0.0002797035267668961, + "loss": 4.7524, + "step": 66380 + }, + { + "epoch": 0.09328417846799109, + "grad_norm": 0.7751002907752991, + "learning_rate": 0.00027974567935928055, + "loss": 4.8013, + "step": 66390 + }, + { + "epoch": 0.09329822940615466, + "grad_norm": 0.8361513614654541, + "learning_rate": 0.000279787831951665, + "loss": 4.8237, + "step": 66400 + }, + { + "epoch": 0.09331228034431824, + "grad_norm": 0.7796192169189453, + "learning_rate": 0.00027982998454404947, + "loss": 4.8101, + "step": 66410 + }, + { + "epoch": 0.09332633128248181, + "grad_norm": 0.8053950071334839, + "learning_rate": 0.0002798721371364339, + "loss": 4.6938, + "step": 66420 + }, + { + "epoch": 0.09334038222064539, + "grad_norm": 0.8083071112632751, + "learning_rate": 0.0002799142897288183, + "loss": 4.8032, + "step": 66430 + }, + { + "epoch": 0.09335443315880897, + "grad_norm": 0.8004527688026428, + "learning_rate": 0.0002799564423212027, + "loss": 4.7322, + "step": 66440 + }, + { + "epoch": 0.09336848409697256, + "grad_norm": 0.7978747487068176, + "learning_rate": 0.00027999859491358714, + "loss": 4.8168, + "step": 66450 + }, + { + "epoch": 0.09338253503513613, + "grad_norm": 0.8221896290779114, + 
"learning_rate": 0.00028004074750597157, + "loss": 4.5916, + "step": 66460 + }, + { + "epoch": 0.09339658597329971, + "grad_norm": 0.8084535002708435, + "learning_rate": 0.00028008290009835606, + "loss": 4.8748, + "step": 66470 + }, + { + "epoch": 0.09341063691146329, + "grad_norm": 0.8202959895133972, + "learning_rate": 0.0002801250526907405, + "loss": 4.7996, + "step": 66480 + }, + { + "epoch": 0.09342468784962686, + "grad_norm": 0.7502172589302063, + "learning_rate": 0.00028016720528312487, + "loss": 4.9115, + "step": 66490 + }, + { + "epoch": 0.09343873878779044, + "grad_norm": 0.7529955506324768, + "learning_rate": 0.0002802093578755093, + "loss": 4.8029, + "step": 66500 + }, + { + "epoch": 0.09345278972595401, + "grad_norm": 0.7608353495597839, + "learning_rate": 0.00028025151046789373, + "loss": 4.7967, + "step": 66510 + }, + { + "epoch": 0.09346684066411759, + "grad_norm": 0.8318527936935425, + "learning_rate": 0.00028029366306027816, + "loss": 4.7791, + "step": 66520 + }, + { + "epoch": 0.09348089160228117, + "grad_norm": 0.7857316732406616, + "learning_rate": 0.00028033581565266264, + "loss": 4.7374, + "step": 66530 + }, + { + "epoch": 0.09349494254044474, + "grad_norm": 0.9135415554046631, + "learning_rate": 0.0002803779682450471, + "loss": 4.761, + "step": 66540 + }, + { + "epoch": 0.09350899347860832, + "grad_norm": 0.7974305748939514, + "learning_rate": 0.00028042012083743145, + "loss": 4.6771, + "step": 66550 + }, + { + "epoch": 0.09352304441677191, + "grad_norm": 0.7421619892120361, + "learning_rate": 0.0002804622734298159, + "loss": 4.7538, + "step": 66560 + }, + { + "epoch": 0.09353709535493548, + "grad_norm": 0.796992838382721, + "learning_rate": 0.0002805044260222003, + "loss": 4.8697, + "step": 66570 + }, + { + "epoch": 0.09355114629309906, + "grad_norm": 1.0140798091888428, + "learning_rate": 0.00028054657861458475, + "loss": 4.8219, + "step": 66580 + }, + { + "epoch": 0.09356519723126264, + "grad_norm": 0.8030684590339661, + "learning_rate": 
0.00028058873120696923, + "loss": 4.7397, + "step": 66590 + }, + { + "epoch": 0.09357924816942621, + "grad_norm": 0.8484904766082764, + "learning_rate": 0.00028063088379935366, + "loss": 4.7481, + "step": 66600 + }, + { + "epoch": 0.09359329910758979, + "grad_norm": 0.7419153451919556, + "learning_rate": 0.0002806730363917381, + "loss": 4.8027, + "step": 66610 + }, + { + "epoch": 0.09360735004575337, + "grad_norm": 0.7505769729614258, + "learning_rate": 0.00028071518898412247, + "loss": 4.7095, + "step": 66620 + }, + { + "epoch": 0.09362140098391694, + "grad_norm": 0.7895868420600891, + "learning_rate": 0.0002807573415765069, + "loss": 4.7332, + "step": 66630 + }, + { + "epoch": 0.09363545192208052, + "grad_norm": 0.7834476232528687, + "learning_rate": 0.00028079949416889133, + "loss": 4.6707, + "step": 66640 + }, + { + "epoch": 0.0936495028602441, + "grad_norm": 0.7624843716621399, + "learning_rate": 0.0002808416467612758, + "loss": 4.7925, + "step": 66650 + }, + { + "epoch": 0.09366355379840768, + "grad_norm": 0.8170449733734131, + "learning_rate": 0.00028088379935366025, + "loss": 4.7729, + "step": 66660 + }, + { + "epoch": 0.09367760473657126, + "grad_norm": 0.8319101929664612, + "learning_rate": 0.0002809259519460447, + "loss": 4.933, + "step": 66670 + }, + { + "epoch": 0.09369165567473484, + "grad_norm": 0.7560040354728699, + "learning_rate": 0.00028096810453842906, + "loss": 4.7946, + "step": 66680 + }, + { + "epoch": 0.09370570661289841, + "grad_norm": 0.7837918996810913, + "learning_rate": 0.0002810102571308135, + "loss": 4.7895, + "step": 66690 + }, + { + "epoch": 0.09371975755106199, + "grad_norm": 0.7507804036140442, + "learning_rate": 0.0002810524097231979, + "loss": 4.7504, + "step": 66700 + }, + { + "epoch": 0.09373380848922556, + "grad_norm": 0.771360456943512, + "learning_rate": 0.0002810945623155824, + "loss": 4.8561, + "step": 66710 + }, + { + "epoch": 0.09374785942738914, + "grad_norm": 0.7771738171577454, + "learning_rate": 
0.00028113671490796684, + "loss": 4.874, + "step": 66720 + }, + { + "epoch": 0.09376191036555272, + "grad_norm": 0.7817769646644592, + "learning_rate": 0.00028117886750035127, + "loss": 4.8408, + "step": 66730 + }, + { + "epoch": 0.0937759613037163, + "grad_norm": 0.9140200018882751, + "learning_rate": 0.0002812210200927357, + "loss": 4.8285, + "step": 66740 + }, + { + "epoch": 0.09379001224187987, + "grad_norm": 0.7840200662612915, + "learning_rate": 0.0002812631726851201, + "loss": 4.8193, + "step": 66750 + }, + { + "epoch": 0.09380406318004346, + "grad_norm": 0.7683082818984985, + "learning_rate": 0.0002813053252775045, + "loss": 4.5404, + "step": 66760 + }, + { + "epoch": 0.09381811411820704, + "grad_norm": 0.7743772268295288, + "learning_rate": 0.000281347477869889, + "loss": 4.9332, + "step": 66770 + }, + { + "epoch": 0.09383216505637061, + "grad_norm": 0.7601823210716248, + "learning_rate": 0.0002813896304622734, + "loss": 4.8454, + "step": 66780 + }, + { + "epoch": 0.09384621599453419, + "grad_norm": 0.7685533761978149, + "learning_rate": 0.00028143178305465786, + "loss": 4.8704, + "step": 66790 + }, + { + "epoch": 0.09386026693269776, + "grad_norm": 0.7852855324745178, + "learning_rate": 0.0002814739356470423, + "loss": 4.7243, + "step": 66800 + }, + { + "epoch": 0.09387431787086134, + "grad_norm": 0.7344205379486084, + "learning_rate": 0.00028151608823942666, + "loss": 4.6807, + "step": 66810 + }, + { + "epoch": 0.09388836880902492, + "grad_norm": 0.7859446406364441, + "learning_rate": 0.00028155824083181115, + "loss": 4.6914, + "step": 66820 + }, + { + "epoch": 0.09390241974718849, + "grad_norm": 0.769825279712677, + "learning_rate": 0.0002816003934241956, + "loss": 4.795, + "step": 66830 + }, + { + "epoch": 0.09391647068535207, + "grad_norm": 0.7718509435653687, + "learning_rate": 0.00028164254601658, + "loss": 4.7383, + "step": 66840 + }, + { + "epoch": 0.09393052162351564, + "grad_norm": 0.7945657968521118, + "learning_rate": 0.00028168469860896444, + 
"loss": 4.8164, + "step": 66850 + }, + { + "epoch": 0.09394457256167922, + "grad_norm": 0.9426447749137878, + "learning_rate": 0.0002817268512013489, + "loss": 4.674, + "step": 66860 + }, + { + "epoch": 0.09395862349984281, + "grad_norm": 0.7499502301216125, + "learning_rate": 0.0002817690037937333, + "loss": 4.7949, + "step": 66870 + }, + { + "epoch": 0.09397267443800639, + "grad_norm": 0.7673668265342712, + "learning_rate": 0.00028181115638611774, + "loss": 4.8771, + "step": 66880 + }, + { + "epoch": 0.09398672537616996, + "grad_norm": 0.7585043907165527, + "learning_rate": 0.00028185330897850217, + "loss": 4.8944, + "step": 66890 + }, + { + "epoch": 0.09400077631433354, + "grad_norm": 0.7782924175262451, + "learning_rate": 0.0002818954615708866, + "loss": 4.7822, + "step": 66900 + }, + { + "epoch": 0.09401482725249712, + "grad_norm": 0.7514545917510986, + "learning_rate": 0.00028193761416327103, + "loss": 4.6599, + "step": 66910 + }, + { + "epoch": 0.09402887819066069, + "grad_norm": 0.7437326312065125, + "learning_rate": 0.00028197976675565546, + "loss": 4.8388, + "step": 66920 + }, + { + "epoch": 0.09404292912882427, + "grad_norm": 0.7777213454246521, + "learning_rate": 0.0002820219193480399, + "loss": 4.7727, + "step": 66930 + }, + { + "epoch": 0.09405698006698784, + "grad_norm": 0.7418848276138306, + "learning_rate": 0.0002820640719404243, + "loss": 4.7165, + "step": 66940 + }, + { + "epoch": 0.09407103100515142, + "grad_norm": 0.7748672962188721, + "learning_rate": 0.00028210622453280875, + "loss": 4.7824, + "step": 66950 + }, + { + "epoch": 0.094085081943315, + "grad_norm": 0.7600992321968079, + "learning_rate": 0.0002821483771251932, + "loss": 4.8642, + "step": 66960 + }, + { + "epoch": 0.09409913288147859, + "grad_norm": 0.7563418745994568, + "learning_rate": 0.0002821905297175776, + "loss": 4.8016, + "step": 66970 + }, + { + "epoch": 0.09411318381964216, + "grad_norm": 0.7561931014060974, + "learning_rate": 0.00028223268230996205, + "loss": 4.8931, + 
"step": 66980 + }, + { + "epoch": 0.09412723475780574, + "grad_norm": 0.8095070719718933, + "learning_rate": 0.0002822748349023465, + "loss": 4.6954, + "step": 66990 + }, + { + "epoch": 0.09414128569596932, + "grad_norm": 0.7557608485221863, + "learning_rate": 0.0002823169874947309, + "loss": 4.9589, + "step": 67000 + }, + { + "epoch": 0.09415533663413289, + "grad_norm": 0.7877722978591919, + "learning_rate": 0.00028235914008711534, + "loss": 4.8427, + "step": 67010 + }, + { + "epoch": 0.09416938757229647, + "grad_norm": 0.7584842443466187, + "learning_rate": 0.00028240129267949977, + "loss": 4.8917, + "step": 67020 + }, + { + "epoch": 0.09418343851046004, + "grad_norm": 0.7679328322410583, + "learning_rate": 0.0002824434452718842, + "loss": 4.684, + "step": 67030 + }, + { + "epoch": 0.09419748944862362, + "grad_norm": 0.7546048760414124, + "learning_rate": 0.00028248559786426863, + "loss": 4.6522, + "step": 67040 + }, + { + "epoch": 0.0942115403867872, + "grad_norm": 0.7862738966941833, + "learning_rate": 0.00028252775045665307, + "loss": 4.7412, + "step": 67050 + }, + { + "epoch": 0.09422559132495077, + "grad_norm": 0.7737675905227661, + "learning_rate": 0.0002825699030490375, + "loss": 4.7039, + "step": 67060 + }, + { + "epoch": 0.09423964226311436, + "grad_norm": 0.7403519153594971, + "learning_rate": 0.00028261205564142193, + "loss": 4.8726, + "step": 67070 + }, + { + "epoch": 0.09425369320127794, + "grad_norm": 0.7542514204978943, + "learning_rate": 0.00028265420823380636, + "loss": 4.8402, + "step": 67080 + }, + { + "epoch": 0.09426774413944151, + "grad_norm": 0.7455792427062988, + "learning_rate": 0.0002826963608261908, + "loss": 4.8167, + "step": 67090 + }, + { + "epoch": 0.09428179507760509, + "grad_norm": 0.7721514701843262, + "learning_rate": 0.0002827385134185752, + "loss": 4.8949, + "step": 67100 + }, + { + "epoch": 0.09429584601576867, + "grad_norm": 0.7985861301422119, + "learning_rate": 0.00028278066601095965, + "loss": 4.8274, + "step": 67110 + }, 
+ { + "epoch": 0.09430989695393224, + "grad_norm": 0.7794066071510315, + "learning_rate": 0.0002828228186033441, + "loss": 4.7993, + "step": 67120 + }, + { + "epoch": 0.09432394789209582, + "grad_norm": 0.7830890417098999, + "learning_rate": 0.0002828649711957285, + "loss": 4.7902, + "step": 67130 + }, + { + "epoch": 0.0943379988302594, + "grad_norm": 0.7577866911888123, + "learning_rate": 0.00028290712378811295, + "loss": 4.8907, + "step": 67140 + }, + { + "epoch": 0.09435204976842297, + "grad_norm": 0.8127627968788147, + "learning_rate": 0.0002829492763804974, + "loss": 4.7554, + "step": 67150 + }, + { + "epoch": 0.09436610070658655, + "grad_norm": 0.7625774145126343, + "learning_rate": 0.0002829914289728818, + "loss": 4.7444, + "step": 67160 + }, + { + "epoch": 0.09438015164475012, + "grad_norm": 0.7992131114006042, + "learning_rate": 0.00028303358156526624, + "loss": 4.8071, + "step": 67170 + }, + { + "epoch": 0.09439420258291371, + "grad_norm": 0.7639414668083191, + "learning_rate": 0.00028307573415765067, + "loss": 4.7122, + "step": 67180 + }, + { + "epoch": 0.09440825352107729, + "grad_norm": 0.7498297691345215, + "learning_rate": 0.0002831178867500351, + "loss": 4.7293, + "step": 67190 + }, + { + "epoch": 0.09442230445924087, + "grad_norm": 0.7553703188896179, + "learning_rate": 0.00028316003934241953, + "loss": 4.9202, + "step": 67200 + }, + { + "epoch": 0.09443635539740444, + "grad_norm": 0.7734369039535522, + "learning_rate": 0.00028320219193480396, + "loss": 4.7912, + "step": 67210 + }, + { + "epoch": 0.09445040633556802, + "grad_norm": 0.7736976146697998, + "learning_rate": 0.0002832443445271884, + "loss": 4.8836, + "step": 67220 + }, + { + "epoch": 0.0944644572737316, + "grad_norm": 0.7649384140968323, + "learning_rate": 0.0002832864971195728, + "loss": 4.8357, + "step": 67230 + }, + { + "epoch": 0.09447850821189517, + "grad_norm": 0.7400626540184021, + "learning_rate": 0.00028332864971195726, + "loss": 4.8263, + "step": 67240 + }, + { + "epoch": 
0.09449255915005875, + "grad_norm": 0.8864115476608276, + "learning_rate": 0.0002833708023043417, + "loss": 4.7784, + "step": 67250 + }, + { + "epoch": 0.09450661008822232, + "grad_norm": 0.7719464302062988, + "learning_rate": 0.0002834129548967261, + "loss": 4.927, + "step": 67260 + }, + { + "epoch": 0.0945206610263859, + "grad_norm": 0.7647191286087036, + "learning_rate": 0.00028345510748911055, + "loss": 4.776, + "step": 67270 + }, + { + "epoch": 0.09453471196454949, + "grad_norm": 0.7322827577590942, + "learning_rate": 0.000283497260081495, + "loss": 4.8069, + "step": 67280 + }, + { + "epoch": 0.09454876290271307, + "grad_norm": 0.7592594623565674, + "learning_rate": 0.0002835394126738794, + "loss": 4.7898, + "step": 67290 + }, + { + "epoch": 0.09456281384087664, + "grad_norm": 0.7448276281356812, + "learning_rate": 0.00028358156526626384, + "loss": 4.8365, + "step": 67300 + }, + { + "epoch": 0.09457686477904022, + "grad_norm": 0.786535382270813, + "learning_rate": 0.0002836237178586483, + "loss": 4.7254, + "step": 67310 + }, + { + "epoch": 0.0945909157172038, + "grad_norm": 0.7626109719276428, + "learning_rate": 0.0002836658704510327, + "loss": 4.9295, + "step": 67320 + }, + { + "epoch": 0.09460496665536737, + "grad_norm": 0.8014910817146301, + "learning_rate": 0.00028370802304341714, + "loss": 4.8296, + "step": 67330 + }, + { + "epoch": 0.09461901759353095, + "grad_norm": 0.7696790099143982, + "learning_rate": 0.00028375017563580157, + "loss": 4.7296, + "step": 67340 + }, + { + "epoch": 0.09463306853169452, + "grad_norm": 0.770531952381134, + "learning_rate": 0.000283792328228186, + "loss": 4.8663, + "step": 67350 + }, + { + "epoch": 0.0946471194698581, + "grad_norm": 0.7845657467842102, + "learning_rate": 0.00028383448082057043, + "loss": 4.7573, + "step": 67360 + }, + { + "epoch": 0.09466117040802167, + "grad_norm": 0.798072099685669, + "learning_rate": 0.00028387663341295486, + "loss": 4.7633, + "step": 67370 + }, + { + "epoch": 0.09467522134618526, + 
"grad_norm": 0.7655891180038452, + "learning_rate": 0.0002839187860053393, + "loss": 4.729, + "step": 67380 + }, + { + "epoch": 0.09468927228434884, + "grad_norm": 0.7528164386749268, + "learning_rate": 0.0002839609385977237, + "loss": 4.7863, + "step": 67390 + }, + { + "epoch": 0.09470332322251242, + "grad_norm": 0.7557967305183411, + "learning_rate": 0.00028400309119010816, + "loss": 4.8202, + "step": 67400 + }, + { + "epoch": 0.094717374160676, + "grad_norm": 0.7755447626113892, + "learning_rate": 0.0002840452437824926, + "loss": 4.7798, + "step": 67410 + }, + { + "epoch": 0.09473142509883957, + "grad_norm": 0.7739848494529724, + "learning_rate": 0.000284087396374877, + "loss": 4.8155, + "step": 67420 + }, + { + "epoch": 0.09474547603700315, + "grad_norm": 0.7496004700660706, + "learning_rate": 0.00028412954896726145, + "loss": 4.579, + "step": 67430 + }, + { + "epoch": 0.09475952697516672, + "grad_norm": 0.7430173754692078, + "learning_rate": 0.0002841717015596459, + "loss": 4.7696, + "step": 67440 + }, + { + "epoch": 0.0947735779133303, + "grad_norm": 0.7696523666381836, + "learning_rate": 0.00028421385415203037, + "loss": 4.9943, + "step": 67450 + }, + { + "epoch": 0.09478762885149387, + "grad_norm": 0.8024923801422119, + "learning_rate": 0.00028425600674441474, + "loss": 4.7865, + "step": 67460 + }, + { + "epoch": 0.09480167978965745, + "grad_norm": 0.7850974202156067, + "learning_rate": 0.0002842981593367992, + "loss": 4.813, + "step": 67470 + }, + { + "epoch": 0.09481573072782103, + "grad_norm": 0.7672629952430725, + "learning_rate": 0.0002843403119291836, + "loss": 4.8848, + "step": 67480 + }, + { + "epoch": 0.09482978166598462, + "grad_norm": 0.7520509958267212, + "learning_rate": 0.00028438246452156804, + "loss": 4.821, + "step": 67490 + }, + { + "epoch": 0.09484383260414819, + "grad_norm": 0.7907251715660095, + "learning_rate": 0.00028442461711395247, + "loss": 4.7222, + "step": 67500 + }, + { + "epoch": 0.09485788354231177, + "grad_norm": 
0.7532843947410583, + "learning_rate": 0.00028446676970633695, + "loss": 4.7545, + "step": 67510 + }, + { + "epoch": 0.09487193448047535, + "grad_norm": 0.7584605813026428, + "learning_rate": 0.00028450892229872133, + "loss": 4.7687, + "step": 67520 + }, + { + "epoch": 0.09488598541863892, + "grad_norm": 0.7673341035842896, + "learning_rate": 0.00028455107489110576, + "loss": 4.7871, + "step": 67530 + }, + { + "epoch": 0.0949000363568025, + "grad_norm": 0.7831926345825195, + "learning_rate": 0.0002845932274834902, + "loss": 4.7859, + "step": 67540 + }, + { + "epoch": 0.09491408729496607, + "grad_norm": 0.7618937492370605, + "learning_rate": 0.0002846353800758746, + "loss": 4.8277, + "step": 67550 + }, + { + "epoch": 0.09492813823312965, + "grad_norm": 0.813814640045166, + "learning_rate": 0.00028467753266825905, + "loss": 4.7525, + "step": 67560 + }, + { + "epoch": 0.09494218917129323, + "grad_norm": 0.7494709491729736, + "learning_rate": 0.00028471968526064354, + "loss": 4.8525, + "step": 67570 + }, + { + "epoch": 0.0949562401094568, + "grad_norm": 0.7520213723182678, + "learning_rate": 0.00028476183785302797, + "loss": 4.7942, + "step": 67580 + }, + { + "epoch": 0.09497029104762039, + "grad_norm": 0.7849497199058533, + "learning_rate": 0.00028480399044541235, + "loss": 4.7749, + "step": 67590 + }, + { + "epoch": 0.09498434198578397, + "grad_norm": 0.7550547122955322, + "learning_rate": 0.0002848461430377968, + "loss": 4.7389, + "step": 67600 + }, + { + "epoch": 0.09499839292394754, + "grad_norm": 0.7957075834274292, + "learning_rate": 0.0002848882956301812, + "loss": 4.7788, + "step": 67610 + }, + { + "epoch": 0.09501244386211112, + "grad_norm": 0.7405850291252136, + "learning_rate": 0.00028493044822256564, + "loss": 4.7592, + "step": 67620 + }, + { + "epoch": 0.0950264948002747, + "grad_norm": 0.7549129128456116, + "learning_rate": 0.00028497260081495013, + "loss": 4.8282, + "step": 67630 + }, + { + "epoch": 0.09504054573843827, + "grad_norm": 
0.7701460123062134, + "learning_rate": 0.00028501475340733456, + "loss": 4.8901, + "step": 67640 + }, + { + "epoch": 0.09505459667660185, + "grad_norm": 0.7383482456207275, + "learning_rate": 0.00028505690599971894, + "loss": 4.7539, + "step": 67650 + }, + { + "epoch": 0.09506864761476543, + "grad_norm": 0.7618653774261475, + "learning_rate": 0.00028509905859210337, + "loss": 4.812, + "step": 67660 + }, + { + "epoch": 0.095082698552929, + "grad_norm": 0.7711530327796936, + "learning_rate": 0.0002851412111844878, + "loss": 4.7898, + "step": 67670 + }, + { + "epoch": 0.09509674949109258, + "grad_norm": 0.7828205227851868, + "learning_rate": 0.00028518336377687223, + "loss": 4.8524, + "step": 67680 + }, + { + "epoch": 0.09511080042925617, + "grad_norm": 0.7714737057685852, + "learning_rate": 0.0002852255163692567, + "loss": 4.731, + "step": 67690 + }, + { + "epoch": 0.09512485136741974, + "grad_norm": 0.7553906440734863, + "learning_rate": 0.00028526766896164115, + "loss": 4.7764, + "step": 67700 + }, + { + "epoch": 0.09513890230558332, + "grad_norm": 0.7760165333747864, + "learning_rate": 0.0002853098215540256, + "loss": 4.7859, + "step": 67710 + }, + { + "epoch": 0.0951529532437469, + "grad_norm": 0.7785241603851318, + "learning_rate": 0.00028535197414640995, + "loss": 4.7499, + "step": 67720 + }, + { + "epoch": 0.09516700418191047, + "grad_norm": 0.7613420486450195, + "learning_rate": 0.0002853941267387944, + "loss": 4.806, + "step": 67730 + }, + { + "epoch": 0.09518105512007405, + "grad_norm": 0.7879531979560852, + "learning_rate": 0.0002854362793311788, + "loss": 4.8668, + "step": 67740 + }, + { + "epoch": 0.09519510605823762, + "grad_norm": 0.7439324855804443, + "learning_rate": 0.0002854784319235633, + "loss": 4.8048, + "step": 67750 + }, + { + "epoch": 0.0952091569964012, + "grad_norm": 0.8519731163978577, + "learning_rate": 0.00028552058451594773, + "loss": 4.6315, + "step": 67760 + }, + { + "epoch": 0.09522320793456478, + "grad_norm": 0.7999274134635925, + 
"learning_rate": 0.00028556273710833216, + "loss": 4.8338, + "step": 67770 + }, + { + "epoch": 0.09523725887272835, + "grad_norm": 0.7844228148460388, + "learning_rate": 0.00028560488970071654, + "loss": 4.7579, + "step": 67780 + }, + { + "epoch": 0.09525130981089193, + "grad_norm": 0.7809820175170898, + "learning_rate": 0.00028564704229310097, + "loss": 4.7089, + "step": 67790 + }, + { + "epoch": 0.09526536074905552, + "grad_norm": 0.759044885635376, + "learning_rate": 0.0002856891948854854, + "loss": 4.7492, + "step": 67800 + }, + { + "epoch": 0.0952794116872191, + "grad_norm": 0.755091667175293, + "learning_rate": 0.0002857313474778699, + "loss": 4.6888, + "step": 67810 + }, + { + "epoch": 0.09529346262538267, + "grad_norm": 0.7296857237815857, + "learning_rate": 0.0002857735000702543, + "loss": 4.7357, + "step": 67820 + }, + { + "epoch": 0.09530751356354625, + "grad_norm": 0.7496383190155029, + "learning_rate": 0.00028581565266263875, + "loss": 4.7345, + "step": 67830 + }, + { + "epoch": 0.09532156450170982, + "grad_norm": 0.7716061472892761, + "learning_rate": 0.00028585780525502313, + "loss": 4.8233, + "step": 67840 + }, + { + "epoch": 0.0953356154398734, + "grad_norm": 0.7615830302238464, + "learning_rate": 0.00028589995784740756, + "loss": 4.7433, + "step": 67850 + }, + { + "epoch": 0.09534966637803698, + "grad_norm": 0.7765911817550659, + "learning_rate": 0.000285942110439792, + "loss": 4.7496, + "step": 67860 + }, + { + "epoch": 0.09536371731620055, + "grad_norm": 0.7490736842155457, + "learning_rate": 0.0002859842630321765, + "loss": 4.8747, + "step": 67870 + }, + { + "epoch": 0.09537776825436413, + "grad_norm": 0.7704018354415894, + "learning_rate": 0.0002860264156245609, + "loss": 4.8234, + "step": 67880 + }, + { + "epoch": 0.0953918191925277, + "grad_norm": 0.7510921359062195, + "learning_rate": 0.00028606856821694534, + "loss": 4.7961, + "step": 67890 + }, + { + "epoch": 0.0954058701306913, + "grad_norm": 0.7316580414772034, + "learning_rate": 
0.00028611072080932977, + "loss": 4.7473, + "step": 67900 + }, + { + "epoch": 0.09541992106885487, + "grad_norm": 0.7586633563041687, + "learning_rate": 0.00028615287340171415, + "loss": 4.8093, + "step": 67910 + }, + { + "epoch": 0.09543397200701845, + "grad_norm": 0.7565239071846008, + "learning_rate": 0.0002861950259940986, + "loss": 4.7894, + "step": 67920 + }, + { + "epoch": 0.09544802294518202, + "grad_norm": 0.7565003037452698, + "learning_rate": 0.00028623717858648306, + "loss": 4.8842, + "step": 67930 + }, + { + "epoch": 0.0954620738833456, + "grad_norm": 0.7603064179420471, + "learning_rate": 0.0002862793311788675, + "loss": 4.7224, + "step": 67940 + }, + { + "epoch": 0.09547612482150918, + "grad_norm": 0.7563630938529968, + "learning_rate": 0.0002863214837712519, + "loss": 4.7289, + "step": 67950 + }, + { + "epoch": 0.09549017575967275, + "grad_norm": 0.7232897281646729, + "learning_rate": 0.00028636363636363636, + "loss": 4.718, + "step": 67960 + }, + { + "epoch": 0.09550422669783633, + "grad_norm": 0.747141420841217, + "learning_rate": 0.00028640578895602073, + "loss": 4.7984, + "step": 67970 + }, + { + "epoch": 0.0955182776359999, + "grad_norm": 0.7786388397216797, + "learning_rate": 0.00028644794154840516, + "loss": 4.6962, + "step": 67980 + }, + { + "epoch": 0.09553232857416348, + "grad_norm": 0.7422512769699097, + "learning_rate": 0.00028649009414078965, + "loss": 4.8789, + "step": 67990 + }, + { + "epoch": 0.09554637951232707, + "grad_norm": 0.8502141833305359, + "learning_rate": 0.0002865322467331741, + "loss": 4.8269, + "step": 68000 + }, + { + "epoch": 0.09556043045049065, + "grad_norm": 0.7409288287162781, + "learning_rate": 0.0002865743993255585, + "loss": 4.7592, + "step": 68010 + }, + { + "epoch": 0.09557448138865422, + "grad_norm": 0.7689024209976196, + "learning_rate": 0.00028661655191794294, + "loss": 4.7512, + "step": 68020 + }, + { + "epoch": 0.0955885323268178, + "grad_norm": 0.7674072980880737, + "learning_rate": 
0.0002866587045103274, + "loss": 4.9744, + "step": 68030 + }, + { + "epoch": 0.09560258326498138, + "grad_norm": 0.7291126251220703, + "learning_rate": 0.0002867008571027118, + "loss": 4.7998, + "step": 68040 + }, + { + "epoch": 0.09561663420314495, + "grad_norm": 0.7502152323722839, + "learning_rate": 0.00028674300969509624, + "loss": 4.7427, + "step": 68050 + }, + { + "epoch": 0.09563068514130853, + "grad_norm": 0.744105875492096, + "learning_rate": 0.00028678516228748067, + "loss": 4.8043, + "step": 68060 + }, + { + "epoch": 0.0956447360794721, + "grad_norm": 0.7957160472869873, + "learning_rate": 0.0002868273148798651, + "loss": 4.8657, + "step": 68070 + }, + { + "epoch": 0.09565878701763568, + "grad_norm": 0.857458233833313, + "learning_rate": 0.00028686946747224953, + "loss": 4.685, + "step": 68080 + }, + { + "epoch": 0.09567283795579926, + "grad_norm": 0.7583727240562439, + "learning_rate": 0.00028691162006463396, + "loss": 4.8969, + "step": 68090 + }, + { + "epoch": 0.09568688889396283, + "grad_norm": 0.7256913781166077, + "learning_rate": 0.0002869537726570184, + "loss": 4.8367, + "step": 68100 + }, + { + "epoch": 0.09570093983212642, + "grad_norm": 0.7758738994598389, + "learning_rate": 0.0002869959252494028, + "loss": 4.8103, + "step": 68110 + }, + { + "epoch": 0.09571499077029, + "grad_norm": 0.7613289952278137, + "learning_rate": 0.00028703807784178725, + "loss": 4.8833, + "step": 68120 + }, + { + "epoch": 0.09572904170845357, + "grad_norm": 0.7221300601959229, + "learning_rate": 0.0002870802304341717, + "loss": 4.7747, + "step": 68130 + }, + { + "epoch": 0.09574309264661715, + "grad_norm": 0.7547207474708557, + "learning_rate": 0.0002871223830265561, + "loss": 4.8013, + "step": 68140 + }, + { + "epoch": 0.09575714358478073, + "grad_norm": 0.7506118416786194, + "learning_rate": 0.00028716453561894055, + "loss": 4.8435, + "step": 68150 + }, + { + "epoch": 0.0957711945229443, + "grad_norm": 0.7563244700431824, + "learning_rate": 0.000287206688211325, + 
"loss": 4.7599, + "step": 68160 + }, + { + "epoch": 0.09578524546110788, + "grad_norm": 0.7807770371437073, + "learning_rate": 0.0002872488408037094, + "loss": 4.8858, + "step": 68170 + }, + { + "epoch": 0.09579929639927146, + "grad_norm": 0.7905330657958984, + "learning_rate": 0.00028729099339609384, + "loss": 4.8305, + "step": 68180 + }, + { + "epoch": 0.09581334733743503, + "grad_norm": 0.7471740245819092, + "learning_rate": 0.00028733314598847827, + "loss": 4.7711, + "step": 68190 + }, + { + "epoch": 0.09582739827559861, + "grad_norm": 0.7244470119476318, + "learning_rate": 0.0002873752985808627, + "loss": 4.9656, + "step": 68200 + }, + { + "epoch": 0.0958414492137622, + "grad_norm": 0.7297165989875793, + "learning_rate": 0.00028741745117324713, + "loss": 4.783, + "step": 68210 + }, + { + "epoch": 0.09585550015192577, + "grad_norm": 0.7688151597976685, + "learning_rate": 0.00028745960376563157, + "loss": 4.7866, + "step": 68220 + }, + { + "epoch": 0.09586955109008935, + "grad_norm": 0.7747588157653809, + "learning_rate": 0.000287501756358016, + "loss": 4.7084, + "step": 68230 + }, + { + "epoch": 0.09588360202825293, + "grad_norm": 0.766656219959259, + "learning_rate": 0.00028754390895040043, + "loss": 4.7271, + "step": 68240 + }, + { + "epoch": 0.0958976529664165, + "grad_norm": 0.7433441281318665, + "learning_rate": 0.00028758606154278486, + "loss": 4.8076, + "step": 68250 + }, + { + "epoch": 0.09591170390458008, + "grad_norm": 0.7474486231803894, + "learning_rate": 0.0002876282141351693, + "loss": 4.7575, + "step": 68260 + }, + { + "epoch": 0.09592575484274365, + "grad_norm": 0.7397088408470154, + "learning_rate": 0.0002876703667275537, + "loss": 4.8102, + "step": 68270 + }, + { + "epoch": 0.09593980578090723, + "grad_norm": 0.7497064471244812, + "learning_rate": 0.00028771251931993815, + "loss": 4.8034, + "step": 68280 + }, + { + "epoch": 0.0959538567190708, + "grad_norm": 0.7426905632019043, + "learning_rate": 0.0002877546719123226, + "loss": 4.7192, + 
"step": 68290 + }, + { + "epoch": 0.09596790765723438, + "grad_norm": 0.7634294629096985, + "learning_rate": 0.000287796824504707, + "loss": 4.7343, + "step": 68300 + }, + { + "epoch": 0.09598195859539797, + "grad_norm": 0.7681379318237305, + "learning_rate": 0.00028783897709709145, + "loss": 4.7821, + "step": 68310 + }, + { + "epoch": 0.09599600953356155, + "grad_norm": 0.7707437872886658, + "learning_rate": 0.0002878811296894759, + "loss": 4.839, + "step": 68320 + }, + { + "epoch": 0.09601006047172513, + "grad_norm": 0.7358832359313965, + "learning_rate": 0.0002879232822818603, + "loss": 4.7412, + "step": 68330 + }, + { + "epoch": 0.0960241114098887, + "grad_norm": 0.7570725083351135, + "learning_rate": 0.00028796543487424474, + "loss": 4.7562, + "step": 68340 + }, + { + "epoch": 0.09603816234805228, + "grad_norm": 0.7852720022201538, + "learning_rate": 0.00028800758746662917, + "loss": 4.6697, + "step": 68350 + }, + { + "epoch": 0.09605221328621585, + "grad_norm": 0.7732431888580322, + "learning_rate": 0.0002880497400590136, + "loss": 4.7449, + "step": 68360 + }, + { + "epoch": 0.09606626422437943, + "grad_norm": 0.7763086557388306, + "learning_rate": 0.00028809189265139803, + "loss": 4.7639, + "step": 68370 + }, + { + "epoch": 0.096080315162543, + "grad_norm": 0.7549687623977661, + "learning_rate": 0.00028813404524378246, + "loss": 4.7652, + "step": 68380 + }, + { + "epoch": 0.09609436610070658, + "grad_norm": 0.7978414297103882, + "learning_rate": 0.0002881761978361669, + "loss": 4.8397, + "step": 68390 + }, + { + "epoch": 0.09610841703887016, + "grad_norm": 0.7623574137687683, + "learning_rate": 0.00028821835042855133, + "loss": 4.7205, + "step": 68400 + }, + { + "epoch": 0.09612246797703373, + "grad_norm": 0.7838755249977112, + "learning_rate": 0.00028826050302093576, + "loss": 4.8583, + "step": 68410 + }, + { + "epoch": 0.09613651891519732, + "grad_norm": 0.7429123520851135, + "learning_rate": 0.0002883026556133202, + "loss": 4.9437, + "step": 68420 + }, + 
{ + "epoch": 0.0961505698533609, + "grad_norm": 0.7363834381103516, + "learning_rate": 0.0002883448082057046, + "loss": 4.8511, + "step": 68430 + }, + { + "epoch": 0.09616462079152448, + "grad_norm": 0.7411858439445496, + "learning_rate": 0.00028838696079808905, + "loss": 4.8572, + "step": 68440 + }, + { + "epoch": 0.09617867172968805, + "grad_norm": 0.8027516007423401, + "learning_rate": 0.0002884291133904735, + "loss": 4.7389, + "step": 68450 + }, + { + "epoch": 0.09619272266785163, + "grad_norm": 0.7942211031913757, + "learning_rate": 0.0002884712659828579, + "loss": 4.8992, + "step": 68460 + }, + { + "epoch": 0.0962067736060152, + "grad_norm": 0.7656008005142212, + "learning_rate": 0.00028851341857524235, + "loss": 4.9126, + "step": 68470 + }, + { + "epoch": 0.09622082454417878, + "grad_norm": 0.7987953424453735, + "learning_rate": 0.0002885555711676268, + "loss": 4.7477, + "step": 68480 + }, + { + "epoch": 0.09623487548234236, + "grad_norm": 0.7650423645973206, + "learning_rate": 0.0002885977237600112, + "loss": 4.8695, + "step": 68490 + }, + { + "epoch": 0.09624892642050593, + "grad_norm": 0.7581927180290222, + "learning_rate": 0.00028863987635239564, + "loss": 4.9698, + "step": 68500 + }, + { + "epoch": 0.09626297735866951, + "grad_norm": 0.7571653127670288, + "learning_rate": 0.00028868202894478007, + "loss": 4.7919, + "step": 68510 + }, + { + "epoch": 0.0962770282968331, + "grad_norm": 0.752564549446106, + "learning_rate": 0.0002887241815371645, + "loss": 4.7388, + "step": 68520 + }, + { + "epoch": 0.09629107923499668, + "grad_norm": 0.7321598529815674, + "learning_rate": 0.00028876633412954893, + "loss": 4.7807, + "step": 68530 + }, + { + "epoch": 0.09630513017316025, + "grad_norm": 0.7438421845436096, + "learning_rate": 0.00028880848672193336, + "loss": 4.8229, + "step": 68540 + }, + { + "epoch": 0.09631918111132383, + "grad_norm": 0.7429989576339722, + "learning_rate": 0.0002888506393143178, + "loss": 4.7067, + "step": 68550 + }, + { + "epoch": 
0.0963332320494874, + "grad_norm": 0.749879777431488, + "learning_rate": 0.0002888927919067022, + "loss": 4.8433, + "step": 68560 + }, + { + "epoch": 0.09634728298765098, + "grad_norm": 0.8272457718849182, + "learning_rate": 0.00028893494449908666, + "loss": 4.6546, + "step": 68570 + }, + { + "epoch": 0.09636133392581456, + "grad_norm": 0.7289469242095947, + "learning_rate": 0.0002889770970914711, + "loss": 4.8699, + "step": 68580 + }, + { + "epoch": 0.09637538486397813, + "grad_norm": 0.7849155068397522, + "learning_rate": 0.0002890192496838555, + "loss": 4.5681, + "step": 68590 + }, + { + "epoch": 0.09638943580214171, + "grad_norm": 0.7392459511756897, + "learning_rate": 0.00028906140227623995, + "loss": 4.8062, + "step": 68600 + }, + { + "epoch": 0.09640348674030529, + "grad_norm": 0.771606981754303, + "learning_rate": 0.00028910355486862444, + "loss": 4.7789, + "step": 68610 + }, + { + "epoch": 0.09641753767846888, + "grad_norm": 0.7495245337486267, + "learning_rate": 0.0002891457074610088, + "loss": 4.725, + "step": 68620 + }, + { + "epoch": 0.09643158861663245, + "grad_norm": 0.758076012134552, + "learning_rate": 0.00028918786005339324, + "loss": 4.8515, + "step": 68630 + }, + { + "epoch": 0.09644563955479603, + "grad_norm": 0.7334091663360596, + "learning_rate": 0.0002892300126457777, + "loss": 5.0253, + "step": 68640 + }, + { + "epoch": 0.0964596904929596, + "grad_norm": 0.7746126651763916, + "learning_rate": 0.0002892721652381621, + "loss": 4.7502, + "step": 68650 + }, + { + "epoch": 0.09647374143112318, + "grad_norm": 0.7248702645301819, + "learning_rate": 0.00028931431783054654, + "loss": 4.9443, + "step": 68660 + }, + { + "epoch": 0.09648779236928676, + "grad_norm": 0.8012595772743225, + "learning_rate": 0.000289356470422931, + "loss": 4.8209, + "step": 68670 + }, + { + "epoch": 0.09650184330745033, + "grad_norm": 0.7670885324478149, + "learning_rate": 0.0002893986230153154, + "loss": 4.7838, + "step": 68680 + }, + { + "epoch": 0.09651589424561391, + 
"grad_norm": 0.7456730008125305, + "learning_rate": 0.00028944077560769983, + "loss": 4.7761, + "step": 68690 + }, + { + "epoch": 0.09652994518377749, + "grad_norm": 0.727116584777832, + "learning_rate": 0.00028948292820008426, + "loss": 4.7292, + "step": 68700 + }, + { + "epoch": 0.09654399612194106, + "grad_norm": 0.758237898349762, + "learning_rate": 0.0002895250807924687, + "loss": 4.7278, + "step": 68710 + }, + { + "epoch": 0.09655804706010464, + "grad_norm": 0.7621706128120422, + "learning_rate": 0.0002895672333848531, + "loss": 4.7813, + "step": 68720 + }, + { + "epoch": 0.09657209799826823, + "grad_norm": 0.7740551829338074, + "learning_rate": 0.0002896093859772376, + "loss": 4.8008, + "step": 68730 + }, + { + "epoch": 0.0965861489364318, + "grad_norm": 0.7670646905899048, + "learning_rate": 0.00028965153856962204, + "loss": 4.762, + "step": 68740 + }, + { + "epoch": 0.09660019987459538, + "grad_norm": 0.76221764087677, + "learning_rate": 0.0002896936911620064, + "loss": 4.6957, + "step": 68750 + }, + { + "epoch": 0.09661425081275896, + "grad_norm": 0.7280772924423218, + "learning_rate": 0.00028973584375439085, + "loss": 4.8566, + "step": 68760 + }, + { + "epoch": 0.09662830175092253, + "grad_norm": 0.7589055299758911, + "learning_rate": 0.0002897779963467753, + "loss": 4.7532, + "step": 68770 + }, + { + "epoch": 0.09664235268908611, + "grad_norm": 0.8092201948165894, + "learning_rate": 0.0002898201489391597, + "loss": 4.7692, + "step": 68780 + }, + { + "epoch": 0.09665640362724968, + "grad_norm": 0.7353342175483704, + "learning_rate": 0.0002898623015315442, + "loss": 4.8688, + "step": 68790 + }, + { + "epoch": 0.09667045456541326, + "grad_norm": 0.7660097479820251, + "learning_rate": 0.00028990445412392863, + "loss": 4.809, + "step": 68800 + }, + { + "epoch": 0.09668450550357684, + "grad_norm": 0.7710687518119812, + "learning_rate": 0.000289946606716313, + "loss": 4.7622, + "step": 68810 + }, + { + "epoch": 0.09669855644174041, + "grad_norm": 
0.7906742095947266, + "learning_rate": 0.00028998875930869744, + "loss": 4.8203, + "step": 68820 + }, + { + "epoch": 0.096712607379904, + "grad_norm": 0.7781643271446228, + "learning_rate": 0.00029003091190108187, + "loss": 4.8479, + "step": 68830 + }, + { + "epoch": 0.09672665831806758, + "grad_norm": 0.7606348991394043, + "learning_rate": 0.0002900730644934663, + "loss": 4.8669, + "step": 68840 + }, + { + "epoch": 0.09674070925623116, + "grad_norm": 0.7359606623649597, + "learning_rate": 0.0002901152170858508, + "loss": 4.861, + "step": 68850 + }, + { + "epoch": 0.09675476019439473, + "grad_norm": 0.7380146384239197, + "learning_rate": 0.0002901573696782352, + "loss": 4.7876, + "step": 68860 + }, + { + "epoch": 0.09676881113255831, + "grad_norm": 0.7908768057823181, + "learning_rate": 0.00029019952227061965, + "loss": 4.765, + "step": 68870 + }, + { + "epoch": 0.09678286207072188, + "grad_norm": 0.7726009488105774, + "learning_rate": 0.000290241674863004, + "loss": 4.6684, + "step": 68880 + }, + { + "epoch": 0.09679691300888546, + "grad_norm": 0.7417333126068115, + "learning_rate": 0.00029028382745538845, + "loss": 4.8782, + "step": 68890 + }, + { + "epoch": 0.09681096394704904, + "grad_norm": 0.7703368067741394, + "learning_rate": 0.0002903259800477729, + "loss": 4.8342, + "step": 68900 + }, + { + "epoch": 0.09682501488521261, + "grad_norm": 0.774889349937439, + "learning_rate": 0.00029036813264015737, + "loss": 4.8245, + "step": 68910 + }, + { + "epoch": 0.09683906582337619, + "grad_norm": 0.7258095145225525, + "learning_rate": 0.0002904102852325418, + "loss": 4.7255, + "step": 68920 + }, + { + "epoch": 0.09685311676153978, + "grad_norm": 0.8192217946052551, + "learning_rate": 0.00029045243782492623, + "loss": 4.7965, + "step": 68930 + }, + { + "epoch": 0.09686716769970335, + "grad_norm": 0.7315029501914978, + "learning_rate": 0.0002904945904173106, + "loss": 4.6835, + "step": 68940 + }, + { + "epoch": 0.09688121863786693, + "grad_norm": 0.7798453569412231, + 
"learning_rate": 0.00029053674300969504, + "loss": 4.8065, + "step": 68950 + }, + { + "epoch": 0.09689526957603051, + "grad_norm": 0.7461909651756287, + "learning_rate": 0.00029057889560207947, + "loss": 4.8489, + "step": 68960 + }, + { + "epoch": 0.09690932051419408, + "grad_norm": 0.7602362036705017, + "learning_rate": 0.00029062104819446396, + "loss": 4.7368, + "step": 68970 + }, + { + "epoch": 0.09692337145235766, + "grad_norm": 0.7881399989128113, + "learning_rate": 0.0002906632007868484, + "loss": 4.8113, + "step": 68980 + }, + { + "epoch": 0.09693742239052124, + "grad_norm": 0.7365151643753052, + "learning_rate": 0.0002907053533792328, + "loss": 4.873, + "step": 68990 + }, + { + "epoch": 0.09695147332868481, + "grad_norm": 0.7622225284576416, + "learning_rate": 0.00029074750597161725, + "loss": 4.7345, + "step": 69000 + }, + { + "epoch": 0.09696552426684839, + "grad_norm": 0.7960168123245239, + "learning_rate": 0.00029078965856400163, + "loss": 4.6768, + "step": 69010 + }, + { + "epoch": 0.09697957520501196, + "grad_norm": 0.7706853747367859, + "learning_rate": 0.00029083181115638606, + "loss": 4.8081, + "step": 69020 + }, + { + "epoch": 0.09699362614317554, + "grad_norm": 0.7176666259765625, + "learning_rate": 0.00029087396374877054, + "loss": 4.8542, + "step": 69030 + }, + { + "epoch": 0.09700767708133913, + "grad_norm": 0.7501884698867798, + "learning_rate": 0.000290916116341155, + "loss": 4.7623, + "step": 69040 + }, + { + "epoch": 0.0970217280195027, + "grad_norm": 0.7589862942695618, + "learning_rate": 0.0002909582689335394, + "loss": 4.8451, + "step": 69050 + }, + { + "epoch": 0.09703577895766628, + "grad_norm": 0.7779807448387146, + "learning_rate": 0.00029100042152592384, + "loss": 4.7506, + "step": 69060 + }, + { + "epoch": 0.09704982989582986, + "grad_norm": 0.745506227016449, + "learning_rate": 0.0002910425741183082, + "loss": 4.7947, + "step": 69070 + }, + { + "epoch": 0.09706388083399344, + "grad_norm": 0.7517091035842896, + "learning_rate": 
0.00029108472671069265, + "loss": 4.7302, + "step": 69080 + }, + { + "epoch": 0.09707793177215701, + "grad_norm": 0.7594832181930542, + "learning_rate": 0.00029112687930307713, + "loss": 4.7494, + "step": 69090 + }, + { + "epoch": 0.09709198271032059, + "grad_norm": 0.7366914749145508, + "learning_rate": 0.00029116903189546156, + "loss": 4.7735, + "step": 69100 + }, + { + "epoch": 0.09710603364848416, + "grad_norm": 0.7551494836807251, + "learning_rate": 0.000291211184487846, + "loss": 4.8079, + "step": 69110 + }, + { + "epoch": 0.09712008458664774, + "grad_norm": 0.7677338123321533, + "learning_rate": 0.0002912533370802304, + "loss": 4.7275, + "step": 69120 + }, + { + "epoch": 0.09713413552481132, + "grad_norm": 0.7898170948028564, + "learning_rate": 0.00029129548967261486, + "loss": 4.7946, + "step": 69130 + }, + { + "epoch": 0.0971481864629749, + "grad_norm": 0.7417213320732117, + "learning_rate": 0.00029133764226499923, + "loss": 4.7658, + "step": 69140 + }, + { + "epoch": 0.09716223740113848, + "grad_norm": 0.7414938807487488, + "learning_rate": 0.0002913797948573837, + "loss": 4.8031, + "step": 69150 + }, + { + "epoch": 0.09717628833930206, + "grad_norm": 0.7222784161567688, + "learning_rate": 0.00029142194744976815, + "loss": 4.7848, + "step": 69160 + }, + { + "epoch": 0.09719033927746563, + "grad_norm": 0.7901402115821838, + "learning_rate": 0.0002914641000421526, + "loss": 4.7291, + "step": 69170 + }, + { + "epoch": 0.09720439021562921, + "grad_norm": 0.7619618773460388, + "learning_rate": 0.000291506252634537, + "loss": 4.772, + "step": 69180 + }, + { + "epoch": 0.09721844115379279, + "grad_norm": 0.7558301687240601, + "learning_rate": 0.00029154840522692144, + "loss": 4.6799, + "step": 69190 + }, + { + "epoch": 0.09723249209195636, + "grad_norm": 0.7339478731155396, + "learning_rate": 0.0002915905578193059, + "loss": 4.699, + "step": 69200 + }, + { + "epoch": 0.09724654303011994, + "grad_norm": 0.7412484288215637, + "learning_rate": 
0.0002916327104116903, + "loss": 4.7946, + "step": 69210 + }, + { + "epoch": 0.09726059396828352, + "grad_norm": 0.7897161245346069, + "learning_rate": 0.00029167486300407474, + "loss": 4.7929, + "step": 69220 + }, + { + "epoch": 0.09727464490644709, + "grad_norm": 0.7667302489280701, + "learning_rate": 0.00029171701559645917, + "loss": 4.7734, + "step": 69230 + }, + { + "epoch": 0.09728869584461068, + "grad_norm": 0.7448514699935913, + "learning_rate": 0.0002917591681888436, + "loss": 4.7505, + "step": 69240 + }, + { + "epoch": 0.09730274678277426, + "grad_norm": 0.7377836108207703, + "learning_rate": 0.00029180132078122803, + "loss": 4.6303, + "step": 69250 + }, + { + "epoch": 0.09731679772093783, + "grad_norm": 0.742834746837616, + "learning_rate": 0.00029184347337361246, + "loss": 4.8095, + "step": 69260 + }, + { + "epoch": 0.09733084865910141, + "grad_norm": 0.7830805778503418, + "learning_rate": 0.0002918856259659969, + "loss": 4.7535, + "step": 69270 + }, + { + "epoch": 0.09734489959726499, + "grad_norm": 0.7334023118019104, + "learning_rate": 0.0002919277785583813, + "loss": 4.8267, + "step": 69280 + }, + { + "epoch": 0.09735895053542856, + "grad_norm": 0.7822585105895996, + "learning_rate": 0.00029196993115076576, + "loss": 4.743, + "step": 69290 + }, + { + "epoch": 0.09737300147359214, + "grad_norm": 0.7647944092750549, + "learning_rate": 0.0002920120837431502, + "loss": 4.7674, + "step": 69300 + }, + { + "epoch": 0.09738705241175571, + "grad_norm": 0.7947744727134705, + "learning_rate": 0.0002920542363355346, + "loss": 4.7308, + "step": 69310 + }, + { + "epoch": 0.09740110334991929, + "grad_norm": 0.7814948558807373, + "learning_rate": 0.0002920921736686806, + "loss": 4.8563, + "step": 69320 + }, + { + "epoch": 0.09741515428808287, + "grad_norm": 0.7444278001785278, + "learning_rate": 0.00029213432626106503, + "loss": 4.7429, + "step": 69330 + }, + { + "epoch": 0.09742920522624644, + "grad_norm": 0.7391352653503418, + "learning_rate": 
0.00029217647885344946, + "loss": 4.6809, + "step": 69340 + }, + { + "epoch": 0.09744325616441003, + "grad_norm": 0.7300538420677185, + "learning_rate": 0.0002922186314458339, + "loss": 4.8616, + "step": 69350 + }, + { + "epoch": 0.09745730710257361, + "grad_norm": 0.7259089350700378, + "learning_rate": 0.0002922607840382183, + "loss": 4.7652, + "step": 69360 + }, + { + "epoch": 0.09747135804073719, + "grad_norm": 0.7582126259803772, + "learning_rate": 0.00029230293663060276, + "loss": 4.7776, + "step": 69370 + }, + { + "epoch": 0.09748540897890076, + "grad_norm": 0.7297702431678772, + "learning_rate": 0.0002923450892229872, + "loss": 4.7676, + "step": 69380 + }, + { + "epoch": 0.09749945991706434, + "grad_norm": 0.753105878829956, + "learning_rate": 0.0002923872418153716, + "loss": 4.7513, + "step": 69390 + }, + { + "epoch": 0.09751351085522791, + "grad_norm": 0.7406522631645203, + "learning_rate": 0.00029242939440775605, + "loss": 4.8253, + "step": 69400 + }, + { + "epoch": 0.09752756179339149, + "grad_norm": 0.8035889863967896, + "learning_rate": 0.0002924715470001405, + "loss": 4.7889, + "step": 69410 + }, + { + "epoch": 0.09754161273155507, + "grad_norm": 0.7537881731987, + "learning_rate": 0.0002925136995925249, + "loss": 4.8387, + "step": 69420 + }, + { + "epoch": 0.09755566366971864, + "grad_norm": 0.7590875625610352, + "learning_rate": 0.00029255585218490934, + "loss": 4.8179, + "step": 69430 + }, + { + "epoch": 0.09756971460788222, + "grad_norm": 0.7489945292472839, + "learning_rate": 0.0002925980047772938, + "loss": 4.8359, + "step": 69440 + }, + { + "epoch": 0.09758376554604581, + "grad_norm": 0.7501441240310669, + "learning_rate": 0.0002926401573696782, + "loss": 4.7967, + "step": 69450 + }, + { + "epoch": 0.09759781648420938, + "grad_norm": 0.7316177487373352, + "learning_rate": 0.00029268230996206264, + "loss": 4.772, + "step": 69460 + }, + { + "epoch": 0.09761186742237296, + "grad_norm": 0.7441616654396057, + "learning_rate": 0.00029272446255444707, 
+ "loss": 4.7145, + "step": 69470 + }, + { + "epoch": 0.09762591836053654, + "grad_norm": 0.733923077583313, + "learning_rate": 0.0002927666151468315, + "loss": 4.8215, + "step": 69480 + }, + { + "epoch": 0.09763996929870011, + "grad_norm": 0.7444280385971069, + "learning_rate": 0.00029280876773921593, + "loss": 4.8405, + "step": 69490 + }, + { + "epoch": 0.09765402023686369, + "grad_norm": 0.7458165287971497, + "learning_rate": 0.00029285092033160036, + "loss": 4.7594, + "step": 69500 + }, + { + "epoch": 0.09766807117502727, + "grad_norm": 0.7323271632194519, + "learning_rate": 0.0002928930729239848, + "loss": 4.6669, + "step": 69510 + }, + { + "epoch": 0.09768212211319084, + "grad_norm": 0.7334503531455994, + "learning_rate": 0.0002929352255163692, + "loss": 4.8641, + "step": 69520 + }, + { + "epoch": 0.09769617305135442, + "grad_norm": 0.747018039226532, + "learning_rate": 0.00029297737810875365, + "loss": 4.8795, + "step": 69530 + }, + { + "epoch": 0.097710223989518, + "grad_norm": 0.7300179600715637, + "learning_rate": 0.0002930195307011381, + "loss": 4.6616, + "step": 69540 + }, + { + "epoch": 0.09772427492768158, + "grad_norm": 0.7466294765472412, + "learning_rate": 0.0002930616832935225, + "loss": 4.8153, + "step": 69550 + }, + { + "epoch": 0.09773832586584516, + "grad_norm": 0.7623385787010193, + "learning_rate": 0.00029310383588590695, + "loss": 4.7444, + "step": 69560 + }, + { + "epoch": 0.09775237680400874, + "grad_norm": 0.7668961882591248, + "learning_rate": 0.0002931459884782914, + "loss": 4.8579, + "step": 69570 + }, + { + "epoch": 0.09776642774217231, + "grad_norm": 0.7659717202186584, + "learning_rate": 0.0002931881410706758, + "loss": 4.6188, + "step": 69580 + }, + { + "epoch": 0.09778047868033589, + "grad_norm": 0.7393763065338135, + "learning_rate": 0.00029323029366306024, + "loss": 4.8071, + "step": 69590 + }, + { + "epoch": 0.09779452961849947, + "grad_norm": 0.7585105299949646, + "learning_rate": 0.00029327244625544467, + "loss": 4.7279, + 
"step": 69600 + }, + { + "epoch": 0.09780858055666304, + "grad_norm": 0.9059628844261169, + "learning_rate": 0.0002933145988478291, + "loss": 4.931, + "step": 69610 + }, + { + "epoch": 0.09782263149482662, + "grad_norm": 0.7281200289726257, + "learning_rate": 0.00029335675144021354, + "loss": 4.8205, + "step": 69620 + }, + { + "epoch": 0.0978366824329902, + "grad_norm": 0.7609696984291077, + "learning_rate": 0.00029339890403259797, + "loss": 4.8716, + "step": 69630 + }, + { + "epoch": 0.09785073337115377, + "grad_norm": 0.7602007389068604, + "learning_rate": 0.0002934410566249824, + "loss": 4.8642, + "step": 69640 + }, + { + "epoch": 0.09786478430931735, + "grad_norm": 0.7322178483009338, + "learning_rate": 0.00029348320921736683, + "loss": 4.8271, + "step": 69650 + }, + { + "epoch": 0.09787883524748094, + "grad_norm": 0.757675290107727, + "learning_rate": 0.00029352536180975126, + "loss": 4.8059, + "step": 69660 + }, + { + "epoch": 0.09789288618564451, + "grad_norm": 0.770977258682251, + "learning_rate": 0.0002935675144021357, + "loss": 4.7335, + "step": 69670 + }, + { + "epoch": 0.09790693712380809, + "grad_norm": 0.7639873623847961, + "learning_rate": 0.0002936096669945201, + "loss": 4.7993, + "step": 69680 + }, + { + "epoch": 0.09792098806197166, + "grad_norm": 0.7296125888824463, + "learning_rate": 0.00029365181958690455, + "loss": 4.9388, + "step": 69690 + }, + { + "epoch": 0.09793503900013524, + "grad_norm": 0.7537563443183899, + "learning_rate": 0.000293693972179289, + "loss": 4.8099, + "step": 69700 + }, + { + "epoch": 0.09794908993829882, + "grad_norm": 0.7601600289344788, + "learning_rate": 0.0002937361247716734, + "loss": 4.7219, + "step": 69710 + }, + { + "epoch": 0.09796314087646239, + "grad_norm": 0.7119511365890503, + "learning_rate": 0.00029377827736405785, + "loss": 4.7607, + "step": 69720 + }, + { + "epoch": 0.09797719181462597, + "grad_norm": 0.7661904096603394, + "learning_rate": 0.0002938204299564423, + "loss": 4.7596, + "step": 69730 + }, + { 
+ "epoch": 0.09799124275278955, + "grad_norm": 0.7884121537208557, + "learning_rate": 0.00029386258254882676, + "loss": 4.7573, + "step": 69740 + }, + { + "epoch": 0.09800529369095312, + "grad_norm": 0.7419071793556213, + "learning_rate": 0.00029390473514121114, + "loss": 4.852, + "step": 69750 + }, + { + "epoch": 0.09801934462911671, + "grad_norm": 0.8159481287002563, + "learning_rate": 0.00029394688773359557, + "loss": 4.6828, + "step": 69760 + }, + { + "epoch": 0.09803339556728029, + "grad_norm": 0.755358874797821, + "learning_rate": 0.00029398904032598, + "loss": 4.7223, + "step": 69770 + }, + { + "epoch": 0.09804744650544386, + "grad_norm": 0.7439599633216858, + "learning_rate": 0.00029403119291836443, + "loss": 4.7543, + "step": 69780 + }, + { + "epoch": 0.09806149744360744, + "grad_norm": 0.7651283740997314, + "learning_rate": 0.0002940733455107489, + "loss": 4.8595, + "step": 69790 + }, + { + "epoch": 0.09807554838177102, + "grad_norm": 0.7742921113967896, + "learning_rate": 0.00029411549810313335, + "loss": 4.6861, + "step": 69800 + }, + { + "epoch": 0.09808959931993459, + "grad_norm": 0.7510424256324768, + "learning_rate": 0.00029415765069551773, + "loss": 4.7856, + "step": 69810 + }, + { + "epoch": 0.09810365025809817, + "grad_norm": 0.7651505470275879, + "learning_rate": 0.00029419980328790216, + "loss": 4.7194, + "step": 69820 + }, + { + "epoch": 0.09811770119626174, + "grad_norm": 0.7609485387802124, + "learning_rate": 0.0002942419558802866, + "loss": 4.7036, + "step": 69830 + }, + { + "epoch": 0.09813175213442532, + "grad_norm": 0.7432272434234619, + "learning_rate": 0.000294284108472671, + "loss": 4.7723, + "step": 69840 + }, + { + "epoch": 0.0981458030725889, + "grad_norm": 0.7429347038269043, + "learning_rate": 0.0002943262610650555, + "loss": 4.7675, + "step": 69850 + }, + { + "epoch": 0.09815985401075249, + "grad_norm": 0.7958447933197021, + "learning_rate": 0.00029436841365743994, + "loss": 4.7454, + "step": 69860 + }, + { + "epoch": 
0.09817390494891606, + "grad_norm": 0.7396420836448669, + "learning_rate": 0.00029441056624982437, + "loss": 4.6856, + "step": 69870 + }, + { + "epoch": 0.09818795588707964, + "grad_norm": 0.7574613690376282, + "learning_rate": 0.00029445271884220875, + "loss": 4.8366, + "step": 69880 + }, + { + "epoch": 0.09820200682524322, + "grad_norm": 0.7506481409072876, + "learning_rate": 0.0002944948714345932, + "loss": 4.6661, + "step": 69890 + }, + { + "epoch": 0.09821605776340679, + "grad_norm": 0.7857638001441956, + "learning_rate": 0.0002945370240269776, + "loss": 4.7884, + "step": 69900 + }, + { + "epoch": 0.09823010870157037, + "grad_norm": 0.7581502199172974, + "learning_rate": 0.0002945791766193621, + "loss": 4.7728, + "step": 69910 + }, + { + "epoch": 0.09824415963973394, + "grad_norm": 0.7380070090293884, + "learning_rate": 0.0002946213292117465, + "loss": 4.7034, + "step": 69920 + }, + { + "epoch": 0.09825821057789752, + "grad_norm": 0.719098687171936, + "learning_rate": 0.00029466348180413096, + "loss": 4.8379, + "step": 69930 + }, + { + "epoch": 0.0982722615160611, + "grad_norm": 0.7294281125068665, + "learning_rate": 0.00029470563439651533, + "loss": 4.8387, + "step": 69940 + }, + { + "epoch": 0.09828631245422467, + "grad_norm": 0.7245777249336243, + "learning_rate": 0.00029474778698889976, + "loss": 4.9148, + "step": 69950 + }, + { + "epoch": 0.09830036339238825, + "grad_norm": 0.7560135126113892, + "learning_rate": 0.0002947899395812842, + "loss": 4.8186, + "step": 69960 + }, + { + "epoch": 0.09831441433055184, + "grad_norm": 0.7155381441116333, + "learning_rate": 0.0002948320921736687, + "loss": 4.8529, + "step": 69970 + }, + { + "epoch": 0.09832846526871541, + "grad_norm": 0.751598060131073, + "learning_rate": 0.0002948742447660531, + "loss": 4.6971, + "step": 69980 + }, + { + "epoch": 0.09834251620687899, + "grad_norm": 0.7454250454902649, + "learning_rate": 0.00029491639735843754, + "loss": 4.8887, + "step": 69990 + }, + { + "epoch": 0.09835656714504257, 
+ "grad_norm": 0.745444118976593, + "learning_rate": 0.000294958549950822, + "loss": 4.8025, + "step": 70000 + }, + { + "epoch": 0.09837061808320614, + "grad_norm": 0.7454139590263367, + "learning_rate": 0.00029500070254320635, + "loss": 4.8269, + "step": 70010 + }, + { + "epoch": 0.09838466902136972, + "grad_norm": 0.7288814187049866, + "learning_rate": 0.0002950428551355908, + "loss": 4.8821, + "step": 70020 + }, + { + "epoch": 0.0983987199595333, + "grad_norm": 0.7203482985496521, + "learning_rate": 0.00029508500772797527, + "loss": 4.7518, + "step": 70030 + }, + { + "epoch": 0.09841277089769687, + "grad_norm": 0.7820789813995361, + "learning_rate": 0.0002951271603203597, + "loss": 4.7813, + "step": 70040 + }, + { + "epoch": 0.09842682183586045, + "grad_norm": 0.7446337938308716, + "learning_rate": 0.00029516931291274413, + "loss": 4.7992, + "step": 70050 + }, + { + "epoch": 0.09844087277402402, + "grad_norm": 0.7281479835510254, + "learning_rate": 0.00029521146550512856, + "loss": 4.8715, + "step": 70060 + }, + { + "epoch": 0.09845492371218761, + "grad_norm": 0.7503718137741089, + "learning_rate": 0.00029525361809751294, + "loss": 4.7856, + "step": 70070 + }, + { + "epoch": 0.09846897465035119, + "grad_norm": 0.7504997849464417, + "learning_rate": 0.00029529577068989737, + "loss": 4.7455, + "step": 70080 + }, + { + "epoch": 0.09848302558851477, + "grad_norm": 0.755368709564209, + "learning_rate": 0.00029533792328228185, + "loss": 4.7809, + "step": 70090 + }, + { + "epoch": 0.09849707652667834, + "grad_norm": 0.7570355534553528, + "learning_rate": 0.0002953800758746663, + "loss": 4.7163, + "step": 70100 + }, + { + "epoch": 0.09851112746484192, + "grad_norm": 0.7263005971908569, + "learning_rate": 0.0002954222284670507, + "loss": 4.8162, + "step": 70110 + }, + { + "epoch": 0.0985251784030055, + "grad_norm": 0.7369113564491272, + "learning_rate": 0.00029546438105943515, + "loss": 4.7484, + "step": 70120 + }, + { + "epoch": 0.09853922934116907, + "grad_norm": 
0.7835990190505981, + "learning_rate": 0.0002955065336518196, + "loss": 4.7219, + "step": 70130 + }, + { + "epoch": 0.09855328027933265, + "grad_norm": 0.7315928936004639, + "learning_rate": 0.00029554868624420396, + "loss": 4.7446, + "step": 70140 + }, + { + "epoch": 0.09856733121749622, + "grad_norm": 0.7778264284133911, + "learning_rate": 0.00029559083883658844, + "loss": 4.7838, + "step": 70150 + }, + { + "epoch": 0.0985813821556598, + "grad_norm": 0.7490993142127991, + "learning_rate": 0.00029563299142897287, + "loss": 4.7523, + "step": 70160 + }, + { + "epoch": 0.09859543309382339, + "grad_norm": 0.728198766708374, + "learning_rate": 0.0002956751440213573, + "loss": 4.7791, + "step": 70170 + }, + { + "epoch": 0.09860948403198697, + "grad_norm": 0.7419266700744629, + "learning_rate": 0.00029571729661374173, + "loss": 4.7178, + "step": 70180 + }, + { + "epoch": 0.09862353497015054, + "grad_norm": 0.7586491703987122, + "learning_rate": 0.00029575944920612617, + "loss": 4.7541, + "step": 70190 + }, + { + "epoch": 0.09863758590831412, + "grad_norm": 0.7712236046791077, + "learning_rate": 0.00029580160179851054, + "loss": 4.7473, + "step": 70200 + }, + { + "epoch": 0.0986516368464777, + "grad_norm": 0.7614474892616272, + "learning_rate": 0.00029584375439089503, + "loss": 4.7263, + "step": 70210 + }, + { + "epoch": 0.09866568778464127, + "grad_norm": 0.7652673125267029, + "learning_rate": 0.00029588590698327946, + "loss": 4.7051, + "step": 70220 + }, + { + "epoch": 0.09867973872280485, + "grad_norm": 0.7385839223861694, + "learning_rate": 0.0002959280595756639, + "loss": 4.7849, + "step": 70230 + }, + { + "epoch": 0.09869378966096842, + "grad_norm": 0.8170595765113831, + "learning_rate": 0.0002959702121680483, + "loss": 4.8655, + "step": 70240 + }, + { + "epoch": 0.098707840599132, + "grad_norm": 0.7230207324028015, + "learning_rate": 0.00029601236476043275, + "loss": 4.8548, + "step": 70250 + }, + { + "epoch": 0.09872189153729558, + "grad_norm": 0.7606765031814575, 
+ "learning_rate": 0.0002960545173528172, + "loss": 4.7995, + "step": 70260 + }, + { + "epoch": 0.09873594247545917, + "grad_norm": 0.7488109469413757, + "learning_rate": 0.0002960966699452016, + "loss": 4.827, + "step": 70270 + }, + { + "epoch": 0.09874999341362274, + "grad_norm": 0.7458518743515015, + "learning_rate": 0.00029613882253758605, + "loss": 4.8889, + "step": 70280 + }, + { + "epoch": 0.09876404435178632, + "grad_norm": 0.7448275089263916, + "learning_rate": 0.0002961809751299705, + "loss": 4.7753, + "step": 70290 + }, + { + "epoch": 0.0987780952899499, + "grad_norm": 0.7387959957122803, + "learning_rate": 0.0002962231277223549, + "loss": 4.738, + "step": 70300 + }, + { + "epoch": 0.09879214622811347, + "grad_norm": 0.7459929585456848, + "learning_rate": 0.00029626528031473934, + "loss": 4.6784, + "step": 70310 + }, + { + "epoch": 0.09880619716627705, + "grad_norm": 0.7338137030601501, + "learning_rate": 0.00029630743290712377, + "loss": 4.8043, + "step": 70320 + }, + { + "epoch": 0.09882024810444062, + "grad_norm": 0.7475922107696533, + "learning_rate": 0.0002963495854995082, + "loss": 4.8699, + "step": 70330 + }, + { + "epoch": 0.0988342990426042, + "grad_norm": 0.7375343441963196, + "learning_rate": 0.00029639173809189263, + "loss": 4.7544, + "step": 70340 + }, + { + "epoch": 0.09884834998076777, + "grad_norm": 0.7328280210494995, + "learning_rate": 0.00029643389068427706, + "loss": 4.7604, + "step": 70350 + }, + { + "epoch": 0.09886240091893135, + "grad_norm": 0.723840594291687, + "learning_rate": 0.0002964760432766615, + "loss": 4.888, + "step": 70360 + }, + { + "epoch": 0.09887645185709493, + "grad_norm": 0.7332075238227844, + "learning_rate": 0.0002965181958690459, + "loss": 4.7549, + "step": 70370 + }, + { + "epoch": 0.09889050279525852, + "grad_norm": 0.7340726852416992, + "learning_rate": 0.00029656034846143036, + "loss": 4.7467, + "step": 70380 + }, + { + "epoch": 0.0989045537334221, + "grad_norm": 0.7343766093254089, + "learning_rate": 
0.0002966025010538148, + "loss": 4.7236, + "step": 70390 + }, + { + "epoch": 0.09891860467158567, + "grad_norm": 0.7564096450805664, + "learning_rate": 0.0002966446536461992, + "loss": 4.6312, + "step": 70400 + }, + { + "epoch": 0.09893265560974925, + "grad_norm": 0.7509664297103882, + "learning_rate": 0.00029668680623858365, + "loss": 4.7879, + "step": 70410 + }, + { + "epoch": 0.09894670654791282, + "grad_norm": 0.7478966116905212, + "learning_rate": 0.0002967289588309681, + "loss": 4.8726, + "step": 70420 + }, + { + "epoch": 0.0989607574860764, + "grad_norm": 0.7956492304801941, + "learning_rate": 0.0002967711114233525, + "loss": 4.8496, + "step": 70430 + }, + { + "epoch": 0.09897480842423997, + "grad_norm": 0.7533037662506104, + "learning_rate": 0.00029681326401573694, + "loss": 4.8447, + "step": 70440 + }, + { + "epoch": 0.09898885936240355, + "grad_norm": 0.7333579063415527, + "learning_rate": 0.0002968554166081214, + "loss": 4.7172, + "step": 70450 + }, + { + "epoch": 0.09900291030056713, + "grad_norm": 0.745343029499054, + "learning_rate": 0.0002968975692005058, + "loss": 4.7113, + "step": 70460 + }, + { + "epoch": 0.0990169612387307, + "grad_norm": 0.7377544045448303, + "learning_rate": 0.00029693972179289024, + "loss": 5.0384, + "step": 70470 + }, + { + "epoch": 0.09903101217689429, + "grad_norm": 0.756903886795044, + "learning_rate": 0.00029698187438527467, + "loss": 4.6786, + "step": 70480 + }, + { + "epoch": 0.09904506311505787, + "grad_norm": 0.7782666087150574, + "learning_rate": 0.0002970240269776591, + "loss": 4.8214, + "step": 70490 + }, + { + "epoch": 0.09905911405322144, + "grad_norm": 0.7528187036514282, + "learning_rate": 0.00029706617957004353, + "loss": 4.7897, + "step": 70500 + }, + { + "epoch": 0.09907316499138502, + "grad_norm": 0.7484343647956848, + "learning_rate": 0.00029710833216242796, + "loss": 4.7155, + "step": 70510 + }, + { + "epoch": 0.0990872159295486, + "grad_norm": 0.7250393629074097, + "learning_rate": 0.0002971504847548124, 
+ "loss": 4.8713, + "step": 70520 + }, + { + "epoch": 0.09910126686771217, + "grad_norm": 0.7773247957229614, + "learning_rate": 0.0002971926373471968, + "loss": 4.7164, + "step": 70530 + }, + { + "epoch": 0.09911531780587575, + "grad_norm": 0.729132890701294, + "learning_rate": 0.00029723478993958126, + "loss": 4.7736, + "step": 70540 + }, + { + "epoch": 0.09912936874403933, + "grad_norm": 0.7351367473602295, + "learning_rate": 0.0002972769425319657, + "loss": 4.9012, + "step": 70550 + }, + { + "epoch": 0.0991434196822029, + "grad_norm": 0.7388243675231934, + "learning_rate": 0.0002973190951243501, + "loss": 4.6448, + "step": 70560 + }, + { + "epoch": 0.09915747062036648, + "grad_norm": 0.7806434631347656, + "learning_rate": 0.00029736124771673455, + "loss": 4.7922, + "step": 70570 + }, + { + "epoch": 0.09917152155853007, + "grad_norm": 0.788178026676178, + "learning_rate": 0.000297403400309119, + "loss": 4.7924, + "step": 70580 + }, + { + "epoch": 0.09918557249669364, + "grad_norm": 0.7238157391548157, + "learning_rate": 0.0002974455529015034, + "loss": 4.8356, + "step": 70590 + }, + { + "epoch": 0.09919962343485722, + "grad_norm": 0.7454667687416077, + "learning_rate": 0.00029748770549388784, + "loss": 4.7332, + "step": 70600 + }, + { + "epoch": 0.0992136743730208, + "grad_norm": 0.7305548191070557, + "learning_rate": 0.0002975298580862723, + "loss": 4.8754, + "step": 70610 + }, + { + "epoch": 0.09922772531118437, + "grad_norm": 0.7459691762924194, + "learning_rate": 0.0002975720106786567, + "loss": 4.7751, + "step": 70620 + }, + { + "epoch": 0.09924177624934795, + "grad_norm": 0.7162081599235535, + "learning_rate": 0.00029761416327104114, + "loss": 4.8116, + "step": 70630 + }, + { + "epoch": 0.09925582718751153, + "grad_norm": 0.7404140830039978, + "learning_rate": 0.00029765631586342557, + "loss": 4.8285, + "step": 70640 + }, + { + "epoch": 0.0992698781256751, + "grad_norm": 0.7442372441291809, + "learning_rate": 0.00029769846845581, + "loss": 4.7367, + 
"step": 70650 + }, + { + "epoch": 0.09928392906383868, + "grad_norm": 0.7542151212692261, + "learning_rate": 0.00029774062104819443, + "loss": 4.7881, + "step": 70660 + }, + { + "epoch": 0.09929798000200225, + "grad_norm": 0.7247451543807983, + "learning_rate": 0.00029778277364057886, + "loss": 4.7453, + "step": 70670 + }, + { + "epoch": 0.09931203094016583, + "grad_norm": 0.7268667221069336, + "learning_rate": 0.0002978249262329633, + "loss": 4.7807, + "step": 70680 + }, + { + "epoch": 0.09932608187832942, + "grad_norm": 0.7007578611373901, + "learning_rate": 0.0002978670788253477, + "loss": 4.8148, + "step": 70690 + }, + { + "epoch": 0.099340132816493, + "grad_norm": 0.7512852549552917, + "learning_rate": 0.00029790923141773216, + "loss": 4.7867, + "step": 70700 + }, + { + "epoch": 0.09935418375465657, + "grad_norm": 0.7661580443382263, + "learning_rate": 0.0002979513840101166, + "loss": 4.721, + "step": 70710 + }, + { + "epoch": 0.09936823469282015, + "grad_norm": 0.7639215588569641, + "learning_rate": 0.000297993536602501, + "loss": 4.7987, + "step": 70720 + }, + { + "epoch": 0.09938228563098372, + "grad_norm": 0.7315471172332764, + "learning_rate": 0.00029803568919488545, + "loss": 4.9177, + "step": 70730 + }, + { + "epoch": 0.0993963365691473, + "grad_norm": 0.7605272531509399, + "learning_rate": 0.0002980778417872699, + "loss": 4.8585, + "step": 70740 + }, + { + "epoch": 0.09941038750731088, + "grad_norm": 0.71612948179245, + "learning_rate": 0.0002981199943796543, + "loss": 4.7394, + "step": 70750 + }, + { + "epoch": 0.09942443844547445, + "grad_norm": 0.7437877058982849, + "learning_rate": 0.00029816214697203874, + "loss": 4.7909, + "step": 70760 + }, + { + "epoch": 0.09943848938363803, + "grad_norm": 0.7186173796653748, + "learning_rate": 0.0002982042995644232, + "loss": 4.7169, + "step": 70770 + }, + { + "epoch": 0.0994525403218016, + "grad_norm": 0.7542218565940857, + "learning_rate": 0.0002982464521568076, + "loss": 4.7704, + "step": 70780 + }, + { + 
"epoch": 0.0994665912599652, + "grad_norm": 0.7415812611579895, + "learning_rate": 0.00029828860474919204, + "loss": 4.8173, + "step": 70790 + }, + { + "epoch": 0.09948064219812877, + "grad_norm": 0.7335118651390076, + "learning_rate": 0.00029833075734157647, + "loss": 4.8941, + "step": 70800 + }, + { + "epoch": 0.09949469313629235, + "grad_norm": 0.7257421612739563, + "learning_rate": 0.0002983729099339609, + "loss": 4.8061, + "step": 70810 + }, + { + "epoch": 0.09950874407445592, + "grad_norm": 0.7506051659584045, + "learning_rate": 0.00029841506252634533, + "loss": 4.7745, + "step": 70820 + }, + { + "epoch": 0.0995227950126195, + "grad_norm": 0.765600323677063, + "learning_rate": 0.00029845721511872976, + "loss": 4.7226, + "step": 70830 + }, + { + "epoch": 0.09953684595078308, + "grad_norm": 0.7521647214889526, + "learning_rate": 0.00029849936771111425, + "loss": 4.8212, + "step": 70840 + }, + { + "epoch": 0.09955089688894665, + "grad_norm": 0.7769646644592285, + "learning_rate": 0.0002985415203034986, + "loss": 4.7042, + "step": 70850 + }, + { + "epoch": 0.09956494782711023, + "grad_norm": 0.751598060131073, + "learning_rate": 0.00029858367289588305, + "loss": 4.7392, + "step": 70860 + }, + { + "epoch": 0.0995789987652738, + "grad_norm": 0.7508914470672607, + "learning_rate": 0.0002986258254882675, + "loss": 4.8002, + "step": 70870 + }, + { + "epoch": 0.09959304970343738, + "grad_norm": 0.728714644908905, + "learning_rate": 0.0002986679780806519, + "loss": 4.7734, + "step": 70880 + }, + { + "epoch": 0.09960710064160097, + "grad_norm": 0.7441499829292297, + "learning_rate": 0.00029871013067303635, + "loss": 4.8044, + "step": 70890 + }, + { + "epoch": 0.09962115157976455, + "grad_norm": 0.74562007188797, + "learning_rate": 0.00029875228326542083, + "loss": 4.8104, + "step": 70900 + }, + { + "epoch": 0.09963520251792812, + "grad_norm": 0.7518746256828308, + "learning_rate": 0.0002987944358578052, + "loss": 4.7614, + "step": 70910 + }, + { + "epoch": 
0.0996492534560917, + "grad_norm": 0.7399574518203735, + "learning_rate": 0.00029883658845018964, + "loss": 4.7511, + "step": 70920 + }, + { + "epoch": 0.09966330439425528, + "grad_norm": 0.7430614233016968, + "learning_rate": 0.00029887874104257407, + "loss": 4.7823, + "step": 70930 + }, + { + "epoch": 0.09967735533241885, + "grad_norm": 0.7251525521278381, + "learning_rate": 0.0002989208936349585, + "loss": 4.8084, + "step": 70940 + }, + { + "epoch": 0.09969140627058243, + "grad_norm": 0.8306521773338318, + "learning_rate": 0.000298963046227343, + "loss": 4.7226, + "step": 70950 + }, + { + "epoch": 0.099705457208746, + "grad_norm": 0.7492359280586243, + "learning_rate": 0.0002990051988197274, + "loss": 4.7497, + "step": 70960 + }, + { + "epoch": 0.09971950814690958, + "grad_norm": 0.736562967300415, + "learning_rate": 0.00029904735141211185, + "loss": 4.6997, + "step": 70970 + }, + { + "epoch": 0.09973355908507316, + "grad_norm": 0.740244448184967, + "learning_rate": 0.00029908950400449623, + "loss": 4.7356, + "step": 70980 + }, + { + "epoch": 0.09974761002323673, + "grad_norm": 0.7707760334014893, + "learning_rate": 0.00029913165659688066, + "loss": 4.6801, + "step": 70990 + }, + { + "epoch": 0.09976166096140032, + "grad_norm": 0.7498148679733276, + "learning_rate": 0.0002991738091892651, + "loss": 4.6808, + "step": 71000 + }, + { + "epoch": 0.0997757118995639, + "grad_norm": 0.7346001863479614, + "learning_rate": 0.0002992159617816496, + "loss": 4.7796, + "step": 71010 + }, + { + "epoch": 0.09978976283772747, + "grad_norm": 0.7446746826171875, + "learning_rate": 0.000299258114374034, + "loss": 4.7906, + "step": 71020 + }, + { + "epoch": 0.09980381377589105, + "grad_norm": 0.7236793637275696, + "learning_rate": 0.00029930026696641844, + "loss": 4.7742, + "step": 71030 + }, + { + "epoch": 0.09981786471405463, + "grad_norm": 0.7329582571983337, + "learning_rate": 0.0002993424195588028, + "loss": 4.6221, + "step": 71040 + }, + { + "epoch": 0.0998319156522182, + 
"grad_norm": 0.7298672199249268, + "learning_rate": 0.00029938457215118725, + "loss": 4.8417, + "step": 71050 + }, + { + "epoch": 0.09984596659038178, + "grad_norm": 0.7621816396713257, + "learning_rate": 0.0002994267247435717, + "loss": 4.8465, + "step": 71060 + }, + { + "epoch": 0.09986001752854536, + "grad_norm": 0.7588155269622803, + "learning_rate": 0.00029946887733595616, + "loss": 4.7579, + "step": 71070 + }, + { + "epoch": 0.09987406846670893, + "grad_norm": 0.7053050398826599, + "learning_rate": 0.0002995110299283406, + "loss": 4.7375, + "step": 71080 + }, + { + "epoch": 0.09988811940487251, + "grad_norm": 0.7316890954971313, + "learning_rate": 0.000299553182520725, + "loss": 4.8224, + "step": 71090 + }, + { + "epoch": 0.0999021703430361, + "grad_norm": 0.7458178400993347, + "learning_rate": 0.0002995953351131094, + "loss": 4.9259, + "step": 71100 + }, + { + "epoch": 0.09991622128119967, + "grad_norm": 0.7310707569122314, + "learning_rate": 0.00029963748770549383, + "loss": 4.7937, + "step": 71110 + }, + { + "epoch": 0.09993027221936325, + "grad_norm": 0.7549211978912354, + "learning_rate": 0.00029967964029787826, + "loss": 4.7397, + "step": 71120 + }, + { + "epoch": 0.09994432315752683, + "grad_norm": 0.743523120880127, + "learning_rate": 0.00029972179289026275, + "loss": 4.7553, + "step": 71130 + }, + { + "epoch": 0.0999583740956904, + "grad_norm": 0.7451346516609192, + "learning_rate": 0.0002997639454826472, + "loss": 4.7489, + "step": 71140 + }, + { + "epoch": 0.09997242503385398, + "grad_norm": 0.7571463584899902, + "learning_rate": 0.0002998060980750316, + "loss": 4.7289, + "step": 71150 + }, + { + "epoch": 0.09998647597201755, + "grad_norm": 0.7282206416130066, + "learning_rate": 0.00029984825066741604, + "loss": 4.8128, + "step": 71160 + }, + { + "epoch": 0.10000052691018113, + "grad_norm": 0.7786749601364136, + "learning_rate": 0.0002998904032598004, + "loss": 4.928, + "step": 71170 + }, + { + "epoch": 0.10001457784834471, + "grad_norm": 
0.7948459982872009, + "learning_rate": 0.00029993255585218485, + "loss": 4.7095, + "step": 71180 + }, + { + "epoch": 0.10002862878650828, + "grad_norm": 0.7715887427330017, + "learning_rate": 0.00029997470844456934, + "loss": 4.8289, + "step": 71190 + }, + { + "epoch": 0.10004267972467187, + "grad_norm": 0.7413100004196167, + "learning_rate": 0.0002999999999711326, + "loss": 4.6136, + "step": 71200 + }, + { + "epoch": 0.10005673066283545, + "grad_norm": 0.8110563158988953, + "learning_rate": 0.00029999999964637453, + "loss": 4.7762, + "step": 71210 + }, + { + "epoch": 0.10007078160099903, + "grad_norm": 0.7453218698501587, + "learning_rate": 0.0002999999989607741, + "loss": 4.7854, + "step": 71220 + }, + { + "epoch": 0.1000848325391626, + "grad_norm": 0.787434995174408, + "learning_rate": 0.0002999999979143313, + "loss": 4.7152, + "step": 71230 + }, + { + "epoch": 0.10009888347732618, + "grad_norm": 0.7521796226501465, + "learning_rate": 0.00029999999650704627, + "loss": 4.863, + "step": 71240 + }, + { + "epoch": 0.10011293441548975, + "grad_norm": 0.7553433179855347, + "learning_rate": 0.00029999999473891885, + "loss": 4.7434, + "step": 71250 + }, + { + "epoch": 0.10012698535365333, + "grad_norm": 0.7574853301048279, + "learning_rate": 0.00029999999260994916, + "loss": 4.8025, + "step": 71260 + }, + { + "epoch": 0.1001410362918169, + "grad_norm": 0.7284955978393555, + "learning_rate": 0.0002999999901201371, + "loss": 4.7283, + "step": 71270 + }, + { + "epoch": 0.10015508722998048, + "grad_norm": 0.7206137776374817, + "learning_rate": 0.00029999998726948276, + "loss": 4.6774, + "step": 71280 + }, + { + "epoch": 0.10016913816814406, + "grad_norm": 0.7361604571342468, + "learning_rate": 0.00029999998405798615, + "loss": 4.8914, + "step": 71290 + }, + { + "epoch": 0.10018318910630764, + "grad_norm": 0.7506524920463562, + "learning_rate": 0.0002999999804856472, + "loss": 4.7574, + "step": 71300 + }, + { + "epoch": 0.10019724004447123, + "grad_norm": 0.7192894220352173, 
+ "learning_rate": 0.00029999997655246597, + "loss": 4.7978, + "step": 71310 + }, + { + "epoch": 0.1002112909826348, + "grad_norm": 0.6896945238113403, + "learning_rate": 0.0002999999722584425, + "loss": 4.7519, + "step": 71320 + }, + { + "epoch": 0.10022534192079838, + "grad_norm": 0.7413153648376465, + "learning_rate": 0.00029999996760357676, + "loss": 4.878, + "step": 71330 + }, + { + "epoch": 0.10023939285896195, + "grad_norm": 0.7378984689712524, + "learning_rate": 0.0002999999625878688, + "loss": 4.7571, + "step": 71340 + }, + { + "epoch": 0.10025344379712553, + "grad_norm": 0.7499397397041321, + "learning_rate": 0.0002999999572113185, + "loss": 4.7636, + "step": 71350 + }, + { + "epoch": 0.1002674947352891, + "grad_norm": 0.725183367729187, + "learning_rate": 0.0002999999514739261, + "loss": 4.8262, + "step": 71360 + }, + { + "epoch": 0.10028154567345268, + "grad_norm": 0.783112645149231, + "learning_rate": 0.0002999999453756914, + "loss": 4.7547, + "step": 71370 + }, + { + "epoch": 0.10029559661161626, + "grad_norm": 0.7308584451675415, + "learning_rate": 0.00029999993891661455, + "loss": 4.7689, + "step": 71380 + }, + { + "epoch": 0.10030964754977983, + "grad_norm": 0.7266840934753418, + "learning_rate": 0.0002999999320966955, + "loss": 4.7863, + "step": 71390 + }, + { + "epoch": 0.10032369848794341, + "grad_norm": 0.7103331089019775, + "learning_rate": 0.00029999992491593427, + "loss": 4.6985, + "step": 71400 + }, + { + "epoch": 0.100337749426107, + "grad_norm": 0.7648615837097168, + "learning_rate": 0.0002999999173743309, + "loss": 4.8235, + "step": 71410 + }, + { + "epoch": 0.10035180036427058, + "grad_norm": 0.7262908220291138, + "learning_rate": 0.0002999999094718854, + "loss": 4.8409, + "step": 71420 + }, + { + "epoch": 0.10036585130243415, + "grad_norm": 0.7537401914596558, + "learning_rate": 0.0002999999012085978, + "loss": 4.7755, + "step": 71430 + }, + { + "epoch": 0.10037990224059773, + "grad_norm": 0.7262635827064514, + "learning_rate": 
0.0002999998925844681, + "loss": 4.7393, + "step": 71440 + }, + { + "epoch": 0.1003939531787613, + "grad_norm": 0.7246768474578857, + "learning_rate": 0.0002999998835994963, + "loss": 4.747, + "step": 71450 + }, + { + "epoch": 0.10040800411692488, + "grad_norm": 0.7420998811721802, + "learning_rate": 0.0002999998742536825, + "loss": 4.7245, + "step": 71460 + }, + { + "epoch": 0.10042205505508846, + "grad_norm": 0.7534216046333313, + "learning_rate": 0.0002999998645470266, + "loss": 4.7235, + "step": 71470 + }, + { + "epoch": 0.10043610599325203, + "grad_norm": 0.7178711891174316, + "learning_rate": 0.00029999985447952876, + "loss": 4.6908, + "step": 71480 + }, + { + "epoch": 0.10045015693141561, + "grad_norm": 0.7422319054603577, + "learning_rate": 0.0002999998440511889, + "loss": 4.8748, + "step": 71490 + }, + { + "epoch": 0.10046420786957919, + "grad_norm": 0.7381262183189392, + "learning_rate": 0.00029999983326200715, + "loss": 4.7584, + "step": 71500 + }, + { + "epoch": 0.10047825880774278, + "grad_norm": 0.726177453994751, + "learning_rate": 0.0002999998232432237, + "loss": 4.7548, + "step": 71510 + }, + { + "epoch": 0.10049230974590635, + "grad_norm": 0.732631266117096, + "learning_rate": 0.00029999981176844225, + "loss": 4.8067, + "step": 71520 + }, + { + "epoch": 0.10050636068406993, + "grad_norm": 0.7357652187347412, + "learning_rate": 0.00029999979993281894, + "loss": 4.8273, + "step": 71530 + }, + { + "epoch": 0.1005204116222335, + "grad_norm": 0.7101837396621704, + "learning_rate": 0.0002999997877363538, + "loss": 4.6746, + "step": 71540 + }, + { + "epoch": 0.10053446256039708, + "grad_norm": 0.7089740633964539, + "learning_rate": 0.0002999997751790468, + "loss": 4.8879, + "step": 71550 + }, + { + "epoch": 0.10054851349856066, + "grad_norm": 0.7296656966209412, + "learning_rate": 0.000299999762260898, + "loss": 4.7599, + "step": 71560 + }, + { + "epoch": 0.10056256443672423, + "grad_norm": 0.7465136051177979, + "learning_rate": 0.00029999974898190754, + 
"loss": 4.8796, + "step": 71570 + }, + { + "epoch": 0.10057661537488781, + "grad_norm": 0.7571253776550293, + "learning_rate": 0.0002999997353420753, + "loss": 4.7633, + "step": 71580 + }, + { + "epoch": 0.10059066631305139, + "grad_norm": 0.7460293173789978, + "learning_rate": 0.00029999972134140134, + "loss": 4.9302, + "step": 71590 + }, + { + "epoch": 0.10060471725121496, + "grad_norm": 0.7485179305076599, + "learning_rate": 0.0002999997069798857, + "loss": 4.8744, + "step": 71600 + }, + { + "epoch": 0.10061876818937854, + "grad_norm": 0.7496930956840515, + "learning_rate": 0.0002999996922575285, + "loss": 4.7929, + "step": 71610 + }, + { + "epoch": 0.10063281912754213, + "grad_norm": 0.7394983768463135, + "learning_rate": 0.0002999996771743297, + "loss": 4.7198, + "step": 71620 + }, + { + "epoch": 0.1006468700657057, + "grad_norm": 0.7388569712638855, + "learning_rate": 0.00029999966173028934, + "loss": 4.7331, + "step": 71630 + }, + { + "epoch": 0.10066092100386928, + "grad_norm": 0.7156569361686707, + "learning_rate": 0.00029999964592540747, + "loss": 4.8351, + "step": 71640 + }, + { + "epoch": 0.10067497194203286, + "grad_norm": 0.7694973349571228, + "learning_rate": 0.0002999996297596841, + "loss": 4.8795, + "step": 71650 + }, + { + "epoch": 0.10068902288019643, + "grad_norm": 0.8083022236824036, + "learning_rate": 0.0002999996132331193, + "loss": 4.7547, + "step": 71660 + }, + { + "epoch": 0.10070307381836001, + "grad_norm": 0.7387674450874329, + "learning_rate": 0.00029999959634571306, + "loss": 4.6957, + "step": 71670 + }, + { + "epoch": 0.10071712475652358, + "grad_norm": 0.7180886268615723, + "learning_rate": 0.0002999995790974655, + "loss": 4.9078, + "step": 71680 + }, + { + "epoch": 0.10073117569468716, + "grad_norm": 0.7135482430458069, + "learning_rate": 0.00029999956148837664, + "loss": 4.8903, + "step": 71690 + }, + { + "epoch": 0.10074522663285074, + "grad_norm": 0.7912961840629578, + "learning_rate": 0.00029999954351844646, + "loss": 4.7291, + 
"step": 71700 + }, + { + "epoch": 0.10075927757101431, + "grad_norm": 0.7425175905227661, + "learning_rate": 0.0002999995251876751, + "loss": 4.8633, + "step": 71710 + }, + { + "epoch": 0.1007733285091779, + "grad_norm": 0.741531491279602, + "learning_rate": 0.00029999950649606255, + "loss": 4.8959, + "step": 71720 + }, + { + "epoch": 0.10078737944734148, + "grad_norm": 0.7444692254066467, + "learning_rate": 0.0002999994874436088, + "loss": 4.7001, + "step": 71730 + }, + { + "epoch": 0.10080143038550506, + "grad_norm": 0.7535449862480164, + "learning_rate": 0.000299999468030314, + "loss": 4.7645, + "step": 71740 + }, + { + "epoch": 0.10081548132366863, + "grad_norm": 0.7291529178619385, + "learning_rate": 0.0002999994482561781, + "loss": 4.8059, + "step": 71750 + }, + { + "epoch": 0.10082953226183221, + "grad_norm": 0.7530328035354614, + "learning_rate": 0.0002999994281212013, + "loss": 4.7958, + "step": 71760 + }, + { + "epoch": 0.10084358319999578, + "grad_norm": 0.7359017133712769, + "learning_rate": 0.0002999994076253835, + "loss": 4.7982, + "step": 71770 + }, + { + "epoch": 0.10085763413815936, + "grad_norm": 0.7342458367347717, + "learning_rate": 0.00029999938676872475, + "loss": 4.7397, + "step": 71780 + }, + { + "epoch": 0.10087168507632294, + "grad_norm": 0.7454365491867065, + "learning_rate": 0.0002999993655512252, + "loss": 4.7391, + "step": 71790 + }, + { + "epoch": 0.10088573601448651, + "grad_norm": 0.771918773651123, + "learning_rate": 0.0002999993439728848, + "loss": 4.8666, + "step": 71800 + }, + { + "epoch": 0.10089978695265009, + "grad_norm": 0.7269641160964966, + "learning_rate": 0.00029999932203370364, + "loss": 4.8573, + "step": 71810 + }, + { + "epoch": 0.10091383789081368, + "grad_norm": 0.7324143648147583, + "learning_rate": 0.00029999929973368184, + "loss": 4.6735, + "step": 71820 + }, + { + "epoch": 0.10092788882897726, + "grad_norm": 0.7486067414283752, + "learning_rate": 0.0002999992770728194, + "loss": 4.6773, + "step": 71830 + }, + { 
+ "epoch": 0.10094193976714083, + "grad_norm": 0.7198307514190674, + "learning_rate": 0.0002999992540511163, + "loss": 4.7123, + "step": 71840 + }, + { + "epoch": 0.10095599070530441, + "grad_norm": 0.7431245446205139, + "learning_rate": 0.00029999923066857274, + "loss": 4.7798, + "step": 71850 + }, + { + "epoch": 0.10097004164346798, + "grad_norm": 0.7345250844955444, + "learning_rate": 0.00029999920692518864, + "loss": 4.8772, + "step": 71860 + }, + { + "epoch": 0.10098409258163156, + "grad_norm": 0.7660800814628601, + "learning_rate": 0.0002999991828209641, + "loss": 4.8468, + "step": 71870 + }, + { + "epoch": 0.10099814351979514, + "grad_norm": 0.7611793279647827, + "learning_rate": 0.0002999991583558993, + "loss": 4.8363, + "step": 71880 + }, + { + "epoch": 0.10101219445795871, + "grad_norm": 0.7366256713867188, + "learning_rate": 0.0002999991335299941, + "loss": 4.7184, + "step": 71890 + }, + { + "epoch": 0.10102624539612229, + "grad_norm": 0.7112072706222534, + "learning_rate": 0.0002999991083432487, + "loss": 4.724, + "step": 71900 + }, + { + "epoch": 0.10104029633428586, + "grad_norm": 0.7414090633392334, + "learning_rate": 0.0002999990827956631, + "loss": 4.8933, + "step": 71910 + }, + { + "epoch": 0.10105434727244944, + "grad_norm": 0.7189602851867676, + "learning_rate": 0.0002999990568872374, + "loss": 4.7637, + "step": 71920 + }, + { + "epoch": 0.10106839821061303, + "grad_norm": 0.7293269038200378, + "learning_rate": 0.0002999990306179716, + "loss": 4.8199, + "step": 71930 + }, + { + "epoch": 0.1010824491487766, + "grad_norm": 0.7632038593292236, + "learning_rate": 0.0002999990039878658, + "loss": 4.6565, + "step": 71940 + }, + { + "epoch": 0.10109650008694018, + "grad_norm": 0.7255117297172546, + "learning_rate": 0.0002999989769969201, + "loss": 4.6914, + "step": 71950 + }, + { + "epoch": 0.10111055102510376, + "grad_norm": 0.7196149230003357, + "learning_rate": 0.0002999989496451345, + "loss": 4.7191, + "step": 71960 + }, + { + "epoch": 
0.10112460196326734, + "grad_norm": 0.7434830665588379, + "learning_rate": 0.00029999892193250917, + "loss": 4.7101, + "step": 71970 + }, + { + "epoch": 0.10113865290143091, + "grad_norm": 0.7188177108764648, + "learning_rate": 0.000299998893859044, + "loss": 4.8254, + "step": 71980 + }, + { + "epoch": 0.10115270383959449, + "grad_norm": 0.7340826988220215, + "learning_rate": 0.00029999886542473925, + "loss": 4.7947, + "step": 71990 + }, + { + "epoch": 0.10116675477775806, + "grad_norm": 0.7305575609207153, + "learning_rate": 0.0002999988366295948, + "loss": 4.5447, + "step": 72000 + }, + { + "epoch": 0.10118080571592164, + "grad_norm": 0.7468332052230835, + "learning_rate": 0.0002999988074736109, + "loss": 4.7055, + "step": 72010 + }, + { + "epoch": 0.10119485665408522, + "grad_norm": 0.7357817888259888, + "learning_rate": 0.00029999877795678753, + "loss": 4.8088, + "step": 72020 + }, + { + "epoch": 0.1012089075922488, + "grad_norm": 0.7364190816879272, + "learning_rate": 0.0002999987480791248, + "loss": 4.6873, + "step": 72030 + }, + { + "epoch": 0.10122295853041238, + "grad_norm": 0.7412238717079163, + "learning_rate": 0.00029999871784062266, + "loss": 4.7859, + "step": 72040 + }, + { + "epoch": 0.10123700946857596, + "grad_norm": 0.7454258799552917, + "learning_rate": 0.00029999868724128135, + "loss": 4.7342, + "step": 72050 + }, + { + "epoch": 0.10125106040673953, + "grad_norm": 0.7320020794868469, + "learning_rate": 0.0002999986562811008, + "loss": 4.7076, + "step": 72060 + }, + { + "epoch": 0.10126511134490311, + "grad_norm": 0.7472354769706726, + "learning_rate": 0.00029999862496008124, + "loss": 4.7221, + "step": 72070 + }, + { + "epoch": 0.10127916228306669, + "grad_norm": 0.7363531589508057, + "learning_rate": 0.0002999985932782226, + "loss": 4.681, + "step": 72080 + }, + { + "epoch": 0.10129321322123026, + "grad_norm": 0.7143122553825378, + "learning_rate": 0.000299998561235525, + "loss": 4.7911, + "step": 72090 + }, + { + "epoch": 0.10130726415939384, 
+ "grad_norm": 0.7466495037078857, + "learning_rate": 0.0002999985288319886, + "loss": 4.9298, + "step": 72100 + }, + { + "epoch": 0.10132131509755742, + "grad_norm": 0.7105987668037415, + "learning_rate": 0.00029999849606761336, + "loss": 4.8656, + "step": 72110 + }, + { + "epoch": 0.10133536603572099, + "grad_norm": 0.7323604822158813, + "learning_rate": 0.0002999984629423994, + "loss": 4.8375, + "step": 72120 + }, + { + "epoch": 0.10134941697388458, + "grad_norm": 0.7545437812805176, + "learning_rate": 0.00029999842945634684, + "loss": 4.7479, + "step": 72130 + }, + { + "epoch": 0.10136346791204816, + "grad_norm": 0.732039213180542, + "learning_rate": 0.0002999983956094557, + "loss": 4.8971, + "step": 72140 + }, + { + "epoch": 0.10137751885021173, + "grad_norm": 0.7388688325881958, + "learning_rate": 0.0002999983614017261, + "loss": 4.7152, + "step": 72150 + }, + { + "epoch": 0.10139156978837531, + "grad_norm": 0.7269540429115295, + "learning_rate": 0.0002999983268331582, + "loss": 4.8155, + "step": 72160 + }, + { + "epoch": 0.10140562072653889, + "grad_norm": 0.7393106818199158, + "learning_rate": 0.00029999829190375187, + "loss": 4.7825, + "step": 72170 + }, + { + "epoch": 0.10141967166470246, + "grad_norm": 0.7458699941635132, + "learning_rate": 0.00029999825661350735, + "loss": 4.696, + "step": 72180 + }, + { + "epoch": 0.10143372260286604, + "grad_norm": 0.7346116900444031, + "learning_rate": 0.00029999822096242473, + "loss": 4.7549, + "step": 72190 + }, + { + "epoch": 0.10144777354102961, + "grad_norm": 0.7251469492912292, + "learning_rate": 0.000299998184950504, + "loss": 4.7406, + "step": 72200 + }, + { + "epoch": 0.10146182447919319, + "grad_norm": 0.7549038529396057, + "learning_rate": 0.0002999981485777453, + "loss": 4.7226, + "step": 72210 + }, + { + "epoch": 0.10147587541735677, + "grad_norm": 0.7311290502548218, + "learning_rate": 0.0002999981118441488, + "loss": 4.7761, + "step": 72220 + }, + { + "epoch": 0.10148992635552034, + "grad_norm": 
0.7395356297492981, + "learning_rate": 0.0002999980747497145, + "loss": 4.8128, + "step": 72230 + }, + { + "epoch": 0.10150397729368393, + "grad_norm": 0.7352813482284546, + "learning_rate": 0.0002999980372944425, + "loss": 4.7336, + "step": 72240 + }, + { + "epoch": 0.10151802823184751, + "grad_norm": 0.7374472618103027, + "learning_rate": 0.00029999799947833287, + "loss": 4.6786, + "step": 72250 + }, + { + "epoch": 0.10153207917001109, + "grad_norm": 0.7531014680862427, + "learning_rate": 0.0002999979613013857, + "loss": 4.7935, + "step": 72260 + }, + { + "epoch": 0.10154613010817466, + "grad_norm": 0.7118738889694214, + "learning_rate": 0.00029999792276360113, + "loss": 4.7567, + "step": 72270 + }, + { + "epoch": 0.10156018104633824, + "grad_norm": 0.7456701397895813, + "learning_rate": 0.00029999788386497923, + "loss": 4.7232, + "step": 72280 + }, + { + "epoch": 0.10157423198450181, + "grad_norm": 0.7232758402824402, + "learning_rate": 0.00029999784460552005, + "loss": 4.7032, + "step": 72290 + }, + { + "epoch": 0.10158828292266539, + "grad_norm": 0.7644693851470947, + "learning_rate": 0.0002999978049852238, + "loss": 4.8459, + "step": 72300 + }, + { + "epoch": 0.10160233386082897, + "grad_norm": 0.7518143057823181, + "learning_rate": 0.00029999776500409044, + "loss": 4.7958, + "step": 72310 + }, + { + "epoch": 0.10161638479899254, + "grad_norm": 0.7484323382377625, + "learning_rate": 0.00029999772466212017, + "loss": 4.7845, + "step": 72320 + }, + { + "epoch": 0.10163043573715612, + "grad_norm": 0.7363878488540649, + "learning_rate": 0.000299997683959313, + "loss": 4.8805, + "step": 72330 + }, + { + "epoch": 0.10164448667531971, + "grad_norm": 0.771102786064148, + "learning_rate": 0.00029999764289566914, + "loss": 4.766, + "step": 72340 + }, + { + "epoch": 0.10165853761348329, + "grad_norm": 0.7849813103675842, + "learning_rate": 0.00029999760147118854, + "loss": 4.7616, + "step": 72350 + }, + { + "epoch": 0.10167258855164686, + "grad_norm": 
0.7605536580085754, + "learning_rate": 0.0002999975596858714, + "loss": 4.8137, + "step": 72360 + }, + { + "epoch": 0.10168663948981044, + "grad_norm": 0.7546150088310242, + "learning_rate": 0.0002999975175397178, + "loss": 4.7202, + "step": 72370 + }, + { + "epoch": 0.10170069042797401, + "grad_norm": 0.7308611869812012, + "learning_rate": 0.0002999974750327279, + "loss": 4.7466, + "step": 72380 + }, + { + "epoch": 0.10171474136613759, + "grad_norm": 0.7356964349746704, + "learning_rate": 0.0002999974321649017, + "loss": 4.8318, + "step": 72390 + }, + { + "epoch": 0.10172879230430117, + "grad_norm": 0.73519366979599, + "learning_rate": 0.00029999738893623937, + "loss": 4.7815, + "step": 72400 + }, + { + "epoch": 0.10174284324246474, + "grad_norm": 0.6988383531570435, + "learning_rate": 0.000299997345346741, + "loss": 4.7789, + "step": 72410 + }, + { + "epoch": 0.10175689418062832, + "grad_norm": 0.745654284954071, + "learning_rate": 0.0002999973013964067, + "loss": 4.8238, + "step": 72420 + }, + { + "epoch": 0.1017709451187919, + "grad_norm": 0.7207648158073425, + "learning_rate": 0.00029999725708523643, + "loss": 4.7722, + "step": 72430 + }, + { + "epoch": 0.10178499605695548, + "grad_norm": 0.8427481651306152, + "learning_rate": 0.00029999721241323054, + "loss": 4.7605, + "step": 72440 + }, + { + "epoch": 0.10179904699511906, + "grad_norm": 0.7269243597984314, + "learning_rate": 0.00029999716738038904, + "loss": 4.8267, + "step": 72450 + }, + { + "epoch": 0.10181309793328264, + "grad_norm": 0.7501411437988281, + "learning_rate": 0.00029999712198671195, + "loss": 4.7538, + "step": 72460 + }, + { + "epoch": 0.10182714887144621, + "grad_norm": 0.7097275853157043, + "learning_rate": 0.0002999970762321995, + "loss": 4.7286, + "step": 72470 + }, + { + "epoch": 0.10184119980960979, + "grad_norm": 0.7278454303741455, + "learning_rate": 0.00029999703011685176, + "loss": 4.8126, + "step": 72480 + }, + { + "epoch": 0.10185525074777337, + "grad_norm": 0.7060046195983887, + 
"learning_rate": 0.00029999698364066883, + "loss": 4.715, + "step": 72490 + }, + { + "epoch": 0.10186930168593694, + "grad_norm": 0.7554420828819275, + "learning_rate": 0.0002999969368036508, + "loss": 4.9029, + "step": 72500 + }, + { + "epoch": 0.10188335262410052, + "grad_norm": 0.733871340751648, + "learning_rate": 0.00029999688960579785, + "loss": 4.808, + "step": 72510 + }, + { + "epoch": 0.1018974035622641, + "grad_norm": 0.7300422191619873, + "learning_rate": 0.00029999684204711006, + "loss": 4.8701, + "step": 72520 + }, + { + "epoch": 0.10191145450042767, + "grad_norm": 0.7577276825904846, + "learning_rate": 0.0002999967941275875, + "loss": 4.6814, + "step": 72530 + }, + { + "epoch": 0.10192550543859125, + "grad_norm": 0.7170461416244507, + "learning_rate": 0.00029999674584723035, + "loss": 4.7992, + "step": 72540 + }, + { + "epoch": 0.10193955637675484, + "grad_norm": 0.7378881573677063, + "learning_rate": 0.00029999669720603864, + "loss": 4.7144, + "step": 72550 + }, + { + "epoch": 0.10195360731491841, + "grad_norm": 0.7653374671936035, + "learning_rate": 0.0002999966482040126, + "loss": 4.8943, + "step": 72560 + }, + { + "epoch": 0.10196765825308199, + "grad_norm": 0.7405737042427063, + "learning_rate": 0.0002999965988411523, + "loss": 4.7858, + "step": 72570 + }, + { + "epoch": 0.10198170919124556, + "grad_norm": 0.712135910987854, + "learning_rate": 0.0002999965491174578, + "loss": 4.9162, + "step": 72580 + }, + { + "epoch": 0.10199576012940914, + "grad_norm": 0.742899477481842, + "learning_rate": 0.0002999964990329293, + "loss": 4.7942, + "step": 72590 + }, + { + "epoch": 0.10200981106757272, + "grad_norm": 0.7559585571289062, + "learning_rate": 0.0002999964485875669, + "loss": 4.6854, + "step": 72600 + }, + { + "epoch": 0.1020238620057363, + "grad_norm": 0.7186324000358582, + "learning_rate": 0.00029999639778137074, + "loss": 4.7303, + "step": 72610 + }, + { + "epoch": 0.10203791294389987, + "grad_norm": 0.7546454071998596, + "learning_rate": 
0.0002999963466143408, + "loss": 4.6544, + "step": 72620 + }, + { + "epoch": 0.10205196388206345, + "grad_norm": 0.6856681108474731, + "learning_rate": 0.00029999629508647746, + "loss": 4.7369, + "step": 72630 + }, + { + "epoch": 0.10206601482022702, + "grad_norm": 0.7506888508796692, + "learning_rate": 0.00029999624319778067, + "loss": 4.7411, + "step": 72640 + }, + { + "epoch": 0.10208006575839061, + "grad_norm": 0.7311187982559204, + "learning_rate": 0.00029999619094825056, + "loss": 4.6478, + "step": 72650 + }, + { + "epoch": 0.10209411669655419, + "grad_norm": 0.7421939969062805, + "learning_rate": 0.0002999961383378873, + "loss": 4.7328, + "step": 72660 + }, + { + "epoch": 0.10210816763471776, + "grad_norm": 0.7084130644798279, + "learning_rate": 0.00029999608536669095, + "loss": 4.6734, + "step": 72670 + }, + { + "epoch": 0.10212221857288134, + "grad_norm": 0.7495949864387512, + "learning_rate": 0.0002999960320346617, + "loss": 4.7531, + "step": 72680 + }, + { + "epoch": 0.10213626951104492, + "grad_norm": 0.7394850254058838, + "learning_rate": 0.0002999959783417997, + "loss": 4.7793, + "step": 72690 + }, + { + "epoch": 0.10215032044920849, + "grad_norm": 0.7716978788375854, + "learning_rate": 0.00029999592428810504, + "loss": 4.7758, + "step": 72700 + }, + { + "epoch": 0.10216437138737207, + "grad_norm": 0.7250087857246399, + "learning_rate": 0.0002999958698735779, + "loss": 4.8447, + "step": 72710 + }, + { + "epoch": 0.10217842232553564, + "grad_norm": 0.7172320485115051, + "learning_rate": 0.0002999958150982183, + "loss": 4.9669, + "step": 72720 + }, + { + "epoch": 0.10219247326369922, + "grad_norm": 0.7347210049629211, + "learning_rate": 0.00029999575996202647, + "loss": 4.7817, + "step": 72730 + }, + { + "epoch": 0.1022065242018628, + "grad_norm": 0.7101386785507202, + "learning_rate": 0.00029999570446500245, + "loss": 4.7991, + "step": 72740 + }, + { + "epoch": 0.10222057514002639, + "grad_norm": 0.7327938079833984, + "learning_rate": 
0.0002999956486071465, + "loss": 4.707, + "step": 72750 + }, + { + "epoch": 0.10223462607818996, + "grad_norm": 0.7212610244750977, + "learning_rate": 0.0002999955923884587, + "loss": 4.7766, + "step": 72760 + }, + { + "epoch": 0.10224867701635354, + "grad_norm": 0.720736563205719, + "learning_rate": 0.0002999955358089391, + "loss": 4.7198, + "step": 72770 + }, + { + "epoch": 0.10226272795451712, + "grad_norm": 0.7179093360900879, + "learning_rate": 0.000299995478868588, + "loss": 4.7682, + "step": 72780 + }, + { + "epoch": 0.10227677889268069, + "grad_norm": 0.7311738133430481, + "learning_rate": 0.0002999954215674054, + "loss": 4.8445, + "step": 72790 + }, + { + "epoch": 0.10229082983084427, + "grad_norm": 0.7069105505943298, + "learning_rate": 0.00029999536390539145, + "loss": 4.7873, + "step": 72800 + }, + { + "epoch": 0.10230488076900784, + "grad_norm": 0.7458084225654602, + "learning_rate": 0.00029999530588254635, + "loss": 4.6617, + "step": 72810 + }, + { + "epoch": 0.10231893170717142, + "grad_norm": 0.7396774291992188, + "learning_rate": 0.00029999524749887025, + "loss": 4.8425, + "step": 72820 + }, + { + "epoch": 0.102332982645335, + "grad_norm": 0.7057560682296753, + "learning_rate": 0.0002999951887543632, + "loss": 4.8336, + "step": 72830 + }, + { + "epoch": 0.10234703358349857, + "grad_norm": 0.716718852519989, + "learning_rate": 0.0002999951296490254, + "loss": 4.6897, + "step": 72840 + }, + { + "epoch": 0.10236108452166215, + "grad_norm": 0.7290700674057007, + "learning_rate": 0.000299995070182857, + "loss": 4.8415, + "step": 72850 + }, + { + "epoch": 0.10237513545982574, + "grad_norm": 0.7181606292724609, + "learning_rate": 0.00029999501035585816, + "loss": 4.8283, + "step": 72860 + }, + { + "epoch": 0.10238918639798932, + "grad_norm": 0.7337809801101685, + "learning_rate": 0.00029999495016802893, + "loss": 4.8772, + "step": 72870 + }, + { + "epoch": 0.10240323733615289, + "grad_norm": 0.7575526833534241, + "learning_rate": 0.0002999948896193696, + 
"loss": 4.787, + "step": 72880 + }, + { + "epoch": 0.10241728827431647, + "grad_norm": null, + "learning_rate": 0.00029999482870988017, + "loss": 4.7359, + "step": 72890 + }, + { + "epoch": 0.10243133921248004, + "grad_norm": 0.7764653563499451, + "learning_rate": 0.0002999947735828301, + "loss": 4.7426, + "step": 72900 + }, + { + "epoch": 0.10244539015064362, + "grad_norm": 0.7185347676277161, + "learning_rate": 0.00029999471198776404, + "loss": 4.867, + "step": 72910 + }, + { + "epoch": 0.1024594410888072, + "grad_norm": 0.755716860294342, + "learning_rate": 0.0002999946500318684, + "loss": 4.7958, + "step": 72920 + }, + { + "epoch": 0.10247349202697077, + "grad_norm": 0.7544811367988586, + "learning_rate": 0.0002999945877151432, + "loss": 4.9392, + "step": 72930 + }, + { + "epoch": 0.10248754296513435, + "grad_norm": 0.7307287454605103, + "learning_rate": 0.00029999452503758875, + "loss": 4.8479, + "step": 72940 + }, + { + "epoch": 0.10250159390329792, + "grad_norm": 0.7607468366622925, + "learning_rate": 0.00029999446199920516, + "loss": 4.6962, + "step": 72950 + }, + { + "epoch": 0.10251564484146151, + "grad_norm": 0.721833348274231, + "learning_rate": 0.00029999439859999257, + "loss": 4.7497, + "step": 72960 + }, + { + "epoch": 0.10252969577962509, + "grad_norm": 0.7424848079681396, + "learning_rate": 0.00029999433483995106, + "loss": 4.7419, + "step": 72970 + }, + { + "epoch": 0.10254374671778867, + "grad_norm": 0.721015453338623, + "learning_rate": 0.0002999942707190809, + "loss": 4.5932, + "step": 72980 + }, + { + "epoch": 0.10255779765595224, + "grad_norm": 0.7474051713943481, + "learning_rate": 0.00029999420623738214, + "loss": 4.6855, + "step": 72990 + }, + { + "epoch": 0.10257184859411582, + "grad_norm": 0.7426432967185974, + "learning_rate": 0.000299994141394855, + "loss": 4.7178, + "step": 73000 + }, + { + "epoch": 0.1025858995322794, + "grad_norm": 0.7148788571357727, + "learning_rate": 0.00029999407619149966, + "loss": 4.8307, + "step": 73010 + 
}, + { + "epoch": 0.10259995047044297, + "grad_norm": 0.7621682286262512, + "learning_rate": 0.00029999401062731623, + "loss": 4.7439, + "step": 73020 + }, + { + "epoch": 0.10261400140860655, + "grad_norm": 0.7121155858039856, + "learning_rate": 0.00029999394470230485, + "loss": 4.6666, + "step": 73030 + }, + { + "epoch": 0.10262805234677012, + "grad_norm": 0.7072491645812988, + "learning_rate": 0.0002999938784164658, + "loss": 4.6901, + "step": 73040 + }, + { + "epoch": 0.1026421032849337, + "grad_norm": 0.7037854790687561, + "learning_rate": 0.00029999381176979903, + "loss": 4.7861, + "step": 73050 + }, + { + "epoch": 0.10265615422309729, + "grad_norm": 0.7043249011039734, + "learning_rate": 0.0002999937447623049, + "loss": 4.8758, + "step": 73060 + }, + { + "epoch": 0.10267020516126087, + "grad_norm": 0.7356353998184204, + "learning_rate": 0.00029999367739398343, + "loss": 4.7525, + "step": 73070 + }, + { + "epoch": 0.10268425609942444, + "grad_norm": 0.7560482025146484, + "learning_rate": 0.00029999360966483486, + "loss": 4.642, + "step": 73080 + }, + { + "epoch": 0.10269830703758802, + "grad_norm": 0.7081610560417175, + "learning_rate": 0.00029999354157485937, + "loss": 4.6725, + "step": 73090 + }, + { + "epoch": 0.1027123579757516, + "grad_norm": 0.7189093828201294, + "learning_rate": 0.00029999347312405705, + "loss": 4.852, + "step": 73100 + }, + { + "epoch": 0.10272640891391517, + "grad_norm": 0.7496188879013062, + "learning_rate": 0.0002999934043124281, + "loss": 4.7643, + "step": 73110 + }, + { + "epoch": 0.10274045985207875, + "grad_norm": 0.8219110369682312, + "learning_rate": 0.0002999933351399727, + "loss": 4.8383, + "step": 73120 + }, + { + "epoch": 0.10275451079024232, + "grad_norm": 0.714171826839447, + "learning_rate": 0.00029999326560669096, + "loss": 4.6523, + "step": 73130 + }, + { + "epoch": 0.1027685617284059, + "grad_norm": 0.7769915461540222, + "learning_rate": 0.00029999319571258316, + "loss": 4.7178, + "step": 73140 + }, + { + "epoch": 
0.10278261266656948, + "grad_norm": 0.7325097918510437, + "learning_rate": 0.00029999312545764934, + "loss": 4.7205, + "step": 73150 + }, + { + "epoch": 0.10279666360473305, + "grad_norm": 0.7231411933898926, + "learning_rate": 0.00029999305484188974, + "loss": 4.8554, + "step": 73160 + }, + { + "epoch": 0.10281071454289664, + "grad_norm": 0.7304064035415649, + "learning_rate": 0.00029999298386530456, + "loss": 4.7141, + "step": 73170 + }, + { + "epoch": 0.10282476548106022, + "grad_norm": 0.7381361722946167, + "learning_rate": 0.0002999929125278939, + "loss": 4.8076, + "step": 73180 + }, + { + "epoch": 0.1028388164192238, + "grad_norm": 0.7440902590751648, + "learning_rate": 0.00029999284082965797, + "loss": 4.7352, + "step": 73190 + }, + { + "epoch": 0.10285286735738737, + "grad_norm": 0.7400135397911072, + "learning_rate": 0.0002999927687705969, + "loss": 4.8078, + "step": 73200 + }, + { + "epoch": 0.10286691829555095, + "grad_norm": 0.6989032626152039, + "learning_rate": 0.00029999269635071094, + "loss": 4.8402, + "step": 73210 + }, + { + "epoch": 0.10288096923371452, + "grad_norm": 0.7391857504844666, + "learning_rate": 0.00029999262357000017, + "loss": 4.8794, + "step": 73220 + }, + { + "epoch": 0.1028950201718781, + "grad_norm": 0.7222636342048645, + "learning_rate": 0.00029999255042846486, + "loss": 4.6988, + "step": 73230 + }, + { + "epoch": 0.10290907111004167, + "grad_norm": 0.7194198369979858, + "learning_rate": 0.0002999924769261051, + "loss": 4.649, + "step": 73240 + }, + { + "epoch": 0.10292312204820525, + "grad_norm": 0.7942532300949097, + "learning_rate": 0.00029999240306292116, + "loss": 4.7988, + "step": 73250 + }, + { + "epoch": 0.10293717298636883, + "grad_norm": 0.7117690443992615, + "learning_rate": 0.00029999232883891314, + "loss": 4.6962, + "step": 73260 + }, + { + "epoch": 0.10295122392453242, + "grad_norm": 0.7358763813972473, + "learning_rate": 0.00029999225425408123, + "loss": 4.7445, + "step": 73270 + }, + { + "epoch": 
0.102965274862696, + "grad_norm": 0.7358459234237671, + "learning_rate": 0.00029999217930842565, + "loss": 4.7767, + "step": 73280 + }, + { + "epoch": 0.10297932580085957, + "grad_norm": 0.7460939884185791, + "learning_rate": 0.0002999921040019465, + "loss": 4.7906, + "step": 73290 + }, + { + "epoch": 0.10299337673902315, + "grad_norm": 0.7220245599746704, + "learning_rate": 0.0002999920283346441, + "loss": 4.818, + "step": 73300 + }, + { + "epoch": 0.10300742767718672, + "grad_norm": 0.7411149740219116, + "learning_rate": 0.00029999195230651847, + "loss": 4.7419, + "step": 73310 + }, + { + "epoch": 0.1030214786153503, + "grad_norm": 0.7259693145751953, + "learning_rate": 0.0002999918759175699, + "loss": 4.7345, + "step": 73320 + }, + { + "epoch": 0.10303552955351387, + "grad_norm": 0.7284397482872009, + "learning_rate": 0.00029999179916779856, + "loss": 4.873, + "step": 73330 + }, + { + "epoch": 0.10304958049167745, + "grad_norm": 0.7188897132873535, + "learning_rate": 0.0002999917220572046, + "loss": 4.8309, + "step": 73340 + }, + { + "epoch": 0.10306363142984103, + "grad_norm": 0.7184074521064758, + "learning_rate": 0.0002999916445857882, + "loss": 4.7298, + "step": 73350 + }, + { + "epoch": 0.1030776823680046, + "grad_norm": 0.7120581269264221, + "learning_rate": 0.00029999156675354963, + "loss": 4.8255, + "step": 73360 + }, + { + "epoch": 0.10309173330616819, + "grad_norm": 0.733837902545929, + "learning_rate": 0.000299991488560489, + "loss": 4.7718, + "step": 73370 + }, + { + "epoch": 0.10310578424433177, + "grad_norm": 0.7166063785552979, + "learning_rate": 0.00029999141000660646, + "loss": 4.801, + "step": 73380 + }, + { + "epoch": 0.10311983518249535, + "grad_norm": 0.7153290510177612, + "learning_rate": 0.0002999913310919023, + "loss": 4.7146, + "step": 73390 + }, + { + "epoch": 0.10313388612065892, + "grad_norm": 0.7479586005210876, + "learning_rate": 0.00029999125181637666, + "loss": 4.8948, + "step": 73400 + }, + { + "epoch": 0.1031479370588225, + 
"grad_norm": 0.7473878860473633, + "learning_rate": 0.0002999911721800297, + "loss": 4.7084, + "step": 73410 + }, + { + "epoch": 0.10316198799698607, + "grad_norm": 0.7283495664596558, + "learning_rate": 0.0002999910921828617, + "loss": 4.7356, + "step": 73420 + }, + { + "epoch": 0.10317603893514965, + "grad_norm": 0.7387956976890564, + "learning_rate": 0.0002999910118248728, + "loss": 4.8818, + "step": 73430 + }, + { + "epoch": 0.10319008987331323, + "grad_norm": 0.7675484418869019, + "learning_rate": 0.0002999909311060632, + "loss": 4.8588, + "step": 73440 + }, + { + "epoch": 0.1032041408114768, + "grad_norm": 0.7122921943664551, + "learning_rate": 0.00029999085002643306, + "loss": 4.7725, + "step": 73450 + }, + { + "epoch": 0.10321819174964038, + "grad_norm": 0.7225001454353333, + "learning_rate": 0.0002999907685859826, + "loss": 4.6845, + "step": 73460 + }, + { + "epoch": 0.10323224268780395, + "grad_norm": 0.7381240725517273, + "learning_rate": 0.00029999068678471203, + "loss": 4.7242, + "step": 73470 + }, + { + "epoch": 0.10324629362596754, + "grad_norm": 0.7431597709655762, + "learning_rate": 0.00029999060462262153, + "loss": 4.8332, + "step": 73480 + }, + { + "epoch": 0.10326034456413112, + "grad_norm": 0.7094483971595764, + "learning_rate": 0.00029999052209971126, + "loss": 4.812, + "step": 73490 + }, + { + "epoch": 0.1032743955022947, + "grad_norm": 0.7248532176017761, + "learning_rate": 0.0002999904392159815, + "loss": 4.734, + "step": 73500 + }, + { + "epoch": 0.10328844644045827, + "grad_norm": 0.7661448121070862, + "learning_rate": 0.00029999035597143243, + "loss": 4.8685, + "step": 73510 + }, + { + "epoch": 0.10330249737862185, + "grad_norm": 0.7161885499954224, + "learning_rate": 0.0002999902723660642, + "loss": 4.7242, + "step": 73520 + }, + { + "epoch": 0.10331654831678543, + "grad_norm": 0.8056687116622925, + "learning_rate": 0.00029999018839987706, + "loss": 4.7779, + "step": 73530 + }, + { + "epoch": 0.103330599254949, + "grad_norm": 
0.7297375798225403, + "learning_rate": 0.0002999901040728712, + "loss": 4.699, + "step": 73540 + }, + { + "epoch": 0.10334465019311258, + "grad_norm": 0.7313802242279053, + "learning_rate": 0.00029999001938504677, + "loss": 4.7828, + "step": 73550 + }, + { + "epoch": 0.10335870113127615, + "grad_norm": 0.7233887314796448, + "learning_rate": 0.000299989934336404, + "loss": 4.7871, + "step": 73560 + }, + { + "epoch": 0.10337275206943973, + "grad_norm": 0.7726687788963318, + "learning_rate": 0.0002999898489269432, + "loss": 4.7328, + "step": 73570 + }, + { + "epoch": 0.10338680300760332, + "grad_norm": 0.7139991521835327, + "learning_rate": 0.00029998976315666445, + "loss": 4.6724, + "step": 73580 + }, + { + "epoch": 0.1034008539457669, + "grad_norm": 0.7393600940704346, + "learning_rate": 0.00029998967702556795, + "loss": 4.7427, + "step": 73590 + }, + { + "epoch": 0.10341490488393047, + "grad_norm": 0.7399834394454956, + "learning_rate": 0.00029998959053365406, + "loss": 4.7802, + "step": 73600 + }, + { + "epoch": 0.10342895582209405, + "grad_norm": 0.7233583331108093, + "learning_rate": 0.0002999895036809228, + "loss": 4.8864, + "step": 73610 + }, + { + "epoch": 0.10344300676025762, + "grad_norm": 0.7222065925598145, + "learning_rate": 0.0002999894164673745, + "loss": 4.771, + "step": 73620 + }, + { + "epoch": 0.1034570576984212, + "grad_norm": 0.7875381112098694, + "learning_rate": 0.0002999893288930093, + "loss": 4.6923, + "step": 73630 + }, + { + "epoch": 0.10347110863658478, + "grad_norm": 0.7450695037841797, + "learning_rate": 0.00029998924095782744, + "loss": 4.7587, + "step": 73640 + }, + { + "epoch": 0.10348515957474835, + "grad_norm": 0.7188162803649902, + "learning_rate": 0.00029998915266182916, + "loss": 4.8141, + "step": 73650 + }, + { + "epoch": 0.10349921051291193, + "grad_norm": 0.7098068594932556, + "learning_rate": 0.00029998906400501463, + "loss": 4.7548, + "step": 73660 + }, + { + "epoch": 0.1035132614510755, + "grad_norm": 0.730060875415802, + 
"learning_rate": 0.00029998897498738407, + "loss": 4.788, + "step": 73670 + }, + { + "epoch": 0.1035273123892391, + "grad_norm": 0.7018395662307739, + "learning_rate": 0.00029998888560893775, + "loss": 4.8347, + "step": 73680 + }, + { + "epoch": 0.10354136332740267, + "grad_norm": 0.7120745778083801, + "learning_rate": 0.0002999887958696758, + "loss": 4.8095, + "step": 73690 + }, + { + "epoch": 0.10355541426556625, + "grad_norm": 0.7242351174354553, + "learning_rate": 0.00029998870576959847, + "loss": 4.6808, + "step": 73700 + }, + { + "epoch": 0.10356946520372982, + "grad_norm": 0.7491177320480347, + "learning_rate": 0.00029998861530870595, + "loss": 4.7441, + "step": 73710 + }, + { + "epoch": 0.1035835161418934, + "grad_norm": 0.7404350638389587, + "learning_rate": 0.00029998852448699854, + "loss": 4.7472, + "step": 73720 + }, + { + "epoch": 0.10359756708005698, + "grad_norm": 0.7084073424339294, + "learning_rate": 0.0002999884333044764, + "loss": 4.7107, + "step": 73730 + }, + { + "epoch": 0.10361161801822055, + "grad_norm": 0.7354574799537659, + "learning_rate": 0.00029998834176113973, + "loss": 4.7334, + "step": 73740 + }, + { + "epoch": 0.10362566895638413, + "grad_norm": 0.7333629131317139, + "learning_rate": 0.0002999882498569888, + "loss": 4.7645, + "step": 73750 + }, + { + "epoch": 0.1036397198945477, + "grad_norm": 0.7341249585151672, + "learning_rate": 0.0002999881575920238, + "loss": 4.8465, + "step": 73760 + }, + { + "epoch": 0.10365377083271128, + "grad_norm": 0.7545302510261536, + "learning_rate": 0.00029998806496624495, + "loss": 4.7715, + "step": 73770 + }, + { + "epoch": 0.10366782177087486, + "grad_norm": 0.7346596121788025, + "learning_rate": 0.00029998797197965246, + "loss": 4.7478, + "step": 73780 + }, + { + "epoch": 0.10368187270903845, + "grad_norm": 0.7493293285369873, + "learning_rate": 0.0002999878786322466, + "loss": 4.6977, + "step": 73790 + }, + { + "epoch": 0.10369592364720202, + "grad_norm": 0.713477611541748, + "learning_rate": 
0.00029998778492402756, + "loss": 4.7183, + "step": 73800 + }, + { + "epoch": 0.1037099745853656, + "grad_norm": 0.7505142688751221, + "learning_rate": 0.0002999876908549956, + "loss": 4.7706, + "step": 73810 + }, + { + "epoch": 0.10372402552352918, + "grad_norm": 0.7234914898872375, + "learning_rate": 0.0002999875964251509, + "loss": 4.7092, + "step": 73820 + }, + { + "epoch": 0.10373807646169275, + "grad_norm": 0.7134677171707153, + "learning_rate": 0.0002999875016344937, + "loss": 4.8585, + "step": 73830 + }, + { + "epoch": 0.10375212739985633, + "grad_norm": 0.7248352766036987, + "learning_rate": 0.00029998740648302425, + "loss": 4.7321, + "step": 73840 + }, + { + "epoch": 0.1037661783380199, + "grad_norm": 0.729803204536438, + "learning_rate": 0.0002999873109707428, + "loss": 4.8516, + "step": 73850 + }, + { + "epoch": 0.10378022927618348, + "grad_norm": 0.7194018363952637, + "learning_rate": 0.0002999872150976495, + "loss": 4.7418, + "step": 73860 + }, + { + "epoch": 0.10379428021434706, + "grad_norm": 0.7106111645698547, + "learning_rate": 0.0002999871188637446, + "loss": 4.7227, + "step": 73870 + }, + { + "epoch": 0.10380833115251063, + "grad_norm": 0.724184513092041, + "learning_rate": 0.0002999870222690284, + "loss": 4.7338, + "step": 73880 + }, + { + "epoch": 0.10382238209067422, + "grad_norm": 0.7321856617927551, + "learning_rate": 0.0002999869253135011, + "loss": 4.7236, + "step": 73890 + }, + { + "epoch": 0.1038364330288378, + "grad_norm": 0.7175441384315491, + "learning_rate": 0.00029998682799716294, + "loss": 4.8629, + "step": 73900 + }, + { + "epoch": 0.10385048396700138, + "grad_norm": 0.7437195181846619, + "learning_rate": 0.00029998673032001407, + "loss": 4.7403, + "step": 73910 + }, + { + "epoch": 0.10386453490516495, + "grad_norm": 0.7287278771400452, + "learning_rate": 0.0002999866322820548, + "loss": 4.7742, + "step": 73920 + }, + { + "epoch": 0.10387858584332853, + "grad_norm": 0.7289449572563171, + "learning_rate": 0.00029998653388328537, 
+ "loss": 4.7351, + "step": 73930 + }, + { + "epoch": 0.1038926367814921, + "grad_norm": 0.7480982542037964, + "learning_rate": 0.0002999864351237061, + "loss": 4.7372, + "step": 73940 + }, + { + "epoch": 0.10390668771965568, + "grad_norm": 0.7479860186576843, + "learning_rate": 0.00029998633600331705, + "loss": 4.7979, + "step": 73950 + }, + { + "epoch": 0.10392073865781926, + "grad_norm": 0.756632387638092, + "learning_rate": 0.0002999862365221185, + "loss": 4.7318, + "step": 73960 + }, + { + "epoch": 0.10393478959598283, + "grad_norm": 0.722648561000824, + "learning_rate": 0.0002999861366801108, + "loss": 4.7916, + "step": 73970 + }, + { + "epoch": 0.10394884053414641, + "grad_norm": 0.7514876127243042, + "learning_rate": 0.0002999860364772941, + "loss": 4.8139, + "step": 73980 + }, + { + "epoch": 0.10396289147231, + "grad_norm": 0.7162560224533081, + "learning_rate": 0.0002999859359136686, + "loss": 4.7387, + "step": 73990 + }, + { + "epoch": 0.10397694241047357, + "grad_norm": 0.7170782089233398, + "learning_rate": 0.0002999858349892347, + "loss": 4.8234, + "step": 74000 + }, + { + "epoch": 0.10399099334863715, + "grad_norm": 0.7494903206825256, + "learning_rate": 0.0002999857337039925, + "loss": 4.7603, + "step": 74010 + }, + { + "epoch": 0.10400504428680073, + "grad_norm": 0.7144988775253296, + "learning_rate": 0.00029998563205794223, + "loss": 4.7661, + "step": 74020 + }, + { + "epoch": 0.1040190952249643, + "grad_norm": 0.7347742319107056, + "learning_rate": 0.0002999855300510843, + "loss": 4.8141, + "step": 74030 + }, + { + "epoch": 0.10403314616312788, + "grad_norm": 0.7606138586997986, + "learning_rate": 0.0002999854276834188, + "loss": 4.7885, + "step": 74040 + }, + { + "epoch": 0.10404719710129146, + "grad_norm": 0.7385061979293823, + "learning_rate": 0.00029998532495494604, + "loss": 4.8078, + "step": 74050 + }, + { + "epoch": 0.10406124803945503, + "grad_norm": 0.7367041110992432, + "learning_rate": 0.00029998522186566626, + "loss": 4.802, + "step": 
74060 + }, + { + "epoch": 0.10407529897761861, + "grad_norm": 0.7521949410438538, + "learning_rate": 0.0002999851184155797, + "loss": 4.7509, + "step": 74070 + }, + { + "epoch": 0.10408934991578218, + "grad_norm": 0.746960461139679, + "learning_rate": 0.00029998501460468656, + "loss": 4.7074, + "step": 74080 + }, + { + "epoch": 0.10410340085394576, + "grad_norm": 0.7234981060028076, + "learning_rate": 0.0002999849104329872, + "loss": 4.8579, + "step": 74090 + }, + { + "epoch": 0.10411745179210935, + "grad_norm": 0.6993427872657776, + "learning_rate": 0.00029998480590048176, + "loss": 4.7195, + "step": 74100 + }, + { + "epoch": 0.10413150273027293, + "grad_norm": 0.7664044499397278, + "learning_rate": 0.0002999847010071706, + "loss": 4.8389, + "step": 74110 + }, + { + "epoch": 0.1041455536684365, + "grad_norm": 0.7423094511032104, + "learning_rate": 0.0002999845957530538, + "loss": 4.7013, + "step": 74120 + }, + { + "epoch": 0.10415960460660008, + "grad_norm": 0.7351399660110474, + "learning_rate": 0.00029998449013813184, + "loss": 4.8115, + "step": 74130 + }, + { + "epoch": 0.10417365554476365, + "grad_norm": 0.7567272186279297, + "learning_rate": 0.00029998438416240484, + "loss": 4.7673, + "step": 74140 + }, + { + "epoch": 0.10418770648292723, + "grad_norm": 0.7193549275398254, + "learning_rate": 0.00029998427782587303, + "loss": 4.7971, + "step": 74150 + }, + { + "epoch": 0.10420175742109081, + "grad_norm": 0.7518367767333984, + "learning_rate": 0.0002999841711285368, + "loss": 4.8342, + "step": 74160 + }, + { + "epoch": 0.10421580835925438, + "grad_norm": 0.7178200483322144, + "learning_rate": 0.0002999840640703963, + "loss": 4.7677, + "step": 74170 + }, + { + "epoch": 0.10422985929741796, + "grad_norm": 0.8699411749839783, + "learning_rate": 0.0002999839566514517, + "loss": 4.6964, + "step": 74180 + }, + { + "epoch": 0.10424391023558154, + "grad_norm": 0.7391669750213623, + "learning_rate": 0.0002999838488717035, + "loss": 4.7222, + "step": 74190 + }, + { + 
"epoch": 0.10425796117374513, + "grad_norm": 0.7199887633323669, + "learning_rate": 0.0002999837407311517, + "loss": 4.7128, + "step": 74200 + }, + { + "epoch": 0.1042720121119087, + "grad_norm": 0.7740340232849121, + "learning_rate": 0.00029998363222979676, + "loss": 4.6918, + "step": 74210 + }, + { + "epoch": 0.10428606305007228, + "grad_norm": 0.7272385954856873, + "learning_rate": 0.0002999835233676388, + "loss": 4.6641, + "step": 74220 + }, + { + "epoch": 0.10430011398823585, + "grad_norm": 0.7720179557800293, + "learning_rate": 0.0002999834141446782, + "loss": 4.8111, + "step": 74230 + }, + { + "epoch": 0.10431416492639943, + "grad_norm": 0.7326408624649048, + "learning_rate": 0.0002999833045609152, + "loss": 4.7686, + "step": 74240 + }, + { + "epoch": 0.104328215864563, + "grad_norm": 0.708678126335144, + "learning_rate": 0.00029998319461635, + "loss": 4.7862, + "step": 74250 + }, + { + "epoch": 0.10434226680272658, + "grad_norm": 0.7066006660461426, + "learning_rate": 0.00029998308431098287, + "loss": 4.7004, + "step": 74260 + }, + { + "epoch": 0.10435631774089016, + "grad_norm": 0.7312318682670593, + "learning_rate": 0.00029998297364481415, + "loss": 4.8703, + "step": 74270 + }, + { + "epoch": 0.10437036867905373, + "grad_norm": 0.7320531606674194, + "learning_rate": 0.00029998286261784403, + "loss": 4.694, + "step": 74280 + }, + { + "epoch": 0.10438441961721731, + "grad_norm": 0.743951141834259, + "learning_rate": 0.0002999827512300728, + "loss": 4.8147, + "step": 74290 + }, + { + "epoch": 0.1043984705553809, + "grad_norm": 0.7393697500228882, + "learning_rate": 0.00029998263948150075, + "loss": 4.8336, + "step": 74300 + }, + { + "epoch": 0.10441252149354448, + "grad_norm": 0.7255837917327881, + "learning_rate": 0.00029998252737212813, + "loss": 4.7326, + "step": 74310 + }, + { + "epoch": 0.10442657243170805, + "grad_norm": 0.7354263067245483, + "learning_rate": 0.0002999824149019552, + "loss": 4.7204, + "step": 74320 + }, + { + "epoch": 
0.10444062336987163, + "grad_norm": 0.7385523915290833, + "learning_rate": 0.0002999823020709823, + "loss": 4.7987, + "step": 74330 + }, + { + "epoch": 0.1044546743080352, + "grad_norm": 0.7048619985580444, + "learning_rate": 0.0002999821888792096, + "loss": 4.7279, + "step": 74340 + }, + { + "epoch": 0.10446872524619878, + "grad_norm": 0.7030783295631409, + "learning_rate": 0.00029998207532663736, + "loss": 4.8038, + "step": 74350 + }, + { + "epoch": 0.10448277618436236, + "grad_norm": 0.7411651611328125, + "learning_rate": 0.000299981961413266, + "loss": 4.9361, + "step": 74360 + }, + { + "epoch": 0.10449682712252593, + "grad_norm": 0.7514051198959351, + "learning_rate": 0.00029998184713909563, + "loss": 4.7624, + "step": 74370 + }, + { + "epoch": 0.10451087806068951, + "grad_norm": 0.7228174209594727, + "learning_rate": 0.0002999817325041267, + "loss": 4.8236, + "step": 74380 + }, + { + "epoch": 0.10452492899885309, + "grad_norm": 0.7124325633049011, + "learning_rate": 0.0002999816175083593, + "loss": 4.8186, + "step": 74390 + }, + { + "epoch": 0.10453897993701666, + "grad_norm": 0.7320361137390137, + "learning_rate": 0.00029998150215179383, + "loss": 4.8853, + "step": 74400 + }, + { + "epoch": 0.10455303087518025, + "grad_norm": 0.7228125333786011, + "learning_rate": 0.0002999813864344305, + "loss": 4.6899, + "step": 74410 + }, + { + "epoch": 0.10456708181334383, + "grad_norm": 0.7609881162643433, + "learning_rate": 0.00029998127035626965, + "loss": 4.6897, + "step": 74420 + }, + { + "epoch": 0.1045811327515074, + "grad_norm": 0.7024797797203064, + "learning_rate": 0.0002999811539173115, + "loss": 4.756, + "step": 74430 + }, + { + "epoch": 0.10459518368967098, + "grad_norm": 0.7212551832199097, + "learning_rate": 0.00029998103711755637, + "loss": 4.8677, + "step": 74440 + }, + { + "epoch": 0.10460923462783456, + "grad_norm": 0.7069899439811707, + "learning_rate": 0.00029998091995700455, + "loss": 4.8609, + "step": 74450 + }, + { + "epoch": 0.10462328556599813, 
+ "grad_norm": 0.7125039100646973, + "learning_rate": 0.00029998080243565626, + "loss": 4.6833, + "step": 74460 + }, + { + "epoch": 0.10463733650416171, + "grad_norm": 0.7615266442298889, + "learning_rate": 0.00029998068455351184, + "loss": 4.6864, + "step": 74470 + }, + { + "epoch": 0.10465138744232529, + "grad_norm": 0.7260366082191467, + "learning_rate": 0.00029998056631057155, + "loss": 4.7931, + "step": 74480 + }, + { + "epoch": 0.10466543838048886, + "grad_norm": 0.7347187995910645, + "learning_rate": 0.00029998044770683567, + "loss": 4.797, + "step": 74490 + }, + { + "epoch": 0.10467948931865244, + "grad_norm": 0.7333105206489563, + "learning_rate": 0.0002999803287423045, + "loss": 4.7057, + "step": 74500 + }, + { + "epoch": 0.10469354025681603, + "grad_norm": 0.7438333630561829, + "learning_rate": 0.0002999802094169783, + "loss": 4.8193, + "step": 74510 + }, + { + "epoch": 0.1047075911949796, + "grad_norm": 0.836247980594635, + "learning_rate": 0.00029998008973085744, + "loss": 4.6534, + "step": 74520 + }, + { + "epoch": 0.10472164213314318, + "grad_norm": 0.7281263470649719, + "learning_rate": 0.00029997996968394216, + "loss": 4.7365, + "step": 74530 + }, + { + "epoch": 0.10473569307130676, + "grad_norm": 0.7797707319259644, + "learning_rate": 0.0002999798492762327, + "loss": 4.6875, + "step": 74540 + }, + { + "epoch": 0.10474974400947033, + "grad_norm": 0.7314170598983765, + "learning_rate": 0.00029997972850772937, + "loss": 4.7991, + "step": 74550 + }, + { + "epoch": 0.10476379494763391, + "grad_norm": 0.8902124762535095, + "learning_rate": 0.00029997960737843246, + "loss": 4.7621, + "step": 74560 + }, + { + "epoch": 0.10477784588579749, + "grad_norm": 0.7190029621124268, + "learning_rate": 0.00029997948588834233, + "loss": 4.8784, + "step": 74570 + }, + { + "epoch": 0.10479189682396106, + "grad_norm": 0.7397128343582153, + "learning_rate": 0.00029997936403745916, + "loss": 4.8362, + "step": 74580 + }, + { + "epoch": 0.10480594776212464, + "grad_norm": 
0.7313927412033081, + "learning_rate": 0.0002999792418257833, + "loss": 4.788, + "step": 74590 + }, + { + "epoch": 0.10481999870028821, + "grad_norm": 0.7316232919692993, + "learning_rate": 0.00029997911925331514, + "loss": 4.6484, + "step": 74600 + }, + { + "epoch": 0.1048340496384518, + "grad_norm": 0.7240854501724243, + "learning_rate": 0.00029997899632005484, + "loss": 4.6801, + "step": 74610 + }, + { + "epoch": 0.10484810057661538, + "grad_norm": 0.7367461323738098, + "learning_rate": 0.0002999788730260027, + "loss": 4.7329, + "step": 74620 + }, + { + "epoch": 0.10486215151477896, + "grad_norm": 0.7352172136306763, + "learning_rate": 0.00029997874937115905, + "loss": 4.6665, + "step": 74630 + }, + { + "epoch": 0.10487620245294253, + "grad_norm": 0.7014811635017395, + "learning_rate": 0.0002999786253555242, + "loss": 4.7753, + "step": 74640 + }, + { + "epoch": 0.10489025339110611, + "grad_norm": 0.7095927596092224, + "learning_rate": 0.0002999785009790985, + "loss": 4.7157, + "step": 74650 + }, + { + "epoch": 0.10490430432926968, + "grad_norm": 0.7442272305488586, + "learning_rate": 0.00029997837624188206, + "loss": 4.7303, + "step": 74660 + }, + { + "epoch": 0.10491835526743326, + "grad_norm": 0.7289188504219055, + "learning_rate": 0.00029997825114387545, + "loss": 4.8395, + "step": 74670 + }, + { + "epoch": 0.10493240620559684, + "grad_norm": 0.7389072179794312, + "learning_rate": 0.0002999781256850788, + "loss": 4.7305, + "step": 74680 + }, + { + "epoch": 0.10494645714376041, + "grad_norm": 0.7263525128364563, + "learning_rate": 0.00029997799986549237, + "loss": 4.8211, + "step": 74690 + }, + { + "epoch": 0.10496050808192399, + "grad_norm": 0.7205453515052795, + "learning_rate": 0.00029997787368511654, + "loss": 4.8692, + "step": 74700 + }, + { + "epoch": 0.10497455902008757, + "grad_norm": 0.7371070384979248, + "learning_rate": 0.0002999777471439517, + "loss": 4.6703, + "step": 74710 + }, + { + "epoch": 0.10498860995825116, + "grad_norm": 
0.7681359648704529, + "learning_rate": 0.000299977620241998, + "loss": 4.6302, + "step": 74720 + }, + { + "epoch": 0.10500266089641473, + "grad_norm": 0.7186245322227478, + "learning_rate": 0.00029997749297925577, + "loss": 4.8669, + "step": 74730 + }, + { + "epoch": 0.10501671183457831, + "grad_norm": 0.7410948872566223, + "learning_rate": 0.00029997736535572543, + "loss": 4.8782, + "step": 74740 + }, + { + "epoch": 0.10503076277274188, + "grad_norm": 0.7192717790603638, + "learning_rate": 0.00029997723737140713, + "loss": 4.6931, + "step": 74750 + }, + { + "epoch": 0.10504481371090546, + "grad_norm": 0.6977980732917786, + "learning_rate": 0.0002999771090263013, + "loss": 4.8123, + "step": 74760 + }, + { + "epoch": 0.10505886464906904, + "grad_norm": 0.7145591378211975, + "learning_rate": 0.00029997698032040827, + "loss": 4.7737, + "step": 74770 + }, + { + "epoch": 0.10507291558723261, + "grad_norm": 0.7509664297103882, + "learning_rate": 0.0002999768512537282, + "loss": 4.6107, + "step": 74780 + }, + { + "epoch": 0.10508696652539619, + "grad_norm": 0.7300460338592529, + "learning_rate": 0.0002999767218262615, + "loss": 4.6742, + "step": 74790 + }, + { + "epoch": 0.10510101746355976, + "grad_norm": 0.711122453212738, + "learning_rate": 0.0002999765920380085, + "loss": 4.8037, + "step": 74800 + }, + { + "epoch": 0.10511506840172334, + "grad_norm": 0.6955252289772034, + "learning_rate": 0.0002999764618889695, + "loss": 4.9123, + "step": 74810 + }, + { + "epoch": 0.10512911933988693, + "grad_norm": 0.7196218967437744, + "learning_rate": 0.00029997633137914475, + "loss": 4.7226, + "step": 74820 + }, + { + "epoch": 0.10514317027805051, + "grad_norm": 0.7192176580429077, + "learning_rate": 0.0002999762005085346, + "loss": 4.5709, + "step": 74830 + }, + { + "epoch": 0.10515722121621408, + "grad_norm": 0.7230987548828125, + "learning_rate": 0.0002999760692771394, + "loss": 4.7651, + "step": 74840 + }, + { + "epoch": 0.10517127215437766, + "grad_norm": 0.705202579498291, + 
"learning_rate": 0.0002999759376849594, + "loss": 4.6931, + "step": 74850 + }, + { + "epoch": 0.10518532309254124, + "grad_norm": 0.7315492630004883, + "learning_rate": 0.000299975805731995, + "loss": 4.7949, + "step": 74860 + }, + { + "epoch": 0.10519937403070481, + "grad_norm": 0.7159463763237, + "learning_rate": 0.0002999756734182464, + "loss": 4.72, + "step": 74870 + }, + { + "epoch": 0.10521342496886839, + "grad_norm": 0.7130573987960815, + "learning_rate": 0.00029997554074371414, + "loss": 4.7058, + "step": 74880 + }, + { + "epoch": 0.10522747590703196, + "grad_norm": 0.714362621307373, + "learning_rate": 0.00029997540770839825, + "loss": 4.7595, + "step": 74890 + }, + { + "epoch": 0.10524152684519554, + "grad_norm": 0.7358441352844238, + "learning_rate": 0.00029997527431229923, + "loss": 4.69, + "step": 74900 + }, + { + "epoch": 0.10525557778335912, + "grad_norm": 0.7290864586830139, + "learning_rate": 0.00029997514055541736, + "loss": 4.7994, + "step": 74910 + }, + { + "epoch": 0.1052696287215227, + "grad_norm": 0.7372201085090637, + "learning_rate": 0.000299975006437753, + "loss": 4.8169, + "step": 74920 + }, + { + "epoch": 0.10528367965968628, + "grad_norm": 0.7162683010101318, + "learning_rate": 0.0002999748719593064, + "loss": 4.7578, + "step": 74930 + }, + { + "epoch": 0.10529773059784986, + "grad_norm": 0.7158368825912476, + "learning_rate": 0.0002999747371200779, + "loss": 4.8043, + "step": 74940 + }, + { + "epoch": 0.10531178153601344, + "grad_norm": 0.7241293787956238, + "learning_rate": 0.00029997460192006793, + "loss": 4.7606, + "step": 74950 + }, + { + "epoch": 0.10532583247417701, + "grad_norm": 0.7109899520874023, + "learning_rate": 0.00029997446635927666, + "loss": 4.6894, + "step": 74960 + }, + { + "epoch": 0.10533988341234059, + "grad_norm": 0.7288115620613098, + "learning_rate": 0.0002999743304377045, + "loss": 4.7761, + "step": 74970 + }, + { + "epoch": 0.10535393435050416, + "grad_norm": 0.7241887450218201, + "learning_rate": 
0.00029997419415535173, + "loss": 4.726, + "step": 74980 + }, + { + "epoch": 0.10536798528866774, + "grad_norm": 0.7299656271934509, + "learning_rate": 0.0002999740575122187, + "loss": 4.766, + "step": 74990 + }, + { + "epoch": 0.10538203622683132, + "grad_norm": 0.9255989789962769, + "learning_rate": 0.0002999739205083058, + "loss": 4.792, + "step": 75000 + }, + { + "epoch": 0.10539608716499489, + "grad_norm": 0.7332300543785095, + "learning_rate": 0.0002999737831436133, + "loss": 4.7162, + "step": 75010 + }, + { + "epoch": 0.10541013810315847, + "grad_norm": 0.7358729839324951, + "learning_rate": 0.00029997364541814155, + "loss": 4.8229, + "step": 75020 + }, + { + "epoch": 0.10542418904132206, + "grad_norm": 0.7899588346481323, + "learning_rate": 0.00029997350733189084, + "loss": 4.6358, + "step": 75030 + }, + { + "epoch": 0.10543823997948563, + "grad_norm": 0.7667545676231384, + "learning_rate": 0.0002999733688848615, + "loss": 4.8529, + "step": 75040 + }, + { + "epoch": 0.10545229091764921, + "grad_norm": 0.7302919626235962, + "learning_rate": 0.00029997323007705396, + "loss": 4.6845, + "step": 75050 + }, + { + "epoch": 0.10546634185581279, + "grad_norm": 0.7035412788391113, + "learning_rate": 0.0002999730909084684, + "loss": 4.7805, + "step": 75060 + }, + { + "epoch": 0.10548039279397636, + "grad_norm": 0.7011492848396301, + "learning_rate": 0.00029997295137910535, + "loss": 4.6031, + "step": 75070 + }, + { + "epoch": 0.10549444373213994, + "grad_norm": 0.7378577589988708, + "learning_rate": 0.00029997281148896497, + "loss": 4.8141, + "step": 75080 + }, + { + "epoch": 0.10550849467030352, + "grad_norm": 0.7121955156326294, + "learning_rate": 0.0002999726712380477, + "loss": 4.7603, + "step": 75090 + }, + { + "epoch": 0.10552254560846709, + "grad_norm": 0.7434091567993164, + "learning_rate": 0.00029997253062635386, + "loss": 4.9379, + "step": 75100 + }, + { + "epoch": 0.10553659654663067, + "grad_norm": 0.7290818095207214, + "learning_rate": 
0.0002999723896538837, + "loss": 4.7528, + "step": 75110 + }, + { + "epoch": 0.10555064748479424, + "grad_norm": 0.7576467394828796, + "learning_rate": 0.0002999722483206377, + "loss": 4.8125, + "step": 75120 + }, + { + "epoch": 0.10556469842295783, + "grad_norm": 0.7280282974243164, + "learning_rate": 0.0002999721066266161, + "loss": 4.7116, + "step": 75130 + }, + { + "epoch": 0.10557874936112141, + "grad_norm": 0.7176198959350586, + "learning_rate": 0.0002999719645718193, + "loss": 4.8024, + "step": 75140 + }, + { + "epoch": 0.10559280029928499, + "grad_norm": 0.7369096279144287, + "learning_rate": 0.00029997182215624755, + "loss": 4.7124, + "step": 75150 + }, + { + "epoch": 0.10560685123744856, + "grad_norm": 0.7322725653648376, + "learning_rate": 0.00029997167937990125, + "loss": 4.7161, + "step": 75160 + }, + { + "epoch": 0.10562090217561214, + "grad_norm": 0.7204955816268921, + "learning_rate": 0.0002999715362427808, + "loss": 4.7834, + "step": 75170 + }, + { + "epoch": 0.10563495311377571, + "grad_norm": 0.7370813488960266, + "learning_rate": 0.0002999713927448865, + "loss": 4.7807, + "step": 75180 + }, + { + "epoch": 0.10564900405193929, + "grad_norm": 0.737962543964386, + "learning_rate": 0.00029997124888621866, + "loss": 4.7968, + "step": 75190 + }, + { + "epoch": 0.10566305499010287, + "grad_norm": 0.7260681986808777, + "learning_rate": 0.0002999711046667777, + "loss": 4.6648, + "step": 75200 + }, + { + "epoch": 0.10567710592826644, + "grad_norm": 0.7455945014953613, + "learning_rate": 0.00029997096008656384, + "loss": 4.7407, + "step": 75210 + }, + { + "epoch": 0.10569115686643002, + "grad_norm": 0.710437536239624, + "learning_rate": 0.0002999708151455776, + "loss": 4.7474, + "step": 75220 + }, + { + "epoch": 0.10570520780459361, + "grad_norm": 0.7501201033592224, + "learning_rate": 0.0002999706698438192, + "loss": 4.6896, + "step": 75230 + }, + { + "epoch": 0.10571925874275719, + "grad_norm": 0.7423140406608582, + "learning_rate": 
0.00029997052418128897, + "loss": 4.8219, + "step": 75240 + }, + { + "epoch": 0.10573330968092076, + "grad_norm": 0.8542389869689941, + "learning_rate": 0.0002999703781579874, + "loss": 4.7801, + "step": 75250 + }, + { + "epoch": 0.10574736061908434, + "grad_norm": 0.736545979976654, + "learning_rate": 0.0002999702317739147, + "loss": 4.8571, + "step": 75260 + }, + { + "epoch": 0.10576141155724791, + "grad_norm": 0.7311723828315735, + "learning_rate": 0.00029997008502907134, + "loss": 4.8457, + "step": 75270 + }, + { + "epoch": 0.10577546249541149, + "grad_norm": 0.7157546877861023, + "learning_rate": 0.00029996993792345757, + "loss": 4.827, + "step": 75280 + }, + { + "epoch": 0.10578951343357507, + "grad_norm": 0.7155654430389404, + "learning_rate": 0.0002999697904570738, + "loss": 4.9051, + "step": 75290 + }, + { + "epoch": 0.10580356437173864, + "grad_norm": 0.7166204452514648, + "learning_rate": 0.0002999696426299204, + "loss": 4.7967, + "step": 75300 + }, + { + "epoch": 0.10581761530990222, + "grad_norm": 0.7281050086021423, + "learning_rate": 0.0002999694944419976, + "loss": 4.7054, + "step": 75310 + }, + { + "epoch": 0.1058316662480658, + "grad_norm": 0.7242557406425476, + "learning_rate": 0.00029996934589330594, + "loss": 4.7892, + "step": 75320 + }, + { + "epoch": 0.10584571718622938, + "grad_norm": 0.7574252486228943, + "learning_rate": 0.0002999691969838457, + "loss": 4.7271, + "step": 75330 + }, + { + "epoch": 0.10585976812439296, + "grad_norm": 0.7095407843589783, + "learning_rate": 0.0002999690477136172, + "loss": 4.7575, + "step": 75340 + }, + { + "epoch": 0.10587381906255654, + "grad_norm": 0.7685112953186035, + "learning_rate": 0.00029996889808262087, + "loss": 4.8053, + "step": 75350 + }, + { + "epoch": 0.10588787000072011, + "grad_norm": 0.7352884411811829, + "learning_rate": 0.000299968748090857, + "loss": 4.7534, + "step": 75360 + }, + { + "epoch": 0.10590192093888369, + "grad_norm": 0.7439702153205872, + "learning_rate": 0.000299968597738326, 
+ "loss": 4.7675, + "step": 75370 + }, + { + "epoch": 0.10591597187704727, + "grad_norm": 0.7124572396278381, + "learning_rate": 0.00029996844702502817, + "loss": 4.8264, + "step": 75380 + }, + { + "epoch": 0.10593002281521084, + "grad_norm": 0.7124819159507751, + "learning_rate": 0.00029996829595096394, + "loss": 4.764, + "step": 75390 + }, + { + "epoch": 0.10594407375337442, + "grad_norm": 0.7227981686592102, + "learning_rate": 0.00029996814451613366, + "loss": 4.672, + "step": 75400 + }, + { + "epoch": 0.105958124691538, + "grad_norm": 0.7123106718063354, + "learning_rate": 0.00029996799272053766, + "loss": 4.8143, + "step": 75410 + }, + { + "epoch": 0.10597217562970157, + "grad_norm": 0.7216246724128723, + "learning_rate": 0.00029996784056417636, + "loss": 4.8079, + "step": 75420 + }, + { + "epoch": 0.10598622656786515, + "grad_norm": 0.7304632067680359, + "learning_rate": 0.00029996768804705005, + "loss": 4.7621, + "step": 75430 + }, + { + "epoch": 0.10600027750602874, + "grad_norm": 0.7460058927536011, + "learning_rate": 0.00029996753516915915, + "loss": 4.7263, + "step": 75440 + }, + { + "epoch": 0.10601432844419231, + "grad_norm": 0.7226387858390808, + "learning_rate": 0.00029996738193050404, + "loss": 4.8106, + "step": 75450 + }, + { + "epoch": 0.10602837938235589, + "grad_norm": 0.7301837801933289, + "learning_rate": 0.00029996722833108505, + "loss": 4.8293, + "step": 75460 + }, + { + "epoch": 0.10604243032051947, + "grad_norm": 0.7361141443252563, + "learning_rate": 0.00029996707437090255, + "loss": 4.8255, + "step": 75470 + }, + { + "epoch": 0.10605648125868304, + "grad_norm": 0.726216733455658, + "learning_rate": 0.000299966920049957, + "loss": 4.7971, + "step": 75480 + }, + { + "epoch": 0.10607053219684662, + "grad_norm": 0.7054420113563538, + "learning_rate": 0.0002999667653682486, + "loss": 4.8014, + "step": 75490 + }, + { + "epoch": 0.1060845831350102, + "grad_norm": 0.7391892075538635, + "learning_rate": 0.00029996661032577785, + "loss": 4.8289, + 
"step": 75500 + }, + { + "epoch": 0.10609863407317377, + "grad_norm": 0.7168707251548767, + "learning_rate": 0.00029996645492254514, + "loss": 4.6734, + "step": 75510 + }, + { + "epoch": 0.10611268501133735, + "grad_norm": 0.7321151494979858, + "learning_rate": 0.0002999662991585507, + "loss": 4.7675, + "step": 75520 + }, + { + "epoch": 0.10612673594950092, + "grad_norm": 0.7170401215553284, + "learning_rate": 0.00029996614303379513, + "loss": 4.6391, + "step": 75530 + }, + { + "epoch": 0.10614078688766451, + "grad_norm": 0.6838593482971191, + "learning_rate": 0.00029996598654827864, + "loss": 4.7761, + "step": 75540 + }, + { + "epoch": 0.10615483782582809, + "grad_norm": 0.7038040161132812, + "learning_rate": 0.00029996582970200157, + "loss": 4.8264, + "step": 75550 + }, + { + "epoch": 0.10616888876399166, + "grad_norm": 0.7399590015411377, + "learning_rate": 0.00029996567249496447, + "loss": 4.7957, + "step": 75560 + }, + { + "epoch": 0.10618293970215524, + "grad_norm": 0.7103729248046875, + "learning_rate": 0.0002999655149271675, + "loss": 4.8062, + "step": 75570 + }, + { + "epoch": 0.10619699064031882, + "grad_norm": 0.7498480081558228, + "learning_rate": 0.0002999653569986113, + "loss": 4.7943, + "step": 75580 + }, + { + "epoch": 0.10621104157848239, + "grad_norm": 0.730226993560791, + "learning_rate": 0.000299965198709296, + "loss": 4.7162, + "step": 75590 + }, + { + "epoch": 0.10622509251664597, + "grad_norm": 0.7429940700531006, + "learning_rate": 0.0002999650400592221, + "loss": 4.7841, + "step": 75600 + }, + { + "epoch": 0.10623914345480955, + "grad_norm": 0.7538554668426514, + "learning_rate": 0.00029996488104839, + "loss": 4.7471, + "step": 75610 + }, + { + "epoch": 0.10625319439297312, + "grad_norm": 0.7402827739715576, + "learning_rate": 0.0002999647216768001, + "loss": 4.6182, + "step": 75620 + }, + { + "epoch": 0.1062672453311367, + "grad_norm": 0.7203502058982849, + "learning_rate": 0.0002999645619444526, + "loss": 4.7311, + "step": 75630 + }, + { 
+ "epoch": 0.10628129626930029, + "grad_norm": 0.729570209980011, + "learning_rate": 0.00029996440185134815, + "loss": 4.7514, + "step": 75640 + }, + { + "epoch": 0.10629534720746386, + "grad_norm": 0.8635588884353638, + "learning_rate": 0.00029996424139748696, + "loss": 4.7753, + "step": 75650 + }, + { + "epoch": 0.10630939814562744, + "grad_norm": 0.7464445233345032, + "learning_rate": 0.00029996408058286943, + "loss": 4.6894, + "step": 75660 + }, + { + "epoch": 0.10632344908379102, + "grad_norm": 0.7225930094718933, + "learning_rate": 0.0002999639194074961, + "loss": 4.7286, + "step": 75670 + }, + { + "epoch": 0.10633750002195459, + "grad_norm": 0.7263953685760498, + "learning_rate": 0.0002999637578713671, + "loss": 4.7748, + "step": 75680 + }, + { + "epoch": 0.10635155096011817, + "grad_norm": 0.7385531663894653, + "learning_rate": 0.000299963595974483, + "loss": 4.7724, + "step": 75690 + }, + { + "epoch": 0.10636560189828174, + "grad_norm": 0.6999334096908569, + "learning_rate": 0.0002999634337168441, + "loss": 4.8438, + "step": 75700 + }, + { + "epoch": 0.10637965283644532, + "grad_norm": 0.7192821502685547, + "learning_rate": 0.0002999632710984509, + "loss": 4.7399, + "step": 75710 + }, + { + "epoch": 0.1063937037746089, + "grad_norm": 0.7049062252044678, + "learning_rate": 0.0002999631081193037, + "loss": 4.7818, + "step": 75720 + }, + { + "epoch": 0.10640775471277247, + "grad_norm": 0.7248371839523315, + "learning_rate": 0.00029996294477940296, + "loss": 4.624, + "step": 75730 + }, + { + "epoch": 0.10642180565093605, + "grad_norm": 0.6994637250900269, + "learning_rate": 0.000299962781078749, + "loss": 4.7474, + "step": 75740 + }, + { + "epoch": 0.10643585658909964, + "grad_norm": 0.744132936000824, + "learning_rate": 0.0002999626170173422, + "loss": 4.7988, + "step": 75750 + }, + { + "epoch": 0.10644990752726322, + "grad_norm": 0.7178496718406677, + "learning_rate": 0.00029996245259518304, + "loss": 4.7616, + "step": 75760 + }, + { + "epoch": 
0.10646395846542679, + "grad_norm": 0.7215403318405151, + "learning_rate": 0.0002999622878122719, + "loss": 4.7007, + "step": 75770 + }, + { + "epoch": 0.10647800940359037, + "grad_norm": 0.7298187613487244, + "learning_rate": 0.00029996212266860913, + "loss": 4.7248, + "step": 75780 + }, + { + "epoch": 0.10649206034175394, + "grad_norm": 0.7294317483901978, + "learning_rate": 0.00029996195716419514, + "loss": 4.6084, + "step": 75790 + }, + { + "epoch": 0.10650611127991752, + "grad_norm": 0.7263827919960022, + "learning_rate": 0.0002999617912990303, + "loss": 4.7366, + "step": 75800 + }, + { + "epoch": 0.1065201622180811, + "grad_norm": 0.768453061580658, + "learning_rate": 0.00029996162507311513, + "loss": 4.8619, + "step": 75810 + }, + { + "epoch": 0.10653421315624467, + "grad_norm": 0.7260246276855469, + "learning_rate": 0.00029996145848644993, + "loss": 4.7207, + "step": 75820 + }, + { + "epoch": 0.10654826409440825, + "grad_norm": 0.7110903859138489, + "learning_rate": 0.00029996129153903506, + "loss": 4.7166, + "step": 75830 + }, + { + "epoch": 0.10656231503257182, + "grad_norm": 0.7710065245628357, + "learning_rate": 0.00029996112423087104, + "loss": 4.7668, + "step": 75840 + }, + { + "epoch": 0.10657636597073541, + "grad_norm": 0.7256232500076294, + "learning_rate": 0.00029996095656195816, + "loss": 4.7379, + "step": 75850 + }, + { + "epoch": 0.10659041690889899, + "grad_norm": 0.7331588864326477, + "learning_rate": 0.0002999607885322969, + "loss": 4.7213, + "step": 75860 + }, + { + "epoch": 0.10660446784706257, + "grad_norm": 0.7234667539596558, + "learning_rate": 0.0002999606201418876, + "loss": 4.9289, + "step": 75870 + }, + { + "epoch": 0.10661851878522614, + "grad_norm": 0.7169747352600098, + "learning_rate": 0.0002999604513907308, + "loss": 4.804, + "step": 75880 + }, + { + "epoch": 0.10663256972338972, + "grad_norm": 0.7151200771331787, + "learning_rate": 0.0002999602822788267, + "loss": 4.7486, + "step": 75890 + }, + { + "epoch": 0.1066466206615533, 
+ "grad_norm": 0.7405186295509338, + "learning_rate": 0.00029996011280617585, + "loss": 4.7887, + "step": 75900 + }, + { + "epoch": 0.10666067159971687, + "grad_norm": 0.712820291519165, + "learning_rate": 0.0002999599429727786, + "loss": 4.7063, + "step": 75910 + }, + { + "epoch": 0.10667472253788045, + "grad_norm": 0.733707845211029, + "learning_rate": 0.00029995977277863544, + "loss": 4.7957, + "step": 75920 + }, + { + "epoch": 0.10668877347604402, + "grad_norm": 0.7222506403923035, + "learning_rate": 0.0002999596022237467, + "loss": 4.6025, + "step": 75930 + }, + { + "epoch": 0.1067028244142076, + "grad_norm": 0.7260400056838989, + "learning_rate": 0.0002999594313081128, + "loss": 4.7407, + "step": 75940 + }, + { + "epoch": 0.10671687535237119, + "grad_norm": 0.7056910395622253, + "learning_rate": 0.0002999592600317342, + "loss": 4.8546, + "step": 75950 + }, + { + "epoch": 0.10673092629053477, + "grad_norm": 0.7579191327095032, + "learning_rate": 0.0002999590883946112, + "loss": 4.6756, + "step": 75960 + }, + { + "epoch": 0.10674497722869834, + "grad_norm": 0.7058764100074768, + "learning_rate": 0.00029995891639674437, + "loss": 4.7785, + "step": 75970 + }, + { + "epoch": 0.10675902816686192, + "grad_norm": 0.7291330099105835, + "learning_rate": 0.000299958744038134, + "loss": 4.7122, + "step": 75980 + }, + { + "epoch": 0.1067730791050255, + "grad_norm": 0.7083228230476379, + "learning_rate": 0.00029995857131878055, + "loss": 4.8688, + "step": 75990 + }, + { + "epoch": 0.10678713004318907, + "grad_norm": 0.7070737481117249, + "learning_rate": 0.00029995839823868445, + "loss": 4.7306, + "step": 76000 + }, + { + "epoch": 0.10680118098135265, + "grad_norm": 0.7116759419441223, + "learning_rate": 0.00029995822479784605, + "loss": 4.6925, + "step": 76010 + }, + { + "epoch": 0.10681523191951622, + "grad_norm": 0.7482094764709473, + "learning_rate": 0.0002999580509962658, + "loss": 4.6478, + "step": 76020 + }, + { + "epoch": 0.1068292828576798, + "grad_norm": 
0.7246078252792358, + "learning_rate": 0.00029995787683394427, + "loss": 4.7802, + "step": 76030 + }, + { + "epoch": 0.10684333379584338, + "grad_norm": 0.7256546020507812, + "learning_rate": 0.0002999577023108816, + "loss": 4.7196, + "step": 76040 + }, + { + "epoch": 0.10685738473400695, + "grad_norm": 0.7039734721183777, + "learning_rate": 0.0002999575274270784, + "loss": 4.7635, + "step": 76050 + }, + { + "epoch": 0.10687143567217054, + "grad_norm": 0.6815078258514404, + "learning_rate": 0.0002999573521825351, + "loss": 4.8917, + "step": 76060 + }, + { + "epoch": 0.10688548661033412, + "grad_norm": 0.7078894972801208, + "learning_rate": 0.00029995717657725195, + "loss": 4.7096, + "step": 76070 + }, + { + "epoch": 0.1068995375484977, + "grad_norm": 0.7273938655853271, + "learning_rate": 0.0002999570006112296, + "loss": 4.644, + "step": 76080 + }, + { + "epoch": 0.10691358848666127, + "grad_norm": 0.7119658589363098, + "learning_rate": 0.0002999568242844683, + "loss": 4.788, + "step": 76090 + }, + { + "epoch": 0.10692763942482485, + "grad_norm": 0.7011669874191284, + "learning_rate": 0.00029995664759696855, + "loss": 4.5298, + "step": 76100 + }, + { + "epoch": 0.10694169036298842, + "grad_norm": 0.7396077513694763, + "learning_rate": 0.0002999564705487307, + "loss": 4.6668, + "step": 76110 + }, + { + "epoch": 0.106955741301152, + "grad_norm": 0.7068682312965393, + "learning_rate": 0.00029995629313975535, + "loss": 4.7548, + "step": 76120 + }, + { + "epoch": 0.10696979223931558, + "grad_norm": 0.7131781578063965, + "learning_rate": 0.0002999561153700427, + "loss": 4.8285, + "step": 76130 + }, + { + "epoch": 0.10698384317747915, + "grad_norm": 0.7171995639801025, + "learning_rate": 0.00029995593723959336, + "loss": 4.6529, + "step": 76140 + }, + { + "epoch": 0.10699789411564273, + "grad_norm": 0.6993269920349121, + "learning_rate": 0.0002999557587484077, + "loss": 4.6662, + "step": 76150 + }, + { + "epoch": 0.10701194505380632, + "grad_norm": 0.7163041234016418, + 
"learning_rate": 0.0002999555798964861, + "loss": 4.8221, + "step": 76160 + }, + { + "epoch": 0.1070259959919699, + "grad_norm": 0.6855682134628296, + "learning_rate": 0.0002999554006838291, + "loss": 4.7832, + "step": 76170 + }, + { + "epoch": 0.10704004693013347, + "grad_norm": 0.7540141940116882, + "learning_rate": 0.00029995522111043695, + "loss": 4.7666, + "step": 76180 + }, + { + "epoch": 0.10705409786829705, + "grad_norm": 0.7331625819206238, + "learning_rate": 0.0002999550411763102, + "loss": 4.7275, + "step": 76190 + }, + { + "epoch": 0.10706814880646062, + "grad_norm": 0.7579591870307922, + "learning_rate": 0.0002999548608814493, + "loss": 4.6774, + "step": 76200 + }, + { + "epoch": 0.1070821997446242, + "grad_norm": 0.7343604564666748, + "learning_rate": 0.0002999546802258547, + "loss": 4.716, + "step": 76210 + }, + { + "epoch": 0.10709625068278777, + "grad_norm": 0.7140637040138245, + "learning_rate": 0.00029995449920952675, + "loss": 4.6961, + "step": 76220 + }, + { + "epoch": 0.10711030162095135, + "grad_norm": 0.7120891213417053, + "learning_rate": 0.000299954317832466, + "loss": 4.8529, + "step": 76230 + }, + { + "epoch": 0.10712435255911493, + "grad_norm": 0.7201858758926392, + "learning_rate": 0.0002999541360946727, + "loss": 4.7678, + "step": 76240 + }, + { + "epoch": 0.1071384034972785, + "grad_norm": 0.7303602695465088, + "learning_rate": 0.0002999539539961475, + "loss": 4.8727, + "step": 76250 + }, + { + "epoch": 0.1071524544354421, + "grad_norm": 0.7096266150474548, + "learning_rate": 0.00029995377153689063, + "loss": 4.7656, + "step": 76260 + }, + { + "epoch": 0.10716650537360567, + "grad_norm": 0.6913262009620667, + "learning_rate": 0.00029995358871690275, + "loss": 4.7481, + "step": 76270 + }, + { + "epoch": 0.10718055631176925, + "grad_norm": 0.7402442693710327, + "learning_rate": 0.0002999534055361841, + "loss": 4.7493, + "step": 76280 + }, + { + "epoch": 0.10719460724993282, + "grad_norm": 0.7259013652801514, + "learning_rate": 
0.00029995322199473525, + "loss": 4.7733, + "step": 76290 + }, + { + "epoch": 0.1072086581880964, + "grad_norm": 0.7574719786643982, + "learning_rate": 0.0002999530380925566, + "loss": 4.65, + "step": 76300 + }, + { + "epoch": 0.10722270912625997, + "grad_norm": 0.7451621294021606, + "learning_rate": 0.00029995285382964856, + "loss": 4.6954, + "step": 76310 + }, + { + "epoch": 0.10723676006442355, + "grad_norm": 0.704237163066864, + "learning_rate": 0.00029995266920601165, + "loss": 4.7913, + "step": 76320 + }, + { + "epoch": 0.10725081100258713, + "grad_norm": 0.7237526774406433, + "learning_rate": 0.0002999524842216463, + "loss": 4.7553, + "step": 76330 + }, + { + "epoch": 0.1072648619407507, + "grad_norm": 0.724061131477356, + "learning_rate": 0.0002999522988765529, + "loss": 4.7573, + "step": 76340 + }, + { + "epoch": 0.10727891287891428, + "grad_norm": 0.6993366479873657, + "learning_rate": 0.0002999521131707318, + "loss": 4.6088, + "step": 76350 + }, + { + "epoch": 0.10729296381707785, + "grad_norm": 0.6915683746337891, + "learning_rate": 0.00029995192710418375, + "loss": 4.8146, + "step": 76360 + }, + { + "epoch": 0.10730701475524144, + "grad_norm": 0.7251079678535461, + "learning_rate": 0.0002999517406769089, + "loss": 4.6804, + "step": 76370 + }, + { + "epoch": 0.10732106569340502, + "grad_norm": 0.7160983681678772, + "learning_rate": 0.00029995155388890785, + "loss": 4.8018, + "step": 76380 + }, + { + "epoch": 0.1073351166315686, + "grad_norm": 0.6960489153862, + "learning_rate": 0.00029995136674018105, + "loss": 4.6292, + "step": 76390 + }, + { + "epoch": 0.10734916756973217, + "grad_norm": 0.7182583808898926, + "learning_rate": 0.00029995117923072885, + "loss": 4.7744, + "step": 76400 + }, + { + "epoch": 0.10736321850789575, + "grad_norm": 0.7371787428855896, + "learning_rate": 0.0002999509913605518, + "loss": 4.7254, + "step": 76410 + }, + { + "epoch": 0.10737726944605933, + "grad_norm": 0.7134677171707153, + "learning_rate": 0.0002999508031296503, + 
"loss": 4.8213, + "step": 76420 + }, + { + "epoch": 0.1073913203842229, + "grad_norm": 0.7464840412139893, + "learning_rate": 0.00029995061453802485, + "loss": 4.722, + "step": 76430 + }, + { + "epoch": 0.10740537132238648, + "grad_norm": 0.7147547602653503, + "learning_rate": 0.0002999504255856758, + "loss": 4.8029, + "step": 76440 + }, + { + "epoch": 0.10741942226055005, + "grad_norm": 0.7210128307342529, + "learning_rate": 0.00029995023627260375, + "loss": 4.7076, + "step": 76450 + }, + { + "epoch": 0.10743347319871363, + "grad_norm": 0.6943991184234619, + "learning_rate": 0.00029995004659880907, + "loss": 4.7665, + "step": 76460 + }, + { + "epoch": 0.10744752413687722, + "grad_norm": 0.7254976034164429, + "learning_rate": 0.0002999498565642922, + "loss": 4.6883, + "step": 76470 + }, + { + "epoch": 0.1074615750750408, + "grad_norm": 0.733951210975647, + "learning_rate": 0.00029994966616905366, + "loss": 4.6997, + "step": 76480 + }, + { + "epoch": 0.10747562601320437, + "grad_norm": 0.7102512121200562, + "learning_rate": 0.00029994947541309386, + "loss": 4.6974, + "step": 76490 + }, + { + "epoch": 0.10748967695136795, + "grad_norm": 0.7103749513626099, + "learning_rate": 0.00029994928429641327, + "loss": 4.677, + "step": 76500 + }, + { + "epoch": 0.10750372788953153, + "grad_norm": 0.7255563735961914, + "learning_rate": 0.00029994909281901233, + "loss": 4.7501, + "step": 76510 + }, + { + "epoch": 0.1075177788276951, + "grad_norm": 0.7186533808708191, + "learning_rate": 0.00029994890098089153, + "loss": 4.6903, + "step": 76520 + }, + { + "epoch": 0.10753182976585868, + "grad_norm": 0.7188341617584229, + "learning_rate": 0.00029994870878205136, + "loss": 4.7485, + "step": 76530 + }, + { + "epoch": 0.10754588070402225, + "grad_norm": 0.7477817535400391, + "learning_rate": 0.00029994851622249224, + "loss": 4.7999, + "step": 76540 + }, + { + "epoch": 0.10755993164218583, + "grad_norm": 0.7210610508918762, + "learning_rate": 0.0002999483233022146, + "loss": 4.7705, + 
"step": 76550 + }, + { + "epoch": 0.1075739825803494, + "grad_norm": 0.7165090441703796, + "learning_rate": 0.000299948130021219, + "loss": 4.7389, + "step": 76560 + }, + { + "epoch": 0.107588033518513, + "grad_norm": 0.6880157589912415, + "learning_rate": 0.00029994793637950584, + "loss": 4.7931, + "step": 76570 + }, + { + "epoch": 0.10760208445667657, + "grad_norm": 0.719469428062439, + "learning_rate": 0.00029994774237707556, + "loss": 4.751, + "step": 76580 + }, + { + "epoch": 0.10761613539484015, + "grad_norm": 0.7153851389884949, + "learning_rate": 0.0002999475480139287, + "loss": 4.6964, + "step": 76590 + }, + { + "epoch": 0.10763018633300372, + "grad_norm": 0.7175468802452087, + "learning_rate": 0.00029994735329006567, + "loss": 4.6719, + "step": 76600 + }, + { + "epoch": 0.1076442372711673, + "grad_norm": 0.7065703868865967, + "learning_rate": 0.0002999471582054869, + "loss": 4.7861, + "step": 76610 + }, + { + "epoch": 0.10765828820933088, + "grad_norm": 0.7555844187736511, + "learning_rate": 0.00029994696276019307, + "loss": 4.7068, + "step": 76620 + }, + { + "epoch": 0.10767233914749445, + "grad_norm": 0.7342779040336609, + "learning_rate": 0.0002999467669541844, + "loss": 4.7435, + "step": 76630 + }, + { + "epoch": 0.10768639008565803, + "grad_norm": 0.7292844653129578, + "learning_rate": 0.00029994657078746145, + "loss": 4.6379, + "step": 76640 + }, + { + "epoch": 0.1077004410238216, + "grad_norm": 0.7343292832374573, + "learning_rate": 0.0002999463742600247, + "loss": 4.6573, + "step": 76650 + }, + { + "epoch": 0.10771449196198518, + "grad_norm": 0.7275290489196777, + "learning_rate": 0.0002999461773718746, + "loss": 4.7651, + "step": 76660 + }, + { + "epoch": 0.10772854290014876, + "grad_norm": 0.7052959203720093, + "learning_rate": 0.0002999459801230117, + "loss": 4.806, + "step": 76670 + }, + { + "epoch": 0.10774259383831235, + "grad_norm": 0.7254387140274048, + "learning_rate": 0.0002999457825134364, + "loss": 4.5679, + "step": 76680 + }, + { + 
"epoch": 0.10775664477647592, + "grad_norm": 0.7066879272460938, + "learning_rate": 0.0002999455845431492, + "loss": 4.6913, + "step": 76690 + }, + { + "epoch": 0.1077706957146395, + "grad_norm": 0.716200590133667, + "learning_rate": 0.0002999453862121506, + "loss": 4.567, + "step": 76700 + }, + { + "epoch": 0.10778474665280308, + "grad_norm": 0.7112560868263245, + "learning_rate": 0.00029994518752044104, + "loss": 4.7576, + "step": 76710 + }, + { + "epoch": 0.10779879759096665, + "grad_norm": 0.7039072513580322, + "learning_rate": 0.000299944988468021, + "loss": 4.6923, + "step": 76720 + }, + { + "epoch": 0.10781284852913023, + "grad_norm": 0.7240103483200073, + "learning_rate": 0.00029994478905489094, + "loss": 4.8887, + "step": 76730 + }, + { + "epoch": 0.1078268994672938, + "grad_norm": 0.6706738471984863, + "learning_rate": 0.0002999445892810514, + "loss": 4.848, + "step": 76740 + }, + { + "epoch": 0.10784095040545738, + "grad_norm": 0.7257563471794128, + "learning_rate": 0.0002999443891465028, + "loss": 4.7367, + "step": 76750 + }, + { + "epoch": 0.10785500134362096, + "grad_norm": 0.7115849256515503, + "learning_rate": 0.0002999441886512456, + "loss": 4.7701, + "step": 76760 + }, + { + "epoch": 0.10786905228178453, + "grad_norm": 0.7439100742340088, + "learning_rate": 0.00029994398779528047, + "loss": 4.7311, + "step": 76770 + }, + { + "epoch": 0.10788310321994812, + "grad_norm": 0.7145756483078003, + "learning_rate": 0.00029994378657860764, + "loss": 4.7226, + "step": 76780 + }, + { + "epoch": 0.1078971541581117, + "grad_norm": 0.7000148892402649, + "learning_rate": 0.00029994358500122774, + "loss": 4.8696, + "step": 76790 + }, + { + "epoch": 0.10791120509627528, + "grad_norm": 0.7268564105033875, + "learning_rate": 0.00029994338306314126, + "loss": 4.7751, + "step": 76800 + }, + { + "epoch": 0.10792525603443885, + "grad_norm": 0.6927092671394348, + "learning_rate": 0.00029994318076434864, + "loss": 4.6979, + "step": 76810 + }, + { + "epoch": 
0.10793930697260243, + "grad_norm": 0.7400745749473572, + "learning_rate": 0.0002999429781048503, + "loss": 4.7819, + "step": 76820 + }, + { + "epoch": 0.107953357910766, + "grad_norm": 0.7177280783653259, + "learning_rate": 0.00029994277508464684, + "loss": 4.7256, + "step": 76830 + }, + { + "epoch": 0.10796740884892958, + "grad_norm": 0.7067402601242065, + "learning_rate": 0.00029994257170373875, + "loss": 4.8645, + "step": 76840 + }, + { + "epoch": 0.10798145978709316, + "grad_norm": 0.7211511731147766, + "learning_rate": 0.0002999423679621265, + "loss": 4.768, + "step": 76850 + }, + { + "epoch": 0.10799551072525673, + "grad_norm": 0.717394232749939, + "learning_rate": 0.0002999421638598105, + "loss": 4.7647, + "step": 76860 + }, + { + "epoch": 0.10800956166342031, + "grad_norm": 0.741280734539032, + "learning_rate": 0.0002999419593967913, + "loss": 4.6856, + "step": 76870 + }, + { + "epoch": 0.1080236126015839, + "grad_norm": 0.7437717318534851, + "learning_rate": 0.0002999417545730694, + "loss": 4.7239, + "step": 76880 + }, + { + "epoch": 0.10803766353974747, + "grad_norm": 0.7169762849807739, + "learning_rate": 0.00029994154938864533, + "loss": 4.7881, + "step": 76890 + }, + { + "epoch": 0.10805171447791105, + "grad_norm": 0.7102547287940979, + "learning_rate": 0.0002999413438435195, + "loss": 4.6919, + "step": 76900 + }, + { + "epoch": 0.10806576541607463, + "grad_norm": 0.7318692803382874, + "learning_rate": 0.00029994113793769243, + "loss": 4.8037, + "step": 76910 + }, + { + "epoch": 0.1080798163542382, + "grad_norm": 0.7535057663917542, + "learning_rate": 0.0002999409316711647, + "loss": 4.7232, + "step": 76920 + }, + { + "epoch": 0.10809386729240178, + "grad_norm": 0.7213916182518005, + "learning_rate": 0.0002999407250439367, + "loss": 4.6257, + "step": 76930 + }, + { + "epoch": 0.10810791823056536, + "grad_norm": 0.7251282930374146, + "learning_rate": 0.00029994051805600896, + "loss": 4.6825, + "step": 76940 + }, + { + "epoch": 0.10812196916872893, + 
"grad_norm": 0.8058227300643921, + "learning_rate": 0.0002999403107073819, + "loss": 4.6347, + "step": 76950 + }, + { + "epoch": 0.10813602010689251, + "grad_norm": 0.7237033843994141, + "learning_rate": 0.0002999401029980562, + "loss": 4.7505, + "step": 76960 + }, + { + "epoch": 0.10815007104505608, + "grad_norm": 0.7178425192832947, + "learning_rate": 0.0002999398949280323, + "loss": 4.7016, + "step": 76970 + }, + { + "epoch": 0.10816412198321966, + "grad_norm": 0.7172357439994812, + "learning_rate": 0.0002999396864973106, + "loss": 4.8938, + "step": 76980 + }, + { + "epoch": 0.10817817292138325, + "grad_norm": 0.7308405637741089, + "learning_rate": 0.0002999394777058916, + "loss": 4.7532, + "step": 76990 + }, + { + "epoch": 0.10819222385954683, + "grad_norm": 0.715327799320221, + "learning_rate": 0.00029993926855377594, + "loss": 4.7328, + "step": 77000 + }, + { + "epoch": 0.1082062747977104, + "grad_norm": 0.6993787288665771, + "learning_rate": 0.000299939059040964, + "loss": 4.7433, + "step": 77010 + }, + { + "epoch": 0.10822032573587398, + "grad_norm": 0.7134409546852112, + "learning_rate": 0.0002999388491674564, + "loss": 4.6656, + "step": 77020 + }, + { + "epoch": 0.10823437667403756, + "grad_norm": 0.7323780655860901, + "learning_rate": 0.00029993863893325355, + "loss": 4.7395, + "step": 77030 + }, + { + "epoch": 0.10824842761220113, + "grad_norm": 0.7384430170059204, + "learning_rate": 0.00029993844941407694, + "loss": 4.701, + "step": 77040 + }, + { + "epoch": 0.10826247855036471, + "grad_norm": 0.7190428972244263, + "learning_rate": 0.00029993823849455454, + "loss": 4.8667, + "step": 77050 + }, + { + "epoch": 0.10827652948852828, + "grad_norm": 0.7207445502281189, + "learning_rate": 0.0002999380272143384, + "loss": 4.8387, + "step": 77060 + }, + { + "epoch": 0.10829058042669186, + "grad_norm": 0.7112932205200195, + "learning_rate": 0.000299937815573429, + "loss": 4.7327, + "step": 77070 + }, + { + "epoch": 0.10830463136485544, + "grad_norm": 
0.7273050546646118, + "learning_rate": 0.00029993760357182686, + "loss": 4.7299, + "step": 77080 + }, + { + "epoch": 0.10831868230301903, + "grad_norm": 0.7051961421966553, + "learning_rate": 0.0002999373912095325, + "loss": 4.7734, + "step": 77090 + }, + { + "epoch": 0.1083327332411826, + "grad_norm": 0.7308077216148376, + "learning_rate": 0.0002999371784865464, + "loss": 4.859, + "step": 77100 + }, + { + "epoch": 0.10834678417934618, + "grad_norm": 0.7368705868721008, + "learning_rate": 0.00029993696540286913, + "loss": 4.7365, + "step": 77110 + }, + { + "epoch": 0.10836083511750975, + "grad_norm": 0.7260712385177612, + "learning_rate": 0.00029993675195850116, + "loss": 4.822, + "step": 77120 + }, + { + "epoch": 0.10837488605567333, + "grad_norm": 0.7681806087493896, + "learning_rate": 0.000299936538153443, + "loss": 4.7356, + "step": 77130 + }, + { + "epoch": 0.1083889369938369, + "grad_norm": 0.7338926196098328, + "learning_rate": 0.0002999363239876952, + "loss": 4.8259, + "step": 77140 + }, + { + "epoch": 0.10840298793200048, + "grad_norm": 0.6925027370452881, + "learning_rate": 0.0002999361094612582, + "loss": 4.723, + "step": 77150 + }, + { + "epoch": 0.10841703887016406, + "grad_norm": 0.7058929800987244, + "learning_rate": 0.0002999358945741326, + "loss": 4.7304, + "step": 77160 + }, + { + "epoch": 0.10843108980832764, + "grad_norm": 0.7068637013435364, + "learning_rate": 0.00029993567932631886, + "loss": 4.9076, + "step": 77170 + }, + { + "epoch": 0.10844514074649121, + "grad_norm": 0.7397764921188354, + "learning_rate": 0.00029993546371781754, + "loss": 4.7557, + "step": 77180 + }, + { + "epoch": 0.1084591916846548, + "grad_norm": 0.7128493189811707, + "learning_rate": 0.0002999352477486291, + "loss": 4.7219, + "step": 77190 + }, + { + "epoch": 0.10847324262281838, + "grad_norm": 0.7162886261940002, + "learning_rate": 0.00029993503141875413, + "loss": 4.7888, + "step": 77200 + }, + { + "epoch": 0.10848729356098195, + "grad_norm": 0.7667913436889648, + 
"learning_rate": 0.00029993481472819313, + "loss": 4.8192, + "step": 77210 + }, + { + "epoch": 0.10850134449914553, + "grad_norm": 0.707908034324646, + "learning_rate": 0.0002999345976769466, + "loss": 4.7827, + "step": 77220 + }, + { + "epoch": 0.1085153954373091, + "grad_norm": 0.7176185250282288, + "learning_rate": 0.00029993438026501513, + "loss": 4.6943, + "step": 77230 + }, + { + "epoch": 0.10852944637547268, + "grad_norm": 0.7139223217964172, + "learning_rate": 0.0002999341624923991, + "loss": 4.7028, + "step": 77240 + }, + { + "epoch": 0.10854349731363626, + "grad_norm": 0.7057112455368042, + "learning_rate": 0.00029993394435909917, + "loss": 4.7016, + "step": 77250 + }, + { + "epoch": 0.10855754825179983, + "grad_norm": 0.7334666848182678, + "learning_rate": 0.0002999337258651158, + "loss": 4.7411, + "step": 77260 + }, + { + "epoch": 0.10857159918996341, + "grad_norm": 0.7039465308189392, + "learning_rate": 0.0002999335070104496, + "loss": 4.8021, + "step": 77270 + }, + { + "epoch": 0.10858565012812699, + "grad_norm": 0.716081976890564, + "learning_rate": 0.0002999332877951009, + "loss": 4.7026, + "step": 77280 + }, + { + "epoch": 0.10859970106629056, + "grad_norm": 0.725543737411499, + "learning_rate": 0.00029993306821907037, + "loss": 4.7908, + "step": 77290 + }, + { + "epoch": 0.10861375200445415, + "grad_norm": 0.7131525874137878, + "learning_rate": 0.0002999328482823586, + "loss": 4.7378, + "step": 77300 + }, + { + "epoch": 0.10862780294261773, + "grad_norm": 0.7272675037384033, + "learning_rate": 0.00029993262798496604, + "loss": 4.6323, + "step": 77310 + }, + { + "epoch": 0.1086418538807813, + "grad_norm": 0.8117364645004272, + "learning_rate": 0.0002999324073268931, + "loss": 4.7449, + "step": 77320 + }, + { + "epoch": 0.10865590481894488, + "grad_norm": 0.7216888666152954, + "learning_rate": 0.00029993218630814056, + "loss": 4.6925, + "step": 77330 + }, + { + "epoch": 0.10866995575710846, + "grad_norm": 0.7425556778907776, + "learning_rate": 
0.0002999319649287088, + "loss": 4.8035, + "step": 77340 + }, + { + "epoch": 0.10868400669527203, + "grad_norm": 0.7141205668449402, + "learning_rate": 0.00029993174318859836, + "loss": 4.7453, + "step": 77350 + }, + { + "epoch": 0.10869805763343561, + "grad_norm": 0.7087002992630005, + "learning_rate": 0.0002999315210878098, + "loss": 4.6944, + "step": 77360 + }, + { + "epoch": 0.10871210857159919, + "grad_norm": 0.7184770107269287, + "learning_rate": 0.0002999312986263436, + "loss": 4.8394, + "step": 77370 + }, + { + "epoch": 0.10872615950976276, + "grad_norm": 0.7024494409561157, + "learning_rate": 0.0002999310758042004, + "loss": 4.7444, + "step": 77380 + }, + { + "epoch": 0.10874021044792634, + "grad_norm": 0.723019540309906, + "learning_rate": 0.0002999308526213806, + "loss": 4.7096, + "step": 77390 + }, + { + "epoch": 0.10875426138608993, + "grad_norm": 0.731758713722229, + "learning_rate": 0.0002999306290778849, + "loss": 4.7041, + "step": 77400 + }, + { + "epoch": 0.1087683123242535, + "grad_norm": 0.7360407710075378, + "learning_rate": 0.0002999304051737137, + "loss": 4.7408, + "step": 77410 + }, + { + "epoch": 0.10878236326241708, + "grad_norm": 0.9013959169387817, + "learning_rate": 0.00029993018090886765, + "loss": 4.8065, + "step": 77420 + }, + { + "epoch": 0.10879641420058066, + "grad_norm": 0.716956377029419, + "learning_rate": 0.00029992995628334715, + "loss": 4.7094, + "step": 77430 + }, + { + "epoch": 0.10881046513874423, + "grad_norm": 0.7028824090957642, + "learning_rate": 0.0002999297312971529, + "loss": 4.7542, + "step": 77440 + }, + { + "epoch": 0.10882451607690781, + "grad_norm": 0.7272639274597168, + "learning_rate": 0.00029992950595028524, + "loss": 4.7021, + "step": 77450 + }, + { + "epoch": 0.10883856701507139, + "grad_norm": 0.769668459892273, + "learning_rate": 0.0002999292802427449, + "loss": 4.7857, + "step": 77460 + }, + { + "epoch": 0.10885261795323496, + "grad_norm": 0.7413691878318787, + "learning_rate": 0.00029992905417453237, 
+ "loss": 4.7665, + "step": 77470 + }, + { + "epoch": 0.10886666889139854, + "grad_norm": 0.7249032258987427, + "learning_rate": 0.0002999288277456482, + "loss": 4.6758, + "step": 77480 + }, + { + "epoch": 0.10888071982956211, + "grad_norm": 0.7224403023719788, + "learning_rate": 0.00029992860095609286, + "loss": 4.696, + "step": 77490 + }, + { + "epoch": 0.1088947707677257, + "grad_norm": 0.708949863910675, + "learning_rate": 0.00029992837380586694, + "loss": 4.7162, + "step": 77500 + }, + { + "epoch": 0.10890882170588928, + "grad_norm": 0.7245976328849792, + "learning_rate": 0.000299928146294971, + "loss": 4.6874, + "step": 77510 + }, + { + "epoch": 0.10892287264405286, + "grad_norm": 1.1022865772247314, + "learning_rate": 0.00029992791842340565, + "loss": 4.7513, + "step": 77520 + }, + { + "epoch": 0.10893692358221643, + "grad_norm": 0.7656620740890503, + "learning_rate": 0.0002999276901911713, + "loss": 4.6487, + "step": 77530 + }, + { + "epoch": 0.10895097452038001, + "grad_norm": 0.698890209197998, + "learning_rate": 0.00029992746159826863, + "loss": 4.7322, + "step": 77540 + }, + { + "epoch": 0.10896502545854359, + "grad_norm": 0.7267652153968811, + "learning_rate": 0.00029992723264469807, + "loss": 4.7124, + "step": 77550 + }, + { + "epoch": 0.10897907639670716, + "grad_norm": 0.7437712550163269, + "learning_rate": 0.0002999270033304603, + "loss": 4.7506, + "step": 77560 + }, + { + "epoch": 0.10899312733487074, + "grad_norm": 0.7425222396850586, + "learning_rate": 0.0002999267736555557, + "loss": 4.6336, + "step": 77570 + }, + { + "epoch": 0.10900717827303431, + "grad_norm": 0.735388994216919, + "learning_rate": 0.00029992654361998497, + "loss": 4.7389, + "step": 77580 + }, + { + "epoch": 0.10902122921119789, + "grad_norm": 0.7018815279006958, + "learning_rate": 0.00029992631322374864, + "loss": 4.7882, + "step": 77590 + }, + { + "epoch": 0.10903528014936147, + "grad_norm": 0.7219749689102173, + "learning_rate": 0.00029992608246684726, + "loss": 4.773, + 
"step": 77600 + }, + { + "epoch": 0.10904933108752506, + "grad_norm": 0.7270717024803162, + "learning_rate": 0.00029992585134928137, + "loss": 4.7005, + "step": 77610 + }, + { + "epoch": 0.10906338202568863, + "grad_norm": 0.6950241327285767, + "learning_rate": 0.00029992561987105145, + "loss": 4.7569, + "step": 77620 + }, + { + "epoch": 0.10907743296385221, + "grad_norm": 0.711341142654419, + "learning_rate": 0.0002999253880321582, + "loss": 4.7915, + "step": 77630 + }, + { + "epoch": 0.10909148390201578, + "grad_norm": 0.7145726084709167, + "learning_rate": 0.00029992515583260207, + "loss": 4.6679, + "step": 77640 + }, + { + "epoch": 0.10910553484017936, + "grad_norm": 0.7017688155174255, + "learning_rate": 0.0002999249232723837, + "loss": 4.897, + "step": 77650 + }, + { + "epoch": 0.10911958577834294, + "grad_norm": 0.7016028165817261, + "learning_rate": 0.0002999246903515035, + "loss": 4.7972, + "step": 77660 + }, + { + "epoch": 0.10913363671650651, + "grad_norm": 0.739503026008606, + "learning_rate": 0.00029992445706996225, + "loss": 4.7323, + "step": 77670 + }, + { + "epoch": 0.10914768765467009, + "grad_norm": 0.7204867601394653, + "learning_rate": 0.00029992422342776037, + "loss": 4.7506, + "step": 77680 + }, + { + "epoch": 0.10916173859283367, + "grad_norm": 0.713930070400238, + "learning_rate": 0.0002999239894248984, + "loss": 4.8078, + "step": 77690 + }, + { + "epoch": 0.10917578953099724, + "grad_norm": 0.7046233415603638, + "learning_rate": 0.00029992375506137704, + "loss": 4.6674, + "step": 77700 + }, + { + "epoch": 0.10918984046916083, + "grad_norm": 0.731946587562561, + "learning_rate": 0.0002999235203371967, + "loss": 4.7776, + "step": 77710 + }, + { + "epoch": 0.10920389140732441, + "grad_norm": 0.7064023017883301, + "learning_rate": 0.00029992328525235804, + "loss": 4.749, + "step": 77720 + }, + { + "epoch": 0.10921794234548798, + "grad_norm": 0.693246066570282, + "learning_rate": 0.00029992304980686155, + "loss": 4.7499, + "step": 77730 + }, + { 
+ "epoch": 0.10923199328365156, + "grad_norm": 0.7170222997665405, + "learning_rate": 0.0002999228140007079, + "loss": 4.6803, + "step": 77740 + }, + { + "epoch": 0.10924604422181514, + "grad_norm": 0.7952947616577148, + "learning_rate": 0.0002999225778338976, + "loss": 4.6546, + "step": 77750 + }, + { + "epoch": 0.10926009515997871, + "grad_norm": 0.7248690128326416, + "learning_rate": 0.0002999223413064312, + "loss": 4.6298, + "step": 77760 + }, + { + "epoch": 0.10927414609814229, + "grad_norm": 0.7593909502029419, + "learning_rate": 0.0002999221044183093, + "loss": 4.7174, + "step": 77770 + }, + { + "epoch": 0.10928819703630586, + "grad_norm": 0.7026767730712891, + "learning_rate": 0.0002999218671695324, + "loss": 4.5835, + "step": 77780 + }, + { + "epoch": 0.10930224797446944, + "grad_norm": 0.7110097408294678, + "learning_rate": 0.0002999216295601012, + "loss": 4.8037, + "step": 77790 + }, + { + "epoch": 0.10931629891263302, + "grad_norm": 0.7158304452896118, + "learning_rate": 0.00029992139159001614, + "loss": 4.7756, + "step": 77800 + }, + { + "epoch": 0.10933034985079661, + "grad_norm": 0.7022632360458374, + "learning_rate": 0.0002999211532592779, + "loss": 4.7069, + "step": 77810 + }, + { + "epoch": 0.10934440078896018, + "grad_norm": 0.7358798384666443, + "learning_rate": 0.000299920914567887, + "loss": 4.8487, + "step": 77820 + }, + { + "epoch": 0.10935845172712376, + "grad_norm": 0.721831738948822, + "learning_rate": 0.00029992067551584396, + "loss": 4.7852, + "step": 77830 + }, + { + "epoch": 0.10937250266528734, + "grad_norm": 0.7131444811820984, + "learning_rate": 0.0002999204361031495, + "loss": 4.8763, + "step": 77840 + }, + { + "epoch": 0.10938655360345091, + "grad_norm": 0.7156862616539001, + "learning_rate": 0.00029992019632980405, + "loss": 4.7805, + "step": 77850 + }, + { + "epoch": 0.10940060454161449, + "grad_norm": 0.7103162407875061, + "learning_rate": 0.00029991995619580824, + "loss": 4.7374, + "step": 77860 + }, + { + "epoch": 
0.10941465547977806, + "grad_norm": 0.6913536190986633, + "learning_rate": 0.0002999197157011627, + "loss": 4.6408, + "step": 77870 + }, + { + "epoch": 0.10942870641794164, + "grad_norm": 0.7195391058921814, + "learning_rate": 0.0002999194748458679, + "loss": 4.8017, + "step": 77880 + }, + { + "epoch": 0.10944275735610522, + "grad_norm": 0.7326173186302185, + "learning_rate": 0.0002999192336299246, + "loss": 4.729, + "step": 77890 + }, + { + "epoch": 0.10945680829426879, + "grad_norm": 0.721824049949646, + "learning_rate": 0.0002999189920533332, + "loss": 4.7534, + "step": 77900 + }, + { + "epoch": 0.10947085923243237, + "grad_norm": 0.719916582107544, + "learning_rate": 0.00029991875011609433, + "loss": 4.7614, + "step": 77910 + }, + { + "epoch": 0.10948491017059596, + "grad_norm": 0.7365716099739075, + "learning_rate": 0.0002999185078182086, + "loss": 4.6417, + "step": 77920 + }, + { + "epoch": 0.10949896110875953, + "grad_norm": 0.7198102474212646, + "learning_rate": 0.00029991826515967653, + "loss": 4.6395, + "step": 77930 + }, + { + "epoch": 0.10951301204692311, + "grad_norm": 0.7074626088142395, + "learning_rate": 0.0002999180221404988, + "loss": 4.8106, + "step": 77940 + }, + { + "epoch": 0.10952706298508669, + "grad_norm": 0.7036644220352173, + "learning_rate": 0.000299917778760676, + "loss": 4.6494, + "step": 77950 + }, + { + "epoch": 0.10954111392325026, + "grad_norm": 0.7054417729377747, + "learning_rate": 0.00029991753502020855, + "loss": 4.8741, + "step": 77960 + }, + { + "epoch": 0.10955516486141384, + "grad_norm": 0.7201747298240662, + "learning_rate": 0.0002999172909190972, + "loss": 4.8208, + "step": 77970 + }, + { + "epoch": 0.10956921579957742, + "grad_norm": 0.7417471408843994, + "learning_rate": 0.0002999170464573425, + "loss": 4.6939, + "step": 77980 + }, + { + "epoch": 0.10958326673774099, + "grad_norm": 0.7059158086776733, + "learning_rate": 0.000299916801634945, + "loss": 4.6488, + "step": 77990 + }, + { + "epoch": 0.10959731767590457, + 
"grad_norm": 0.7165995836257935, + "learning_rate": 0.00029991655645190534, + "loss": 4.6077, + "step": 78000 + }, + { + "epoch": 0.10961136861406814, + "grad_norm": 0.6998836398124695, + "learning_rate": 0.0002999163109082241, + "loss": 4.7637, + "step": 78010 + }, + { + "epoch": 0.10962541955223173, + "grad_norm": 0.7285990715026855, + "learning_rate": 0.00029991606500390184, + "loss": 4.7262, + "step": 78020 + }, + { + "epoch": 0.10963947049039531, + "grad_norm": 0.7066509127616882, + "learning_rate": 0.0002999158187389391, + "loss": 4.8026, + "step": 78030 + }, + { + "epoch": 0.10965352142855889, + "grad_norm": 0.7119672894477844, + "learning_rate": 0.0002999155721133366, + "loss": 4.7852, + "step": 78040 + }, + { + "epoch": 0.10966757236672246, + "grad_norm": 0.7337003946304321, + "learning_rate": 0.0002999153251270949, + "loss": 4.7541, + "step": 78050 + }, + { + "epoch": 0.10968162330488604, + "grad_norm": 0.7333500981330872, + "learning_rate": 0.0002999150777802145, + "loss": 4.8055, + "step": 78060 + }, + { + "epoch": 0.10969567424304962, + "grad_norm": 0.7334858775138855, + "learning_rate": 0.0002999148300726961, + "loss": 4.719, + "step": 78070 + }, + { + "epoch": 0.10970972518121319, + "grad_norm": 0.7448481917381287, + "learning_rate": 0.00029991458200454024, + "loss": 4.591, + "step": 78080 + }, + { + "epoch": 0.10972377611937677, + "grad_norm": 0.7082262635231018, + "learning_rate": 0.0002999143335757475, + "loss": 4.6573, + "step": 78090 + }, + { + "epoch": 0.10973782705754034, + "grad_norm": 0.752067506313324, + "learning_rate": 0.00029991408478631857, + "loss": 4.801, + "step": 78100 + }, + { + "epoch": 0.10975187799570392, + "grad_norm": 0.7344422936439514, + "learning_rate": 0.000299913835636254, + "loss": 4.6786, + "step": 78110 + }, + { + "epoch": 0.10976592893386751, + "grad_norm": 0.7434372901916504, + "learning_rate": 0.00029991358612555435, + "loss": 4.7506, + "step": 78120 + }, + { + "epoch": 0.10977997987203109, + "grad_norm": 
0.7248328924179077, + "learning_rate": 0.00029991333625422023, + "loss": 4.766, + "step": 78130 + }, + { + "epoch": 0.10979403081019466, + "grad_norm": 0.7229834794998169, + "learning_rate": 0.00029991308602225227, + "loss": 4.7095, + "step": 78140 + }, + { + "epoch": 0.10980808174835824, + "grad_norm": 0.718464732170105, + "learning_rate": 0.0002999128354296511, + "loss": 4.731, + "step": 78150 + }, + { + "epoch": 0.10982213268652181, + "grad_norm": 0.7103954553604126, + "learning_rate": 0.00029991258447641724, + "loss": 4.7028, + "step": 78160 + }, + { + "epoch": 0.10983618362468539, + "grad_norm": 0.7132537961006165, + "learning_rate": 0.00029991233316255136, + "loss": 4.5993, + "step": 78170 + }, + { + "epoch": 0.10985023456284897, + "grad_norm": 0.7273306846618652, + "learning_rate": 0.00029991208148805404, + "loss": 4.7597, + "step": 78180 + }, + { + "epoch": 0.10986428550101254, + "grad_norm": 0.7169373035430908, + "learning_rate": 0.00029991182945292587, + "loss": 4.821, + "step": 78190 + }, + { + "epoch": 0.10987833643917612, + "grad_norm": 0.7016663551330566, + "learning_rate": 0.0002999115770571675, + "loss": 4.7387, + "step": 78200 + }, + { + "epoch": 0.1098923873773397, + "grad_norm": 0.7153110504150391, + "learning_rate": 0.00029991132430077946, + "loss": 4.6402, + "step": 78210 + }, + { + "epoch": 0.10990643831550327, + "grad_norm": 0.7164067625999451, + "learning_rate": 0.0002999110711837625, + "loss": 4.7666, + "step": 78220 + }, + { + "epoch": 0.10992048925366686, + "grad_norm": 0.7064258456230164, + "learning_rate": 0.0002999108177061171, + "loss": 4.7787, + "step": 78230 + }, + { + "epoch": 0.10993454019183044, + "grad_norm": 0.7285356521606445, + "learning_rate": 0.0002999105638678439, + "loss": 4.8512, + "step": 78240 + }, + { + "epoch": 0.10994859112999401, + "grad_norm": 0.7336903810501099, + "learning_rate": 0.00029991030966894345, + "loss": 4.7763, + "step": 78250 + }, + { + "epoch": 0.10996264206815759, + "grad_norm": 0.7162684798240662, 
+ "learning_rate": 0.0002999100551094165, + "loss": 4.755, + "step": 78260 + }, + { + "epoch": 0.10997669300632117, + "grad_norm": 0.7184701561927795, + "learning_rate": 0.0002999098001892636, + "loss": 4.7169, + "step": 78270 + }, + { + "epoch": 0.10999074394448474, + "grad_norm": 0.7117341756820679, + "learning_rate": 0.00029990954490848536, + "loss": 4.7958, + "step": 78280 + }, + { + "epoch": 0.11000479488264832, + "grad_norm": 0.7427325248718262, + "learning_rate": 0.00029990928926708235, + "loss": 4.7711, + "step": 78290 + }, + { + "epoch": 0.1100188458208119, + "grad_norm": 0.7277485728263855, + "learning_rate": 0.00029990903326505527, + "loss": 4.7021, + "step": 78300 + }, + { + "epoch": 0.11003289675897547, + "grad_norm": 0.7296305894851685, + "learning_rate": 0.0002999087769024047, + "loss": 4.73, + "step": 78310 + }, + { + "epoch": 0.11004694769713905, + "grad_norm": 0.7299262285232544, + "learning_rate": 0.0002999085201791312, + "loss": 4.8023, + "step": 78320 + }, + { + "epoch": 0.11006099863530264, + "grad_norm": 0.7146306037902832, + "learning_rate": 0.0002999082630952354, + "loss": 4.6505, + "step": 78330 + }, + { + "epoch": 0.11007504957346621, + "grad_norm": 0.7327890396118164, + "learning_rate": 0.00029990800565071806, + "loss": 4.7601, + "step": 78340 + }, + { + "epoch": 0.11008910051162979, + "grad_norm": 0.716513454914093, + "learning_rate": 0.00029990774784557965, + "loss": 4.7456, + "step": 78350 + }, + { + "epoch": 0.11010315144979337, + "grad_norm": 0.7176522612571716, + "learning_rate": 0.0002999074896798208, + "loss": 4.856, + "step": 78360 + }, + { + "epoch": 0.11011720238795694, + "grad_norm": 0.7281580567359924, + "learning_rate": 0.0002999072311534422, + "loss": 4.7127, + "step": 78370 + }, + { + "epoch": 0.11013125332612052, + "grad_norm": 0.7641294002532959, + "learning_rate": 0.00029990697226644445, + "loss": 4.6719, + "step": 78380 + }, + { + "epoch": 0.1101453042642841, + "grad_norm": 0.7364638447761536, + "learning_rate": 
0.00029990671301882813, + "loss": 4.672, + "step": 78390 + }, + { + "epoch": 0.11015935520244767, + "grad_norm": 0.7401412725448608, + "learning_rate": 0.0002999064534105939, + "loss": 4.8599, + "step": 78400 + }, + { + "epoch": 0.11017340614061125, + "grad_norm": 0.7288405895233154, + "learning_rate": 0.0002999061934417424, + "loss": 4.7307, + "step": 78410 + }, + { + "epoch": 0.11018745707877482, + "grad_norm": 0.7121359705924988, + "learning_rate": 0.0002999059331122742, + "loss": 4.7776, + "step": 78420 + }, + { + "epoch": 0.11020150801693841, + "grad_norm": 0.6956414580345154, + "learning_rate": 0.00029990567242219, + "loss": 4.6886, + "step": 78430 + }, + { + "epoch": 0.11021555895510199, + "grad_norm": 0.7270907163619995, + "learning_rate": 0.00029990541137149036, + "loss": 4.6126, + "step": 78440 + }, + { + "epoch": 0.11022960989326556, + "grad_norm": 0.731920063495636, + "learning_rate": 0.0002999051499601759, + "loss": 4.7614, + "step": 78450 + }, + { + "epoch": 0.11024366083142914, + "grad_norm": 0.7504376769065857, + "learning_rate": 0.0002999048881882473, + "loss": 4.6018, + "step": 78460 + }, + { + "epoch": 0.11025771176959272, + "grad_norm": 0.7335795164108276, + "learning_rate": 0.0002999046260557052, + "loss": 4.6259, + "step": 78470 + }, + { + "epoch": 0.1102717627077563, + "grad_norm": 0.7405069470405579, + "learning_rate": 0.0002999043635625502, + "loss": 4.7949, + "step": 78480 + }, + { + "epoch": 0.11028581364591987, + "grad_norm": 0.7165108323097229, + "learning_rate": 0.00029990410070878293, + "loss": 4.8312, + "step": 78490 + }, + { + "epoch": 0.11029986458408345, + "grad_norm": 0.7114459872245789, + "learning_rate": 0.00029990383749440403, + "loss": 4.6969, + "step": 78500 + }, + { + "epoch": 0.11031391552224702, + "grad_norm": 0.7313840389251709, + "learning_rate": 0.00029990357391941413, + "loss": 4.7718, + "step": 78510 + }, + { + "epoch": 0.1103279664604106, + "grad_norm": 0.7081531882286072, + "learning_rate": 0.0002999033099838138, + 
"loss": 4.7343, + "step": 78520 + }, + { + "epoch": 0.11034201739857417, + "grad_norm": 0.7249537706375122, + "learning_rate": 0.00029990304568760384, + "loss": 4.7134, + "step": 78530 + }, + { + "epoch": 0.11035606833673776, + "grad_norm": 0.7393092513084412, + "learning_rate": 0.00029990278103078475, + "loss": 4.7877, + "step": 78540 + }, + { + "epoch": 0.11037011927490134, + "grad_norm": 0.7348365783691406, + "learning_rate": 0.00029990251601335715, + "loss": 4.693, + "step": 78550 + }, + { + "epoch": 0.11038417021306492, + "grad_norm": 0.7228670120239258, + "learning_rate": 0.00029990225063532185, + "loss": 4.7112, + "step": 78560 + }, + { + "epoch": 0.11039822115122849, + "grad_norm": 0.7742883563041687, + "learning_rate": 0.0002999019848966793, + "loss": 4.7404, + "step": 78570 + }, + { + "epoch": 0.11041227208939207, + "grad_norm": 0.7462913393974304, + "learning_rate": 0.00029990171879743015, + "loss": 4.699, + "step": 78580 + }, + { + "epoch": 0.11042632302755565, + "grad_norm": 0.7195731997489929, + "learning_rate": 0.00029990145233757515, + "loss": 4.745, + "step": 78590 + }, + { + "epoch": 0.11044037396571922, + "grad_norm": 0.7176849246025085, + "learning_rate": 0.00029990118551711484, + "loss": 4.7519, + "step": 78600 + }, + { + "epoch": 0.1104544249038828, + "grad_norm": 0.7326333522796631, + "learning_rate": 0.00029990091833604997, + "loss": 4.6879, + "step": 78610 + }, + { + "epoch": 0.11046847584204637, + "grad_norm": 0.7096735239028931, + "learning_rate": 0.00029990065079438114, + "loss": 4.7295, + "step": 78620 + }, + { + "epoch": 0.11048252678020995, + "grad_norm": 0.7012742757797241, + "learning_rate": 0.00029990038289210894, + "loss": 4.5788, + "step": 78630 + }, + { + "epoch": 0.11049657771837354, + "grad_norm": 0.7373076677322388, + "learning_rate": 0.00029990011462923403, + "loss": 4.7283, + "step": 78640 + }, + { + "epoch": 0.11051062865653712, + "grad_norm": 0.7383387684822083, + "learning_rate": 0.0002998998460057571, + "loss": 4.718, + 
"step": 78650 + }, + { + "epoch": 0.11052467959470069, + "grad_norm": 0.7086196541786194, + "learning_rate": 0.00029989957702167877, + "loss": 4.7736, + "step": 78660 + }, + { + "epoch": 0.11053873053286427, + "grad_norm": 0.7198120951652527, + "learning_rate": 0.0002998993076769997, + "loss": 4.7757, + "step": 78670 + }, + { + "epoch": 0.11055278147102784, + "grad_norm": 0.7587843537330627, + "learning_rate": 0.00029989903797172056, + "loss": 4.801, + "step": 78680 + }, + { + "epoch": 0.11056683240919142, + "grad_norm": 0.7173702716827393, + "learning_rate": 0.0002998987679058419, + "loss": 4.7577, + "step": 78690 + }, + { + "epoch": 0.110580883347355, + "grad_norm": 0.7113364934921265, + "learning_rate": 0.0002998984974793645, + "loss": 4.7938, + "step": 78700 + }, + { + "epoch": 0.11059493428551857, + "grad_norm": 0.7075303196907043, + "learning_rate": 0.00029989822669228885, + "loss": 4.8154, + "step": 78710 + }, + { + "epoch": 0.11060898522368215, + "grad_norm": 0.749019205570221, + "learning_rate": 0.00029989795554461574, + "loss": 4.6428, + "step": 78720 + }, + { + "epoch": 0.11062303616184573, + "grad_norm": 0.7221829295158386, + "learning_rate": 0.00029989768403634583, + "loss": 4.7743, + "step": 78730 + }, + { + "epoch": 0.11063708710000932, + "grad_norm": 0.7469913959503174, + "learning_rate": 0.0002998974121674797, + "loss": 4.7479, + "step": 78740 + }, + { + "epoch": 0.11065113803817289, + "grad_norm": 0.7294034361839294, + "learning_rate": 0.000299897139938018, + "loss": 4.6667, + "step": 78750 + }, + { + "epoch": 0.11066518897633647, + "grad_norm": 0.7280092239379883, + "learning_rate": 0.00029989686734796143, + "loss": 4.8348, + "step": 78760 + }, + { + "epoch": 0.11067923991450004, + "grad_norm": 0.7117518782615662, + "learning_rate": 0.0002998965943973106, + "loss": 4.7427, + "step": 78770 + }, + { + "epoch": 0.11069329085266362, + "grad_norm": 0.7216711640357971, + "learning_rate": 0.0002998963210860662, + "loss": 4.6675, + "step": 78780 + }, + { 
+ "epoch": 0.1107073417908272, + "grad_norm": 0.7047522664070129, + "learning_rate": 0.00029989604741422897, + "loss": 4.7768, + "step": 78790 + }, + { + "epoch": 0.11072139272899077, + "grad_norm": 0.7834486961364746, + "learning_rate": 0.0002998957733817994, + "loss": 4.5577, + "step": 78800 + }, + { + "epoch": 0.11073544366715435, + "grad_norm": 0.6964229345321655, + "learning_rate": 0.00029989549898877825, + "loss": 4.7269, + "step": 78810 + }, + { + "epoch": 0.11074949460531792, + "grad_norm": 0.7209952473640442, + "learning_rate": 0.00029989522423516614, + "loss": 4.6683, + "step": 78820 + }, + { + "epoch": 0.1107635455434815, + "grad_norm": 0.7614384293556213, + "learning_rate": 0.0002998949491209638, + "loss": 4.6271, + "step": 78830 + }, + { + "epoch": 0.11077759648164508, + "grad_norm": 0.706362247467041, + "learning_rate": 0.00029989467364617176, + "loss": 4.7318, + "step": 78840 + }, + { + "epoch": 0.11079164741980867, + "grad_norm": 0.7691898941993713, + "learning_rate": 0.0002998943978107908, + "loss": 4.7161, + "step": 78850 + }, + { + "epoch": 0.11080569835797224, + "grad_norm": 0.6833986043930054, + "learning_rate": 0.00029989412161482155, + "loss": 4.7538, + "step": 78860 + }, + { + "epoch": 0.11081974929613582, + "grad_norm": 0.7353855967521667, + "learning_rate": 0.0002998938450582647, + "loss": 4.8051, + "step": 78870 + }, + { + "epoch": 0.1108338002342994, + "grad_norm": 0.7171785831451416, + "learning_rate": 0.00029989356814112087, + "loss": 4.7027, + "step": 78880 + }, + { + "epoch": 0.11084785117246297, + "grad_norm": 0.6899160146713257, + "learning_rate": 0.00029989329086339074, + "loss": 4.7299, + "step": 78890 + }, + { + "epoch": 0.11086190211062655, + "grad_norm": 0.6934748888015747, + "learning_rate": 0.000299893013225075, + "loss": 4.7226, + "step": 78900 + }, + { + "epoch": 0.11087595304879012, + "grad_norm": 0.7488577961921692, + "learning_rate": 0.0002998927352261743, + "loss": 4.7633, + "step": 78910 + }, + { + "epoch": 
0.1108900039869537, + "grad_norm": 0.6993347406387329, + "learning_rate": 0.0002998924568666893, + "loss": 4.6352, + "step": 78920 + }, + { + "epoch": 0.11090405492511728, + "grad_norm": 0.7227916717529297, + "learning_rate": 0.00029989217814662067, + "loss": 4.7705, + "step": 78930 + }, + { + "epoch": 0.11091810586328085, + "grad_norm": 0.7275491952896118, + "learning_rate": 0.00029989189906596907, + "loss": 4.7141, + "step": 78940 + }, + { + "epoch": 0.11093215680144444, + "grad_norm": 0.7532659769058228, + "learning_rate": 0.0002998916196247352, + "loss": 4.6849, + "step": 78950 + }, + { + "epoch": 0.11094620773960802, + "grad_norm": 0.7284137606620789, + "learning_rate": 0.00029989133982291974, + "loss": 4.7539, + "step": 78960 + }, + { + "epoch": 0.1109602586777716, + "grad_norm": 0.7159876823425293, + "learning_rate": 0.0002998910596605233, + "loss": 4.7526, + "step": 78970 + }, + { + "epoch": 0.11097430961593517, + "grad_norm": 0.7106112837791443, + "learning_rate": 0.00029989077913754667, + "loss": 4.7353, + "step": 78980 + }, + { + "epoch": 0.11098836055409875, + "grad_norm": 0.7276374697685242, + "learning_rate": 0.0002998904982539904, + "loss": 4.7902, + "step": 78990 + }, + { + "epoch": 0.11100241149226232, + "grad_norm": 0.7346372604370117, + "learning_rate": 0.0002998902170098552, + "loss": 4.8002, + "step": 79000 + }, + { + "epoch": 0.1110164624304259, + "grad_norm": 0.7175330519676208, + "learning_rate": 0.00029988993540514187, + "loss": 4.7478, + "step": 79010 + }, + { + "epoch": 0.11103051336858948, + "grad_norm": 0.7902714014053345, + "learning_rate": 0.00029988965343985085, + "loss": 4.6789, + "step": 79020 + }, + { + "epoch": 0.11104456430675305, + "grad_norm": 0.7199444770812988, + "learning_rate": 0.00029988937111398304, + "loss": 4.7025, + "step": 79030 + }, + { + "epoch": 0.11105861524491663, + "grad_norm": 0.7127149105072021, + "learning_rate": 0.000299889088427539, + "loss": 4.6838, + "step": 79040 + }, + { + "epoch": 0.11107266618308022, 
+ "grad_norm": 0.700092613697052, + "learning_rate": 0.0002998888053805195, + "loss": 4.7037, + "step": 79050 + }, + { + "epoch": 0.1110867171212438, + "grad_norm": 0.704186737537384, + "learning_rate": 0.0002998885219729251, + "loss": 4.9441, + "step": 79060 + }, + { + "epoch": 0.11110076805940737, + "grad_norm": 0.714832603931427, + "learning_rate": 0.00029988823820475654, + "loss": 4.8514, + "step": 79070 + }, + { + "epoch": 0.11111481899757095, + "grad_norm": 0.7268180251121521, + "learning_rate": 0.0002998879540760145, + "loss": 4.708, + "step": 79080 + }, + { + "epoch": 0.11112886993573452, + "grad_norm": 0.7234380841255188, + "learning_rate": 0.0002998876695866997, + "loss": 4.7114, + "step": 79090 + }, + { + "epoch": 0.1111429208738981, + "grad_norm": 0.7124701738357544, + "learning_rate": 0.0002998873847368128, + "loss": 4.7032, + "step": 79100 + }, + { + "epoch": 0.11115697181206168, + "grad_norm": 0.7144205570220947, + "learning_rate": 0.0002998870995263545, + "loss": 4.7765, + "step": 79110 + }, + { + "epoch": 0.11117102275022525, + "grad_norm": 0.7110472917556763, + "learning_rate": 0.00029988684252865396, + "loss": 4.7136, + "step": 79120 + }, + { + "epoch": 0.11118507368838883, + "grad_norm": 0.7378934025764465, + "learning_rate": 0.0002998865566331118, + "loss": 4.8435, + "step": 79130 + }, + { + "epoch": 0.1111991246265524, + "grad_norm": 0.7095884084701538, + "learning_rate": 0.00029988627037700027, + "loss": 4.8469, + "step": 79140 + }, + { + "epoch": 0.11121317556471598, + "grad_norm": 0.7001661062240601, + "learning_rate": 0.00029988598376031996, + "loss": 4.8903, + "step": 79150 + }, + { + "epoch": 0.11122722650287957, + "grad_norm": 0.7070997357368469, + "learning_rate": 0.00029988569678307157, + "loss": 4.8263, + "step": 79160 + }, + { + "epoch": 0.11124127744104315, + "grad_norm": 0.7273538112640381, + "learning_rate": 0.00029988540944525586, + "loss": 4.758, + "step": 79170 + }, + { + "epoch": 0.11125532837920672, + "grad_norm": 
0.7323654890060425, + "learning_rate": 0.0002998851217468735, + "loss": 4.645, + "step": 79180 + }, + { + "epoch": 0.1112693793173703, + "grad_norm": 0.6991866230964661, + "learning_rate": 0.0002998848336879251, + "loss": 4.687, + "step": 79190 + }, + { + "epoch": 0.11128343025553387, + "grad_norm": 0.7104476094245911, + "learning_rate": 0.0002998845452684114, + "loss": 4.7952, + "step": 79200 + }, + { + "epoch": 0.11129748119369745, + "grad_norm": 0.7157678604125977, + "learning_rate": 0.0002998842564883332, + "loss": 4.7413, + "step": 79210 + }, + { + "epoch": 0.11131153213186103, + "grad_norm": 0.7549179196357727, + "learning_rate": 0.0002998839673476911, + "loss": 4.6288, + "step": 79220 + }, + { + "epoch": 0.1113255830700246, + "grad_norm": 0.7367541193962097, + "learning_rate": 0.00029988367784648573, + "loss": 4.8033, + "step": 79230 + }, + { + "epoch": 0.11133963400818818, + "grad_norm": 0.7407969832420349, + "learning_rate": 0.0002998833879847179, + "loss": 4.7142, + "step": 79240 + }, + { + "epoch": 0.11135368494635176, + "grad_norm": 0.7445090413093567, + "learning_rate": 0.0002998830977623883, + "loss": 4.6799, + "step": 79250 + }, + { + "epoch": 0.11136773588451535, + "grad_norm": 0.7357890605926514, + "learning_rate": 0.0002998828071794975, + "loss": 4.6795, + "step": 79260 + }, + { + "epoch": 0.11138178682267892, + "grad_norm": 0.7026675343513489, + "learning_rate": 0.00029988251623604633, + "loss": 4.7248, + "step": 79270 + }, + { + "epoch": 0.1113958377608425, + "grad_norm": 0.7261384129524231, + "learning_rate": 0.00029988222493203547, + "loss": 4.8116, + "step": 79280 + }, + { + "epoch": 0.11140988869900607, + "grad_norm": 0.7369160056114197, + "learning_rate": 0.0002998819332674656, + "loss": 4.6674, + "step": 79290 + }, + { + "epoch": 0.11142393963716965, + "grad_norm": 0.7252683043479919, + "learning_rate": 0.00029988164124233744, + "loss": 4.7722, + "step": 79300 + }, + { + "epoch": 0.11143799057533323, + "grad_norm": 0.694120466709137, + 
"learning_rate": 0.0002998813488566516, + "loss": 4.7246, + "step": 79310 + }, + { + "epoch": 0.1114520415134968, + "grad_norm": 0.7220812439918518, + "learning_rate": 0.0002998810561104089, + "loss": 4.7342, + "step": 79320 + }, + { + "epoch": 0.11146609245166038, + "grad_norm": 0.7195317149162292, + "learning_rate": 0.00029988076300361004, + "loss": 4.5972, + "step": 79330 + }, + { + "epoch": 0.11148014338982395, + "grad_norm": 0.7166000008583069, + "learning_rate": 0.00029988046953625567, + "loss": 4.5616, + "step": 79340 + }, + { + "epoch": 0.11149419432798753, + "grad_norm": 0.7502967715263367, + "learning_rate": 0.0002998801757083465, + "loss": 4.8734, + "step": 79350 + }, + { + "epoch": 0.11150824526615112, + "grad_norm": 0.7110496163368225, + "learning_rate": 0.00029987988151988326, + "loss": 4.7498, + "step": 79360 + }, + { + "epoch": 0.1115222962043147, + "grad_norm": 0.7079729437828064, + "learning_rate": 0.0002998795869708666, + "loss": 4.7823, + "step": 79370 + }, + { + "epoch": 0.11153634714247827, + "grad_norm": 0.7025529146194458, + "learning_rate": 0.00029987929206129737, + "loss": 4.7594, + "step": 79380 + }, + { + "epoch": 0.11155039808064185, + "grad_norm": 0.6984058022499084, + "learning_rate": 0.00029987899679117613, + "loss": 4.8166, + "step": 79390 + }, + { + "epoch": 0.11156444901880543, + "grad_norm": 0.7246695160865784, + "learning_rate": 0.0002998787011605037, + "loss": 4.7495, + "step": 79400 + }, + { + "epoch": 0.111578499956969, + "grad_norm": 0.7176758050918579, + "learning_rate": 0.0002998784051692807, + "loss": 4.661, + "step": 79410 + }, + { + "epoch": 0.11159255089513258, + "grad_norm": 0.719412624835968, + "learning_rate": 0.00029987810881750785, + "loss": 4.7986, + "step": 79420 + }, + { + "epoch": 0.11160660183329615, + "grad_norm": 0.7081352472305298, + "learning_rate": 0.00029987781210518593, + "loss": 4.6177, + "step": 79430 + }, + { + "epoch": 0.11162065277145973, + "grad_norm": 0.7022585272789001, + "learning_rate": 
0.0002998775150323156, + "loss": 4.7622, + "step": 79440 + }, + { + "epoch": 0.1116347037096233, + "grad_norm": 0.7345691919326782, + "learning_rate": 0.0002998772175988976, + "loss": 4.7247, + "step": 79450 + }, + { + "epoch": 0.11164875464778688, + "grad_norm": 0.7096884250640869, + "learning_rate": 0.00029987691980493266, + "loss": 4.6959, + "step": 79460 + }, + { + "epoch": 0.11166280558595047, + "grad_norm": 0.7393691539764404, + "learning_rate": 0.00029987662165042143, + "loss": 4.6315, + "step": 79470 + }, + { + "epoch": 0.11167685652411405, + "grad_norm": 0.7320327758789062, + "learning_rate": 0.0002998763231353647, + "loss": 4.7236, + "step": 79480 + }, + { + "epoch": 0.11169090746227762, + "grad_norm": 0.730957567691803, + "learning_rate": 0.00029987602425976317, + "loss": 4.5586, + "step": 79490 + }, + { + "epoch": 0.1117049584004412, + "grad_norm": 0.7240452766418457, + "learning_rate": 0.00029987572502361754, + "loss": 4.7775, + "step": 79500 + }, + { + "epoch": 0.11171900933860478, + "grad_norm": 0.7169708013534546, + "learning_rate": 0.0002998754254269285, + "loss": 4.8488, + "step": 79510 + }, + { + "epoch": 0.11173306027676835, + "grad_norm": 0.7280612587928772, + "learning_rate": 0.00029987512546969687, + "loss": 4.6919, + "step": 79520 + }, + { + "epoch": 0.11174711121493193, + "grad_norm": 0.7089032530784607, + "learning_rate": 0.0002998748251519233, + "loss": 4.7049, + "step": 79530 + }, + { + "epoch": 0.1117611621530955, + "grad_norm": 0.7215970158576965, + "learning_rate": 0.0002998745244736085, + "loss": 4.8457, + "step": 79540 + }, + { + "epoch": 0.11177521309125908, + "grad_norm": 0.7137922048568726, + "learning_rate": 0.00029987422343475317, + "loss": 4.6256, + "step": 79550 + }, + { + "epoch": 0.11178926402942266, + "grad_norm": 0.768875241279602, + "learning_rate": 0.00029987392203535815, + "loss": 4.7273, + "step": 79560 + }, + { + "epoch": 0.11180331496758625, + "grad_norm": 0.7097663879394531, + "learning_rate": 
0.0002998736202754241, + "loss": 4.7569, + "step": 79570 + }, + { + "epoch": 0.11181736590574982, + "grad_norm": 0.7040906548500061, + "learning_rate": 0.00029987331815495175, + "loss": 4.8235, + "step": 79580 + }, + { + "epoch": 0.1118314168439134, + "grad_norm": 0.734401285648346, + "learning_rate": 0.0002998730156739418, + "loss": 4.7113, + "step": 79590 + }, + { + "epoch": 0.11184546778207698, + "grad_norm": 0.7665032148361206, + "learning_rate": 0.0002998727128323949, + "loss": 4.7411, + "step": 79600 + }, + { + "epoch": 0.11185951872024055, + "grad_norm": 0.7201694846153259, + "learning_rate": 0.000299872409630312, + "loss": 4.5992, + "step": 79610 + }, + { + "epoch": 0.11187356965840413, + "grad_norm": 0.7197522521018982, + "learning_rate": 0.00029987210606769365, + "loss": 4.7049, + "step": 79620 + }, + { + "epoch": 0.1118876205965677, + "grad_norm": 0.734275221824646, + "learning_rate": 0.0002998718021445406, + "loss": 4.761, + "step": 79630 + }, + { + "epoch": 0.11190167153473128, + "grad_norm": 0.7246469259262085, + "learning_rate": 0.0002998714978608537, + "loss": 4.6607, + "step": 79640 + }, + { + "epoch": 0.11191572247289486, + "grad_norm": 0.7006386518478394, + "learning_rate": 0.00029987119321663356, + "loss": 4.6202, + "step": 79650 + }, + { + "epoch": 0.11192977341105843, + "grad_norm": 0.7070479989051819, + "learning_rate": 0.00029987088821188093, + "loss": 4.817, + "step": 79660 + }, + { + "epoch": 0.11194382434922202, + "grad_norm": 0.7431702613830566, + "learning_rate": 0.00029987058284659655, + "loss": 4.9736, + "step": 79670 + }, + { + "epoch": 0.1119578752873856, + "grad_norm": 0.7081708908081055, + "learning_rate": 0.0002998702771207812, + "loss": 4.7651, + "step": 79680 + }, + { + "epoch": 0.11197192622554918, + "grad_norm": 0.7065561413764954, + "learning_rate": 0.0002998699710344356, + "loss": 4.8145, + "step": 79690 + }, + { + "epoch": 0.11198597716371275, + "grad_norm": 0.7250800728797913, + "learning_rate": 0.00029986966458756046, + 
"loss": 4.7197, + "step": 79700 + }, + { + "epoch": 0.11200002810187633, + "grad_norm": 0.7182665467262268, + "learning_rate": 0.0002998693577801565, + "loss": 4.6853, + "step": 79710 + }, + { + "epoch": 0.1120140790400399, + "grad_norm": 0.721691906452179, + "learning_rate": 0.0002998690506122245, + "loss": 4.7432, + "step": 79720 + }, + { + "epoch": 0.11202812997820348, + "grad_norm": 0.7138781547546387, + "learning_rate": 0.0002998687430837652, + "loss": 4.8657, + "step": 79730 + }, + { + "epoch": 0.11204218091636706, + "grad_norm": 0.7386598587036133, + "learning_rate": 0.0002998684351947793, + "loss": 4.5605, + "step": 79740 + }, + { + "epoch": 0.11205623185453063, + "grad_norm": 0.72069251537323, + "learning_rate": 0.0002998681269452676, + "loss": 4.7569, + "step": 79750 + }, + { + "epoch": 0.11207028279269421, + "grad_norm": 0.7185590863227844, + "learning_rate": 0.0002998678183352308, + "loss": 4.8125, + "step": 79760 + }, + { + "epoch": 0.11208433373085779, + "grad_norm": 0.71347576379776, + "learning_rate": 0.00029986750936466963, + "loss": 4.77, + "step": 79770 + }, + { + "epoch": 0.11209838466902138, + "grad_norm": 0.7057868242263794, + "learning_rate": 0.00029986720003358485, + "loss": 4.7322, + "step": 79780 + }, + { + "epoch": 0.11211243560718495, + "grad_norm": 0.7145351767539978, + "learning_rate": 0.0002998668903419772, + "loss": 4.7988, + "step": 79790 + }, + { + "epoch": 0.11212648654534853, + "grad_norm": 0.7262371778488159, + "learning_rate": 0.00029986658028984746, + "loss": 4.7664, + "step": 79800 + }, + { + "epoch": 0.1121405374835121, + "grad_norm": 0.702280580997467, + "learning_rate": 0.0002998662698771963, + "loss": 4.8152, + "step": 79810 + }, + { + "epoch": 0.11215458842167568, + "grad_norm": 0.7286804914474487, + "learning_rate": 0.00029986595910402457, + "loss": 4.7686, + "step": 79820 + }, + { + "epoch": 0.11216863935983926, + "grad_norm": 0.7360864281654358, + "learning_rate": 0.0002998656479703329, + "loss": 4.7821, + "step": 
79830 + }, + { + "epoch": 0.11218269029800283, + "grad_norm": 0.7273938059806824, + "learning_rate": 0.0002998653364761221, + "loss": 4.8064, + "step": 79840 + }, + { + "epoch": 0.11219674123616641, + "grad_norm": 0.7493408918380737, + "learning_rate": 0.000299865024621393, + "loss": 4.7691, + "step": 79850 + }, + { + "epoch": 0.11221079217432998, + "grad_norm": 0.7027277946472168, + "learning_rate": 0.00029986471240614623, + "loss": 4.5972, + "step": 79860 + }, + { + "epoch": 0.11222484311249356, + "grad_norm": 0.7099219560623169, + "learning_rate": 0.00029986439983038256, + "loss": 4.8091, + "step": 79870 + }, + { + "epoch": 0.11223889405065715, + "grad_norm": 0.726952314376831, + "learning_rate": 0.00029986408689410275, + "loss": 4.6214, + "step": 79880 + }, + { + "epoch": 0.11225294498882073, + "grad_norm": 0.7291332483291626, + "learning_rate": 0.0002998637735973076, + "loss": 4.7047, + "step": 79890 + }, + { + "epoch": 0.1122669959269843, + "grad_norm": 0.7252445220947266, + "learning_rate": 0.00029986345993999786, + "loss": 4.7274, + "step": 79900 + }, + { + "epoch": 0.11228104686514788, + "grad_norm": 0.7106611132621765, + "learning_rate": 0.0002998631459221742, + "loss": 4.7364, + "step": 79910 + }, + { + "epoch": 0.11229509780331146, + "grad_norm": 0.7336950898170471, + "learning_rate": 0.0002998628315438374, + "loss": 4.8461, + "step": 79920 + }, + { + "epoch": 0.11230914874147503, + "grad_norm": 0.7143675088882446, + "learning_rate": 0.00029986251680498826, + "loss": 4.6871, + "step": 79930 + }, + { + "epoch": 0.11232319967963861, + "grad_norm": 0.7315747737884521, + "learning_rate": 0.0002998622017056276, + "loss": 4.7463, + "step": 79940 + }, + { + "epoch": 0.11233725061780218, + "grad_norm": 0.7288627028465271, + "learning_rate": 0.00029986188624575604, + "loss": 4.7344, + "step": 79950 + }, + { + "epoch": 0.11235130155596576, + "grad_norm": 0.7390293478965759, + "learning_rate": 0.0002998615704253744, + "loss": 4.7487, + "step": 79960 + }, + { + 
"epoch": 0.11236535249412934, + "grad_norm": 0.7393094301223755, + "learning_rate": 0.00029986125424448344, + "loss": 4.6628, + "step": 79970 + }, + { + "epoch": 0.11237940343229293, + "grad_norm": 0.7108803391456604, + "learning_rate": 0.0002998609377030839, + "loss": 4.6948, + "step": 79980 + }, + { + "epoch": 0.1123934543704565, + "grad_norm": 0.7048096656799316, + "learning_rate": 0.00029986062080117657, + "loss": 4.7815, + "step": 79990 + }, + { + "epoch": 0.11240750530862008, + "grad_norm": 0.722067654132843, + "learning_rate": 0.00029986030353876226, + "loss": 4.6608, + "step": 80000 + }, + { + "epoch": 0.11242155624678365, + "grad_norm": 0.6960885524749756, + "learning_rate": 0.0002998599859158416, + "loss": 4.7304, + "step": 80010 + }, + { + "epoch": 0.11243560718494723, + "grad_norm": 0.7393103241920471, + "learning_rate": 0.0002998596679324155, + "loss": 4.6923, + "step": 80020 + }, + { + "epoch": 0.11244965812311081, + "grad_norm": 0.7443263530731201, + "learning_rate": 0.0002998593495884846, + "loss": 4.7715, + "step": 80030 + }, + { + "epoch": 0.11246370906127438, + "grad_norm": 0.7111883163452148, + "learning_rate": 0.00029985903088404974, + "loss": 4.6946, + "step": 80040 + }, + { + "epoch": 0.11247775999943796, + "grad_norm": 0.7264695763587952, + "learning_rate": 0.00029985871181911164, + "loss": 4.6478, + "step": 80050 + }, + { + "epoch": 0.11249181093760154, + "grad_norm": 0.7133517265319824, + "learning_rate": 0.0002998583923936711, + "loss": 4.7459, + "step": 80060 + }, + { + "epoch": 0.11250586187576511, + "grad_norm": 0.7024343609809875, + "learning_rate": 0.0002998580726077289, + "loss": 4.7401, + "step": 80070 + }, + { + "epoch": 0.1125199128139287, + "grad_norm": 0.7126244306564331, + "learning_rate": 0.0002998577524612858, + "loss": 4.8324, + "step": 80080 + }, + { + "epoch": 0.11253396375209228, + "grad_norm": 0.750248908996582, + "learning_rate": 0.0002998574319543426, + "loss": 4.6754, + "step": 80090 + }, + { + "epoch": 
0.11254801469025585, + "grad_norm": 0.6958588361740112, + "learning_rate": 0.0002998571110869, + "loss": 4.7199, + "step": 80100 + }, + { + "epoch": 0.11256206562841943, + "grad_norm": 0.7012098431587219, + "learning_rate": 0.0002998567898589588, + "loss": 4.5606, + "step": 80110 + }, + { + "epoch": 0.112576116566583, + "grad_norm": 0.739078938961029, + "learning_rate": 0.0002998564682705198, + "loss": 4.8491, + "step": 80120 + }, + { + "epoch": 0.11259016750474658, + "grad_norm": 0.745468258857727, + "learning_rate": 0.00029985614632158374, + "loss": 4.7074, + "step": 80130 + }, + { + "epoch": 0.11260421844291016, + "grad_norm": 0.7302658557891846, + "learning_rate": 0.0002998558240121514, + "loss": 4.8606, + "step": 80140 + }, + { + "epoch": 0.11261826938107374, + "grad_norm": 0.7549187541007996, + "learning_rate": 0.00029985550134222356, + "loss": 4.7794, + "step": 80150 + }, + { + "epoch": 0.11263232031923731, + "grad_norm": 0.7186284065246582, + "learning_rate": 0.000299855178311801, + "loss": 4.6571, + "step": 80160 + }, + { + "epoch": 0.11264637125740089, + "grad_norm": 0.7297990918159485, + "learning_rate": 0.0002998548549208845, + "loss": 4.691, + "step": 80170 + }, + { + "epoch": 0.11266042219556446, + "grad_norm": 0.6997066140174866, + "learning_rate": 0.00029985453116947484, + "loss": 4.7066, + "step": 80180 + }, + { + "epoch": 0.11267447313372805, + "grad_norm": 0.7195040583610535, + "learning_rate": 0.00029985420705757275, + "loss": 4.7252, + "step": 80190 + }, + { + "epoch": 0.11268852407189163, + "grad_norm": 0.699199914932251, + "learning_rate": 0.0002998538825851791, + "loss": 4.7883, + "step": 80200 + }, + { + "epoch": 0.1127025750100552, + "grad_norm": 0.6951486468315125, + "learning_rate": 0.00029985355775229464, + "loss": 4.8286, + "step": 80210 + }, + { + "epoch": 0.11271662594821878, + "grad_norm": 0.7475593686103821, + "learning_rate": 0.0002998532325589201, + "loss": 4.7873, + "step": 80220 + }, + { + "epoch": 0.11273067688638236, + 
"grad_norm": 0.7170931100845337, + "learning_rate": 0.00029985290700505627, + "loss": 4.6795, + "step": 80230 + }, + { + "epoch": 0.11274472782454593, + "grad_norm": 0.7242776155471802, + "learning_rate": 0.000299852581090704, + "loss": 4.7406, + "step": 80240 + }, + { + "epoch": 0.11275877876270951, + "grad_norm": 0.6967573165893555, + "learning_rate": 0.000299852254815864, + "loss": 4.8199, + "step": 80250 + }, + { + "epoch": 0.11277282970087309, + "grad_norm": 0.7147936820983887, + "learning_rate": 0.0002998519281805371, + "loss": 4.7201, + "step": 80260 + }, + { + "epoch": 0.11278688063903666, + "grad_norm": 0.6991884708404541, + "learning_rate": 0.0002998516011847241, + "loss": 4.6309, + "step": 80270 + }, + { + "epoch": 0.11280093157720024, + "grad_norm": 0.7079862356185913, + "learning_rate": 0.0002998512738284257, + "loss": 4.7821, + "step": 80280 + }, + { + "epoch": 0.11281498251536383, + "grad_norm": 0.7197107076644897, + "learning_rate": 0.00029985094611164286, + "loss": 4.8384, + "step": 80290 + }, + { + "epoch": 0.1128290334535274, + "grad_norm": 0.7062892317771912, + "learning_rate": 0.0002998506180343762, + "loss": 4.6175, + "step": 80300 + }, + { + "epoch": 0.11284308439169098, + "grad_norm": 0.7397512793540955, + "learning_rate": 0.0002998502895966265, + "loss": 4.7381, + "step": 80310 + }, + { + "epoch": 0.11285713532985456, + "grad_norm": 0.7265103459358215, + "learning_rate": 0.0002998499607983947, + "loss": 4.7411, + "step": 80320 + }, + { + "epoch": 0.11287118626801813, + "grad_norm": 0.7303242087364197, + "learning_rate": 0.00029984963163968146, + "loss": 4.6545, + "step": 80330 + }, + { + "epoch": 0.11288523720618171, + "grad_norm": 0.6990224719047546, + "learning_rate": 0.0002998493021204877, + "loss": 4.715, + "step": 80340 + }, + { + "epoch": 0.11289928814434529, + "grad_norm": 0.7091715931892395, + "learning_rate": 0.000299848972240814, + "loss": 4.7124, + "step": 80350 + }, + { + "epoch": 0.11291333908250886, + "grad_norm": 
0.718712329864502, + "learning_rate": 0.0002998486420006614, + "loss": 4.6979, + "step": 80360 + }, + { + "epoch": 0.11292739002067244, + "grad_norm": 0.7297937870025635, + "learning_rate": 0.00029984831140003056, + "loss": 4.7455, + "step": 80370 + }, + { + "epoch": 0.11294144095883601, + "grad_norm": 0.7224186062812805, + "learning_rate": 0.0002998479804389222, + "loss": 4.6676, + "step": 80380 + }, + { + "epoch": 0.1129554918969996, + "grad_norm": 0.72234046459198, + "learning_rate": 0.0002998476491173373, + "loss": 4.7358, + "step": 80390 + }, + { + "epoch": 0.11296954283516318, + "grad_norm": 0.7080051898956299, + "learning_rate": 0.0002998473174352766, + "loss": 4.6412, + "step": 80400 + }, + { + "epoch": 0.11298359377332676, + "grad_norm": 0.7055094242095947, + "learning_rate": 0.00029984698539274075, + "loss": 4.7382, + "step": 80410 + }, + { + "epoch": 0.11299764471149033, + "grad_norm": 0.7096608281135559, + "learning_rate": 0.00029984665298973074, + "loss": 4.6255, + "step": 80420 + }, + { + "epoch": 0.11301169564965391, + "grad_norm": 0.7352139949798584, + "learning_rate": 0.0002998463202262473, + "loss": 4.6637, + "step": 80430 + }, + { + "epoch": 0.11302574658781749, + "grad_norm": 0.692638635635376, + "learning_rate": 0.0002998459871022912, + "loss": 4.7391, + "step": 80440 + }, + { + "epoch": 0.11303979752598106, + "grad_norm": 0.7054898738861084, + "learning_rate": 0.00029984565361786326, + "loss": 4.7407, + "step": 80450 + }, + { + "epoch": 0.11305384846414464, + "grad_norm": 0.7301256656646729, + "learning_rate": 0.0002998453197729643, + "loss": 4.8302, + "step": 80460 + }, + { + "epoch": 0.11306789940230821, + "grad_norm": 0.7004954814910889, + "learning_rate": 0.00029984498556759516, + "loss": 4.7458, + "step": 80470 + }, + { + "epoch": 0.11308195034047179, + "grad_norm": 0.7302805781364441, + "learning_rate": 0.00029984465100175653, + "loss": 4.6252, + "step": 80480 + }, + { + "epoch": 0.11309600127863537, + "grad_norm": 0.7323416471481323, + 
"learning_rate": 0.0002998443160754493, + "loss": 4.7379, + "step": 80490 + }, + { + "epoch": 0.11311005221679896, + "grad_norm": 0.7046477198600769, + "learning_rate": 0.00029984398078867423, + "loss": 4.7576, + "step": 80500 + }, + { + "epoch": 0.11312410315496253, + "grad_norm": 0.719993531703949, + "learning_rate": 0.00029984364514143224, + "loss": 4.7161, + "step": 80510 + }, + { + "epoch": 0.11313815409312611, + "grad_norm": 0.7029352784156799, + "learning_rate": 0.000299843309133724, + "loss": 4.7524, + "step": 80520 + }, + { + "epoch": 0.11315220503128968, + "grad_norm": 0.7699235677719116, + "learning_rate": 0.00029984297276555033, + "loss": 4.7803, + "step": 80530 + }, + { + "epoch": 0.11316625596945326, + "grad_norm": 0.7614515423774719, + "learning_rate": 0.0002998426360369121, + "loss": 4.6891, + "step": 80540 + }, + { + "epoch": 0.11318030690761684, + "grad_norm": 0.7042279243469238, + "learning_rate": 0.00029984229894781006, + "loss": 4.7154, + "step": 80550 + }, + { + "epoch": 0.11319435784578041, + "grad_norm": 0.7094720602035522, + "learning_rate": 0.00029984196149824514, + "loss": 4.7491, + "step": 80560 + }, + { + "epoch": 0.11320840878394399, + "grad_norm": 0.722616970539093, + "learning_rate": 0.000299841623688218, + "loss": 4.6867, + "step": 80570 + }, + { + "epoch": 0.11322245972210757, + "grad_norm": 0.719673216342926, + "learning_rate": 0.0002998412855177296, + "loss": 4.6543, + "step": 80580 + }, + { + "epoch": 0.11323651066027114, + "grad_norm": 0.7150025963783264, + "learning_rate": 0.0002998409469867806, + "loss": 4.7076, + "step": 80590 + }, + { + "epoch": 0.11325056159843473, + "grad_norm": 0.7299203276634216, + "learning_rate": 0.0002998406080953719, + "loss": 4.7068, + "step": 80600 + }, + { + "epoch": 0.11326461253659831, + "grad_norm": 0.7360802292823792, + "learning_rate": 0.00029984026884350437, + "loss": 4.6402, + "step": 80610 + }, + { + "epoch": 0.11327866347476188, + "grad_norm": 0.7183154225349426, + "learning_rate": 
0.0002998399292311787, + "loss": 4.6114, + "step": 80620 + }, + { + "epoch": 0.11329271441292546, + "grad_norm": 0.6950372457504272, + "learning_rate": 0.0002998395892583958, + "loss": 4.7284, + "step": 80630 + }, + { + "epoch": 0.11330676535108904, + "grad_norm": 0.7128282785415649, + "learning_rate": 0.0002998392489251564, + "loss": 4.8155, + "step": 80640 + }, + { + "epoch": 0.11332081628925261, + "grad_norm": 0.7069471478462219, + "learning_rate": 0.00029983890823146147, + "loss": 4.7038, + "step": 80650 + }, + { + "epoch": 0.11333486722741619, + "grad_norm": 0.7087200880050659, + "learning_rate": 0.00029983856717731165, + "loss": 4.5934, + "step": 80660 + }, + { + "epoch": 0.11334891816557977, + "grad_norm": 0.7316790819168091, + "learning_rate": 0.0002998382257627079, + "loss": 4.6918, + "step": 80670 + }, + { + "epoch": 0.11336296910374334, + "grad_norm": 0.7532024383544922, + "learning_rate": 0.00029983788398765094, + "loss": 4.667, + "step": 80680 + }, + { + "epoch": 0.11337702004190692, + "grad_norm": 0.7689909338951111, + "learning_rate": 0.00029983754185214167, + "loss": 4.6237, + "step": 80690 + }, + { + "epoch": 0.11339107098007051, + "grad_norm": 0.7248286604881287, + "learning_rate": 0.0002998371993561809, + "loss": 4.7349, + "step": 80700 + }, + { + "epoch": 0.11340512191823408, + "grad_norm": 0.7164115309715271, + "learning_rate": 0.00029983685649976943, + "loss": 4.7656, + "step": 80710 + }, + { + "epoch": 0.11341917285639766, + "grad_norm": 0.6937947273254395, + "learning_rate": 0.0002998365132829081, + "loss": 4.7744, + "step": 80720 + }, + { + "epoch": 0.11343322379456124, + "grad_norm": 0.700322687625885, + "learning_rate": 0.00029983616970559766, + "loss": 4.7036, + "step": 80730 + }, + { + "epoch": 0.11344727473272481, + "grad_norm": 0.7567490935325623, + "learning_rate": 0.00029983582576783907, + "loss": 4.7032, + "step": 80740 + }, + { + "epoch": 0.11346132567088839, + "grad_norm": 0.7181823253631592, + "learning_rate": 
0.0002998354814696331, + "loss": 4.8266, + "step": 80750 + }, + { + "epoch": 0.11347537660905196, + "grad_norm": 0.7397735714912415, + "learning_rate": 0.0002998351368109805, + "loss": 4.6778, + "step": 80760 + }, + { + "epoch": 0.11348942754721554, + "grad_norm": 0.698970377445221, + "learning_rate": 0.0002998347917918822, + "loss": 4.6136, + "step": 80770 + }, + { + "epoch": 0.11350347848537912, + "grad_norm": 0.6962791085243225, + "learning_rate": 0.00029983444641233897, + "loss": 4.687, + "step": 80780 + }, + { + "epoch": 0.11351752942354269, + "grad_norm": 0.7418192028999329, + "learning_rate": 0.0002998341006723517, + "loss": 4.7033, + "step": 80790 + }, + { + "epoch": 0.11353158036170627, + "grad_norm": 0.7161971926689148, + "learning_rate": 0.00029983375457192117, + "loss": 4.8678, + "step": 80800 + }, + { + "epoch": 0.11354563129986986, + "grad_norm": 0.711134672164917, + "learning_rate": 0.0002998334081110483, + "loss": 4.7424, + "step": 80810 + }, + { + "epoch": 0.11355968223803344, + "grad_norm": 0.7014956474304199, + "learning_rate": 0.00029983306128973376, + "loss": 4.7054, + "step": 80820 + }, + { + "epoch": 0.11357373317619701, + "grad_norm": 0.7056500911712646, + "learning_rate": 0.00029983271410797857, + "loss": 4.7097, + "step": 80830 + }, + { + "epoch": 0.11358778411436059, + "grad_norm": 0.7953014373779297, + "learning_rate": 0.0002998323665657834, + "loss": 4.7563, + "step": 80840 + }, + { + "epoch": 0.11360183505252416, + "grad_norm": 0.7236987948417664, + "learning_rate": 0.00029983201866314924, + "loss": 4.6917, + "step": 80850 + }, + { + "epoch": 0.11361588599068774, + "grad_norm": 0.7303849458694458, + "learning_rate": 0.0002998316704000768, + "loss": 4.6084, + "step": 80860 + }, + { + "epoch": 0.11362993692885132, + "grad_norm": 0.6903831958770752, + "learning_rate": 0.00029983132177656695, + "loss": 4.768, + "step": 80870 + }, + { + "epoch": 0.11364398786701489, + "grad_norm": 0.744113564491272, + "learning_rate": 0.0002998309727926206, 
+ "loss": 4.6886, + "step": 80880 + }, + { + "epoch": 0.11365803880517847, + "grad_norm": 0.7207052707672119, + "learning_rate": 0.0002998306234482385, + "loss": 4.7185, + "step": 80890 + }, + { + "epoch": 0.11367208974334204, + "grad_norm": 0.7463589906692505, + "learning_rate": 0.00029983027374342155, + "loss": 4.7078, + "step": 80900 + }, + { + "epoch": 0.11368614068150563, + "grad_norm": 1.0752589702606201, + "learning_rate": 0.0002998299236781705, + "loss": 4.6764, + "step": 80910 + }, + { + "epoch": 0.11370019161966921, + "grad_norm": 0.7272537350654602, + "learning_rate": 0.0002998295732524863, + "loss": 4.8209, + "step": 80920 + }, + { + "epoch": 0.11371424255783279, + "grad_norm": 0.7111218571662903, + "learning_rate": 0.00029982922246636975, + "loss": 4.71, + "step": 80930 + }, + { + "epoch": 0.11372829349599636, + "grad_norm": 0.7262582182884216, + "learning_rate": 0.0002998288713198217, + "loss": 4.7863, + "step": 80940 + }, + { + "epoch": 0.11374234443415994, + "grad_norm": 0.7486706972122192, + "learning_rate": 0.00029982851981284303, + "loss": 4.7294, + "step": 80950 + }, + { + "epoch": 0.11375639537232352, + "grad_norm": 0.764664351940155, + "learning_rate": 0.00029982816794543445, + "loss": 4.714, + "step": 80960 + }, + { + "epoch": 0.11377044631048709, + "grad_norm": 0.701084554195404, + "learning_rate": 0.000299827815717597, + "loss": 4.8014, + "step": 80970 + }, + { + "epoch": 0.11378449724865067, + "grad_norm": 1.806844711303711, + "learning_rate": 0.0002998274631293314, + "loss": 4.7518, + "step": 80980 + }, + { + "epoch": 0.11379854818681424, + "grad_norm": 0.7623975276947021, + "learning_rate": 0.0002998271101806385, + "loss": 4.8233, + "step": 80990 + }, + { + "epoch": 0.11381259912497782, + "grad_norm": 0.7140116095542908, + "learning_rate": 0.0002998267568715192, + "loss": 4.7421, + "step": 81000 + }, + { + "epoch": 0.11382665006314141, + "grad_norm": 0.7124093174934387, + "learning_rate": 0.0002998264032019743, + "loss": 4.7544, + 
"step": 81010 + }, + { + "epoch": 0.11384070100130499, + "grad_norm": 0.7354893088340759, + "learning_rate": 0.0002998260491720047, + "loss": 4.8484, + "step": 81020 + }, + { + "epoch": 0.11385475193946856, + "grad_norm": 0.7834903001785278, + "learning_rate": 0.0002998256947816112, + "loss": 4.7304, + "step": 81030 + }, + { + "epoch": 0.11386880287763214, + "grad_norm": 0.7008675336837769, + "learning_rate": 0.00029982534003079474, + "loss": 4.7707, + "step": 81040 + }, + { + "epoch": 0.11388285381579571, + "grad_norm": 0.7236028909683228, + "learning_rate": 0.0002998249849195561, + "loss": 4.7268, + "step": 81050 + }, + { + "epoch": 0.11389690475395929, + "grad_norm": 0.7143322825431824, + "learning_rate": 0.0002998246294478961, + "loss": 4.7511, + "step": 81060 + }, + { + "epoch": 0.11391095569212287, + "grad_norm": 0.7715073227882385, + "learning_rate": 0.0002998242736158157, + "loss": 4.7602, + "step": 81070 + }, + { + "epoch": 0.11392500663028644, + "grad_norm": 0.7184918522834778, + "learning_rate": 0.00029982391742331565, + "loss": 4.7352, + "step": 81080 + }, + { + "epoch": 0.11393905756845002, + "grad_norm": 0.6928876638412476, + "learning_rate": 0.0002998235608703969, + "loss": 4.5818, + "step": 81090 + }, + { + "epoch": 0.1139531085066136, + "grad_norm": 0.7220043540000916, + "learning_rate": 0.00029982320395706023, + "loss": 4.7761, + "step": 81100 + }, + { + "epoch": 0.11396715944477717, + "grad_norm": 0.6904178261756897, + "learning_rate": 0.00029982284668330657, + "loss": 4.8029, + "step": 81110 + }, + { + "epoch": 0.11398121038294076, + "grad_norm": 0.7251768708229065, + "learning_rate": 0.00029982248904913673, + "loss": 4.7282, + "step": 81120 + }, + { + "epoch": 0.11399526132110434, + "grad_norm": 0.6947658658027649, + "learning_rate": 0.00029982213105455153, + "loss": 4.7695, + "step": 81130 + }, + { + "epoch": 0.11400931225926791, + "grad_norm": 0.7100722193717957, + "learning_rate": 0.000299821772699552, + "loss": 4.8002, + "step": 81140 + }, 
+ { + "epoch": 0.11402336319743149, + "grad_norm": 0.7027919292449951, + "learning_rate": 0.0002998214139841388, + "loss": 4.598, + "step": 81150 + }, + { + "epoch": 0.11403741413559507, + "grad_norm": 0.693475604057312, + "learning_rate": 0.0002998210549083129, + "loss": 4.7001, + "step": 81160 + }, + { + "epoch": 0.11405146507375864, + "grad_norm": 0.720072329044342, + "learning_rate": 0.0002998206954720751, + "loss": 4.7215, + "step": 81170 + }, + { + "epoch": 0.11406551601192222, + "grad_norm": 0.6978077292442322, + "learning_rate": 0.0002998203356754264, + "loss": 4.7458, + "step": 81180 + }, + { + "epoch": 0.1140795669500858, + "grad_norm": 0.721059262752533, + "learning_rate": 0.0002998199755183675, + "loss": 4.6789, + "step": 81190 + }, + { + "epoch": 0.11409361788824937, + "grad_norm": 0.7521209716796875, + "learning_rate": 0.0002998196150008994, + "loss": 4.5958, + "step": 81200 + }, + { + "epoch": 0.11410766882641295, + "grad_norm": 0.7243664860725403, + "learning_rate": 0.00029981925412302286, + "loss": 4.7493, + "step": 81210 + }, + { + "epoch": 0.11412171976457654, + "grad_norm": 0.6969353556632996, + "learning_rate": 0.0002998189290247855, + "loss": 4.8096, + "step": 81220 + }, + { + "epoch": 0.11413577070274011, + "grad_norm": 0.7307901978492737, + "learning_rate": 0.0002998185674621355, + "loss": 4.582, + "step": 81230 + }, + { + "epoch": 0.11414982164090369, + "grad_norm": 0.7377418875694275, + "learning_rate": 0.00029981820553907944, + "loss": 4.7947, + "step": 81240 + }, + { + "epoch": 0.11416387257906727, + "grad_norm": 0.7029874324798584, + "learning_rate": 0.00029981784325561856, + "loss": 4.6826, + "step": 81250 + }, + { + "epoch": 0.11417792351723084, + "grad_norm": 0.7139830589294434, + "learning_rate": 0.00029981748061175343, + "loss": 4.7922, + "step": 81260 + }, + { + "epoch": 0.11419197445539442, + "grad_norm": 0.7318224906921387, + "learning_rate": 0.00029981711760748513, + "loss": 4.6779, + "step": 81270 + }, + { + "epoch": 
0.114206025393558, + "grad_norm": 0.7297080159187317, + "learning_rate": 0.00029981675424281437, + "loss": 4.7369, + "step": 81280 + }, + { + "epoch": 0.11422007633172157, + "grad_norm": 0.7171475887298584, + "learning_rate": 0.0002998163905177421, + "loss": 4.8174, + "step": 81290 + }, + { + "epoch": 0.11423412726988515, + "grad_norm": 0.710830569267273, + "learning_rate": 0.0002998160264322693, + "loss": 4.7841, + "step": 81300 + }, + { + "epoch": 0.11424817820804872, + "grad_norm": 0.7462754845619202, + "learning_rate": 0.0002998156619863966, + "loss": 4.7658, + "step": 81310 + }, + { + "epoch": 0.11426222914621231, + "grad_norm": 0.7007805705070496, + "learning_rate": 0.0002998152971801251, + "loss": 4.8825, + "step": 81320 + }, + { + "epoch": 0.11427628008437589, + "grad_norm": 0.6981909275054932, + "learning_rate": 0.0002998149320134556, + "loss": 4.7546, + "step": 81330 + }, + { + "epoch": 0.11429033102253947, + "grad_norm": 0.7107502818107605, + "learning_rate": 0.0002998145664863889, + "loss": 4.7203, + "step": 81340 + }, + { + "epoch": 0.11430438196070304, + "grad_norm": 0.7196722030639648, + "learning_rate": 0.000299814200598926, + "loss": 4.7632, + "step": 81350 + }, + { + "epoch": 0.11431843289886662, + "grad_norm": 0.7067022919654846, + "learning_rate": 0.00029981383435106774, + "loss": 4.7363, + "step": 81360 + }, + { + "epoch": 0.1143324838370302, + "grad_norm": 0.7224134802818298, + "learning_rate": 0.000299813467742815, + "loss": 4.6321, + "step": 81370 + }, + { + "epoch": 0.11434653477519377, + "grad_norm": 0.722435712814331, + "learning_rate": 0.00029981310077416864, + "loss": 4.7949, + "step": 81380 + }, + { + "epoch": 0.11436058571335735, + "grad_norm": 0.7146986722946167, + "learning_rate": 0.00029981273344512955, + "loss": 4.7668, + "step": 81390 + }, + { + "epoch": 0.11437463665152092, + "grad_norm": 0.7286592125892639, + "learning_rate": 0.0002998123657556986, + "loss": 4.7215, + "step": 81400 + }, + { + "epoch": 0.1143886875896845, + 
"grad_norm": 0.7215551137924194, + "learning_rate": 0.00029981199770587677, + "loss": 4.7592, + "step": 81410 + }, + { + "epoch": 0.11440273852784807, + "grad_norm": 0.737603485584259, + "learning_rate": 0.0002998116292956648, + "loss": 4.651, + "step": 81420 + }, + { + "epoch": 0.11441678946601166, + "grad_norm": 0.7101167440414429, + "learning_rate": 0.00029981126052506374, + "loss": 4.8422, + "step": 81430 + }, + { + "epoch": 0.11443084040417524, + "grad_norm": 0.7299492955207825, + "learning_rate": 0.0002998108913940743, + "loss": 4.885, + "step": 81440 + }, + { + "epoch": 0.11444489134233882, + "grad_norm": 0.6973890662193298, + "learning_rate": 0.00029981052190269745, + "loss": 4.8612, + "step": 81450 + }, + { + "epoch": 0.1144589422805024, + "grad_norm": 0.7072581648826599, + "learning_rate": 0.0002998101520509341, + "loss": 4.683, + "step": 81460 + }, + { + "epoch": 0.11447299321866597, + "grad_norm": 0.7035022974014282, + "learning_rate": 0.0002998097818387851, + "loss": 4.6592, + "step": 81470 + }, + { + "epoch": 0.11448704415682955, + "grad_norm": 0.7320752143859863, + "learning_rate": 0.0002998094112662514, + "loss": 4.6978, + "step": 81480 + }, + { + "epoch": 0.11450109509499312, + "grad_norm": 0.6897929906845093, + "learning_rate": 0.00029980904033333384, + "loss": 4.9475, + "step": 81490 + }, + { + "epoch": 0.1145151460331567, + "grad_norm": 0.6888123750686646, + "learning_rate": 0.0002998086690400334, + "loss": 4.6484, + "step": 81500 + }, + { + "epoch": 0.11452919697132027, + "grad_norm": 0.7391230463981628, + "learning_rate": 0.0002998082973863508, + "loss": 4.7175, + "step": 81510 + }, + { + "epoch": 0.11454324790948385, + "grad_norm": 0.7109723091125488, + "learning_rate": 0.000299807925372287, + "loss": 4.7042, + "step": 81520 + }, + { + "epoch": 0.11455729884764744, + "grad_norm": 0.6957715749740601, + "learning_rate": 0.000299807552997843, + "loss": 4.7355, + "step": 81530 + }, + { + "epoch": 0.11457134978581102, + "grad_norm": 
0.7242121696472168, + "learning_rate": 0.00029980718026301965, + "loss": 4.8314, + "step": 81540 + }, + { + "epoch": 0.11458540072397459, + "grad_norm": 0.7289703488349915, + "learning_rate": 0.0002998068071678178, + "loss": 4.709, + "step": 81550 + }, + { + "epoch": 0.11459945166213817, + "grad_norm": 0.7176446914672852, + "learning_rate": 0.00029980643371223833, + "loss": 4.7353, + "step": 81560 + }, + { + "epoch": 0.11461350260030174, + "grad_norm": 0.7373208403587341, + "learning_rate": 0.0002998060598962822, + "loss": 4.754, + "step": 81570 + }, + { + "epoch": 0.11462755353846532, + "grad_norm": 0.7444453835487366, + "learning_rate": 0.00029980568571995027, + "loss": 4.6487, + "step": 81580 + }, + { + "epoch": 0.1146416044766289, + "grad_norm": 0.7543371915817261, + "learning_rate": 0.00029980531118324345, + "loss": 4.8064, + "step": 81590 + }, + { + "epoch": 0.11465565541479247, + "grad_norm": 2.069936513900757, + "learning_rate": 0.0002998049362861627, + "loss": 4.6117, + "step": 81600 + }, + { + "epoch": 0.11466970635295605, + "grad_norm": 0.7130931615829468, + "learning_rate": 0.0002998045610287088, + "loss": 4.7338, + "step": 81610 + }, + { + "epoch": 0.11468375729111963, + "grad_norm": 0.7223381400108337, + "learning_rate": 0.0002998041854108828, + "loss": 4.8274, + "step": 81620 + }, + { + "epoch": 0.11469780822928322, + "grad_norm": 0.7344392538070679, + "learning_rate": 0.0002998038094326855, + "loss": 4.7107, + "step": 81630 + }, + { + "epoch": 0.11471185916744679, + "grad_norm": 0.735009491443634, + "learning_rate": 0.00029980343309411777, + "loss": 4.7712, + "step": 81640 + }, + { + "epoch": 0.11472591010561037, + "grad_norm": 0.7493471503257751, + "learning_rate": 0.0002998030563951806, + "loss": 4.692, + "step": 81650 + }, + { + "epoch": 0.11473996104377394, + "grad_norm": 0.7441632747650146, + "learning_rate": 0.0002998026793358749, + "loss": 4.7691, + "step": 81660 + }, + { + "epoch": 0.11475401198193752, + "grad_norm": 0.7013207674026489, + 
"learning_rate": 0.00029980230191620157, + "loss": 4.6536, + "step": 81670 + }, + { + "epoch": 0.1147680629201011, + "grad_norm": 0.697334349155426, + "learning_rate": 0.00029980192413616146, + "loss": 4.6575, + "step": 81680 + }, + { + "epoch": 0.11478211385826467, + "grad_norm": 0.6946876645088196, + "learning_rate": 0.0002998015459957555, + "loss": 4.7414, + "step": 81690 + }, + { + "epoch": 0.11479616479642825, + "grad_norm": 0.7325066924095154, + "learning_rate": 0.0002998011674949846, + "loss": 4.7229, + "step": 81700 + }, + { + "epoch": 0.11481021573459183, + "grad_norm": 0.6877426505088806, + "learning_rate": 0.00029980078863384976, + "loss": 4.7216, + "step": 81710 + }, + { + "epoch": 0.1148242666727554, + "grad_norm": 0.7010763883590698, + "learning_rate": 0.00029980040941235175, + "loss": 4.77, + "step": 81720 + }, + { + "epoch": 0.11483831761091898, + "grad_norm": 0.7128811478614807, + "learning_rate": 0.00029980002983049156, + "loss": 4.555, + "step": 81730 + }, + { + "epoch": 0.11485236854908257, + "grad_norm": 0.6962786316871643, + "learning_rate": 0.0002997996498882701, + "loss": 4.699, + "step": 81740 + }, + { + "epoch": 0.11486641948724614, + "grad_norm": 0.7425155639648438, + "learning_rate": 0.0002997992695856883, + "loss": 4.7556, + "step": 81750 + }, + { + "epoch": 0.11488047042540972, + "grad_norm": 0.7460845112800598, + "learning_rate": 0.000299798888922747, + "loss": 4.7343, + "step": 81760 + }, + { + "epoch": 0.1148945213635733, + "grad_norm": 0.7301716804504395, + "learning_rate": 0.0002997985078994472, + "loss": 4.8511, + "step": 81770 + }, + { + "epoch": 0.11490857230173687, + "grad_norm": 0.7091270089149475, + "learning_rate": 0.00029979812651578975, + "loss": 4.6521, + "step": 81780 + }, + { + "epoch": 0.11492262323990045, + "grad_norm": 0.7062439918518066, + "learning_rate": 0.00029979774477177564, + "loss": 4.7847, + "step": 81790 + }, + { + "epoch": 0.11493667417806402, + "grad_norm": 0.6802224516868591, + "learning_rate": 
0.00029979736266740574, + "loss": 4.7821, + "step": 81800 + }, + { + "epoch": 0.1149507251162276, + "grad_norm": 0.7181935906410217, + "learning_rate": 0.00029979698020268097, + "loss": 4.7611, + "step": 81810 + }, + { + "epoch": 0.11496477605439118, + "grad_norm": 0.6938358545303345, + "learning_rate": 0.0002997965973776023, + "loss": 4.7246, + "step": 81820 + }, + { + "epoch": 0.11497882699255475, + "grad_norm": 0.6967886686325073, + "learning_rate": 0.0002997962141921706, + "loss": 4.7603, + "step": 81830 + }, + { + "epoch": 0.11499287793071834, + "grad_norm": 0.7130746841430664, + "learning_rate": 0.0002997958306463867, + "loss": 4.7692, + "step": 81840 + }, + { + "epoch": 0.11500692886888192, + "grad_norm": 0.7128005027770996, + "learning_rate": 0.0002997954467402517, + "loss": 4.7132, + "step": 81850 + }, + { + "epoch": 0.1150209798070455, + "grad_norm": 0.7023301124572754, + "learning_rate": 0.00029979506247376645, + "loss": 4.7823, + "step": 81860 + }, + { + "epoch": 0.11503503074520907, + "grad_norm": 0.7159565091133118, + "learning_rate": 0.0002997946778469319, + "loss": 4.7408, + "step": 81870 + }, + { + "epoch": 0.11504908168337265, + "grad_norm": 0.6996265649795532, + "learning_rate": 0.00029979429285974886, + "loss": 4.681, + "step": 81880 + }, + { + "epoch": 0.11506313262153622, + "grad_norm": 0.6762425899505615, + "learning_rate": 0.0002997939075122184, + "loss": 4.6677, + "step": 81890 + }, + { + "epoch": 0.1150771835596998, + "grad_norm": 0.6953459978103638, + "learning_rate": 0.0002997935218043414, + "loss": 4.7948, + "step": 81900 + }, + { + "epoch": 0.11509123449786338, + "grad_norm": 0.7166847586631775, + "learning_rate": 0.00029979313573611876, + "loss": 4.7171, + "step": 81910 + }, + { + "epoch": 0.11510528543602695, + "grad_norm": 0.7089852094650269, + "learning_rate": 0.0002997927493075514, + "loss": 4.6225, + "step": 81920 + }, + { + "epoch": 0.11511933637419053, + "grad_norm": 0.7163013815879822, + "learning_rate": 
0.00029979236251864033, + "loss": 4.8264, + "step": 81930 + }, + { + "epoch": 0.11513338731235412, + "grad_norm": 0.7131022810935974, + "learning_rate": 0.0002997919753693864, + "loss": 4.7408, + "step": 81940 + }, + { + "epoch": 0.1151474382505177, + "grad_norm": 0.7089284658432007, + "learning_rate": 0.0002997915878597905, + "loss": 4.7225, + "step": 81950 + }, + { + "epoch": 0.11516148918868127, + "grad_norm": 0.770504355430603, + "learning_rate": 0.00029979119998985377, + "loss": 4.5838, + "step": 81960 + }, + { + "epoch": 0.11517554012684485, + "grad_norm": 0.7346895933151245, + "learning_rate": 0.0002997908117595769, + "loss": 4.6957, + "step": 81970 + }, + { + "epoch": 0.11518959106500842, + "grad_norm": 0.7275788187980652, + "learning_rate": 0.00029979042316896095, + "loss": 4.7569, + "step": 81980 + }, + { + "epoch": 0.115203642003172, + "grad_norm": 0.7080996036529541, + "learning_rate": 0.0002997900342180068, + "loss": 4.6806, + "step": 81990 + }, + { + "epoch": 0.11521769294133558, + "grad_norm": 0.6918423175811768, + "learning_rate": 0.0002997896449067155, + "loss": 4.6623, + "step": 82000 + }, + { + "epoch": 0.11523174387949915, + "grad_norm": 0.7253255844116211, + "learning_rate": 0.00029978925523508785, + "loss": 4.7897, + "step": 82010 + }, + { + "epoch": 0.11524579481766273, + "grad_norm": 0.750601053237915, + "learning_rate": 0.00029978886520312485, + "loss": 4.7144, + "step": 82020 + }, + { + "epoch": 0.1152598457558263, + "grad_norm": 0.7183839678764343, + "learning_rate": 0.0002997884748108274, + "loss": 4.8058, + "step": 82030 + }, + { + "epoch": 0.11527389669398988, + "grad_norm": 0.7069101333618164, + "learning_rate": 0.00029978808405819654, + "loss": 4.6957, + "step": 82040 + }, + { + "epoch": 0.11528794763215347, + "grad_norm": 0.7078718543052673, + "learning_rate": 0.000299787692945233, + "loss": 4.6645, + "step": 82050 + }, + { + "epoch": 0.11530199857031705, + "grad_norm": 0.7066739797592163, + "learning_rate": 0.000299787301471938, + 
"loss": 4.7598, + "step": 82060 + }, + { + "epoch": 0.11531604950848062, + "grad_norm": 0.7062867283821106, + "learning_rate": 0.0002997869096383123, + "loss": 4.6227, + "step": 82070 + }, + { + "epoch": 0.1153301004466442, + "grad_norm": 0.686747670173645, + "learning_rate": 0.00029978651744435686, + "loss": 4.6724, + "step": 82080 + }, + { + "epoch": 0.11534415138480777, + "grad_norm": 0.729067862033844, + "learning_rate": 0.0002997861248900727, + "loss": 4.7635, + "step": 82090 + }, + { + "epoch": 0.11535820232297135, + "grad_norm": 0.7246768474578857, + "learning_rate": 0.00029978573197546066, + "loss": 4.662, + "step": 82100 + }, + { + "epoch": 0.11537225326113493, + "grad_norm": 0.743782103061676, + "learning_rate": 0.0002997853387005218, + "loss": 4.8008, + "step": 82110 + }, + { + "epoch": 0.1153863041992985, + "grad_norm": 0.6988118290901184, + "learning_rate": 0.00029978494506525694, + "loss": 4.8016, + "step": 82120 + }, + { + "epoch": 0.11540035513746208, + "grad_norm": 0.7115128040313721, + "learning_rate": 0.0002997845510696671, + "loss": 4.6794, + "step": 82130 + }, + { + "epoch": 0.11541440607562566, + "grad_norm": 0.7527329921722412, + "learning_rate": 0.0002997841567137532, + "loss": 4.7338, + "step": 82140 + }, + { + "epoch": 0.11542845701378925, + "grad_norm": 0.738523006439209, + "learning_rate": 0.0002997837619975162, + "loss": 4.6762, + "step": 82150 + }, + { + "epoch": 0.11544250795195282, + "grad_norm": 0.7189664244651794, + "learning_rate": 0.00029978336692095713, + "loss": 4.5602, + "step": 82160 + }, + { + "epoch": 0.1154565588901164, + "grad_norm": 0.7112995386123657, + "learning_rate": 0.00029978297148407684, + "loss": 4.7785, + "step": 82170 + }, + { + "epoch": 0.11547060982827997, + "grad_norm": 0.6832894682884216, + "learning_rate": 0.00029978257568687627, + "loss": 4.6804, + "step": 82180 + }, + { + "epoch": 0.11548466076644355, + "grad_norm": 0.6907115578651428, + "learning_rate": 0.00029978217952935645, + "loss": 4.6355, + 
"step": 82190 + }, + { + "epoch": 0.11549871170460713, + "grad_norm": 0.6926548480987549, + "learning_rate": 0.00029978178301151825, + "loss": 4.6094, + "step": 82200 + }, + { + "epoch": 0.1155127626427707, + "grad_norm": 0.7217580676078796, + "learning_rate": 0.0002997813861333627, + "loss": 4.6527, + "step": 82210 + }, + { + "epoch": 0.11552681358093428, + "grad_norm": 0.7224411368370056, + "learning_rate": 0.0002997809888948907, + "loss": 4.7685, + "step": 82220 + }, + { + "epoch": 0.11554086451909786, + "grad_norm": 0.7286561131477356, + "learning_rate": 0.00029978059129610326, + "loss": 4.5463, + "step": 82230 + }, + { + "epoch": 0.11555491545726143, + "grad_norm": 0.7249482870101929, + "learning_rate": 0.0002997801933370013, + "loss": 4.8454, + "step": 82240 + }, + { + "epoch": 0.11556896639542502, + "grad_norm": 0.7178618907928467, + "learning_rate": 0.00029977979501758574, + "loss": 4.7182, + "step": 82250 + }, + { + "epoch": 0.1155830173335886, + "grad_norm": 0.7384633421897888, + "learning_rate": 0.00029977939633785757, + "loss": 4.7321, + "step": 82260 + }, + { + "epoch": 0.11559706827175217, + "grad_norm": 0.7196593284606934, + "learning_rate": 0.0002997789972978178, + "loss": 4.7556, + "step": 82270 + }, + { + "epoch": 0.11561111920991575, + "grad_norm": 0.7066304087638855, + "learning_rate": 0.0002997785978974674, + "loss": 4.7843, + "step": 82280 + }, + { + "epoch": 0.11562517014807933, + "grad_norm": 0.736116886138916, + "learning_rate": 0.00029977819813680715, + "loss": 4.8727, + "step": 82290 + }, + { + "epoch": 0.1156392210862429, + "grad_norm": 0.708854079246521, + "learning_rate": 0.00029977779801583824, + "loss": 4.7171, + "step": 82300 + }, + { + "epoch": 0.11565327202440648, + "grad_norm": 0.7159163355827332, + "learning_rate": 0.00029977739753456146, + "loss": 4.7331, + "step": 82310 + }, + { + "epoch": 0.11566732296257005, + "grad_norm": 0.7191676497459412, + "learning_rate": 0.0002997769966929779, + "loss": 4.7406, + "step": 82320 + }, + 
{ + "epoch": 0.11568137390073363, + "grad_norm": 0.7123490571975708, + "learning_rate": 0.0002997765954910885, + "loss": 4.7577, + "step": 82330 + }, + { + "epoch": 0.1156954248388972, + "grad_norm": 0.8625441193580627, + "learning_rate": 0.0002997761939288941, + "loss": 4.661, + "step": 82340 + }, + { + "epoch": 0.11570947577706078, + "grad_norm": 0.6980680227279663, + "learning_rate": 0.0002997757920063958, + "loss": 4.7982, + "step": 82350 + }, + { + "epoch": 0.11572352671522437, + "grad_norm": 0.7026267647743225, + "learning_rate": 0.00029977538972359455, + "loss": 4.6444, + "step": 82360 + }, + { + "epoch": 0.11573757765338795, + "grad_norm": 0.7010255455970764, + "learning_rate": 0.0002997749870804913, + "loss": 4.7762, + "step": 82370 + }, + { + "epoch": 0.11575162859155153, + "grad_norm": 0.7031188011169434, + "learning_rate": 0.000299774584077087, + "loss": 4.7665, + "step": 82380 + }, + { + "epoch": 0.1157656795297151, + "grad_norm": 0.6966001391410828, + "learning_rate": 0.0002997741807133827, + "loss": 4.6379, + "step": 82390 + }, + { + "epoch": 0.11577973046787868, + "grad_norm": 0.7564621567726135, + "learning_rate": 0.00029977381737799296, + "loss": 4.7131, + "step": 82400 + }, + { + "epoch": 0.11579378140604225, + "grad_norm": 0.7084454298019409, + "learning_rate": 0.0002997734133297211, + "loss": 4.6699, + "step": 82410 + }, + { + "epoch": 0.11580783234420583, + "grad_norm": 0.6875171661376953, + "learning_rate": 0.0002997730089211521, + "loss": 4.7788, + "step": 82420 + }, + { + "epoch": 0.1158218832823694, + "grad_norm": 0.706990122795105, + "learning_rate": 0.00029977260415228675, + "loss": 4.6794, + "step": 82430 + }, + { + "epoch": 0.11583593422053298, + "grad_norm": 0.6801334023475647, + "learning_rate": 0.0002997721990231261, + "loss": 4.6598, + "step": 82440 + }, + { + "epoch": 0.11584998515869656, + "grad_norm": 0.7689122557640076, + "learning_rate": 0.0002997717935336711, + "loss": 4.5744, + "step": 82450 + }, + { + "epoch": 
0.11586403609686015, + "grad_norm": 0.7127279043197632, + "learning_rate": 0.00029977138768392283, + "loss": 4.6187, + "step": 82460 + }, + { + "epoch": 0.11587808703502372, + "grad_norm": 0.7756818532943726, + "learning_rate": 0.0002997709814738821, + "loss": 4.7612, + "step": 82470 + }, + { + "epoch": 0.1158921379731873, + "grad_norm": 0.7288581132888794, + "learning_rate": 0.00029977057490355004, + "loss": 4.7149, + "step": 82480 + }, + { + "epoch": 0.11590618891135088, + "grad_norm": 0.7076622843742371, + "learning_rate": 0.0002997701679729275, + "loss": 4.6853, + "step": 82490 + }, + { + "epoch": 0.11592023984951445, + "grad_norm": 0.7319123148918152, + "learning_rate": 0.00029976976068201556, + "loss": 4.5778, + "step": 82500 + }, + { + "epoch": 0.11593429078767803, + "grad_norm": 0.719047486782074, + "learning_rate": 0.0002997693530308151, + "loss": 4.6536, + "step": 82510 + }, + { + "epoch": 0.1159483417258416, + "grad_norm": 0.7204477190971375, + "learning_rate": 0.00029976894501932723, + "loss": 4.7233, + "step": 82520 + }, + { + "epoch": 0.11596239266400518, + "grad_norm": 0.7214565277099609, + "learning_rate": 0.0002997685366475528, + "loss": 4.7507, + "step": 82530 + }, + { + "epoch": 0.11597644360216876, + "grad_norm": 0.7362378239631653, + "learning_rate": 0.0002997681279154929, + "loss": 4.8398, + "step": 82540 + }, + { + "epoch": 0.11599049454033233, + "grad_norm": 0.7328546643257141, + "learning_rate": 0.0002997677188231484, + "loss": 4.7098, + "step": 82550 + }, + { + "epoch": 0.11600454547849592, + "grad_norm": 0.7075091004371643, + "learning_rate": 0.0002997673093705204, + "loss": 4.7569, + "step": 82560 + }, + { + "epoch": 0.1160185964166595, + "grad_norm": 0.7006434202194214, + "learning_rate": 0.00029976689955760985, + "loss": 4.725, + "step": 82570 + }, + { + "epoch": 0.11603264735482308, + "grad_norm": 0.70038241147995, + "learning_rate": 0.0002997664893844177, + "loss": 4.6548, + "step": 82580 + }, + { + "epoch": 0.11604669829298665, + 
"grad_norm": 0.7156614065170288, + "learning_rate": 0.00029976607885094495, + "loss": 4.7975, + "step": 82590 + }, + { + "epoch": 0.11606074923115023, + "grad_norm": 0.7034133076667786, + "learning_rate": 0.0002997656679571926, + "loss": 4.7715, + "step": 82600 + }, + { + "epoch": 0.1160748001693138, + "grad_norm": 0.7613126635551453, + "learning_rate": 0.0002997652567031616, + "loss": 4.8339, + "step": 82610 + }, + { + "epoch": 0.11608885110747738, + "grad_norm": 0.7240241765975952, + "learning_rate": 0.000299764845088853, + "loss": 4.6923, + "step": 82620 + }, + { + "epoch": 0.11610290204564096, + "grad_norm": 0.6997239589691162, + "learning_rate": 0.00029976443311426776, + "loss": 4.7306, + "step": 82630 + }, + { + "epoch": 0.11611695298380453, + "grad_norm": 0.7083427309989929, + "learning_rate": 0.0002997640207794069, + "loss": 4.6962, + "step": 82640 + }, + { + "epoch": 0.11613100392196811, + "grad_norm": 0.7667652368545532, + "learning_rate": 0.00029976360808427134, + "loss": 4.6436, + "step": 82650 + }, + { + "epoch": 0.11614505486013169, + "grad_norm": 0.6891388297080994, + "learning_rate": 0.0002997631950288622, + "loss": 4.6205, + "step": 82660 + }, + { + "epoch": 0.11615910579829528, + "grad_norm": 0.7161304354667664, + "learning_rate": 0.00029976278161318034, + "loss": 4.7486, + "step": 82670 + }, + { + "epoch": 0.11617315673645885, + "grad_norm": 0.715480625629425, + "learning_rate": 0.00029976236783722674, + "loss": 4.6952, + "step": 82680 + }, + { + "epoch": 0.11618720767462243, + "grad_norm": 0.7926939725875854, + "learning_rate": 0.00029976195370100256, + "loss": 4.7267, + "step": 82690 + }, + { + "epoch": 0.116201258612786, + "grad_norm": 0.7394651770591736, + "learning_rate": 0.00029976153920450863, + "loss": 4.6537, + "step": 82700 + }, + { + "epoch": 0.11621530955094958, + "grad_norm": 0.7032037973403931, + "learning_rate": 0.0002997611243477461, + "loss": 4.8165, + "step": 82710 + }, + { + "epoch": 0.11622936048911316, + "grad_norm": 
0.7072632908821106, + "learning_rate": 0.00029976070913071583, + "loss": 4.6947, + "step": 82720 + }, + { + "epoch": 0.11624341142727673, + "grad_norm": 0.7013282179832458, + "learning_rate": 0.0002997602935534188, + "loss": 4.6732, + "step": 82730 + }, + { + "epoch": 0.11625746236544031, + "grad_norm": 0.7069587111473083, + "learning_rate": 0.0002997598776158562, + "loss": 4.6504, + "step": 82740 + }, + { + "epoch": 0.11627151330360389, + "grad_norm": 0.7177491188049316, + "learning_rate": 0.0002997594613180289, + "loss": 4.5903, + "step": 82750 + }, + { + "epoch": 0.11628556424176746, + "grad_norm": 0.7011220455169678, + "learning_rate": 0.0002997590446599379, + "loss": 4.8592, + "step": 82760 + }, + { + "epoch": 0.11629961517993105, + "grad_norm": 0.7004333138465881, + "learning_rate": 0.00029975862764158417, + "loss": 4.8258, + "step": 82770 + }, + { + "epoch": 0.11631366611809463, + "grad_norm": 0.7188377976417542, + "learning_rate": 0.0002997582102629688, + "loss": 4.7412, + "step": 82780 + }, + { + "epoch": 0.1163277170562582, + "grad_norm": 0.7212134003639221, + "learning_rate": 0.00029975779252409274, + "loss": 4.7791, + "step": 82790 + }, + { + "epoch": 0.11634176799442178, + "grad_norm": 0.7132065892219543, + "learning_rate": 0.00029975737442495696, + "loss": 4.8067, + "step": 82800 + }, + { + "epoch": 0.11635581893258536, + "grad_norm": 0.7281680107116699, + "learning_rate": 0.0002997569559655626, + "loss": 4.7152, + "step": 82810 + }, + { + "epoch": 0.11636986987074893, + "grad_norm": 0.6972888708114624, + "learning_rate": 0.0002997565371459105, + "loss": 4.7821, + "step": 82820 + }, + { + "epoch": 0.11638392080891251, + "grad_norm": 1.2375811338424683, + "learning_rate": 0.0002997561179660018, + "loss": 4.7507, + "step": 82830 + }, + { + "epoch": 0.11639797174707608, + "grad_norm": 0.6999583840370178, + "learning_rate": 0.0002997556984258375, + "loss": 4.7406, + "step": 82840 + }, + { + "epoch": 0.11641202268523966, + "grad_norm": 0.7113127708435059, 
+ "learning_rate": 0.0002997552785254185, + "loss": 4.7202, + "step": 82850 + }, + { + "epoch": 0.11642607362340324, + "grad_norm": 0.6991801857948303, + "learning_rate": 0.0002997548582647459, + "loss": 4.6655, + "step": 82860 + }, + { + "epoch": 0.11644012456156683, + "grad_norm": 0.7328364849090576, + "learning_rate": 0.00029975443764382066, + "loss": 4.7223, + "step": 82870 + }, + { + "epoch": 0.1164541754997304, + "grad_norm": 0.7123426795005798, + "learning_rate": 0.0002997540166626438, + "loss": 4.6433, + "step": 82880 + }, + { + "epoch": 0.11646822643789398, + "grad_norm": 0.7378365397453308, + "learning_rate": 0.0002997535953212164, + "loss": 4.8043, + "step": 82890 + }, + { + "epoch": 0.11648227737605756, + "grad_norm": 0.7234169840812683, + "learning_rate": 0.0002997531736195394, + "loss": 4.726, + "step": 82900 + }, + { + "epoch": 0.11649632831422113, + "grad_norm": 0.709543764591217, + "learning_rate": 0.0002997527515576139, + "loss": 4.6436, + "step": 82910 + }, + { + "epoch": 0.11651037925238471, + "grad_norm": 0.7029217481613159, + "learning_rate": 0.0002997523291354408, + "loss": 4.7416, + "step": 82920 + }, + { + "epoch": 0.11652443019054828, + "grad_norm": 0.7221389412879944, + "learning_rate": 0.0002997519063530212, + "loss": 4.8052, + "step": 82930 + }, + { + "epoch": 0.11653848112871186, + "grad_norm": 0.7333707809448242, + "learning_rate": 0.00029975148321035607, + "loss": 4.6743, + "step": 82940 + }, + { + "epoch": 0.11655253206687544, + "grad_norm": 0.712275505065918, + "learning_rate": 0.00029975105970744646, + "loss": 4.7026, + "step": 82950 + }, + { + "epoch": 0.11656658300503901, + "grad_norm": 0.7303036451339722, + "learning_rate": 0.00029975063584429334, + "loss": 4.7575, + "step": 82960 + }, + { + "epoch": 0.11658063394320259, + "grad_norm": 0.7000608444213867, + "learning_rate": 0.00029975021162089783, + "loss": 4.8079, + "step": 82970 + }, + { + "epoch": 0.11659468488136618, + "grad_norm": 0.7333692312240601, + "learning_rate": 
0.0002997497870372608, + "loss": 4.5493, + "step": 82980 + }, + { + "epoch": 0.11660873581952975, + "grad_norm": 0.7260574102401733, + "learning_rate": 0.0002997493620933834, + "loss": 4.6226, + "step": 82990 + }, + { + "epoch": 0.11662278675769333, + "grad_norm": 0.712442934513092, + "learning_rate": 0.0002997489367892666, + "loss": 4.8145, + "step": 83000 + }, + { + "epoch": 0.11663683769585691, + "grad_norm": 0.7070791125297546, + "learning_rate": 0.00029974851112491146, + "loss": 4.6327, + "step": 83010 + }, + { + "epoch": 0.11665088863402048, + "grad_norm": 0.7452596426010132, + "learning_rate": 0.0002997480851003189, + "loss": 4.6218, + "step": 83020 + }, + { + "epoch": 0.11666493957218406, + "grad_norm": 0.7247107625007629, + "learning_rate": 0.0002997476587154901, + "loss": 4.7157, + "step": 83030 + }, + { + "epoch": 0.11667899051034764, + "grad_norm": 0.7191839814186096, + "learning_rate": 0.000299747231970426, + "loss": 4.5815, + "step": 83040 + }, + { + "epoch": 0.11669304144851121, + "grad_norm": 0.7884148955345154, + "learning_rate": 0.0002997468048651276, + "loss": 4.5333, + "step": 83050 + }, + { + "epoch": 0.11670709238667479, + "grad_norm": 0.7189491987228394, + "learning_rate": 0.00029974637739959594, + "loss": 4.7521, + "step": 83060 + }, + { + "epoch": 0.11672114332483836, + "grad_norm": 0.7118692994117737, + "learning_rate": 0.00029974594957383213, + "loss": 4.7522, + "step": 83070 + }, + { + "epoch": 0.11673519426300195, + "grad_norm": 0.7117485404014587, + "learning_rate": 0.0002997455213878371, + "loss": 4.76, + "step": 83080 + }, + { + "epoch": 0.11674924520116553, + "grad_norm": 0.7011753916740417, + "learning_rate": 0.00029974509284161197, + "loss": 4.6493, + "step": 83090 + }, + { + "epoch": 0.1167632961393291, + "grad_norm": 0.7218371033668518, + "learning_rate": 0.0002997446639351576, + "loss": 4.6864, + "step": 83100 + }, + { + "epoch": 0.11677734707749268, + "grad_norm": 0.7155550718307495, + "learning_rate": 0.0002997442346684753, + 
"loss": 4.7834, + "step": 83110 + }, + { + "epoch": 0.11679139801565626, + "grad_norm": 0.7008214592933655, + "learning_rate": 0.00029974380504156585, + "loss": 4.6983, + "step": 83120 + }, + { + "epoch": 0.11680544895381983, + "grad_norm": 0.697800874710083, + "learning_rate": 0.0002997433750544304, + "loss": 4.7975, + "step": 83130 + }, + { + "epoch": 0.11681949989198341, + "grad_norm": 0.7439045310020447, + "learning_rate": 0.00029974294470706993, + "loss": 4.7438, + "step": 83140 + }, + { + "epoch": 0.11683355083014699, + "grad_norm": 0.7132053971290588, + "learning_rate": 0.00029974251399948553, + "loss": 4.7585, + "step": 83150 + }, + { + "epoch": 0.11684760176831056, + "grad_norm": 0.6998278498649597, + "learning_rate": 0.0002997420829316782, + "loss": 4.7226, + "step": 83160 + }, + { + "epoch": 0.11686165270647414, + "grad_norm": 0.6966537833213806, + "learning_rate": 0.000299741651503649, + "loss": 4.6966, + "step": 83170 + }, + { + "epoch": 0.11687570364463773, + "grad_norm": 0.6955662965774536, + "learning_rate": 0.00029974121971539894, + "loss": 4.6016, + "step": 83180 + }, + { + "epoch": 0.1168897545828013, + "grad_norm": 0.6958626508712769, + "learning_rate": 0.0002997407875669291, + "loss": 4.6607, + "step": 83190 + }, + { + "epoch": 0.11690380552096488, + "grad_norm": 0.7100691199302673, + "learning_rate": 0.00029974035505824046, + "loss": 4.7568, + "step": 83200 + }, + { + "epoch": 0.11691785645912846, + "grad_norm": 0.7105290293693542, + "learning_rate": 0.00029973992218933416, + "loss": 4.8082, + "step": 83210 + }, + { + "epoch": 0.11693190739729203, + "grad_norm": 0.7148688435554504, + "learning_rate": 0.00029973948896021117, + "loss": 4.6506, + "step": 83220 + }, + { + "epoch": 0.11694595833545561, + "grad_norm": 0.7135083079338074, + "learning_rate": 0.0002997390553708725, + "loss": 4.7098, + "step": 83230 + }, + { + "epoch": 0.11696000927361919, + "grad_norm": 0.6871196627616882, + "learning_rate": 0.0002997386214213192, + "loss": 4.7369, + 
"step": 83240 + }, + { + "epoch": 0.11697406021178276, + "grad_norm": 0.7220321297645569, + "learning_rate": 0.00029973818711155246, + "loss": 4.7927, + "step": 83250 + }, + { + "epoch": 0.11698811114994634, + "grad_norm": 0.6984358429908752, + "learning_rate": 0.0002997377524415731, + "loss": 4.7761, + "step": 83260 + }, + { + "epoch": 0.11700216208810992, + "grad_norm": 0.7106263637542725, + "learning_rate": 0.00029973731741138234, + "loss": 4.7305, + "step": 83270 + }, + { + "epoch": 0.11701621302627349, + "grad_norm": 0.715644121170044, + "learning_rate": 0.00029973688202098115, + "loss": 4.781, + "step": 83280 + }, + { + "epoch": 0.11703026396443708, + "grad_norm": 0.7058255076408386, + "learning_rate": 0.0002997364462703706, + "loss": 4.6378, + "step": 83290 + }, + { + "epoch": 0.11704431490260066, + "grad_norm": 0.6990760564804077, + "learning_rate": 0.0002997360101595517, + "loss": 4.7514, + "step": 83300 + }, + { + "epoch": 0.11705836584076423, + "grad_norm": 0.7147160172462463, + "learning_rate": 0.00029973557368852553, + "loss": 4.6166, + "step": 83310 + }, + { + "epoch": 0.11707241677892781, + "grad_norm": 0.712631344795227, + "learning_rate": 0.00029973513685729316, + "loss": 4.6634, + "step": 83320 + }, + { + "epoch": 0.11708646771709139, + "grad_norm": 0.7080478668212891, + "learning_rate": 0.0002997346996658556, + "loss": 4.6785, + "step": 83330 + }, + { + "epoch": 0.11710051865525496, + "grad_norm": 0.6984758973121643, + "learning_rate": 0.0002997342621142139, + "loss": 4.7836, + "step": 83340 + }, + { + "epoch": 0.11711456959341854, + "grad_norm": 0.7084048986434937, + "learning_rate": 0.00029973382420236917, + "loss": 4.8846, + "step": 83350 + }, + { + "epoch": 0.11712862053158211, + "grad_norm": 0.7941184043884277, + "learning_rate": 0.0002997333859303225, + "loss": 4.5988, + "step": 83360 + }, + { + "epoch": 0.11714267146974569, + "grad_norm": 0.7254170775413513, + "learning_rate": 0.0002997329472980747, + "loss": 4.7335, + "step": 83370 + }, + 
{ + "epoch": 0.11715672240790927, + "grad_norm": 0.7091358304023743, + "learning_rate": 0.0002997325083056271, + "loss": 4.7572, + "step": 83380 + }, + { + "epoch": 0.11717077334607286, + "grad_norm": 0.7274218797683716, + "learning_rate": 0.0002997320689529807, + "loss": 4.6113, + "step": 83390 + }, + { + "epoch": 0.11718482428423643, + "grad_norm": 0.699985146522522, + "learning_rate": 0.0002997316292401364, + "loss": 4.7306, + "step": 83400 + }, + { + "epoch": 0.11719887522240001, + "grad_norm": 0.7258591055870056, + "learning_rate": 0.0002997311891670954, + "loss": 4.7404, + "step": 83410 + }, + { + "epoch": 0.11721292616056359, + "grad_norm": 0.7040428519248962, + "learning_rate": 0.00029973074873385877, + "loss": 4.8231, + "step": 83420 + }, + { + "epoch": 0.11722697709872716, + "grad_norm": 0.7231871485710144, + "learning_rate": 0.0002997303079404275, + "loss": 4.7028, + "step": 83430 + }, + { + "epoch": 0.11724102803689074, + "grad_norm": 0.701480507850647, + "learning_rate": 0.0002997298667868027, + "loss": 4.9066, + "step": 83440 + }, + { + "epoch": 0.11725507897505431, + "grad_norm": 0.739446759223938, + "learning_rate": 0.0002997294252729854, + "loss": 4.8486, + "step": 83450 + }, + { + "epoch": 0.11726912991321789, + "grad_norm": 0.7062489986419678, + "learning_rate": 0.00029972898339897664, + "loss": 4.7646, + "step": 83460 + }, + { + "epoch": 0.11728318085138147, + "grad_norm": 0.7041696906089783, + "learning_rate": 0.0002997285411647775, + "loss": 4.7471, + "step": 83470 + }, + { + "epoch": 0.11729723178954504, + "grad_norm": 0.7124684453010559, + "learning_rate": 0.00029972809857038913, + "loss": 4.719, + "step": 83480 + }, + { + "epoch": 0.11731128272770863, + "grad_norm": 0.7017519474029541, + "learning_rate": 0.0002997276556158124, + "loss": 4.6908, + "step": 83490 + }, + { + "epoch": 0.11732533366587221, + "grad_norm": 0.7156013250350952, + "learning_rate": 0.0002997272123010486, + "loss": 4.7906, + "step": 83500 + }, + { + "epoch": 
0.11733938460403578, + "grad_norm": 0.715263307094574, + "learning_rate": 0.0002997267686260987, + "loss": 4.6222, + "step": 83510 + }, + { + "epoch": 0.11735343554219936, + "grad_norm": 0.7254456877708435, + "learning_rate": 0.0002997263245909637, + "loss": 4.585, + "step": 83520 + }, + { + "epoch": 0.11736748648036294, + "grad_norm": 0.7293530106544495, + "learning_rate": 0.00029972588019564476, + "loss": 4.6986, + "step": 83530 + }, + { + "epoch": 0.11738153741852651, + "grad_norm": 0.7089353799819946, + "learning_rate": 0.00029972543544014285, + "loss": 4.6565, + "step": 83540 + }, + { + "epoch": 0.11739558835669009, + "grad_norm": 0.7219105958938599, + "learning_rate": 0.0002997249903244592, + "loss": 4.6934, + "step": 83550 + }, + { + "epoch": 0.11740963929485367, + "grad_norm": 0.7276551723480225, + "learning_rate": 0.00029972454484859475, + "loss": 4.6889, + "step": 83560 + }, + { + "epoch": 0.11742369023301724, + "grad_norm": 0.6939069628715515, + "learning_rate": 0.00029972409901255056, + "loss": 4.7927, + "step": 83570 + }, + { + "epoch": 0.11743774117118082, + "grad_norm": 0.7165120244026184, + "learning_rate": 0.0002997236528163278, + "loss": 4.757, + "step": 83580 + }, + { + "epoch": 0.1174517921093444, + "grad_norm": 0.774430513381958, + "learning_rate": 0.00029972320625992754, + "loss": 4.657, + "step": 83590 + }, + { + "epoch": 0.11746584304750798, + "grad_norm": 0.7265297770500183, + "learning_rate": 0.00029972275934335077, + "loss": 4.6596, + "step": 83600 + }, + { + "epoch": 0.11747989398567156, + "grad_norm": 0.7017237544059753, + "learning_rate": 0.00029972231206659857, + "loss": 4.7227, + "step": 83610 + }, + { + "epoch": 0.11749394492383514, + "grad_norm": 0.703671395778656, + "learning_rate": 0.00029972186442967213, + "loss": 4.6661, + "step": 83620 + }, + { + "epoch": 0.11750799586199871, + "grad_norm": 0.7623944878578186, + "learning_rate": 0.00029972141643257237, + "loss": 4.6292, + "step": 83630 + }, + { + "epoch": 0.11752204680016229, 
+ "grad_norm": 0.73209547996521, + "learning_rate": 0.00029972096807530043, + "loss": 4.6943, + "step": 83640 + }, + { + "epoch": 0.11753609773832586, + "grad_norm": 0.7071471810340881, + "learning_rate": 0.0002997205193578575, + "loss": 4.6732, + "step": 83650 + }, + { + "epoch": 0.11755014867648944, + "grad_norm": 0.7387067675590515, + "learning_rate": 0.00029972007028024447, + "loss": 4.6494, + "step": 83660 + }, + { + "epoch": 0.11756419961465302, + "grad_norm": 0.7155556678771973, + "learning_rate": 0.00029971962084246256, + "loss": 4.6376, + "step": 83670 + }, + { + "epoch": 0.1175782505528166, + "grad_norm": 0.7402138113975525, + "learning_rate": 0.0002997191710445128, + "loss": 4.6761, + "step": 83680 + }, + { + "epoch": 0.11759230149098017, + "grad_norm": 0.7761551141738892, + "learning_rate": 0.0002997187208863963, + "loss": 4.6724, + "step": 83690 + }, + { + "epoch": 0.11760635242914376, + "grad_norm": 0.7482668161392212, + "learning_rate": 0.0002997182703681141, + "loss": 4.7362, + "step": 83700 + }, + { + "epoch": 0.11762040336730734, + "grad_norm": 0.7007404565811157, + "learning_rate": 0.00029971781948966733, + "loss": 4.8267, + "step": 83710 + }, + { + "epoch": 0.11763445430547091, + "grad_norm": 0.6935082077980042, + "learning_rate": 0.000299717368251057, + "loss": 4.6847, + "step": 83720 + }, + { + "epoch": 0.11764850524363449, + "grad_norm": 0.7338686585426331, + "learning_rate": 0.00029971691665228424, + "loss": 4.8848, + "step": 83730 + }, + { + "epoch": 0.11766255618179806, + "grad_norm": 0.7095941305160522, + "learning_rate": 0.00029971646469335015, + "loss": 4.7018, + "step": 83740 + }, + { + "epoch": 0.11767660711996164, + "grad_norm": 0.7719713449478149, + "learning_rate": 0.00029971601237425586, + "loss": 4.6187, + "step": 83750 + }, + { + "epoch": 0.11769065805812522, + "grad_norm": 0.7228941321372986, + "learning_rate": 0.00029971555969500235, + "loss": 4.6907, + "step": 83760 + }, + { + "epoch": 0.11770470899628879, + "grad_norm": 
0.7366059422492981, + "learning_rate": 0.0002997151066555908, + "loss": 4.7233, + "step": 83770 + }, + { + "epoch": 0.11771875993445237, + "grad_norm": 0.8631296157836914, + "learning_rate": 0.0002997146532560222, + "loss": 4.7148, + "step": 83780 + }, + { + "epoch": 0.11773281087261595, + "grad_norm": 0.7031307816505432, + "learning_rate": 0.00029971419949629776, + "loss": 4.7728, + "step": 83790 + }, + { + "epoch": 0.11774686181077954, + "grad_norm": 0.7433533668518066, + "learning_rate": 0.00029971374537641853, + "loss": 4.6781, + "step": 83800 + }, + { + "epoch": 0.11776091274894311, + "grad_norm": 0.760563850402832, + "learning_rate": 0.00029971329089638555, + "loss": 4.7035, + "step": 83810 + }, + { + "epoch": 0.11777496368710669, + "grad_norm": 0.7019788026809692, + "learning_rate": 0.00029971283605619996, + "loss": 4.6762, + "step": 83820 + }, + { + "epoch": 0.11778901462527026, + "grad_norm": 0.6909261345863342, + "learning_rate": 0.00029971238085586284, + "loss": 4.6912, + "step": 83830 + }, + { + "epoch": 0.11780306556343384, + "grad_norm": 0.7148104310035706, + "learning_rate": 0.00029971192529537534, + "loss": 4.7071, + "step": 83840 + }, + { + "epoch": 0.11781711650159742, + "grad_norm": 0.7196040749549866, + "learning_rate": 0.00029971146937473844, + "loss": 4.7632, + "step": 83850 + }, + { + "epoch": 0.11783116743976099, + "grad_norm": 0.7324225902557373, + "learning_rate": 0.0002997110130939533, + "loss": 4.5944, + "step": 83860 + }, + { + "epoch": 0.11784521837792457, + "grad_norm": 0.7406251430511475, + "learning_rate": 0.00029971055645302115, + "loss": 4.7681, + "step": 83870 + }, + { + "epoch": 0.11785926931608814, + "grad_norm": 0.6971359252929688, + "learning_rate": 0.00029971009945194284, + "loss": 4.5862, + "step": 83880 + }, + { + "epoch": 0.11787332025425172, + "grad_norm": 0.7004740834236145, + "learning_rate": 0.00029970964209071957, + "loss": 4.6547, + "step": 83890 + }, + { + "epoch": 0.1178873711924153, + "grad_norm": 
0.736107349395752, + "learning_rate": 0.0002997091843693525, + "loss": 4.5973, + "step": 83900 + }, + { + "epoch": 0.11790142213057889, + "grad_norm": 0.7069373726844788, + "learning_rate": 0.00029970872628784273, + "loss": 4.6779, + "step": 83910 + }, + { + "epoch": 0.11791547306874246, + "grad_norm": 0.7382087707519531, + "learning_rate": 0.0002997082678461913, + "loss": 4.5928, + "step": 83920 + }, + { + "epoch": 0.11792952400690604, + "grad_norm": 0.7047857642173767, + "learning_rate": 0.0002997078090443993, + "loss": 4.7603, + "step": 83930 + }, + { + "epoch": 0.11794357494506962, + "grad_norm": 0.7913258075714111, + "learning_rate": 0.0002997073498824679, + "loss": 4.5967, + "step": 83940 + }, + { + "epoch": 0.11795762588323319, + "grad_norm": 0.7597509026527405, + "learning_rate": 0.0002997068903603982, + "loss": 4.6874, + "step": 83950 + }, + { + "epoch": 0.11797167682139677, + "grad_norm": 0.7429311871528625, + "learning_rate": 0.00029970643047819124, + "loss": 4.6269, + "step": 83960 + }, + { + "epoch": 0.11798572775956034, + "grad_norm": 0.7132638692855835, + "learning_rate": 0.00029970597023584814, + "loss": 4.6915, + "step": 83970 + }, + { + "epoch": 0.11799977869772392, + "grad_norm": 0.7404999136924744, + "learning_rate": 0.0002997055096333701, + "loss": 4.7002, + "step": 83980 + }, + { + "epoch": 0.1180138296358875, + "grad_norm": 0.7566960453987122, + "learning_rate": 0.0002997050486707581, + "loss": 4.6367, + "step": 83990 + }, + { + "epoch": 0.11802788057405107, + "grad_norm": 0.7191115617752075, + "learning_rate": 0.0002997045873480134, + "loss": 4.6247, + "step": 84000 + }, + { + "epoch": 0.11804193151221466, + "grad_norm": 0.7127372026443481, + "learning_rate": 0.0002997041256651369, + "loss": 4.5988, + "step": 84010 + }, + { + "epoch": 0.11805598245037824, + "grad_norm": 0.7138316035270691, + "learning_rate": 0.0002997036636221299, + "loss": 4.694, + "step": 84020 + }, + { + "epoch": 0.11807003338854181, + "grad_norm": 0.7099950313568115, + 
"learning_rate": 0.0002997032012189934, + "loss": 4.7058, + "step": 84030 + }, + { + "epoch": 0.11808408432670539, + "grad_norm": 0.7554578185081482, + "learning_rate": 0.00029970273845572864, + "loss": 4.6664, + "step": 84040 + }, + { + "epoch": 0.11809813526486897, + "grad_norm": 0.7338352799415588, + "learning_rate": 0.00029970227533233657, + "loss": 4.6666, + "step": 84050 + }, + { + "epoch": 0.11811218620303254, + "grad_norm": 0.7148082852363586, + "learning_rate": 0.0002997018118488184, + "loss": 4.6799, + "step": 84060 + }, + { + "epoch": 0.11812623714119612, + "grad_norm": 0.7274507284164429, + "learning_rate": 0.0002997013480051753, + "loss": 4.6302, + "step": 84070 + }, + { + "epoch": 0.1181402880793597, + "grad_norm": 0.7034341096878052, + "learning_rate": 0.0002997008838014082, + "loss": 4.743, + "step": 84080 + }, + { + "epoch": 0.11815433901752327, + "grad_norm": 0.7157617807388306, + "learning_rate": 0.00029970041923751837, + "loss": 4.6052, + "step": 84090 + }, + { + "epoch": 0.11816838995568685, + "grad_norm": 0.8156205415725708, + "learning_rate": 0.00029969995431350686, + "loss": 4.7544, + "step": 84100 + }, + { + "epoch": 0.11818244089385044, + "grad_norm": 0.7375916838645935, + "learning_rate": 0.0002996994890293749, + "loss": 4.7907, + "step": 84110 + }, + { + "epoch": 0.11819649183201401, + "grad_norm": 0.6925612092018127, + "learning_rate": 0.00029969902338512343, + "loss": 4.7709, + "step": 84120 + }, + { + "epoch": 0.11821054277017759, + "grad_norm": 0.7296881079673767, + "learning_rate": 0.0002996985573807537, + "loss": 4.7123, + "step": 84130 + }, + { + "epoch": 0.11822459370834117, + "grad_norm": 0.717957079410553, + "learning_rate": 0.0002996980910162668, + "loss": 4.6673, + "step": 84140 + }, + { + "epoch": 0.11823864464650474, + "grad_norm": 0.6915928721427917, + "learning_rate": 0.0002996976242916638, + "loss": 4.6409, + "step": 84150 + }, + { + "epoch": 0.11825269558466832, + "grad_norm": 0.69838547706604, + "learning_rate": 
0.00029969715720694595, + "loss": 4.8507, + "step": 84160 + }, + { + "epoch": 0.1182667465228319, + "grad_norm": 0.701752245426178, + "learning_rate": 0.0002996966897621143, + "loss": 4.5703, + "step": 84170 + }, + { + "epoch": 0.11828079746099547, + "grad_norm": 0.7115376591682434, + "learning_rate": 0.0002996962219571699, + "loss": 4.7654, + "step": 84180 + }, + { + "epoch": 0.11829484839915905, + "grad_norm": 0.7013298869132996, + "learning_rate": 0.00029969575379211396, + "loss": 4.6597, + "step": 84190 + }, + { + "epoch": 0.11830889933732262, + "grad_norm": 0.7234477996826172, + "learning_rate": 0.0002996952852669476, + "loss": 4.6087, + "step": 84200 + }, + { + "epoch": 0.1183229502754862, + "grad_norm": 0.6927001476287842, + "learning_rate": 0.00029969481638167193, + "loss": 4.622, + "step": 84210 + }, + { + "epoch": 0.11833700121364979, + "grad_norm": 0.7288459539413452, + "learning_rate": 0.00029969434713628807, + "loss": 4.6142, + "step": 84220 + }, + { + "epoch": 0.11835105215181337, + "grad_norm": 0.7043154239654541, + "learning_rate": 0.0002996938775307972, + "loss": 4.6994, + "step": 84230 + }, + { + "epoch": 0.11836510308997694, + "grad_norm": 0.7041778564453125, + "learning_rate": 0.0002996934075652004, + "loss": 4.6914, + "step": 84240 + }, + { + "epoch": 0.11837915402814052, + "grad_norm": 0.7131189703941345, + "learning_rate": 0.00029969293723949885, + "loss": 4.6957, + "step": 84250 + }, + { + "epoch": 0.1183932049663041, + "grad_norm": 0.7089633345603943, + "learning_rate": 0.0002996924665536936, + "loss": 4.7275, + "step": 84260 + }, + { + "epoch": 0.11840725590446767, + "grad_norm": 0.7021405696868896, + "learning_rate": 0.00029969199550778584, + "loss": 4.7756, + "step": 84270 + }, + { + "epoch": 0.11842130684263125, + "grad_norm": 0.706847071647644, + "learning_rate": 0.00029969152410177667, + "loss": 4.792, + "step": 84280 + }, + { + "epoch": 0.11843535778079482, + "grad_norm": 0.7018424272537231, + "learning_rate": 0.0002996910523356672, 
+ "loss": 4.6697, + "step": 84290 + }, + { + "epoch": 0.1184494087189584, + "grad_norm": 0.7254888415336609, + "learning_rate": 0.0002996905802094587, + "loss": 4.81, + "step": 84300 + }, + { + "epoch": 0.11846345965712198, + "grad_norm": 0.7102382779121399, + "learning_rate": 0.0002996901077231522, + "loss": 4.6677, + "step": 84310 + }, + { + "epoch": 0.11847751059528557, + "grad_norm": 0.6817678213119507, + "learning_rate": 0.00029968963487674884, + "loss": 4.6627, + "step": 84320 + }, + { + "epoch": 0.11849156153344914, + "grad_norm": 0.7572276592254639, + "learning_rate": 0.00029968916167024973, + "loss": 4.8695, + "step": 84330 + }, + { + "epoch": 0.11850561247161272, + "grad_norm": 0.7169764041900635, + "learning_rate": 0.00029968868810365613, + "loss": 4.6564, + "step": 84340 + }, + { + "epoch": 0.1185196634097763, + "grad_norm": 0.708909273147583, + "learning_rate": 0.000299688214176969, + "loss": 4.5852, + "step": 84350 + }, + { + "epoch": 0.11853371434793987, + "grad_norm": 0.7388038039207458, + "learning_rate": 0.0002996877398901896, + "loss": 4.7354, + "step": 84360 + }, + { + "epoch": 0.11854776528610345, + "grad_norm": 0.7097702622413635, + "learning_rate": 0.00029968726524331915, + "loss": 4.7295, + "step": 84370 + }, + { + "epoch": 0.11856181622426702, + "grad_norm": 0.7070819735527039, + "learning_rate": 0.00029968679023635857, + "loss": 4.6223, + "step": 84380 + }, + { + "epoch": 0.1185758671624306, + "grad_norm": 0.7111035585403442, + "learning_rate": 0.0002996863148693092, + "loss": 4.5126, + "step": 84390 + }, + { + "epoch": 0.11858991810059417, + "grad_norm": 0.7140163779258728, + "learning_rate": 0.00029968583914217204, + "loss": 4.7595, + "step": 84400 + }, + { + "epoch": 0.11860396903875775, + "grad_norm": 0.7231415510177612, + "learning_rate": 0.00029968536305494836, + "loss": 4.7312, + "step": 84410 + }, + { + "epoch": 0.11861801997692134, + "grad_norm": 0.74226975440979, + "learning_rate": 0.00029968488660763923, + "loss": 4.6488, + 
"step": 84420 + }, + { + "epoch": 0.11863207091508492, + "grad_norm": 0.7267938852310181, + "learning_rate": 0.0002996844098002458, + "loss": 4.5626, + "step": 84430 + }, + { + "epoch": 0.11864612185324849, + "grad_norm": 0.7126814126968384, + "learning_rate": 0.00029968393263276927, + "loss": 4.6538, + "step": 84440 + }, + { + "epoch": 0.11866017279141207, + "grad_norm": 0.7250803709030151, + "learning_rate": 0.0002996834551052107, + "loss": 4.6957, + "step": 84450 + }, + { + "epoch": 0.11867422372957565, + "grad_norm": 0.7476853728294373, + "learning_rate": 0.0002996829772175713, + "loss": 4.6687, + "step": 84460 + }, + { + "epoch": 0.11868827466773922, + "grad_norm": 0.7315259575843811, + "learning_rate": 0.0002996824989698522, + "loss": 4.7679, + "step": 84470 + }, + { + "epoch": 0.1187023256059028, + "grad_norm": 0.7134382128715515, + "learning_rate": 0.00029968202036205455, + "loss": 4.7786, + "step": 84480 + }, + { + "epoch": 0.11871637654406637, + "grad_norm": 0.7032891511917114, + "learning_rate": 0.0002996815413941795, + "loss": 4.6831, + "step": 84490 + }, + { + "epoch": 0.11873042748222995, + "grad_norm": 0.708054780960083, + "learning_rate": 0.0002996810620662282, + "loss": 4.7639, + "step": 84500 + }, + { + "epoch": 0.11874447842039353, + "grad_norm": 0.7284858822822571, + "learning_rate": 0.00029968058237820185, + "loss": 4.7167, + "step": 84510 + }, + { + "epoch": 0.1187585293585571, + "grad_norm": 0.7195180058479309, + "learning_rate": 0.0002996801023301015, + "loss": 4.7123, + "step": 84520 + }, + { + "epoch": 0.11877258029672069, + "grad_norm": 0.7547293305397034, + "learning_rate": 0.00029967962192192847, + "loss": 4.7145, + "step": 84530 + }, + { + "epoch": 0.11878663123488427, + "grad_norm": 0.7729194760322571, + "learning_rate": 0.00029967914115368375, + "loss": 4.7879, + "step": 84540 + }, + { + "epoch": 0.11880068217304784, + "grad_norm": 0.7287982702255249, + "learning_rate": 0.0002996786600253686, + "loss": 4.7128, + "step": 84550 + }, + 
{ + "epoch": 0.11881473311121142, + "grad_norm": 0.7168266177177429, + "learning_rate": 0.00029967817853698413, + "loss": 4.6013, + "step": 84560 + }, + { + "epoch": 0.118828784049375, + "grad_norm": 0.6961681246757507, + "learning_rate": 0.0002996776966885315, + "loss": 4.7291, + "step": 84570 + }, + { + "epoch": 0.11884283498753857, + "grad_norm": 0.7002440690994263, + "learning_rate": 0.00029967721448001186, + "loss": 4.6898, + "step": 84580 + }, + { + "epoch": 0.11885688592570215, + "grad_norm": 0.7198019027709961, + "learning_rate": 0.00029967673191142645, + "loss": 4.6502, + "step": 84590 + }, + { + "epoch": 0.11887093686386573, + "grad_norm": 0.7356411814689636, + "learning_rate": 0.0002996762489827763, + "loss": 4.7287, + "step": 84600 + }, + { + "epoch": 0.1188849878020293, + "grad_norm": 0.7238462567329407, + "learning_rate": 0.0002996757656940627, + "loss": 4.6479, + "step": 84610 + }, + { + "epoch": 0.11889903874019288, + "grad_norm": 0.7430039644241333, + "learning_rate": 0.0002996752820452867, + "loss": 4.7358, + "step": 84620 + }, + { + "epoch": 0.11891308967835647, + "grad_norm": 0.6926167011260986, + "learning_rate": 0.0002996747980364496, + "loss": 4.6281, + "step": 84630 + }, + { + "epoch": 0.11892714061652004, + "grad_norm": 0.7018167972564697, + "learning_rate": 0.00029967431366755237, + "loss": 4.7712, + "step": 84640 + }, + { + "epoch": 0.11894119155468362, + "grad_norm": 0.6989160180091858, + "learning_rate": 0.00029967382893859634, + "loss": 4.7982, + "step": 84650 + }, + { + "epoch": 0.1189552424928472, + "grad_norm": 0.7366704940795898, + "learning_rate": 0.0002996733438495826, + "loss": 4.7934, + "step": 84660 + }, + { + "epoch": 0.11896929343101077, + "grad_norm": 0.698725163936615, + "learning_rate": 0.00029967285840051236, + "loss": 4.7432, + "step": 84670 + }, + { + "epoch": 0.11898334436917435, + "grad_norm": 0.7053683400154114, + "learning_rate": 0.0002996723725913868, + "loss": 4.7379, + "step": 84680 + }, + { + "epoch": 
0.11899739530733792, + "grad_norm": 0.7268297076225281, + "learning_rate": 0.000299671886422207, + "loss": 4.6208, + "step": 84690 + }, + { + "epoch": 0.1190114462455015, + "grad_norm": 0.7235490679740906, + "learning_rate": 0.0002996713998929742, + "loss": 4.6095, + "step": 84700 + }, + { + "epoch": 0.11902549718366508, + "grad_norm": 0.706933319568634, + "learning_rate": 0.00029967091300368956, + "loss": 4.6821, + "step": 84710 + }, + { + "epoch": 0.11903954812182865, + "grad_norm": 0.7260755300521851, + "learning_rate": 0.0002996704257543542, + "loss": 4.6393, + "step": 84720 + }, + { + "epoch": 0.11905359905999224, + "grad_norm": 0.7088666558265686, + "learning_rate": 0.00029966993814496944, + "loss": 4.7713, + "step": 84730 + }, + { + "epoch": 0.11906764999815582, + "grad_norm": 0.6888231635093689, + "learning_rate": 0.0002996694501755363, + "loss": 4.6284, + "step": 84740 + }, + { + "epoch": 0.1190817009363194, + "grad_norm": 0.7280553579330444, + "learning_rate": 0.000299668961846056, + "loss": 4.6492, + "step": 84750 + }, + { + "epoch": 0.11909575187448297, + "grad_norm": 0.7076670527458191, + "learning_rate": 0.00029966847315652975, + "loss": 4.7118, + "step": 84760 + }, + { + "epoch": 0.11910980281264655, + "grad_norm": 0.7475929856300354, + "learning_rate": 0.0002996679841069587, + "loss": 4.6426, + "step": 84770 + }, + { + "epoch": 0.11912385375081012, + "grad_norm": 0.6810241937637329, + "learning_rate": 0.00029966749469734395, + "loss": 4.7599, + "step": 84780 + }, + { + "epoch": 0.1191379046889737, + "grad_norm": 0.7051819562911987, + "learning_rate": 0.00029966700492768683, + "loss": 4.7097, + "step": 84790 + }, + { + "epoch": 0.11915195562713728, + "grad_norm": 0.7109501361846924, + "learning_rate": 0.0002996665147979884, + "loss": 4.625, + "step": 84800 + }, + { + "epoch": 0.11916600656530085, + "grad_norm": 0.6987961530685425, + "learning_rate": 0.0002996660243082499, + "loss": 4.6492, + "step": 84810 + }, + { + "epoch": 0.11918005750346443, + 
"grad_norm": 0.7196264863014221, + "learning_rate": 0.00029966553345847245, + "loss": 4.5642, + "step": 84820 + }, + { + "epoch": 0.11919410844162802, + "grad_norm": 0.6904036998748779, + "learning_rate": 0.0002996650422486573, + "loss": 4.7637, + "step": 84830 + }, + { + "epoch": 0.1192081593797916, + "grad_norm": 0.7117659449577332, + "learning_rate": 0.0002996645506788056, + "loss": 4.7709, + "step": 84840 + }, + { + "epoch": 0.11922221031795517, + "grad_norm": 0.7033788561820984, + "learning_rate": 0.0002996640587489185, + "loss": 4.654, + "step": 84850 + }, + { + "epoch": 0.11923626125611875, + "grad_norm": 0.710970938205719, + "learning_rate": 0.00029966356645899726, + "loss": 4.6868, + "step": 84860 + }, + { + "epoch": 0.11925031219428232, + "grad_norm": 0.7019822597503662, + "learning_rate": 0.00029966307380904295, + "loss": 4.6512, + "step": 84870 + }, + { + "epoch": 0.1192643631324459, + "grad_norm": 0.7262449860572815, + "learning_rate": 0.00029966258079905687, + "loss": 4.6789, + "step": 84880 + }, + { + "epoch": 0.11927841407060948, + "grad_norm": 0.7337659001350403, + "learning_rate": 0.0002996620874290402, + "loss": 4.7618, + "step": 84890 + }, + { + "epoch": 0.11929246500877305, + "grad_norm": 0.7070848941802979, + "learning_rate": 0.00029966159369899405, + "loss": 4.7406, + "step": 84900 + }, + { + "epoch": 0.11930651594693663, + "grad_norm": 0.7408710718154907, + "learning_rate": 0.00029966109960891964, + "loss": 4.8364, + "step": 84910 + }, + { + "epoch": 0.1193205668851002, + "grad_norm": 0.6921396851539612, + "learning_rate": 0.0002996606051588182, + "loss": 4.767, + "step": 84920 + }, + { + "epoch": 0.11933461782326378, + "grad_norm": 0.7204297780990601, + "learning_rate": 0.00029966011034869084, + "loss": 4.6362, + "step": 84930 + }, + { + "epoch": 0.11934866876142737, + "grad_norm": 0.7283856868743896, + "learning_rate": 0.00029965961517853883, + "loss": 4.6364, + "step": 84940 + }, + { + "epoch": 0.11936271969959095, + "grad_norm": 
0.7073463201522827, + "learning_rate": 0.0002996591196483633, + "loss": 4.7698, + "step": 84950 + }, + { + "epoch": 0.11937677063775452, + "grad_norm": 0.7231873869895935, + "learning_rate": 0.00029965862375816546, + "loss": 4.7165, + "step": 84960 + }, + { + "epoch": 0.1193908215759181, + "grad_norm": 0.7474141716957092, + "learning_rate": 0.00029965812750794654, + "loss": 4.7594, + "step": 84970 + }, + { + "epoch": 0.11940487251408168, + "grad_norm": 0.752204179763794, + "learning_rate": 0.00029965763089770766, + "loss": 4.7098, + "step": 84980 + }, + { + "epoch": 0.11941892345224525, + "grad_norm": 0.6901721954345703, + "learning_rate": 0.0002996571339274501, + "loss": 4.5758, + "step": 84990 + }, + { + "epoch": 0.11943297439040883, + "grad_norm": 0.711621105670929, + "learning_rate": 0.000299656636597175, + "loss": 4.7077, + "step": 85000 + }, + { + "epoch": 0.1194470253285724, + "grad_norm": 0.7142300605773926, + "learning_rate": 0.0002996561389068836, + "loss": 4.7274, + "step": 85010 + }, + { + "epoch": 0.11946107626673598, + "grad_norm": 0.7306178212165833, + "learning_rate": 0.000299655640856577, + "loss": 4.7378, + "step": 85020 + }, + { + "epoch": 0.11947512720489956, + "grad_norm": 0.7129437923431396, + "learning_rate": 0.0002996551424462565, + "loss": 4.679, + "step": 85030 + }, + { + "epoch": 0.11948917814306315, + "grad_norm": 0.7050901055335999, + "learning_rate": 0.00029965464367592327, + "loss": 4.7632, + "step": 85040 + }, + { + "epoch": 0.11950322908122672, + "grad_norm": 0.6919633150100708, + "learning_rate": 0.0002996541445455785, + "loss": 4.629, + "step": 85050 + }, + { + "epoch": 0.1195172800193903, + "grad_norm": 0.705011785030365, + "learning_rate": 0.0002996536450552234, + "loss": 4.639, + "step": 85060 + }, + { + "epoch": 0.11953133095755387, + "grad_norm": 0.7281025648117065, + "learning_rate": 0.0002996531452048592, + "loss": 4.7754, + "step": 85070 + }, + { + "epoch": 0.11954538189571745, + "grad_norm": 0.7474626898765564, + 
"learning_rate": 0.00029965264499448704, + "loss": 4.6266, + "step": 85080 + }, + { + "epoch": 0.11955943283388103, + "grad_norm": 0.7219833731651306, + "learning_rate": 0.00029965214442410817, + "loss": 4.6425, + "step": 85090 + }, + { + "epoch": 0.1195734837720446, + "grad_norm": 0.7083914875984192, + "learning_rate": 0.0002996516434937237, + "loss": 4.7466, + "step": 85100 + }, + { + "epoch": 0.11958753471020818, + "grad_norm": 0.7148147225379944, + "learning_rate": 0.000299651142203335, + "loss": 4.5847, + "step": 85110 + }, + { + "epoch": 0.11960158564837176, + "grad_norm": 0.7254843711853027, + "learning_rate": 0.00029965064055294315, + "loss": 4.6418, + "step": 85120 + }, + { + "epoch": 0.11961563658653533, + "grad_norm": 0.7078420519828796, + "learning_rate": 0.0002996501385425494, + "loss": 4.6431, + "step": 85130 + }, + { + "epoch": 0.11962968752469892, + "grad_norm": 0.7228267192840576, + "learning_rate": 0.00029964963617215497, + "loss": 4.6071, + "step": 85140 + }, + { + "epoch": 0.1196437384628625, + "grad_norm": 0.700187623500824, + "learning_rate": 0.00029964913344176104, + "loss": 4.765, + "step": 85150 + }, + { + "epoch": 0.11965778940102607, + "grad_norm": 0.723201334476471, + "learning_rate": 0.00029964863035136883, + "loss": 4.7408, + "step": 85160 + }, + { + "epoch": 0.11967184033918965, + "grad_norm": 0.7586082220077515, + "learning_rate": 0.00029964812690097954, + "loss": 4.7016, + "step": 85170 + }, + { + "epoch": 0.11968589127735323, + "grad_norm": 0.7144497632980347, + "learning_rate": 0.0002996476230905944, + "loss": 4.6846, + "step": 85180 + }, + { + "epoch": 0.1196999422155168, + "grad_norm": 0.7355106472969055, + "learning_rate": 0.0002996471189202146, + "loss": 4.7114, + "step": 85190 + }, + { + "epoch": 0.11971399315368038, + "grad_norm": 0.7208760380744934, + "learning_rate": 0.0002996466143898414, + "loss": 4.6776, + "step": 85200 + }, + { + "epoch": 0.11972804409184395, + "grad_norm": 0.724746823310852, + "learning_rate": 
0.00029964610949947593, + "loss": 4.8292, + "step": 85210 + }, + { + "epoch": 0.11974209503000753, + "grad_norm": 0.6953820586204529, + "learning_rate": 0.0002996456042491195, + "loss": 4.7256, + "step": 85220 + }, + { + "epoch": 0.11975614596817111, + "grad_norm": 0.7167063355445862, + "learning_rate": 0.0002996450986387733, + "loss": 4.6268, + "step": 85230 + }, + { + "epoch": 0.11977019690633468, + "grad_norm": 0.7081119418144226, + "learning_rate": 0.0002996445926684385, + "loss": 4.644, + "step": 85240 + }, + { + "epoch": 0.11978424784449827, + "grad_norm": 0.7091720700263977, + "learning_rate": 0.0002996440863381163, + "loss": 4.7266, + "step": 85250 + }, + { + "epoch": 0.11979829878266185, + "grad_norm": 0.7209839820861816, + "learning_rate": 0.000299643579647808, + "loss": 4.7454, + "step": 85260 + }, + { + "epoch": 0.11981234972082543, + "grad_norm": 0.6966519951820374, + "learning_rate": 0.00029964307259751474, + "loss": 4.6377, + "step": 85270 + }, + { + "epoch": 0.119826400658989, + "grad_norm": 0.701129674911499, + "learning_rate": 0.00029964256518723786, + "loss": 4.6073, + "step": 85280 + }, + { + "epoch": 0.11984045159715258, + "grad_norm": 0.6923808455467224, + "learning_rate": 0.0002996420574169785, + "loss": 4.6931, + "step": 85290 + }, + { + "epoch": 0.11985450253531615, + "grad_norm": 0.7123744487762451, + "learning_rate": 0.0002996415492867378, + "loss": 4.6669, + "step": 85300 + }, + { + "epoch": 0.11986855347347973, + "grad_norm": 0.7286718487739563, + "learning_rate": 0.0002996410407965171, + "loss": 4.6736, + "step": 85310 + }, + { + "epoch": 0.1198826044116433, + "grad_norm": 0.7242770791053772, + "learning_rate": 0.0002996405319463176, + "loss": 4.6789, + "step": 85320 + }, + { + "epoch": 0.11989665534980688, + "grad_norm": 0.7120243310928345, + "learning_rate": 0.0002996400227361405, + "loss": 4.6072, + "step": 85330 + }, + { + "epoch": 0.11991070628797046, + "grad_norm": 0.7126664519309998, + "learning_rate": 0.00029963951316598706, + 
"loss": 4.7034, + "step": 85340 + }, + { + "epoch": 0.11992475722613405, + "grad_norm": 0.7311067581176758, + "learning_rate": 0.00029963900323585845, + "loss": 4.7734, + "step": 85350 + }, + { + "epoch": 0.11993880816429763, + "grad_norm": 0.736724317073822, + "learning_rate": 0.00029963849294575596, + "loss": 4.6918, + "step": 85360 + }, + { + "epoch": 0.1199528591024612, + "grad_norm": 0.7146928310394287, + "learning_rate": 0.0002996379822956808, + "loss": 4.5878, + "step": 85370 + }, + { + "epoch": 0.11996691004062478, + "grad_norm": 0.6940208673477173, + "learning_rate": 0.00029963747128563417, + "loss": 4.7649, + "step": 85380 + }, + { + "epoch": 0.11998096097878835, + "grad_norm": 0.7303279638290405, + "learning_rate": 0.0002996369599156173, + "loss": 4.6806, + "step": 85390 + }, + { + "epoch": 0.11999501191695193, + "grad_norm": 0.7200376391410828, + "learning_rate": 0.00029963644818563145, + "loss": 4.7124, + "step": 85400 + }, + { + "epoch": 0.1200090628551155, + "grad_norm": 0.6997661590576172, + "learning_rate": 0.00029963593609567783, + "loss": 4.7171, + "step": 85410 + }, + { + "epoch": 0.12002311379327908, + "grad_norm": 0.7147961854934692, + "learning_rate": 0.00029963542364575766, + "loss": 4.5216, + "step": 85420 + }, + { + "epoch": 0.12003716473144266, + "grad_norm": 0.7196703553199768, + "learning_rate": 0.0002996349108358722, + "loss": 4.7912, + "step": 85430 + }, + { + "epoch": 0.12005121566960623, + "grad_norm": 0.7229997515678406, + "learning_rate": 0.0002996343976660227, + "loss": 4.6595, + "step": 85440 + }, + { + "epoch": 0.12006526660776982, + "grad_norm": 0.7053361535072327, + "learning_rate": 0.00029963388413621037, + "loss": 4.6721, + "step": 85450 + }, + { + "epoch": 0.1200793175459334, + "grad_norm": 0.7070173621177673, + "learning_rate": 0.00029963337024643637, + "loss": 4.6046, + "step": 85460 + }, + { + "epoch": 0.12009336848409698, + "grad_norm": 0.6948744654655457, + "learning_rate": 0.0002996328559967021, + "loss": 4.7345, + 
"step": 85470 + }, + { + "epoch": 0.12010741942226055, + "grad_norm": 0.7141450643539429, + "learning_rate": 0.00029963234138700865, + "loss": 4.6867, + "step": 85480 + }, + { + "epoch": 0.12012147036042413, + "grad_norm": 0.7317810654640198, + "learning_rate": 0.0002996318264173573, + "loss": 4.5998, + "step": 85490 + }, + { + "epoch": 0.1201355212985877, + "grad_norm": 0.7293758392333984, + "learning_rate": 0.0002996313110877494, + "loss": 4.7669, + "step": 85500 + }, + { + "epoch": 0.12014957223675128, + "grad_norm": 0.7065700888633728, + "learning_rate": 0.000299630795398186, + "loss": 4.6155, + "step": 85510 + }, + { + "epoch": 0.12016362317491486, + "grad_norm": 0.7263256907463074, + "learning_rate": 0.0002996302793486685, + "loss": 4.6669, + "step": 85520 + }, + { + "epoch": 0.12017767411307843, + "grad_norm": 0.7085826396942139, + "learning_rate": 0.000299629762939198, + "loss": 4.6789, + "step": 85530 + }, + { + "epoch": 0.12019172505124201, + "grad_norm": 0.7257848381996155, + "learning_rate": 0.0002996292461697759, + "loss": 4.7586, + "step": 85540 + }, + { + "epoch": 0.12020577598940559, + "grad_norm": 0.7008543610572815, + "learning_rate": 0.0002996287290404033, + "loss": 4.7479, + "step": 85550 + }, + { + "epoch": 0.12021982692756918, + "grad_norm": 0.6934198141098022, + "learning_rate": 0.00029962821155108153, + "loss": 4.7428, + "step": 85560 + }, + { + "epoch": 0.12023387786573275, + "grad_norm": 0.6932348012924194, + "learning_rate": 0.0002996276937018119, + "loss": 4.701, + "step": 85570 + }, + { + "epoch": 0.12024792880389633, + "grad_norm": 0.6966341733932495, + "learning_rate": 0.0002996271754925954, + "loss": 4.7036, + "step": 85580 + }, + { + "epoch": 0.1202619797420599, + "grad_norm": 0.7221991419792175, + "learning_rate": 0.00029962665692343356, + "loss": 4.6009, + "step": 85590 + }, + { + "epoch": 0.12027603068022348, + "grad_norm": 0.7265183925628662, + "learning_rate": 0.00029962613799432744, + "loss": 4.6958, + "step": 85600 + }, + { + 
"epoch": 0.12029008161838706, + "grad_norm": 0.7104468941688538, + "learning_rate": 0.0002996256187052784, + "loss": 4.6442, + "step": 85610 + }, + { + "epoch": 0.12030413255655063, + "grad_norm": 0.725537896156311, + "learning_rate": 0.0002996250990562877, + "loss": 4.7405, + "step": 85620 + }, + { + "epoch": 0.12031818349471421, + "grad_norm": 0.7155194282531738, + "learning_rate": 0.0002996245790473564, + "loss": 4.7107, + "step": 85630 + }, + { + "epoch": 0.12033223443287779, + "grad_norm": 0.7233233451843262, + "learning_rate": 0.00029962405867848597, + "loss": 4.6326, + "step": 85640 + }, + { + "epoch": 0.12034628537104136, + "grad_norm": 0.7190825939178467, + "learning_rate": 0.0002996235379496776, + "loss": 4.7711, + "step": 85650 + }, + { + "epoch": 0.12036033630920495, + "grad_norm": 0.7331857681274414, + "learning_rate": 0.00029962301686093246, + "loss": 4.7669, + "step": 85660 + }, + { + "epoch": 0.12037438724736853, + "grad_norm": 0.7344217896461487, + "learning_rate": 0.00029962249541225195, + "loss": 4.7563, + "step": 85670 + }, + { + "epoch": 0.1203884381855321, + "grad_norm": 0.706200122833252, + "learning_rate": 0.0002996219736036372, + "loss": 4.7661, + "step": 85680 + }, + { + "epoch": 0.12040248912369568, + "grad_norm": 0.7147502899169922, + "learning_rate": 0.0002996214514350895, + "loss": 4.5101, + "step": 85690 + }, + { + "epoch": 0.12041654006185926, + "grad_norm": 0.7082406878471375, + "learning_rate": 0.0002996209289066101, + "loss": 4.6323, + "step": 85700 + }, + { + "epoch": 0.12043059100002283, + "grad_norm": 0.7128416299819946, + "learning_rate": 0.0002996204060182003, + "loss": 4.6708, + "step": 85710 + }, + { + "epoch": 0.12044464193818641, + "grad_norm": 0.7045454382896423, + "learning_rate": 0.0002996198827698613, + "loss": 4.6902, + "step": 85720 + }, + { + "epoch": 0.12045869287634998, + "grad_norm": 0.7640816569328308, + "learning_rate": 0.0002996193591615944, + "loss": 4.7689, + "step": 85730 + }, + { + "epoch": 
0.12047274381451356, + "grad_norm": 0.7236823439598083, + "learning_rate": 0.00029961883519340085, + "loss": 4.7993, + "step": 85740 + }, + { + "epoch": 0.12048679475267714, + "grad_norm": 0.7274718880653381, + "learning_rate": 0.0002996183108652819, + "loss": 4.7037, + "step": 85750 + }, + { + "epoch": 0.12050084569084073, + "grad_norm": 0.7061973810195923, + "learning_rate": 0.00029961778617723883, + "loss": 4.6896, + "step": 85760 + }, + { + "epoch": 0.1205148966290043, + "grad_norm": 0.7118090391159058, + "learning_rate": 0.00029961726112927284, + "loss": 4.6545, + "step": 85770 + }, + { + "epoch": 0.12052894756716788, + "grad_norm": 0.6875846982002258, + "learning_rate": 0.00029961673572138534, + "loss": 4.6925, + "step": 85780 + }, + { + "epoch": 0.12054299850533146, + "grad_norm": 0.7023685574531555, + "learning_rate": 0.00029961620995357747, + "loss": 4.6063, + "step": 85790 + }, + { + "epoch": 0.12055704944349503, + "grad_norm": 0.7327830195426941, + "learning_rate": 0.0002996156838258505, + "loss": 4.7115, + "step": 85800 + }, + { + "epoch": 0.12057110038165861, + "grad_norm": 0.7080143690109253, + "learning_rate": 0.00029961515733820565, + "loss": 4.7993, + "step": 85810 + }, + { + "epoch": 0.12058515131982218, + "grad_norm": 0.7189124822616577, + "learning_rate": 0.00029961463049064436, + "loss": 4.6521, + "step": 85820 + }, + { + "epoch": 0.12059920225798576, + "grad_norm": 0.7159058451652527, + "learning_rate": 0.0002996141032831677, + "loss": 4.798, + "step": 85830 + }, + { + "epoch": 0.12061325319614934, + "grad_norm": 0.7728943228721619, + "learning_rate": 0.0002996135757157771, + "loss": 4.6605, + "step": 85840 + }, + { + "epoch": 0.12062730413431291, + "grad_norm": 0.9222893118858337, + "learning_rate": 0.0002996130477884737, + "loss": 4.6658, + "step": 85850 + }, + { + "epoch": 0.12064135507247649, + "grad_norm": 0.7004067301750183, + "learning_rate": 0.00029961251950125884, + "loss": 4.7422, + "step": 85860 + }, + { + "epoch": 
0.12065540601064008, + "grad_norm": 0.7140782475471497, + "learning_rate": 0.00029961199085413385, + "loss": 4.6833, + "step": 85870 + }, + { + "epoch": 0.12066945694880366, + "grad_norm": 0.7055903077125549, + "learning_rate": 0.00029961146184709987, + "loss": 4.7995, + "step": 85880 + }, + { + "epoch": 0.12068350788696723, + "grad_norm": 0.7646137475967407, + "learning_rate": 0.00029961093248015826, + "loss": 4.7584, + "step": 85890 + }, + { + "epoch": 0.12069755882513081, + "grad_norm": 0.710187554359436, + "learning_rate": 0.0002996104027533102, + "loss": 4.5475, + "step": 85900 + }, + { + "epoch": 0.12071160976329438, + "grad_norm": 0.7333691120147705, + "learning_rate": 0.00029960987266655707, + "loss": 4.7007, + "step": 85910 + }, + { + "epoch": 0.12072566070145796, + "grad_norm": 0.7043735384941101, + "learning_rate": 0.00029960934221990015, + "loss": 4.7965, + "step": 85920 + }, + { + "epoch": 0.12073971163962154, + "grad_norm": 0.7109283804893494, + "learning_rate": 0.00029960881141334063, + "loss": 4.6432, + "step": 85930 + }, + { + "epoch": 0.12075376257778511, + "grad_norm": 0.6858183145523071, + "learning_rate": 0.00029960828024687983, + "loss": 4.7364, + "step": 85940 + }, + { + "epoch": 0.12076781351594869, + "grad_norm": 0.7360689043998718, + "learning_rate": 0.0002996077487205191, + "loss": 4.5901, + "step": 85950 + }, + { + "epoch": 0.12078186445411226, + "grad_norm": 0.6898038387298584, + "learning_rate": 0.0002996072700390809, + "loss": 4.6589, + "step": 85960 + }, + { + "epoch": 0.12079591539227585, + "grad_norm": 1.0662989616394043, + "learning_rate": 0.00029960673782891364, + "loss": 4.5829, + "step": 85970 + }, + { + "epoch": 0.12080996633043943, + "grad_norm": 0.7156269550323486, + "learning_rate": 0.0002996062052588501, + "loss": 4.722, + "step": 85980 + }, + { + "epoch": 0.120824017268603, + "grad_norm": 0.7196338176727295, + "learning_rate": 0.0002996056723288915, + "loss": 4.655, + "step": 85990 + }, + { + "epoch": 0.12083806820676658, 
+ "grad_norm": 0.6985352039337158, + "learning_rate": 0.0002996051390390392, + "loss": 4.7494, + "step": 86000 + }, + { + "epoch": 0.12085211914493016, + "grad_norm": 0.7045134902000427, + "learning_rate": 0.0002996046053892945, + "loss": 4.7251, + "step": 86010 + }, + { + "epoch": 0.12086617008309374, + "grad_norm": 0.7500072121620178, + "learning_rate": 0.00029960407137965856, + "loss": 4.732, + "step": 86020 + }, + { + "epoch": 0.12088022102125731, + "grad_norm": 0.7060306668281555, + "learning_rate": 0.0002996035370101327, + "loss": 4.7745, + "step": 86030 + }, + { + "epoch": 0.12089427195942089, + "grad_norm": 0.7595226764678955, + "learning_rate": 0.0002996030022807184, + "loss": 4.6405, + "step": 86040 + }, + { + "epoch": 0.12090832289758446, + "grad_norm": 0.696519136428833, + "learning_rate": 0.0002996024671914167, + "loss": 4.724, + "step": 86050 + }, + { + "epoch": 0.12092237383574804, + "grad_norm": 0.7316704988479614, + "learning_rate": 0.000299601931742229, + "loss": 4.5628, + "step": 86060 + }, + { + "epoch": 0.12093642477391163, + "grad_norm": 0.6876169443130493, + "learning_rate": 0.0002996013959331566, + "loss": 4.7434, + "step": 86070 + }, + { + "epoch": 0.1209504757120752, + "grad_norm": 0.7389705777168274, + "learning_rate": 0.00029960085976420066, + "loss": 4.7681, + "step": 86080 + }, + { + "epoch": 0.12096452665023878, + "grad_norm": 0.702412486076355, + "learning_rate": 0.00029960032323536265, + "loss": 4.7209, + "step": 86090 + }, + { + "epoch": 0.12097857758840236, + "grad_norm": 0.6958481669425964, + "learning_rate": 0.0002995997863466437, + "loss": 4.8133, + "step": 86100 + }, + { + "epoch": 0.12099262852656593, + "grad_norm": 0.717162013053894, + "learning_rate": 0.00029959924909804525, + "loss": 4.7312, + "step": 86110 + }, + { + "epoch": 0.12100667946472951, + "grad_norm": 0.7261740565299988, + "learning_rate": 0.00029959871148956853, + "loss": 4.6485, + "step": 86120 + }, + { + "epoch": 0.12102073040289309, + "grad_norm": 
0.7102240324020386, + "learning_rate": 0.00029959817352121485, + "loss": 4.8627, + "step": 86130 + }, + { + "epoch": 0.12103478134105666, + "grad_norm": 0.7106133699417114, + "learning_rate": 0.0002995976351929854, + "loss": 4.7682, + "step": 86140 + }, + { + "epoch": 0.12104883227922024, + "grad_norm": 0.6954272389411926, + "learning_rate": 0.00029959709650488164, + "loss": 4.6691, + "step": 86150 + }, + { + "epoch": 0.12106288321738382, + "grad_norm": 0.7176921367645264, + "learning_rate": 0.0002995965574569047, + "loss": 4.6487, + "step": 86160 + }, + { + "epoch": 0.12107693415554739, + "grad_norm": 0.8005315661430359, + "learning_rate": 0.000299596018049056, + "loss": 4.735, + "step": 86170 + }, + { + "epoch": 0.12109098509371098, + "grad_norm": 0.7331059575080872, + "learning_rate": 0.0002995954782813368, + "loss": 4.7956, + "step": 86180 + }, + { + "epoch": 0.12110503603187456, + "grad_norm": 0.7373118996620178, + "learning_rate": 0.0002995949381537484, + "loss": 4.584, + "step": 86190 + }, + { + "epoch": 0.12111908697003813, + "grad_norm": 0.7049098610877991, + "learning_rate": 0.00029959439766629207, + "loss": 4.6658, + "step": 86200 + }, + { + "epoch": 0.12113313790820171, + "grad_norm": 0.7197480797767639, + "learning_rate": 0.00029959385681896914, + "loss": 4.8936, + "step": 86210 + }, + { + "epoch": 0.12114718884636529, + "grad_norm": 0.7270209193229675, + "learning_rate": 0.00029959331561178087, + "loss": 4.6427, + "step": 86220 + }, + { + "epoch": 0.12116123978452886, + "grad_norm": 0.7451220750808716, + "learning_rate": 0.0002995927740447287, + "loss": 4.7832, + "step": 86230 + }, + { + "epoch": 0.12117529072269244, + "grad_norm": 0.7173677682876587, + "learning_rate": 0.00029959223211781374, + "loss": 4.6468, + "step": 86240 + }, + { + "epoch": 0.12118934166085601, + "grad_norm": 0.7185055613517761, + "learning_rate": 0.0002995916898310374, + "loss": 4.7216, + "step": 86250 + }, + { + "epoch": 0.12120339259901959, + "grad_norm": 0.729455828666687, + 
"learning_rate": 0.00029959114718440094, + "loss": 4.7083, + "step": 86260 + }, + { + "epoch": 0.12121744353718317, + "grad_norm": 0.70658940076828, + "learning_rate": 0.0002995906041779057, + "loss": 4.6727, + "step": 86270 + }, + { + "epoch": 0.12123149447534676, + "grad_norm": 0.7007445693016052, + "learning_rate": 0.00029959006081155303, + "loss": 4.7122, + "step": 86280 + }, + { + "epoch": 0.12124554541351033, + "grad_norm": 0.7127397656440735, + "learning_rate": 0.00029958951708534417, + "loss": 4.6473, + "step": 86290 + }, + { + "epoch": 0.12125959635167391, + "grad_norm": 0.7260735630989075, + "learning_rate": 0.0002995889729992805, + "loss": 4.6155, + "step": 86300 + }, + { + "epoch": 0.12127364728983749, + "grad_norm": 0.7390902638435364, + "learning_rate": 0.00029958842855336314, + "loss": 4.6914, + "step": 86310 + }, + { + "epoch": 0.12128769822800106, + "grad_norm": 0.7290669083595276, + "learning_rate": 0.00029958788374759364, + "loss": 4.6071, + "step": 86320 + }, + { + "epoch": 0.12130174916616464, + "grad_norm": 0.7232752442359924, + "learning_rate": 0.00029958733858197314, + "loss": 4.5553, + "step": 86330 + }, + { + "epoch": 0.12131580010432821, + "grad_norm": 0.7251487970352173, + "learning_rate": 0.0002995867930565031, + "loss": 4.6294, + "step": 86340 + }, + { + "epoch": 0.12132985104249179, + "grad_norm": 0.7375115156173706, + "learning_rate": 0.00029958624717118463, + "loss": 4.7586, + "step": 86350 + }, + { + "epoch": 0.12134390198065537, + "grad_norm": 0.7383313775062561, + "learning_rate": 0.0002995857009260192, + "loss": 4.7191, + "step": 86360 + }, + { + "epoch": 0.12135795291881894, + "grad_norm": 0.730225145816803, + "learning_rate": 0.0002995851543210081, + "loss": 4.6134, + "step": 86370 + }, + { + "epoch": 0.12137200385698253, + "grad_norm": 1.1462152004241943, + "learning_rate": 0.00029958460735615266, + "loss": 4.6346, + "step": 86380 + }, + { + "epoch": 0.12138605479514611, + "grad_norm": 0.7488716840744019, + "learning_rate": 
0.00029958406003145413, + "loss": 4.7212, + "step": 86390 + }, + { + "epoch": 0.12140010573330969, + "grad_norm": 0.7324610352516174, + "learning_rate": 0.0002995835123469139, + "loss": 4.6655, + "step": 86400 + }, + { + "epoch": 0.12141415667147326, + "grad_norm": 0.7316756248474121, + "learning_rate": 0.0002995829643025332, + "loss": 4.5674, + "step": 86410 + }, + { + "epoch": 0.12142820760963684, + "grad_norm": 0.7131539583206177, + "learning_rate": 0.0002995824158983134, + "loss": 4.7464, + "step": 86420 + }, + { + "epoch": 0.12144225854780041, + "grad_norm": 0.7157332897186279, + "learning_rate": 0.0002995818671342559, + "loss": 4.7049, + "step": 86430 + }, + { + "epoch": 0.12145630948596399, + "grad_norm": 0.7550666332244873, + "learning_rate": 0.0002995813180103618, + "loss": 4.5843, + "step": 86440 + }, + { + "epoch": 0.12147036042412757, + "grad_norm": 0.7143164277076721, + "learning_rate": 0.00029958076852663266, + "loss": 4.6484, + "step": 86450 + }, + { + "epoch": 0.12148441136229114, + "grad_norm": 0.7037560343742371, + "learning_rate": 0.0002995802186830697, + "loss": 4.806, + "step": 86460 + }, + { + "epoch": 0.12149846230045472, + "grad_norm": 0.710610032081604, + "learning_rate": 0.0002995796684796742, + "loss": 4.7, + "step": 86470 + }, + { + "epoch": 0.1215125132386183, + "grad_norm": 0.7024252414703369, + "learning_rate": 0.0002995791179164475, + "loss": 4.6566, + "step": 86480 + }, + { + "epoch": 0.12152656417678188, + "grad_norm": 0.7062261700630188, + "learning_rate": 0.00029957856699339104, + "loss": 4.69, + "step": 86490 + }, + { + "epoch": 0.12154061511494546, + "grad_norm": 0.7243465185165405, + "learning_rate": 0.00029957801571050597, + "loss": 4.7999, + "step": 86500 + }, + { + "epoch": 0.12155466605310904, + "grad_norm": 0.7086811661720276, + "learning_rate": 0.0002995774640677937, + "loss": 4.6722, + "step": 86510 + }, + { + "epoch": 0.12156871699127261, + "grad_norm": 0.7045823931694031, + "learning_rate": 0.0002995769672817016, + 
"loss": 4.8183, + "step": 86520 + }, + { + "epoch": 0.12158276792943619, + "grad_norm": 0.7064098119735718, + "learning_rate": 0.0002995764149553213, + "loss": 4.7609, + "step": 86530 + }, + { + "epoch": 0.12159681886759977, + "grad_norm": 0.7484224438667297, + "learning_rate": 0.0002995758622691177, + "loss": 4.681, + "step": 86540 + }, + { + "epoch": 0.12161086980576334, + "grad_norm": 0.6937485933303833, + "learning_rate": 0.0002995753092230921, + "loss": 4.7115, + "step": 86550 + }, + { + "epoch": 0.12162492074392692, + "grad_norm": 0.7272822856903076, + "learning_rate": 0.00029957475581724586, + "loss": 4.5993, + "step": 86560 + }, + { + "epoch": 0.1216389716820905, + "grad_norm": 0.696310818195343, + "learning_rate": 0.0002995742020515802, + "loss": 4.7079, + "step": 86570 + }, + { + "epoch": 0.12165302262025407, + "grad_norm": 0.7192110419273376, + "learning_rate": 0.0002995736479260965, + "loss": 4.7158, + "step": 86580 + }, + { + "epoch": 0.12166707355841766, + "grad_norm": 0.7095620632171631, + "learning_rate": 0.0002995730934407961, + "loss": 4.6992, + "step": 86590 + }, + { + "epoch": 0.12168112449658124, + "grad_norm": 0.6891617178916931, + "learning_rate": 0.0002995725385956804, + "loss": 4.7227, + "step": 86600 + }, + { + "epoch": 0.12169517543474481, + "grad_norm": 0.7383580207824707, + "learning_rate": 0.0002995719833907506, + "loss": 4.7221, + "step": 86610 + }, + { + "epoch": 0.12170922637290839, + "grad_norm": 0.7053825259208679, + "learning_rate": 0.0002995714278260082, + "loss": 4.6793, + "step": 86620 + }, + { + "epoch": 0.12172327731107196, + "grad_norm": 0.7410792708396912, + "learning_rate": 0.0002995708719014544, + "loss": 4.769, + "step": 86630 + }, + { + "epoch": 0.12173732824923554, + "grad_norm": 0.711789071559906, + "learning_rate": 0.0002995703156170906, + "loss": 4.7196, + "step": 86640 + }, + { + "epoch": 0.12175137918739912, + "grad_norm": 0.6985942125320435, + "learning_rate": 0.0002995697589729181, + "loss": 4.6477, + "step": 
86650 + }, + { + "epoch": 0.1217654301255627, + "grad_norm": 0.6995700001716614, + "learning_rate": 0.00029956920196893834, + "loss": 4.5672, + "step": 86660 + }, + { + "epoch": 0.12177948106372627, + "grad_norm": 0.7413903474807739, + "learning_rate": 0.0002995686446051525, + "loss": 4.7654, + "step": 86670 + }, + { + "epoch": 0.12179353200188985, + "grad_norm": 0.7320995926856995, + "learning_rate": 0.00029956808688156206, + "loss": 4.7567, + "step": 86680 + }, + { + "epoch": 0.12180758294005344, + "grad_norm": 0.7222046852111816, + "learning_rate": 0.0002995675287981683, + "loss": 4.6457, + "step": 86690 + }, + { + "epoch": 0.12182163387821701, + "grad_norm": 0.711199939250946, + "learning_rate": 0.00029956697035497256, + "loss": 4.6399, + "step": 86700 + }, + { + "epoch": 0.12183568481638059, + "grad_norm": 0.7224550247192383, + "learning_rate": 0.00029956641155197617, + "loss": 4.6731, + "step": 86710 + }, + { + "epoch": 0.12184973575454416, + "grad_norm": 0.6902201175689697, + "learning_rate": 0.00029956585238918057, + "loss": 4.6802, + "step": 86720 + }, + { + "epoch": 0.12186378669270774, + "grad_norm": 0.7383785247802734, + "learning_rate": 0.00029956529286658695, + "loss": 4.7232, + "step": 86730 + }, + { + "epoch": 0.12187783763087132, + "grad_norm": 0.7981114387512207, + "learning_rate": 0.00029956473298419677, + "loss": 4.7792, + "step": 86740 + }, + { + "epoch": 0.12189188856903489, + "grad_norm": 0.7090650200843811, + "learning_rate": 0.00029956417274201134, + "loss": 4.7199, + "step": 86750 + }, + { + "epoch": 0.12190593950719847, + "grad_norm": 0.7152296900749207, + "learning_rate": 0.00029956361214003205, + "loss": 4.6483, + "step": 86760 + }, + { + "epoch": 0.12191999044536204, + "grad_norm": 0.7130936980247498, + "learning_rate": 0.0002995630511782602, + "loss": 4.6469, + "step": 86770 + }, + { + "epoch": 0.12193404138352562, + "grad_norm": 0.7285060286521912, + "learning_rate": 0.0002995624898566971, + "loss": 4.7693, + "step": 86780 + }, + { + 
"epoch": 0.1219480923216892, + "grad_norm": 0.7463589906692505, + "learning_rate": 0.0002995619281753442, + "loss": 4.8208, + "step": 86790 + }, + { + "epoch": 0.12196214325985279, + "grad_norm": 0.7308253049850464, + "learning_rate": 0.00029956136613420276, + "loss": 4.7281, + "step": 86800 + }, + { + "epoch": 0.12197619419801636, + "grad_norm": 0.7179209589958191, + "learning_rate": 0.0002995608037332742, + "loss": 4.672, + "step": 86810 + }, + { + "epoch": 0.12199024513617994, + "grad_norm": 0.731372594833374, + "learning_rate": 0.00029956024097255987, + "loss": 4.8004, + "step": 86820 + }, + { + "epoch": 0.12200429607434352, + "grad_norm": 0.7444266080856323, + "learning_rate": 0.00029955967785206107, + "loss": 4.5729, + "step": 86830 + }, + { + "epoch": 0.12201834701250709, + "grad_norm": 0.7302293181419373, + "learning_rate": 0.0002995591143717792, + "loss": 4.662, + "step": 86840 + }, + { + "epoch": 0.12203239795067067, + "grad_norm": 0.7154730558395386, + "learning_rate": 0.0002995585505317156, + "loss": 4.5486, + "step": 86850 + }, + { + "epoch": 0.12204644888883424, + "grad_norm": 0.701775074005127, + "learning_rate": 0.00029955798633187164, + "loss": 4.5835, + "step": 86860 + }, + { + "epoch": 0.12206049982699782, + "grad_norm": 0.6990428566932678, + "learning_rate": 0.0002995574217722487, + "loss": 4.6875, + "step": 86870 + }, + { + "epoch": 0.1220745507651614, + "grad_norm": 0.7048460245132446, + "learning_rate": 0.000299556856852848, + "loss": 4.6615, + "step": 86880 + }, + { + "epoch": 0.12208860170332497, + "grad_norm": 0.7063455581665039, + "learning_rate": 0.00029955629157367105, + "loss": 4.8373, + "step": 86890 + }, + { + "epoch": 0.12210265264148856, + "grad_norm": 0.7515528202056885, + "learning_rate": 0.0002995557259347192, + "loss": 4.7006, + "step": 86900 + }, + { + "epoch": 0.12211670357965214, + "grad_norm": 0.7269037365913391, + "learning_rate": 0.0002995551599359937, + "loss": 4.7502, + "step": 86910 + }, + { + "epoch": 
0.12213075451781572, + "grad_norm": 0.7405303716659546, + "learning_rate": 0.0002995545935774961, + "loss": 4.6159, + "step": 86920 + }, + { + "epoch": 0.12214480545597929, + "grad_norm": 0.7235696315765381, + "learning_rate": 0.00029955402685922757, + "loss": 4.7358, + "step": 86930 + }, + { + "epoch": 0.12215885639414287, + "grad_norm": 0.711112916469574, + "learning_rate": 0.0002995534597811895, + "loss": 4.7783, + "step": 86940 + }, + { + "epoch": 0.12217290733230644, + "grad_norm": 0.7069167494773865, + "learning_rate": 0.0002995528923433834, + "loss": 4.4935, + "step": 86950 + }, + { + "epoch": 0.12218695827047002, + "grad_norm": 0.7162838578224182, + "learning_rate": 0.0002995523245458105, + "loss": 4.5865, + "step": 86960 + }, + { + "epoch": 0.1222010092086336, + "grad_norm": 0.7079212069511414, + "learning_rate": 0.0002995517563884722, + "loss": 4.6902, + "step": 86970 + }, + { + "epoch": 0.12221506014679717, + "grad_norm": 0.7136911749839783, + "learning_rate": 0.0002995511878713699, + "loss": 4.7081, + "step": 86980 + }, + { + "epoch": 0.12222911108496075, + "grad_norm": 0.7026124000549316, + "learning_rate": 0.0002995506189945049, + "loss": 4.7789, + "step": 86990 + }, + { + "epoch": 0.12224316202312434, + "grad_norm": 0.8226521611213684, + "learning_rate": 0.00029955004975787863, + "loss": 4.6564, + "step": 87000 + }, + { + "epoch": 0.12225721296128791, + "grad_norm": 0.7236669659614563, + "learning_rate": 0.0002995494801614924, + "loss": 4.785, + "step": 87010 + }, + { + "epoch": 0.12227126389945149, + "grad_norm": 0.7194319367408752, + "learning_rate": 0.0002995489102053477, + "loss": 4.7189, + "step": 87020 + }, + { + "epoch": 0.12228531483761507, + "grad_norm": 0.710686445236206, + "learning_rate": 0.00029954833988944576, + "loss": 4.5753, + "step": 87030 + }, + { + "epoch": 0.12229936577577864, + "grad_norm": 0.7050729393959045, + "learning_rate": 0.000299547769213788, + "loss": 4.4799, + "step": 87040 + }, + { + "epoch": 0.12231341671394222, + 
"grad_norm": 0.7316957712173462, + "learning_rate": 0.00029954719817837583, + "loss": 4.772, + "step": 87050 + }, + { + "epoch": 0.1223274676521058, + "grad_norm": 0.6963942050933838, + "learning_rate": 0.0002995466267832106, + "loss": 4.8665, + "step": 87060 + }, + { + "epoch": 0.12234151859026937, + "grad_norm": 0.7359170913696289, + "learning_rate": 0.00029954605502829364, + "loss": 4.6619, + "step": 87070 + }, + { + "epoch": 0.12235556952843295, + "grad_norm": 0.6984687447547913, + "learning_rate": 0.0002995454829136264, + "loss": 4.7002, + "step": 87080 + }, + { + "epoch": 0.12236962046659652, + "grad_norm": 0.7410992383956909, + "learning_rate": 0.00029954491043921025, + "loss": 4.7953, + "step": 87090 + }, + { + "epoch": 0.1223836714047601, + "grad_norm": 0.7628210186958313, + "learning_rate": 0.0002995443376050465, + "loss": 4.5949, + "step": 87100 + }, + { + "epoch": 0.12239772234292369, + "grad_norm": 0.7284057140350342, + "learning_rate": 0.0002995437644111366, + "loss": 4.6736, + "step": 87110 + }, + { + "epoch": 0.12241177328108727, + "grad_norm": 0.706147313117981, + "learning_rate": 0.0002995431908574819, + "loss": 4.7252, + "step": 87120 + }, + { + "epoch": 0.12242582421925084, + "grad_norm": 0.7244818806648254, + "learning_rate": 0.00029954261694408374, + "loss": 4.6662, + "step": 87130 + }, + { + "epoch": 0.12243987515741442, + "grad_norm": 0.7074465155601501, + "learning_rate": 0.0002995420426709436, + "loss": 4.7113, + "step": 87140 + }, + { + "epoch": 0.122453926095578, + "grad_norm": 0.7343944311141968, + "learning_rate": 0.0002995414680380627, + "loss": 4.6968, + "step": 87150 + }, + { + "epoch": 0.12246797703374157, + "grad_norm": 0.7110148668289185, + "learning_rate": 0.0002995408930454426, + "loss": 4.7478, + "step": 87160 + }, + { + "epoch": 0.12248202797190515, + "grad_norm": 0.6801456809043884, + "learning_rate": 0.0002995403176930846, + "loss": 4.6153, + "step": 87170 + }, + { + "epoch": 0.12249607891006872, + "grad_norm": 
0.7084956169128418, + "learning_rate": 0.0002995397419809901, + "loss": 4.7813, + "step": 87180 + }, + { + "epoch": 0.1225101298482323, + "grad_norm": 0.7134016156196594, + "learning_rate": 0.0002995391659091604, + "loss": 4.7072, + "step": 87190 + }, + { + "epoch": 0.12252418078639588, + "grad_norm": 0.7109416723251343, + "learning_rate": 0.000299538589477597, + "loss": 4.6814, + "step": 87200 + }, + { + "epoch": 0.12253823172455947, + "grad_norm": 0.7339860200881958, + "learning_rate": 0.00029953801268630125, + "loss": 4.7426, + "step": 87210 + }, + { + "epoch": 0.12255228266272304, + "grad_norm": 0.7281706929206848, + "learning_rate": 0.00029953743553527457, + "loss": 4.6708, + "step": 87220 + }, + { + "epoch": 0.12256633360088662, + "grad_norm": 0.7278706431388855, + "learning_rate": 0.0002995368580245183, + "loss": 4.6032, + "step": 87230 + }, + { + "epoch": 0.1225803845390502, + "grad_norm": 0.6926674842834473, + "learning_rate": 0.00029953628015403375, + "loss": 4.7332, + "step": 87240 + }, + { + "epoch": 0.12259443547721377, + "grad_norm": 0.6893281936645508, + "learning_rate": 0.00029953570192382253, + "loss": 4.6901, + "step": 87250 + }, + { + "epoch": 0.12260848641537735, + "grad_norm": 0.7695411443710327, + "learning_rate": 0.0002995351233338858, + "loss": 4.6268, + "step": 87260 + }, + { + "epoch": 0.12262253735354092, + "grad_norm": 0.7140504121780396, + "learning_rate": 0.00029953454438422515, + "loss": 4.6299, + "step": 87270 + }, + { + "epoch": 0.1226365882917045, + "grad_norm": 0.7018819451332092, + "learning_rate": 0.0002995339650748418, + "loss": 4.7092, + "step": 87280 + }, + { + "epoch": 0.12265063922986807, + "grad_norm": 0.7678191065788269, + "learning_rate": 0.00029953338540573727, + "loss": 4.7711, + "step": 87290 + }, + { + "epoch": 0.12266469016803165, + "grad_norm": 0.7389617562294006, + "learning_rate": 0.00029953280537691287, + "loss": 4.7441, + "step": 87300 + }, + { + "epoch": 0.12267874110619524, + "grad_norm": 0.7068891525268555, 
+ "learning_rate": 0.00029953222498837003, + "loss": 4.6663, + "step": 87310 + }, + { + "epoch": 0.12269279204435882, + "grad_norm": 0.7148218750953674, + "learning_rate": 0.00029953164424011017, + "loss": 4.7392, + "step": 87320 + }, + { + "epoch": 0.1227068429825224, + "grad_norm": 0.68925541639328, + "learning_rate": 0.0002995310631321347, + "loss": 4.5429, + "step": 87330 + }, + { + "epoch": 0.12272089392068597, + "grad_norm": 0.7171045541763306, + "learning_rate": 0.00029953048166444493, + "loss": 4.7119, + "step": 87340 + }, + { + "epoch": 0.12273494485884955, + "grad_norm": 0.7084298133850098, + "learning_rate": 0.0002995298998370423, + "loss": 4.7834, + "step": 87350 + }, + { + "epoch": 0.12274899579701312, + "grad_norm": 0.7399872541427612, + "learning_rate": 0.00029952931764992826, + "loss": 4.6349, + "step": 87360 + }, + { + "epoch": 0.1227630467351767, + "grad_norm": 0.7395424246788025, + "learning_rate": 0.00029952873510310417, + "loss": 4.6944, + "step": 87370 + }, + { + "epoch": 0.12277709767334027, + "grad_norm": 0.7185593843460083, + "learning_rate": 0.00029952815219657145, + "loss": 4.6208, + "step": 87380 + }, + { + "epoch": 0.12279114861150385, + "grad_norm": 0.7182762622833252, + "learning_rate": 0.0002995275689303314, + "loss": 4.536, + "step": 87390 + }, + { + "epoch": 0.12280519954966743, + "grad_norm": 0.7268804311752319, + "learning_rate": 0.0002995269853043856, + "loss": 4.6515, + "step": 87400 + }, + { + "epoch": 0.122819250487831, + "grad_norm": 0.7242562174797058, + "learning_rate": 0.0002995264013187353, + "loss": 4.7664, + "step": 87410 + }, + { + "epoch": 0.12283330142599459, + "grad_norm": 0.7390630841255188, + "learning_rate": 0.000299525816973382, + "loss": 4.7246, + "step": 87420 + }, + { + "epoch": 0.12284735236415817, + "grad_norm": 0.7184797525405884, + "learning_rate": 0.00029952523226832705, + "loss": 4.6634, + "step": 87430 + }, + { + "epoch": 0.12286140330232175, + "grad_norm": 0.7052783370018005, + "learning_rate": 
0.0002995246472035719, + "loss": 4.6488, + "step": 87440 + }, + { + "epoch": 0.12287545424048532, + "grad_norm": 0.7244933247566223, + "learning_rate": 0.0002995240617791179, + "loss": 4.655, + "step": 87450 + }, + { + "epoch": 0.1228895051786489, + "grad_norm": 0.6974210739135742, + "learning_rate": 0.00029952347599496653, + "loss": 4.6001, + "step": 87460 + }, + { + "epoch": 0.12290355611681247, + "grad_norm": 0.7236213088035583, + "learning_rate": 0.00029952288985111915, + "loss": 4.7134, + "step": 87470 + }, + { + "epoch": 0.12291760705497605, + "grad_norm": 0.7080331444740295, + "learning_rate": 0.0002995223033475772, + "loss": 4.6978, + "step": 87480 + }, + { + "epoch": 0.12293165799313963, + "grad_norm": 0.7316338419914246, + "learning_rate": 0.00029952171648434204, + "loss": 4.601, + "step": 87490 + }, + { + "epoch": 0.1229457089313032, + "grad_norm": 0.687455415725708, + "learning_rate": 0.0002995211292614151, + "loss": 4.5823, + "step": 87500 + }, + { + "epoch": 0.12295975986946678, + "grad_norm": 0.7364456653594971, + "learning_rate": 0.0002995205416787979, + "loss": 4.6627, + "step": 87510 + }, + { + "epoch": 0.12297381080763037, + "grad_norm": 0.7389670014381409, + "learning_rate": 0.00029951995373649174, + "loss": 4.6651, + "step": 87520 + }, + { + "epoch": 0.12298786174579394, + "grad_norm": 0.6907218098640442, + "learning_rate": 0.000299519365434498, + "loss": 4.6806, + "step": 87530 + }, + { + "epoch": 0.12300191268395752, + "grad_norm": 0.7170994281768799, + "learning_rate": 0.0002995187767728182, + "loss": 4.6415, + "step": 87540 + }, + { + "epoch": 0.1230159636221211, + "grad_norm": 0.7091425061225891, + "learning_rate": 0.0002995181877514537, + "loss": 4.669, + "step": 87550 + }, + { + "epoch": 0.12303001456028467, + "grad_norm": 0.7479362487792969, + "learning_rate": 0.0002995175983704059, + "loss": 4.6114, + "step": 87560 + }, + { + "epoch": 0.12304406549844825, + "grad_norm": 0.7046723961830139, + "learning_rate": 0.00029951700862967625, + 
"loss": 4.6076, + "step": 87570 + }, + { + "epoch": 0.12305811643661183, + "grad_norm": 0.7411142587661743, + "learning_rate": 0.0002995164185292662, + "loss": 4.6606, + "step": 87580 + }, + { + "epoch": 0.1230721673747754, + "grad_norm": 0.7122248411178589, + "learning_rate": 0.00029951582806917707, + "loss": 4.8045, + "step": 87590 + }, + { + "epoch": 0.12308621831293898, + "grad_norm": 0.7476603984832764, + "learning_rate": 0.00029951523724941037, + "loss": 4.6634, + "step": 87600 + }, + { + "epoch": 0.12310026925110255, + "grad_norm": 0.7404193878173828, + "learning_rate": 0.0002995146460699675, + "loss": 4.6486, + "step": 87610 + }, + { + "epoch": 0.12311432018926614, + "grad_norm": 0.707002580165863, + "learning_rate": 0.0002995140545308499, + "loss": 4.7272, + "step": 87620 + }, + { + "epoch": 0.12312837112742972, + "grad_norm": 0.762656569480896, + "learning_rate": 0.0002995134626320589, + "loss": 4.6558, + "step": 87630 + }, + { + "epoch": 0.1231424220655933, + "grad_norm": 0.7162299752235413, + "learning_rate": 0.00029951287037359605, + "loss": 4.6148, + "step": 87640 + }, + { + "epoch": 0.12315647300375687, + "grad_norm": 0.7232511043548584, + "learning_rate": 0.0002995122777554627, + "loss": 4.6304, + "step": 87650 + }, + { + "epoch": 0.12317052394192045, + "grad_norm": 0.7200028896331787, + "learning_rate": 0.00029951168477766033, + "loss": 4.7234, + "step": 87660 + }, + { + "epoch": 0.12318457488008402, + "grad_norm": 0.705317497253418, + "learning_rate": 0.00029951109144019024, + "loss": 4.6568, + "step": 87670 + }, + { + "epoch": 0.1231986258182476, + "grad_norm": 0.7253167033195496, + "learning_rate": 0.000299510497743054, + "loss": 4.7056, + "step": 87680 + }, + { + "epoch": 0.12321267675641118, + "grad_norm": 0.7323938012123108, + "learning_rate": 0.000299509903686253, + "loss": 4.6481, + "step": 87690 + }, + { + "epoch": 0.12322672769457475, + "grad_norm": 0.6936154961585999, + "learning_rate": 0.00029950930926978865, + "loss": 4.7263, + "step": 
87700 + }, + { + "epoch": 0.12324077863273833, + "grad_norm": 0.7268804311752319, + "learning_rate": 0.0002995087144936624, + "loss": 4.6468, + "step": 87710 + }, + { + "epoch": 0.1232548295709019, + "grad_norm": 0.7117732167243958, + "learning_rate": 0.0002995081193578756, + "loss": 4.7268, + "step": 87720 + }, + { + "epoch": 0.1232688805090655, + "grad_norm": 0.7191809415817261, + "learning_rate": 0.0002995075238624298, + "loss": 4.7574, + "step": 87730 + }, + { + "epoch": 0.12328293144722907, + "grad_norm": 0.7095280885696411, + "learning_rate": 0.0002995069280073263, + "loss": 4.689, + "step": 87740 + }, + { + "epoch": 0.12329698238539265, + "grad_norm": 0.7109695076942444, + "learning_rate": 0.0002995063317925667, + "loss": 4.6289, + "step": 87750 + }, + { + "epoch": 0.12331103332355622, + "grad_norm": 0.7158311605453491, + "learning_rate": 0.0002995057352181523, + "loss": 4.6884, + "step": 87760 + }, + { + "epoch": 0.1233250842617198, + "grad_norm": 0.7213734984397888, + "learning_rate": 0.00029950513828408464, + "loss": 4.8162, + "step": 87770 + }, + { + "epoch": 0.12333913519988338, + "grad_norm": 0.7377204895019531, + "learning_rate": 0.00029950454099036505, + "loss": 4.6634, + "step": 87780 + }, + { + "epoch": 0.12335318613804695, + "grad_norm": 0.7066074013710022, + "learning_rate": 0.000299503943336995, + "loss": 4.6969, + "step": 87790 + }, + { + "epoch": 0.12336723707621053, + "grad_norm": 0.7790802121162415, + "learning_rate": 0.00029950334532397597, + "loss": 4.7552, + "step": 87800 + }, + { + "epoch": 0.1233812880143741, + "grad_norm": 0.7286075949668884, + "learning_rate": 0.0002995027469513094, + "loss": 4.6512, + "step": 87810 + }, + { + "epoch": 0.12339533895253768, + "grad_norm": 0.7531540989875793, + "learning_rate": 0.00029950214821899664, + "loss": 4.7612, + "step": 87820 + }, + { + "epoch": 0.12340938989070127, + "grad_norm": 0.7376166582107544, + "learning_rate": 0.00029950154912703923, + "loss": 4.6591, + "step": 87830 + }, + { + 
"epoch": 0.12342344082886485, + "grad_norm": 0.7482818365097046, + "learning_rate": 0.0002995009496754385, + "loss": 4.5681, + "step": 87840 + }, + { + "epoch": 0.12343749176702842, + "grad_norm": 0.7085266709327698, + "learning_rate": 0.00029950034986419606, + "loss": 4.7676, + "step": 87850 + }, + { + "epoch": 0.123451542705192, + "grad_norm": 0.7395931482315063, + "learning_rate": 0.0002994997496933132, + "loss": 4.6275, + "step": 87860 + }, + { + "epoch": 0.12346559364335558, + "grad_norm": 0.7464209198951721, + "learning_rate": 0.0002994991491627914, + "loss": 4.746, + "step": 87870 + }, + { + "epoch": 0.12347964458151915, + "grad_norm": 0.7421188950538635, + "learning_rate": 0.0002994985482726322, + "loss": 4.6564, + "step": 87880 + }, + { + "epoch": 0.12349369551968273, + "grad_norm": 0.7147485017776489, + "learning_rate": 0.0002994979470228369, + "loss": 4.6841, + "step": 87890 + }, + { + "epoch": 0.1235077464578463, + "grad_norm": 0.7064347267150879, + "learning_rate": 0.0002994973454134071, + "loss": 4.6705, + "step": 87900 + }, + { + "epoch": 0.12352179739600988, + "grad_norm": 0.6962974071502686, + "learning_rate": 0.00029949674344434407, + "loss": 4.5298, + "step": 87910 + }, + { + "epoch": 0.12353584833417346, + "grad_norm": 0.7260435223579407, + "learning_rate": 0.0002994961411156494, + "loss": 4.8508, + "step": 87920 + }, + { + "epoch": 0.12354989927233705, + "grad_norm": 0.7157175540924072, + "learning_rate": 0.00029949553842732447, + "loss": 4.5779, + "step": 87930 + }, + { + "epoch": 0.12356395021050062, + "grad_norm": 0.7105233669281006, + "learning_rate": 0.0002994949353793708, + "loss": 4.6836, + "step": 87940 + }, + { + "epoch": 0.1235780011486642, + "grad_norm": 0.7243243455886841, + "learning_rate": 0.0002994943319717897, + "loss": 4.6184, + "step": 87950 + }, + { + "epoch": 0.12359205208682777, + "grad_norm": 0.7264546751976013, + "learning_rate": 0.0002994937282045828, + "loss": 4.6754, + "step": 87960 + }, + { + "epoch": 
0.12360610302499135, + "grad_norm": 0.7493976354598999, + "learning_rate": 0.00029949312407775145, + "loss": 4.709, + "step": 87970 + }, + { + "epoch": 0.12362015396315493, + "grad_norm": 0.7562281489372253, + "learning_rate": 0.0002994925195912971, + "loss": 4.6625, + "step": 87980 + }, + { + "epoch": 0.1236342049013185, + "grad_norm": 0.7009782195091248, + "learning_rate": 0.00029949191474522123, + "loss": 4.6659, + "step": 87990 + }, + { + "epoch": 0.12364825583948208, + "grad_norm": 0.7240691781044006, + "learning_rate": 0.0002994913095395253, + "loss": 4.6298, + "step": 88000 + }, + { + "epoch": 0.12366230677764566, + "grad_norm": 0.7109857797622681, + "learning_rate": 0.0002994907039742107, + "loss": 4.829, + "step": 88010 + }, + { + "epoch": 0.12367635771580923, + "grad_norm": 0.734104335308075, + "learning_rate": 0.000299490098049279, + "loss": 4.7854, + "step": 88020 + }, + { + "epoch": 0.12369040865397281, + "grad_norm": 0.707338273525238, + "learning_rate": 0.00029948949176473153, + "loss": 4.4639, + "step": 88030 + }, + { + "epoch": 0.1237044595921364, + "grad_norm": 0.7034321427345276, + "learning_rate": 0.0002994888851205699, + "loss": 4.5313, + "step": 88040 + }, + { + "epoch": 0.12371851053029997, + "grad_norm": 0.7377339005470276, + "learning_rate": 0.0002994882781167955, + "loss": 4.6773, + "step": 88050 + }, + { + "epoch": 0.12373256146846355, + "grad_norm": 0.7189347147941589, + "learning_rate": 0.0002994876707534097, + "loss": 4.7458, + "step": 88060 + }, + { + "epoch": 0.12374661240662713, + "grad_norm": 0.7128537893295288, + "learning_rate": 0.00029948706303041403, + "loss": 4.7733, + "step": 88070 + }, + { + "epoch": 0.1237606633447907, + "grad_norm": 0.7069502472877502, + "learning_rate": 0.00029948645494781, + "loss": 4.6594, + "step": 88080 + }, + { + "epoch": 0.12377471428295428, + "grad_norm": 0.7175171375274658, + "learning_rate": 0.00029948584650559907, + "loss": 4.8268, + "step": 88090 + }, + { + "epoch": 0.12378876522111786, + 
"grad_norm": 0.7087175846099854, + "learning_rate": 0.0002994852377037826, + "loss": 4.7501, + "step": 88100 + }, + { + "epoch": 0.12380281615928143, + "grad_norm": 0.7500482201576233, + "learning_rate": 0.00029948462854236213, + "loss": 4.6788, + "step": 88110 + }, + { + "epoch": 0.12381686709744501, + "grad_norm": 0.7136823534965515, + "learning_rate": 0.00029948401902133917, + "loss": 4.6901, + "step": 88120 + }, + { + "epoch": 0.12383091803560858, + "grad_norm": 0.7292098999023438, + "learning_rate": 0.0002994834091407151, + "loss": 4.7064, + "step": 88130 + }, + { + "epoch": 0.12384496897377217, + "grad_norm": 0.698124349117279, + "learning_rate": 0.0002994827989004914, + "loss": 4.8037, + "step": 88140 + }, + { + "epoch": 0.12385901991193575, + "grad_norm": 0.7261769771575928, + "learning_rate": 0.00029948218830066955, + "loss": 4.5345, + "step": 88150 + }, + { + "epoch": 0.12387307085009933, + "grad_norm": 0.706026554107666, + "learning_rate": 0.000299481577341251, + "loss": 4.6582, + "step": 88160 + }, + { + "epoch": 0.1238871217882629, + "grad_norm": 0.7198874354362488, + "learning_rate": 0.0002994809660222373, + "loss": 4.7463, + "step": 88170 + }, + { + "epoch": 0.12390117272642648, + "grad_norm": 0.7321640849113464, + "learning_rate": 0.0002994803543436299, + "loss": 4.6111, + "step": 88180 + }, + { + "epoch": 0.12391522366459005, + "grad_norm": 0.7120465040206909, + "learning_rate": 0.0002994797423054302, + "loss": 4.7361, + "step": 88190 + }, + { + "epoch": 0.12392927460275363, + "grad_norm": 0.718183159828186, + "learning_rate": 0.0002994791299076397, + "loss": 4.6667, + "step": 88200 + }, + { + "epoch": 0.1239433255409172, + "grad_norm": 0.7051864266395569, + "learning_rate": 0.0002994785171502599, + "loss": 4.7235, + "step": 88210 + }, + { + "epoch": 0.12395737647908078, + "grad_norm": 0.71971595287323, + "learning_rate": 0.0002994779040332922, + "loss": 4.7439, + "step": 88220 + }, + { + "epoch": 0.12397142741724436, + "grad_norm": 
0.6985116004943848, + "learning_rate": 0.0002994772905567382, + "loss": 4.6536, + "step": 88230 + }, + { + "epoch": 0.12398547835540795, + "grad_norm": 0.7011168003082275, + "learning_rate": 0.0002994766767205993, + "loss": 4.7449, + "step": 88240 + }, + { + "epoch": 0.12399952929357153, + "grad_norm": 0.7223268747329712, + "learning_rate": 0.00029947606252487695, + "loss": 4.7109, + "step": 88250 + }, + { + "epoch": 0.1240135802317351, + "grad_norm": 0.7336310744285583, + "learning_rate": 0.0002994754479695727, + "loss": 4.5966, + "step": 88260 + }, + { + "epoch": 0.12402763116989868, + "grad_norm": 0.7206735014915466, + "learning_rate": 0.000299474833054688, + "loss": 4.5686, + "step": 88270 + }, + { + "epoch": 0.12404168210806225, + "grad_norm": 0.7099548578262329, + "learning_rate": 0.0002994742177802243, + "loss": 4.7035, + "step": 88280 + }, + { + "epoch": 0.12405573304622583, + "grad_norm": 0.7359626293182373, + "learning_rate": 0.0002994736021461831, + "loss": 4.6886, + "step": 88290 + }, + { + "epoch": 0.1240697839843894, + "grad_norm": 0.7080744504928589, + "learning_rate": 0.00029947298615256587, + "loss": 4.7505, + "step": 88300 + }, + { + "epoch": 0.12408383492255298, + "grad_norm": 0.7350786924362183, + "learning_rate": 0.00029947236979937413, + "loss": 4.5652, + "step": 88310 + }, + { + "epoch": 0.12409788586071656, + "grad_norm": 1.0457093715667725, + "learning_rate": 0.0002994717530866093, + "loss": 4.7684, + "step": 88320 + }, + { + "epoch": 0.12411193679888013, + "grad_norm": 0.7474514245986938, + "learning_rate": 0.00029947113601427286, + "loss": 4.7522, + "step": 88330 + }, + { + "epoch": 0.12412598773704371, + "grad_norm": 0.7411333322525024, + "learning_rate": 0.0002994705185823664, + "loss": 4.7131, + "step": 88340 + }, + { + "epoch": 0.1241400386752073, + "grad_norm": 0.7307577133178711, + "learning_rate": 0.0002994699007908914, + "loss": 4.5768, + "step": 88350 + }, + { + "epoch": 0.12415408961337088, + "grad_norm": 0.7115901708602905, + 
"learning_rate": 0.00029946928263984915, + "loss": 4.6929, + "step": 88360 + }, + { + "epoch": 0.12416814055153445, + "grad_norm": 0.7410994172096252, + "learning_rate": 0.00029946866412924135, + "loss": 4.7275, + "step": 88370 + }, + { + "epoch": 0.12418219148969803, + "grad_norm": 0.7032224535942078, + "learning_rate": 0.0002994680452590694, + "loss": 4.7629, + "step": 88380 + }, + { + "epoch": 0.1241962424278616, + "grad_norm": 0.6944802403450012, + "learning_rate": 0.00029946742602933484, + "loss": 4.7391, + "step": 88390 + }, + { + "epoch": 0.12421029336602518, + "grad_norm": 0.7015830874443054, + "learning_rate": 0.000299466806440039, + "loss": 4.5912, + "step": 88400 + }, + { + "epoch": 0.12422434430418876, + "grad_norm": 0.7076965570449829, + "learning_rate": 0.0002994661864911836, + "loss": 4.7955, + "step": 88410 + }, + { + "epoch": 0.12423839524235233, + "grad_norm": 0.7263289093971252, + "learning_rate": 0.00029946556618277, + "loss": 4.7882, + "step": 88420 + }, + { + "epoch": 0.12425244618051591, + "grad_norm": 0.7141406536102295, + "learning_rate": 0.0002994649455147997, + "loss": 4.7778, + "step": 88430 + }, + { + "epoch": 0.12426649711867949, + "grad_norm": 0.7139621376991272, + "learning_rate": 0.00029946432448727417, + "loss": 4.7299, + "step": 88440 + }, + { + "epoch": 0.12428054805684308, + "grad_norm": 0.7191937565803528, + "learning_rate": 0.000299463703100195, + "loss": 4.6505, + "step": 88450 + }, + { + "epoch": 0.12429459899500665, + "grad_norm": 0.7122807502746582, + "learning_rate": 0.00029946308135356356, + "loss": 4.7084, + "step": 88460 + }, + { + "epoch": 0.12430864993317023, + "grad_norm": 0.707273006439209, + "learning_rate": 0.00029946245924738146, + "loss": 4.6373, + "step": 88470 + }, + { + "epoch": 0.1243227008713338, + "grad_norm": 0.6925734281539917, + "learning_rate": 0.00029946183678165015, + "loss": 4.7202, + "step": 88480 + }, + { + "epoch": 0.12433675180949738, + "grad_norm": 0.6993895769119263, + "learning_rate": 
0.0002994612139563711, + "loss": 4.7537, + "step": 88490 + }, + { + "epoch": 0.12435080274766096, + "grad_norm": 0.7005565166473389, + "learning_rate": 0.00029946059077154587, + "loss": 4.7321, + "step": 88500 + }, + { + "epoch": 0.12436485368582453, + "grad_norm": 0.7049751281738281, + "learning_rate": 0.00029945996722717587, + "loss": 4.7177, + "step": 88510 + }, + { + "epoch": 0.12437890462398811, + "grad_norm": 0.7042818069458008, + "learning_rate": 0.00029945934332326264, + "loss": 4.5704, + "step": 88520 + }, + { + "epoch": 0.12439295556215169, + "grad_norm": 0.6895237565040588, + "learning_rate": 0.00029945871905980774, + "loss": 4.6741, + "step": 88530 + }, + { + "epoch": 0.12440700650031526, + "grad_norm": 0.7173465490341187, + "learning_rate": 0.0002994580944368126, + "loss": 4.7058, + "step": 88540 + }, + { + "epoch": 0.12442105743847885, + "grad_norm": 0.7089383006095886, + "learning_rate": 0.00029945746945427877, + "loss": 4.7225, + "step": 88550 + }, + { + "epoch": 0.12443510837664243, + "grad_norm": 0.7033636569976807, + "learning_rate": 0.00029945684411220773, + "loss": 4.7401, + "step": 88560 + }, + { + "epoch": 0.124449159314806, + "grad_norm": 0.7240288257598877, + "learning_rate": 0.00029945621841060094, + "loss": 4.7217, + "step": 88570 + }, + { + "epoch": 0.12446321025296958, + "grad_norm": 0.7067498564720154, + "learning_rate": 0.00029945559234945997, + "loss": 4.6413, + "step": 88580 + }, + { + "epoch": 0.12447726119113316, + "grad_norm": 0.7195616364479065, + "learning_rate": 0.00029945496592878633, + "loss": 4.7053, + "step": 88590 + }, + { + "epoch": 0.12449131212929673, + "grad_norm": 0.7112637162208557, + "learning_rate": 0.0002994543391485815, + "loss": 4.6635, + "step": 88600 + }, + { + "epoch": 0.12450536306746031, + "grad_norm": 0.6900596618652344, + "learning_rate": 0.000299453712008847, + "loss": 4.7249, + "step": 88610 + }, + { + "epoch": 0.12451941400562389, + "grad_norm": 0.6978746652603149, + "learning_rate": 
0.00029945308450958426, + "loss": 4.6273, + "step": 88620 + }, + { + "epoch": 0.12453346494378746, + "grad_norm": 0.707542359828949, + "learning_rate": 0.00029945245665079494, + "loss": 4.648, + "step": 88630 + }, + { + "epoch": 0.12454751588195104, + "grad_norm": 0.7132956981658936, + "learning_rate": 0.00029945182843248044, + "loss": 4.6548, + "step": 88640 + }, + { + "epoch": 0.12456156682011461, + "grad_norm": 0.7130081057548523, + "learning_rate": 0.0002994511998546423, + "loss": 4.666, + "step": 88650 + }, + { + "epoch": 0.1245756177582782, + "grad_norm": 0.7389416098594666, + "learning_rate": 0.000299450570917282, + "loss": 4.6872, + "step": 88660 + }, + { + "epoch": 0.12458966869644178, + "grad_norm": 0.7738654613494873, + "learning_rate": 0.00029944994162040114, + "loss": 4.6537, + "step": 88670 + }, + { + "epoch": 0.12460371963460536, + "grad_norm": 0.7259090542793274, + "learning_rate": 0.00029944931196400115, + "loss": 4.6315, + "step": 88680 + }, + { + "epoch": 0.12461777057276893, + "grad_norm": 0.7047815322875977, + "learning_rate": 0.00029944868194808357, + "loss": 4.7483, + "step": 88690 + }, + { + "epoch": 0.12463182151093251, + "grad_norm": 0.722084641456604, + "learning_rate": 0.0002994480515726499, + "loss": 4.7039, + "step": 88700 + }, + { + "epoch": 0.12464587244909608, + "grad_norm": 0.717406153678894, + "learning_rate": 0.00029944742083770176, + "loss": 4.622, + "step": 88710 + }, + { + "epoch": 0.12465992338725966, + "grad_norm": 0.7036492228507996, + "learning_rate": 0.00029944678974324045, + "loss": 4.7124, + "step": 88720 + }, + { + "epoch": 0.12467397432542324, + "grad_norm": 0.7213016748428345, + "learning_rate": 0.0002994461582892678, + "loss": 4.6973, + "step": 88730 + }, + { + "epoch": 0.12468802526358681, + "grad_norm": 0.775976300239563, + "learning_rate": 0.000299445526475785, + "loss": 4.7035, + "step": 88740 + }, + { + "epoch": 0.12470207620175039, + "grad_norm": 0.7207806706428528, + "learning_rate": 0.00029944489430279377, + 
"loss": 4.7464, + "step": 88750 + }, + { + "epoch": 0.12471612713991398, + "grad_norm": 0.6966215968132019, + "learning_rate": 0.00029944426177029555, + "loss": 4.7001, + "step": 88760 + }, + { + "epoch": 0.12473017807807756, + "grad_norm": 0.7105135321617126, + "learning_rate": 0.000299443628878292, + "loss": 4.778, + "step": 88770 + }, + { + "epoch": 0.12474422901624113, + "grad_norm": 0.7118408679962158, + "learning_rate": 0.0002994429956267844, + "loss": 4.7942, + "step": 88780 + }, + { + "epoch": 0.12475827995440471, + "grad_norm": 0.7115956544876099, + "learning_rate": 0.0002994423620157745, + "loss": 4.7289, + "step": 88790 + }, + { + "epoch": 0.12477233089256828, + "grad_norm": 0.8141958117485046, + "learning_rate": 0.0002994417280452637, + "loss": 4.7197, + "step": 88800 + }, + { + "epoch": 0.12478638183073186, + "grad_norm": 0.6964655518531799, + "learning_rate": 0.00029944109371525356, + "loss": 4.6832, + "step": 88810 + }, + { + "epoch": 0.12480043276889544, + "grad_norm": 0.6905677318572998, + "learning_rate": 0.0002994404590257456, + "loss": 4.6532, + "step": 88820 + }, + { + "epoch": 0.12481448370705901, + "grad_norm": 0.7176385521888733, + "learning_rate": 0.0002994398239767413, + "loss": 4.6367, + "step": 88830 + }, + { + "epoch": 0.12482853464522259, + "grad_norm": 0.723534107208252, + "learning_rate": 0.0002994391885682423, + "loss": 4.6079, + "step": 88840 + }, + { + "epoch": 0.12484258558338616, + "grad_norm": 0.7028889060020447, + "learning_rate": 0.0002994385528002501, + "loss": 4.7336, + "step": 88850 + }, + { + "epoch": 0.12485663652154975, + "grad_norm": 0.7260593771934509, + "learning_rate": 0.0002994379166727661, + "loss": 4.5638, + "step": 88860 + }, + { + "epoch": 0.12487068745971333, + "grad_norm": 0.7041234374046326, + "learning_rate": 0.00029943728018579205, + "loss": 4.7209, + "step": 88870 + }, + { + "epoch": 0.12488473839787691, + "grad_norm": 0.6976323127746582, + "learning_rate": 0.0002994366433393293, + "loss": 4.6149, + 
"step": 88880 + }, + { + "epoch": 0.12489878933604048, + "grad_norm": 0.7066828012466431, + "learning_rate": 0.00029943600613337943, + "loss": 4.6027, + "step": 88890 + }, + { + "epoch": 0.12491284027420406, + "grad_norm": 0.7120361328125, + "learning_rate": 0.000299435368567944, + "loss": 4.5695, + "step": 88900 + }, + { + "epoch": 0.12492689121236764, + "grad_norm": 0.7178036570549011, + "learning_rate": 0.0002994347306430245, + "loss": 4.7551, + "step": 88910 + }, + { + "epoch": 0.12494094215053121, + "grad_norm": 1.0115084648132324, + "learning_rate": 0.0002994340923586225, + "loss": 4.5328, + "step": 88920 + }, + { + "epoch": 0.12495499308869479, + "grad_norm": 0.7211881279945374, + "learning_rate": 0.0002994334537147395, + "loss": 4.753, + "step": 88930 + }, + { + "epoch": 0.12496904402685836, + "grad_norm": 0.72914057970047, + "learning_rate": 0.00029943281471137706, + "loss": 4.6328, + "step": 88940 + }, + { + "epoch": 0.12498309496502194, + "grad_norm": 0.7203587889671326, + "learning_rate": 0.00029943217534853677, + "loss": 4.7458, + "step": 88950 + }, + { + "epoch": 0.12499714590318552, + "grad_norm": 0.7104395031929016, + "learning_rate": 0.0002994315356262201, + "loss": 4.6597, + "step": 88960 + }, + { + "epoch": 0.1250111968413491, + "grad_norm": 0.7179931998252869, + "learning_rate": 0.0002994308955444286, + "loss": 4.6605, + "step": 88970 + }, + { + "epoch": 0.12502524777951268, + "grad_norm": 0.7127233743667603, + "learning_rate": 0.00029943025510316385, + "loss": 4.7703, + "step": 88980 + }, + { + "epoch": 0.12503929871767624, + "grad_norm": 0.7128562331199646, + "learning_rate": 0.00029942961430242727, + "loss": 4.8032, + "step": 88990 + }, + { + "epoch": 0.12505334965583983, + "grad_norm": 0.7086876630783081, + "learning_rate": 0.0002994289731422206, + "loss": 4.6504, + "step": 89000 + }, + { + "epoch": 0.1250674005940034, + "grad_norm": 0.7091412544250488, + "learning_rate": 0.0002994283316225452, + "loss": 4.6319, + "step": 89010 + }, + { + 
"epoch": 0.125081451532167, + "grad_norm": 0.7031800746917725, + "learning_rate": 0.0002994276897434027, + "loss": 4.6304, + "step": 89020 + }, + { + "epoch": 0.12509550247033058, + "grad_norm": 0.7421759963035583, + "learning_rate": 0.00029942704750479464, + "loss": 4.7184, + "step": 89030 + }, + { + "epoch": 0.12510955340849414, + "grad_norm": 0.7047469019889832, + "learning_rate": 0.00029942640490672256, + "loss": 4.6966, + "step": 89040 + }, + { + "epoch": 0.12512360434665773, + "grad_norm": 0.717984139919281, + "learning_rate": 0.00029942576194918796, + "loss": 4.6959, + "step": 89050 + }, + { + "epoch": 0.1251376552848213, + "grad_norm": 0.674823522567749, + "learning_rate": 0.0002994251186321925, + "loss": 4.7184, + "step": 89060 + }, + { + "epoch": 0.12515170622298488, + "grad_norm": 0.725084662437439, + "learning_rate": 0.0002994244749557376, + "loss": 4.5896, + "step": 89070 + }, + { + "epoch": 0.12516575716114844, + "grad_norm": 0.7343477010726929, + "learning_rate": 0.00029942383091982486, + "loss": 4.6372, + "step": 89080 + }, + { + "epoch": 0.12517980809931203, + "grad_norm": 0.6985862255096436, + "learning_rate": 0.0002994231865244558, + "loss": 4.7568, + "step": 89090 + }, + { + "epoch": 0.1251938590374756, + "grad_norm": 0.7344580292701721, + "learning_rate": 0.00029942254176963207, + "loss": 4.774, + "step": 89100 + }, + { + "epoch": 0.1252079099756392, + "grad_norm": 0.714970588684082, + "learning_rate": 0.0002994218966553551, + "loss": 4.7269, + "step": 89110 + }, + { + "epoch": 0.12522196091380278, + "grad_norm": 0.7142235636711121, + "learning_rate": 0.00029942125118162657, + "loss": 4.7118, + "step": 89120 + }, + { + "epoch": 0.12523601185196634, + "grad_norm": 0.7200468182563782, + "learning_rate": 0.00029942060534844786, + "loss": 4.5759, + "step": 89130 + }, + { + "epoch": 0.12525006279012993, + "grad_norm": 0.7273971438407898, + "learning_rate": 0.0002994199591558207, + "loss": 4.596, + "step": 89140 + }, + { + "epoch": 
0.1252641137282935, + "grad_norm": 0.708501398563385, + "learning_rate": 0.00029941931260374653, + "loss": 4.7371, + "step": 89150 + }, + { + "epoch": 0.12527816466645708, + "grad_norm": 0.6819218397140503, + "learning_rate": 0.00029941866569222695, + "loss": 4.6924, + "step": 89160 + }, + { + "epoch": 0.12529221560462064, + "grad_norm": 0.7276637554168701, + "learning_rate": 0.0002994180184212635, + "loss": 4.5807, + "step": 89170 + }, + { + "epoch": 0.12530626654278423, + "grad_norm": 0.8149365186691284, + "learning_rate": 0.00029941737079085777, + "loss": 4.7547, + "step": 89180 + }, + { + "epoch": 0.1253203174809478, + "grad_norm": 0.7019021511077881, + "learning_rate": 0.00029941672280101126, + "loss": 4.7022, + "step": 89190 + }, + { + "epoch": 0.12533436841911139, + "grad_norm": 0.7072144150733948, + "learning_rate": 0.00029941607445172554, + "loss": 4.6415, + "step": 89200 + }, + { + "epoch": 0.12534841935727495, + "grad_norm": 0.6896094083786011, + "learning_rate": 0.00029941542574300225, + "loss": 4.5062, + "step": 89210 + }, + { + "epoch": 0.12536247029543854, + "grad_norm": 0.7020169496536255, + "learning_rate": 0.00029941477667484284, + "loss": 4.8094, + "step": 89220 + }, + { + "epoch": 0.12537652123360213, + "grad_norm": 0.6777108311653137, + "learning_rate": 0.000299414127247249, + "loss": 4.6478, + "step": 89230 + }, + { + "epoch": 0.1253905721717657, + "grad_norm": 0.7213195562362671, + "learning_rate": 0.00029941347746022213, + "loss": 4.7103, + "step": 89240 + }, + { + "epoch": 0.12540462310992928, + "grad_norm": 0.6935251951217651, + "learning_rate": 0.00029941282731376387, + "loss": 4.3878, + "step": 89250 + }, + { + "epoch": 0.12541867404809284, + "grad_norm": 0.8068158626556396, + "learning_rate": 0.00029941217680787583, + "loss": 4.6871, + "step": 89260 + }, + { + "epoch": 0.12543272498625643, + "grad_norm": 0.7309907078742981, + "learning_rate": 0.00029941152594255955, + "loss": 4.7632, + "step": 89270 + }, + { + "epoch": 0.12544677592442, 
+ "grad_norm": 0.7052793502807617, + "learning_rate": 0.0002994108747178166, + "loss": 4.8095, + "step": 89280 + }, + { + "epoch": 0.12546082686258359, + "grad_norm": 0.7147111892700195, + "learning_rate": 0.00029941022313364845, + "loss": 4.6167, + "step": 89290 + }, + { + "epoch": 0.12547487780074715, + "grad_norm": 0.7007347345352173, + "learning_rate": 0.00029940957119005683, + "loss": 4.7428, + "step": 89300 + }, + { + "epoch": 0.12548892873891074, + "grad_norm": 0.6927651166915894, + "learning_rate": 0.0002994089188870432, + "loss": 4.7348, + "step": 89310 + }, + { + "epoch": 0.1255029796770743, + "grad_norm": 0.7285720109939575, + "learning_rate": 0.0002994082662246091, + "loss": 4.6294, + "step": 89320 + }, + { + "epoch": 0.1255170306152379, + "grad_norm": 0.7167104482650757, + "learning_rate": 0.0002994076132027563, + "loss": 4.6708, + "step": 89330 + }, + { + "epoch": 0.12553108155340148, + "grad_norm": 0.7202353477478027, + "learning_rate": 0.00029940695982148607, + "loss": 4.7455, + "step": 89340 + }, + { + "epoch": 0.12554513249156504, + "grad_norm": 0.7299073934555054, + "learning_rate": 0.0002994063060808002, + "loss": 4.6263, + "step": 89350 + }, + { + "epoch": 0.12555918342972863, + "grad_norm": 0.7049494981765747, + "learning_rate": 0.00029940565198070015, + "loss": 4.6685, + "step": 89360 + }, + { + "epoch": 0.1255732343678922, + "grad_norm": 0.7378440499305725, + "learning_rate": 0.00029940499752118764, + "loss": 4.628, + "step": 89370 + }, + { + "epoch": 0.12558728530605578, + "grad_norm": 0.705937922000885, + "learning_rate": 0.00029940434270226405, + "loss": 4.6245, + "step": 89380 + }, + { + "epoch": 0.12560133624421935, + "grad_norm": 0.7085596323013306, + "learning_rate": 0.00029940368752393113, + "loss": 4.607, + "step": 89390 + }, + { + "epoch": 0.12561538718238294, + "grad_norm": 0.7117762565612793, + "learning_rate": 0.00029940303198619034, + "loss": 4.5603, + "step": 89400 + }, + { + "epoch": 0.1256294381205465, + "grad_norm": 
0.7034866809844971, + "learning_rate": 0.0002994023760890433, + "loss": 4.6884, + "step": 89410 + }, + { + "epoch": 0.1256434890587101, + "grad_norm": 0.6858923435211182, + "learning_rate": 0.00029940171983249155, + "loss": 4.6888, + "step": 89420 + }, + { + "epoch": 0.12565753999687368, + "grad_norm": 0.7072145938873291, + "learning_rate": 0.00029940106321653674, + "loss": 4.6249, + "step": 89430 + }, + { + "epoch": 0.12567159093503724, + "grad_norm": 0.730768620967865, + "learning_rate": 0.0002994004062411804, + "loss": 4.5767, + "step": 89440 + }, + { + "epoch": 0.12568564187320083, + "grad_norm": 0.7527211308479309, + "learning_rate": 0.0002993997489064242, + "loss": 4.6003, + "step": 89450 + }, + { + "epoch": 0.1256996928113644, + "grad_norm": 0.7296200394630432, + "learning_rate": 0.0002993990912122695, + "loss": 4.7804, + "step": 89460 + }, + { + "epoch": 0.12571374374952798, + "grad_norm": 0.7212024927139282, + "learning_rate": 0.00029939843315871813, + "loss": 4.6649, + "step": 89470 + }, + { + "epoch": 0.12572779468769155, + "grad_norm": 0.7408531308174133, + "learning_rate": 0.00029939777474577157, + "loss": 4.7058, + "step": 89480 + }, + { + "epoch": 0.12574184562585514, + "grad_norm": 0.7241307497024536, + "learning_rate": 0.00029939711597343135, + "loss": 4.7249, + "step": 89490 + }, + { + "epoch": 0.1257558965640187, + "grad_norm": 0.7100182175636292, + "learning_rate": 0.00029939645684169913, + "loss": 4.5997, + "step": 89500 + }, + { + "epoch": 0.1257699475021823, + "grad_norm": 0.7237468957901001, + "learning_rate": 0.00029939579735057645, + "loss": 4.698, + "step": 89510 + }, + { + "epoch": 0.12578399844034585, + "grad_norm": 0.6810537576675415, + "learning_rate": 0.0002993951375000649, + "loss": 4.6484, + "step": 89520 + }, + { + "epoch": 0.12579804937850944, + "grad_norm": 0.7313446402549744, + "learning_rate": 0.0002993944772901662, + "loss": 4.5818, + "step": 89530 + }, + { + "epoch": 0.12581210031667303, + "grad_norm": 0.7293576598167419, + 
"learning_rate": 0.00029939381672088175, + "loss": 4.6622, + "step": 89540 + }, + { + "epoch": 0.1258261512548366, + "grad_norm": 0.7267341613769531, + "learning_rate": 0.0002993931557922132, + "loss": 4.5724, + "step": 89550 + }, + { + "epoch": 0.12584020219300018, + "grad_norm": 0.722690999507904, + "learning_rate": 0.00029939249450416217, + "loss": 4.7577, + "step": 89560 + }, + { + "epoch": 0.12585425313116375, + "grad_norm": 0.7288978695869446, + "learning_rate": 0.00029939183285673023, + "loss": 4.6926, + "step": 89570 + }, + { + "epoch": 0.12586830406932734, + "grad_norm": 0.7535728216171265, + "learning_rate": 0.000299391170849919, + "loss": 4.6465, + "step": 89580 + }, + { + "epoch": 0.1258823550074909, + "grad_norm": 0.7304617166519165, + "learning_rate": 0.00029939050848373, + "loss": 4.6726, + "step": 89590 + }, + { + "epoch": 0.1258964059456545, + "grad_norm": 0.7804543972015381, + "learning_rate": 0.00029938984575816495, + "loss": 4.7101, + "step": 89600 + }, + { + "epoch": 0.12591045688381805, + "grad_norm": 0.7636072635650635, + "learning_rate": 0.0002993891826732253, + "loss": 4.7559, + "step": 89610 + }, + { + "epoch": 0.12592450782198164, + "grad_norm": 0.8159419298171997, + "learning_rate": 0.0002993885192289128, + "loss": 4.5094, + "step": 89620 + }, + { + "epoch": 0.1259385587601452, + "grad_norm": 0.7560939192771912, + "learning_rate": 0.00029938785542522883, + "loss": 4.7041, + "step": 89630 + }, + { + "epoch": 0.1259526096983088, + "grad_norm": 0.7355748414993286, + "learning_rate": 0.0002993871912621752, + "loss": 4.7708, + "step": 89640 + }, + { + "epoch": 0.12596666063647238, + "grad_norm": 0.7174279689788818, + "learning_rate": 0.0002993865267397534, + "loss": 4.6629, + "step": 89650 + }, + { + "epoch": 0.12598071157463595, + "grad_norm": 0.7072028517723083, + "learning_rate": 0.00029938586185796504, + "loss": 4.708, + "step": 89660 + }, + { + "epoch": 0.12599476251279954, + "grad_norm": 0.7467581033706665, + "learning_rate": 
0.00029938519661681175, + "loss": 4.6189, + "step": 89670 + }, + { + "epoch": 0.1260088134509631, + "grad_norm": 0.724288821220398, + "learning_rate": 0.0002993845310162951, + "loss": 4.6856, + "step": 89680 + }, + { + "epoch": 0.1260228643891267, + "grad_norm": 0.7614923715591431, + "learning_rate": 0.0002993838650564167, + "loss": 4.6472, + "step": 89690 + }, + { + "epoch": 0.12603691532729025, + "grad_norm": 0.7412780523300171, + "learning_rate": 0.00029938319873717815, + "loss": 4.6539, + "step": 89700 + }, + { + "epoch": 0.12605096626545384, + "grad_norm": 0.7168459892272949, + "learning_rate": 0.00029938253205858106, + "loss": 4.775, + "step": 89710 + }, + { + "epoch": 0.1260650172036174, + "grad_norm": 0.7272344827651978, + "learning_rate": 0.00029938186502062705, + "loss": 4.698, + "step": 89720 + }, + { + "epoch": 0.126079068141781, + "grad_norm": 0.6984730958938599, + "learning_rate": 0.0002993811976233177, + "loss": 4.5482, + "step": 89730 + }, + { + "epoch": 0.12609311907994458, + "grad_norm": 0.698017418384552, + "learning_rate": 0.0002993805298666546, + "loss": 4.6122, + "step": 89740 + }, + { + "epoch": 0.12610717001810814, + "grad_norm": 0.7286360263824463, + "learning_rate": 0.0002993798617506394, + "loss": 4.7628, + "step": 89750 + }, + { + "epoch": 0.12612122095627173, + "grad_norm": 0.6975075006484985, + "learning_rate": 0.00029937919327527367, + "loss": 4.7699, + "step": 89760 + }, + { + "epoch": 0.1261352718944353, + "grad_norm": 0.7229320406913757, + "learning_rate": 0.000299378524440559, + "loss": 4.7071, + "step": 89770 + }, + { + "epoch": 0.1261493228325989, + "grad_norm": 0.7221950888633728, + "learning_rate": 0.00029937785524649713, + "loss": 4.6989, + "step": 89780 + }, + { + "epoch": 0.12616337377076245, + "grad_norm": 0.6790987849235535, + "learning_rate": 0.00029937718569308946, + "loss": 4.7405, + "step": 89790 + }, + { + "epoch": 0.12617742470892604, + "grad_norm": 0.7140920162200928, + "learning_rate": 0.0002993765157803378, + 
"loss": 4.7422, + "step": 89800 + }, + { + "epoch": 0.1261914756470896, + "grad_norm": 0.7047672867774963, + "learning_rate": 0.0002993758455082436, + "loss": 4.6582, + "step": 89810 + }, + { + "epoch": 0.1262055265852532, + "grad_norm": 0.7307078242301941, + "learning_rate": 0.0002993751748768086, + "loss": 4.6722, + "step": 89820 + }, + { + "epoch": 0.12621957752341675, + "grad_norm": 0.7230780124664307, + "learning_rate": 0.0002993745038860344, + "loss": 4.7186, + "step": 89830 + }, + { + "epoch": 0.12623362846158034, + "grad_norm": 0.7278953790664673, + "learning_rate": 0.00029937383253592244, + "loss": 4.7258, + "step": 89840 + }, + { + "epoch": 0.12624767939974393, + "grad_norm": 0.7198377251625061, + "learning_rate": 0.00029937316082647456, + "loss": 4.7229, + "step": 89850 + }, + { + "epoch": 0.1262617303379075, + "grad_norm": 0.6939035058021545, + "learning_rate": 0.00029937248875769226, + "loss": 4.6455, + "step": 89860 + }, + { + "epoch": 0.1262757812760711, + "grad_norm": 0.7266935706138611, + "learning_rate": 0.00029937181632957717, + "loss": 4.5993, + "step": 89870 + }, + { + "epoch": 0.12628983221423465, + "grad_norm": 0.7101845145225525, + "learning_rate": 0.00029937114354213097, + "loss": 4.6855, + "step": 89880 + }, + { + "epoch": 0.12630388315239824, + "grad_norm": 0.7384141087532043, + "learning_rate": 0.0002993704703953552, + "loss": 4.7441, + "step": 89890 + }, + { + "epoch": 0.1263179340905618, + "grad_norm": 0.6860256791114807, + "learning_rate": 0.0002993697968892515, + "loss": 4.6038, + "step": 89900 + }, + { + "epoch": 0.1263319850287254, + "grad_norm": 0.7191735506057739, + "learning_rate": 0.0002993691230238215, + "loss": 4.6831, + "step": 89910 + }, + { + "epoch": 0.12634603596688895, + "grad_norm": 0.7158761620521545, + "learning_rate": 0.0002993684487990668, + "loss": 4.5944, + "step": 89920 + }, + { + "epoch": 0.12636008690505254, + "grad_norm": 0.7162553071975708, + "learning_rate": 0.0002993677742149891, + "loss": 4.6579, + 
"step": 89930 + }, + { + "epoch": 0.1263741378432161, + "grad_norm": 0.7150312662124634, + "learning_rate": 0.0002993670992715899, + "loss": 4.7404, + "step": 89940 + }, + { + "epoch": 0.1263881887813797, + "grad_norm": 0.7158754467964172, + "learning_rate": 0.0002993664239688709, + "loss": 4.6764, + "step": 89950 + }, + { + "epoch": 0.12640223971954329, + "grad_norm": 0.7216680645942688, + "learning_rate": 0.0002993657483068337, + "loss": 4.6328, + "step": 89960 + }, + { + "epoch": 0.12641629065770685, + "grad_norm": 0.7339308261871338, + "learning_rate": 0.00029936507228547996, + "loss": 4.6651, + "step": 89970 + }, + { + "epoch": 0.12643034159587044, + "grad_norm": 0.700171709060669, + "learning_rate": 0.00029936439590481126, + "loss": 4.6854, + "step": 89980 + }, + { + "epoch": 0.126444392534034, + "grad_norm": 0.7425507307052612, + "learning_rate": 0.0002993637191648293, + "loss": 4.5799, + "step": 89990 + }, + { + "epoch": 0.1264584434721976, + "grad_norm": 0.7096763849258423, + "learning_rate": 0.0002993630420655356, + "loss": 4.5681, + "step": 90000 + } + ], + "logging_steps": 10, + "max_steps": 711696, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.7405109256192e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}