diff --git "a/checkpoint-6828/trainer_state.json" "b/checkpoint-6828/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-6828/trainer_state.json" @@ -0,0 +1,4808 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 6828, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005858230814294083, + "grad_norm": 14.95982551574707, + "learning_rate": 1.9978910369068542e-05, + "loss": 4.3468, + "step": 10 + }, + { + "epoch": 0.011716461628588167, + "grad_norm": 19.284658432006836, + "learning_rate": 1.9955477445811366e-05, + "loss": 4.6374, + "step": 20 + }, + { + "epoch": 0.01757469244288225, + "grad_norm": 11.212137222290039, + "learning_rate": 1.993204452255419e-05, + "loss": 2.4865, + "step": 30 + }, + { + "epoch": 0.023432923257176334, + "grad_norm": 6.0227203369140625, + "learning_rate": 1.9908611599297014e-05, + "loss": 2.662, + "step": 40 + }, + { + "epoch": 0.029291154071470416, + "grad_norm": 9.085187911987305, + "learning_rate": 1.988517867603984e-05, + "loss": 2.3138, + "step": 50 + }, + { + "epoch": 0.0351493848857645, + "grad_norm": 8.839755058288574, + "learning_rate": 1.9861745752782663e-05, + "loss": 1.9221, + "step": 60 + }, + { + "epoch": 0.041007615700058585, + "grad_norm": 5.003777027130127, + "learning_rate": 1.9838312829525487e-05, + "loss": 1.3857, + "step": 70 + }, + { + "epoch": 0.04686584651435267, + "grad_norm": 7.108287811279297, + "learning_rate": 1.981487990626831e-05, + "loss": 1.5308, + "step": 80 + }, + { + "epoch": 0.05272407732864675, + "grad_norm": 6.89133882522583, + "learning_rate": 1.9791446983011135e-05, + "loss": 1.3522, + "step": 90 + }, + { + "epoch": 0.05858230814294083, + "grad_norm": 6.8093695640563965, + "learning_rate": 1.9768014059753956e-05, + "loss": 1.0296, + "step": 100 + }, + { + "epoch": 0.06444053895723492, + "grad_norm": 2.8785605430603027, + "learning_rate": 1.974458113649678e-05, + "loss": 1.0909, + "step": 110 + }, + { + "epoch": 0.070298769771529, + "grad_norm": 3.344712018966675, + "learning_rate": 1.9721148213239604e-05, + "loss": 0.8385, + "step": 120 + }, + { + "epoch": 0.07615700058582309, + "grad_norm": 4.9901041984558105, + "learning_rate": 1.9697715289982428e-05, + "loss": 0.9958, + "step": 130 + }, + { + "epoch": 0.08201523140011717, + "grad_norm": 4.131304740905762, + "learning_rate": 1.967428236672525e-05, + "loss": 0.8819, + "step": 140 + }, + { + "epoch": 0.08787346221441125, + "grad_norm": 2.20668888092041, + "learning_rate": 1.9650849443468073e-05, + "loss": 0.7881, + "step": 150 + }, + { + "epoch": 0.09373169302870533, + "grad_norm": 4.06991720199585, + "learning_rate": 1.9627416520210897e-05, + "loss": 0.8286, + "step": 160 + }, + { + "epoch": 0.09958992384299942, + "grad_norm": 2.710232973098755, + "learning_rate": 1.960398359695372e-05, + "loss": 0.605, + "step": 170 + }, + { + "epoch": 0.1054481546572935, + "grad_norm": 1.2272921800613403, + "learning_rate": 1.9580550673696545e-05, + "loss": 0.5988, + "step": 180 + }, + { + "epoch": 0.11130638547158758, + "grad_norm": 1.7559036016464233, + "learning_rate": 1.955711775043937e-05, + "loss": 0.6549, + "step": 190 + }, + { + "epoch": 0.11716461628588166, + "grad_norm": 2.8257083892822266, + "learning_rate": 1.9533684827182193e-05, + "loss": 0.6415, + "step": 200 + }, + { + "epoch": 0.12302284710017575, + "grad_norm": 2.2019424438476562, + "learning_rate": 1.9510251903925017e-05, + "loss": 0.4694, + "step": 210 + }, + { + "epoch": 0.12888107791446984, + "grad_norm": 2.916757345199585, + "learning_rate": 1.948681898066784e-05, + "loss": 0.5989, + "step": 220 + }, + { + "epoch": 0.1347393087287639, + "grad_norm": 1.955367088317871, + "learning_rate": 1.9463386057410662e-05, + "loss": 0.478, + "step": 230 + }, + { + "epoch": 0.140597539543058, + "grad_norm": 1.6670275926589966, + "learning_rate": 1.9439953134153486e-05, + "loss": 0.4398, + "step": 240 + }, + { + "epoch": 0.14645577035735208, + "grad_norm": 1.7495224475860596, + "learning_rate": 1.941652021089631e-05, + "loss": 0.5017, + "step": 250 + }, + { + "epoch": 0.15231400117164617, + "grad_norm": 1.3516908884048462, + "learning_rate": 1.9393087287639135e-05, + "loss": 0.3734, + "step": 260 + }, + { + "epoch": 0.15817223198594024, + "grad_norm": 1.5071974992752075, + "learning_rate": 1.936965436438196e-05, + "loss": 0.4378, + "step": 270 + }, + { + "epoch": 0.16403046280023434, + "grad_norm": 1.428642749786377, + "learning_rate": 1.9346221441124783e-05, + "loss": 0.3708, + "step": 280 + }, + { + "epoch": 0.1698886936145284, + "grad_norm": 2.0682930946350098, + "learning_rate": 1.9322788517867607e-05, + "loss": 0.4321, + "step": 290 + }, + { + "epoch": 0.1757469244288225, + "grad_norm": 4.945940971374512, + "learning_rate": 1.9299355594610428e-05, + "loss": 0.4092, + "step": 300 + }, + { + "epoch": 0.18160515524311657, + "grad_norm": 1.2532398700714111, + "learning_rate": 1.9275922671353252e-05, + "loss": 0.2347, + "step": 310 + }, + { + "epoch": 0.18746338605741067, + "grad_norm": 1.3484373092651367, + "learning_rate": 1.9252489748096076e-05, + "loss": 0.2977, + "step": 320 + }, + { + "epoch": 0.19332161687170474, + "grad_norm": 1.7067067623138428, + "learning_rate": 1.92290568248389e-05, + "loss": 0.2155, + "step": 330 + }, + { + "epoch": 0.19917984768599883, + "grad_norm": 1.3422434329986572, + "learning_rate": 1.9205623901581724e-05, + "loss": 0.2571, + "step": 340 + }, + { + "epoch": 0.2050380785002929, + "grad_norm": 0.6787233948707581, + "learning_rate": 1.9182190978324548e-05, + "loss": 0.2459, + "step": 350 + }, + { + "epoch": 0.210896309314587, + "grad_norm": 0.7697986364364624, + "learning_rate": 1.915875805506737e-05, + "loss": 0.2171, + "step": 360 + }, + { + "epoch": 0.21675454012888107, + "grad_norm": 2.9185574054718018, + "learning_rate": 1.9135325131810193e-05, + "loss": 0.2731, + "step": 370 + }, + { + "epoch": 0.22261277094317516, + "grad_norm": 1.8088535070419312, + "learning_rate": 1.9111892208553017e-05, + "loss": 0.2691, + "step": 380 + }, + { + "epoch": 0.22847100175746923, + "grad_norm": 1.4816365242004395, + "learning_rate": 1.908845928529584e-05, + "loss": 0.1665, + "step": 390 + }, + { + "epoch": 0.23432923257176333, + "grad_norm": 3.9424285888671875, + "learning_rate": 1.9065026362038665e-05, + "loss": 0.1885, + "step": 400 + }, + { + "epoch": 0.2401874633860574, + "grad_norm": 1.9072023630142212, + "learning_rate": 1.904159343878149e-05, + "loss": 0.3134, + "step": 410 + }, + { + "epoch": 0.2460456942003515, + "grad_norm": 0.817992627620697, + "learning_rate": 1.9018160515524314e-05, + "loss": 0.1547, + "step": 420 + }, + { + "epoch": 0.2519039250146456, + "grad_norm": 2.727170944213867, + "learning_rate": 1.8994727592267138e-05, + "loss": 0.1292, + "step": 430 + }, + { + "epoch": 0.2577621558289397, + "grad_norm": 1.7762303352355957, + "learning_rate": 1.8971294669009962e-05, + "loss": 0.1353, + "step": 440 + }, + { + "epoch": 0.26362038664323373, + "grad_norm": 0.4511788785457611, + "learning_rate": 1.8947861745752786e-05, + "loss": 0.1999, + "step": 450 + }, + { + "epoch": 0.2694786174575278, + "grad_norm": 2.340090751647949, + "learning_rate": 1.892442882249561e-05, + "loss": 0.0877, + "step": 460 + }, + { + "epoch": 0.2753368482718219, + "grad_norm": 1.6077126264572144, + "learning_rate": 1.890099589923843e-05, + "loss": 0.178, + "step": 470 + }, + { + "epoch": 0.281195079086116, + "grad_norm": 1.6138532161712646, + "learning_rate": 1.8877562975981255e-05, + "loss": 0.2541, + "step": 480 + }, + { + "epoch": 0.28705330990041006, + "grad_norm": 3.209932327270508, + "learning_rate": 1.885413005272408e-05, + "loss": 0.108, + "step": 490 + }, + { + "epoch": 0.29291154071470415, + "grad_norm": 3.5678186416625977, + "learning_rate": 1.8830697129466903e-05, + "loss": 0.141, + "step": 500 + }, + { + "epoch": 0.29876977152899825, + "grad_norm": 3.03879976272583, + "learning_rate": 1.8807264206209724e-05, + "loss": 0.2155, + "step": 510 + }, + { + "epoch": 0.30462800234329235, + "grad_norm": 0.4257088899612427, + "learning_rate": 1.8783831282952548e-05, + "loss": 0.2603, + "step": 520 + }, + { + "epoch": 0.3104862331575864, + "grad_norm": 2.5923264026641846, + "learning_rate": 1.8760398359695372e-05, + "loss": 0.1271, + "step": 530 + }, + { + "epoch": 0.3163444639718805, + "grad_norm": 2.9671404361724854, + "learning_rate": 1.8736965436438196e-05, + "loss": 0.1323, + "step": 540 + }, + { + "epoch": 0.3222026947861746, + "grad_norm": 0.1397818624973297, + "learning_rate": 1.871353251318102e-05, + "loss": 0.0597, + "step": 550 + }, + { + "epoch": 0.3280609256004687, + "grad_norm": 0.5060864686965942, + "learning_rate": 1.8690099589923845e-05, + "loss": 0.2495, + "step": 560 + }, + { + "epoch": 0.3339191564147627, + "grad_norm": 1.6251068115234375, + "learning_rate": 1.866666666666667e-05, + "loss": 0.3011, + "step": 570 + }, + { + "epoch": 0.3397773872290568, + "grad_norm": 0.20143267512321472, + "learning_rate": 1.8643233743409493e-05, + "loss": 0.1201, + "step": 580 + }, + { + "epoch": 0.3456356180433509, + "grad_norm": 0.1996494084596634, + "learning_rate": 1.8619800820152317e-05, + "loss": 0.2238, + "step": 590 + }, + { + "epoch": 0.351493848857645, + "grad_norm": 2.6201045513153076, + "learning_rate": 1.859636789689514e-05, + "loss": 0.1274, + "step": 600 + }, + { + "epoch": 0.35735207967193905, + "grad_norm": 3.8342576026916504, + "learning_rate": 1.8572934973637965e-05, + "loss": 0.0662, + "step": 610 + }, + { + "epoch": 0.36321031048623315, + "grad_norm": 4.215113162994385, + "learning_rate": 1.8549502050380786e-05, + "loss": 0.0751, + "step": 620 + }, + { + "epoch": 0.36906854130052724, + "grad_norm": 6.333745002746582, + "learning_rate": 1.852606912712361e-05, + "loss": 0.1664, + "step": 630 + }, + { + "epoch": 0.37492677211482134, + "grad_norm": 6.037374973297119, + "learning_rate": 1.8502636203866434e-05, + "loss": 0.3169, + "step": 640 + }, + { + "epoch": 0.38078500292911543, + "grad_norm": 0.39565032720565796, + "learning_rate": 1.8479203280609258e-05, + "loss": 0.1296, + "step": 650 + }, + { + "epoch": 0.3866432337434095, + "grad_norm": 6.911872386932373, + "learning_rate": 1.8455770357352082e-05, + "loss": 0.256, + "step": 660 + }, + { + "epoch": 0.39250146455770357, + "grad_norm": 0.17452439665794373, + "learning_rate": 1.8432337434094903e-05, + "loss": 0.0724, + "step": 670 + }, + { + "epoch": 0.39835969537199767, + "grad_norm": 0.1822187453508377, + "learning_rate": 1.8408904510837727e-05, + "loss": 0.0972, + "step": 680 + }, + { + "epoch": 0.40421792618629176, + "grad_norm": 2.590902328491211, + "learning_rate": 1.838547158758055e-05, + "loss": 0.1851, + "step": 690 + }, + { + "epoch": 0.4100761570005858, + "grad_norm": 0.2342429906129837, + "learning_rate": 1.8362038664323375e-05, + "loss": 0.2552, + "step": 700 + }, + { + "epoch": 0.4159343878148799, + "grad_norm": 1.4866358041763306, + "learning_rate": 1.83386057410662e-05, + "loss": 0.131, + "step": 710 + }, + { + "epoch": 0.421792618629174, + "grad_norm": 3.6473488807678223, + "learning_rate": 1.8315172817809024e-05, + "loss": 0.2056, + "step": 720 + }, + { + "epoch": 0.4276508494434681, + "grad_norm": 4.957830429077148, + "learning_rate": 1.8291739894551848e-05, + "loss": 0.2499, + "step": 730 + }, + { + "epoch": 0.43350908025776214, + "grad_norm": 0.18394331634044647, + "learning_rate": 1.8268306971294672e-05, + "loss": 0.0795, + "step": 740 + }, + { + "epoch": 0.43936731107205623, + "grad_norm": 0.15296845138072968, + "learning_rate": 1.8244874048037493e-05, + "loss": 0.0301, + "step": 750 + }, + { + "epoch": 0.44522554188635033, + "grad_norm": 0.2405184507369995, + "learning_rate": 1.8221441124780317e-05, + "loss": 0.0557, + "step": 760 + }, + { + "epoch": 0.4510837727006444, + "grad_norm": 2.8683652877807617, + "learning_rate": 1.819800820152314e-05, + "loss": 0.2331, + "step": 770 + }, + { + "epoch": 0.45694200351493847, + "grad_norm": 0.8913503289222717, + "learning_rate": 1.8174575278265965e-05, + "loss": 0.2253, + "step": 780 + }, + { + "epoch": 0.46280023432923256, + "grad_norm": 0.7034773826599121, + "learning_rate": 1.815114235500879e-05, + "loss": 0.1177, + "step": 790 + }, + { + "epoch": 0.46865846514352666, + "grad_norm": 0.3686612844467163, + "learning_rate": 1.8127709431751613e-05, + "loss": 0.2178, + "step": 800 + }, + { + "epoch": 0.47451669595782076, + "grad_norm": 4.0067524909973145, + "learning_rate": 1.8104276508494437e-05, + "loss": 0.1373, + "step": 810 + }, + { + "epoch": 0.4803749267721148, + "grad_norm": 4.526524066925049, + "learning_rate": 1.808084358523726e-05, + "loss": 0.1769, + "step": 820 + }, + { + "epoch": 0.4862331575864089, + "grad_norm": 10.620451927185059, + "learning_rate": 1.8057410661980082e-05, + "loss": 0.0887, + "step": 830 + }, + { + "epoch": 0.492091388400703, + "grad_norm": 1.2064718008041382, + "learning_rate": 1.8033977738722906e-05, + "loss": 0.2784, + "step": 840 + }, + { + "epoch": 0.4979496192149971, + "grad_norm": 2.4158217906951904, + "learning_rate": 1.801054481546573e-05, + "loss": 0.1493, + "step": 850 + }, + { + "epoch": 0.5038078500292912, + "grad_norm": 4.48486328125, + "learning_rate": 1.7987111892208554e-05, + "loss": 0.0502, + "step": 860 + }, + { + "epoch": 0.5096660808435852, + "grad_norm": 0.5434872508049011, + "learning_rate": 1.796367896895138e-05, + "loss": 0.052, + "step": 870 + }, + { + "epoch": 0.5155243116578794, + "grad_norm": 0.11767154932022095, + "learning_rate": 1.7940246045694203e-05, + "loss": 0.1079, + "step": 880 + }, + { + "epoch": 0.5213825424721734, + "grad_norm": 0.14388304948806763, + "learning_rate": 1.7916813122437023e-05, + "loss": 0.1596, + "step": 890 + }, + { + "epoch": 0.5272407732864675, + "grad_norm": 4.8957905769348145, + "learning_rate": 1.7893380199179847e-05, + "loss": 0.1127, + "step": 900 + }, + { + "epoch": 0.5330990041007616, + "grad_norm": 0.09456570446491241, + "learning_rate": 1.786994727592267e-05, + "loss": 0.215, + "step": 910 + }, + { + "epoch": 0.5389572349150556, + "grad_norm": 0.1968098133802414, + "learning_rate": 1.7846514352665496e-05, + "loss": 0.1591, + "step": 920 + }, + { + "epoch": 0.5448154657293497, + "grad_norm": 0.3306093215942383, + "learning_rate": 1.782308142940832e-05, + "loss": 0.0166, + "step": 930 + }, + { + "epoch": 0.5506736965436438, + "grad_norm": 3.040661573410034, + "learning_rate": 1.7799648506151144e-05, + "loss": 0.0556, + "step": 940 + }, + { + "epoch": 0.5565319273579379, + "grad_norm": 0.1786162257194519, + "learning_rate": 1.7776215582893968e-05, + "loss": 0.15, + "step": 950 + }, + { + "epoch": 0.562390158172232, + "grad_norm": 3.1493492126464844, + "learning_rate": 1.7752782659636792e-05, + "loss": 0.3242, + "step": 960 + }, + { + "epoch": 0.5682483889865261, + "grad_norm": 0.3529878556728363, + "learning_rate": 1.7729349736379616e-05, + "loss": 0.1432, + "step": 970 + }, + { + "epoch": 0.5741066198008201, + "grad_norm": 3.4300451278686523, + "learning_rate": 1.770591681312244e-05, + "loss": 0.1302, + "step": 980 + }, + { + "epoch": 0.5799648506151143, + "grad_norm": 0.2585265636444092, + "learning_rate": 1.7682483889865264e-05, + "loss": 0.0252, + "step": 990 + }, + { + "epoch": 0.5858230814294083, + "grad_norm": 5.724643707275391, + "learning_rate": 1.7659050966608085e-05, + "loss": 0.1441, + "step": 1000 + }, + { + "epoch": 0.5916813122437024, + "grad_norm": 3.814067840576172, + "learning_rate": 1.763561804335091e-05, + "loss": 0.0298, + "step": 1010 + }, + { + "epoch": 0.5975395430579965, + "grad_norm": 0.9568992257118225, + "learning_rate": 1.7612185120093733e-05, + "loss": 0.375, + "step": 1020 + }, + { + "epoch": 0.6033977738722905, + "grad_norm": 0.07911839336156845, + "learning_rate": 1.7588752196836558e-05, + "loss": 0.1334, + "step": 1030 + }, + { + "epoch": 0.6092560046865847, + "grad_norm": 12.957563400268555, + "learning_rate": 1.7565319273579378e-05, + "loss": 0.2123, + "step": 1040 + }, + { + "epoch": 0.6151142355008787, + "grad_norm": 0.25080686807632446, + "learning_rate": 1.7541886350322202e-05, + "loss": 0.0201, + "step": 1050 + }, + { + "epoch": 0.6209724663151728, + "grad_norm": 0.17345930635929108, + "learning_rate": 1.7518453427065027e-05, + "loss": 0.108, + "step": 1060 + }, + { + "epoch": 0.6268306971294669, + "grad_norm": 5.377856731414795, + "learning_rate": 1.749502050380785e-05, + "loss": 0.1263, + "step": 1070 + }, + { + "epoch": 0.632688927943761, + "grad_norm": 0.24767795205116272, + "learning_rate": 1.7471587580550675e-05, + "loss": 0.0413, + "step": 1080 + }, + { + "epoch": 0.6385471587580551, + "grad_norm": 0.0758834108710289, + "learning_rate": 1.74481546572935e-05, + "loss": 0.017, + "step": 1090 + }, + { + "epoch": 0.6444053895723492, + "grad_norm": 4.1260905265808105, + "learning_rate": 1.7424721734036323e-05, + "loss": 0.061, + "step": 1100 + }, + { + "epoch": 0.6502636203866432, + "grad_norm": 0.050045400857925415, + "learning_rate": 1.7401288810779147e-05, + "loss": 0.0523, + "step": 1110 + }, + { + "epoch": 0.6561218512009374, + "grad_norm": 4.139227390289307, + "learning_rate": 1.737785588752197e-05, + "loss": 0.1138, + "step": 1120 + }, + { + "epoch": 0.6619800820152314, + "grad_norm": 2.0994224548339844, + "learning_rate": 1.7354422964264795e-05, + "loss": 0.2773, + "step": 1130 + }, + { + "epoch": 0.6678383128295254, + "grad_norm": 5.285941123962402, + "learning_rate": 1.733099004100762e-05, + "loss": 0.1247, + "step": 1140 + }, + { + "epoch": 0.6736965436438196, + "grad_norm": 0.09476584196090698, + "learning_rate": 1.730755711775044e-05, + "loss": 0.0738, + "step": 1150 + }, + { + "epoch": 0.6795547744581136, + "grad_norm": 0.10948991030454636, + "learning_rate": 1.7284124194493264e-05, + "loss": 0.1981, + "step": 1160 + }, + { + "epoch": 0.6854130052724078, + "grad_norm": 0.179415762424469, + "learning_rate": 1.726069127123609e-05, + "loss": 0.1038, + "step": 1170 + }, + { + "epoch": 0.6912712360867018, + "grad_norm": 4.8998894691467285, + "learning_rate": 1.7237258347978912e-05, + "loss": 0.1813, + "step": 1180 + }, + { + "epoch": 0.6971294669009959, + "grad_norm": 0.17422302067279816, + "learning_rate": 1.7213825424721737e-05, + "loss": 0.0713, + "step": 1190 + }, + { + "epoch": 0.70298769771529, + "grad_norm": 0.6925610899925232, + "learning_rate": 1.7190392501464557e-05, + "loss": 0.0414, + "step": 1200 + }, + { + "epoch": 0.7088459285295841, + "grad_norm": 0.05044275522232056, + "learning_rate": 1.716695957820738e-05, + "loss": 0.1019, + "step": 1210 + }, + { + "epoch": 0.7147041593438781, + "grad_norm": 0.14552544057369232, + "learning_rate": 1.7143526654950206e-05, + "loss": 0.1534, + "step": 1220 + }, + { + "epoch": 0.7205623901581723, + "grad_norm": 7.160550594329834, + "learning_rate": 1.712009373169303e-05, + "loss": 0.185, + "step": 1230 + }, + { + "epoch": 0.7264206209724663, + "grad_norm": 0.20127852261066437, + "learning_rate": 1.7096660808435854e-05, + "loss": 0.0505, + "step": 1240 + }, + { + "epoch": 0.7322788517867604, + "grad_norm": 0.07321985810995102, + "learning_rate": 1.7073227885178678e-05, + "loss": 0.0443, + "step": 1250 + }, + { + "epoch": 0.7381370826010545, + "grad_norm": 1.5325711965560913, + "learning_rate": 1.7049794961921502e-05, + "loss": 0.0673, + "step": 1260 + }, + { + "epoch": 0.7439953134153485, + "grad_norm": 1.1066352128982544, + "learning_rate": 1.7026362038664326e-05, + "loss": 0.1124, + "step": 1270 + }, + { + "epoch": 0.7498535442296427, + "grad_norm": 0.061127688735723495, + "learning_rate": 1.7002929115407147e-05, + "loss": 0.0795, + "step": 1280 + }, + { + "epoch": 0.7557117750439367, + "grad_norm": 0.2831653654575348, + "learning_rate": 1.697949619214997e-05, + "loss": 0.0952, + "step": 1290 + }, + { + "epoch": 0.7615700058582309, + "grad_norm": 0.04763650521636009, + "learning_rate": 1.6956063268892795e-05, + "loss": 0.227, + "step": 1300 + }, + { + "epoch": 0.7674282366725249, + "grad_norm": 0.043026477098464966, + "learning_rate": 1.693263034563562e-05, + "loss": 0.1314, + "step": 1310 + }, + { + "epoch": 0.773286467486819, + "grad_norm": 0.0663742944598198, + "learning_rate": 1.6909197422378443e-05, + "loss": 0.1627, + "step": 1320 + }, + { + "epoch": 0.7791446983011131, + "grad_norm": 0.09994468092918396, + "learning_rate": 1.6885764499121267e-05, + "loss": 0.0781, + "step": 1330 + }, + { + "epoch": 0.7850029291154071, + "grad_norm": 0.07375376671552658, + "learning_rate": 1.686233157586409e-05, + "loss": 0.1963, + "step": 1340 + }, + { + "epoch": 0.7908611599297012, + "grad_norm": 0.5165359973907471, + "learning_rate": 1.6838898652606916e-05, + "loss": 0.275, + "step": 1350 + }, + { + "epoch": 0.7967193907439953, + "grad_norm": 4.770512580871582, + "learning_rate": 1.681546572934974e-05, + "loss": 0.0923, + "step": 1360 + }, + { + "epoch": 0.8025776215582894, + "grad_norm": 0.10542096197605133, + "learning_rate": 1.679203280609256e-05, + "loss": 0.0993, + "step": 1370 + }, + { + "epoch": 0.8084358523725835, + "grad_norm": 0.08764227479696274, + "learning_rate": 1.6768599882835385e-05, + "loss": 0.0535, + "step": 1380 + }, + { + "epoch": 0.8142940831868776, + "grad_norm": 3.3957512378692627, + "learning_rate": 1.674516695957821e-05, + "loss": 0.0141, + "step": 1390 + }, + { + "epoch": 0.8201523140011716, + "grad_norm": 0.03514677286148071, + "learning_rate": 1.6721734036321033e-05, + "loss": 0.0481, + "step": 1400 + }, + { + "epoch": 0.8260105448154658, + "grad_norm": 0.054979559034109116, + "learning_rate": 1.6698301113063854e-05, + "loss": 0.0223, + "step": 1410 + }, + { + "epoch": 0.8318687756297598, + "grad_norm": 6.025042533874512, + "learning_rate": 1.6674868189806678e-05, + "loss": 0.2666, + "step": 1420 + }, + { + "epoch": 0.8377270064440538, + "grad_norm": 0.11764468997716904, + "learning_rate": 1.6651435266549502e-05, + "loss": 0.1323, + "step": 1430 + }, + { + "epoch": 0.843585237258348, + "grad_norm": 0.08743251860141754, + "learning_rate": 1.6628002343292326e-05, + "loss": 0.1267, + "step": 1440 + }, + { + "epoch": 0.849443468072642, + "grad_norm": 0.16297052800655365, + "learning_rate": 1.660456942003515e-05, + "loss": 0.1671, + "step": 1450 + }, + { + "epoch": 0.8553016988869362, + "grad_norm": 0.12175700068473816, + "learning_rate": 1.6581136496777974e-05, + "loss": 0.0084, + "step": 1460 + }, + { + "epoch": 0.8611599297012302, + "grad_norm": 4.576499938964844, + "learning_rate": 1.6557703573520798e-05, + "loss": 0.0398, + "step": 1470 + }, + { + "epoch": 0.8670181605155243, + "grad_norm": 1.2775115966796875, + "learning_rate": 1.6534270650263622e-05, + "loss": 0.0634, + "step": 1480 + }, + { + "epoch": 0.8728763913298184, + "grad_norm": 4.699284553527832, + "learning_rate": 1.6510837727006446e-05, + "loss": 0.0668, + "step": 1490 + }, + { + "epoch": 0.8787346221441125, + "grad_norm": 0.12817470729351044, + "learning_rate": 1.648740480374927e-05, + "loss": 0.0749, + "step": 1500 + }, + { + "epoch": 0.8845928529584065, + "grad_norm": 5.865419864654541, + "learning_rate": 1.6463971880492095e-05, + "loss": 0.115, + "step": 1510 + }, + { + "epoch": 0.8904510837727007, + "grad_norm": 3.5330662727355957, + "learning_rate": 1.644053895723492e-05, + "loss": 0.1223, + "step": 1520 + }, + { + "epoch": 0.8963093145869947, + "grad_norm": 0.0576438307762146, + "learning_rate": 1.641710603397774e-05, + "loss": 0.2236, + "step": 1530 + }, + { + "epoch": 0.9021675454012889, + "grad_norm": 0.03019624389708042, + "learning_rate": 1.6393673110720564e-05, + "loss": 0.062, + "step": 1540 + }, + { + "epoch": 0.9080257762155829, + "grad_norm": 0.11998365819454193, + "learning_rate": 1.6370240187463388e-05, + "loss": 0.1792, + "step": 1550 + }, + { + "epoch": 0.9138840070298769, + "grad_norm": 7.0185370445251465, + "learning_rate": 1.6346807264206212e-05, + "loss": 0.1184, + "step": 1560 + }, + { + "epoch": 0.9197422378441711, + "grad_norm": 0.26941293478012085, + "learning_rate": 1.6323374340949033e-05, + "loss": 0.0451, + "step": 1570 + }, + { + "epoch": 0.9256004686584651, + "grad_norm": 0.11105812340974808, + "learning_rate": 1.6299941417691857e-05, + "loss": 0.0957, + "step": 1580 + }, + { + "epoch": 0.9314586994727593, + "grad_norm": 7.551598072052002, + "learning_rate": 1.627650849443468e-05, + "loss": 0.063, + "step": 1590 + }, + { + "epoch": 0.9373169302870533, + "grad_norm": 0.03378705307841301, + "learning_rate": 1.6253075571177505e-05, + "loss": 0.0234, + "step": 1600 + }, + { + "epoch": 0.9431751611013474, + "grad_norm": 0.25736942887306213, + "learning_rate": 1.622964264792033e-05, + "loss": 0.041, + "step": 1610 + }, + { + "epoch": 0.9490333919156415, + "grad_norm": 0.14889341592788696, + "learning_rate": 1.6206209724663153e-05, + "loss": 0.0337, + "step": 1620 + }, + { + "epoch": 0.9548916227299356, + "grad_norm": 0.05913722887635231, + "learning_rate": 1.6182776801405977e-05, + "loss": 0.1896, + "step": 1630 + }, + { + "epoch": 0.9607498535442296, + "grad_norm": 0.3258120119571686, + "learning_rate": 1.61593438781488e-05, + "loss": 0.1096, + "step": 1640 + }, + { + "epoch": 0.9666080843585237, + "grad_norm": 0.05084146186709404, + "learning_rate": 1.6135910954891626e-05, + "loss": 0.1543, + "step": 1650 + }, + { + "epoch": 0.9724663151728178, + "grad_norm": 3.364567518234253, + "learning_rate": 1.611247803163445e-05, + "loss": 0.2953, + "step": 1660 + }, + { + "epoch": 0.9783245459871119, + "grad_norm": 0.07459431141614914, + "learning_rate": 1.608904510837727e-05, + "loss": 0.0135, + "step": 1670 + }, + { + "epoch": 0.984182776801406, + "grad_norm": 4.472078323364258, + "learning_rate": 1.6065612185120094e-05, + "loss": 0.1756, + "step": 1680 + }, + { + "epoch": 0.9900410076157, + "grad_norm": 0.0766572579741478, + "learning_rate": 1.604217926186292e-05, + "loss": 0.0382, + "step": 1690 + }, + { + "epoch": 0.9958992384299942, + "grad_norm": 4.629612445831299, + "learning_rate": 1.6018746338605743e-05, + "loss": 0.1475, + "step": 1700 + }, + { + "epoch": 1.0017574692442883, + "grad_norm": 2.2122583389282227, + "learning_rate": 1.5995313415348567e-05, + "loss": 0.007, + "step": 1710 + }, + { + "epoch": 1.0076157000585824, + "grad_norm": 2.1959640979766846, + "learning_rate": 1.597188049209139e-05, + "loss": 0.0108, + "step": 1720 + }, + { + "epoch": 1.0134739308728764, + "grad_norm": 6.177903652191162, + "learning_rate": 1.594844756883421e-05, + "loss": 0.1057, + "step": 1730 + }, + { + "epoch": 1.0193321616871704, + "grad_norm": 0.07769495248794556, + "learning_rate": 1.5925014645577036e-05, + "loss": 0.062, + "step": 1740 + }, + { + "epoch": 1.0251903925014645, + "grad_norm": 5.370426177978516, + "learning_rate": 1.590158172231986e-05, + "loss": 0.0702, + "step": 1750 + }, + { + "epoch": 1.0310486233157587, + "grad_norm": 0.06684985756874084, + "learning_rate": 1.5878148799062684e-05, + "loss": 0.1196, + "step": 1760 + }, + { + "epoch": 1.0369068541300528, + "grad_norm": 0.277066171169281, + "learning_rate": 1.5854715875805508e-05, + "loss": 0.0566, + "step": 1770 + }, + { + "epoch": 1.0427650849443468, + "grad_norm": 2.941352367401123, + "learning_rate": 1.5831282952548332e-05, + "loss": 0.069, + "step": 1780 + }, + { + "epoch": 1.0486233157586409, + "grad_norm": 0.050202783197164536, + "learning_rate": 1.5807850029291156e-05, + "loss": 0.0451, + "step": 1790 + }, + { + "epoch": 1.054481546572935, + "grad_norm": 0.04496518895030022, + "learning_rate": 1.5784417106033977e-05, + "loss": 0.1135, + "step": 1800 + }, + { + "epoch": 1.060339777387229, + "grad_norm": 7.685583114624023, + "learning_rate": 1.57609841827768e-05, + "loss": 0.2183, + "step": 1810 + }, + { + "epoch": 1.0661980082015232, + "grad_norm": 0.041232019662857056, + "learning_rate": 1.5737551259519625e-05, + "loss": 0.0681, + "step": 1820 + }, + { + "epoch": 1.0720562390158173, + "grad_norm": 0.11009787768125534, + "learning_rate": 1.571411833626245e-05, + "loss": 0.1064, + "step": 1830 + }, + { + "epoch": 1.0779144698301113, + "grad_norm": 10.8427095413208, + "learning_rate": 1.5690685413005274e-05, + "loss": 0.0494, + "step": 1840 + }, + { + "epoch": 1.0837727006444053, + "grad_norm": 0.10766121000051498, + "learning_rate": 1.5667252489748098e-05, + "loss": 0.161, + "step": 1850 + }, + { + "epoch": 1.0896309314586994, + "grad_norm": 6.553893566131592, + "learning_rate": 1.5643819566490922e-05, + "loss": 0.1559, + "step": 1860 + }, + { + "epoch": 1.0954891622729936, + "grad_norm": 10.493359565734863, + "learning_rate": 1.5620386643233746e-05, + "loss": 0.0245, + "step": 1870 + }, + { + "epoch": 1.1013473930872877, + "grad_norm": 8.9678373336792, + "learning_rate": 1.559695371997657e-05, + "loss": 0.1051, + "step": 1880 + }, + { + "epoch": 1.1072056239015817, + "grad_norm": 0.08524399250745773, + "learning_rate": 1.5573520796719394e-05, + "loss": 0.0044, + "step": 1890 + }, + { + "epoch": 1.1130638547158758, + "grad_norm": 0.027697084471583366, + "learning_rate": 1.5550087873462215e-05, + "loss": 0.0847, + "step": 1900 + }, + { + "epoch": 1.1189220855301698, + "grad_norm": 0.2592675983905792, + "learning_rate": 1.552665495020504e-05, + "loss": 0.0702, + "step": 1910 + }, + { + "epoch": 1.124780316344464, + "grad_norm": 0.033791959285736084, + "learning_rate": 1.5503222026947863e-05, + "loss": 0.1042, + "step": 1920 + }, + { + "epoch": 1.1306385471587581, + "grad_norm": 5.939085960388184, + "learning_rate": 1.5479789103690687e-05, + "loss": 0.0095, + "step": 1930 + }, + { + "epoch": 1.1364967779730522, + "grad_norm": 0.7582628726959229, + "learning_rate": 1.5456356180433508e-05, + "loss": 0.0844, + "step": 1940 + }, + { + "epoch": 1.1423550087873462, + "grad_norm": 0.34229347109794617, + "learning_rate": 1.5432923257176332e-05, + "loss": 0.0173, + "step": 1950 + }, + { + "epoch": 1.1482132396016402, + "grad_norm": 0.24234962463378906, + "learning_rate": 1.5409490333919156e-05, + "loss": 0.2988, + "step": 1960 + }, + { + "epoch": 1.1540714704159343, + "grad_norm": 0.21282519400119781, + "learning_rate": 1.538605741066198e-05, + "loss": 0.0115, + "step": 1970 + }, + { + "epoch": 1.1599297012302285, + "grad_norm": 0.2657891809940338, + "learning_rate": 1.5362624487404804e-05, + "loss": 0.1126, + "step": 1980 + }, + { + "epoch": 1.1657879320445226, + "grad_norm": 0.2175203263759613, + "learning_rate": 1.533919156414763e-05, + "loss": 0.2326, + "step": 1990 + }, + { + "epoch": 1.1716461628588166, + "grad_norm": 7.306258201599121, + "learning_rate": 1.5315758640890453e-05, + "loss": 0.1161, + "step": 2000 + }, + { + "epoch": 1.1775043936731107, + "grad_norm": 0.4325994849205017, + "learning_rate": 1.5292325717633277e-05, + "loss": 0.1118, + "step": 2010 + }, + { + "epoch": 1.1833626244874047, + "grad_norm": 0.03175567463040352, + "learning_rate": 1.52688927943761e-05, + "loss": 0.1036, + "step": 2020 + }, + { + "epoch": 1.189220855301699, + "grad_norm": 0.06606226414442062, + "learning_rate": 1.5245459871118923e-05, + "loss": 0.0048, + "step": 2030 + }, + { + "epoch": 1.195079086115993, + "grad_norm": 4.948124408721924, + "learning_rate": 1.5222026947861747e-05, + "loss": 0.1035, + "step": 2040 + }, + { + "epoch": 1.200937316930287, + "grad_norm": 0.05158844590187073, + "learning_rate": 1.5198594024604571e-05, + "loss": 0.2282, + "step": 2050 + }, + { + "epoch": 1.206795547744581, + "grad_norm": 5.9457106590271, + "learning_rate": 1.5175161101347396e-05, + "loss": 0.0251, + "step": 2060 + }, + { + "epoch": 1.2126537785588751, + "grad_norm": 0.41592299938201904, + "learning_rate": 1.5151728178090216e-05, + "loss": 0.1272, + "step": 2070 + }, + { + "epoch": 1.2185120093731694, + "grad_norm": 0.031298503279685974, + "learning_rate": 1.512829525483304e-05, + "loss": 0.0719, + "step": 2080 + }, + { + "epoch": 1.2243702401874634, + "grad_norm": 0.05066341161727905, + "learning_rate": 1.5104862331575865e-05, + "loss": 0.0489, + "step": 2090 + }, + { + "epoch": 1.2302284710017575, + "grad_norm": 0.06502586603164673, + "learning_rate": 1.5081429408318689e-05, + "loss": 0.0045, + "step": 2100 + }, + { + "epoch": 1.2360867018160515, + "grad_norm": 0.22800350189208984, + "learning_rate": 1.5057996485061513e-05, + "loss": 0.0287, + "step": 2110 + }, + { + "epoch": 1.2419449326303456, + "grad_norm": 0.05417035520076752, + "learning_rate": 1.5034563561804337e-05, + "loss": 0.0656, + "step": 2120 + }, + { + "epoch": 1.2478031634446398, + "grad_norm": 0.09729498624801636, + "learning_rate": 1.501113063854716e-05, + "loss": 0.1163, + "step": 2130 + }, + { + "epoch": 1.2536613942589339, + "grad_norm": 0.03419837728142738, + "learning_rate": 1.4987697715289983e-05, + "loss": 0.045, + "step": 2140 + }, + { + "epoch": 1.259519625073228, + "grad_norm": 35.93345642089844, + "learning_rate": 1.4964264792032807e-05, + "loss": 0.2297, + "step": 2150 + }, + { + "epoch": 1.265377855887522, + "grad_norm": 4.408044815063477, + "learning_rate": 1.4940831868775632e-05, + "loss": 0.0762, + "step": 2160 + }, + { + "epoch": 1.271236086701816, + "grad_norm": 0.2184348851442337, + "learning_rate": 1.4917398945518456e-05, + "loss": 0.042, + "step": 2170 + }, + { + "epoch": 1.2770943175161102, + "grad_norm": 0.036489009857177734, + "learning_rate": 1.489396602226128e-05, + "loss": 0.0239, + "step": 2180 + }, + { + "epoch": 1.2829525483304043, + "grad_norm": 0.10858841240406036, + "learning_rate": 1.4870533099004102e-05, + "loss": 0.0724, + "step": 2190 + }, + { + "epoch": 1.2888107791446983, + "grad_norm": 0.1485690176486969, + "learning_rate": 1.4847100175746925e-05, + "loss": 0.077, + "step": 2200 + }, + { + "epoch": 1.2946690099589924, + "grad_norm": 0.3935866951942444, + "learning_rate": 1.4823667252489749e-05, + "loss": 0.1539, + "step": 2210 + }, + { + "epoch": 1.3005272407732864, + "grad_norm": 0.10183463990688324, + "learning_rate": 1.4800234329232573e-05, + "loss": 0.1011, + "step": 2220 + }, + { + "epoch": 1.3063854715875807, + "grad_norm": 0.23003660142421722, + "learning_rate": 1.4776801405975395e-05, + "loss": 0.0398, + "step": 2230 + }, + { + "epoch": 1.3122437024018747, + "grad_norm": 1.0053473711013794, + "learning_rate": 1.475336848271822e-05, + "loss": 0.0114, + "step": 2240 + }, + { + "epoch": 1.3181019332161688, + "grad_norm": 0.0832851454615593, + "learning_rate": 1.4729935559461044e-05, + "loss": 0.0889, + "step": 2250 + }, + { + "epoch": 1.3239601640304628, + "grad_norm": 0.044370219111442566, + "learning_rate": 1.4706502636203868e-05, + "loss": 0.0047, + "step": 2260 + }, + { + "epoch": 1.3298183948447568, + "grad_norm": 2.207583427429199, + "learning_rate": 1.4683069712946692e-05, + "loss": 0.1405, + "step": 2270 + }, + { + "epoch": 1.335676625659051, + "grad_norm": 0.2738548219203949, + "learning_rate": 1.4659636789689516e-05, + "loss": 0.1053, + "step": 2280 + }, + { + "epoch": 1.341534856473345, + "grad_norm": 5.799202919006348, + "learning_rate": 1.4636203866432338e-05, + "loss": 0.1031, + "step": 2290 + }, + { + "epoch": 1.3473930872876392, + "grad_norm": 0.03329119086265564, + "learning_rate": 1.4612770943175162e-05, + "loss": 0.0167, + "step": 2300 + }, + { + "epoch": 1.3532513181019332, + "grad_norm": 0.10547787696123123, + "learning_rate": 1.4589338019917987e-05, + "loss": 0.0068, + "step": 2310 + }, + { + "epoch": 1.3591095489162273, + "grad_norm": 14.383464813232422, + "learning_rate": 1.456590509666081e-05, + "loss": 0.0457, + "step": 2320 + }, + { + "epoch": 1.3649677797305213, + "grad_norm": 1.1754510402679443, + "learning_rate": 1.4542472173403633e-05, + "loss": 0.1441, + "step": 2330 + }, + { + "epoch": 1.3708260105448153, + "grad_norm": 0.1931016892194748, + "learning_rate": 1.4519039250146455e-05, + "loss": 0.0026, + "step": 2340 + }, + { + "epoch": 1.3766842413591096, + "grad_norm": 0.036222219467163086, + "learning_rate": 1.449560632688928e-05, + "loss": 0.0575, + "step": 2350 + }, + { + "epoch": 1.3825424721734036, + "grad_norm": 0.05857452005147934, + "learning_rate": 1.4472173403632104e-05, + "loss": 0.0035, + "step": 2360 + }, + { + "epoch": 1.3884007029876977, + "grad_norm": 0.027153009548783302, + "learning_rate": 1.4448740480374928e-05, + "loss": 0.0071, + "step": 2370 + }, + { + "epoch": 1.3942589338019917, + "grad_norm": 0.030506083741784096, + "learning_rate": 1.4425307557117752e-05, + "loss": 0.0501, + "step": 2380 + }, + { + "epoch": 1.4001171646162858, + "grad_norm": 0.24087435007095337, + "learning_rate": 1.4401874633860576e-05, + "loss": 0.0457, + "step": 2390 + }, + { + "epoch": 1.40597539543058, + "grad_norm": 7.0238566398620605, + "learning_rate": 1.4378441710603398e-05, + "loss": 0.0856, + "step": 2400 + }, + { + "epoch": 1.411833626244874, + "grad_norm": 17.90679359436035, + "learning_rate": 1.4355008787346223e-05, + "loss": 0.1618, + "step": 2410 + }, + { + "epoch": 1.4176918570591681, + "grad_norm": 0.031517572700977325, + "learning_rate": 1.4331575864089047e-05, + "loss": 0.0033, + "step": 2420 + }, + { + "epoch": 1.4235500878734622, + "grad_norm": 0.8404099345207214, + "learning_rate": 1.430814294083187e-05, + "loss": 0.0529, + "step": 2430 + }, + { + "epoch": 1.4294083186877562, + "grad_norm": 0.01688993163406849, + "learning_rate": 1.4284710017574695e-05, + "loss": 0.0032, + "step": 2440 + }, + { + "epoch": 1.4352665495020505, + "grad_norm": 0.06315304338932037, + "learning_rate": 1.4261277094317519e-05, + "loss": 0.1193, + "step": 2450 + }, + { + "epoch": 1.4411247803163445, + "grad_norm": 0.04080997034907341, + "learning_rate": 1.423784417106034e-05, + "loss": 0.0028, + "step": 2460 + }, + { + "epoch": 1.4469830111306385, + "grad_norm": 0.023772750049829483, + "learning_rate": 1.4214411247803164e-05, + "loss": 0.0127, + "step": 2470 + }, + { + "epoch": 1.4528412419449326, + "grad_norm": 0.015330899506807327, + "learning_rate": 1.4190978324545988e-05, + "loss": 0.0706, + "step": 2480 + }, + { + "epoch": 1.4586994727592266, + "grad_norm": 0.4532860517501831, + "learning_rate": 1.4167545401288812e-05, + "loss": 0.0056, + "step": 2490 + }, + { + "epoch": 1.4645577035735209, + "grad_norm": 0.24234826862812042, + "learning_rate": 1.4144112478031635e-05, + "loss": 0.0024, + "step": 2500 + }, + { + "epoch": 1.470415934387815, + "grad_norm": 0.6973457932472229, + "learning_rate": 1.4120679554774459e-05, + "loss": 0.2314, + "step": 2510 + }, + { + "epoch": 1.476274165202109, + "grad_norm": 8.073694229125977, + "learning_rate": 1.4097246631517283e-05, + "loss": 0.0922, + "step": 2520 + }, + { + "epoch": 1.482132396016403, + "grad_norm": 0.03306486830115318, + "learning_rate": 1.4073813708260107e-05, + "loss": 0.1651, + "step": 2530 + }, + { + "epoch": 1.487990626830697, + "grad_norm": 0.10176576673984528, + "learning_rate": 1.4050380785002931e-05, + "loss": 0.0416, + "step": 2540 + }, + { + "epoch": 1.4938488576449913, + "grad_norm": 0.10366008430719376, + "learning_rate": 1.4026947861745755e-05, + "loss": 0.0615, + "step": 2550 + }, + { + "epoch": 1.4997070884592854, + "grad_norm": 0.21881893277168274, + "learning_rate": 1.4003514938488578e-05, + "loss": 0.0196, + "step": 2560 + }, + { + "epoch": 1.5055653192735794, + "grad_norm": 0.43505847454071045, + "learning_rate": 1.3980082015231402e-05, + "loss": 0.0996, + "step": 2570 + }, + { + "epoch": 1.5114235500878734, + "grad_norm": 7.067519187927246, + "learning_rate": 1.3956649091974226e-05, + "loss": 0.1073, + "step": 2580 + }, + { + "epoch": 1.5172817809021675, + "grad_norm": 0.022074054926633835, + "learning_rate": 1.3933216168717048e-05, + "loss": 0.0023, + "step": 2590 + }, + { + "epoch": 1.5231400117164617, + "grad_norm": 0.021333998069167137, + "learning_rate": 1.390978324545987e-05, + "loss": 0.0192, + "step": 2600 + }, + { + "epoch": 1.5289982425307556, + "grad_norm": 4.094031810760498, + "learning_rate": 1.3886350322202695e-05, + "loss": 0.1487, + "step": 2610 + }, + { + "epoch": 1.5348564733450498, + "grad_norm": 0.028797738254070282, + "learning_rate": 1.3862917398945519e-05, + "loss": 0.0181, + "step": 2620 + }, + { + "epoch": 1.5407147041593439, + "grad_norm": 0.37651917338371277, + "learning_rate": 1.3839484475688343e-05, + "loss": 0.0397, + "step": 2630 + }, + { + "epoch": 1.546572934973638, + "grad_norm": 0.04619598761200905, + "learning_rate": 1.3816051552431167e-05, + "loss": 0.0334, + "step": 2640 + }, + { + "epoch": 1.5524311657879322, + "grad_norm": 0.09797985851764679, + "learning_rate": 1.3792618629173991e-05, + "loss": 0.2177, + "step": 2650 + }, + { + "epoch": 1.558289396602226, + "grad_norm": 4.648836612701416, + "learning_rate": 1.3769185705916814e-05, + "loss": 0.1014, + "step": 2660 + }, + { + "epoch": 1.5641476274165202, + "grad_norm": 0.013686669990420341, + "learning_rate": 1.3745752782659638e-05, + "loss": 0.102, + "step": 2670 + }, + { + "epoch": 1.5700058582308143, + "grad_norm": 0.33198732137680054, + "learning_rate": 1.3722319859402462e-05, + "loss": 0.0181, + "step": 2680 + }, + { + "epoch": 1.5758640890451083, + "grad_norm": 9.264932632446289, + "learning_rate": 1.3698886936145286e-05, + "loss": 0.1374, + "step": 2690 + }, + { + "epoch": 1.5817223198594026, + "grad_norm": 4.808820724487305, + "learning_rate": 1.367545401288811e-05, + "loss": 0.0389, + "step": 2700 + }, + { + "epoch": 1.5875805506736964, + "grad_norm": 1.272582769393921, + "learning_rate": 1.3652021089630934e-05, + "loss": 0.0035, + "step": 2710 + }, + { + "epoch": 1.5934387814879907, + "grad_norm": 0.05750690773129463, + "learning_rate": 1.3628588166373755e-05, + "loss": 0.0981, + "step": 2720 + }, + { + "epoch": 1.5992970123022847, + "grad_norm": 0.174026757478714, + "learning_rate": 1.3605155243116579e-05, + "loss": 0.063, + "step": 2730 + }, + { + "epoch": 1.6051552431165788, + "grad_norm": 0.034066952764987946, + "learning_rate": 1.3581722319859403e-05, + "loss": 0.0291, + "step": 2740 + }, + { + "epoch": 1.611013473930873, + "grad_norm": 0.16694338619709015, + "learning_rate": 1.3558289396602227e-05, + "loss": 0.0909, + "step": 2750 + }, + { + "epoch": 1.6168717047451668, + "grad_norm": 1.0258171558380127, + "learning_rate": 1.353485647334505e-05, + "loss": 0.0612, + "step": 2760 + }, + { + "epoch": 1.622729935559461, + "grad_norm": 5.220030784606934, + "learning_rate": 1.3511423550087874e-05, + "loss": 0.1047, + "step": 2770 + }, + { + "epoch": 1.6285881663737551, + "grad_norm": 1.811539649963379, + "learning_rate": 1.3487990626830698e-05, + "loss": 0.1035, + "step": 2780 + }, + { + "epoch": 1.6344463971880492, + "grad_norm": 0.04276169091463089, + "learning_rate": 1.3464557703573522e-05, + "loss": 0.0646, + "step": 2790 + }, + { + "epoch": 1.6403046280023434, + "grad_norm": 0.025083236396312714, + "learning_rate": 1.3441124780316346e-05, + "loss": 0.0682, + "step": 2800 + }, + { + "epoch": 1.6461628588166373, + "grad_norm": 0.5909777879714966, + "learning_rate": 1.341769185705917e-05, + "loss": 0.0625, + "step": 2810 + }, + { + "epoch": 1.6520210896309315, + "grad_norm": 0.03190384432673454, + "learning_rate": 1.3394258933801994e-05, + "loss": 0.0549, + "step": 2820 + }, + { + "epoch": 1.6578793204452256, + "grad_norm": 0.008607199415564537, + "learning_rate": 1.3370826010544817e-05, + "loss": 0.1849, + "step": 2830 + }, + { + "epoch": 1.6637375512595196, + "grad_norm": 0.6105258464813232, + "learning_rate": 1.3347393087287641e-05, + "loss": 0.0044, + "step": 2840 + }, + { + "epoch": 1.6695957820738139, + "grad_norm": 0.008611824363470078, + "learning_rate": 1.3323960164030465e-05, + "loss": 0.0542, + "step": 2850 + }, + { + "epoch": 1.6754540128881077, + "grad_norm": 9.882821083068848, + "learning_rate": 1.3300527240773287e-05, + "loss": 0.0708, + "step": 2860 + }, + { + "epoch": 1.681312243702402, + "grad_norm": 0.09280505031347275, + "learning_rate": 1.327709431751611e-05, + "loss": 0.0021, + "step": 2870 + }, + { + "epoch": 1.687170474516696, + "grad_norm": 0.016819052398204803, + "learning_rate": 1.3253661394258934e-05, + "loss": 0.0595, + "step": 2880 + }, + { + "epoch": 1.69302870533099, + "grad_norm": 0.06223960965871811, + "learning_rate": 1.3230228471001758e-05, + "loss": 0.1019, + "step": 2890 + }, + { + "epoch": 1.698886936145284, + "grad_norm": 0.012930012308061123, + "learning_rate": 1.3206795547744582e-05, + "loss": 0.0211, + "step": 2900 + }, + { + "epoch": 1.7047451669595781, + "grad_norm": 10.481481552124023, + "learning_rate": 1.3183362624487406e-05, + "loss": 0.2043, + "step": 2910 + }, + { + "epoch": 1.7106033977738724, + "grad_norm": 0.48479416966438293, + "learning_rate": 1.315992970123023e-05, + "loss": 0.1616, + "step": 2920 + }, + { + "epoch": 1.7164616285881664, + "grad_norm": 0.01907259412109852, + "learning_rate": 1.3136496777973053e-05, + "loss": 0.0932, + "step": 2930 + }, + { + "epoch": 1.7223198594024605, + "grad_norm": 0.20193631947040558, + "learning_rate": 1.3113063854715877e-05, + "loss": 0.081, + "step": 2940 + }, + { + "epoch": 1.7281780902167545, + "grad_norm": 0.059374675154685974, + "learning_rate": 1.3089630931458701e-05, + "loss": 0.1085, + "step": 2950 + }, + { + "epoch": 1.7340363210310485, + "grad_norm": 0.023819535970687866, + "learning_rate": 1.3066198008201525e-05, + "loss": 0.0626, + "step": 2960 + }, + { + "epoch": 1.7398945518453428, + "grad_norm": 6.034724712371826, + "learning_rate": 1.304276508494435e-05, + "loss": 0.0466, + "step": 2970 + }, + { + "epoch": 1.7457527826596366, + "grad_norm": 0.017972761765122414, + "learning_rate": 1.3019332161687173e-05, + "loss": 0.0877, + "step": 2980 + }, + { + "epoch": 1.751611013473931, + "grad_norm": 0.4916331171989441, + "learning_rate": 1.2995899238429994e-05, + "loss": 0.1017, + "step": 2990 + }, + { + "epoch": 1.757469244288225, + "grad_norm": 0.07853368669748306, + "learning_rate": 1.2972466315172818e-05, + "loss": 0.0699, + "step": 3000 + }, + { + "epoch": 1.763327475102519, + "grad_norm": 0.03654066100716591, + "learning_rate": 1.2949033391915642e-05, + "loss": 0.0959, + "step": 3010 + }, + { + "epoch": 1.7691857059168132, + "grad_norm": 0.3065502345561981, + "learning_rate": 1.2925600468658466e-05, + "loss": 0.0642, + "step": 3020 + }, + { + "epoch": 1.775043936731107, + "grad_norm": 0.020343072712421417, + "learning_rate": 1.2902167545401289e-05, + "loss": 0.06, + "step": 3030 + }, + { + "epoch": 1.7809021675454013, + "grad_norm": 0.11528685688972473, + "learning_rate": 1.2878734622144113e-05, + "loss": 0.0314, + "step": 3040 + }, + { + "epoch": 1.7867603983596954, + "grad_norm": 0.08143705874681473, + "learning_rate": 1.2855301698886937e-05, + "loss": 0.0786, + "step": 3050 + }, + { + "epoch": 1.7926186291739894, + "grad_norm": 0.06304443627595901, + "learning_rate": 1.2831868775629761e-05, + "loss": 0.1096, + "step": 3060 + }, + { + "epoch": 1.7984768599882837, + "grad_norm": 0.020441118627786636, + "learning_rate": 1.2808435852372585e-05, + "loss": 0.1275, + "step": 3070 + }, + { + "epoch": 1.8043350908025775, + "grad_norm": 10.3707914352417, + "learning_rate": 1.278500292911541e-05, + "loss": 0.0262, + "step": 3080 + }, + { + "epoch": 1.8101933216168717, + "grad_norm": 0.03443164750933647, + "learning_rate": 1.2761570005858232e-05, + "loss": 0.0024, + "step": 3090 + }, + { + "epoch": 1.8160515524311658, + "grad_norm": 0.014218361116945744, + "learning_rate": 1.2738137082601056e-05, + "loss": 0.0672, + "step": 3100 + }, + { + "epoch": 1.8219097832454598, + "grad_norm": 0.12258031219244003, + "learning_rate": 1.271470415934388e-05, + "loss": 0.0822, + "step": 3110 + }, + { + "epoch": 1.827768014059754, + "grad_norm": 0.02281299978494644, + "learning_rate": 1.2691271236086702e-05, + "loss": 0.1317, + "step": 3120 + }, + { + "epoch": 1.833626244874048, + "grad_norm": 0.25066182017326355, + "learning_rate": 1.2667838312829525e-05, + "loss": 0.0247, + "step": 3130 + }, + { + "epoch": 1.8394844756883422, + "grad_norm": 0.01159273274242878, + "learning_rate": 1.2644405389572349e-05, + "loss": 0.0727, + "step": 3140 + }, + { + "epoch": 1.8453427065026362, + "grad_norm": 2.5836172103881836, + "learning_rate": 1.2620972466315173e-05, + "loss": 0.033, + "step": 3150 + }, + { + "epoch": 1.8512009373169303, + "grad_norm": 0.018062349408864975, + "learning_rate": 1.2597539543057997e-05, + "loss": 0.2145, + "step": 3160 + }, + { + "epoch": 1.8570591681312245, + "grad_norm": 6.357547283172607, + "learning_rate": 1.2574106619800821e-05, + "loss": 0.0635, + "step": 3170 + }, + { + "epoch": 1.8629173989455183, + "grad_norm": 0.021944306790828705, + "learning_rate": 1.2550673696543645e-05, + "loss": 0.0821, + "step": 3180 + }, + { + "epoch": 1.8687756297598126, + "grad_norm": 0.19342736899852753, + "learning_rate": 1.2527240773286468e-05, + "loss": 0.0994, + "step": 3190 + }, + { + "epoch": 1.8746338605741066, + "grad_norm": 0.1281069964170456, + "learning_rate": 1.2503807850029292e-05, + "loss": 0.2058, + "step": 3200 + }, + { + "epoch": 1.8804920913884007, + "grad_norm": 0.020071441307663918, + "learning_rate": 1.2480374926772116e-05, + "loss": 0.0817, + "step": 3210 + }, + { + "epoch": 1.886350322202695, + "grad_norm": 0.023471413180232048, + "learning_rate": 1.245694200351494e-05, + "loss": 0.0627, + "step": 3220 + }, + { + "epoch": 1.8922085530169888, + "grad_norm": 0.0315391905605793, + "learning_rate": 1.2433509080257764e-05, + "loss": 0.0035, + "step": 3230 + }, + { + "epoch": 1.898066783831283, + "grad_norm": 0.01387933362275362, + "learning_rate": 1.2410076157000588e-05, + "loss": 0.0019, + "step": 3240 + }, + { + "epoch": 1.903925014645577, + "grad_norm": 0.08367707580327988, + "learning_rate": 1.238664323374341e-05, + "loss": 0.0023, + "step": 3250 + }, + { + "epoch": 1.909783245459871, + "grad_norm": 0.020648911595344543, + "learning_rate": 1.2363210310486233e-05, + "loss": 0.0334, + "step": 3260 + }, + { + "epoch": 1.9156414762741654, + "grad_norm": 0.2322312295436859, + "learning_rate": 1.2339777387229057e-05, + "loss": 0.057, + "step": 3270 + }, + { + "epoch": 1.9214997070884592, + "grad_norm": 1.6584337949752808, + "learning_rate": 1.2316344463971882e-05, + "loss": 0.0028, + "step": 3280 + }, + { + "epoch": 1.9273579379027534, + "grad_norm": 0.12671466171741486, + "learning_rate": 1.2292911540714706e-05, + "loss": 0.1286, + "step": 3290 + }, + { + "epoch": 1.9332161687170475, + "grad_norm": 0.012327142059803009, + "learning_rate": 1.2269478617457528e-05, + "loss": 0.209, + "step": 3300 + }, + { + "epoch": 1.9390743995313415, + "grad_norm": 0.011200600303709507, + "learning_rate": 1.2246045694200352e-05, + "loss": 0.0742, + "step": 3310 + }, + { + "epoch": 1.9449326303456356, + "grad_norm": 0.019261429086327553, + "learning_rate": 1.2222612770943176e-05, + "loss": 0.0462, + "step": 3320 + }, + { + "epoch": 1.9507908611599296, + "grad_norm": 2.897488594055176, + "learning_rate": 1.2199179847686e-05, + "loss": 0.0962, + "step": 3330 + }, + { + "epoch": 1.9566490919742239, + "grad_norm": 0.047658707946538925, + "learning_rate": 1.2175746924428825e-05, + "loss": 0.0511, + "step": 3340 + }, + { + "epoch": 1.962507322788518, + "grad_norm": 0.07103870064020157, + "learning_rate": 1.2152314001171649e-05, + "loss": 0.0042, + "step": 3350 + }, + { + "epoch": 1.968365553602812, + "grad_norm": 0.06463142484426498, + "learning_rate": 1.2128881077914471e-05, + "loss": 0.0618, + "step": 3360 + }, + { + "epoch": 1.974223784417106, + "grad_norm": 0.04360285401344299, + "learning_rate": 1.2105448154657295e-05, + "loss": 0.0028, + "step": 3370 + }, + { + "epoch": 1.9800820152314, + "grad_norm": 0.06742165982723236, + "learning_rate": 1.2082015231400118e-05, + "loss": 0.0025, + "step": 3380 + }, + { + "epoch": 1.9859402460456943, + "grad_norm": 1.9260814189910889, + "learning_rate": 1.2058582308142942e-05, + "loss": 0.0641, + "step": 3390 + }, + { + "epoch": 1.9917984768599881, + "grad_norm": 9.20433521270752, + "learning_rate": 1.2035149384885764e-05, + "loss": 0.1534, + "step": 3400 + }, + { + "epoch": 1.9976567076742824, + "grad_norm": 0.06310843676328659, + "learning_rate": 1.2011716461628588e-05, + "loss": 0.0084, + "step": 3410 + }, + { + "epoch": 2.0035149384885766, + "grad_norm": 4.824597358703613, + "learning_rate": 1.1988283538371412e-05, + "loss": 0.1753, + "step": 3420 + }, + { + "epoch": 2.0093731693028705, + "grad_norm": 0.2831396162509918, + "learning_rate": 1.1964850615114236e-05, + "loss": 0.0487, + "step": 3430 + }, + { + "epoch": 2.0152314001171647, + "grad_norm": 0.027552679181098938, + "learning_rate": 1.194141769185706e-05, + "loss": 0.0609, + "step": 3440 + }, + { + "epoch": 2.0210896309314585, + "grad_norm": 0.059698984026908875, + "learning_rate": 1.1917984768599885e-05, + "loss": 0.0541, + "step": 3450 + }, + { + "epoch": 2.026947861745753, + "grad_norm": 0.024172687903046608, + "learning_rate": 1.1894551845342707e-05, + "loss": 0.0226, + "step": 3460 + }, + { + "epoch": 2.032806092560047, + "grad_norm": 0.02423352748155594, + "learning_rate": 1.1871118922085531e-05, + "loss": 0.0426, + "step": 3470 + }, + { + "epoch": 2.038664323374341, + "grad_norm": 0.01572337932884693, + "learning_rate": 1.1847685998828355e-05, + "loss": 0.0119, + "step": 3480 + }, + { + "epoch": 2.044522554188635, + "grad_norm": 0.01435055397450924, + "learning_rate": 1.182425307557118e-05, + "loss": 0.0529, + "step": 3490 + }, + { + "epoch": 2.050380785002929, + "grad_norm": 0.1196121945977211, + "learning_rate": 1.1800820152314004e-05, + "loss": 0.0428, + "step": 3500 + }, + { + "epoch": 2.0562390158172232, + "grad_norm": 0.014025365933775902, + "learning_rate": 1.1777387229056824e-05, + "loss": 0.0655, + "step": 3510 + }, + { + "epoch": 2.0620972466315175, + "grad_norm": 1.1602953672409058, + "learning_rate": 1.1753954305799648e-05, + "loss": 0.0953, + "step": 3520 + }, + { + "epoch": 2.0679554774458113, + "grad_norm": 0.04896058142185211, + "learning_rate": 1.1730521382542473e-05, + "loss": 0.0767, + "step": 3530 + }, + { + "epoch": 2.0738137082601056, + "grad_norm": 0.12855136394500732, + "learning_rate": 1.1707088459285297e-05, + "loss": 0.0127, + "step": 3540 + }, + { + "epoch": 2.0796719390743994, + "grad_norm": 0.19695617258548737, + "learning_rate": 1.168365553602812e-05, + "loss": 0.2379, + "step": 3550 + }, + { + "epoch": 2.0855301698886937, + "grad_norm": 1.3962332010269165, + "learning_rate": 1.1660222612770943e-05, + "loss": 0.1809, + "step": 3560 + }, + { + "epoch": 2.0913884007029875, + "grad_norm": 0.0690867155790329, + "learning_rate": 1.1636789689513767e-05, + "loss": 0.1417, + "step": 3570 + }, + { + "epoch": 2.0972466315172817, + "grad_norm": 0.20552076399326324, + "learning_rate": 1.1613356766256591e-05, + "loss": 0.0345, + "step": 3580 + }, + { + "epoch": 2.103104862331576, + "grad_norm": 0.014500975608825684, + "learning_rate": 1.1589923842999416e-05, + "loss": 0.034, + "step": 3590 + }, + { + "epoch": 2.10896309314587, + "grad_norm": 0.025813542306423187, + "learning_rate": 1.156649091974224e-05, + "loss": 0.1902, + "step": 3600 + }, + { + "epoch": 2.114821323960164, + "grad_norm": 0.09736531227827072, + "learning_rate": 1.1543057996485064e-05, + "loss": 0.0927, + "step": 3610 + }, + { + "epoch": 2.120679554774458, + "grad_norm": 3.991480588912964, + "learning_rate": 1.1519625073227886e-05, + "loss": 0.0578, + "step": 3620 + }, + { + "epoch": 2.126537785588752, + "grad_norm": 0.47018668055534363, + "learning_rate": 1.149619214997071e-05, + "loss": 0.0033, + "step": 3630 + }, + { + "epoch": 2.1323960164030464, + "grad_norm": 0.043486375361680984, + "learning_rate": 1.1472759226713533e-05, + "loss": 0.0472, + "step": 3640 + }, + { + "epoch": 2.1382542472173403, + "grad_norm": 0.12205738574266434, + "learning_rate": 1.1449326303456357e-05, + "loss": 0.0029, + "step": 3650 + }, + { + "epoch": 2.1441124780316345, + "grad_norm": 0.04534014314413071, + "learning_rate": 1.142589338019918e-05, + "loss": 0.1298, + "step": 3660 + }, + { + "epoch": 2.1499707088459283, + "grad_norm": 0.01751992292702198, + "learning_rate": 1.1402460456942003e-05, + "loss": 0.0116, + "step": 3670 + }, + { + "epoch": 2.1558289396602226, + "grad_norm": 0.017521362751722336, + "learning_rate": 1.1379027533684827e-05, + "loss": 0.1328, + "step": 3680 + }, + { + "epoch": 2.161687170474517, + "grad_norm": 0.015062491409480572, + "learning_rate": 1.1355594610427652e-05, + "loss": 0.0123, + "step": 3690 + }, + { + "epoch": 2.1675454012888107, + "grad_norm": 0.10153401643037796, + "learning_rate": 1.1332161687170476e-05, + "loss": 0.0029, + "step": 3700 + }, + { + "epoch": 2.173403632103105, + "grad_norm": 7.739234924316406, + "learning_rate": 1.13087287639133e-05, + "loss": 0.129, + "step": 3710 + }, + { + "epoch": 2.1792618629173988, + "grad_norm": 0.02731635794043541, + "learning_rate": 1.1285295840656122e-05, + "loss": 0.0144, + "step": 3720 + }, + { + "epoch": 2.185120093731693, + "grad_norm": 0.21970869600772858, + "learning_rate": 1.1261862917398946e-05, + "loss": 0.1084, + "step": 3730 + }, + { + "epoch": 2.1909783245459873, + "grad_norm": 0.04228947311639786, + "learning_rate": 1.123842999414177e-05, + "loss": 0.1032, + "step": 3740 + }, + { + "epoch": 2.196836555360281, + "grad_norm": 0.5683772563934326, + "learning_rate": 1.1214997070884595e-05, + "loss": 0.0031, + "step": 3750 + }, + { + "epoch": 2.2026947861745754, + "grad_norm": 0.01960350014269352, + "learning_rate": 1.1191564147627419e-05, + "loss": 0.0207, + "step": 3760 + }, + { + "epoch": 2.208553016988869, + "grad_norm": 0.22766079008579254, + "learning_rate": 1.116813122437024e-05, + "loss": 0.0379, + "step": 3770 + }, + { + "epoch": 2.2144112478031635, + "grad_norm": 0.08811239153146744, + "learning_rate": 1.1144698301113064e-05, + "loss": 0.0265, + "step": 3780 + }, + { + "epoch": 2.2202694786174577, + "grad_norm": 0.015844443812966347, + "learning_rate": 1.1121265377855888e-05, + "loss": 0.0542, + "step": 3790 + }, + { + "epoch": 2.2261277094317515, + "grad_norm": 0.022639505565166473, + "learning_rate": 1.1097832454598712e-05, + "loss": 0.0724, + "step": 3800 + }, + { + "epoch": 2.231985940246046, + "grad_norm": 0.011850577779114246, + "learning_rate": 1.1074399531341536e-05, + "loss": 0.0218, + "step": 3810 + }, + { + "epoch": 2.2378441710603396, + "grad_norm": 0.03309320658445358, + "learning_rate": 1.105096660808436e-05, + "loss": 0.004, + "step": 3820 + }, + { + "epoch": 2.243702401874634, + "grad_norm": 0.00993265025317669, + "learning_rate": 1.1027533684827182e-05, + "loss": 0.1141, + "step": 3830 + }, + { + "epoch": 2.249560632688928, + "grad_norm": 0.061374466866254807, + "learning_rate": 1.1004100761570007e-05, + "loss": 0.1819, + "step": 3840 + }, + { + "epoch": 2.255418863503222, + "grad_norm": 0.055793508887290955, + "learning_rate": 1.098066783831283e-05, + "loss": 0.0955, + "step": 3850 + }, + { + "epoch": 2.2612770943175162, + "grad_norm": 0.06129812076687813, + "learning_rate": 1.0957234915055655e-05, + "loss": 0.0022, + "step": 3860 + }, + { + "epoch": 2.26713532513181, + "grad_norm": 1.5925140380859375, + "learning_rate": 1.0933801991798479e-05, + "loss": 0.0715, + "step": 3870 + }, + { + "epoch": 2.2729935559461043, + "grad_norm": 0.6164373159408569, + "learning_rate": 1.0910369068541303e-05, + "loss": 0.0864, + "step": 3880 + }, + { + "epoch": 2.2788517867603986, + "grad_norm": 0.05251573026180267, + "learning_rate": 1.0886936145284125e-05, + "loss": 0.0227, + "step": 3890 + }, + { + "epoch": 2.2847100175746924, + "grad_norm": 5.049495220184326, + "learning_rate": 1.086350322202695e-05, + "loss": 0.1728, + "step": 3900 + }, + { + "epoch": 2.2905682483889866, + "grad_norm": 0.018607361242175102, + "learning_rate": 1.0840070298769772e-05, + "loss": 0.0381, + "step": 3910 + }, + { + "epoch": 2.2964264792032805, + "grad_norm": 0.013198847882449627, + "learning_rate": 1.0816637375512596e-05, + "loss": 0.2289, + "step": 3920 + }, + { + "epoch": 2.3022847100175747, + "grad_norm": 0.028414737433195114, + "learning_rate": 1.0793204452255418e-05, + "loss": 0.0066, + "step": 3930 + }, + { + "epoch": 2.3081429408318686, + "grad_norm": 0.02251887135207653, + "learning_rate": 1.0769771528998243e-05, + "loss": 0.0562, + "step": 3940 + }, + { + "epoch": 2.314001171646163, + "grad_norm": 0.03463216498494148, + "learning_rate": 1.0746338605741067e-05, + "loss": 0.0673, + "step": 3950 + }, + { + "epoch": 2.319859402460457, + "grad_norm": 0.028568778187036514, + "learning_rate": 1.072290568248389e-05, + "loss": 0.003, + "step": 3960 + }, + { + "epoch": 2.325717633274751, + "grad_norm": 0.012898040004074574, + "learning_rate": 1.0699472759226715e-05, + "loss": 0.005, + "step": 3970 + }, + { + "epoch": 2.331575864089045, + "grad_norm": 0.03388252109289169, + "learning_rate": 1.0676039835969539e-05, + "loss": 0.0102, + "step": 3980 + }, + { + "epoch": 2.3374340949033394, + "grad_norm": 0.02966475300490856, + "learning_rate": 1.0652606912712361e-05, + "loss": 0.0062, + "step": 3990 + }, + { + "epoch": 2.3432923257176332, + "grad_norm": 0.021042481064796448, + "learning_rate": 1.0629173989455186e-05, + "loss": 0.0849, + "step": 4000 + }, + { + "epoch": 2.3491505565319275, + "grad_norm": 0.25563105940818787, + "learning_rate": 1.060574106619801e-05, + "loss": 0.0025, + "step": 4010 + }, + { + "epoch": 2.3550087873462213, + "grad_norm": 0.010090864263474941, + "learning_rate": 1.0582308142940834e-05, + "loss": 0.0016, + "step": 4020 + }, + { + "epoch": 2.3608670181605156, + "grad_norm": 0.016884010285139084, + "learning_rate": 1.0558875219683658e-05, + "loss": 0.0906, + "step": 4030 + }, + { + "epoch": 2.3667252489748094, + "grad_norm": 0.011160296387970448, + "learning_rate": 1.0535442296426479e-05, + "loss": 0.0016, + "step": 4040 + }, + { + "epoch": 2.3725834797891037, + "grad_norm": 9.898134231567383, + "learning_rate": 1.0512009373169303e-05, + "loss": 0.1282, + "step": 4050 + }, + { + "epoch": 2.378441710603398, + "grad_norm": 4.420975685119629, + "learning_rate": 1.0488576449912127e-05, + "loss": 0.0056, + "step": 4060 + }, + { + "epoch": 2.3842999414176917, + "grad_norm": 0.08097882568836212, + "learning_rate": 1.0465143526654951e-05, + "loss": 0.0236, + "step": 4070 + }, + { + "epoch": 2.390158172231986, + "grad_norm": 0.014296771958470345, + "learning_rate": 1.0441710603397775e-05, + "loss": 0.0062, + "step": 4080 + }, + { + "epoch": 2.39601640304628, + "grad_norm": 0.08250059932470322, + "learning_rate": 1.0418277680140597e-05, + "loss": 0.0464, + "step": 4090 + }, + { + "epoch": 2.401874633860574, + "grad_norm": 0.09679897874593735, + "learning_rate": 1.0394844756883422e-05, + "loss": 0.0237, + "step": 4100 + }, + { + "epoch": 2.4077328646748684, + "grad_norm": 9.378355979919434, + "learning_rate": 1.0371411833626246e-05, + "loss": 0.0379, + "step": 4110 + }, + { + "epoch": 2.413591095489162, + "grad_norm": 11.158398628234863, + "learning_rate": 1.034797891036907e-05, + "loss": 0.1332, + "step": 4120 + }, + { + "epoch": 2.4194493263034564, + "grad_norm": 9.437854766845703, + "learning_rate": 1.0324545987111894e-05, + "loss": 0.2336, + "step": 4130 + }, + { + "epoch": 2.4253075571177503, + "grad_norm": 0.034454096108675, + "learning_rate": 1.0301113063854718e-05, + "loss": 0.0112, + "step": 4140 + }, + { + "epoch": 2.4311657879320445, + "grad_norm": 0.010446906089782715, + "learning_rate": 1.027768014059754e-05, + "loss": 0.0139, + "step": 4150 + }, + { + "epoch": 2.437024018746339, + "grad_norm": 0.008245312608778477, + "learning_rate": 1.0254247217340365e-05, + "loss": 0.0259, + "step": 4160 + }, + { + "epoch": 2.4428822495606326, + "grad_norm": 12.191228866577148, + "learning_rate": 1.0230814294083187e-05, + "loss": 0.0496, + "step": 4170 + }, + { + "epoch": 2.448740480374927, + "grad_norm": 1.1207407712936401, + "learning_rate": 1.0207381370826011e-05, + "loss": 0.1827, + "step": 4180 + }, + { + "epoch": 2.4545987111892207, + "grad_norm": 0.04254107177257538, + "learning_rate": 1.0183948447568834e-05, + "loss": 0.0021, + "step": 4190 + }, + { + "epoch": 2.460456942003515, + "grad_norm": 8.863370895385742, + "learning_rate": 1.0160515524311658e-05, + "loss": 0.0365, + "step": 4200 + }, + { + "epoch": 2.466315172817809, + "grad_norm": 0.019871840253472328, + "learning_rate": 1.0137082601054482e-05, + "loss": 0.0961, + "step": 4210 + }, + { + "epoch": 2.472173403632103, + "grad_norm": 12.787776947021484, + "learning_rate": 1.0113649677797306e-05, + "loss": 0.0048, + "step": 4220 + }, + { + "epoch": 2.4780316344463973, + "grad_norm": 0.010828366503119469, + "learning_rate": 1.009021675454013e-05, + "loss": 0.0633, + "step": 4230 + }, + { + "epoch": 2.483889865260691, + "grad_norm": 0.021939842030405998, + "learning_rate": 1.0066783831282954e-05, + "loss": 0.0236, + "step": 4240 + }, + { + "epoch": 2.4897480960749854, + "grad_norm": 0.9022541046142578, + "learning_rate": 1.0043350908025777e-05, + "loss": 0.0044, + "step": 4250 + }, + { + "epoch": 2.4956063268892796, + "grad_norm": 0.015246485359966755, + "learning_rate": 1.00199179847686e-05, + "loss": 0.0202, + "step": 4260 + }, + { + "epoch": 2.5014645577035735, + "grad_norm": 4.695196151733398, + "learning_rate": 9.996485061511425e-06, + "loss": 0.0345, + "step": 4270 + }, + { + "epoch": 2.5073227885178677, + "grad_norm": 0.3940305709838867, + "learning_rate": 9.973052138254247e-06, + "loss": 0.0334, + "step": 4280 + }, + { + "epoch": 2.5131810193321615, + "grad_norm": 0.010812806896865368, + "learning_rate": 9.949619214997071e-06, + "loss": 0.0723, + "step": 4290 + }, + { + "epoch": 2.519039250146456, + "grad_norm": 0.00904472079128027, + "learning_rate": 9.926186291739895e-06, + "loss": 0.1392, + "step": 4300 + }, + { + "epoch": 2.5248974809607496, + "grad_norm": 0.10065434128046036, + "learning_rate": 9.90275336848272e-06, + "loss": 0.0072, + "step": 4310 + }, + { + "epoch": 2.530755711775044, + "grad_norm": 0.02397744543850422, + "learning_rate": 9.879320445225544e-06, + "loss": 0.1593, + "step": 4320 + }, + { + "epoch": 2.536613942589338, + "grad_norm": 0.16617602109909058, + "learning_rate": 9.855887521968366e-06, + "loss": 0.0473, + "step": 4330 + }, + { + "epoch": 2.542472173403632, + "grad_norm": 0.03026733361184597, + "learning_rate": 9.83245459871119e-06, + "loss": 0.065, + "step": 4340 + }, + { + "epoch": 2.5483304042179262, + "grad_norm": 0.02081727422773838, + "learning_rate": 9.809021675454014e-06, + "loss": 0.0093, + "step": 4350 + }, + { + "epoch": 2.5541886350322205, + "grad_norm": 0.030087795108556747, + "learning_rate": 9.785588752196837e-06, + "loss": 0.1034, + "step": 4360 + }, + { + "epoch": 2.5600468658465143, + "grad_norm": 7.436207294464111, + "learning_rate": 9.76215582893966e-06, + "loss": 0.0905, + "step": 4370 + }, + { + "epoch": 2.5659050966608086, + "grad_norm": 0.014431145042181015, + "learning_rate": 9.738722905682485e-06, + "loss": 0.1189, + "step": 4380 + }, + { + "epoch": 2.5717633274751024, + "grad_norm": 13.210556983947754, + "learning_rate": 9.715289982425309e-06, + "loss": 0.0138, + "step": 4390 + }, + { + "epoch": 2.5776215582893967, + "grad_norm": 0.02586820349097252, + "learning_rate": 9.691857059168131e-06, + "loss": 0.0616, + "step": 4400 + }, + { + "epoch": 2.5834797891036905, + "grad_norm": 0.017524389550089836, + "learning_rate": 9.668424135910956e-06, + "loss": 0.0466, + "step": 4410 + }, + { + "epoch": 2.5893380199179847, + "grad_norm": 3.2110509872436523, + "learning_rate": 9.64499121265378e-06, + "loss": 0.007, + "step": 4420 + }, + { + "epoch": 2.595196250732279, + "grad_norm": 3.626349687576294, + "learning_rate": 9.621558289396604e-06, + "loss": 0.0712, + "step": 4430 + }, + { + "epoch": 2.601054481546573, + "grad_norm": 0.0480428971350193, + "learning_rate": 9.598125366139426e-06, + "loss": 0.0501, + "step": 4440 + }, + { + "epoch": 2.606912712360867, + "grad_norm": 0.1278507560491562, + "learning_rate": 9.57469244288225e-06, + "loss": 0.0185, + "step": 4450 + }, + { + "epoch": 2.6127709431751613, + "grad_norm": 0.01875203661620617, + "learning_rate": 9.551259519625073e-06, + "loss": 0.0111, + "step": 4460 + }, + { + "epoch": 2.618629173989455, + "grad_norm": 0.02753681316971779, + "learning_rate": 9.527826596367897e-06, + "loss": 0.0015, + "step": 4470 + }, + { + "epoch": 2.6244874048037494, + "grad_norm": 12.951713562011719, + "learning_rate": 9.504393673110721e-06, + "loss": 0.0405, + "step": 4480 + }, + { + "epoch": 2.6303456356180432, + "grad_norm": 0.025739021599292755, + "learning_rate": 9.480960749853545e-06, + "loss": 0.0115, + "step": 4490 + }, + { + "epoch": 2.6362038664323375, + "grad_norm": 0.6508920788764954, + "learning_rate": 9.45752782659637e-06, + "loss": 0.0777, + "step": 4500 + }, + { + "epoch": 2.6420620972466313, + "grad_norm": 0.0040559773333370686, + "learning_rate": 9.434094903339193e-06, + "loss": 0.0029, + "step": 4510 + }, + { + "epoch": 2.6479203280609256, + "grad_norm": 0.2040073573589325, + "learning_rate": 9.410661980082016e-06, + "loss": 0.0033, + "step": 4520 + }, + { + "epoch": 2.65377855887522, + "grad_norm": 0.14921385049819946, + "learning_rate": 9.38722905682484e-06, + "loss": 0.0383, + "step": 4530 + }, + { + "epoch": 2.6596367896895137, + "grad_norm": 0.017338193953037262, + "learning_rate": 9.363796133567662e-06, + "loss": 0.0802, + "step": 4540 + }, + { + "epoch": 2.665495020503808, + "grad_norm": 1.1883032321929932, + "learning_rate": 9.340363210310486e-06, + "loss": 0.0617, + "step": 4550 + }, + { + "epoch": 2.671353251318102, + "grad_norm": 0.0065250336192548275, + "learning_rate": 9.31693028705331e-06, + "loss": 0.0305, + "step": 4560 + }, + { + "epoch": 2.677211482132396, + "grad_norm": 4.130406379699707, + "learning_rate": 9.293497363796135e-06, + "loss": 0.0961, + "step": 4570 + }, + { + "epoch": 2.68306971294669, + "grad_norm": 0.02023070678114891, + "learning_rate": 9.270064440538959e-06, + "loss": 0.0086, + "step": 4580 + }, + { + "epoch": 2.688927943760984, + "grad_norm": 0.10395500808954239, + "learning_rate": 9.246631517281783e-06, + "loss": 0.0364, + "step": 4590 + }, + { + "epoch": 2.6947861745752784, + "grad_norm": 0.013902227394282818, + "learning_rate": 9.223198594024605e-06, + "loss": 0.01, + "step": 4600 + }, + { + "epoch": 2.700644405389572, + "grad_norm": 0.08112169802188873, + "learning_rate": 9.19976567076743e-06, + "loss": 0.0222, + "step": 4610 + }, + { + "epoch": 2.7065026362038664, + "grad_norm": 0.06331131607294083, + "learning_rate": 9.176332747510252e-06, + "loss": 0.0089, + "step": 4620 + }, + { + "epoch": 2.7123608670181607, + "grad_norm": 0.007761865854263306, + "learning_rate": 9.152899824253076e-06, + "loss": 0.0633, + "step": 4630 + }, + { + "epoch": 2.7182190978324545, + "grad_norm": 5.599218368530273, + "learning_rate": 9.1294669009959e-06, + "loss": 0.0516, + "step": 4640 + }, + { + "epoch": 2.724077328646749, + "grad_norm": 3.4084956645965576, + "learning_rate": 9.106033977738724e-06, + "loss": 0.0057, + "step": 4650 + }, + { + "epoch": 2.7299355594610426, + "grad_norm": 0.01933354139328003, + "learning_rate": 9.082601054481547e-06, + "loss": 0.0406, + "step": 4660 + }, + { + "epoch": 2.735793790275337, + "grad_norm": 0.033855751156806946, + "learning_rate": 9.05916813122437e-06, + "loss": 0.1619, + "step": 4670 + }, + { + "epoch": 2.7416520210896307, + "grad_norm": 0.009661266580224037, + "learning_rate": 9.035735207967195e-06, + "loss": 0.0012, + "step": 4680 + }, + { + "epoch": 2.747510251903925, + "grad_norm": 2.7071592807769775, + "learning_rate": 9.012302284710019e-06, + "loss": 0.0783, + "step": 4690 + }, + { + "epoch": 2.753368482718219, + "grad_norm": 0.045839034020900726, + "learning_rate": 8.988869361452841e-06, + "loss": 0.1256, + "step": 4700 + }, + { + "epoch": 2.759226713532513, + "grad_norm": 0.01187943760305643, + "learning_rate": 8.965436438195665e-06, + "loss": 0.0467, + "step": 4710 + }, + { + "epoch": 2.7650849443468073, + "grad_norm": 0.12289223819971085, + "learning_rate": 8.94200351493849e-06, + "loss": 0.224, + "step": 4720 + }, + { + "epoch": 2.7709431751611016, + "grad_norm": 0.24266257882118225, + "learning_rate": 8.918570591681312e-06, + "loss": 0.0033, + "step": 4730 + }, + { + "epoch": 2.7768014059753954, + "grad_norm": 0.05902494117617607, + "learning_rate": 8.895137668424136e-06, + "loss": 0.0786, + "step": 4740 + }, + { + "epoch": 2.7826596367896896, + "grad_norm": 0.03103451617062092, + "learning_rate": 8.87170474516696e-06, + "loss": 0.062, + "step": 4750 + }, + { + "epoch": 2.7885178676039835, + "grad_norm": 0.01481554750353098, + "learning_rate": 8.848271821909784e-06, + "loss": 0.0537, + "step": 4760 + }, + { + "epoch": 2.7943760984182777, + "grad_norm": 0.0064656296744942665, + "learning_rate": 8.824838898652608e-06, + "loss": 0.0799, + "step": 4770 + }, + { + "epoch": 2.8002343292325715, + "grad_norm": 0.018277239054441452, + "learning_rate": 8.801405975395433e-06, + "loss": 0.0438, + "step": 4780 + }, + { + "epoch": 2.806092560046866, + "grad_norm": 0.008237453177571297, + "learning_rate": 8.777973052138255e-06, + "loss": 0.0481, + "step": 4790 + }, + { + "epoch": 2.81195079086116, + "grad_norm": 0.06086944043636322, + "learning_rate": 8.754540128881079e-06, + "loss": 0.0578, + "step": 4800 + }, + { + "epoch": 2.817809021675454, + "grad_norm": 0.0078117563389241695, + "learning_rate": 8.731107205623902e-06, + "loss": 0.0054, + "step": 4810 + }, + { + "epoch": 2.823667252489748, + "grad_norm": 0.22204460203647614, + "learning_rate": 8.707674282366726e-06, + "loss": 0.0452, + "step": 4820 + }, + { + "epoch": 2.8295254833040424, + "grad_norm": 0.011765822768211365, + "learning_rate": 8.68424135910955e-06, + "loss": 0.1226, + "step": 4830 + }, + { + "epoch": 2.8353837141183362, + "grad_norm": 0.034977905452251434, + "learning_rate": 8.660808435852374e-06, + "loss": 0.0028, + "step": 4840 + }, + { + "epoch": 2.8412419449326305, + "grad_norm": 0.006177605129778385, + "learning_rate": 8.637375512595198e-06, + "loss": 0.0055, + "step": 4850 + }, + { + "epoch": 2.8471001757469243, + "grad_norm": 0.06611007452011108, + "learning_rate": 8.61394258933802e-06, + "loss": 0.0012, + "step": 4860 + }, + { + "epoch": 2.8529584065612186, + "grad_norm": 0.007338257972151041, + "learning_rate": 8.590509666080845e-06, + "loss": 0.0413, + "step": 4870 + }, + { + "epoch": 2.8588166373755124, + "grad_norm": 0.04989036172628403, + "learning_rate": 8.567076742823669e-06, + "loss": 0.0606, + "step": 4880 + }, + { + "epoch": 2.8646748681898067, + "grad_norm": 0.007559357676655054, + "learning_rate": 8.543643819566491e-06, + "loss": 0.1396, + "step": 4890 + }, + { + "epoch": 2.870533099004101, + "grad_norm": 0.05906621739268303, + "learning_rate": 8.520210896309315e-06, + "loss": 0.1917, + "step": 4900 + }, + { + "epoch": 2.8763913298183947, + "grad_norm": 0.012134838849306107, + "learning_rate": 8.49677797305214e-06, + "loss": 0.0011, + "step": 4910 + }, + { + "epoch": 2.882249560632689, + "grad_norm": 4.250200271606445, + "learning_rate": 8.473345049794962e-06, + "loss": 0.1104, + "step": 4920 + }, + { + "epoch": 2.8881077914469833, + "grad_norm": 0.15310168266296387, + "learning_rate": 8.449912126537786e-06, + "loss": 0.1596, + "step": 4930 + }, + { + "epoch": 2.893966022261277, + "grad_norm": 7.746487617492676, + "learning_rate": 8.42647920328061e-06, + "loss": 0.1139, + "step": 4940 + }, + { + "epoch": 2.899824253075571, + "grad_norm": 4.003668785095215, + "learning_rate": 8.403046280023434e-06, + "loss": 0.1337, + "step": 4950 + }, + { + "epoch": 2.905682483889865, + "grad_norm": 0.013791323639452457, + "learning_rate": 8.379613356766258e-06, + "loss": 0.0577, + "step": 4960 + }, + { + "epoch": 2.9115407147041594, + "grad_norm": 0.03509506955742836, + "learning_rate": 8.35618043350908e-06, + "loss": 0.0023, + "step": 4970 + }, + { + "epoch": 2.9173989455184532, + "grad_norm": 0.017392152920365334, + "learning_rate": 8.332747510251905e-06, + "loss": 0.0017, + "step": 4980 + }, + { + "epoch": 2.9232571763327475, + "grad_norm": 0.0742042288184166, + "learning_rate": 8.309314586994727e-06, + "loss": 0.0583, + "step": 4990 + }, + { + "epoch": 2.9291154071470418, + "grad_norm": 0.025125756859779358, + "learning_rate": 8.285881663737551e-06, + "loss": 0.0018, + "step": 5000 + }, + { + "epoch": 2.9349736379613356, + "grad_norm": 0.01341651938855648, + "learning_rate": 8.262448740480375e-06, + "loss": 0.0487, + "step": 5010 + }, + { + "epoch": 2.94083186877563, + "grad_norm": 0.024638604372739792, + "learning_rate": 8.2390158172232e-06, + "loss": 0.001, + "step": 5020 + }, + { + "epoch": 2.946690099589924, + "grad_norm": 0.038911301642656326, + "learning_rate": 8.215582893966024e-06, + "loss": 0.0506, + "step": 5030 + }, + { + "epoch": 2.952548330404218, + "grad_norm": 0.023850092664361, + "learning_rate": 8.192149970708848e-06, + "loss": 0.0145, + "step": 5040 + }, + { + "epoch": 2.9584065612185118, + "grad_norm": 0.012319614179432392, + "learning_rate": 8.16871704745167e-06, + "loss": 0.0683, + "step": 5050 + }, + { + "epoch": 2.964264792032806, + "grad_norm": 0.011331205256283283, + "learning_rate": 8.145284124194494e-06, + "loss": 0.001, + "step": 5060 + }, + { + "epoch": 2.9701230228471003, + "grad_norm": 0.003681463422253728, + "learning_rate": 8.121851200937317e-06, + "loss": 0.0044, + "step": 5070 + }, + { + "epoch": 2.975981253661394, + "grad_norm": 0.051831282675266266, + "learning_rate": 8.09841827768014e-06, + "loss": 0.0034, + "step": 5080 + }, + { + "epoch": 2.9818394844756884, + "grad_norm": 0.07008065283298492, + "learning_rate": 8.074985354422965e-06, + "loss": 0.1246, + "step": 5090 + }, + { + "epoch": 2.9876977152899826, + "grad_norm": 0.009249912574887276, + "learning_rate": 8.051552431165789e-06, + "loss": 0.0006, + "step": 5100 + }, + { + "epoch": 2.9935559461042764, + "grad_norm": 0.019453031942248344, + "learning_rate": 8.028119507908613e-06, + "loss": 0.0053, + "step": 5110 + }, + { + "epoch": 2.9994141769185707, + "grad_norm": 0.5509079098701477, + "learning_rate": 8.004686584651435e-06, + "loss": 0.0994, + "step": 5120 + }, + { + "epoch": 3.0052724077328645, + "grad_norm": 0.012448657304048538, + "learning_rate": 7.98125366139426e-06, + "loss": 0.0016, + "step": 5130 + }, + { + "epoch": 3.011130638547159, + "grad_norm": 6.02830171585083, + "learning_rate": 7.957820738137084e-06, + "loss": 0.0384, + "step": 5140 + }, + { + "epoch": 3.016988869361453, + "grad_norm": 0.013466407544910908, + "learning_rate": 7.934387814879906e-06, + "loss": 0.0013, + "step": 5150 + }, + { + "epoch": 3.022847100175747, + "grad_norm": 10.52933120727539, + "learning_rate": 7.91095489162273e-06, + "loss": 0.0748, + "step": 5160 + }, + { + "epoch": 3.028705330990041, + "grad_norm": 4.3404622077941895, + "learning_rate": 7.887521968365554e-06, + "loss": 0.1159, + "step": 5170 + }, + { + "epoch": 3.034563561804335, + "grad_norm": 0.016072312369942665, + "learning_rate": 7.864089045108378e-06, + "loss": 0.0696, + "step": 5180 + }, + { + "epoch": 3.040421792618629, + "grad_norm": 0.06083187833428383, + "learning_rate": 7.840656121851201e-06, + "loss": 0.0322, + "step": 5190 + }, + { + "epoch": 3.0462800234329235, + "grad_norm": 0.012007597833871841, + "learning_rate": 7.817223198594025e-06, + "loss": 0.1097, + "step": 5200 + }, + { + "epoch": 3.0521382542472173, + "grad_norm": 0.06487168371677399, + "learning_rate": 7.793790275336849e-06, + "loss": 0.0023, + "step": 5210 + }, + { + "epoch": 3.0579964850615116, + "grad_norm": 0.07004842162132263, + "learning_rate": 7.770357352079673e-06, + "loss": 0.0008, + "step": 5220 + }, + { + "epoch": 3.0638547158758054, + "grad_norm": 0.05898735299706459, + "learning_rate": 7.746924428822497e-06, + "loss": 0.0533, + "step": 5230 + }, + { + "epoch": 3.0697129466900996, + "grad_norm": 0.015656787902116776, + "learning_rate": 7.72349150556532e-06, + "loss": 0.0414, + "step": 5240 + }, + { + "epoch": 3.0755711775043935, + "grad_norm": 0.013448948040604591, + "learning_rate": 7.700058582308144e-06, + "loss": 0.0338, + "step": 5250 + }, + { + "epoch": 3.0814294083186877, + "grad_norm": 5.860596656799316, + "learning_rate": 7.676625659050966e-06, + "loss": 0.065, + "step": 5260 + }, + { + "epoch": 3.087287639132982, + "grad_norm": 0.013592137955129147, + "learning_rate": 7.65319273579379e-06, + "loss": 0.007, + "step": 5270 + }, + { + "epoch": 3.093145869947276, + "grad_norm": 20.216941833496094, + "learning_rate": 7.629759812536615e-06, + "loss": 0.1278, + "step": 5280 + }, + { + "epoch": 3.09900410076157, + "grad_norm": 0.07607584446668625, + "learning_rate": 7.606326889279439e-06, + "loss": 0.0025, + "step": 5290 + }, + { + "epoch": 3.104862331575864, + "grad_norm": 3.425032615661621, + "learning_rate": 7.582893966022262e-06, + "loss": 0.058, + "step": 5300 + }, + { + "epoch": 3.110720562390158, + "grad_norm": 0.026434851810336113, + "learning_rate": 7.559461042765086e-06, + "loss": 0.0361, + "step": 5310 + }, + { + "epoch": 3.1165787932044524, + "grad_norm": 0.047157056629657745, + "learning_rate": 7.5360281195079085e-06, + "loss": 0.0357, + "step": 5320 + }, + { + "epoch": 3.1224370240187462, + "grad_norm": 0.16366933286190033, + "learning_rate": 7.5125951962507326e-06, + "loss": 0.0015, + "step": 5330 + }, + { + "epoch": 3.1282952548330405, + "grad_norm": 0.013051210902631283, + "learning_rate": 7.489162272993557e-06, + "loss": 0.0354, + "step": 5340 + }, + { + "epoch": 3.1341534856473343, + "grad_norm": 0.05217352882027626, + "learning_rate": 7.46572934973638e-06, + "loss": 0.0009, + "step": 5350 + }, + { + "epoch": 3.1400117164616286, + "grad_norm": 7.535463333129883, + "learning_rate": 7.442296426479204e-06, + "loss": 0.0053, + "step": 5360 + }, + { + "epoch": 3.145869947275923, + "grad_norm": 0.7577245831489563, + "learning_rate": 7.418863503222028e-06, + "loss": 0.0021, + "step": 5370 + }, + { + "epoch": 3.1517281780902167, + "grad_norm": 0.09404406696557999, + "learning_rate": 7.395430579964851e-06, + "loss": 0.0646, + "step": 5380 + }, + { + "epoch": 3.157586408904511, + "grad_norm": 0.009803865104913712, + "learning_rate": 7.371997656707675e-06, + "loss": 0.1086, + "step": 5390 + }, + { + "epoch": 3.1634446397188047, + "grad_norm": 0.010212755762040615, + "learning_rate": 7.348564733450498e-06, + "loss": 0.0056, + "step": 5400 + }, + { + "epoch": 3.169302870533099, + "grad_norm": 0.04309909790754318, + "learning_rate": 7.325131810193322e-06, + "loss": 0.0217, + "step": 5410 + }, + { + "epoch": 3.1751611013473933, + "grad_norm": 2.965693712234497, + "learning_rate": 7.301698886936146e-06, + "loss": 0.0442, + "step": 5420 + }, + { + "epoch": 3.181019332161687, + "grad_norm": 0.6837348937988281, + "learning_rate": 7.2782659636789695e-06, + "loss": 0.1925, + "step": 5430 + }, + { + "epoch": 3.1868775629759813, + "grad_norm": 0.029974374920129776, + "learning_rate": 7.254833040421794e-06, + "loss": 0.001, + "step": 5440 + }, + { + "epoch": 3.192735793790275, + "grad_norm": 0.053507156670093536, + "learning_rate": 7.231400117164616e-06, + "loss": 0.0618, + "step": 5450 + }, + { + "epoch": 3.1985940246045694, + "grad_norm": 0.00892311055213213, + "learning_rate": 7.20796719390744e-06, + "loss": 0.0115, + "step": 5460 + }, + { + "epoch": 3.2044522554188637, + "grad_norm": 0.0072519052773714066, + "learning_rate": 7.184534270650264e-06, + "loss": 0.003, + "step": 5470 + }, + { + "epoch": 3.2103104862331575, + "grad_norm": 0.3369971215724945, + "learning_rate": 7.161101347393088e-06, + "loss": 0.0478, + "step": 5480 + }, + { + "epoch": 3.2161687170474518, + "grad_norm": 5.9024553298950195, + "learning_rate": 7.137668424135912e-06, + "loss": 0.0551, + "step": 5490 + }, + { + "epoch": 3.2220269478617456, + "grad_norm": 0.0728222206234932, + "learning_rate": 7.114235500878736e-06, + "loss": 0.0039, + "step": 5500 + }, + { + "epoch": 3.22788517867604, + "grad_norm": 0.012450517155230045, + "learning_rate": 7.090802577621558e-06, + "loss": 0.0358, + "step": 5510 + }, + { + "epoch": 3.233743409490334, + "grad_norm": 1.3134894371032715, + "learning_rate": 7.067369654364382e-06, + "loss": 0.066, + "step": 5520 + }, + { + "epoch": 3.239601640304628, + "grad_norm": 1.1952403783798218, + "learning_rate": 7.043936731107206e-06, + "loss": 0.0938, + "step": 5530 + }, + { + "epoch": 3.245459871118922, + "grad_norm": 0.003476966405287385, + "learning_rate": 7.02050380785003e-06, + "loss": 0.0918, + "step": 5540 + }, + { + "epoch": 3.251318101933216, + "grad_norm": 0.3250567317008972, + "learning_rate": 6.997070884592854e-06, + "loss": 0.0323, + "step": 5550 + }, + { + "epoch": 3.2571763327475103, + "grad_norm": 0.02057233452796936, + "learning_rate": 6.973637961335678e-06, + "loss": 0.0259, + "step": 5560 + }, + { + "epoch": 3.2630345635618045, + "grad_norm": 0.23570382595062256, + "learning_rate": 6.950205038078501e-06, + "loss": 0.0075, + "step": 5570 + }, + { + "epoch": 3.2688927943760984, + "grad_norm": 0.10391777008771896, + "learning_rate": 6.926772114821324e-06, + "loss": 0.0617, + "step": 5580 + }, + { + "epoch": 3.2747510251903926, + "grad_norm": 0.02281038463115692, + "learning_rate": 6.903339191564148e-06, + "loss": 0.1036, + "step": 5590 + }, + { + "epoch": 3.2806092560046864, + "grad_norm": 0.003879611613228917, + "learning_rate": 6.879906268306972e-06, + "loss": 0.0023, + "step": 5600 + }, + { + "epoch": 3.2864674868189807, + "grad_norm": 0.00758686289191246, + "learning_rate": 6.856473345049796e-06, + "loss": 0.058, + "step": 5610 + }, + { + "epoch": 3.2923257176332745, + "grad_norm": 1.52095365524292, + "learning_rate": 6.833040421792619e-06, + "loss": 0.0806, + "step": 5620 + }, + { + "epoch": 3.298183948447569, + "grad_norm": 0.0948934480547905, + "learning_rate": 6.809607498535443e-06, + "loss": 0.0012, + "step": 5630 + }, + { + "epoch": 3.304042179261863, + "grad_norm": 1.0891087055206299, + "learning_rate": 6.786174575278267e-06, + "loss": 0.0029, + "step": 5640 + }, + { + "epoch": 3.309900410076157, + "grad_norm": 0.1640109270811081, + "learning_rate": 6.76274165202109e-06, + "loss": 0.0035, + "step": 5650 + }, + { + "epoch": 3.315758640890451, + "grad_norm": 0.005726497154682875, + "learning_rate": 6.739308728763914e-06, + "loss": 0.0679, + "step": 5660 + }, + { + "epoch": 3.3216168717047454, + "grad_norm": 0.6655123233795166, + "learning_rate": 6.715875805506737e-06, + "loss": 0.0859, + "step": 5670 + }, + { + "epoch": 3.327475102519039, + "grad_norm": 0.015817932784557343, + "learning_rate": 6.692442882249561e-06, + "loss": 0.0406, + "step": 5680 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.7005910873413086, + "learning_rate": 6.669009958992385e-06, + "loss": 0.019, + "step": 5690 + }, + { + "epoch": 3.3391915641476273, + "grad_norm": 0.09578946232795715, + "learning_rate": 6.645577035735209e-06, + "loss": 0.0703, + "step": 5700 + }, + { + "epoch": 3.3450497949619216, + "grad_norm": 0.05586531385779381, + "learning_rate": 6.622144112478032e-06, + "loss": 0.0011, + "step": 5710 + }, + { + "epoch": 3.3509080257762154, + "grad_norm": 0.005320393946021795, + "learning_rate": 6.598711189220855e-06, + "loss": 0.0006, + "step": 5720 + }, + { + "epoch": 3.3567662565905096, + "grad_norm": 0.012893118895590305, + "learning_rate": 6.575278265963679e-06, + "loss": 0.0128, + "step": 5730 + }, + { + "epoch": 3.362624487404804, + "grad_norm": 0.014687235467135906, + "learning_rate": 6.5518453427065035e-06, + "loss": 0.0048, + "step": 5740 + }, + { + "epoch": 3.3684827182190977, + "grad_norm": 5.860557556152344, + "learning_rate": 6.528412419449327e-06, + "loss": 0.096, + "step": 5750 + }, + { + "epoch": 3.374340949033392, + "grad_norm": 3.1887881755828857, + "learning_rate": 6.504979496192151e-06, + "loss": 0.0686, + "step": 5760 + }, + { + "epoch": 3.380199179847686, + "grad_norm": 0.10010349005460739, + "learning_rate": 6.481546572934975e-06, + "loss": 0.0462, + "step": 5770 + }, + { + "epoch": 3.38605741066198, + "grad_norm": 0.0034016838762909174, + "learning_rate": 6.458113649677797e-06, + "loss": 0.0645, + "step": 5780 + }, + { + "epoch": 3.3919156414762743, + "grad_norm": 0.008133571594953537, + "learning_rate": 6.4346807264206215e-06, + "loss": 0.0044, + "step": 5790 + }, + { + "epoch": 3.397773872290568, + "grad_norm": 0.024280358105897903, + "learning_rate": 6.411247803163445e-06, + "loss": 0.0012, + "step": 5800 + }, + { + "epoch": 3.4036321031048624, + "grad_norm": 0.07370976358652115, + "learning_rate": 6.387814879906269e-06, + "loss": 0.002, + "step": 5810 + }, + { + "epoch": 3.4094903339191562, + "grad_norm": 0.002925069537013769, + "learning_rate": 6.364381956649093e-06, + "loss": 0.0899, + "step": 5820 + }, + { + "epoch": 3.4153485647334505, + "grad_norm": 0.0019779757130891085, + "learning_rate": 6.340949033391916e-06, + "loss": 0.0552, + "step": 5830 + }, + { + "epoch": 3.4212067955477448, + "grad_norm": 0.004934338852763176, + "learning_rate": 6.3175161101347395e-06, + "loss": 0.0536, + "step": 5840 + }, + { + "epoch": 3.4270650263620386, + "grad_norm": 0.014058866538107395, + "learning_rate": 6.294083186877563e-06, + "loss": 0.0165, + "step": 5850 + }, + { + "epoch": 3.432923257176333, + "grad_norm": 0.004640714265406132, + "learning_rate": 6.270650263620387e-06, + "loss": 0.0161, + "step": 5860 + }, + { + "epoch": 3.4387814879906267, + "grad_norm": 0.00481452327221632, + "learning_rate": 6.247217340363211e-06, + "loss": 0.0007, + "step": 5870 + }, + { + "epoch": 3.444639718804921, + "grad_norm": 0.0019116317853331566, + "learning_rate": 6.223784417106034e-06, + "loss": 0.0305, + "step": 5880 + }, + { + "epoch": 3.450497949619215, + "grad_norm": 0.003990430850535631, + "learning_rate": 6.200351493848858e-06, + "loss": 0.0041, + "step": 5890 + }, + { + "epoch": 3.456356180433509, + "grad_norm": 0.0065063601359725, + "learning_rate": 6.1769185705916825e-06, + "loss": 0.0761, + "step": 5900 + }, + { + "epoch": 3.4622144112478033, + "grad_norm": 0.11059655994176865, + "learning_rate": 6.153485647334505e-06, + "loss": 0.0466, + "step": 5910 + }, + { + "epoch": 3.468072642062097, + "grad_norm": 0.02467559650540352, + "learning_rate": 6.130052724077329e-06, + "loss": 0.0004, + "step": 5920 + }, + { + "epoch": 3.4739308728763914, + "grad_norm": 0.043093279004096985, + "learning_rate": 6.106619800820152e-06, + "loss": 0.0302, + "step": 5930 + }, + { + "epoch": 3.4797891036906856, + "grad_norm": 0.008807145990431309, + "learning_rate": 6.083186877562976e-06, + "loss": 0.019, + "step": 5940 + }, + { + "epoch": 3.4856473345049794, + "grad_norm": 0.0038015455938875675, + "learning_rate": 6.0597539543058005e-06, + "loss": 0.0585, + "step": 5950 + }, + { + "epoch": 3.4915055653192737, + "grad_norm": 0.00422382028773427, + "learning_rate": 6.036321031048624e-06, + "loss": 0.0962, + "step": 5960 + }, + { + "epoch": 3.4973637961335675, + "grad_norm": 0.016047485172748566, + "learning_rate": 6.012888107791447e-06, + "loss": 0.0006, + "step": 5970 + }, + { + "epoch": 3.503222026947862, + "grad_norm": 0.005743114277720451, + "learning_rate": 5.989455184534271e-06, + "loss": 0.0021, + "step": 5980 + }, + { + "epoch": 3.5090802577621556, + "grad_norm": 0.12893952429294586, + "learning_rate": 5.9660222612770944e-06, + "loss": 0.0011, + "step": 5990 + }, + { + "epoch": 3.51493848857645, + "grad_norm": 0.05099204555153847, + "learning_rate": 5.9425893380199186e-06, + "loss": 0.0009, + "step": 6000 + }, + { + "epoch": 3.520796719390744, + "grad_norm": 0.09570227563381195, + "learning_rate": 5.919156414762743e-06, + "loss": 0.0159, + "step": 6010 + }, + { + "epoch": 3.526654950205038, + "grad_norm": 0.055699292570352554, + "learning_rate": 5.895723491505566e-06, + "loss": 0.0012, + "step": 6020 + }, + { + "epoch": 3.532513181019332, + "grad_norm": 0.010486723855137825, + "learning_rate": 5.87229056824839e-06, + "loss": 0.0076, + "step": 6030 + }, + { + "epoch": 3.5383714118336265, + "grad_norm": 0.006424172315746546, + "learning_rate": 5.8488576449912125e-06, + "loss": 0.0119, + "step": 6040 + }, + { + "epoch": 3.5442296426479203, + "grad_norm": 0.042542118579149246, + "learning_rate": 5.825424721734037e-06, + "loss": 0.0117, + "step": 6050 + }, + { + "epoch": 3.5500878734622145, + "grad_norm": 0.009197155013680458, + "learning_rate": 5.801991798476861e-06, + "loss": 0.0016, + "step": 6060 + }, + { + "epoch": 3.5559461042765084, + "grad_norm": 0.020288530737161636, + "learning_rate": 5.778558875219684e-06, + "loss": 0.0012, + "step": 6070 + }, + { + "epoch": 3.5618043350908026, + "grad_norm": 14.225655555725098, + "learning_rate": 5.755125951962508e-06, + "loss": 0.0402, + "step": 6080 + }, + { + "epoch": 3.5676625659050965, + "grad_norm": 0.41536006331443787, + "learning_rate": 5.731693028705332e-06, + "loss": 0.0442, + "step": 6090 + }, + { + "epoch": 3.5735207967193907, + "grad_norm": 0.025049546733498573, + "learning_rate": 5.7082601054481555e-06, + "loss": 0.0009, + "step": 6100 + }, + { + "epoch": 3.579379027533685, + "grad_norm": 0.006835598032921553, + "learning_rate": 5.684827182190979e-06, + "loss": 0.18, + "step": 6110 + }, + { + "epoch": 3.585237258347979, + "grad_norm": 0.0030140692833811045, + "learning_rate": 5.661394258933802e-06, + "loss": 0.082, + "step": 6120 + }, + { + "epoch": 3.591095489162273, + "grad_norm": 0.008736114017665386, + "learning_rate": 5.637961335676626e-06, + "loss": 0.0187, + "step": 6130 + }, + { + "epoch": 3.5969537199765673, + "grad_norm": 0.053168874233961105, + "learning_rate": 5.61452841241945e-06, + "loss": 0.0209, + "step": 6140 + }, + { + "epoch": 3.602811950790861, + "grad_norm": 0.11160540580749512, + "learning_rate": 5.5910954891622735e-06, + "loss": 0.0193, + "step": 6150 + }, + { + "epoch": 3.6086701816051554, + "grad_norm": 0.0031351852230727673, + "learning_rate": 5.567662565905098e-06, + "loss": 0.0008, + "step": 6160 + }, + { + "epoch": 3.614528412419449, + "grad_norm": 10.10761547088623, + "learning_rate": 5.54422964264792e-06, + "loss": 0.0278, + "step": 6170 + }, + { + "epoch": 3.6203866432337435, + "grad_norm": 0.03536173328757286, + "learning_rate": 5.520796719390744e-06, + "loss": 0.032, + "step": 6180 + }, + { + "epoch": 3.6262448740480373, + "grad_norm": 0.05558720603585243, + "learning_rate": 5.497363796133568e-06, + "loss": 0.0622, + "step": 6190 + }, + { + "epoch": 3.6321031048623316, + "grad_norm": 0.011010909453034401, + "learning_rate": 5.4739308728763915e-06, + "loss": 0.0061, + "step": 6200 + }, + { + "epoch": 3.637961335676626, + "grad_norm": 0.03313250094652176, + "learning_rate": 5.450497949619216e-06, + "loss": 0.0008, + "step": 6210 + }, + { + "epoch": 3.6438195664909196, + "grad_norm": 5.195527076721191, + "learning_rate": 5.42706502636204e-06, + "loss": 0.2337, + "step": 6220 + }, + { + "epoch": 3.649677797305214, + "grad_norm": 0.05604048818349838, + "learning_rate": 5.403632103104863e-06, + "loss": 0.0463, + "step": 6230 + }, + { + "epoch": 3.655536028119508, + "grad_norm": 0.012128411792218685, + "learning_rate": 5.380199179847686e-06, + "loss": 0.0006, + "step": 6240 + }, + { + "epoch": 3.661394258933802, + "grad_norm": 5.831113338470459, + "learning_rate": 5.3567662565905095e-06, + "loss": 0.1197, + "step": 6250 + }, + { + "epoch": 3.667252489748096, + "grad_norm": 0.009865056723356247, + "learning_rate": 5.333333333333334e-06, + "loss": 0.1289, + "step": 6260 + }, + { + "epoch": 3.67311072056239, + "grad_norm": 0.011429395526647568, + "learning_rate": 5.309900410076158e-06, + "loss": 0.0004, + "step": 6270 + }, + { + "epoch": 3.6789689513766843, + "grad_norm": 0.01841702312231064, + "learning_rate": 5.286467486818981e-06, + "loss": 0.0037, + "step": 6280 + }, + { + "epoch": 3.684827182190978, + "grad_norm": 0.008382062427699566, + "learning_rate": 5.263034563561805e-06, + "loss": 0.0015, + "step": 6290 + }, + { + "epoch": 3.6906854130052724, + "grad_norm": 0.002653555478900671, + "learning_rate": 5.2396016403046276e-06, + "loss": 0.0074, + "step": 6300 + }, + { + "epoch": 3.6965436438195667, + "grad_norm": 0.02200056053698063, + "learning_rate": 5.216168717047452e-06, + "loss": 0.0071, + "step": 6310 + }, + { + "epoch": 3.7024018746338605, + "grad_norm": 0.12032772600650787, + "learning_rate": 5.192735793790276e-06, + "loss": 0.0742, + "step": 6320 + }, + { + "epoch": 3.7082601054481548, + "grad_norm": 0.03608386218547821, + "learning_rate": 5.169302870533099e-06, + "loss": 0.0713, + "step": 6330 + }, + { + "epoch": 3.7141183362624486, + "grad_norm": 0.004531078971922398, + "learning_rate": 5.145869947275923e-06, + "loss": 0.0423, + "step": 6340 + }, + { + "epoch": 3.719976567076743, + "grad_norm": 0.004161532036960125, + "learning_rate": 5.122437024018747e-06, + "loss": 0.0784, + "step": 6350 + }, + { + "epoch": 3.7258347978910367, + "grad_norm": 0.01157628558576107, + "learning_rate": 5.0990041007615706e-06, + "loss": 0.1958, + "step": 6360 + }, + { + "epoch": 3.731693028705331, + "grad_norm": 0.004814586602151394, + "learning_rate": 5.075571177504394e-06, + "loss": 0.0728, + "step": 6370 + }, + { + "epoch": 3.737551259519625, + "grad_norm": 0.061883583664894104, + "learning_rate": 5.052138254247217e-06, + "loss": 0.0012, + "step": 6380 + }, + { + "epoch": 3.743409490333919, + "grad_norm": 5.269415855407715, + "learning_rate": 5.028705330990041e-06, + "loss": 0.1278, + "step": 6390 + }, + { + "epoch": 3.7492677211482133, + "grad_norm": 0.005129367113113403, + "learning_rate": 5.005272407732865e-06, + "loss": 0.006, + "step": 6400 + }, + { + "epoch": 3.7551259519625075, + "grad_norm": 0.0043735988438129425, + "learning_rate": 4.981839484475689e-06, + "loss": 0.081, + "step": 6410 + }, + { + "epoch": 3.7609841827768014, + "grad_norm": 0.004041170235723257, + "learning_rate": 4.958406561218512e-06, + "loss": 0.0009, + "step": 6420 + }, + { + "epoch": 3.7668424135910956, + "grad_norm": 0.2732388377189636, + "learning_rate": 4.934973637961336e-06, + "loss": 0.0113, + "step": 6430 + }, + { + "epoch": 3.7727006444053894, + "grad_norm": 0.01091376505792141, + "learning_rate": 4.91154071470416e-06, + "loss": 0.0263, + "step": 6440 + }, + { + "epoch": 3.7785588752196837, + "grad_norm": 0.08126252889633179, + "learning_rate": 4.888107791446983e-06, + "loss": 0.0537, + "step": 6450 + }, + { + "epoch": 3.7844171060339775, + "grad_norm": 0.004293652717024088, + "learning_rate": 4.8646748681898075e-06, + "loss": 0.1306, + "step": 6460 + }, + { + "epoch": 3.790275336848272, + "grad_norm": 13.555880546569824, + "learning_rate": 4.841241944932631e-06, + "loss": 0.0568, + "step": 6470 + }, + { + "epoch": 3.796133567662566, + "grad_norm": 0.04533328115940094, + "learning_rate": 4.817809021675454e-06, + "loss": 0.0361, + "step": 6480 + }, + { + "epoch": 3.80199179847686, + "grad_norm": 0.0036789195146411657, + "learning_rate": 4.794376098418278e-06, + "loss": 0.0885, + "step": 6490 + }, + { + "epoch": 3.807850029291154, + "grad_norm": 0.015161854214966297, + "learning_rate": 4.770943175161102e-06, + "loss": 0.0025, + "step": 6500 + }, + { + "epoch": 3.8137082601054484, + "grad_norm": 0.03570953756570816, + "learning_rate": 4.7475102519039255e-06, + "loss": 0.0014, + "step": 6510 + }, + { + "epoch": 3.819566490919742, + "grad_norm": 0.029073569923639297, + "learning_rate": 4.724077328646749e-06, + "loss": 0.0007, + "step": 6520 + }, + { + "epoch": 3.8254247217340365, + "grad_norm": 0.001704677357338369, + "learning_rate": 4.700644405389573e-06, + "loss": 0.0637, + "step": 6530 + }, + { + "epoch": 3.8312829525483303, + "grad_norm": 0.014290403574705124, + "learning_rate": 4.677211482132397e-06, + "loss": 0.0005, + "step": 6540 + }, + { + "epoch": 3.8371411833626246, + "grad_norm": 0.0036969457287341356, + "learning_rate": 4.65377855887522e-06, + "loss": 0.0065, + "step": 6550 + }, + { + "epoch": 3.8429994141769184, + "grad_norm": 5.847663402557373, + "learning_rate": 4.6303456356180435e-06, + "loss": 0.1708, + "step": 6560 + }, + { + "epoch": 3.8488576449912126, + "grad_norm": 0.012637904845178127, + "learning_rate": 4.606912712360868e-06, + "loss": 0.0073, + "step": 6570 + }, + { + "epoch": 3.854715875805507, + "grad_norm": 0.0401185005903244, + "learning_rate": 4.583479789103691e-06, + "loss": 0.0516, + "step": 6580 + }, + { + "epoch": 3.8605741066198007, + "grad_norm": 0.030397135764360428, + "learning_rate": 4.560046865846515e-06, + "loss": 0.0711, + "step": 6590 + }, + { + "epoch": 3.866432337434095, + "grad_norm": 4.24653434753418, + "learning_rate": 4.536613942589338e-06, + "loss": 0.0038, + "step": 6600 + }, + { + "epoch": 3.8722905682483892, + "grad_norm": 0.017508642747998238, + "learning_rate": 4.5131810193321615e-06, + "loss": 0.0004, + "step": 6610 + }, + { + "epoch": 3.878148799062683, + "grad_norm": 7.2771124839782715, + "learning_rate": 4.489748096074986e-06, + "loss": 0.0742, + "step": 6620 + }, + { + "epoch": 3.884007029876977, + "grad_norm": 0.009751928970217705, + "learning_rate": 4.46631517281781e-06, + "loss": 0.0031, + "step": 6630 + }, + { + "epoch": 3.889865260691271, + "grad_norm": 0.006663624197244644, + "learning_rate": 4.442882249560633e-06, + "loss": 0.0506, + "step": 6640 + }, + { + "epoch": 3.8957234915055654, + "grad_norm": 0.09746979176998138, + "learning_rate": 4.419449326303456e-06, + "loss": 0.0009, + "step": 6650 + }, + { + "epoch": 3.9015817223198592, + "grad_norm": 0.021097734570503235, + "learning_rate": 4.3960164030462804e-06, + "loss": 0.0191, + "step": 6660 + }, + { + "epoch": 3.9074399531341535, + "grad_norm": 0.2939240634441376, + "learning_rate": 4.3725834797891045e-06, + "loss": 0.0042, + "step": 6670 + }, + { + "epoch": 3.9132981839484478, + "grad_norm": 11.390765190124512, + "learning_rate": 4.349150556531928e-06, + "loss": 0.0667, + "step": 6680 + }, + { + "epoch": 3.9191564147627416, + "grad_norm": 0.002888856688514352, + "learning_rate": 4.325717633274751e-06, + "loss": 0.0622, + "step": 6690 + }, + { + "epoch": 3.925014645577036, + "grad_norm": 0.0075246552005410194, + "learning_rate": 4.302284710017575e-06, + "loss": 0.0798, + "step": 6700 + }, + { + "epoch": 3.93087287639133, + "grad_norm": 0.03289024531841278, + "learning_rate": 4.2788517867603985e-06, + "loss": 0.0321, + "step": 6710 + }, + { + "epoch": 3.936731107205624, + "grad_norm": 0.013219290412962437, + "learning_rate": 4.2554188635032226e-06, + "loss": 0.0799, + "step": 6720 + }, + { + "epoch": 3.9425893380199177, + "grad_norm": 0.0025315419770777225, + "learning_rate": 4.231985940246046e-06, + "loss": 0.0069, + "step": 6730 + }, + { + "epoch": 3.948447568834212, + "grad_norm": 10.275527954101562, + "learning_rate": 4.20855301698887e-06, + "loss": 0.1077, + "step": 6740 + }, + { + "epoch": 3.9543057996485063, + "grad_norm": 0.09231194853782654, + "learning_rate": 4.185120093731693e-06, + "loss": 0.0109, + "step": 6750 + }, + { + "epoch": 3.9601640304628, + "grad_norm": 0.04757778346538544, + "learning_rate": 4.161687170474517e-06, + "loss": 0.0562, + "step": 6760 + }, + { + "epoch": 3.9660222612770943, + "grad_norm": 0.0062348204664886, + "learning_rate": 4.138254247217341e-06, + "loss": 0.0006, + "step": 6770 + }, + { + "epoch": 3.9718804920913886, + "grad_norm": 0.21855579316616058, + "learning_rate": 4.114821323960164e-06, + "loss": 0.0363, + "step": 6780 + }, + { + "epoch": 3.9777387229056824, + "grad_norm": 1.3651577234268188, + "learning_rate": 4.091388400702988e-06, + "loss": 0.0709, + "step": 6790 + }, + { + "epoch": 3.9835969537199767, + "grad_norm": 0.11818064749240875, + "learning_rate": 4.067955477445812e-06, + "loss": 0.0571, + "step": 6800 + }, + { + "epoch": 3.9894551845342705, + "grad_norm": 0.22804498672485352, + "learning_rate": 4.044522554188635e-06, + "loss": 0.1319, + "step": 6810 + }, + { + "epoch": 3.9953134153485648, + "grad_norm": 0.003853770438581705, + "learning_rate": 4.021089630931459e-06, + "loss": 0.0703, + "step": 6820 + } + ], + "logging_steps": 10, + "max_steps": 8535, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7382675598336000.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}